Merging PDFs at Scale: Performance Tips and Tricks
Discover how to efficiently merge thousands of PDFs using our API. Optimize memory usage, handle large files, and process documents in parallel.
Merging PDFs seems simple - combine files into one document. But at scale, naive implementations quickly hit performance bottlenecks. Let's explore how to merge PDFs efficiently for production workloads.
The Challenge of Scale
Merging a few PDFs is easy. Merging thousands reveals challenges:
- Memory constraints: Loading all files into memory crashes your application
- API rate limits: Sequential processing is too slow
- File size limits: Individual files may be huge
- Network reliability: Long operations fail on timeouts
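A quick back-of-the-envelope helper makes the first constraint concrete (the ~2MB average file size comes from the benchmarks at the end of this post; the function itself is illustrative, not part of the API):

```javascript
// Peak bytes held in memory if `batchSize` files are buffered at once.
// With no batching (batchSize defaults to fileCount), memory grows
// linearly with the job; with batches of 10 it stays flat.
function peakMemoryBytes(fileCount, avgFileBytes, batchSize = fileCount) {
  return Math.min(fileCount, batchSize) * avgFileBytes;
}

const MB = 1024 * 1024;
peakMemoryBytes(5000, 2 * MB);     // ~10 GB: a likely out-of-memory crash
peakMemoryBytes(5000, 2 * MB, 10); // 20 MB: flat, regardless of job size
```

Batching is what turns the first number into the second.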
Strategy 1: Batch Processing
Instead of merging all files at once, process in manageable batches:
```javascript
async function mergeLargeBatch(files) {
  const BATCH_SIZE = 10; // Merge 10 files at a time
  const batches = chunkArray(files, BATCH_SIZE);
  const mergedBatches = [];

  // Process each batch
  for (const batch of batches) {
    const merged = await mergeFiles(batch);
    mergedBatches.push(merged);
  }

  // Final merge of all batches
  return await mergeFiles(mergedBatches);
}
```
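The mergeFiles helper is not shown in this post; a minimal sketch might look like the following. The endpoint and multipart field names mirror the streaming example later in this post, but the exact request and response shapes are assumptions, so adjust them to the real API (and replace YOUR_API_KEY with a real key). The fetch implementation is injectable so the function can be exercised against a stub:

```javascript
// Hypothetical sketch of the mergeFiles helper assumed above (Node 18+).
// Takes an array of PDF Buffers and resolves with the merged PDF Buffer.
async function mergeFiles(buffers, fetchImpl = fetch) {
  const form = new FormData(); // global in Node 18+
  buffers.forEach((buf, i) => {
    form.append(`file${i}`, new Blob([buf], { type: 'application/pdf' }));
  });

  const response = await fetchImpl('https://pdfmunk.com/api/merge', {
    method: 'POST',
    headers: { Authorization: 'Bearer YOUR_API_KEY' },
    body: form,
  });

  if (!response.ok) {
    throw new Error(`Merge failed with HTTP ${response.status}`);
  }
  return Buffer.from(await response.arrayBuffer());
}
```

Both mergeLargeBatch above and mergeParallel below lean on this helper.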
```javascript
function chunkArray(array, size) {
  const chunks = [];
  for (let i = 0; i < array.length; i += size) {
    chunks.push(array.slice(i, i + size));
  }
  return chunks;
}
```

Strategy 2: Parallel Processing
Process multiple batches simultaneously to maximize throughput:
```javascript
async function mergeParallel(files) {
  const BATCH_SIZE = 10;
  const PARALLEL_BATCHES = 3;

  const batches = chunkArray(files, BATCH_SIZE);
  const results = [];

  // Process 3 batches in parallel
  for (let i = 0; i < batches.length; i += PARALLEL_BATCHES) {
    const parallelBatches = batches.slice(i, i + PARALLEL_BATCHES);
    const batchResults = await Promise.all(
      parallelBatches.map(batch => mergeFiles(batch))
    );
    results.push(...batchResults);
  }

  return await mergeFiles(results);
}
```

Strategy 3: Streaming for Memory Efficiency
Stream files instead of loading them entirely into memory:
```javascript
const fs = require('fs');
const FormData = require('form-data');

async function streamMerge(filePaths) {
  const form = new FormData();

  // Stream each file from disk instead of buffering it in memory
  filePaths.forEach((path, index) => {
    const stream = fs.createReadStream(path);
    form.append(`file${index}`, stream);
  });

  const response = await fetch('https://pdfmunk.com/api/merge', {
    method: 'POST',
    headers: {
      'Authorization': 'Bearer YOUR_API_KEY',
      ...form.getHeaders()
    },
    body: form
  });

  return response.body; // Stream the response too
}
```

Strategy 4: Progressive Merging
For very large operations, merge progressively and store intermediate results:
```javascript
async function progressiveMerge(files, onProgress) {
  let currentMerged = files[0];

  for (let i = 1; i < files.length; i++) {
    // Merge the running result with the next file
    currentMerged = await mergeTwo(currentMerged, files[i]);

    // Report progress
    onProgress({
      processed: i,
      total: files.length,
      percent: (i / files.length) * 100
    });

    // Optional: save a checkpoint every 100 files
    if (i % 100 === 0) {
      await saveCheckpoint(currentMerged, i);
    }
  }

  return currentMerged;
}
```

Handling Large Individual Files
When individual PDFs are large (50MB+), special care is needed:
- Compress first: Reduce file size before merging
- Split then merge: Break large files into smaller chunks
- Direct upload: Use presigned URLs for very large files
- Increase timeouts: Allow more time for large operations
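Those rules can be encoded as a small pre-flight planning step. The thresholds in this sketch are illustrative rules of thumb, not documented API limits:

```javascript
const LARGE_FILE_BYTES = 50 * 1024 * 1024; // the 50MB rule of thumb above

// Given [{ path, bytes }], decide which files to compress first,
// which to send as-is, and how generous a timeout the job needs.
function planMerge(files) {
  const totalBytes = files.reduce((sum, f) => sum + f.bytes, 0);
  return {
    toCompress: files.filter((f) => f.bytes >= LARGE_FILE_BYTES).map((f) => f.path),
    direct: files.filter((f) => f.bytes < LARGE_FILE_BYTES).map((f) => f.path),
    // Illustrative: one minute per 100 MB of payload, two minutes minimum
    timeoutMs: Math.max(2, Math.ceil(totalBytes / (100 * 1024 * 1024))) * 60 * 1000,
  };
}
```

Running this before the merge keeps the compression and timeout decisions in one place instead of scattering size checks through the pipeline.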
Optimizing API Usage
Maximize efficiency and minimize costs:
```javascript
// ❌ Bad: many small merges
let result = files[0];
for (const file of files.slice(1)) {
  result = await merge(result, file); // N - 1 sequential API calls
}

// ✅ Good: one large merge
result = await merge(files); // 1 API call

// ✅ Better: balanced batching for very large sets
const batches = chunk(files, 20);
const merged = await Promise.all(
  batches.map(batch => merge(batch))
);
result = await merge(merged); // 1 + ceil(N/20) calls
```

Error Recovery at Scale
With large batches, failures are inevitable. Build in resilience:
```javascript
// Simple promise-based delay used for backoff between retries
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

async function robustMerge(files) {
  const MAX_RETRIES = 3;
  const failed = [];

  for (const file of files) {
    let success = false;

    for (let attempt = 0; attempt < MAX_RETRIES; attempt++) {
      try {
        await processFile(file);
        success = true;
        break;
      } catch (error) {
        console.error(`Attempt ${attempt + 1} failed:`, error);
        // Exponential backoff: 1s, 2s, 4s
        await sleep(Math.pow(2, attempt) * 1000);
      }
    }

    if (!success) {
      failed.push(file);
    }
  }

  if (failed.length > 0) {
    throw new Error(`Failed to process ${failed.length} files`);
  }
}
```

Performance Benchmarks
- Sequential: ~1000 files/hour
- Batched (10 per batch): ~5000 files/hour
- Parallel (3 workers): ~15000 files/hour
- Optimized pipeline: ~25000+ files/hour

*Benchmarks based on average 2MB PDFs with 10 pages each.*
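A quick way to sanity-check these trade-offs is to count API calls for the two-level batching approach from the optimization section (one call per batch, plus one final merge of the intermediate results):

```javascript
// API calls for a two-level merge: one per batch, plus a final merge
// of the intermediate results (matches the "1 + ceil(N/20) calls" math).
function apiCallCount(fileCount, batchSize) {
  if (fileCount <= batchSize) return 1; // a single merge covers everything
  return Math.ceil(fileCount / batchSize) + 1;
}

apiCallCount(100, 20);  // 6 calls, versus 99 pairwise merges
apiCallCount(1000, 10); // 101 calls
```

Plugging in your own file counts and batch sizes is a cheap way to pick a batch size before running a large job.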
Conclusion
Merging PDFs at scale requires thoughtful architecture. Start with batch processing, add parallelization for speed, and implement proper error handling. Monitor performance and adjust batch sizes based on your specific workload.
Want to try these techniques? Our API documentation includes working examples and performance guidelines.