pypdf
Manipulate PDF documents programmatically. Merge, split, rotate, and watermark PDFs. Extract text and metadata. Handle form filling and encryption/decryption.
What this skill does
# PyPDF PDF Manipulation Skill
## Overview
PyPDF is a pure-Python library for working with PDF files. This skill covers comprehensive patterns for PDF manipulation including:
- **PDF merging** - Combine multiple PDFs into one document
- **PDF splitting** - Extract specific pages or split into multiple files
- **Page rotation** - Rotate pages by 90, 180, or 270 degrees
- **Watermarking** - Add text or image watermarks to pages
- **Text extraction** - Extract text content from PDF pages
- **Metadata handling** - Read and modify PDF metadata
- **Form filling** - Fill PDF form fields programmatically
- **Encryption/Decryption** - Secure PDFs with passwords
## When to Use This Skill
### USE when:
- Merging multiple PDF files into a single document
- Splitting large PDFs into smaller files
- Extracting specific pages from PDFs
- Adding watermarks or stamps to documents
- Extracting text content for analysis
- Reading or modifying PDF metadata
- Filling PDF forms programmatically
- Encrypting or decrypting PDF files
- Adding page numbers or headers/footers
- Rotating or reordering pages
- Automating PDF workflows in pipelines
### DON'T USE when:
- Creating PDFs from scratch (use reportlab or weasyprint)
- Need advanced text layout control (use reportlab)
- Converting other formats to PDF (use dedicated converters)
- Need OCR for scanned documents (use pytesseract + pdf2image)
- Working with complex form creation (use reportlab)
- Need to edit existing text content (limited support)
## Prerequisites
### Installation
```bash
# Basic installation
pip install pypdf
# Using uv (recommended)
uv pip install pypdf
# With crypto support for encryption
# (the extras specifier is quoted so shells such as zsh do not try to expand
# the square brackets as a glob pattern)
pip install "pypdf[crypto]"
# For creating PDFs (watermarks, overlays)
pip install pypdf reportlab
# Full installation
pip install "pypdf[crypto]" reportlab Pillow
```
### Verify Installation
```python
# The package itself must be imported so that pypdf.__version__ below resolves;
# the from-imports alone do not bind the name `pypdf`.
import pypdf
from pypdf import PdfReader, PdfWriter, PdfMerger
from pypdf.errors import PdfReadError

print("pypdf installed successfully!")
print(f"Version: {pypdf.__version__}")
```
## Core Capabilities
### 1. PDF Merging
```python
"""
Merge multiple PDF files into a single document.
"""
from pypdf import PdfMerger, PdfReader, PdfWriter
from pathlib import Path
from typing import List, Optional
def merge_pdfs(
    pdf_paths: List[str],
    output_path: str,
    bookmarks: bool = True
) -> None:
    """Merge multiple PDFs into one file.

    Args:
        pdf_paths: Paths of the source PDFs, merged in the given order.
            Paths that do not exist are skipped with a printed warning.
        output_path: Destination path of the merged PDF.
        bookmarks: When True, each appended file gets an outline entry
            (bookmark) named after the file's stem.
    """
    # PdfWriter offers the same append/write/close calls PdfMerger had;
    # PdfMerger itself was removed in pypdf 5.0, so PdfWriter is used here.
    merger = PdfWriter()
    try:
        for pdf_path in pdf_paths:
            path = Path(pdf_path)
            if not path.exists():
                print(f"Warning: File not found - {pdf_path}")
                continue

            # Add with bookmark (outline entry)
            merger.append(
                str(pdf_path),
                outline_item=path.stem if bookmarks else None
            )
            print(f"Added: {path.name}")

        merger.write(output_path)
    finally:
        # Release the writer even when an append or the write fails.
        merger.close()

    print(f"Merged PDF saved to: {output_path}")
def merge_with_page_selection(
    pdf_configs: List[dict],
    output_path: str
) -> None:
    """Merge specific pages from multiple PDFs.

    Args:
        pdf_configs: List of dicts with a 'path' key and an optional
            'pages' key. 'pages' can be a (start, end) tuple passed to
            the writer as a page range, or a list of 0-indexed page
            numbers. When 'pages' is missing or None, all pages are added.
        output_path: Output file path

    Page numbers in a list that fall outside the document are skipped
    with a printed warning; a 'pages' value of any other type is skipped
    with a warning as well.
    """
    # PdfWriter offers the same append/write/close calls PdfMerger had;
    # PdfMerger itself was removed in pypdf 5.0, so PdfWriter is used here.
    merger = PdfWriter()
    try:
        for config in pdf_configs:
            pdf_path = config['path']
            pages = config.get('pages')

            if pages is None:
                # Add all pages
                merger.append(pdf_path)
            elif isinstance(pages, tuple):
                # Add page range (start, end)
                merger.append(pdf_path, pages=pages)
            elif isinstance(pages, list):
                # Add specific pages; the file is parsed once and the same
                # reader is reused for every requested page.
                reader = PdfReader(pdf_path)
                page_count = len(reader.pages)
                for page_num in pages:
                    if 0 <= page_num < page_count:
                        merger.append(reader, pages=(page_num, page_num + 1))
                    else:
                        print(
                            f"Warning: Page {page_num + 1} out of range "
                            f"in {pdf_path}"
                        )
            else:
                # Nothing was added, so do not report the file as added.
                print(
                    f"Warning: Unsupported 'pages' value for {pdf_path}: "
                    f"{pages!r}"
                )
                continue

            # Only a missing selection means "all"; an empty list does not.
            print(
                f"Added: {pdf_path} - Pages: "
                f"{'all' if pages is None else pages}"
            )

        merger.write(output_path)
    finally:
        # Release the writer even when an append or the write fails.
        merger.close()

    print(f"Merged PDF saved to: {output_path}")
def merge_directory(
    directory: str,
    output_path: str,
    pattern: str = "*.pdf",
    sort_key: Optional[str] = "name"
) -> int:
    """Merge all PDFs in a directory.

    Args:
        directory: Directory that is searched (non-recursively) with
            ``pattern``.
        output_path: Destination path of the merged PDF.
        pattern: Glob pattern used to select the input files.
        sort_key: "name" (case-insensitive file name), "date"
            (modification time) or "size" (file size). Any other value,
            including None, keeps the order returned by the glob.

    Returns:
        The number of files handed to the merge, or 0 when no input
        file was found.
    """
    dir_path = Path(directory)
    output_resolved = Path(output_path).resolve()

    # Only regular files are inputs. The output file is excluded so that
    # re-running with an output inside `directory` does not merge the
    # previous result into itself.
    pdf_files = [
        candidate
        for candidate in dir_path.glob(pattern)
        if candidate.is_file() and candidate.resolve() != output_resolved
    ]

    if not pdf_files:
        print(f"No PDF files found in {directory}")
        return 0

    # Sort files
    if sort_key == "name":
        pdf_files.sort(key=lambda x: x.name.lower())
    elif sort_key == "date":
        pdf_files.sort(key=lambda x: x.stat().st_mtime)
    elif sort_key == "size":
        pdf_files.sort(key=lambda x: x.stat().st_size)

    merge_pdfs([str(f) for f in pdf_files], output_path)
    return len(pdf_files)
# Example usage
# merge_pdfs(['report1.pdf', 'report2.pdf', 'appendix.pdf'], 'complete_report.pdf')
#
# merge_with_page_selection([
# {'path': 'doc1.pdf', 'pages': (0, 5)}, # First 5 pages
# {'path': 'doc2.pdf', 'pages': [0, 2, 4]}, # Pages 1, 3, 5
# {'path': 'doc3.pdf'} # All pages
# ], 'combined.pdf')
```
### 2. PDF Splitting
```python
"""
Split PDF files into separate documents.
"""
from pypdf import PdfReader, PdfWriter
from pathlib import Path
from typing import List, Tuple, Optional
def split_pdf_by_pages(
    input_path: str,
    output_dir: str,
    pages_per_file: int = 1
) -> List[str]:
    """Split PDF into multiple files with specified pages per file.

    Args:
        input_path: Source PDF file.
        output_dir: Directory for the generated files; created if missing.
        pages_per_file: Number of pages per output file (at least 1). The
            last file holds the remaining pages and may be shorter.

    Returns:
        Paths of the created files, in page order.

    Raises:
        ValueError: If ``pages_per_file`` is smaller than 1.
    """
    # Validate before touching the filesystem: 0 would make range() fail
    # with an unrelated message and a negative step would silently produce
    # no output at all.
    if pages_per_file < 1:
        raise ValueError(
            f"pages_per_file must be at least 1, got {pages_per_file}"
        )

    reader = PdfReader(input_path)
    total_pages = len(reader.pages)

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    input_name = Path(input_path).stem
    created_files = []

    for start in range(0, total_pages, pages_per_file):
        writer = PdfWriter()
        end = min(start + pages_per_file, total_pages)

        for page_num in range(start, end):
            writer.add_page(reader.pages[page_num])

        # Generate output filename (page numbers in names are 1-based)
        if pages_per_file == 1:
            output_file = output_path / f"{input_name}_page_{start + 1}.pdf"
        else:
            output_file = output_path / f"{input_name}_pages_{start + 1}-{end}.pdf"

        writer.write(str(output_file))
        created_files.append(str(output_file))
        print(f"Created: {output_file.name}")

    print(f"Split into {len(created_files)} files")
    return created_files
def extract_pages(
    input_path: str,
    output_path: str,
    page_numbers: List[int]
) -> None:
    """Copy selected pages of a PDF into a new file.

    Args:
        input_path: PDF to read the pages from
        output_path: File the selected pages are written to
        page_numbers: 0-indexed page numbers, copied in the given order;
            numbers outside the document are reported and skipped
    """
    source = PdfReader(input_path)
    page_count = len(source.pages)
    result = PdfWriter()

    for index in page_numbers:
        # Guard clause: report and skip anything outside the document.
        if index < 0 or index >= page_count:
            print(f"Warning: Page {index + 1} out of range")
            continue

        result.add_page(source.pages[index])
        print(f"Extracted page {index + 1}")

    result.write(output_path)
    print(f"Extracted pages saved to: {output_path}")
def split_by_ranges(
    input_path: str,
    output_dir: str,
    ranges: List[Tuple[int, int, str]]
) -> List[str]:
    """Split PDF by specified page ranges.

    Args:
        input_path: Source PDF file
        output_dir: Output directory; created if missing
        ranges: List of (start, end, name) tuples. start and end are
            0-indexed and end is exclusive; an end beyond the last page
            is clamped to the page count. name is used as the output
            file name without the ".pdf" suffix.

    Returns:
        Paths of the files that were actually written. Ranges that select
        no pages (negative start, start at or past the end) are skipped
        with a printed warning instead of producing an empty PDF.
    """
    reader = PdfReader(input_path)
    total_pages = len(reader.pages)

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    created_files = []

    for start, end, name in ranges:
        last = min(end, total_pages)

        # A negative start would index pages from the end of the document,
        # and an empty range would write a zero-page PDF; skip both.
        if start < 0 or start >= last:
            print(
                f"Warning: Skipping '{name}' - range {start}-{end} "
                f"selects no pages"
            )
            continue

        writer = PdfWriter()
        for page_num in range(start, last):
            writer.add_page(reader.pages[page_num])

        output_file = output_path / f"{name}.pdf"
        writer.write(str(output_file))
        created_files.append(str(output_file))
        # Report the clamped end so the message matches the written pages.
        print(f"Created: {output_file.name} (pages {start + 1}-{last})")

    return created_files
def split_byRelated in office-docs
openpyxl
Included: Create and manipulate Microsoft Excel workbooks programmatically. Build spreadsheets with formulas, charts, conditional formatting, and pivot tables. Handle large datasets efficiently with streaming mode.
python-docx
Included: Create and manipulate Microsoft Word documents programmatically. Build reports, contracts, and documentation with full control over paragraphs, tables, headers, styles, and images.
python-pptx
Included: Create and manipulate PowerPoint presentations programmatically. Build slide decks with layouts, shapes, charts, tables, and images. Generate data-driven presentations from templates.
docx-templates
Included: Template-based Word document generation using Jinja2 syntax. Create reports, contracts, and documents with loops, conditionals, tables, and mail merge capabilities.