一、核心原理与库选择
主要Python库对比
| 库名称 | 优点 | 缺点 | 适用场景 |
|---|---|---|---|
| PyPDF2 (最新版:PyPDF4) | 纯Python实现,轻量级,功能全面 | 对某些复杂PDF支持有限 | 一般PDF操作,简单合并 |
| pdfrw | 轻量快速,支持注释保留 | 功能相对较少 | 需要保留表单字段的合并 |
| pikepdf | 基于C++,性能好,支持现代PDF | 安装稍复杂(需要C++库) | 处理加密、损坏PDF |
| PyMuPDF (fitz) | 功能最强大,性能最佳 | API稍复杂 | 需要提取文本、图像等高级操作 |
推荐选择:对于大多数合并需求,PyPDF2/PyPDF4 是最简单直接的选择。
二、安装与环境准备
# Install PyPDF4, the package the examples below import from
pip install PyPDF4
# Or install one of the alternative libraries
pip install pikepdf
pip install pymupdf
三、基础合并方法
方法1:使用 PyPDF4(最常用)
import os
from PyPDF4 import PdfFileReader, PdfFileWriter
def merge_pdfs_pypdf4(pdf_paths, output_path):
    """
    Merge several PDF files into one using PyPDF4.

    Args:
        pdf_paths: Iterable of input PDF paths, in the order they should
            appear in the merged document.
        output_path: Path of the merged PDF to write.

    A file that cannot be opened or parsed is reported on stdout and
    skipped; the remaining files are still merged.
    """
    # Function-scope import keeps this snippet self-contained.
    from contextlib import ExitStack

    pdf_writer = PdfFileWriter()
    # PyPDF4 resolves page objects lazily from the source stream, so every
    # input file has to stay open until write() has finished. Closing each
    # file right after addPage() makes the final write fail with
    # "seek of closed file".
    with ExitStack() as open_files:
        for path in pdf_paths:
            try:
                pdf_file = open_files.enter_context(open(path, 'rb'))
                pdf_reader = PdfFileReader(pdf_file)
                num_pages = pdf_reader.getNumPages()
                print(f"正在处理: {os.path.basename(path)} (共{num_pages}页)")
                # Collect the pages first so that a file failing half-way
                # does not leave a partial document in the writer.
                pages = [pdf_reader.getPage(page_num)
                         for page_num in range(num_pages)]
            except Exception as e:
                print(f"处理文件 {path} 时出错: {str(e)}")
                continue
            for page in pages:
                pdf_writer.addPage(page)
        # Write while the inputs are still open.
        with open(output_path, 'wb') as output_file:
            pdf_writer.write(output_file)
    print(f"合并完成!输出文件: {output_path}")
# Usage example
if __name__ == "__main__":
    # Name of the merged file
    output_file = "merged_output.pdf"
    # Collect every PDF in the current directory, sorted by file name.
    # The output of a previous run is excluded; otherwise a rerun would
    # merge the old result into the new one.
    pdf_files = sorted(
        f for f in os.listdir('.')
        if f.lower().endswith('.pdf')
        and os.path.normcase(f) != os.path.normcase(output_file)
    )
    # Merge the PDFs
    merge_pdfs_pypdf4(pdf_files, output_file)
方法2:使用 PyMuPDF(fitz)- 性能最佳
import fitz # PyMuPDF
import os
def merge_pdfs_pymupdf(pdf_paths, output_path):
    """
    Merge several PDF files into one using PyMuPDF.

    Args:
        pdf_paths: Iterable of input PDF paths, in merge order.
        output_path: Path of the merged PDF to write.

    A file that cannot be opened or inserted is reported on stdout and
    skipped. Errors raised while saving propagate to the caller.
    """
    # The context manager closes the result document even when save()
    # raises, which the explicit close() call did not guarantee.
    with fitz.open() as result_pdf:
        for pdf_path in pdf_paths:
            try:
                with fitz.open(pdf_path) as pdf:
                    # Append the whole source document.
                    result_pdf.insert_pdf(pdf)
                    print(f"已添加: {os.path.basename(pdf_path)} (共{len(pdf)}页)")
            except Exception as e:
                print(f"处理 {pdf_path} 时出错: {e}")
                continue
        result_pdf.save(output_path)
    print(f"合并完成!文件保存至: {output_path}")
四、高级功能实现
1. 带进度条的合并工具
import os
from PyPDF4 import PdfFileReader, PdfFileWriter
from tqdm import tqdm # 需要安装: pip install tqdm
def merge_pdfs_with_progress(pdf_paths, output_path):
    """
    Merge several PDF files with PyPDF4 while showing a tqdm progress bar.

    Args:
        pdf_paths: Sequence of input PDF paths, in merge order.
        output_path: Path of the merged PDF to write.

    Each file is opened and parsed once. Unreadable files are reported
    during the scan and left out of the merge.
    """
    # Function-scope import keeps this snippet self-contained.
    from contextlib import ExitStack

    pdf_writer = PdfFileWriter()
    total_pages = 0
    readers = []  # (path, reader) for every file that could be parsed

    # PyPDF4 reads page objects lazily, so the inputs must stay open until
    # the merged file has been written.
    with ExitStack() as open_files:
        # Scan phase: open every file and count pages to size the bar.
        print("正在扫描PDF文件...")
        for path in pdf_paths:
            try:
                handle = open_files.enter_context(open(path, 'rb'))
                pdf_reader = PdfFileReader(handle)
                total_pages += pdf_reader.getNumPages()
            except Exception:
                # Was a bare "except:", which also swallowed Ctrl-C.
                print(f"警告: 无法读取 {path}")
                continue
            readers.append((path, pdf_reader))
        print(f"共发现 {len(pdf_paths)} 个PDF文件,总计约 {total_pages} 页")

        # Merge phase: the with-block closes the bar even on errors.
        with tqdm(total=total_pages, desc="合并进度", unit="页") as pbar:
            for path, pdf_reader in readers:
                filename = os.path.basename(path)
                try:
                    for page_num in range(pdf_reader.getNumPages()):
                        pdf_writer.addPage(pdf_reader.getPage(page_num))
                        pbar.update(1)
                        pbar.set_postfix(file=filename[:20])
                except Exception as e:
                    print(f"\n处理 {path} 时出错: {e}")
                    continue

        # Save while the inputs are still open.
        with open(output_path, 'wb') as output_file:
            pdf_writer.write(output_file)
    print(f"\n✅ 合并完成!输出文件: {output_path}")
2. 智能排序与批量处理
import os
import glob
import re
from PyPDF4 import PdfFileReader, PdfFileWriter
class PDFMerger:
    """Find, order and merge PDF files with PyPDF4."""

    def __init__(self):
        # Writer used by the most recent merge_files() call.
        self.writer = PdfFileWriter()

    def find_pdfs(self, directory, pattern="*.pdf", recursive=False):
        """
        Return the paths in *directory* that match *pattern*.

        Args:
            directory: Directory to search.
            pattern: glob pattern for the file names.
            recursive: Also search sub-directories when true.
        """
        if recursive:
            search_pattern = os.path.join(directory, "**", pattern)
            return glob.glob(search_pattern, recursive=True)
        return glob.glob(os.path.join(directory, pattern))

    def natural_sort(self, pdf_list):
        """
        Return *pdf_list* sorted by base name in natural (numeric) order,
        so that "file2.pdf" comes before "file10.pdf".
        """
        def natural_key(text):
            # re.split with a capturing group alternates text / digit-run
            # pieces, so odd indices are exactly the r'\d+' matches.
            # Deciding by position avoids str.isdigit(), which is also
            # true for characters such as "²" that int() rejects.
            return [int(piece) if index % 2 else piece.lower()
                    for index, piece in enumerate(re.split(r'(\d+)', text))]

        return sorted(pdf_list, key=lambda x: natural_key(os.path.basename(x)))

    def merge_directory(self, directory, output_file, recursive=False, sort_natural=True):
        """
        Merge every PDF found in *directory* into *output_file*.

        Returns False when no input PDF is found, otherwise the result of
        merge_files().
        """
        pdf_files = self.find_pdfs(directory, recursive=recursive)

        # The output may live in the searched directory (for example from
        # a previous run); never feed it back in as an input.
        output_abs = os.path.normcase(os.path.abspath(output_file))
        pdf_files = [
            path for path in pdf_files
            if os.path.normcase(os.path.abspath(path)) != output_abs
        ]

        if not pdf_files:
            print(f"在目录 {directory} 中未找到PDF文件")
            return False
        print(f"找到 {len(pdf_files)} 个PDF文件")

        if sort_natural:
            pdf_files = self.natural_sort(pdf_files)

        # Show the merge order.
        for i, f in enumerate(pdf_files, 1):
            print(f"{i:3d}. {os.path.basename(f)}")

        return self.merge_files(pdf_files, output_file)

    def merge_files(self, pdf_paths, output_path):
        """
        Merge the given files, in order, into *output_path*.

        Files that cannot be read are reported and skipped. Returns True
        once the output has been written.
        """
        # Function-scope import keeps this snippet self-contained.
        from contextlib import ExitStack

        # Start from an empty writer: pages left over from an earlier call
        # would refer to input files that are closed by now.
        self.writer = PdfFileWriter()

        # PyPDF4 reads page objects lazily, so the inputs must stay open
        # until write() has finished.
        with ExitStack() as open_files:
            for path in pdf_paths:
                try:
                    f = open_files.enter_context(open(path, 'rb'))
                    pdf_reader = PdfFileReader(f)
                    filename = os.path.basename(path)
                    num_pages = pdf_reader.getNumPages()
                    for page_num in range(num_pages):
                        self.writer.addPage(pdf_reader.getPage(page_num))
                    print(f"✓ 已添加: {filename} ({num_pages}页)")
                except Exception as e:
                    print(f"✗ 处理 {path} 失败: {e}")
                    continue
            with open(output_path, 'wb') as output_file:
                self.writer.write(output_file)
        print(f"\n🎉 合并完成!文件保存至: {output_path}")
        return True


# Usage example
if __name__ == "__main__":
    merger = PDFMerger()
    # Merge every PDF in the current directory
    merger.merge_directory(".", "merged.pdf")
    # Or merge an explicit list of files
    # files = ["file1.pdf", "file2.pdf", "file3.pdf"]
    # merger.merge_files(files, "output.pdf")
3. 添加书签/目录功能
from PyPDF4 import PdfFileReader, PdfFileWriter
def merge_with_bookmarks(pdf_paths, output_path, bookmark_names=None):
    """
    Merge PDFs with PyPDF4 and add one bookmark per source file.

    Args:
        pdf_paths: Sequence of input PDF paths, in merge order.
        output_path: Path of the merged PDF to write.
        bookmark_names: Optional bookmark titles, matched to pdf_paths by
            position. Files without a title get "文档_<n>".

    Each bookmark points at the first page of its source file. Errors
    while reading an input propagate to the caller.
    """
    # Function-scope import keeps this snippet self-contained.
    from contextlib import ExitStack

    writer = PdfFileWriter()
    bookmarks = []
    start_page = 0  # index in the output of the current file's first page

    # PyPDF4 reads page objects lazily, so the inputs must stay open until
    # write() has finished.
    with ExitStack() as open_files:
        for idx, path in enumerate(pdf_paths):
            f = open_files.enter_context(open(path, 'rb'))
            reader = PdfFileReader(f)
            total_pages = reader.getNumPages()
            for page_num in range(total_pages):
                writer.addPage(reader.getPage(page_num))

            # Pick the caller's title when one was supplied for this file.
            if bookmark_names and idx < len(bookmark_names):
                bookmark_name = bookmark_names[idx]
            else:
                bookmark_name = f"文档_{idx+1}"

            bookmarks.append(writer.addBookmark(bookmark_name, start_page))
            print(f"添加: {bookmark_name} (第{start_page+1}-{start_page+total_pages}页)")
            start_page += total_pages

        with open(output_path, 'wb') as output_file:
            writer.write(output_file)
    print(f"合并完成,共添加 {len(bookmarks)} 个书签")


# Usage example
if __name__ == "__main__":
    files = ["chap1.pdf", "chap2.pdf", "appendix.pdf"]
    names = ["第一章", "第二章", "附录"]
    merge_with_bookmarks(files, "book_with_toc.pdf", names)
五、命令行工具实现
#!/usr/bin/env python3
"""
pdf_merger.py - PDF合并命令行工具
用法: python pdf_merger.py [-h] [-o OUTPUT] [-r] [-s] pdf_files [pdf_files ...]
"""
import argparse
import sys
import os
from PyPDF4 import PdfFileReader, PdfFileWriter
def main():
    """
    Command-line entry point: collect the PDFs named on the command line
    and merge them into one file.

    Positional arguments may be PDF files or directories. Directories are
    scanned for "*.pdf" (recursively with -r). Exits with status 1 when no
    valid input is found or the merge fails.
    """
    # Function-scope import keeps this snippet self-contained.
    from contextlib import ExitStack

    parser = argparse.ArgumentParser(description='合并多个PDF文件')
    parser.add_argument('pdf_files', nargs='+', help='要合并的PDF文件')
    parser.add_argument('-o', '--output', default='merged.pdf',
                        help='输出文件名 (默认: merged.pdf)')
    parser.add_argument('-r', '--recursive', action='store_true',
                        help='递归处理目录')
    parser.add_argument('-s', '--sort', action='store_true',
                        help='按文件名排序')
    args = parser.parse_args()

    # Build the input list.
    pdf_paths = []
    for pattern in args.pdf_files:
        if os.path.isdir(pattern):
            # Directory: take every PDF in it.
            for root, dirs, files in os.walk(pattern):
                for file in files:
                    if file.lower().endswith('.pdf'):
                        pdf_paths.append(os.path.join(root, file))
                if not args.recursive:
                    # os.walk yields the top directory first; stop there.
                    break
        elif os.path.isfile(pattern) and pattern.lower().endswith('.pdf'):
            pdf_paths.append(pattern)
        else:
            print(f"警告: 跳过无效文件 {pattern}")

    # The output file must never be an input: it is truncated when opened
    # for writing, and a rerun would merge the old result into the new one.
    output_abs = os.path.normcase(os.path.abspath(args.output))
    inputs = []
    for path in pdf_paths:
        if os.path.normcase(os.path.abspath(path)) == output_abs:
            print(f"警告: 跳过输出文件本身 {path}")
        else:
            inputs.append(path)
    pdf_paths = inputs

    if not pdf_paths:
        print("错误: 未找到有效的PDF文件")
        sys.exit(1)

    if args.sort:
        pdf_paths.sort()

    # Show what will be merged.
    print(f"将合并以下 {len(pdf_paths)} 个文件:")
    for i, path in enumerate(pdf_paths, 1):
        print(f" {i:3d}. {os.path.basename(path)}")

    try:
        writer = PdfFileWriter()
        total_pages = 0
        # PyPDF4 reads page objects lazily, so the inputs must stay open
        # until write() has finished.
        with ExitStack() as open_files:
            for path in pdf_paths:
                f = open_files.enter_context(open(path, 'rb'))
                reader = PdfFileReader(f)
                pages = reader.getNumPages()
                total_pages += pages
                for page_num in range(pages):
                    writer.addPage(reader.getPage(page_num))
            with open(args.output, 'wb') as output_file:
                writer.write(output_file)
        print(f"\n✅ 成功合并 {len(pdf_paths)} 个PDF,共 {total_pages} 页")
        print(f"📄 输出文件: {os.path.abspath(args.output)}")
    except Exception as e:
        print(f"❌ 合并失败: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
六、GUI界面版本(使用Tkinter)
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
import os
from PyPDF4 import PdfFileReader, PdfFileWriter
class PDFMergerGUI:
    """Tkinter window for choosing, ordering and merging PDF files."""

    def __init__(self, root):
        self.root = root
        self.root.title("PDF合并工具")
        self.root.geometry("600x500")
        # Full paths of the files to merge, in the same order as the rows
        # shown in the Listbox.
        self.pdf_files = []
        self.create_widgets()

    def create_widgets(self):
        """Build the buttons, file list, output entry and progress bar."""
        # Add / clear buttons
        btn_frame = tk.Frame(self.root)
        btn_frame.pack(pady=10)
        tk.Button(btn_frame, text="添加PDF文件", command=self.add_files).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="添加文件夹", command=self.add_folder).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="清空列表", command=self.clear_list).pack(side=tk.LEFT, padx=5)

        # File list with a vertical scrollbar
        list_frame = tk.LabelFrame(self.root, text="待合并文件列表")
        list_frame.pack(padx=10, pady=5, fill=tk.BOTH, expand=True)
        scrollbar = tk.Scrollbar(list_frame)
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
        self.listbox = tk.Listbox(list_frame, yscrollcommand=scrollbar.set, selectmode=tk.EXTENDED)
        self.listbox.pack(fill=tk.BOTH, expand=True)
        scrollbar.config(command=self.listbox.yview)

        # Reorder / delete buttons
        control_frame = tk.Frame(self.root)
        control_frame.pack(pady=10)
        tk.Button(control_frame, text="上移", command=self.move_up).pack(side=tk.LEFT, padx=5)
        tk.Button(control_frame, text="下移", command=self.move_down).pack(side=tk.LEFT, padx=5)
        tk.Button(control_frame, text="删除选中", command=self.delete_selected).pack(side=tk.LEFT, padx=5)

        # Output file name
        output_frame = tk.Frame(self.root)
        output_frame.pack(pady=10)
        tk.Label(output_frame, text="输出文件名:").pack(side=tk.LEFT)
        self.output_var = tk.StringVar(value="merged.pdf")
        tk.Entry(output_frame, textvariable=self.output_var, width=30).pack(side=tk.LEFT, padx=5)

        # Merge button
        tk.Button(self.root, text="开始合并", command=self.merge_pdfs,
                  bg="green", fg="white", font=("Arial", 12, "bold"),
                  padx=20, pady=5).pack(pady=15)

        # Progress bar; created here but only packed while a merge runs.
        self.progress = ttk.Progressbar(self.root, length=400, mode='indeterminate')

    def add_files(self):
        """Ask for PDF files and append the ones not already listed."""
        files = filedialog.askopenfilenames(
            title="选择PDF文件",
            filetypes=[("PDF文件", "*.pdf"), ("所有文件", "*.*")]
        )
        for file in files:
            if file not in self.pdf_files:
                self.pdf_files.append(file)
                self.listbox.insert(tk.END, os.path.basename(file))

    def add_folder(self):
        """Ask for a folder and append every PDF directly inside it."""
        folder = filedialog.askdirectory(title="选择文件夹")
        if not folder:
            return
        for file in os.listdir(folder):
            if file.lower().endswith('.pdf'):
                full_path = os.path.join(folder, file)
                if full_path not in self.pdf_files:
                    self.pdf_files.append(full_path)
                    self.listbox.insert(tk.END, file)

    def clear_list(self):
        """Remove every file from the list."""
        self.pdf_files.clear()
        self.listbox.delete(0, tk.END)

    def move_up(self):
        """Move the first selected file one position up."""
        selected = self.listbox.curselection()
        if selected and selected[0] > 0:
            index = selected[0]
            self.pdf_files[index], self.pdf_files[index-1] = self.pdf_files[index-1], self.pdf_files[index]
            self.refresh_listbox()
            self.listbox.select_set(index-1)

    def move_down(self):
        """Move the first selected file one position down."""
        selected = self.listbox.curselection()
        if selected and selected[0] < len(self.pdf_files) - 1:
            index = selected[0]
            self.pdf_files[index], self.pdf_files[index+1] = self.pdf_files[index+1], self.pdf_files[index]
            self.refresh_listbox()
            self.listbox.select_set(index+1)

    def delete_selected(self):
        """Remove all selected files from the list."""
        selected = self.listbox.curselection()
        # Delete from the end so the remaining indices stay valid.
        for index in reversed(selected):
            del self.pdf_files[index]
        self.refresh_listbox()

    def refresh_listbox(self):
        """Redraw the Listbox from self.pdf_files."""
        self.listbox.delete(0, tk.END)
        for file in self.pdf_files:
            self.listbox.insert(tk.END, os.path.basename(file))

    def _hide_progress(self):
        """Stop the progress bar and remove it from the layout."""
        self.progress.stop()
        self.progress.pack_forget()

    def merge_pdfs(self):
        """Merge the listed files into the chosen output and report the result."""
        # Function-scope import keeps this snippet self-contained.
        from contextlib import ExitStack

        if not self.pdf_files:
            messagebox.showwarning("警告", "请先添加PDF文件")
            return
        output_file = self.output_var.get()
        if not output_file:
            output_file = "merged.pdf"

        # Show the progress bar for the duration of the merge.
        self.progress.pack(pady=10)
        self.progress.start()
        try:
            writer = PdfFileWriter()
            total_pages = 0
            # PyPDF4 reads page objects lazily, so the inputs must stay
            # open until write() has finished.
            with ExitStack() as open_files:
                for pdf_file in self.pdf_files:
                    f = open_files.enter_context(open(pdf_file, 'rb'))
                    reader = PdfFileReader(f)
                    pages = reader.getNumPages()
                    total_pages += pages
                    for page_num in range(pages):
                        writer.addPage(reader.getPage(page_num))
                with open(output_file, 'wb') as output:
                    writer.write(output)
        except Exception as e:
            self._hide_progress()
            messagebox.showerror("错误", f"合并失败: {str(e)}")
        else:
            self._hide_progress()
            messagebox.showinfo("成功",
                                f"合并完成!\n\n共合并 {len(self.pdf_files)} 个文件\n总页数: {total_pages}\n输出文件: {output_file}")


if __name__ == "__main__":
    root = tk.Tk()
    app = PDFMergerGUI(root)
    root.mainloop()
七、最佳实践与注意事项
1. 处理大文件时的内存优化
def merge_large_pdfs(pdf_paths, output_path, batch_size=100):
    """
    Merge PDFs with PyPDF4, reporting progress every *batch_size* pages.

    Args:
        pdf_paths: Iterable of input PDF paths, in merge order.
        output_path: Path of the merged PDF to write.
        batch_size: Number of pages added between two progress messages.
            Must be a positive integer.

    Raises:
        ValueError: If batch_size is smaller than 1.

    NOTE(review): the batches only pace the progress output. All pages are
    still held by one writer until the final write, so whether this lowers
    peak memory should be measured rather than assumed.
    """
    # Function-scope import keeps this snippet self-contained.
    from contextlib import ExitStack

    # range() rejects a zero step with an unrelated-looking error, and a
    # negative step would silently merge nothing.
    if batch_size < 1:
        raise ValueError("batch_size 必须为正整数")

    writer = PdfFileWriter()
    # PyPDF4 reads page objects lazily, so the inputs must stay open until
    # write() has finished.
    with ExitStack() as open_files:
        for path in pdf_paths:
            f = open_files.enter_context(open(path, 'rb'))
            reader = PdfFileReader(f)
            total_pages = reader.getNumPages()
            for start in range(0, total_pages, batch_size):
                end = min(start + batch_size, total_pages)
                for page_num in range(start, end):
                    writer.addPage(reader.getPage(page_num))
                print(f"处理中: {os.path.basename(path)} ({end}/{total_pages}页)")
        with open(output_path, 'wb') as output_file:
            writer.write(output_file)
2. 错误处理增强版
def safe_merge_pdfs(pdf_paths, output_path):
    """
    Validate the input files, then merge the usable ones.

    Every path is checked for existence, a ".pdf" suffix, and whether
    PdfFileReader can open it. Problems are listed on stdout and the
    remaining files are passed to merge_pdfs_pypdf4().

    Raises:
        ValueError: If pdf_paths is empty.
        Exception: If none of the files passes validation.
    """
    if not pdf_paths:
        raise ValueError("PDF文件列表为空")

    def _problem_with(candidate):
        """Return an error message for *candidate*, or None if it is usable."""
        if not os.path.exists(candidate):
            return f"文件不存在: {candidate}"
        if not candidate.lower().endswith('.pdf'):
            return f"不是PDF文件: {candidate}"
        try:
            with open(candidate, 'rb') as handle:
                # Constructing the reader is the validity probe.
                PdfFileReader(handle)
        except Exception as exc:
            return f"无效的PDF文件 {candidate}: {str(exc)}"
        return None

    # Sort each path into the accepted list or the problem report.
    valid_files, errors = [], []
    for path in pdf_paths:
        problem = _problem_with(path)
        if problem is None:
            valid_files.append(path)
        else:
            errors.append(problem)

    if errors:
        print("发现以下问题:")
        for message in errors:
            print(f" - {message}")

    if not valid_files:
        raise Exception("没有有效的PDF文件可供合并")

    print(f"找到 {len(valid_files)} 个有效PDF文件")
    # Hand the surviving files to the basic merge routine.
    return merge_pdfs_pypdf4(valid_files, output_path)
3. 性能优化建议
使用PyMuPDF处理大量文件:性能比PyPDF2快5-10倍
批量处理:超过100个文件时考虑分批处理
关闭不必要的功能:如不需要书签,避免相关操作
八、完整项目结构示例
pdf_merger_project/
├── pdf_merger/ # 主包
│ ├── __init__.py
│ ├── core.py # 核心合并功能
│ ├── cli.py # 命令行接口
│ ├── gui.py # GUI界面
│ └── utils.py # 工具函数
├── requirements.txt # 依赖列表
├── setup.py # 安装配置
└── README.md # 说明文档
requirements.txt
PyPDF4>=1.27.0
pikepdf>=5.0.0
PyMuPDF>=1.19.0
# tkinter 属于 Python 标准库(通常随 Python 自带),无法通过 pip 安装,因此不应作为依赖项列出
tqdm>=4.60.0 # 可选,用于进度条
九、常见问题解决
Q1: 合并后文件过大?
解决方案:
# 压缩PDF(使用pikepdf)
import pikepdf
def compress_pdf(input_path, output_path):
    """
    Re-save a PDF with pikepdf using its size-reducing save options.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path of the re-saved PDF.

    compress_streams=True alone is already pikepdf's default, so it is
    combined with re-compressing existing Flate streams and packing
    objects into object streams.
    """
    with pikepdf.open(input_path) as pdf:
        pdf.save(
            output_path,
            compress_streams=True,
            recompress_flate=True,
            object_stream_mode=pikepdf.ObjectStreamMode.generate,
        )
Q2: 中文文件名乱码?
# 正确处理中文路径(Windows)
import sys
def get_short_path(long_path):
    """
    Return the Windows short (8.3) form of *long_path*.

    On platforms other than Windows the path is returned unchanged, so
    callers can use this helper unconditionally instead of hitting a
    missing function or module there.

    Requires pywin32 on Windows.
    """
    if sys.platform != 'win32':
        return long_path
    # Imported lazily: pywin32 only exists on Windows.
    import win32api
    return win32api.GetShortPathName(long_path)
Q3: 需要保留表单字段?
# 使用pdfrw保留表单
from pdfrw import PdfReader, PdfWriter
def merge_preserve_forms(pdf_paths, output_path):
    """
    Merge the given PDFs with pdfrw.

    Args:
        pdf_paths: Iterable of input PDF paths, in merge order.
        output_path: Path of the merged PDF to write.
    """
    merged = PdfWriter()
    # map() is lazy, so each file is parsed right before its pages are added.
    for source in map(PdfReader, pdf_paths):
        merged.addpages(source.pages)
    merged.write(output_path)
总结
这个完整的PDF合并指南涵盖了:
基础合并:使用PyPDF4/PyPDF2的基本方法
高级功能:进度显示、书签添加、批量处理
不同场景:命令行工具、GUI界面
性能优化:处理大文件、错误处理
扩展方案:保留表单、压缩文件等
您可以根据具体需求选择合适的方案。对于大多数日常使用,方法1(PyPDF4) 和 命令行工具 的组合是最实用和高效的。
需要特定功能或遇到问题,请随时询问!