阳江市本地信息网

Python合并多个PDF文件的完整指南与实践

2026-04-03 11:27:02 浏览次数:1
详细信息

一、核心原理与库选择

主要Python库对比

库名称 优点 缺点 适用场景
PyPDF2 / PyPDF4(PyPDF4 是 PyPDF2 的分支;目前持续维护的后继项目为 pypdf) 纯Python实现,轻量级,功能全面 对某些复杂PDF支持有限 一般PDF操作,简单合并
pdfrw 轻量快速,支持注释保留 功能相对较少 需要保留表单字段的合并
pikepdf 基于C++,性能好,支持现代PDF 安装稍复杂(需要C++库) 处理加密、损坏PDF
PyMuPDF (fitz) 功能最强大,性能最佳 API稍复杂 需要提取文本、图像等高级操作

推荐选择:对于大多数合并需求,PyPDF2/PyPDF4 是最简单直接的选择。

二、安装与环境准备

# 安装 PyPDF4(PyPDF2 的一个分支,本文示例均基于它的 API;新项目可考虑仍在维护的 pypdf)
pip install PyPDF4

# 或者安装其他库
pip install pikepdf
pip install pymupdf

三、基础合并方法

方法1:使用 PyPDF4(最常用)

import os
from PyPDF4 import PdfFileReader, PdfFileWriter

def merge_pdfs_pypdf4(pdf_paths, output_path):
    """
    Merge several PDF files into a single file using PyPDF4.

    Inputs that cannot be opened or parsed are reported on stdout and
    skipped; the remaining inputs are still merged.

    Args:
        pdf_paths: Iterable of input PDF paths, merged in iteration order.
        output_path: Path of the merged PDF to write.
    """
    # Local import so the snippet stays self-contained.
    from contextlib import ExitStack

    pdf_writer = PdfFileWriter()

    # PyPDF4 pulls page content from the source streams lazily while the
    # writer is writing, so every input must stay open until write() returns.
    # ExitStack closes all of them afterwards, even if an error occurs.
    with ExitStack() as open_files:
        for path in pdf_paths:
            try:
                pdf_file = open_files.enter_context(open(path, 'rb'))
                pdf_reader = PdfFileReader(pdf_file)

                num_pages = pdf_reader.getNumPages()
                print(f"正在处理: {os.path.basename(path)} (共{num_pages}页)")

                # Append the pages one by one to the output document.
                for page_num in range(num_pages):
                    pdf_writer.addPage(pdf_reader.getPage(page_num))

            except Exception as e:
                # Best effort: report the broken input and carry on.
                print(f"处理文件 {path} 时出错: {str(e)}")
                continue

        with open(output_path, 'wb') as output_file:
            pdf_writer.write(output_file)

    print(f"合并完成!输出文件: {output_path}")

# 使用示例
if __name__ == "__main__":
    # Name of the merged file.
    output_file = "merged_output.pdf"

    # Collect every PDF in the current directory, sorted by file name.
    # The output file itself is excluded so that re-running the script does
    # not merge a previous result into the new one.
    pdf_files = sorted(
        f for f in os.listdir('.')
        if f.lower().endswith('.pdf') and f != output_file
    )

    merge_pdfs_pypdf4(pdf_files, output_file)

方法2:使用 PyMuPDF(fitz)- 性能最佳

import fitz  # PyMuPDF
import os

def merge_pdfs_pymupdf(pdf_paths, output_path):
    """
    Merge several PDF files into a single file using PyMuPDF.

    Inputs that cannot be opened or inserted are reported on stdout and
    skipped; the remaining inputs are still merged.

    Args:
        pdf_paths: Iterable of input PDF paths, merged in iteration order.
        output_path: Path of the merged PDF to write.
    """
    # The context manager closes the result document even when save() raises,
    # which the explicit close() after save() did not guarantee.
    with fitz.open() as result_pdf:
        for pdf_path in pdf_paths:
            try:
                with fitz.open(pdf_path) as pdf:
                    # Insert the whole source document at the end.
                    result_pdf.insert_pdf(pdf)
                    print(f"已添加: {os.path.basename(pdf_path)} (共{len(pdf)}页)")
            except Exception as e:
                print(f"处理 {pdf_path} 时出错: {e}")
                continue

        result_pdf.save(output_path)

    print(f"合并完成!文件保存至: {output_path}")

四、高级功能实现

1. 带进度条的合并工具

import os
from PyPDF4 import PdfFileReader, PdfFileWriter
from tqdm import tqdm  # 需要安装: pip install tqdm

def merge_pdfs_with_progress(pdf_paths, output_path):
    """
    Merge several PDF files with PyPDF4 while showing a tqdm progress bar.

    Each input is opened once: a first pass counts the pages (to size the
    progress bar), and a second pass copies them into the output. Inputs
    that cannot be read are reported with a warning and skipped.

    Args:
        pdf_paths: Sequence of input PDF paths, merged in order.
        output_path: Path of the merged PDF to write.
    """
    # Local import so the snippet stays self-contained.
    from contextlib import ExitStack

    pdf_writer = PdfFileWriter()
    total_pages = 0
    readable = []  # (path, reader, page_count) for every input that parsed

    # PyPDF4 reads page content lazily while writing, so the inputs must
    # stay open until write() has finished.
    with ExitStack() as open_files:
        print("正在扫描PDF文件...")
        for path in pdf_paths:
            try:
                pdf_file = open_files.enter_context(open(path, 'rb'))
                pdf_reader = PdfFileReader(pdf_file)
                page_count = pdf_reader.getNumPages()
            except Exception:
                # Not a bare except: Ctrl-C must still interrupt the scan.
                print(f"警告: 无法读取 {path}")
                continue
            readable.append((path, pdf_reader, page_count))
            total_pages += page_count

        print(f"共发现 {len(pdf_paths)} 个PDF文件,总计约 {total_pages} 页")

        # The context manager closes the bar even if merging fails.
        with tqdm(total=total_pages, desc="合并进度", unit="页") as pbar:
            for path, pdf_reader, page_count in readable:
                # Set once per file; doing it per page forces a redraw
                # on every page.
                pbar.set_postfix(file=os.path.basename(path)[:20])
                try:
                    for page_num in range(page_count):
                        pdf_writer.addPage(pdf_reader.getPage(page_num))
                        pbar.update(1)
                except Exception as e:
                    print(f"\n处理 {path} 时出错: {e}")
                    continue

        with open(output_path, 'wb') as output_file:
            pdf_writer.write(output_file)

    print(f"\n✅ 合并完成!输出文件: {output_path}")

2. 智能排序与批量处理

import os
import glob
import re
from PyPDF4 import PdfFileReader, PdfFileWriter

class PDFMerger:
    """Find, order and merge PDF files with PyPDF4."""

    def __init__(self):
        # Writer that receives the pages of the next merge_files() call.
        self.writer = PdfFileWriter()

    def find_pdfs(self, directory, pattern="*.pdf", recursive=False):
        """
        Return the paths in ``directory`` that match ``pattern``.

        Args:
            directory: Directory to search.
            pattern: glob pattern for the file names.
            recursive: When true, sub-directories are searched as well.

        Returns:
            List of matching paths, in the order glob yields them.
        """
        if recursive:
            search_pattern = os.path.join(directory, "**", pattern)
            return glob.glob(search_pattern, recursive=True)

        return glob.glob(os.path.join(directory, pattern))

    def natural_sort(self, pdf_list):
        """
        Return ``pdf_list`` sorted by base name in natural order.

        Digit runs are compared as numbers and everything else
        case-insensitively, so ``file2.pdf`` sorts before ``file10.pdf``.
        """
        def natural_key(text):
            # re.split with a capturing group alternates text and digit runs,
            # so equal positions always hold values of the same type.
            return [int(c) if c.isdigit() else c.lower()
                    for c in re.split(r'(\d+)', text)]

        return sorted(pdf_list, key=lambda x: natural_key(os.path.basename(x)))

    def merge_directory(self, directory, output_file, recursive=False, sort_natural=True):
        """
        Merge every PDF found in ``directory`` into ``output_file``.

        Args:
            directory: Directory to search for ``*.pdf`` files.
            output_file: Path of the merged PDF to write.
            recursive: When true, sub-directories are searched as well.
            sort_natural: When true, files are merged in natural name order.

        Returns:
            False when no PDF was found, otherwise the result of merge_files().
        """
        pdf_files = self.find_pdfs(directory, recursive=recursive)

        if not pdf_files:
            print(f"在目录 {directory} 中未找到PDF文件")
            return False

        print(f"找到 {len(pdf_files)} 个PDF文件")

        if sort_natural:
            pdf_files = self.natural_sort(pdf_files)

        # Show the merge order before starting.
        for i, f in enumerate(pdf_files, 1):
            print(f"{i:3d}. {os.path.basename(f)}")

        return self.merge_files(pdf_files, output_file)

    def merge_files(self, pdf_paths, output_path):
        """
        Merge the given files, in order, into ``output_path``.

        Inputs that cannot be read are reported on stdout and skipped.

        Args:
            pdf_paths: Iterable of input PDF paths.
            output_path: Path of the merged PDF to write.

        Returns:
            True once the output file has been written.
        """
        # Local import so the snippet stays self-contained.
        from contextlib import ExitStack

        try:
            # PyPDF4 reads page content lazily while writing, so the inputs
            # must stay open until write() has finished.
            with ExitStack() as open_files:
                for path in pdf_paths:
                    try:
                        f = open_files.enter_context(open(path, 'rb'))
                        pdf_reader = PdfFileReader(f)
                        page_count = pdf_reader.getNumPages()

                        for page_num in range(page_count):
                            self.writer.addPage(pdf_reader.getPage(page_num))

                        print(f"✓ 已添加: {os.path.basename(path)} ({page_count}页)")

                    except Exception as e:
                        print(f"✗ 处理 {path} 失败: {e}")
                        continue

                with open(output_path, 'wb') as output_file:
                    self.writer.write(output_file)
        finally:
            # The used writer now refers to closed source files; swap in a
            # fresh one so the instance can be reused for another merge.
            self.writer = PdfFileWriter()

        print(f"\n🎉 合并完成!文件保存至: {output_path}")
        return True

# 使用示例
if __name__ == "__main__":
    # Merge every PDF found in the current directory into merged.pdf.
    source_dir, merged_name = ".", "merged.pdf"
    merger = PDFMerger()
    merger.merge_directory(source_dir, merged_name)

    # Alternatively, merge an explicit list of files:
    # files = ["file1.pdf", "file2.pdf", "file3.pdf"]
    # merger.merge_files(files, "output.pdf")

3. 添加书签/目录功能

from PyPDF4 import PdfFileReader, PdfFileWriter

def merge_with_bookmarks(pdf_paths, output_path, bookmark_names=None):
    """
    Merge PDF files with PyPDF4 and add one bookmark per input file.

    Each bookmark points at the first page that its input contributed to
    the merged document. Inputs without pages get no bookmark because
    there is no page to point at.

    Args:
        pdf_paths: Iterable of input PDF paths, merged in iteration order.
        output_path: Path of the merged PDF to write.
        bookmark_names: Optional bookmark titles, matched to ``pdf_paths``
            by position. Missing titles fall back to ``文档_<n>``.
    """
    # Local import so the snippet stays self-contained.
    from contextlib import ExitStack

    writer = PdfFileWriter()
    bookmarks = []

    # Index, in the merged document, of the next input's first page.
    start_page = 0

    # PyPDF4 reads page content lazily while writing, so the inputs must
    # stay open until write() has finished.
    with ExitStack() as open_files:
        for idx, path in enumerate(pdf_paths):
            f = open_files.enter_context(open(path, 'rb'))
            reader = PdfFileReader(f)
            total_pages = reader.getNumPages()

            if total_pages == 0:
                # A bookmark here would target a page that does not exist.
                continue

            for page_num in range(total_pages):
                writer.addPage(reader.getPage(page_num))

            if bookmark_names and idx < len(bookmark_names):
                bookmark_name = bookmark_names[idx]
            else:
                bookmark_name = f"文档_{idx+1}"

            bookmarks.append(writer.addBookmark(bookmark_name, start_page))

            print(f"添加: {bookmark_name} (第{start_page+1}-{start_page+total_pages}页)")
            start_page += total_pages

        with open(output_path, 'wb') as output_file:
            writer.write(output_file)

    print(f"合并完成,共添加 {len(bookmarks)} 个书签")

# 使用示例
if __name__ == "__main__":
    # Bookmark titles, matched by position to the input files below.
    names = ["第一章", "第二章", "附录"]
    files = ["chap1.pdf", "chap2.pdf", "appendix.pdf"]
    merge_with_bookmarks(
        pdf_paths=files,
        output_path="book_with_toc.pdf",
        bookmark_names=names,
    )

五、命令行工具实现

#!/usr/bin/env python3
"""
pdf_merger.py - PDF合并命令行工具
用法: python pdf_merger.py [-h] [-o OUTPUT] [-r] [-s] pdf_files [pdf_files ...]
"""

import argparse
import sys
import os
from PyPDF4 import PdfFileReader, PdfFileWriter

def main():
    """
    Command-line entry point: merge the given PDF files and directories.

    Exits with status 1 when no valid input is found or when merging fails.
    """
    # Local import so the snippet stays self-contained.
    from contextlib import ExitStack

    parser = argparse.ArgumentParser(description='合并多个PDF文件')
    parser.add_argument('pdf_files', nargs='+', help='要合并的PDF文件')
    parser.add_argument('-o', '--output', default='merged.pdf',
                        help='输出文件名 (默认: merged.pdf)')
    parser.add_argument('-r', '--recursive', action='store_true',
                        help='递归处理目录')
    parser.add_argument('-s', '--sort', action='store_true',
                        help='按文件名排序')

    args = parser.parse_args()

    pdf_paths = _collect_pdf_paths(args.pdf_files, args.recursive)

    # Never read the file we are about to overwrite: the inputs stay open
    # while the output is written, so it would be truncated mid-merge.
    output_abs = os.path.abspath(args.output)
    usable_paths = []
    for path in pdf_paths:
        if os.path.abspath(path) == output_abs:
            print(f"警告: 跳过输出文件本身 {path}")
        else:
            usable_paths.append(path)
    pdf_paths = usable_paths

    if not pdf_paths:
        print("错误: 未找到有效的PDF文件")
        sys.exit(1)

    if args.sort:
        pdf_paths.sort()

    # Show the merge order before starting.
    print(f"将合并以下 {len(pdf_paths)} 个文件:")
    for i, path in enumerate(pdf_paths, 1):
        print(f"  {i:3d}. {os.path.basename(path)}")

    try:
        writer = PdfFileWriter()
        total_pages = 0

        # PyPDF4 reads page content lazily while writing, so the inputs
        # must stay open until write() has finished.
        with ExitStack() as open_files:
            for path in pdf_paths:
                f = open_files.enter_context(open(path, 'rb'))
                reader = PdfFileReader(f)
                pages = reader.getNumPages()
                total_pages += pages

                for page_num in range(pages):
                    writer.addPage(reader.getPage(page_num))

            with open(args.output, 'wb') as output_file:
                writer.write(output_file)

        print(f"\n✅ 成功合并 {len(pdf_paths)} 个PDF,共 {total_pages} 页")
        print(f"📄 输出文件: {os.path.abspath(args.output)}")

    except Exception as e:
        print(f"❌ 合并失败: {e}")
        sys.exit(1)


def _collect_pdf_paths(patterns, recursive):
    """Expand command-line arguments (files or directories) into PDF paths."""
    pdf_paths = []
    for pattern in patterns:
        if os.path.isdir(pattern):
            for root, _dirs, files in os.walk(pattern):
                pdf_paths.extend(
                    os.path.join(root, name)
                    for name in files
                    if name.lower().endswith('.pdf')
                )
                if not recursive:
                    # os.walk yields the top directory first; stop there.
                    break
        elif os.path.isfile(pattern) and pattern.lower().endswith('.pdf'):
            pdf_paths.append(pattern)
        else:
            print(f"警告: 跳过无效文件 {pattern}")
    return pdf_paths


if __name__ == "__main__":
    main()

六、GUI界面版本(使用Tkinter)

import tkinter as tk
from tkinter import filedialog, messagebox, ttk
import os
from PyPDF4 import PdfFileReader, PdfFileWriter

class PDFMergerGUI:
    """Tkinter front end for choosing, ordering and merging PDF files."""

    def __init__(self, root):
        """
        Configure the main window and build the widgets.

        Args:
            root: The Tk root window that hosts the UI.
        """
        self.root = root
        self.root.title("PDF合并工具")
        self.root.geometry("600x500")

        # Full paths of the files to merge, in merge order. The listbox
        # shows the base names in the same order.
        self.pdf_files = []

        self.create_widgets()

    def create_widgets(self):
        """Create and lay out all widgets of the main window."""
        # Buttons for filling the file list.
        btn_frame = tk.Frame(self.root)
        btn_frame.pack(pady=10)

        tk.Button(btn_frame, text="添加PDF文件", command=self.add_files).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="添加文件夹", command=self.add_folder).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="清空列表", command=self.clear_list).pack(side=tk.LEFT, padx=5)

        # File list with a vertical scrollbar.
        list_frame = tk.LabelFrame(self.root, text="待合并文件列表")
        list_frame.pack(padx=10, pady=5, fill=tk.BOTH, expand=True)

        scrollbar = tk.Scrollbar(list_frame)
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

        self.listbox = tk.Listbox(list_frame, yscrollcommand=scrollbar.set, selectmode=tk.EXTENDED)
        self.listbox.pack(fill=tk.BOTH, expand=True)
        scrollbar.config(command=self.listbox.yview)

        # Buttons for reordering and removing entries.
        control_frame = tk.Frame(self.root)
        control_frame.pack(pady=10)

        tk.Button(control_frame, text="上移", command=self.move_up).pack(side=tk.LEFT, padx=5)
        tk.Button(control_frame, text="下移", command=self.move_down).pack(side=tk.LEFT, padx=5)
        tk.Button(control_frame, text="删除选中", command=self.delete_selected).pack(side=tk.LEFT, padx=5)

        # Output file name entry.
        output_frame = tk.Frame(self.root)
        output_frame.pack(pady=10)

        tk.Label(output_frame, text="输出文件名:").pack(side=tk.LEFT)
        self.output_var = tk.StringVar(value="merged.pdf")
        tk.Entry(output_frame, textvariable=self.output_var, width=30).pack(side=tk.LEFT, padx=5)

        # Merge button.
        tk.Button(self.root, text="开始合并", command=self.merge_pdfs,
                  bg="green", fg="white", font=("Arial", 12, "bold"),
                  padx=20, pady=5).pack(pady=15)

        # Progress bar; it is only packed while a merge is running.
        self.progress = ttk.Progressbar(self.root, length=400, mode='indeterminate')

    def add_files(self):
        """Ask for PDF files and append those not already listed."""
        files = filedialog.askopenfilenames(
            title="选择PDF文件",
            filetypes=[("PDF文件", "*.pdf"), ("所有文件", "*.*")]
        )
        for file in files:
            if file not in self.pdf_files:
                self.pdf_files.append(file)
                self.listbox.insert(tk.END, os.path.basename(file))

    def add_folder(self):
        """Ask for a folder and append its PDF files not already listed."""
        folder = filedialog.askdirectory(title="选择文件夹")
        if folder:
            # os.listdir() returns names in arbitrary order; sort them so
            # the default merge order is predictable.
            for file in sorted(os.listdir(folder)):
                if file.lower().endswith('.pdf'):
                    full_path = os.path.join(folder, file)
                    if full_path not in self.pdf_files:
                        self.pdf_files.append(full_path)
                        self.listbox.insert(tk.END, file)

    def clear_list(self):
        """Remove every file from the list."""
        self.pdf_files.clear()
        self.listbox.delete(0, tk.END)

    def move_up(self):
        """Move the first selected entry one position up."""
        selected = self.listbox.curselection()
        if selected and selected[0] > 0:
            index = selected[0]
            self.pdf_files[index], self.pdf_files[index-1] = self.pdf_files[index-1], self.pdf_files[index]
            self.refresh_listbox()
            # Keep the moved entry selected so it can be moved again.
            self.listbox.select_set(index-1)

    def move_down(self):
        """Move the first selected entry one position down."""
        selected = self.listbox.curselection()
        if selected and selected[0] < len(self.pdf_files) - 1:
            index = selected[0]
            self.pdf_files[index], self.pdf_files[index+1] = self.pdf_files[index+1], self.pdf_files[index]
            self.refresh_listbox()
            # Keep the moved entry selected so it can be moved again.
            self.listbox.select_set(index+1)

    def delete_selected(self):
        """Remove all selected entries from the list."""
        selected = self.listbox.curselection()
        # Delete from the end so the remaining indices stay valid.
        for index in reversed(selected):
            del self.pdf_files[index]
        self.refresh_listbox()

    def refresh_listbox(self):
        """Rebuild the listbox from ``self.pdf_files``."""
        self.listbox.delete(0, tk.END)
        for file in self.pdf_files:
            self.listbox.insert(tk.END, os.path.basename(file))

    def merge_pdfs(self):
        """Merge the listed files into the chosen output file and report the result."""
        # Local import so the snippet stays self-contained.
        from contextlib import ExitStack

        if not self.pdf_files:
            messagebox.showwarning("警告", "请先添加PDF文件")
            return

        output_file = self.output_var.get()
        if not output_file:
            output_file = "merged.pdf"

        self.progress.pack(pady=10)
        self.progress.start()

        error = None
        total_pages = 0
        try:
            writer = PdfFileWriter()

            # PyPDF4 reads page content lazily while writing, so the inputs
            # must stay open until write() has finished.
            with ExitStack() as open_files:
                for pdf_file in self.pdf_files:
                    f = open_files.enter_context(open(pdf_file, 'rb'))
                    reader = PdfFileReader(f)
                    pages = reader.getNumPages()
                    total_pages += pages

                    for page_num in range(pages):
                        writer.addPage(reader.getPage(page_num))

                with open(output_file, 'wb') as output:
                    writer.write(output)
        except Exception as e:
            error = e
        finally:
            # Hide the progress bar before any message box is shown.
            self.progress.stop()
            self.progress.pack_forget()

        if error is not None:
            messagebox.showerror("错误", f"合并失败: {str(error)}")
        else:
            messagebox.showinfo("成功",
                f"合并完成!\n\n共合并 {len(self.pdf_files)} 个文件\n总页数: {total_pages}\n输出文件: {output_file}")

if __name__ == "__main__":
    # Create the Tk root window, attach the merger UI to it, then hand
    # control to the Tk event loop.
    root = tk.Tk()
    app = PDFMergerGUI(root)
    root.mainloop()

七、最佳实践与注意事项

1. 处理大文件时的内存优化

def merge_large_pdfs(pdf_paths, output_path, batch_size=100):
    """
    Merge PDF files with PyPDF4, reporting progress every ``batch_size`` pages.

    All pages go into one writer that is written once at the end, so the
    batches only control how often progress is printed.

    Args:
        pdf_paths: Iterable of input PDF paths, merged in iteration order.
        output_path: Path of the merged PDF to write.
        batch_size: Number of pages added between two progress messages.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Local import so the snippet stays self-contained.
    from contextlib import ExitStack

    # Reject bad sizes up front: 0 would make range() raise halfway through,
    # and a negative step would silently skip every page.
    if batch_size < 1:
        raise ValueError(f"batch_size 必须为正整数: {batch_size}")

    writer = PdfFileWriter()

    # PyPDF4 reads page content lazily while writing, so the inputs must
    # stay open until write() has finished.
    with ExitStack() as open_files:
        for path in pdf_paths:
            f = open_files.enter_context(open(path, 'rb'))
            reader = PdfFileReader(f)
            total_pages = reader.getNumPages()

            for start in range(0, total_pages, batch_size):
                end = min(start + batch_size, total_pages)
                for page_num in range(start, end):
                    writer.addPage(reader.getPage(page_num))

                print(f"处理中: {os.path.basename(path)} ({end}/{total_pages}页)")

        with open(output_path, 'wb') as output_file:
            writer.write(output_file)

2. 错误处理增强版

def safe_merge_pdfs(pdf_paths, output_path):
    """
    Validate the inputs, then merge the valid ones with merge_pdfs_pypdf4().

    A path is kept when it exists, ends in ``.pdf`` (case-insensitive) and
    can be opened by PdfFileReader. Every rejected path is listed on stdout.

    Args:
        pdf_paths: Sequence of candidate PDF paths.
        output_path: Path of the merged PDF to write.

    Returns:
        Whatever merge_pdfs_pypdf4() returns for the valid files.

    Raises:
        ValueError: If ``pdf_paths`` is empty.
        Exception: If none of the paths passed validation.
    """
    if not pdf_paths:
        raise ValueError("PDF文件列表为空")

    usable, problems = [], []

    for candidate in pdf_paths:
        if not os.path.exists(candidate):
            problems.append(f"文件不存在: {candidate}")
            continue

        if not candidate.lower().endswith('.pdf'):
            problems.append(f"不是PDF文件: {candidate}")
            continue

        try:
            with open(candidate, 'rb') as handle:
                # Constructing the reader is the validity check.
                PdfFileReader(handle)
                usable.append(candidate)
        except Exception as exc:
            problems.append(f"无效的PDF文件 {candidate}: {str(exc)}")

    if problems:
        print("发现以下问题:")
        print(*(f"  - {problem}" for problem in problems), sep="\n")

    if not usable:
        raise Exception("没有有效的PDF文件可供合并")

    print(f"找到 {len(usable)} 个有效PDF文件")

    return merge_pdfs_pypdf4(usable, output_path)

3. 性能优化建议

- 使用PyMuPDF处理大量文件:性能比PyPDF2快5-10倍
- 批量处理:超过100个文件时考虑分批处理
- 关闭不必要的功能:如不需要书签,避免相关操作

八、完整项目结构示例

pdf_merger_project/
├── pdf_merger/           # 主包
│   ├── __init__.py
│   ├── core.py          # 核心合并功能
│   ├── cli.py           # 命令行接口
│   ├── gui.py           # GUI界面
│   └── utils.py         # 工具函数
├── requirements.txt     # 依赖列表
├── setup.py            # 安装配置
└── README.md           # 说明文档

requirements.txt

PyPDF4>=1.27.0
pikepdf>=5.0.0
PyMuPDF>=1.19.0
# tkinter 属于 Python 标准库(通常随 Python 一起安装),无法通过 pip 安装,不要写入 requirements.txt
tqdm>=4.60.0  # 可选,用于进度条

九、常见问题解决

Q1: 合并后文件过大?

解决方案

# 压缩PDF(使用pikepdf)
import pikepdf

def compress_pdf(input_path, output_path):
    """
    Re-save a PDF with pikepdf, asking it to compress its streams.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path of the re-saved PDF to write.
    """
    source = pikepdf.open(input_path)
    with source:
        source.save(output_path, compress_streams=True)

Q2: 中文文件名乱码?

# 正确处理中文路径(Windows)
import sys

if sys.platform == 'win32':
    import win32api  # provided by the pywin32 package

    def get_short_path(long_path):
        """Return the Windows short path name of ``long_path``.

        Meant as a workaround for problems with Chinese characters in
        paths, by handing the short form to code that mishandles them.
        """
        short_path = win32api.GetShortPathName(long_path)
        return short_path

Q3: 需要保留表单字段?

# 使用pdfrw保留表单
from pdfrw import PdfReader, PdfWriter

def merge_preserve_forms(pdf_paths, output_path):
    """
    Merge PDF files with pdfrw by appending every page of every input.

    NOTE(review): the article presents this as the way to keep form fields;
    that is not verified by anything in this snippet.

    Args:
        pdf_paths: Iterable of input PDF paths, merged in iteration order.
        output_path: Path of the merged PDF to write.
    """
    merged = PdfWriter()
    for source_path in pdf_paths:
        merged.addpages(PdfReader(source_path).pages)
    merged.write(output_path)

总结

这个完整的PDF合并指南涵盖了:

- 基础合并:使用PyPDF4/PyPDF2的基本方法
- 高级功能:进度显示、书签添加、批量处理
- 不同场景:命令行工具、GUI界面
- 性能优化:处理大文件、错误处理
- 扩展方案:保留表单、压缩文件等

您可以根据具体需求选择合适的方案。对于大多数日常使用,方法1(PyPDF4)+ 命令行工具的组合是最实用和高效的。

需要特定功能或遇到问题,请随时询问!

相关推荐