Bulk merge PDF files with Python (bookmarking)

Keywords: Python encoding

I found several programs online that merge PDF files, but they are not very easy to use, and most of them cannot add bookmarks.

I then looked for Python scripts that merge PDF files, but the ones I found did not support bookmarks either.

So I wrote a small tool myself using PyPDF2. The code is as follows:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
r'''
   File name: pdfmerge.py
   This script merges PDF files. The output PDF file gets bookmarks generated from the input PDF file names.
   Example of use:
   python pdfmerge.py -p "D:\pdf-files" -o "merged-out.pdf" -b True

   In this example:
   Path of the PDF files to be merged: D:\pdf-files
   Output file name of the merged PDF file: merged-out.pdf
   Whether to import the bookmarks of the input PDF files: True
'''
import os
from argparse import ArgumentParser, RawTextHelpFormatter
from PyPDF2 import PdfFileReader, PdfFileWriter, PdfFileMerger

def getfilenames(filepath='', filelist_out=None, file_ext='all'):
    """Collect the paths of the files under ``filepath``, including subdirectories.

    Args:
        filepath: Root directory to traverse with ``os.walk``.
        filelist_out: Optional list that the matching paths are appended to.
            When omitted, a new list is created for each call.
        file_ext: Extension to keep, including the leading dot (for example
            ``'.pdf'``). The comparison is exact, hence case-sensitive. The
            default ``'all'`` keeps every file.

    Returns:
        The list of matching paths, each being the walked directory joined
        with the file name. This is ``filelist_out`` itself when one was
        passed in.
    """
    # A literal ``[]`` default is created once and shared by every call, so
    # paths found by an earlier call would leak into later results.
    if filelist_out is None:
        filelist_out = []

    for fpath, _dirs, fs in os.walk(filepath):
        for f in fs:
            fi_d = os.path.join(fpath, f)
            if file_ext == 'all' or os.path.splitext(fi_d)[1] == file_ext:
                filelist_out.append(fi_d)
    return filelist_out

def mergefiles(path, output_filename, import_bookmarks=False):
    """Merge every ``.pdf`` file under ``path`` into a single PDF.

    The directory is traversed recursively. Each merged input file gets a
    bookmark in the output named after the input file (base name without
    extension). Encrypted input files are reported and skipped.

    Args:
        path: Directory containing the PDF files to merge.
        output_filename: Name of the merged file; it is written inside
            ``path``.
        import_bookmarks: Passed through to ``PdfFileMerger.append``; when
            true, the bookmarks of the input files are imported as well.
            Defaults to ``False``.
    """
    out_filename = os.path.join(os.path.abspath(path), output_filename)
    out_key = os.path.normcase(os.path.abspath(out_filename))

    # Pass a fresh list so results can never be shared between calls, and
    # sort so the merge (and bookmark) order does not depend on the order
    # in which the file system happens to list directory entries.
    filelist = sorted(getfilenames(filepath=path, filelist_out=[], file_ext='.pdf'))

    merger = PdfFileMerger()
    try:
        for filename in filelist:
            # The output lives inside ``path``; skip a previous result so a
            # re-run does not merge the old output into the new one.
            if os.path.normcase(os.path.abspath(filename)) == out_key:
                continue
            short_filename = os.path.basename(os.path.splitext(filename)[0])
            # ``with`` closes the handle on every path, including the
            # encrypted-file ``continue`` below.
            with open(filename, 'rb') as f:
                file_rd = PdfFileReader(f)
                if file_rd.isEncrypted:
                    print('Unsupported encrypted file:%s'%(filename))
                    continue
                merger.append(file_rd, bookmark=short_filename, import_bookmarks=import_bookmarks)
            print('Merge file:%s'%(filename))
        merger.write(out_filename)
        print('Combined output file:%s'%(out_filename))
    finally:
        # Release the merger even when reading or writing fails.
        merger.close()

def str2bool(value):
    """Convert a command-line value such as ``'True'`` or ``'False'`` to a bool.

    Matching ignores case and surrounding whitespace.

    Args:
        value: The raw option value (normally a string).

    Returns:
        ``True`` for true/t/yes/y/1 and ``False`` for false/f/no/n/0.

    Raises:
        ValueError: If the value is not one of the recognised spellings.
            argparse reports this as an invalid option value.
    """
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0'):
        return False
    raise ValueError('expected True or False, got %r' % (value,))


def build_arg_parser():
    """Build the command-line parser for the PDF merge script.

    Returns:
        An ``ArgumentParser`` with the ``-p/--path``, ``-o/--output`` and
        ``-b/--bookmark`` options.
    """
    # Backslashes are escaped explicitly: "\p" is an invalid escape sequence
    # in a normal string literal. The resulting text is unchanged.
    description = "\n".join([
        "",
        " This script is used to merge pdf Files, exporting pdf Files by input pdf File name generate bookmark",
        " Examples of use are as follows:",
        'python pdfmerge.py -p "D:\\pdf-files" -o "merged-out.pdf" -b True',
        "",
        "Example:",
        "To merge pdf Path of file: D:\\pdf-files",
        "Combined pdf Output file name of the file: merged-out.pdf",
        "From pdf Value of import bookmark in file: True",
    ])

    # RawTextHelpFormatter keeps the line breaks of the description.
    parser = ArgumentParser(description=description, formatter_class=RawTextHelpFormatter)

    parser.add_argument("-p", "--path",
                        dest="path",
                        default=".",
                        help="PDF File directory")
    parser.add_argument("-o", "--output",
                        dest="output_filename",
                        default="merged.pdf",
                        help="merge PDF Output file name of",
                        metavar="FILE")
    # Without a converter the value stays a string, and the non-empty string
    # "False" is truthy, so bookmarks would always be imported.
    parser.add_argument("-b", "--bookmark",
                        dest="import_bookmarks",
                        type=str2bool,
                        default=False,
                        help="From pdf Import bookmark in file, value can be'True'perhaps'False'")
    return parser


def main(argv=None):
    """Parse the command line and merge the PDF files.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        ``0`` on success, ``1`` when the merge raised an exception.
    """
    args = build_arg_parser().parse_args(argv)
    try:
        mergefiles(args.path, args.output_filename, args.import_bookmarks)
    except Exception as exc:
        # Catch Exception rather than everything, so that Ctrl-C and
        # SystemExit still propagate.
        print('Error to merge pdf file:')
        print(type(exc), exc)
        return 1
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

 

Note: in actual use, some PDF files raise errors because of character-encoding problems, which causes the merge to fail.

Posted by gwolff2005 on Sat, 02 May 2020 00:12:31 -0700