python 将pdf分页后插入至word中

所用技术

　　1. python编程基础

　　2. 使用pyPdf

　　3. 使用python操作word

　　4. 正则表达式的使用

　　5. windows的bat编程

下面是一个pyPdf库使用的示例：

    from pyPdf import PdfFileWriter, PdfFileReader

    # Example: build a new PDF from selected pages of document1.pdf.
    output = PdfFileWriter()
    input1 = PdfFileReader(file("document1.pdf", "rb"))

    # add page 1 from input1 to output document, unchanged
    output.addPage(input1.getPage(0))

    # add page 2 from input1, but rotated clockwise 90 degrees
    output.addPage(input1.getPage(1).rotateClockwise(90))

    # add page 3 from input1, rotated the other way:
    output.addPage(input1.getPage(2).rotateCounterClockwise(90))
    # alt: output.addPage(input1.getPage(2).rotateClockwise(270))

    # add page 4 from input1, but first add a watermark from another pdf:
    page4 = input1.getPage(3)
    watermark = PdfFileReader(file("watermark.pdf", "rb"))
    page4.mergePage(watermark.getPage(0))
    # the merged page still has to be added, otherwise it never reaches the output
    output.addPage(page4)

    # add page 5 from input1, but crop it to half size:
    page5 = input1.getPage(4)
    page5.mediaBox.upperRight = (
        page5.mediaBox.getUpperRight_x() / 2,
        page5.mediaBox.getUpperRight_y() / 2
    )
    output.addPage(page5)

    # print how many pages input1 has:
    print("document1.pdf has %s pages." % input1.getNumPages())

    # finally, write "output" to document-output.pdf
    outputStream = file("document-output.pdf", "wb")
    output.write(outputStream)
    # close explicitly so the written data is flushed to disk
    outputStream.close()

有了该库，就可以很容易将现有的pdf做分割。

我的需求是将每页 pdf 中的关键字提取出来，用它作为文件名。pyPdf 提供了将页面中的文字全部提取出来的方法：

inputfile.getPage(0).extractText()

这里返回的是 unicode，需要转为 str：

inputfile.getPage(0).extractText().encode("utf-8")

然后将每页的关键字提取出来，增加函数如下：

# Captures the shortest non-empty text between "Blattname: " and "project".
p_sheetName = re.compile('Blattname: (.+?)project')
def getSheetName(str):
    """Return the sheet name embedded in a page's text, or None.

    The sheet name is the text captured by p_sheetName at its first match
    in `str`; None is returned when the pattern does not occur.
    """
    found = p_sheetName.search(str)
    return found.group(1) if found else None

最终代码如下：

from pyPdf import PdfFileWriter, PdfFileReader
import re,os

# Captures the shortest non-empty text between "Blattname: " and "project".
p_sheetName = re.compile('Blattname: (.+?)project')
def getSheetName(str):
    """Extract the sheet name from the text of one page.

    Returns the group captured by the first match of p_sheetName in `str`,
    or None when the text contains no match.
    """
    hit = p_sheetName.search(str)
    if not hit:
        return None
    return hit.group(1)

def splitpdf(srcFile):
    """Split the PDF at `srcFile` into one single-page PDF per page.

    The pages are written into a directory named like `srcFile` without its
    extension; the directory is created when it does not exist yet. Each
    file is named "<page number> <sheet name>.pdf", where the sheet name is
    what getSheetName() extracts from the page text. When getSheetName()
    returns None the file is named "<page number>.pdf".

    Prints the page count and the path of every file written.
    """
    folderName = os.path.splitext(srcFile)[0]
    # "with" closes the source even if a page fails to parse or to be written.
    with file(srcFile, "rb") as input1:
        inputfile = PdfFileReader(input1)
        numofpages = inputfile.getNumPages()
        print("pages: %d" % numofpages)
        # new directory
        if not os.path.isdir(folderName):
            os.makedirs(folderName)
        for page_index in range(1, numofpages + 1):
            page = inputfile.getPage(page_index - 1)
            output = PdfFileWriter()
            output.addPage(page)

            sheetName = getSheetName(page.extractText().encode("utf-8"))
            # Avoid a literal "None" in the file name when no sheet name is found.
            if sheetName is None:
                baseName = "%d.pdf" % page_index
            else:
                baseName = "%d %s.pdf" % (page_index, sheetName)
            # save file
            saveFileName = os.path.join(folderName, baseName)
            print(saveFileName)
            with file(saveFileName, "wb") as outputFile:
                output.write(outputFile)


# Raw string: in a normal literal the "\t" of this Windows path would be read as a TAB character.
splitpdf(r"E:\test.pdf")

下一步，将 pdf 文件名参数化，改为通过命令行参数传入：

from pyPdf import PdfFileWriter, PdfFileReader
import re,sys,os,string

def translator(frm='', to='', delete='', keep=None):
    """Build and return a function that translates/filters a byte string.

    The returned function maps each character of `frm` to the character at
    the same position in `to` and drops the characters listed in `delete`.
    A one-character `to` is repeated so that it covers all of `frm`.
    When `keep` is given, the set of dropped characters is instead every
    character except those in `keep` that are not also in `delete`.

    Relies on the Python 2 API string.maketrans and the two-argument form
    str.translate(table, deletechars).
    """
    # Broadcast a single replacement character over every source character.
    if len(to) == 1:
        to = to * len(frm)
    table = string.maketrans(frm, to)

    if keep is not None:
        # Identity table: translating it yields the list of all characters.
        identity = string.maketrans('', '')
        kept = keep.translate(identity, delete)
        delete = identity.translate(identity, kept)

    def translate(s):
        return s.translate(table, delete)

    return translate

# Strips the characters that are reserved in Windows file names.
# Raw string so the backslash is taken literally; '"' is reserved as well.
delete_some_speicl = translator(delete=r'/:\?*><|"')

# Captures the shortest non-empty text between "Blattname: " and "project".
p_sheetName = re.compile('Blattname: (.+?)project')
def getSheetName(str):
    """Return the sheet name found in a page's text, made safe for file names.

    The name is the group captured by the first match of p_sheetName in
    `str`, passed through delete_some_speicl().

    Raises ValueError when the text contains no match, instead of failing
    with an AttributeError on None.
    """
    m = p_sheetName.search(str)
    if m is None:
        raise ValueError("no 'Blattname: ... project' sheet name found in page text")
    return delete_some_speicl(m.group(1))

def splitpdf(srcFile):
    """Split the PDF at `srcFile` into one single-page PDF per page.

    The pages are written into a directory named like `srcFile` without its
    extension (created when missing), each as
    "<page number> <sheet name>.pdf" with the sheet name taken from
    getSheetName().

    Any exception, including the one raised for a file whose extension is
    not ".pdf", is caught and its message printed; nothing is re-raised.
    """
    try:
        folderName, ext_ = os.path.splitext(srcFile)
        # Case-insensitive so that e.g. "X.PDF" is accepted too.
        if ext_.lower() != '.pdf':
            raise Exception(os.path.basename(srcFile) + " is not pdf!")
        # "with" closes the source even if a page fails to parse or to be written.
        with file(srcFile, "rb") as input1:
            inputfile = PdfFileReader(input1)
            numofpages = inputfile.getNumPages()
            print("pages: %d" % numofpages)
            # new directory
            if not os.path.isdir(folderName):
                os.makedirs(folderName)
            for page_index in range(1, numofpages + 1):
                page = inputfile.getPage(page_index - 1)
                output = PdfFileWriter()
                output.addPage(page)

                sheetName = getSheetName(page.extractText().encode("utf-8"))
                # save file
                saveFileName = os.path.join(folderName, "%d %s.pdf" % (page_index, sheetName))
                print(saveFileName)
                with file(saveFileName, "wb") as outputFile:
                    output.write(outputFile)
        print("Split success!")
        print("please find them at " + folderName)
    except Exception as e:
        print(e)

# Script entry point: expects the PDF path as the first command-line argument.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('usage: %s filename' % os.path.basename(sys.argv[0]))
        # A missing argument is an error, so report it with a non-zero exit status.
        sys.exit(1)
    splitpdf(sys.argv[1])

这里的 translator 函数用于过滤掉关键字中的特殊字符，因为文件名中含有这些字符时，新建文件可能会出错。

其实分开 pdf 之后还需要一些手动操作，或者要用 vba 把它们导入到 word 中。我想直接用 python 干完这些事，于是就用到了 win32com 来操作 word。

下面是使用 win32com 操作 word 的一个示例：

import win32com
from win32com.client import Dispatch, constants

# Example of driving Word through COM. filenamein, filenameout, OldStr and
# NewStr are placeholders that must be defined before running this.
w = win32com.client.Dispatch('Word.Application')
# Alternatively, start Word in an independent process:
# w = win32com.client.DispatchEx('Word.Application')

# Run in the background: no window, no alerts
w.Visible = 0
w.DisplayAlerts = 0

# Open an existing file
doc = w.Documents.Open( FileName = filenamein )
# doc = w.Documents.Add() # or create a new document instead

# Insert text
myRange = doc.Range(0,0)
myRange.InsertBefore('Hello from Python!')

# Apply a style. Range.Select() does not return the selection, so the style
# is set through the application's Selection object afterwards.
myRange.Select()
w.Selection.Style = constants.wdStyleHeading1

# Find and replace in the body text
w.Selection.Find.ClearFormatting()
w.Selection.Find.Replacement.ClearFormatting()
w.Selection.Find.Execute(OldStr, False, False, False, False, False, True, 1, True, NewStr, 2)

# Find and replace in the page header
w.ActiveDocument.Sections[0].Headers[0].Range.Find.ClearFormatting()
w.ActiveDocument.Sections[0].Headers[0].Range.Find.Replacement.ClearFormatting()
w.ActiveDocument.Sections[0].Headers[0].Range.Find.Execute(OldStr, False, False, False, False, False, True, 1, False, NewStr, 2)

# Table operations
doc.Tables[0].Rows[0].Cells[0].Range.Text ='123123'
doc.Tables[0].Rows.Add() # append a row

# Convert to HTML
wc = win32com.client.constants
w.ActiveDocument.WebOptions.RelyOnCSS = 1
w.ActiveDocument.WebOptions.OptimizeForBrowser = 1
w.ActiveDocument.WebOptions.BrowserLevel = 0 # constants.wdBrowserLevelV4
w.ActiveDocument.WebOptions.OrganizeInFolder = 0
w.ActiveDocument.WebOptions.UseLongFileNames = 1
w.ActiveDocument.WebOptions.RelyOnVML = 0
w.ActiveDocument.WebOptions.AllowPNG = 1
w.ActiveDocument.SaveAs( FileName = filenameout, FileFormat = wc.wdFormatHTML )

# Print
doc.PrintOut()

# Close
# doc.Close()
w.Documents.Close(wc.wdDoNotSaveChanges)
w.Quit()

仿照上例，修改前面的代码如下：

from pyPdf import PdfFileWriter, PdfFileReader
import re,sys,os,string,win32com
from win32com.client import Dispatch, constants
# NOTE(review): presumably run up front so that the generated Word type-library
# wrappers are loaded and `constants` (used below for wdStyleHeading1 etc.)
# can resolve its names -- confirm against the pywin32 documentation.
win32com.client.gencache.EnsureDispatch('Word.Application')


def translator(frm='', to='', delete='', keep=None):
    """Return a closure that remaps and/or removes characters of a byte string.

    Characters of `frm` are replaced by the characters at the same positions
    in `to` (a single-character `to` is repeated to the length of `frm`),
    and characters listed in `delete` are removed. If `keep` is not None,
    everything is removed except the characters of `keep` that are not
    listed in `delete`.

    Uses the Python 2 API string.maketrans together with
    str.translate(table, deletechars).
    """
    if len(to) == 1:
        # One replacement character stands in for every character of frm.
        to *= len(frm)
    mapping = string.maketrans(frm, to)

    if keep is not None:
        # The identity table doubles as the string of all characters.
        every_char = string.maketrans('', '')
        survivors = keep.translate(every_char, delete)
        delete = every_char.translate(every_char, survivors)

    def translate(s):
        return s.translate(mapping, delete)

    return translate

# Strips the characters that are reserved in Windows file names.
# Raw string so the backslash is taken literally; '"' is reserved as well.
delete_some_speicl = translator(delete=r'/:\?*><|"')

# Captures the shortest non-empty text between "Blattname: " and "project".
p_sheetName = re.compile('Blattname: (.+?)project')
def getSheetName(str):
    """Return the sheet name found in the text of one page.

    The name is the group captured by the first match of p_sheetName in
    `str`, returned unmodified.

    Raises ValueError when the text contains no match, instead of failing
    with an AttributeError on None.
    """
    m = p_sheetName.search(str)
    if m is None:
        raise ValueError("no 'Blattname: ... project' sheet name found in page text")
    return m.group(1)

def splitPdfToWord(srcFile):
    """Split a PDF into single-page PDFs and embed them in a new Word document.

    For every page of `srcFile` this writes "<page number> <sheet name>.pdf"
    into a directory named like `srcFile` without its extension (created
    when missing), types a "Page<n> <sheet name>" heading into a new Word
    document, inserts the page file as an OLE object and adds a page break.
    The document is saved next to the directory as "<directory name>.doc".

    Any exception, including the one raised for a file whose extension is
    not ".pdf", is caught and its message printed; nothing is re-raised.
    Word is quit afterwards if it was started.
    """
    # Bound before the try block so the finally clause can tell whether Word
    # was started; otherwise an early failure would raise a NameError there.
    wordApp = None
    try:
        folderName, ext_ = os.path.splitext(srcFile)
        # Case-insensitive so that e.g. "X.PDF" is accepted too.
        if ext_.lower() != '.pdf':
            raise Exception(os.path.basename(srcFile) + " is not pdf!")
        # Word runs as a separate process, so a relative path is not
        # necessarily resolved against this script's working directory;
        # hand it absolute paths only.
        absFolderName = os.path.abspath(folderName)
        # "with" closes the source even if a page fails to parse or to be written.
        with file(srcFile, "rb") as input1:
            inputfile = PdfFileReader(input1)
            numofpages = inputfile.getNumPages()
            print("Total Pages: %d" % numofpages)
            wordApp = win32com.client.Dispatch('Word.Application')
            wordApp.Visible = False
            wordApp.DisplayAlerts = 0
            doc = wordApp.Documents.Add()
            sel = wordApp.Selection
            # new directory
            if not os.path.isdir(folderName):
                os.makedirs(folderName)
            for page_index in range(1, numofpages + 1):
                page = inputfile.getPage(page_index - 1)
                output = PdfFileWriter()
                output.addPage(page)

                sheetName = getSheetName(page.extractText().encode("utf-8"))
                sel.Style = constants.wdStyleHeading1
                sel.TypeText("Page%d %s" % (page_index, sheetName))
                # The heading keeps the original name; only the file name is sanitized.
                sheetName = delete_some_speicl(sheetName)
                # save file
                pageFileName = "%d %s.pdf" % (page_index, sheetName)
                saveFileName = os.path.join(folderName, pageFileName)
                print("Add Page %d" % page_index)
                with file(saveFileName, "wb") as outputFile:
                    output.write(outputFile)
                sel.TypeParagraph()
                sel.Style = constants.wdStyleBodyText
                sel.InlineShapes.AddOLEObject(
                    ClassType="AcroExch.Document.11",
                    FileName=os.path.join(absFolderName, pageFileName))
                sel.InsertBreak(Type=constants.wdPageBreak)
        doc.SaveAs(absFolderName + ".doc")
        print("Split success!")
        print("please find them at " + folderName)
        print("create word document success!")
        print("Location:" + folderName + ".doc")
    except Exception as e:
        print(e)
    finally:
        if wordApp is not None:
            wordApp.Quit()

# Script entry point: the PDF to process is the first command-line argument.
if __name__ == '__main__':
    cli_args = sys.argv[1:]
    if not cli_args:
        # No file given: show the usage line and exit with an error status.
        print('usage: %s filename' % os.path.basename(sys.argv[0]))
        sys.exit(1)
    splitPdfToWord(cli_args[0])