使用 camelot 工具解析 PDF 表格并重建为 HTML

camelot 内置了生成 HTML 文件的方法,但表格数据在转换为 pandas.DataFrame 的过程中丢失了跨行、跨列的结构信息,因此生成的 HTML 表格中没有跨行、跨列(合并单元格)结构。

于是我在输出部分选择直接手写 HTML 表格:根据每个单元格的边框信息推断 rowspan 和 colspan。

import html
import os

import camelot
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# def listdir(path, list_name):  # 传入存储的list
#     for file in os.listdir(path):
#         file_path = os.path.join(path, file)
#         if os.path.isdir(file_path):
#             listdir(file_path, list_name)
#         else:
#             list_name.append(file_path)
#批量文件
# filenames=[r'E:pdf_download']
# listdir('E:pdf_download',filenames)
# for onefile in filenames:
#     filename=onefile.split(".", )[0]

# Single-file mode: parse one page of one PDF.
onefile = r'1202007288.pdf'
print("loading...", onefile)
# The strip_text literal holds three characters: space, dot and newline.
# It was previously broken across two physical lines (a syntax error);
# the '\n' escape keeps exactly the same characters on one line.
# NOTE(review): strip_text / line_scale / split_text are camelot parser
# options -- see the camelot `read_pdf` documentation for their meaning.
tables = camelot.read_pdf(
    onefile,
    pages='28',
    strip_text=' .\n',
    line_scale=80,
    split_text=True,
)

def _compute_spans(cells):
    """Derive merged-cell layout from the border flags of a cell grid.

    A cell whose ``right`` flag is falsy is treated as merged with its right
    neighbour; a cell whose ``bottom`` flag is falsy is treated as merged
    with the cell below it.

    Args:
        cells: rectangular list of rows, each row a list of objects exposing
            ``right`` and ``bottom`` attributes.

    Returns:
        A ``(covered, colspan, rowspan)`` tuple of arrays shaped
        ``(n_rows, n_cols)``. ``covered[r, c]`` is True when the cell is
        swallowed by a merged region anchored at another cell and must not
        be emitted; ``colspan`` / ``rowspan`` hold the extent of the region
        anchored at each non-covered cell.
    """
    n_rows = len(cells)
    n_cols = len(cells[0]) if n_rows else 0
    covered = np.zeros((n_rows, n_cols), dtype=bool)
    colspan = np.ones((n_rows, n_cols), dtype=int)
    rowspan = np.ones((n_rows, n_cols), dtype=int)

    for r, row in enumerate(cells):
        for c in range(n_cols):
            if covered[r, c]:
                continue
            # Walk right along this row while the right border is missing.
            last_c = c
            while last_c < n_cols - 1 and not row[last_c].right:
                last_c += 1
            # Walk down this column while the bottom border is missing.
            last_r = r
            while last_r < n_rows - 1 and not cells[last_r][c].bottom:
                last_r += 1
            colspan[r, c] = last_c - c + 1
            rowspan[r, c] = last_r - r + 1
            # Mark the whole rectangle as covered, not just the first row and
            # first column; otherwise interior cells of a region merged in
            # both directions would be emitted as stray <td> elements.
            covered[r:last_r + 1, c:last_c + 1] = True
            covered[r, c] = False  # the anchor itself is still emitted
    return covered, colspan, rowspan


def _render_table_html(cells):
    """Return an HTML ``<table>`` string for ``cells`` with row/col spans.

    Args:
        cells: rectangular list of rows of objects exposing ``right``,
            ``bottom`` and ``text`` attributes.

    Returns:
        The complete table markup, including the closing ``</tbody>`` and
        ``</table>`` tags. Cell text is HTML-escaped.
    """
    covered, colspan, rowspan = _compute_spans(cells)
    parts = ['<table border="1" class="dataframe">\n  <tbody>']
    for r, row in enumerate(cells):
        parts.append('\n    <tr>')
        for c, cell in enumerate(row):
            if covered[r, c]:
                continue
            attrs = ''
            if colspan[r, c] > 1:
                attrs += ' colspan="%d"' % colspan[r, c]
            if rowspan[r, c] > 1:
                attrs += ' rowspan="%d"' % rowspan[r, c]
            # Escape so that '<', '>' or '&' in PDF text cannot break markup.
            parts.append(
                '\n      <td%s>%s</td>' % (attrs, html.escape(cell.text))
            )
        parts.append('\n    </tr>')
    parts.append('\n  </tbody>\n</table>\n')
    return ''.join(parts)


# Write one HTML file per detected table, named after the source PDF, the
# page number and the table's order on that page.
for onetable in tables:
    out_path = '%s-page%s-table-%s.html' % (
        onefile, onetable.page, onetable.order)
    # Explicit UTF-8 so non-ASCII cell text does not depend on the platform's
    # default encoding; ``with`` guarantees the file is closed on error.
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(_render_table_html(onetable.cells))
原文地址:https://www.cnblogs.com/wind-chaser/p/10690083.html