从 PDF 中读取表格数据并写入 DataTable

首先引入第三方组件 Tabula。这是一个开源组件,基于 PdfPig 实现。

        /// <summary>
        /// Extracts every table found on a range of pages of a PDF file and converts
        /// each one into a <see cref="DataTable"/> whose columns are all of type string.
        /// </summary>
        /// <param name="pdfPath">Path of the PDF file to open.</param>
        /// <param name="startNumber">First page to scan (1-based, inclusive).</param>
        /// <param name="endNumber">
        /// Last page to scan (1-based, inclusive). A value past the end of the document
        /// is clamped to the last page.
        /// </param>
        /// <returns>
        /// One <see cref="DataTable"/> per detected table, in page order. Columns are named
        /// "0", "1", ... by position. The list is empty when no table is detected or when
        /// <paramref name="endNumber"/> is smaller than <paramref name="startNumber"/>.
        /// </returns>
        /// <exception cref="IndexOutOfRangeException">
        /// <paramref name="startNumber"/> is less than 1 or greater than the page count.
        /// </exception>
        private List<DataTable> ExtractTables(string pdfPath, int startNumber, int endNumber)
        {
            // No try/catch wrapper: the previous `throw ex;` only reset the stack trace.
            // Exceptions now propagate to the caller with their original trace intact.
            using (UglyToad.PdfPig.PdfDocument document = UglyToad.PdfPig.PdfDocument.Open(pdfPath, new ParsingOptions() { ClipPaths = true }))
            {
                ObjectExtractor extractor = new ObjectExtractor(document);
                IExtractionAlgorithm algorithm = new SpreadsheetExtractionAlgorithm();

                int pageCount = document.NumberOfPages;

                // The start page must exist; page numbers are 1-based.
                if (startNumber < 1 || startNumber > pageCount)
                {
                    throw new IndexOutOfRangeException("页码超出范围!");
                }

                // An end page past the document is clamped, including when the
                // start page is the last page.
                if (endNumber > pageCount)
                {
                    endNumber = pageCount;
                }

                List<DataTable> dtList = new List<DataTable>();

                for (int pageNumber = startNumber; pageNumber <= endNumber; pageNumber++)
                {
                    PageArea page = extractor.Extract(pageNumber);
                    List<Table> tables = algorithm.Extract(page);

                    foreach (Table table in tables)
                    {
                        DataTable dt = new DataTable();
                        int columnCount = table.ColumnCount;

                        for (int col = 0; col < columnCount; col++)
                        {
                            dt.Columns.Add(col.ToString(), typeof(string));
                        }

                        foreach (IReadOnlyList<Cell> row in table.Rows)
                        {
                            DataRow dr = dt.NewRow();

                            // Guard against a row that carries fewer cells than the table's
                            // column count; the remaining columns stay DBNull.
                            int cellCount = Math.Min(columnCount, row.Count);
                            for (int col = 0; col < cellCount; col++)
                            {
                                // Store the cell's text. Assigning the Cell object itself makes
                                // the DataTable store Cell.ToString() instead of the content.
                                dr[col] = row[col].GetText();
                            }

                            dt.Rows.Add(dr);
                        }

                        dtList.Add(dt);
                    }
                }

                return dtList;
            }
        }
原文地址:https://www.cnblogs.com/njcxwz/p/15637239.html