使用 iTextSharp 读取 PDF 文档内容

iTextSharp 是一个开源组件,可用于读取 PDF 的文本内容。下面的提取策略需要满足以下要求:能够逐行读取文本;同一行中间距较大的文本块之间用空格分隔;在判断换行和间距时允许一定的坐标误差。

 /// <summary>
 /// Text extraction strategy that groups the text runs reported by the PDF
 /// content parser into lines, based on the baseline coordinates of each run.
 /// A run whose baseline Y coordinate is lower than the previous run's by at
 /// least the line tolerance starts a new line; a run on the same line whose
 /// start X is to the right of the previous run's end X by at least the gap
 /// tolerance is separated from it by a single space.
 /// Completed lines are collected in <see cref="strings"/> and the matching
 /// baseline Y coordinates in <see cref="baselines"/> (same index).
 /// </summary>
 public class TextAsParagraphsExtractionStrategy : iTextSharp.text.pdf.parser.ITextExtractionStrategy
    {
        //Default tolerances, identical to the values that used to be hard-coded
        private const float DefaultLineTolerance = 2f;
        private const float DefaultGapTolerance = 2f;

        //Minimum downward baseline shift that is treated as a line break
        private readonly float lineTolerance;
        //Minimum horizontal distance between two runs that produces a space
        private readonly float gapTolerance;

        //Text buffer for the line currently being assembled
        private readonly StringBuilder result = new StringBuilder();

        //Baseline start/end of the most recently rendered run (null before the first run)
        private Vector lastStartPoint;
        private Vector lastEndPoint;

        //Buffer of lines of text and their Y coordinates. NOTE, these should be exposed as properties instead of fields but are left as is so existing callers keep working
        public List<string> strings = new List<String>();
        public List<float> baselines = new List<float>();

        /// <summary>
        /// Creates a strategy using the default tolerances (2 for both the line
        /// break and the gap detection).
        /// </summary>
        public TextAsParagraphsExtractionStrategy()
            : this(DefaultLineTolerance, DefaultGapTolerance)
        {
        }

        /// <summary>
        /// Creates a strategy with explicit tolerances.
        /// </summary>
        /// <param name="lineTolerance">Minimum drop of the baseline Y coordinate that starts a new line.</param>
        /// <param name="gapTolerance">Minimum horizontal distance between the end of one run and the start of the next that inserts a space.</param>
        /// <exception cref="ArgumentOutOfRangeException">A tolerance is negative or NaN.</exception>
        public TextAsParagraphsExtractionStrategy(float lineTolerance, float gapTolerance)
        {
            if (float.IsNaN(lineTolerance) || lineTolerance < 0)
            {
                throw new ArgumentOutOfRangeException("lineTolerance");
            }
            if (float.IsNaN(gapTolerance) || gapTolerance < 0)
            {
                throw new ArgumentOutOfRangeException("gapTolerance");
            }

            this.lineTolerance = lineTolerance;
            this.gapTolerance = gapTolerance;
        }

        /// <summary>
        /// Called whenever a run of text is encountered. Decides whether the run
        /// starts a new line, needs a separating space, or directly continues the
        /// current line, then appends its text to the line buffer.
        /// </summary>
        /// <param name="renderInfo">The text run being rendered.</param>
        public void RenderText(iTextSharp.text.pdf.parser.TextRenderInfo renderInfo)
        {
            //Fetch the baseline once and read both end points from it
            LineSegment baseline = renderInfo.GetBaseline();
            Vector curStartPoint = baseline.GetStartPoint();
            Vector curEndPoint = baseline.GetEndPoint();

            //This code assumes that if the baseline moves down by at least the line tolerance then we're on a new line.
            //NOTE(review): upward jumps (e.g. a second column) are not treated as line breaks; unchanged from the original behavior.
            bool startsNewLine = (this.lastStartPoint != null)
                && (curStartPoint[Vector.I2] <= this.lastStartPoint[Vector.I2] - this.lineTolerance);

            if (startsNewLine)
            {
                //Mark the previous line as done and reset the line buffer
                this.FlushLine();
            }
            else if ((this.lastEndPoint != null)
                && (curStartPoint[Vector.I1] >= this.lastEndPoint[Vector.I1] + this.gapTolerance))
            {
                //Same line but a noticeable horizontal gap: separate the runs with a space
                this.result.Append(" ");
            }

            //Append the current text to our line buffer
            this.result.Append(renderInfo.GetText());

            //Remember where this run was so the next one can be compared against it
            this.lastStartPoint = curStartPoint;
            this.lastEndPoint = curEndPoint;
        }

        /// <summary>
        /// Flushes whatever is left in the line buffer into <see cref="strings"/>
        /// and <see cref="baselines"/>, and resets the per-line state so that a
        /// repeated call does not duplicate the last line and a following page
        /// starts with a fresh line.
        /// </summary>
        /// <returns>Always <c>null</c>; callers should inspect <see cref="strings"/> and <see cref="baselines"/> instead.</returns>
        public string GetResultantText()
        {
            //One last time, see if there's anything left in the buffer
            this.FlushLine();

            //Forget the last position: the next run (if any) belongs to new content
            this.lastStartPoint = null;
            this.lastEndPoint = null;

            //We're not going to use this method to return a string, instead callers should inspect this class's strings and baselines fields.
            return null;
        }

        //Stores the buffered line (if it holds more than whitespace) together with the
        //Y coordinate of the last rendered run, then empties the buffer.
        private void FlushLine()
        {
            string lineText = this.result.ToString();

            //See if we have text and not just whitespace
            if (!String.IsNullOrWhiteSpace(lineText))
            {
                this.baselines.Add(this.lastStartPoint[Vector.I2]);
                this.strings.Add(lineText);
            }

            this.result.Clear();
        }

        //Not needed, part of interface contract
        public void BeginTextBlock() { }
        public void EndTextBlock() { }
        public void RenderImage(ImageRenderInfo renderInfo) { }
    }

调用示例:使用上面的策略读取 PDF 第 1 页的文本内容

            //Open the document; the reader is closed in the finally block even if extraction throws
            PdfReader reader = new PdfReader(@"d:\20212.pdf");
            TextAsParagraphsExtractionStrategy S = new TextAsParagraphsExtractionStrategy();
            StringBuilder sb = new StringBuilder();
            try
            {
                //Run the strategy over page 1; the return value is ignored because the
                //extracted lines are read from S.strings / S.baselines below
                iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, 1, S);

                //One output row per extracted line: baseline Y coordinate followed by the text
                for (int i = 0; i < S.strings.Count; i++)
                {
                    sb.AppendLine(string.Format("Line {0,-5}: {1}", S.baselines[i], S.strings[i]));
                }
            }
            finally
            {
                reader.Close();
            }
            var sss = sb.ToString();
原文地址:https://www.cnblogs.com/njcxwz/p/15637178.html