// Find the pages of a PDF file on which a given keyword appears.

package com.icil.elsa.milestone.common.util;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.List;

import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PdfCopy;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfImportedPage;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.ContentByteUtils;
import com.itextpdf.text.pdf.parser.ImageRenderInfo;
import com.itextpdf.text.pdf.parser.PdfContentStreamProcessor;
import com.itextpdf.text.pdf.parser.RenderListener;
import com.itextpdf.text.pdf.parser.TextRenderInfo;

/**
 * PDF helpers built on iText 5: locate the occurrences of a keyword inside a PDF
 * and copy a range of pages from a PDF into a new file.
 *
 * @author peterwong
 * @version 1.0
 */
public class PDFUtil {

    /**
     * Demo entry point: searches a fixed PDF for a fixed keyword, prints every
     * match, then extracts the pages between the first and the last match into
     * a new PDF.
     *
     * @param args unused
     * @throws IOException if the source PDF cannot be read or parsed
     */
    public static void main(String[] args) throws IOException {
        File pdfFile = new File("/home/peterwong/Documents/项目流程管理.pdf");
        // Files.readAllBytes reads until EOF; a single InputStream.read(byte[])
        // call is allowed to return before the buffer has been filled.
        byte[] pdfData = Files.readAllBytes(pdfFile.toPath());

        String keyword = "ICIL项目流程管理";

        // Each element is {pageNum, x, y} for one match.
        List<float[]> positions = findKeywordPostions(pdfData, keyword);

        System.out.println("total:" + positions.size());
        if (positions.isEmpty()) {
            // Without a match there is no page range to extract.
            return;
        }
        for (float[] position : positions) {
            System.out.print("pageNum: " + (int) position[0]);
            System.out.print("\tx: " + position[1]);
            System.out.println("\ty: " + position[2]);
        }

        // Matches are collected page by page, so the first and last entries
        // carry the lowest and highest matching page numbers.
        int start = (int) positions.get(0)[0];
        int end = (int) positions.get(positions.size() - 1)[0];
        splitPDFByRange("/home/peterwong/Documents/", "项目流程管理.pdf",
                "/home/peterwong/Documents/E+/", start, end);
    }

    /**
     * Copies pages {@code startPage} to {@code endPage} (both inclusive, 1-based)
     * of a PDF into a new file named
     * {@code <baseName>_from_<startPage>_to_<endPage>_.pdf} inside {@code outputPath}.
     *
     * <p>I/O and iText document errors are reported on {@code System.err} and not
     * rethrown; in that case the output file may be missing or incomplete.
     *
     * @param path       directory of the source PDF
     * @param fileName   file name of the source PDF
     * @param outputPath directory the extracted PDF is written to
     * @param startPage  first page to copy; {@code null} means page 1
     * @param endPage    last page to copy; {@code null} or {@code 0} means the last page
     * @throws IllegalArgumentException if the resolved range is empty or lies
     *                                  outside the pages of the source document
     * @author Reverse_XML
     */
    public static void splitPDFByRange(String path, String fileName, String outputPath,
            Integer startPage, Integer endPage) {
        PdfReader reader = null;
        try {
            reader = new PdfReader(joinPath(path, fileName));
            int numberOfPages = reader.getNumberOfPages();

            int first = (startPage == null) ? 1 : startPage;
            int last = (endPage == null || endPage == 0) ? numberOfPages : endPage;
            // Validate before the output file is created so that a bad range
            // does not leave an empty or broken file behind.
            if (first < 1 || last > numberOfPages || first > last) {
                throw new IllegalArgumentException("Invalid page range " + first + "-" + last
                        + " for a document with " + numberOfPages + " page(s)");
            }

            // A file name without an extension is used as the base name unchanged.
            int dot = fileName.lastIndexOf('.');
            String baseName = (dot < 0) ? fileName : fileName.substring(0, dot);
            String savePath = joinPath(outputPath,
                    baseName + "_from_" + first + "_to_" + last + "_.pdf");

            try (FileOutputStream output = new FileOutputStream(savePath)) {
                Document document = new Document(reader.getPageSize(1));
                PdfCopy copy = new PdfCopy(document, output);
                document.open();
                for (int page = first; page <= last; page++) {
                    PdfImportedPage imported = copy.getImportedPage(reader, page);
                    copy.addPage(imported);
                }
                // Closing the document finishes the copy; this has to happen
                // while the reader is still open.
                document.close();
            }
        } catch (IOException | DocumentException e) {
            System.err.println("Failed to split " + fileName + ": " + e.getMessage());
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
    }

    /**
     * Joins a directory and a name with the platform separator, without
     * doubling a separator the directory already ends with.
     */
    private static String joinPath(String directory, String name) {
        if (directory != null
                && (directory.endsWith("/") || directory.endsWith(File.separator))) {
            return directory + name;
        }
        return directory + File.separator + name;
    }

    /**
     * Finds every occurrence of {@code keyword} in the text of a PDF.
     *
     * @param pdfData the raw bytes of the PDF file
     * @param keyword the text to search for
     * @return one {@code float[]} per match, in page order:
     *         {@code [0]} the 1-based page number, {@code [1]} and {@code [2]} the x and y
     *         of the bounding rectangle of the ascent line of the first matched character;
     *         empty if there is no match
     * @throws IOException if the PDF cannot be parsed
     */
    public static List<float[]> findKeywordPostions(byte[] pdfData, String keyword) throws IOException {
        List<float[]> result = new ArrayList<>();
        for (PdfPageContentPositions page : getPdfContentPostionsList(pdfData)) {
            result.addAll(findPositions(keyword, page));
        }
        return result;
    }

    /**
     * Extracts, for every page of the PDF, its text and the position of each
     * extracted character.
     *
     * @param pdfData the raw bytes of the PDF file
     * @return one entry per page, in page order
     * @throws IOException if the PDF cannot be parsed
     */
    private static List<PdfPageContentPositions> getPdfContentPostionsList(byte[] pdfData) throws IOException {
        PdfReader reader = new PdfReader(pdfData);
        try {
            int pages = reader.getNumberOfPages();
            List<PdfPageContentPositions> result = new ArrayList<>(pages);

            for (int pageNum = 1; pageNum <= pages; pageNum++) {
                PdfRenderListener listener = new PdfRenderListener(pageNum);

                // Run the page's content stream through the listener to collect
                // the text and the character positions.
                PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
                PdfDictionary resourcesDic = reader.getPageN(pageNum).getAsDict(PdfName.RESOURCES);
                processor.processContent(
                        ContentByteUtils.getContentBytesForPage(reader, pageNum), resourcesDic);

                List<float[]> positionsList = new ArrayList<>();
                for (CharPosition charPosition : listener.getcharPositions()) {
                    positionsList.add(new float[] {
                            charPosition.getPageNum(), charPosition.getX(), charPosition.getY()});
                }

                PdfPageContentPositions pageContent = new PdfPageContentPositions();
                pageContent.setContent(listener.getContent());
                pageContent.setPostions(positionsList);
                result.add(pageContent);
            }
            return result;
        } finally {
            // Close on every exit path, including runtime exceptions from the parser.
            reader.close();
        }
    }

    /**
     * Returns the position of the first character of every occurrence
     * (overlapping ones included) of {@code keyword} in one page.
     *
     * @param keyword                 the text to search for
     * @param pdfPageContentPositions the text of a page and its per-character positions
     * @return the {@code {pageNum, x, y}} arrays of the matches, in text order
     */
    private static List<float[]> findPositions(String keyword, PdfPageContentPositions pdfPageContentPositions) {
        List<float[]> result = new ArrayList<>();

        String content = pdfPageContentPositions.getContent();
        List<float[]> charPositions = pdfPageContentPositions.getPositions();

        // Index i of the content corresponds to entry i of charPositions.
        int matchIndex = content.indexOf(keyword);
        while (matchIndex >= 0 && matchIndex < charPositions.size()) {
            result.add(charPositions.get(matchIndex));
            matchIndex = content.indexOf(keyword, matchIndex + 1);
        }
        return result;
    }

    /** Text of one page together with the position of each of its characters. */
    private static class PdfPageContentPositions {
        private String content;
        private List<float[]> positions;

        public String getContent() {
            return content;
        }

        public void setContent(String content) {
            this.content = content;
        }

        public List<float[]> getPositions() {
            return positions;
        }

        public void setPostions(List<float[]> positions) {
            this.positions = positions;
        }
    }

    /**
     * Render listener that accumulates the text of one page and, for every
     * character appended to that text, exactly one {@link CharPosition}.
     */
    private static class PdfRenderListener implements RenderListener {
        private final int pageNum;
        private final StringBuilder contentBuilder = new StringBuilder();
        private final List<CharPosition> charPositions = new ArrayList<>();

        public PdfRenderListener(int pageNum) {
            this.pageNum = pageNum;
        }

        public void beginTextBlock() {
        }

        public void renderText(TextRenderInfo renderInfo) {
            for (TextRenderInfo charInfo : renderInfo.getCharacterRenderInfos()) {
                String text = charInfo.getText();
                if (text == null || text.isEmpty()) {
                    // Nothing is appended to the content, so no position may be
                    // recorded either; otherwise every later index would be shifted.
                    continue;
                }
                // Only the last char of the glyph's text is kept, so that one
                // content char maps to one position.
                // NOTE(review): multi-char glyph text (e.g. ligatures) loses its
                // leading chars here - confirm this is intended.
                char lastChar = text.charAt(text.length() - 1);

                com.itextpdf.awt.geom.Rectangle2D.Float bounds =
                        charInfo.getAscentLine().getBoundingRectange();
                float x = (float) bounds.getX();
                float y = (float) bounds.getY();

                charPositions.add(new CharPosition(pageNum, x, y));
                contentBuilder.append(lastChar);
            }
        }

        public void endTextBlock() {
        }

        public void renderImage(ImageRenderInfo renderInfo) {
        }

        public String getContent() {
            return contentBuilder.toString();
        }

        public List<CharPosition> getcharPositions() {
            return charPositions;
        }
    }

    /** Page number and x/y coordinates of a single character. */
    private static class CharPosition {
        private final int pageNum;
        private final float x;
        private final float y;

        public CharPosition(int pageNum, float x, float y) {
            this.pageNum = pageNum;
            this.x = x;
            this.y = y;
        }

        public int getPageNum() {
            return pageNum;
        }

        public float getX() {
            return x;
        }

        public float getY() {
            return y;
        }

        @Override
        public String toString() {
            return "[pageNum=" + this.pageNum + ",x=" + this.x + ",y=" + this.y + "]";
        }
    }

}


// Original article: https://www.cnblogs.com/wanthune/p/13353874.html