使用 PDFBox 与 iText（itextpdf）提取 PDF 内容（未加密文档使用 PDFBox，加密文档使用 iText）

  1 package test;
  2 
  3 import java.io.FileInputStream;
  4 import java.io.IOException;
  5 import java.util.ArrayList;
  6 import java.util.List;
  7 import java.util.regex.Pattern;
  8 
  9 import org.apache.pdfbox.cos.COSDocument;
 10 import org.apache.pdfbox.pdfparser.PDFParser;
 11 import org.apache.pdfbox.util.PDFTextStripper;
 12 
 13 import com.itextpdf.text.pdf.PdfReader;
 14 import com.itextpdf.text.pdf.parser.PdfTextExtractor;
 15 
 16 public class UploadUtils {
 17     
 18     private final static Pattern pattern = Pattern.compile("\d+");
 19     private final static int stateParaOverFlag = 800;
 20     private final static int thankParaOverFlag = 800;
 21     
 22     /**
 23      * 读取pdf参考文献内容
 24      * 
 25      * @param s
 26      * @return
 27      */
 28     public String readPdf(String filePath) {
 29         StringBuilder buffer = new StringBuilder();
 30         FileInputStream fis = null;
 31         PdfReader pdfReader = null;
 32         COSDocument cosDocument = null;
 33         String[] paragraphs = null;
 34         PDFParser p;
 35         boolean addBool = true;
 36         boolean judgeState = false;
 37         boolean judgeThank = false;
 38         StringBuilder tempSb = new StringBuilder();
 39         try {
 40             fis = new FileInputStream(filePath);
 41             p = new PDFParser(fis);
 42             p.parse();
 43             cosDocument = p.getDocument();
 44             // 加密文档判断
 45             if (cosDocument.isEncrypted()) {
 46                 StringBuilder tempContent = new StringBuilder();
 47                 pdfReader = new PdfReader(filePath);
 48                 int i = pdfReader.getNumberOfPages();
 49                 for (int j = 1; j <= i; j++) {
 50                     tempContent.append(PdfTextExtractor.getTextFromPage(pdfReader, j));
 51                 }
 52                 paragraphs = tempContent.toString().split("
");
 53             } else {
 54                 PDFTextStripper ts = new PDFTextStripper();
 55                 paragraphs = ts.getText(p.getPDDocument()).split("
");
 56             }
 57             boolean mark = false;
 58             List<Integer> errornum = new ArrayList<Integer>();
 59             int flag = 0;
 60             int endRange = paragraphs.length * 70 / 100;
 61             int rangeFlag = 0;
 62             for (String lineContent : paragraphs) {
 63                 if (judgeState) {
 64                     tempSb.append(lineContent);
 65                     if (tempSb.length() >= stateParaOverFlag) {
 66                         judgeState = false;
 67                         addBool = true;
 68                         tempSb.delete(0, tempSb.length() - 1);
 69                     }
 70                 }
 71                 if (judgeThank) {
 72                     tempSb.append(lineContent);
 73                     if (tempSb.length() >= thankParaOverFlag) {
 74                         judgeThank = false;
 75                         addBool = true;
 76                         tempSb.delete(0, tempSb.length() - 1);
 77                     }
 78                 }
 79                 if (addBool) {
 80                     buffer.append(lineContent);
 81                 }
 82                 if (mark && rangeFlag >= endRange) {
 83                     if (lineContent.length() < 5) {
 84                         errornum.add(++flag);
 85                         rangeFlag++;
 86                         continue;
 87                     }
 88                     if (pattern.matcher(lineContent.substring(0, 5)).find()) {
 89                         if (flag != 0) {
 90                             flag = 0;
 91                             errornum.clear();
 92                         }
 93                     } else {
 94                         errornum.add(++flag);
 95                     }
 96                     if (errornum.size() > 2) {
 97                         mark = false;
 98                     }
 99                 }
100                 rangeFlag++;
101             }
102         } catch (Exception e) {
103             e.printStackTrace();
104         } finally {
105             if (fis != null) {
106                 try {
107                     fis.close();
108                 } catch (IOException e) {
109                     e.printStackTrace();
110                 } finally {
111                     fis = null;
112                 }
113             }
114             if (pdfReader != null) {
115                 pdfReader.close();
116             }
117             if (cosDocument != null) {
118                 try {
119                     cosDocument.close();
120                 } catch (IOException e) {
121                     e.printStackTrace();
122                 } finally {
123                     cosDocument = null;
124                 }
125             }
126         }
127         return buffer.toString();
128     }
129     
130     public static boolean isBlank(CharSequence cs) {
131         int strLen;
132         if (cs == null || (strLen = cs.length()) == 0) {
133             return true;
134         }
135         for (int i = 0; i < strLen; i++) {
136             if (Character.isWhitespace(cs.charAt(i)) == false) {
137                 return false;
138             }
139         }
140         return true;
141     }
142     
143     public static void main(String[] args) {
144         // System.err.println(new UploadUtils()
145         // .readPdf("/opt/fileCache/2014/125/13/shuangping_D7037870CF4FC5C421A3E5359DCF8BBE.pdf"));
146         System.err.println(new UploadUtils().readPdf("E:\MyWork\guyezhai\pdf提取\路径依赖视角下高校新专业建设的策略创新(1).pdf"));
147         
148     }
149     
150 }

其中用到的jar包：

bcpkix-jdk15on-1.47.jar
bcprov-jdk15on-1.49.jar
commons-logging-1.1.2.jar
fontbox-1.8.2.jar
icu4j-4.0.1.jar
itextpdf-5.4.3.jar
jempbox-1.8.2.jar
pdfbox-1.8.2.jar