Java 实现pdf转excel

最近项目需要解析pdf单据，获取里面的字段数据。通过网上的查阅发现itext的文档比pdfbox要多一点，所以选择了itext（不是说pdfbox不好，只是api和例子太少，难以理解）。因pdf非模板化（由某政府部门发放），所以靠表单域获取数据变得不现实。一开始通过PdfReaderContentParser获取文档内容，但是获取到的是所有内容拼接成的一个字符串，而需求需要将数据精确到字段，靠截取字符串来达到解析的目的是行不通的，因为获取的内容毫无规律。后来查看源码，发现解析过程是逐个字符进行的，且字符顺序并不固定，所以只能通过字段所在的坐标范围来获取字段内容。

/**
* Created by luon 2018/3/21.
*/

/**
 * Handles submission of the PDF upload form.
 *
 * <p>Rejects a missing/empty upload or a file whose extension is not {@code pdf} with a JSON
 * error result. Otherwise the PDF is parsed via {@code exportPdfList}, every extracted record is
 * turned into one {@code ExcelRow}, the rows are written into the bundled {@code .xls} template
 * and the resulting workbook is returned as a file download.
 *
 * @param request  the current HTTP request (not used directly here)
 * @param response the current HTTP response (not used directly here)
 * @param command  the bound form object; must be a {@code FileUploadForm}
 * @param errors   binding errors (not inspected here)
 * @return a JSON view carrying an error message when validation fails, otherwise a download view
 *         for the generated Excel file
 * @throws Exception if reading the upload, parsing the PDF or writing the workbook fails
 */
@Override
protected ModelAndView onSubmit(HttpServletRequest request, HttpServletResponse response, Object command, BindException errors) throws Exception {
    FileUploadForm form = (FileUploadForm) command;

    SimpleResult result = SimpleResult.create(false);
    if (form.getFile() == null || form.getFile().getSize() == 0) {
        result.setMessage("请上传pdf文件");
        return new ModelAndView(new JsonView(result));
    }
    // The original filename may be null; getExtension(null) yields null and equalsIgnoreCase
    // is null-safe, so a missing name is rejected instead of throwing a NullPointerException.
    // equalsIgnoreCase also avoids default-locale lowercasing.
    String extension = FilenameUtils.getExtension(form.getFile().getOriginalFilename());
    if (!"pdf".equalsIgnoreCase(extension)) {
        result.setMessage("请上传pdf格式的文件");
        return new ModelAndView(new JsonView(result));
    }

    // Parse the uploaded PDF; the upload stream is closed as soon as parsing is done.
    List<List<Map<String, String>>> listAll;
    try (InputStream inputStream = form.getFile().getInputStream()) {
        listAll = exportPdfList(inputStream);
    }

    // NOTE(review): the template stream is handed to ExcelLoader.loadXls and never closed here;
    // confirm whether loadXls closes it.
    final String path = "/excel模板路径/xls/fillbls.xls";
    Workbook workbook = ExcelLoader.loadXls(this.getClass().getResourceAsStream(path));

    // Record keys in the order their values are appended to each Excel row.
    final String[] columnKeys = {
        "orderNum",
        "trackNum",
        "serviceType",
        "actualWeight",
        "actualWeightUnits",
        "ratedWeight",
        "ratedWeightUnits",
        "amount",
        "chargeDesion1",
        "chargeDesionCash1"
    };

    // Flatten all records from all inner lists into one list of rows.
    List<ExcelRow> sheet = new ArrayList<>();
    for (List<Map<String, String>> listdata : listAll) {
        for (Map<String, String> map : listdata) {
            ExcelRow itemRow = new ExcelRow();
            for (String key : columnKeys) {
                itemRow.add(map.get(key));
            }
            sheet.add(itemRow);
        }
    }

    // NOTE(review): the trailing arguments 0 and 1 are presumably the sheet index and the first
    // data row (below a header row) — confirm against ExcelWriter.
    ExcelWriter.write(workbook, sheet, 0, 1);
    InputStream outStream = ExcelWriter.close(workbook);
    String fileName = "美国境内账单表.xls";
    // URL-encode the non-ASCII file name before passing it to the download view.
    fileName = java.net.URLEncoder.encode(fileName, "UTF-8");
    return new ModelAndView(new DownloadView(outStream, fileName));
}

 

//读取pdf内容  注意此方法没有贴上详细代码。

/**
 * Reads the given PDF stream and returns the extracted records as a list of lists of
 * string-to-string maps.
 *
 * <p>NOTE(review): this snippet is incomplete as published. The code that fills {@code map} and
 * {@code listMap} from the extracted text, the {@code catch} clause of the {@code try}, and
 * several closing braces are not shown, so the method does not compile as-is.
 *
 * @param inputStream stream over the PDF to parse
 * @return the accumulated {@code listAll}
 */
public List<List<Map<String, String>>> exportPdfList(InputStream inputStream) {

List<List<Map<String, String>>> listAll = new ArrayList<>();
try {
// Presumably splits the PDF into one byte[] per page, keyed by page number — confirm against LabelSpliter.
Map<String, byte[]> pdfData = LabelSpliter.byPageNum(inputStream);
// Copy the per-page entries into a list so they can be sorted.
List<Map.Entry<String, byte[]>> list = new ArrayList<>(pdfData.entrySet());
// Sort by key parsed as an integer, ascending, so processing starts from the first page.
Collections.sort(list, new Comparator<Map.Entry<String, byte[]>>() {
public int compare(Map.Entry<String, byte[]> o1, Map.Entry<String, byte[]> o2) {
return (new Integer(o1.getKey())).compareTo(new Integer(o2.getKey()));
}
});

for (Map.Entry<String, byte[]> entry : list) {
// Debug output; concatenating a byte[] prints its default toString(), not the page content.
System.out.println(entry.getKey() + "-------------------------------------" + entry.getValue());
byte[] pdfBypage = entry.getValue();
InputStream inputfjsb = new ByteArrayInputStream(pdfBypage);
// NOTE(review): the visible code never closes this document.
PDDocument document = PDDocument.load(inputfjsb);
if (!document.isEncrypted()) {
// The area stripper is configured here but not used in the visible code.
PDFTextStripperByArea stripper = new PDFTextStripperByArea();
stripper.setSortByPosition(true);
// Extract the text of the whole per-page document as a single string.
PDFTextStripper tStripper = new PDFTextStripper();
String pdfFileInText = tStripper.getText(document);

// (Omitted by the author: the code that parses pdfFileInText and declares/fills map and listMap.)
listMap.add(map);

listAll.add(listMap);

return listAll;
}

}

最初通过PdfReaderContentParser获取文档内容时，得到的是所有内容拼接成的一个字符串，而需求需要将数据精确到字段，靠截取字符串来达到解析的目的是行不通的，因为获取的内容毫无规律。以上代码仅提供思路，谢谢！