代码片段:使用 Tika 解析 PDF、Word 和 Email

/**
 * com.jiaoyiping.pdstest.TestTika.java
 * Copyright (c) 2009 Hewlett-Packard Development Company, L.P.
 * All rights reserved.
 */
package com.jiaoyiping.pdstest;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.mail.RFC822Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;

/**
 * <pre>
 * Desc: 
 * @author 焦一平
 * @refactor 焦一平
 * @date   2014年12月4日 下午1:31:09
 * @version 1.0
 * @see  
 * REVISIONS: 
 * Version 	   Date 		    Author 			  Description
 * ------------------------------------------------------------------- 
 * 1.0 		  2014年12月4日 	                              焦一平  	         1. Created this class. 
 * </pre>  
 */
public class TestTika {
	
	//解析PDF
	@Test
	public void testPdf() throws Exception{
		Long start = System.currentTimeMillis();
		Parser parser = new PDFParser();
		InputStream is = new BufferedInputStream(new FileInputStream(new File("D:\我的微盘\文档\参考文档\Linux Shell脚本攻略.pdf")));
		OutputStream os = new BufferedOutputStream(new FileOutputStream(new File("C:\Users\Administrator\Desktop\result.txt")));
	    Metadata meta = new Metadata();  
	    meta.add(Metadata.CONTENT_ENCODING, "utf-8");  
        ContentHandler iHandler = new BodyContentHandler(os);  
	    parser.parse(is, iHandler, meta, new ParseContext());
	    Long end = System.currentTimeMillis();
	    Long used = (end-start)/1000;
	    System.out.println("耗时: "+used+"秒");
	}
	//解析Word
	@Test
	public void testWrod() throws Exception{
		Long start = System.currentTimeMillis();
		Parser parser = new OfficeParser();
		InputStream is = new BufferedInputStream(new FileInputStream(new File("D:\我的微盘\文档\参考文档\jBPM5_用户指南中文版.doc")));
		OutputStream os = new BufferedOutputStream(new FileOutputStream(new File("C:\Users\Administrator\Desktop\result.txt")));
		Metadata meta = new Metadata();  
	    meta.add(Metadata.CONTENT_ENCODING, "utf-8");  
        ContentHandler iHandler = new BodyContentHandler(os);  
	    parser.parse(is, iHandler, meta, new ParseContext());
		
		Long end = System.currentTimeMillis();
		Long used = (end-start)/1000;
		System.out.println("耗时:"+used+"秒");
	}
	//解析EMAIL(只能解析标准的eml格式的,不能解析微软的msg格式) 
	//使用commons-email来进行解析的可以得到收件人、发件人、主题、内容等元数据,TIkA是否支持未尝试
	@Test
	public void testEmail() throws Exception{
		Long start = System.currentTimeMillis();
		Parser parser = new RFC822Parser();
		InputStream is = new BufferedInputStream(new FileInputStream(new File("C:\Users\Administrator\Downloads\回复_ RE_ 数据导入工作 - 外部系统枚举与U-Cloud枚举映射.eml")));
		OutputStream os = new BufferedOutputStream(new FileOutputStream(new File("C:\Users\Administrator\Desktop\result.txt")));
		Metadata meta = new Metadata();  
		meta.add(Metadata.CONTENT_ENCODING, "utf-8"); 
		ContentHandler iHandler = new BodyContentHandler(os);  
		parser.parse(is, iHandler, meta, new ParseContext());
		
		Long end = System.currentTimeMillis();
		Long used = (end-start)/1000;
		System.out.println("耗时:"+used+"秒");
	}
}

  

原文地址:https://www.cnblogs.com/jiaoyiping/p/4150238.html