poi读取word2003(.doc文档)中的表格

poi读取word2003(.doc文档)中的表格

Apache POI(原 Jakarta POI)是 Apache 的子项目,目标是处理 OLE2 对象,它提供了一组操作 Microsoft Office 文档的 Java API。网上有很多通过 POI 读取 Excel 的文章,读写都很方便,可以和 jxl 相媲美。本文主要介绍如何使用 POI 读取 Word 文档中的表格数据。

具体见代码

 1 import java.io.File;  
 2 import java.io.FileInputStream;  
 3 import java.io.FileNotFoundException;  
 4   
 5 import org.apache.poi.hwpf.HWPFDocument;  
 6 import org.apache.poi.hwpf.usermodel.Paragraph;  
 7 import org.apache.poi.hwpf.usermodel.Range;  
 8 import org.apache.poi.hwpf.usermodel.Table;  
 9 import org.apache.poi.hwpf.usermodel.TableCell;  
10 import org.apache.poi.hwpf.usermodel.TableIterator;  
11 import org.apache.poi.hwpf.usermodel.TableRow;  
12   
13 import java.io.File;     
14 import java.io.FileInputStream;     
15 import java.io.InputStream;     
16     
17 import org.apache.poi.POIXMLDocument;     
18 import org.apache.poi.POIXMLTextExtractor;     
19 import org.apache.poi.hwpf.extractor.WordExtractor;     
20 import org.apache.poi.openxml4j.opc.OPCPackage;     
21 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;    
22   
23   
24 import org.apache.poi.poifs.filesystem.POIFSFileSystem;  
25   
26 public class ExportDocImpl  
27 {  
28     public void testWord(){  
29         try{  
30             FileInputStream in = new FileInputStream("D:\sinye.doc");//载入文档  
31            POIFSFileSystem pfs = new POIFSFileSystem(in);     
32             HWPFDocument hwpf = new HWPFDocument(pfs);     
33             Range range = hwpf.getRange();//得到文档的读取范围  
34             TableIterator it = new TableIterator(range);  
35            //迭代文档中的表格  
36             while (it.hasNext()) {     
37                 Table tb = (Table) it.next();     
38                 //迭代行,默认从0开始  
39                 for (int i = 0; i < tb.numRows(); i++) {     
40                     TableRow tr = tb.getRow(i);     
41                     //迭代列,默认从0开始  
42                     for (int j = 0; j < tr.numCells(); j++) {     
43                         TableCell td = tr.getCell(j);//取得单元格  
44                         //取得单元格的内容  
45                         for(int k=0;k<td.numParagraphs();k++){     
46                             Paragraph para =td.getParagraph(k);     
47                             String s = para.text();     
48                             System.out.println(s);  
49                         } //end for      
50                     }   //end for  
51                 }   //end for  
52             } //end while  
53         }catch(Exception e){  
54             e.printStackTrace();  
55         }  
56     }//end method  
57       
58       
59            public void testWord1(){  
60            try {     
61             //word 2003: 图片不会被读取     
62             InputStream is = new FileInputStream(new File("D:\sinye.doc"));     
63                   WordExtractor ex = new WordExtractor(is);     
64                   String text2003 = ex.getText();     
65                   System.out.println(text2003);     
66             //word 2007 图片不会被读取, 表格中的数据会被放在字符串的最后     
67             OPCPackage opcPackage = POIXMLDocument.openPackage("D:\sinye.doc");     
68                   POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);     
69                   String text2007 = extractor.getText();     
70                   System.out.println(text2007);     
71                  
72         } catch (Exception e) {     
73                   e.printStackTrace();     
74         }   
75     }  
76 }  
原文地址:https://www.cnblogs.com/Renyi-Fan/p/8157111.html