word转html 和pdf

今天有个新的需求，就是要把word进行预览，为了实现打印，需要转成pdf或html，在网上找了一些方法，这里做个记录

首先是转html，看起来挺简单的

首先是需要引入的几个maven包

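<!-- 注意：各依赖的 <version> 在原文中缺失，请按项目实际情况补充 -->
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>fr.opensagres.xdocreport.document</artifactId>
</dependency>

<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
</dependency>

<dependency>
<groupId>org.apache.poi</groupId>
<!-- 原文此处缺少 artifactId；下面的 poi-ooxml 是根据代码中使用的 XWPFDocument 推断的，请确认 -->
<artifactId>poi-ooxml</artifactId>
</dependency>

<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
</dependency>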

然后就是转换demo

package b2b.cn.util;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStream;

import java.io.OutputStream;

import javax.xml.parsers.DocumentBuilderFactory;

import javax.xml.parsers.ParserConfigurationException;

import javax.xml.transform.OutputKeys;

import javax.xml.transform.Transformer;

import javax.xml.transform.TransformerException;

import javax.xml.transform.TransformerFactory;

import javax.xml.transform.dom.DOMSource;

import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;

import org.apache.poi.hwpf.converter.PicturesManager;

import org.apache.poi.hwpf.converter.WordToHtmlConverter;

import org.apache.poi.hwpf.usermodel.PictureType;

import org.apache.poi.xwpf.converter.core.FileImageExtractor;

import org.apache.poi.xwpf.converter.core.FileURIResolver;

import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;

import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;

import org.apache.poi.xwpf.usermodel.XWPFDocument;

import org.junit.Test;

import org.w3c.dom.Document;

/**

* word 转换成html

public class GoHTML {

public static void main(String[] args) {

try {

// new GoHTML().Word2003ToHtml(); //doc

new GoHTML().Word2007ToHtml();

} catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

/**

* 2007版本word转换成html

* @throws IOException

@Test

public void Word2007ToHtml() throws IOException {

String filepath = "E:/test/";

String fileName = "demo.docx";

String htmlName = "123.html";

final String file = filepath + fileName;

File f = new File(file);

if (!f.exists()) {

System.out.println("Sorry File does not Exists!");

} else {

if (f.getName().endsWith(".docx") || f.getName().endsWith(".DOCX")) {

// 1) 加载word文档生成 XWPFDocument对象

InputStream in = new FileInputStream(f);

XWPFDocument document = new XWPFDocument(in);

// 2) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录)

File imageFolderFile = new File(filepath);

XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile));

options.setExtractor(new FileImageExtractor(imageFolderFile));

options.setIgnoreStylesIfUnused(false);

options.setFragment(true);

// 3) 将 XWPFDocument转换成XHTML

OutputStream out = new FileOutputStream(new File(filepath + htmlName));

XHTMLConverter.getInstance().convert(document, out, options);

//也可以使用字符数组流获取解析的内容

// ByteArrayOutputStream baos = new ByteArrayOutputStream();

// XHTMLConverter.getInstance().convert(document, baos, options);

// String content = baos.toString();

// System.out.println(content);

// baos.close();

} else {

System.out.println("Enter only MS Office 2007+ files");

}

/**

* /**

* 2003版本word转换成html

* @throws IOException

* @throws TransformerException

* @throws ParserConfigurationException

@Test

public void Word2003ToHtml() throws IOException, TransformerException, ParserConfigurationException {

final String imagepath = "F:/test/image/";//解析时候如果doc文件中有图片图片会保存在此路径

String filepath = "F:/test/";

String fileName = "demo.doc";

String htmlName = "123.html";

final String file = filepath + fileName;

InputStream input = new FileInputStream(new File(file));

HWPFDocument wordDocument = new HWPFDocument(input);

WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());

//设置图片存放的位置

wordToHtmlConverter.setPicturesManager(new PicturesManager() {

public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {

File imgPath = new File(imagepath);

if(!imgPath.exists()){//图片目录不存在则创建

imgPath.mkdirs();

}

File file = new File(imagepath + suggestedName);

try {

OutputStream os = new FileOutputStream(file);

os.write(content);

os.close();

} catch (FileNotFoundException e) {

e.printStackTrace();

} catch (IOException e) {

e.printStackTrace();

}

return imagepath + suggestedName;

}

});

//解析word文档

wordToHtmlConverter.processDocument(wordDocument);

Document htmlDocument = wordToHtmlConverter.getDocument();

File htmlFile = new File(filepath + htmlName);

OutputStream outStream = new FileOutputStream(htmlFile);

//也可以使用字符数组流获取解析的内容

// ByteArrayOutputStream baos = new ByteArrayOutputStream();

// OutputStream outStream = new BufferedOutputStream(baos);

DOMSource domSource = new DOMSource(htmlDocument);

StreamResult streamResult = new StreamResult(outStream);

TransformerFactory factory = TransformerFactory.newInstance();

Transformer serializer = factory.newTransformer();

serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");

serializer.setOutputProperty(OutputKeys.INDENT, "yes");

serializer.setOutputProperty(OutputKeys.METHOD, "html");

serializer.transform(domSource, streamResult);

//也可以使用字符数组流获取解析的内容

// String content = baos.toString();

// System.out.println(content);

// baos.close();

outStream.close();

}

我只测试了docx ，没有问题

但是转pdf出现了一点小麻烦

这个方法是网上很多人都在用的

<dependency>
<groupId>com.aspose.words</groupId>
<artifactId>aspose-words-jdk16</artifactId>
<version>15.8.0</version>
</dependency>

package b2b.cn.util;

import java.io.File;

import java.io.FileOutputStream;

import java.io.InputStream;

import org.junit.Test;

import com.aspose.words.Document;

import com.aspose.words.License;

import com.aspose.words.SaveFormat;

/***

* 转成pdf工具类

* 如果注释掉验证转化成功但是有水印

* @author Ic055

public class GoPDF {

public static void main(String[] args) {

doc2pdf("E:/test/demo.docx");

}

public static boolean getLicense() {

boolean result = false;

try {

InputStream is = Test.class.getClassLoader().getResourceAsStream("license.xml"); // license.xml应放在..WebRootWEB-INFclasses路径下

License aposeLic = new License();

aposeLic.setLicense(is);

result = true;

} catch (Exception e) {

e.printStackTrace();

}

return result;

}

public static void doc2pdf(String Address) {

if (!getLicense()) {

// 验证License 若不验证则转化出的pdf文档会有水印产生 return; }

return;

}

try {

long old = System.currentTimeMillis();

File file = new File("E:/demo11.pdf"); //新建一个空白pdf文档

FileOutputStream os = new FileOutputStream(file);

Document doc = new Document(Address); //Address是将要被转化的word文档

doc.save(os, SaveFormat.PDF);//全面支持DOC, DOCX, OOXML, RTF HTML, OpenDocument, PDF, EPUB, XPS, SWF 相互转换

long now = System.currentTimeMillis();

System.out.println("共耗时：" + ((now - old) / 1000.0) + "秒"); //转化用时

} catch (Exception e) {

e.printStackTrace();

}

license.xml

<Data>

<Product>Aspose.Total for Java</Product>

<Product>Aspose.Words for Java</Product>

</Products>

<EditionType>Enterprise</EditionType>

</Data>

<Signature>sNLLKGMUdF0r8O1kKilWAGdgfs2BvJb/2Xp8p5iuDVfZXmhppo+d0Ran1P9TKdjV4ABwAgKXxJ3jcQTqE/2IRfqwnPf8itN8aFZlV3TJPYeD3yWE7IT55Gz6EijUpC7aKeoohTb4w2fpox58wWoF3SNp6sK6jDfiAUGEHYJ9pjU=</Signature>

</License>

在网上的绝大多数教程中，这个文件应该放在WebRoot/WEB-INF/classes目录下

但是发现一直找不到这个文件，后来偶然看到，对于maven项目来说，应该放在resources目录下

还有一个小问题就是

<dependency>
<groupId>com.aspose.words</groupId>
<artifactId>aspose-words-jdk16</artifactId>
<version>15.8.0</version>
</dependency>

这个可能会报错，所以呢，这个要本地下载jar资源

aspose-words-15.8.0-jdk16 我这个是在网上找到的资源然后用eclipse添加到maven仓库就可以用啦

我放在百度网盘分享给大家

链接：https://pan.baidu.com/s/1DncAhgqUqfELv193WtTcDQ
提取码：q41z

除了用eclipse添加到maven仓库，我见到还有一种处理方式

内置属性：主要有两个常用内置属性——${basedir}表示项目根目录，即包含pom.xml文件的目录;${version}表示项目版本。