word转html 和pdf

今天有个新的需求,就是要把word进行预览,为了实现打印,需要转成pdf或html,在网上找了一些方法,这里做个记录

首先是转html,看起来挺简单的
 
首先是两个maven包
 
<!-- java word文档 转 html文件 -->
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>fr.opensagres.xdocreport.document</artifactId>
<version>1.0.5</version>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
<version>1.0.5</version>
</dependency>
 
 
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.12</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.12</version>
</dependency>
 
然后就是转换demo
package b2b.cn.util;
 
 
 
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
 
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
 
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.Test;
import org.w3c.dom.Document;
 
/**
* word 转换成html
*/
public class GoHTML {
 
public static void main(String[] args) {
try {
// new GoHTML().Word2003ToHtml(); //doc
 
new GoHTML().Word2007ToHtml();
 
 
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
 
/**
* 2007版本word转换成html
* @throws IOException
*/
@Test
public void Word2007ToHtml() throws IOException {
String filepath = "E:/test/";
String fileName = "demo.docx";
String htmlName = "123.html";
final String file = filepath + fileName;
File f = new File(file);
if (!f.exists()) {
System.out.println("Sorry File does not Exists!");
} else {
if (f.getName().endsWith(".docx") || f.getName().endsWith(".DOCX")) {
 
// 1) 加载word文档生成 XWPFDocument对象
InputStream in = new FileInputStream(f);
XWPFDocument document = new XWPFDocument(in);
 
// 2) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录)
File imageFolderFile = new File(filepath);
XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile));
options.setExtractor(new FileImageExtractor(imageFolderFile));
options.setIgnoreStylesIfUnused(false);
options.setFragment(true);
 
// 3) 将 XWPFDocument转换成XHTML
OutputStream out = new FileOutputStream(new File(filepath + htmlName));
XHTMLConverter.getInstance().convert(document, out, options);
 
//也可以使用字符数组流获取解析的内容
// ByteArrayOutputStream baos = new ByteArrayOutputStream();
// XHTMLConverter.getInstance().convert(document, baos, options);
// String content = baos.toString();
// System.out.println(content);
// baos.close();
} else {
System.out.println("Enter only MS Office 2007+ files");
}
}
}
 
/**
* /**
* 2003版本word转换成html
* @throws IOException
* @throws TransformerException
* @throws ParserConfigurationException
*/
@Test
public void Word2003ToHtml() throws IOException, TransformerException, ParserConfigurationException {
final String imagepath = "F:/test/image/";//解析时候如果doc文件中有图片 图片会保存在此路径
String filepath = "F:/test/";
String fileName = "demo.doc";
String htmlName = "123.html";
final String file = filepath + fileName;
InputStream input = new FileInputStream(new File(file));
HWPFDocument wordDocument = new HWPFDocument(input);
WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
//设置图片存放的位置
wordToHtmlConverter.setPicturesManager(new PicturesManager() {
public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
File imgPath = new File(imagepath);
if(!imgPath.exists()){//图片目录不存在则创建
imgPath.mkdirs();
}
File file = new File(imagepath + suggestedName);
try {
OutputStream os = new FileOutputStream(file);
os.write(content);
os.close();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
return imagepath + suggestedName;
}
});
 
//解析word文档
wordToHtmlConverter.processDocument(wordDocument);
Document htmlDocument = wordToHtmlConverter.getDocument();
 
File htmlFile = new File(filepath + htmlName);
OutputStream outStream = new FileOutputStream(htmlFile);
 
//也可以使用字符数组流获取解析的内容
// ByteArrayOutputStream baos = new ByteArrayOutputStream();
// OutputStream outStream = new BufferedOutputStream(baos);
 
DOMSource domSource = new DOMSource(htmlDocument);
StreamResult streamResult = new StreamResult(outStream);
 
TransformerFactory factory = TransformerFactory.newInstance();
Transformer serializer = factory.newTransformer();
serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
serializer.setOutputProperty(OutputKeys.INDENT, "yes");
serializer.setOutputProperty(OutputKeys.METHOD, "html");
 
serializer.transform(domSource, streamResult);
 
//也可以使用字符数组流获取解析的内容
// String content = baos.toString();
// System.out.println(content);
// baos.close();
outStream.close();
}
 
}
我只测试了docx ,没有问题
 
 
但是转pdf出现了一点小麻烦
 
这个方法是网上很多人都在用的
<!-- 转pdf -->
<!-- https://mvnrepository.com/artifact/com.aspose/aspose-words -->
<dependency>
<groupId>com.aspose.words</groupId>
<artifactId>aspose-words-jdk16</artifactId>
<version>15.8.0</version>
</dependency>
 
package b2b.cn.util;
 
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
 
import org.junit.Test;
 
import com.aspose.words.Document;
import com.aspose.words.License;
import com.aspose.words.SaveFormat;
/***
* 转成pdf工具类
* 如果注释掉验证 转化成功但是有水印
* @author Ic055
*
*/
public class GoPDF {
 
public static void main(String[] args) {
doc2pdf("E:/test/demo.docx");
}
public static boolean getLicense() {
boolean result = false;
try {
InputStream is = Test.class.getClassLoader().getResourceAsStream("license.xml"); // license.xml应放在..WebRootWEB-INFclasses路径下
License aposeLic = new License();
aposeLic.setLicense(is);
result = true;
} catch (Exception e) {
e.printStackTrace();
}
return result;
}
public static void doc2pdf(String Address) {
 
if (!getLicense()) {
// 验证License 若不验证则转化出的pdf文档会有水印产生 return; }
return;
 
}
try {
long old = System.currentTimeMillis();
File file = new File("E:/demo11.pdf"); //新建一个空白pdf文档
FileOutputStream os = new FileOutputStream(file);
Document doc = new Document(Address); //Address是将要被转化的word文档
doc.save(os, SaveFormat.PDF);//全面支持DOC, DOCX, OOXML, RTF HTML, OpenDocument, PDF, EPUB, XPS, SWF 相互转换
long now = System.currentTimeMillis();
System.out.println("共耗时:" + ((now - old) / 1000.0) + "秒"); //转化用时
} catch (Exception e) {
e.printStackTrace();
}
 
}
 
 
 
}
 
 
 
license.xml
 
<License>
<Data>
<Products>
<Product>Aspose.Total for Java</Product>
<Product>Aspose.Words for Java</Product>
</Products>
<EditionType>Enterprise</EditionType>
<SubscriptionExpiry>20991231</SubscriptionExpiry>
<LicenseExpiry>20991231</LicenseExpiry>
<SerialNumber>8bfe198c-7f0c-4ef8-8ff0-acc3237bf0d7</SerialNumber>
</Data>
<Signature>sNLLKGMUdF0r8O1kKilWAGdgfs2BvJb/2Xp8p5iuDVfZXmhppo+d0Ran1P9TKdjV4ABwAgKXxJ3jcQTqE/2IRfqwnPf8itN8aFZlV3TJPYeD3yWE7IT55Gz6EijUpC7aKeoohTb4w2fpox58wWoF3SNp6sK6jDfiAUGEHYJ9pjU=</Signature>
</License>
 
在网上当的绝大多数教程中 这个文件应该放在WebRoot/WEB-INFO/classes目录下
但是发现一直找不到这个文件,后来偶然看到,对于maven项目来说,应该放在resouce目录下
 
还有一个小问题 就是

<dependency>
<groupId>com.aspose.words</groupId>
<artifactId>aspose-words-jdk16</artifactId>
<version>15.8.0</version>
</dependency>

这个可能会报错,所以呢,这个要本地下载jar资源

aspose-words-15.8.0-jdk16 我这个是在网上找到的资源 然后用eclipse添加到maven仓库就可以用啦

我放在百度网盘分享给大家

链接:https://pan.baidu.com/s/1DncAhgqUqfELv193WtTcDQ
提取码:q41z

除了用eclipse添加到maven仓库,我见到还有一种处理方式

  • 内置属性:主要有两个常用内置属性——${basedir}表示项目根目录,即包含pom.xml文件的目录;${version}表示项目版本。
 

 
 
 
 
 
 
路径
 
 

原文地址:https://www.cnblogs.com/Mr-Y1907/p/11263807.html