// Converting Word documents to HTML files with Apache POI

package cn.wgd.util;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.converter.AbstractWordUtils;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.util.XMLHelper;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;

import fr.opensagres.poi.xwpf.converter.core.IXWPFConverter;
import fr.opensagres.poi.xwpf.converter.core.ImageManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;

/**
 * Converts Word documents to HTML so that attachments can be previewed.
 *
 * <p>{@code .docx} files are converted with the XDocReport {@link XHTMLConverter};
 * every other file is handed to Apache POI's {@link WordToHtmlConverter}.
 *
 * <p>Image handling reference: https://www.cnblogs.com/feiruo/p/5924514.html
 *
 * <p>Besides the POI (3.17) jars, this example needs:
 * <ul>
 *   <li>fr.opensagres.poi.xwpf.converter.core-2.0.1.jar</li>
 *   <li>fr.opensagres.poi.xwpf.converter.xhtml-2.0.1.jar</li>
 *   <li>fr.opensagres.xdocreport.core-2.0.1.jar</li>
 *   <li>ooxml-schemas-1.3.jar, etc.</li>
 * </ul>
 *
 * <p>Note: this is a simple implementation. Documents that need richer style
 * handling require additional work.
 *
 * @author Kevin 2018-3-14
 */
public class ConvertWord2HtmlUtil {

    /** File extension that selects the OOXML (XWPF) conversion path. */
    private static final String DOCX_EXTENSION = ".docx";

    /**
     * Demo entry point: converts a sample {@code .docx} file to HTML.
     *
     * @param args unused
     * @throws IOException if a file cannot be read or written
     * @throws ParserConfigurationException propagated from {@link #word2007ToHtml}
     * @throws TransformerException propagated from {@link #word2007ToHtml}
     * @throws SAXException propagated from {@link #word2007ToHtml}
     */
    public static void main(String[] args) throws IOException, ParserConfigurationException, TransformerException, SAXException {
        // Backslashes must be escaped: an unescaped "\t" is compiled as a TAB
        // character and silently produces a wrong path.
        String path = "D:\\testfile2html\\test.docx";
        String descPath = "D:\\testfile2html\\test.html";
        String imagePath = "D:\\testfile2html";
        word2007ToHtml(path, descPath, imagePath);
    }

    /**
     * Converts a binary Word file to HTML. Modelled on
     * {@code org.apache.poi.hwpf.converter.WordToHtmlConverter.main()}.
     *
     * @param path path of the source Word file; must not be {@code null}
     * @param descPath path of the HTML file to write; must not be {@code null}
     * @throws NullPointerException if {@code path} or {@code descPath} is {@code null}
     * @throws IOException if the source cannot be read or the target cannot be written
     * @throws ParserConfigurationException if no DOM document builder can be created
     * @throws TransformerException if the DOM tree cannot be serialized
     */
    public static void word95T2007ToHtml(String path, String descPath)
            throws IOException, ParserConfigurationException, TransformerException {
        if (path == null) {
            throw new NullPointerException("路径不能为空!");
        }
        // Fail before doing the conversion work instead of after it.
        if (descPath == null) {
            throw new NullPointerException("descPath must not be null");
        }

        System.out.println( "Converting " + path );
        System.out.println( "Saving output to " + descPath );

        Document doc = process(new File(path));

        TransformerFactory tf = TransformerFactory.newInstance();
        Transformer serializer = tf.newTransformer();
        // TODO set encoding from a command argument
        serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");

        // Own the output stream explicitly so it is always closed, rather than
        // leaving the opening/closing of the file to the transformer.
        try (FileOutputStream out = new FileOutputStream(descPath)) {
            serializer.transform(new DOMSource(doc), new StreamResult(out));
        }
    }

    /**
     * Loads a Word file and converts it to an HTML DOM tree. Taken from
     * {@code org.apache.poi.hwpf.converter.WordToHtmlConverter}.
     *
     * @param docFile the Word file to load
     * @return the DOM document produced by the converter
     * @throws IOException if the file cannot be loaded
     * @throws ParserConfigurationException if no DOM document builder can be created
     */
    static Document process( File docFile ) throws IOException, ParserConfigurationException
    {
        final HWPFDocumentCore wordDocument = AbstractWordUtils.loadDoc( docFile );
        // The converter fills this empty DOM document with the generated HTML.
        Document target = XMLHelper.getDocumentBuilderFactory().newDocumentBuilder().newDocument();
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter( target );
        wordToHtmlConverter.processDocument( wordDocument );
        return wordToHtmlConverter.getDocument();
    }

    /**
     * Converts a Word file to HTML, choosing the converter from the file extension:
     * {@code .docx} (any letter case) uses the XWPF/XHTML converter, anything else
     * is delegated to {@link #word95T2007ToHtml(String, String)}.
     *
     * <p>If the source file does not exist, a message is printed and the method
     * returns without writing anything.
     *
     * @param path source file path (doc or docx); must not be {@code null}
     * @param descPath path of the HTML file to write
     * @param imagePath base directory for extracted images; when {@code null} the
     *        directory containing {@code descPath} is used
     * @throws NullPointerException if {@code path} is {@code null}, or if
     *         {@code descPath} is {@code null} for an existing source file
     * @throws IOException if a file cannot be read or written
     * @throws ParserConfigurationException propagated from the doc conversion
     * @throws TransformerException propagated from the doc conversion
     * @throws SAXException declared by the signature; kept for caller compatibility
     */
    public static void word2007ToHtml(String path, String descPath, String imagePath)
            throws IOException, ParserConfigurationException, TransformerException, SAXException {
        if (path == null) {
            throw new NullPointerException("路径不能为空!");
        }
        File sourceFile = new File(path);
        if (!sourceFile.exists()) {
            System.out.println("用户文件不存在!");
            return;
        }
        if (!hasDocxExtension(path)) {
            word95T2007ToHtml(path, descPath);
            return;
        }
        if (descPath == null) {
            throw new NullPointerException("descPath must not be null");
        }

        File imageBaseDir = (imagePath != null)
                ? new File(imagePath)
                : new File(descPath).getAbsoluteFile().getParentFile();

        // HTML converter
        IXWPFConverter<XHTMLOptions> converter = XHTMLConverter.getInstance();
        // HTML conversion options
        XHTMLOptions options = XHTMLOptions.create();
        // Image handling. Per the original author's note, the second argument is a
        // sub-directory that must sit next to the HTML file, otherwise the images
        // cannot be found from the generated page.
        ImageManager imageManager = new ImageManager(imageBaseDir, "image");
        options.setImageManager(imageManager);

        // Close the input stream, the document and the output stream even when
        // the conversion fails.
        try (FileInputStream in = new FileInputStream(sourceFile);
             XWPFDocument document = new XWPFDocument(in);
             FileOutputStream out = new FileOutputStream(descPath)) {
            converter.convert(document, out, options);
        }
    }

    /** Returns whether {@code path} ends with ".docx", ignoring letter case. */
    private static boolean hasDocxExtension(String path) {
        int extLength = DOCX_EXTENSION.length();
        // regionMatches returns false for a negative offset, so short paths are safe.
        return path.regionMatches(true, path.length() - extLength, DOCX_EXTENSION, 0, extLength);
    }
}
// Original article: https://www.cnblogs.com/Kevin-1992/p/12608382.html