如何用Java将html转为word(保留html页面中文字原样式)

目前在项目中遇到一个需求:用户可以将网站发布的文章另存为word文件,并且要保留网站页面文字的原始样式,同时要给word增加页眉页脚。我目前的实现思路是:①将html代码读取出来 ②生成一个空的word,并将html代码存入word ③给生成的word添加页眉页脚,但是在执行到第三步的时候出现了问题,写入页眉页脚的时候失败了(代码并未报错,就是无法写入)。希望实现过类似需求的小伙伴能给指点一二。

import java.io.*;

import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;

/**
 * Demo that stores an HTML fragment inside a POI {@link POIFSFileSystem} container
 * under the entry name {@code "WordDocument"} and writes that container to disk.
 *
 * <p>NOTE(review): the file is written by {@code POIFSFileSystem} (POI's OLE2 compound-file
 * writer) but is saved with a {@code .docx} extension, which normally denotes an OOXML zip
 * package. Libraries that pick a parser from the extension may fail to read the result as a
 * real DOCX, so later edits such as adding headers/footers may not work — confirm before
 * relying on this output as input to another library.
 */
public class Html2Word {

    /**
     * Runs the conversion on a hard-coded sample article and writes it as {@code F:\xxxx.docx}.
     */
    public static void main(String[] args) throws Exception {
        String html = "<div id=\"doccon\" style=\"padding-top:40px;\"><div class=\"Custom_UnionStyle\"><div style=\"text-align: center\">  <b>中华人民共和国自然资源部令</b>\n" +
                "</div><div style=\"text-align: center\"><b>  第 6 号</b>\n" +
                "</div><div>  《自然资源部关于第二批废止和修改的部门规章的决定》已经2020年自然资源部第1次部务会议审议通过,现予公布,自公布之日起施行。\n" +
                "</div><div style=\"text-align: right\"><span style=\"font-size: 12pt\">部长&nbsp; 陆&nbsp; 昊</span>\n" +
                "</div><div style=\"text-align: right\"><span style=\"font-size: 12pt\">2020年3月20日</span>\n" +
                "</div><div style=\"text-align: center\">  \n" +
                "</div>\n" +
                "</div>\n" +
                "</div>";
        createWordForHtml(html, "xxxx");
    }


    /**
     * Wraps {@code html} in {@code <html><body>…</body></html>}, encodes it as GBK and writes it
     * to {@code F:\<fileName>.docx} inside a POIFS container.
     *
     * <p>Failures are not propagated: any exception is printed to stderr and the method returns.
     *
     * @param html     HTML fragment; the entities {@code &lt; &gt; &quot; &amp;} are unescaped first
     * @param fileName output file name without extension
     */
    public static void createWordForHtml(String html,String fileName) {
        try {
            String savePath = "F:\\"+fileName+".docx";
            File file = new File(savePath);

            // "&amp;" is replaced last so that e.g. "&amp;lt;" becomes "&lt;" rather than "<".
            html = html.replace("&lt;", "<").replace("&gt;", ">").replace("&quot;", "\"").replace("&amp;", "&");
            String content="<html><body>"+html+"</body></html>";
            // An explicit encoding is required here; otherwise exported Chinese text is garbled.
            byte[] encoded = content.getBytes("GBK");

            // Key step: store the HTML bytes as the "WordDocument" entry of the container.
            POIFSFileSystem poifs = new POIFSFileSystem();
            DirectoryEntry root = poifs.getRoot();
            root.createDocument("WordDocument", new ByteArrayInputStream(encoded));

            // try-with-resources closes the file even when writing fails.
            // FileOutputStream creates the file itself, so no createNewFile() call is needed.
            try (OutputStream ostream = new FileOutputStream(file)) {
                poifs.writeFilesystem(ostream);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

import com.spire.doc.*;
import com.spire.doc.documents.*;
import com.spire.doc.fields.TextRange;

import java.awt.*;

/**
 * Demo that opens {@code F:\xxxx.docx}, adds a text header and a "page/total pages" footer to
 * its first section, and saves the result as {@code F:\temp4.docx}.
 */
public class HeaderFooterUtil {

    public static void main(String[] args){
        // Load the test document and take its first section.
        Document document = new Document("F:\\xxxx.docx");
        Section firstSection = document.getSections().get(0);

        decorateHeader(firstSection);
        decorateFooter(firstSection);

        // Save the modified document.
        document.saveToFile("F:\\temp4.docx",FileFormat.Docx_2010);
    }

    /** Appends a right-aligned, bold, gray text paragraph with a bottom border to the header. */
    private static void decorateHeader(Section section) {
        HeaderFooter pageHeader = section.getHeadersFooters().getHeader();
        Paragraph headerLine = pageHeader.addParagraph();

        // Header text: font, size, weight and colour.
        TextRange label = headerLine.appendText("XXXX");
        label.getCharacterFormat().setUnderlineStyle(UnderlineStyle.None);
        label.getCharacterFormat().setTextColor(Color.GRAY);
        label.getCharacterFormat().setFontName("仿宋");
        label.getCharacterFormat().setFontSize(12f);
        label.getCharacterFormat().setBold(true);
        headerLine.getFormat().setHorizontalAlignment(HorizontalAlignment.Right);

        // Bottom rule of the header (type, width) and border spacing.
        headerLine.getFormat().getBorders().getBottom().setBorderType(BorderStyle.Single);
        headerLine.getFormat().getBorders().getBottom().setLineWidth(0.5f);
        headerLine.getFormat().getBorders().setSpace(2f);
    }

    /** Appends a right-aligned "page/total pages" paragraph with a top border to the footer. */
    private static void decorateFooter(Section section) {
        HeaderFooter pageFooter = section.getHeadersFooters().getFooter();
        Paragraph footerLine = pageFooter.addParagraph();

        // Current page number, a separator, then the total page count.
        footerLine.appendField("页码",FieldType.Field_Page);
        footerLine.appendText("/");
        footerLine.appendField("总页数",FieldType.Field_Num_Pages);
        footerLine.getFormat().setHorizontalAlignment(HorizontalAlignment.Right);

        // Top rule of the footer: type, width and spacing.
        footerLine.getFormat().getBorders().getTop().setBorderType(BorderStyle.Single);
        footerLine.getFormat().getBorders().getTop().setLineWidth(1f);
        footerLine.getFormat().getBorders().getTop().setSpace(2f);
    }
}

引入依赖

<dependency>
  <groupId>e-iceblue</groupId>
  <artifactId>spire.doc.free</artifactId>
  <version>2.7.3</version>
</dependency>

导出代码

/**
 * Converts an HTML string to a DOCX document and sends it to the client as a file download.
 *
 * <p>The document is rendered fully into memory before any response header is set, so a
 * conversion failure does not leave a half-written download. Failures are not propagated:
 * any exception is printed to stderr and the method returns.
 *
 * @param request  current request; only its character encoding is set
 * @param response response that receives the download headers and the DOCX bytes
 * @param content  HTML markup appended to the document's single paragraph
 * @param fileName download file name without extension; it is URL-encoded as UTF-8
 */
public static void exportWord(HttpServletRequest request, HttpServletResponse response, String content, String fileName) {
    try {
        // Build a one-section document and let the library convert the HTML into it.
        Document document = new Document();
        Section sec = document.addSection();
        sec.addParagraph().appendHTML(content);

        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        document.saveToStream(buffer, FileFormat.Docx);

        // Response headers for the download.
        request.setCharacterEncoding("utf-8");
        // The payload is OOXML (.docx), so use its media type, not the legacy "application/msword".
        response.setContentType("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
        response.addHeader("Content-Disposition", "attachment;filename=" +
                URLEncoder.encode(fileName, "utf-8") + ".docx");

        // try-with-resources closes the stream even if the client disconnects mid-write.
        try (ServletOutputStream ostream = response.getOutputStream()) {
            buffer.writeTo(ostream);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}



/**
     * Opening markup for Word-flavoured HTML: an {@code <html>} tag declaring the Office
     * namespaces, followed by {@code <head>}, the Word ProgId/Generator/Originator meta tags and
     * a conditional {@code <w:WordDocument>} block that selects the Print view.
     *
     * <p>The {@code <head>} element opened here is not closed by this constant; presumably the
     * markup it is prepended to still carries its own {@code </head>} — verify against callers.
     * The conditional comment is terminated with {@code <![endif]-->}.
     */
    public static final String HTML_TAG_BGN = "<html xmlns=\"http://www.w3.org/TR/REC-html40\" xmlns:v=\"urn:schemas-microsoft-com:vml\" xmlns:o=\"urn:schemas-microsoft-com:office:office\" xmlns:w=\"urn:schemas-microsoft-com:office:word\" xmlns:m=\"http://schemas.microsoft.com/office/2004/12/omml\"><head><meta name=\"ProgId\" content=\"Word.Document\" /><meta name=\"Generator\" content=\"Microsoft Word 12\" /><meta name=\"Originator\" content=\"Microsoft Word 12\" /> <!--[if gte mso 9]><xml><w:WordDocument><w:View>Print</w:View></w:WordDocument></xml><![endif]-->";
     
 
/**
 * Turns printable HTML into Word-flavoured HTML and hands it to {@code upload} as a
 * {@code .doc} file.
 *
 * <p>The literal {@code <head>} tag is removed and the literal {@code <html>} tag is replaced
 * by {@link #HTML_TAG_BGN}, which supplies its own {@code <head>} opening.
 *
 * @param htmlForPrint HTML document text to convert
 * @return whatever {@code upload} returns, or {@code null} if any exception occurs
 */
public filePath downloadWordReport(String htmlForPrint) {
    try {
        // Plain literal replacement: no regex is needed, and replace() never interprets
        // '$' or '\' in the replacement text.
        String wordString = htmlForPrint.replace("<head>", "").replace("<html>", HTML_TAG_BGN);
        // Use the literal directly. Round-tripping it through getBytes() (platform charset)
        // and new String(..., "UTF-8") corrupts the name when the platform charset is not UTF-8.
        String fileName = "测试文件.doc";
        // NOTE(review): getBytes() encodes with the platform default charset and the markup
        // declares no charset, so the encoding of the uploaded file varies by host — confirm
        // which encoding the consumer expects before pinning one.
        return this.upload(new ByteArrayInputStream(wordString.getBytes()), fileName);

    } catch (Exception e) {
        // Keep the null-on-failure contract, but leave a trace instead of failing silently.
        e.printStackTrace();
        return null;
    }
}