poi将docx转WordML(可通过office打开成word的xml)

返回
Author Avatar
钢翼
2021-08-06
编程
90

由于处理的是docx文件,只能用XWPFDocument读取文档,而WordToFoConverter只支持HWPFDocument(即doc格式),所以不能通过WordToFoConverter转xml。

通过document.getDocument().toString()我们可以获取到以下格式的字符串。

<xml-fragment mc:Ignorable="w14 w15 wp14" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" xmlns:wpsCustomData="http://www.wps.cn/officeDocument/2013/wpsCustomData">
  <w:body>
	……
  </w:body>
</xml-fragment>	

而我们直接通过office保存的xml格式如下

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<?mso-application progid="Word.Document"?>
<w:wordDocument xmlns:w="http://schemas.microsoft.com/office/word/2003/wordml" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:sl="http://schemas.microsoft.com/schemaLibrary/2003/core"  xmlns:aml="http://schemas.microsoft.com/aml/2001/core" xmlns:wx="http://schemas.microsoft.com/office/word/2003/auxHint" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:dt="uuid:C2F41010-65B3-11d1-A29F-00AA00C14882" w:macrosPresent="no" w:embeddedObjPresent="no" w:ocxPresent="no" xml:space="preserve" xmlns:wpsCustomData="http://www.wps.cn/officeDocument/2013/wpsCustomData">
  <o:DocumentProperties>
    ……
  </o:DocumentProperties>
  <o:CustomDocumentProperties>
    ……
  </o:CustomDocumentProperties>
  <w:fonts>
    ……
  </w:fonts>
  <w:styles>
    ……
  </w:styles>
  <w:docPr>
    ……
  </w:docPr>
  <w:body>
	……
  </w:body>
</w:wordDocument>

可以看出两个xml根节点不同,office保存的xml还多了很多其他节点,实际上除了<w:body>其他节点我们都不需要。

所以转换WordML可以通过以下代码自行处理。

 /**
  * Builds a Word 2003 XML ({@code w:wordDocument}) string from a POI document.
  *
  * <p>The document's XML is taken from {@code document.getDocument().toString()},
  * the {@code xml-fragment} wrapper element is stripped, and what remains is
  * placed inside an XML declaration, the {@code mso-application} processing
  * instruction and a {@code w:wordDocument} root element.
  *
  * <p>NOTE(review): the inner content keeps the namespace prefixes it was
  * serialized with, while the prefix declarations of the removed
  * {@code xml-fragment} element are dropped; the new root binds {@code w} to the
  * 2003 WordML namespace. Confirm the target application accepts this for the
  * documents you process.
  *
  * @param document the document to convert
  * @return the wrapped XML text
  */
 public static String asXML(XWPFDocument document) {
    // XML as produced by POI, wrapped in an <xml-fragment ...> element.
    String fragment = document.getDocument().toString();

    // Drop the wrapper: first the opening tag (with all its attributes), then the closing tag.
    String withoutOpeningTag = fragment.replaceAll("<xml-fragment.*?>", "");
    String innerXml = withoutOpeningTag.replaceAll("</xml-fragment>", "");

    // Assemble: XML declaration, Word processing instruction, root element, content, closing root.
    StringBuilder wordMl = new StringBuilder();
    wordMl.append("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n");
    wordMl.append("<?mso-application progid=\"Word.Document\"?>\n");
    wordMl.append("<w:wordDocument xmlns:w=\"http://schemas.microsoft.com/office/word/2003/wordml\" xmlns:v=\"urn:schemas-microsoft-com:vml\" xmlns:w10=\"urn:schemas-microsoft-com:office:word\" xmlns:sl=\"http://schemas.microsoft.com/schemaLibrary/2003/core\"  xmlns:aml=\"http://schemas.microsoft.com/aml/2001/core\" xmlns:wx=\"http://schemas.microsoft.com/office/word/2003/auxHint\" xmlns:o=\"urn:schemas-microsoft-com:office:office\" xmlns:dt=\"uuid:C2F41010-65B3-11d1-A29F-00AA00C14882\" w:macrosPresent=\"no\" w:embeddedObjPresent=\"no\" w:ocxPresent=\"no\" xml:space=\"preserve\" xmlns:wpsCustomData=\"http://www.wps.cn/officeDocument/2013/wpsCustomData\">");
    wordMl.append(innerXml);
    wordMl.append("</w:wordDocument>");
    return wordMl.toString();
}

也可以用下面的Flat OPC格式(以pkg:package为根节点,同样可以用office直接打开)

 /**
  * Builds a Flat OPC ({@code pkg:package}) XML string from a POI document.
  *
  * <p>The document's XML is taken from {@code document.getDocument().toString()},
  * its {@code xml-fragment} wrapper element is renamed to {@code w:document}
  * (attributes are kept), and the result is embedded as the
  * {@code /word/document.xml} part of a package that also carries the
  * {@code /_rels/.rels} relationship part pointing at it.
  *
  * <p>Fix over the previous version: a stray {@code </w:document>} closing tag
  * was emitted after the {@code </pkg:part>} of the document part, although the
  * renamed fragment already closes {@code w:document} inside
  * {@code pkg:xmlData}. That extra tag made the package XML malformed and has
  * been removed.
  *
  * @param document the document to convert
  * @return the Flat OPC package XML text
  */
 public static String asXML2(XWPFDocument document) {
    // XML as produced by POI, wrapped in an <xml-fragment ...> element.
    String xmlString = document.getDocument().toString();

    // Rename the wrapper element to <w:document>. "<xml-fragment" only matches the
    // opening tag (the closing tag starts with "</"), so the two replacements are independent.
    String documentXml = xmlString
            .replace("<xml-fragment", "<w:document")
            .replace("</xml-fragment>", "</w:document>");

    return "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n" +
                "<?mso-application progid=\"Word.Document\"?>\n" +
                "<pkg:package xmlns:pkg=\"http://schemas.microsoft.com/office/2006/xmlPackage\">\n" +
                // Package-level relationships: declares word/document.xml as the main document part.
                "<pkg:part pkg:name=\"/_rels/.rels\" pkg:contentType=\"application/vnd.openxmlformats-package.relationships+xml\">\n" +
                "<pkg:xmlData>\n" +
                "<Relationships xmlns=\"http://schemas.openxmlformats.org/package/2006/relationships\">\n" +
                "<Relationship Id=\"rId1\" Type=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument\" Target=\"word/document.xml\"/>\n" +
                "</Relationships>\n" +
                "</pkg:xmlData>\n" +
                "</pkg:part>\n" +
                // Main document part; documentXml already contains its own </w:document>.
                "<pkg:part pkg:name=\"/word/document.xml\" pkg:contentType=\"application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml\">\n" +
                "<pkg:xmlData>" + documentXml + "</pkg:xmlData>\n" +
                "</pkg:part>\n" +
                "</pkg:package>";
}