poi将docx转WordML(可通过office打开成word的xml)
钢翼
编程
由于是docx,用的是XWPFDocument读取文档,而WordToFoConverter只支持HWPF(.doc)文档模型,所以不能通过WordToFoConverter转xml。
通过document.getDocument().toString()我们可以获取到以下格式的字符串。
<xml-fragment mc:Ignorable="w14 w15 wp14" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" xmlns:wpsCustomData="http://www.wps.cn/officeDocument/2013/wpsCustomData">
<w:body>
……
</w:body>
</xml-fragment>
而我们直接通过office保存的xml格式如下
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<?mso-application progid="Word.Document"?>
<w:wordDocument xmlns:w="http://schemas.microsoft.com/office/word/2003/wordml" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:sl="http://schemas.microsoft.com/schemaLibrary/2003/core" xmlns:aml="http://schemas.microsoft.com/aml/2001/core" xmlns:wx="http://schemas.microsoft.com/office/word/2003/auxHint" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:dt="uuid:C2F41010-65B3-11d1-A29F-00AA00C14882" w:macrosPresent="no" w:embeddedObjPresent="no" w:ocxPresent="no" xml:space="preserve" xmlns:wpsCustomData="http://www.wps.cn/officeDocument/2013/wpsCustomData">
<o:DocumentProperties>
……
</o:DocumentProperties>
<o:CustomDocumentProperties>
……
</o:CustomDocumentProperties>
<w:fonts>
……
</w:fonts>
<w:styles>
……
</w:styles>
<w:docPr>
……
</w:docPr>
<w:body>
……
</w:body>
</w:wordDocument>
可以看出两个xml根节点不同,office保存的xml还多了很多其他节点,实际上除了<w:body>其他节点我们都不需要。
所以转换WordML可以通过以下代码自行处理。
/**
 * Builds a WordML string from the given document by taking the XML text of its
 * main document part, removing the {@code <xml-fragment>} wrapper element and
 * re-wrapping what is left in a {@code <w:wordDocument>} root preceded by the
 * XML declaration and the {@code mso-application} processing instruction.
 *
 * <p>NOTE(review): the new root binds the {@code w} prefix to the 2003 WordML
 * namespace and only declares the prefixes listed in its literal below. Any
 * prefix that was declared solely on the removed wrapper is left undeclared;
 * confirm the body content does not depend on such prefixes.
 *
 * @param document the document whose main part is serialized
 * @return the re-wrapped XML text
 */
public static String asXML(XWPFDocument document) {
    // XML text that POI produces for the main document part.
    final String poiXml = document.getDocument().toString();

    // Strip the wrapper's opening tag (with all of its attributes) and its closing tag.
    final String body = poiXml
            .replaceAll("<xml-fragment.*?>", "")
            .replaceAll("</xml-fragment>", "");

    // WordML prolog: XML declaration plus the processing instruction naming Word.
    final String prolog = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n"
            + "<?mso-application progid=\"Word.Document\"?>\n";
    final String rootOpen = "<w:wordDocument xmlns:w=\"http://schemas.microsoft.com/office/word/2003/wordml\" xmlns:v=\"urn:schemas-microsoft-com:vml\" xmlns:w10=\"urn:schemas-microsoft-com:office:word\" xmlns:sl=\"http://schemas.microsoft.com/schemaLibrary/2003/core\" xmlns:aml=\"http://schemas.microsoft.com/aml/2001/core\" xmlns:wx=\"http://schemas.microsoft.com/office/word/2003/auxHint\" xmlns:o=\"urn:schemas-microsoft-com:office:office\" xmlns:dt=\"uuid:C2F41010-65B3-11d1-A29F-00AA00C14882\" w:macrosPresent=\"no\" w:embeddedObjPresent=\"no\" w:ocxPresent=\"no\" xml:space=\"preserve\" xmlns:wpsCustomData=\"http://www.wps.cn/officeDocument/2013/wpsCustomData\">";
    final String rootClose = "</w:wordDocument>";

    // Assemble prolog + new root + original content + closing root.
    return new StringBuilder(prolog.length() + rootOpen.length() + body.length() + rootClose.length())
            .append(prolog)
            .append(rootOpen)
            .append(body)
            .append(rootClose)
            .toString();
}
也可以用下面的格式(Flat OPC,即以<pkg:package>为根节点的单文件XML,此时保留docx原有的根节点属性和命名空间,只需把<xml-fragment>改名为<w:document>)
/**
 * Builds a single-file XML package ({@code <pkg:package>} root) from the given
 * document. The XML text of the main document part has its
 * {@code <xml-fragment>} wrapper renamed to {@code <w:document>} (attributes
 * and namespace declarations on the wrapper are kept as they are) and is then
 * embedded as the {@code /word/document.xml} part, next to a
 * {@code /_rels/.rels} part that points at it.
 *
 * <p>Only these two parts are emitted; nothing else from the document is
 * copied into the package.
 *
 * @param document the document whose main part is serialized
 * @return the package XML text
 */
public static String asXML2(XWPFDocument document) {
    String xmlString = document.getDocument().toString();
    // Rename the wrapper element in place so its attributes and xmlns declarations survive.
    String result = xmlString.replace("<xml-fragment", "<w:document").replace("</xml-fragment>", "</w:document>");
    result = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n" +
            "<?mso-application progid=\"Word.Document\"?>\n" +
            "<pkg:package xmlns:pkg=\"http://schemas.microsoft.com/office/2006/xmlPackage\">\n" +
            // Package-level relationships: declares word/document.xml as the main document.
            "<pkg:part pkg:name=\"/_rels/.rels\" pkg:contentType=\"application/vnd.openxmlformats-package.relationships+xml\">\n" +
            "<pkg:xmlData>\n" +
            "<Relationships xmlns=\"http://schemas.openxmlformats.org/package/2006/relationships\">\n" +
            "<Relationship Id=\"rId1\" Type=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument\" Target=\"word/document.xml\"/>\n" +
            "</Relationships>\n" +
            "</pkg:xmlData>\n" +
            "</pkg:part>\n" +
            // Main document part; the embedded text already ends with its own </w:document>.
            "<pkg:part pkg:name=\"/word/document.xml\" pkg:contentType=\"application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml\">\n" +
            "<pkg:xmlData>" + result + "</pkg:xmlData>\n" +
            "</pkg:part>\n" +
            // No extra </w:document> here: a second closing tag outside the part
            // would be unmatched and make the package ill-formed.
            "</pkg:package>";
    return result;
}