POIWordToHtml.java 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321
  1. package com.steerinfo.dil.util;
  2. import fr.opensagres.poi.xwpf.converter.xhtml.Base64EmbedImgManager;
  3. import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
  4. import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
  5. import org.apache.poi.hwpf.HWPFDocumentCore;
  6. import org.apache.poi.hwpf.converter.WordToHtmlConverter;
  7. import org.apache.poi.hwpf.converter.WordToHtmlUtils;
  8. import org.apache.poi.hwpf.usermodel.Picture;
  9. import org.apache.poi.xwpf.usermodel.*;
  10. import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDecimalNumber;
  11. import org.w3c.dom.Document;
  12. import org.w3c.dom.Element;
  13. import javax.xml.parsers.DocumentBuilderFactory;
  14. import javax.xml.transform.OutputKeys;
  15. import javax.xml.transform.Transformer;
  16. import javax.xml.transform.TransformerFactory;
  17. import javax.xml.transform.dom.DOMSource;
  18. import javax.xml.transform.stream.StreamResult;
  19. import java.io.*;
  20. import java.math.BigInteger;
  21. import java.util.Base64;
  22. import java.util.Date;
  23. import java.util.List;
  24. /**
  25. * POIExcelToHtml 文件转换:
  26. *
  27. * @author generator
  28. * @version 1.0-SNAPSHORT 2021-08-09 18:06
  29. * 类描述
  30. * 修订历史:
  31. * 日期:2021-08-09
  32. * 作者:shadow
  33. * 参考:https://blog.csdn.net/u013585096/article/details/85104888
  34. * 描述:Execl转HTML
  35. * @Copyright 湖南视拓信息技术股份有限公司. All rights reserved.
  36. * @see null
  37. */
  38. public class POIWordToHtml {
  39. private static final String ENCODING = "GB2312";// UTF-8
  40. public String docToHtml(InputStream input) throws Exception {
  41. String htmlData = "预览失败";
  42. try {
  43. HWPFDocumentCore wordDocument = WordToHtmlUtils.loadDoc(input);
  44. WordToHtmlConverter wordToHtmlConverter = new ImageConverter(
  45. DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument()
  46. );
  47. //wordToHtmlConverter.setPicturesManager(new PicturesManager() {
  48. // @Override
  49. // public String savePicture(byte[] content,
  50. // PictureType pictureType, String suggestedName,
  51. // float widthInches, float heightInches) {
  52. // //给生成的页面写图片的路径
  53. // return "word/media/" + suggestedName;
  54. // }
  55. //});
  56. wordToHtmlConverter.processDocument(wordDocument);
  57. Document htmlDocument = wordToHtmlConverter.getDocument();
  58. ByteArrayOutputStream outStream = new ByteArrayOutputStream();
  59. DOMSource domSource = new DOMSource(htmlDocument);
  60. StreamResult streamResult = new StreamResult(outStream);
  61. TransformerFactory tf = TransformerFactory.newInstance();
  62. Transformer serializer = tf.newTransformer();
  63. serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
  64. serializer.setOutputProperty(OutputKeys.INDENT, "yes");
  65. serializer.setOutputProperty(OutputKeys.METHOD, "html");
  66. serializer.transform(domSource, streamResult);
  67. htmlData = outStream.toString();
  68. outStream.close();
  69. } catch (Exception e){
  70. e.printStackTrace();
  71. }
  72. return htmlData;
  73. }
  74. public String docxToHtml(InputStream inputStream) throws IOException {
  75. String htmlData = "预览失败";
  76. try{
  77. XWPFDocument docxDocument = new XWPFDocument(inputStream);
  78. XHTMLOptions options = XHTMLOptions.create().indent(4);
  79. //获取文档中的图片
  80. //List<XWPFPictureData> allPictures = docxDocument.getAllPictures();
  81. //for (XWPFPictureData xwpfPictureData : allPictures) {
  82. // String name = xwpfPictureData.getFileName();
  83. // byte[] data = xwpfPictureData.getData();
  84. // InputStream input = new ByteArrayInputStream(data);
  85. // TODO 图片处理
  86. //
  87. //}
  88. //final String imageUrl = "";
  89. //不把图片生成出来
  90. options.setExtractor(null);
  91. options.setIgnoreStylesIfUnused(false);
  92. options.setFragment(true);
  93. //options.URIResolver(new IURIResolver() {
  94. // @Override
  95. // public String resolve(String uri) {
  96. // return imageUrl + uri;
  97. // }
  98. //});
  99. // 图片转base64 新版本支持
  100. options.setImageManager(new Base64EmbedImgManager());
  101. //转换htm1
  102. ByteArrayOutputStream htmlStream = new ByteArrayOutputStream();
  103. XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options);
  104. htmlData = htmlStream.toString();
  105. htmlStream.close();
  106. } catch(Exception e) {
  107. e.printStackTrace();
  108. }
  109. return htmlData;
  110. }
  111. /**
  112. * 图片处理
  113. *
  114. *
  115. */
  116. public class ImageConverter extends WordToHtmlConverter {
  117. public ImageConverter(Document document) {
  118. super(document);
  119. }
  120. @Override
  121. protected void processImageWithoutPicturesManager(Element currentBlock, boolean inlined, Picture picture) {
  122. Element imgNode = currentBlock.getOwnerDocument().createElement("img");
  123. StringBuffer sb = new StringBuffer();
  124. sb.append(Base64.getMimeEncoder().encodeToString(picture.getRawContent()));
  125. sb.insert(0, "data:" + picture.getMimeType() + ";base64,");
  126. imgNode.setAttribute("src", sb.toString());
  127. currentBlock.appendChild(imgNode);
  128. }
  129. }
  130. /**
  131. * 读取word中的文本内容(段落、表格、图片分开处理)转HTML docx后缀名的Word
  132. * @param
  133. * @throws IOException
  134. */
  135. public String readWordImgToHtml(InputStream inputStream) throws IOException{
  136. String htmlData = "预览失败";
  137. XWPFDocument document = new XWPFDocument(inputStream);
  138. String htmlText="";
  139. try {
  140. // 获取word中的所有段落与表格
  141. List<IBodyElement> elements = document.getBodyElements();
  142. for (IBodyElement element : elements) {
  143. // 段落
  144. if (element instanceof XWPFParagraph) {
  145. htmlText+=getParagraphHtmlText((XWPFParagraph) element);
  146. }
  147. // 表格
  148. else if (element instanceof XWPFTable) {
  149. htmlText+=getTabelHtmlText((XWPFTable) element);
  150. }
  151. }
  152. XHTMLOptions options = XHTMLOptions.create().indent(4);
  153. options.setExtractor(null);
  154. options.setIgnoreStylesIfUnused(false);
  155. options.setFragment(true);
  156. options.setImageManager(new Base64EmbedImgManager());
  157. ByteArrayOutputStream htmlStream = new ByteArrayOutputStream();
  158. XHTMLConverter.getInstance().convert(document, htmlStream, options);
  159. htmlData = htmlStream.toString();
  160. htmlStream.close();
  161. //获取word中的所有图片
  162. List<XWPFPictureData> picLists= document.getAllPictures();
  163. for(XWPFPictureData pic:picLists) {
  164. System.out.println("图片名称:\t" + pic.getFileName());
  165. System.out.println("图片类型:\t" + pic.getPictureType());
  166. byte[] data = pic.getData();
  167. System.out.println(data);
  168. //字节流图片上传,并返回服务器地址
  169. String imgUrl = getImageUrl(data, pic.getFileName());
  170. System.out.println("图片服务器地址:" + imgUrl);
  171. //组装img
  172. htmlText += "<p><img alt='' src='" + imgUrl + "'></p>";
  173. }
  174. }
  175. catch (Exception e) {
  176. e.printStackTrace();
  177. }
  178. return htmlData;
  179. }
  180. /**
  181. * 获取段落内容并组装段落HTML
  182. * @param paragraph
  183. */
  184. private static String getParagraphHtmlText(XWPFParagraph paragraph) {
  185. // 获取段落中所有内容
  186. List<XWPFRun> runs = paragraph.getRuns();
  187. if (runs.size() == 0) {
  188. return "";
  189. }
  190. StringBuffer runText = new StringBuffer();
  191. for (XWPFRun run : runs) {
  192. runText.append(run.text());
  193. }
  194. return "<p style='margin:unset;text-align:"+paragraph.getAlignment().name()+"'>"+runText.toString()+"</p>";
  195. }
  196. /**
  197. * 获取表格内容并组装表格HTML
  198. * @param table
  199. */
  200. private static String getTabelHtmlText(XWPFTable table) {
  201. String result="";
  202. //获取表格数据行
  203. List<XWPFTableRow> rows = table.getRows();
  204. if(rows.size()>0){
  205. result+="<table border='1' cellspacing=0 style='border-collapse: collapse;'>";
  206. //遍历
  207. for (int i=0;i<rows.size();i++) {
  208. XWPFTableRow row = rows.get(i);
  209. result+="<tr style='font-weight: bold;'>";
  210. //获取每行的数据列
  211. List<XWPFTableCell> cells = row.getTableCells();
  212. for (XWPFTableCell cell : cells) {
  213. //获取单元格跨列个数
  214. BigInteger gridSpanNum = getCellGridSpanNum(cell);
  215. result+="<td colspan="+gridSpanNum+" valign=center style='text-align: center;vertical-align: middle;'>";
  216. String cellText="";
  217. // 简单获取内容(简单方式是不能获取字体对齐方式的)
  218. // System.out.println(cell.getText());
  219. // 一个单元格可以理解为一个word文档,单元格里也可以加段落与表格
  220. List<XWPFParagraph> paragraphs = cell.getParagraphs();
  221. for (XWPFParagraph paragraph : paragraphs) {
  222. cellText+="<p style='margin: unset;text-align:"+paragraph.getAlignment().name()+"'>"+getParagraphText(paragraph)+"</p>";
  223. }
  224. result+=cellText;
  225. result+="</td>";
  226. }
  227. result+="</tr>";
  228. }
  229. result+="</table>";
  230. }
  231. return result;
  232. }
  233. /**
  234. * 获取段落内容( docx后缀名的Word)
  235. * @param paragraph
  236. */
  237. private static String getParagraphText(XWPFParagraph paragraph) {
  238. // 获取段落中所有内容
  239. List<XWPFRun> runs = paragraph.getRuns();
  240. if (runs.size() == 0) {
  241. //System.out.println("按了回车(新段落)");
  242. return "";
  243. }
  244. StringBuffer runText = new StringBuffer();
  245. for (XWPFRun run : runs) {
  246. runText.append(run.text());
  247. }
  248. // if (runText.length() > 0) {
  249. // runText.append(",对齐方式:").append(paragraph.getAlignment().name());
  250. // System.out.println(runText);
  251. // }
  252. return runText.toString();
  253. }
  254. /**
  255. * 字节流图片上传
  256. * @param data:图片字节流
  257. * @param fileName:图片名称
  258. */
  259. public static String getImageUrl(byte[] data,String fileName) throws Exception{
  260. String imgUrl="";
  261. Long res =new Date().getTime();
  262. //设置文件存储路径,可以存放在你想要指定的路径里面
  263. String rootPath="D:/mimi/"+File.separator+"upload/images/";
  264. // 新文件名
  265. String newFileName =res + fileName.substring(fileName.lastIndexOf("."));
  266. //新文件
  267. File newFile=new File(rootPath+File.separator+newFileName);
  268. //判断文件目录是否存在
  269. if(!newFile.getParentFile().exists()){
  270. //如果目标文件所在的目录不存在,则创建父目录
  271. newFile.getParentFile().mkdirs();
  272. }
  273. //-------把图片文件写入磁盘 start ----------------
  274. FileOutputStream fos = new FileOutputStream(newFile);
  275. fos.write(data);
  276. fos.close();
  277. //-------把图片文件写入磁盘 end ----------------
  278. //服务器图片地址
  279. String baseURL = "http://192.168.0.76:8080/mimi/upload/images/";
  280. imgUrl=baseURL+newFileName;
  281. return imgUrl;
  282. }
  283. /**
  284. * 获取单元格跨列个数
  285. * @return
  286. */
  287. public static BigInteger getCellGridSpanNum(XWPFTableCell cell){
  288. BigInteger gridSpanNum =null;
  289. //获取单元格跨列
  290. CTDecimalNumber gridSpanXml = cell.getCTTc().getTcPr().getGridSpan();
  291. if(gridSpanXml!=null){
  292. gridSpanNum = gridSpanXml.getVal();
  293. System.out.println("gridSpanNum:"+gridSpanNum);
  294. }
  295. return gridSpanNum;
  296. }
  297. }