From cb1d7b53409cc4dfa3c72a98e8ce217c1c56392c Mon Sep 17 00:00:00 2001
From: George Kalampokis
Date: Wed, 6 Oct 2021 15:15:41 +0300
Subject: [PATCH] Replace altChunk html to docx with direct parsing (manual)

---
Review notes (text between "---" and the first "diff --git" line is not part
of the commit):
 * This patch was rebuilt from a copy that had lost its line breaks, its
   indentation and everything written in angle brackets (author address,
   Maven tags, generic type arguments). The indentation of context lines is
   a best guess, so apply with --ignore-whitespace. The author address is
   still missing from the From: header.
 * HtmlToWorldBuilder: a closed alias tag no longer clears formatting that an
   open alias has set (b/strong, i/em, u/ins, sub/sup).
 * WordBuilder: null items are skipped again instead of reaching Jsoup.parse.
 * The "index" blob ids below belong to the original commit and were not
   recomputed.

 dmp-backend/pom.xml                           |   6 +
 .../documents/word/HtmlToWorldBuilder.java    | 159 ++++++++++++++++++
 .../utilities/documents/word/WordBuilder.java |  15 +-
 3 files changed, 177 insertions(+), 3 deletions(-)
 create mode 100644 dmp-backend/web/src/main/java/eu/eudat/logic/utilities/documents/word/HtmlToWorldBuilder.java

diff --git a/dmp-backend/pom.xml b/dmp-backend/pom.xml
index 254577ace..2fd661beb 100644
--- a/dmp-backend/pom.xml
+++ b/dmp-backend/pom.xml
@@ -186,6 +186,12 @@
             <artifactId>fop</artifactId>
             <version>2.3</version>
         </dependency>
+
+        <dependency>
+            <groupId>org.jsoup</groupId>
+            <artifactId>jsoup</artifactId>
+            <version>1.14.3</version>
+        </dependency>
 
         <dependency>
             <groupId>fr.opensagres.xdocreport</groupId>
diff --git a/dmp-backend/web/src/main/java/eu/eudat/logic/utilities/documents/word/HtmlToWorldBuilder.java b/dmp-backend/web/src/main/java/eu/eudat/logic/utilities/documents/word/HtmlToWorldBuilder.java
new file mode 100644
index 000000000..029d706e1
--- /dev/null
+++ b/dmp-backend/web/src/main/java/eu/eudat/logic/utilities/documents/word/HtmlToWorldBuilder.java
@@ -0,0 +1,159 @@
+package eu.eudat.logic.utilities.documents.word;
+
+import org.apache.poi.xwpf.usermodel.*;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.nodes.TextNode;
+import org.jsoup.select.Elements;
+import org.jsoup.select.NodeTraversor;
+import org.jsoup.select.NodeVisitor;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.STHighlightColor;
+
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+/**
+ * Jsoup {@link NodeVisitor} that copies the text of an HTML fragment into one
+ * {@link XWPFParagraph}, turning a fixed set of inline tags (i/em, b/strong,
+ * u/ins, small, del, mark, sub, sup) into run formatting. Every other element
+ * contributes only its text.
+ */
+public class HtmlToWorldBuilder implements NodeVisitor {
+
+    /** Font size applied to every run unless a {@code small} element is open. */
+    private static final int DEFAULT_FONT_SIZE = 11;
+    /** Font size applied while a {@code small} element is open. */
+    private static final int SMALL_FONT_SIZE = 8;
+
+    /** Node name mapped to true after its head event and false after its tail event. */
+    private final Map<String, Boolean> properties = new LinkedHashMap<>();
+    private final XWPFParagraph paragraph;
+    private final String identation;
+    /** Run that receives the next piece of text. */
+    private XWPFRun run;
+    /** Whether {@link #identation} has already been written into the paragraph. */
+    private boolean isIdentationUsed;
+
+    /**
+     * Appends the content of {@code htmlDocument} to {@code document}: one Word
+     * paragraph per {@code p} element, or a single paragraph holding the whole
+     * document when it contains no {@code p} element.
+     * <p>
+     * NOTE(review): when at least one {@code p} exists, only the {@code p}
+     * elements are traversed, so content outside of them is not written.
+     *
+     * @param document     Word document that receives the new paragraphs
+     * @param htmlDocument parsed HTML to render
+     * @param identation   text placed in front of the first text node of each paragraph
+     */
+    public static void convert(XWPFDocument document, Document htmlDocument, String identation) {
+        Elements htmlParagraphs = htmlDocument.select("p");
+        if (htmlParagraphs.isEmpty()) {
+            NodeTraversor.traverse(new HtmlToWorldBuilder(document.createParagraph(), identation), htmlDocument);
+            return;
+        }
+        for (Element htmlParagraph : htmlParagraphs) {
+            NodeTraversor.traverse(new HtmlToWorldBuilder(document.createParagraph(), identation), htmlParagraph);
+        }
+    }
+
+    /**
+     * Creates a visitor that writes into {@code paragraph}.
+     *
+     * @param paragraph  paragraph that receives one run per visited text node
+     * @param identation text placed in front of the first text node
+     */
+    public HtmlToWorldBuilder(XWPFParagraph paragraph, String identation) {
+        this.paragraph = paragraph;
+        this.identation = identation;
+        this.isIdentationUsed = false;
+        this.run = newRun();
+    }
+
+    /** Writes text nodes into the current run; records any other node as open. */
+    @Override
+    public void head(Node node, int i) {
+        String name = node.nodeName();
+        if (name.equals("#text")) {
+            String text = ((TextNode) node).text();
+            this.run.setText(this.isIdentationUsed ? text : this.identation + text);
+            this.isIdentationUsed = true;
+            // Continue in a fresh run so later formatting changes leave this text alone.
+            this.run = newRun();
+        } else {
+            this.properties.put(name, true);
+        }
+        parseProperties();
+    }
+
+    /** Records the node as closed and refreshes the formatting of the current run. */
+    @Override
+    public void tail(Node node, int i) {
+        this.properties.put(node.nodeName(), false);
+        parseProperties();
+    }
+
+    /** Adds a run with the default font size to the paragraph. */
+    private XWPFRun newRun() {
+        XWPFRun created = this.paragraph.createRun();
+        created.setFontSize(DEFAULT_FONT_SIZE);
+        return created;
+    }
+
+    /**
+     * Applies every recorded tag state to the current run, closed tags first and
+     * open tags second. Several tags drive the same run attribute (b/strong,
+     * i/em, u/ins, sub/sup); applying them in plain insertion order let a closed
+     * alias that was recorded later switch off formatting requested by an open one.
+     */
+    private void parseProperties() {
+        this.properties.forEach((tag, open) -> {
+            if (!open) {
+                applyProperty(tag, false);
+            }
+        });
+        this.properties.forEach((tag, open) -> {
+            if (open) {
+                applyProperty(tag, true);
+            }
+        });
+    }
+
+    /** Switches the run attribute that belongs to {@code tag} on or off. */
+    private void applyProperty(String tag, boolean enabled) {
+        switch (tag) {
+            case "i":
+            case "em":
+                this.run.setItalic(enabled);
+                break;
+            case "b":
+            case "strong":
+                this.run.setBold(enabled);
+                break;
+            case "u":
+            case "ins":
+                this.run.setUnderline(enabled ? UnderlinePatterns.SINGLE : UnderlinePatterns.NONE);
+                break;
+            case "small":
+                this.run.setFontSize(enabled ? SMALL_FONT_SIZE : DEFAULT_FONT_SIZE);
+                break;
+            case "del":
+                this.run.setStrikeThrough(enabled);
+                break;
+            case "mark":
+                this.run.setTextHighlightColor(enabled ? STHighlightColor.YELLOW.toString() : STHighlightColor.NONE.toString());
+                break;
+            case "sub":
+                this.run.setSubscript(enabled ? VerticalAlign.SUBSCRIPT : VerticalAlign.BASELINE);
+                break;
+            case "sup":
+                // XWPFRun#setSubscript takes any VerticalAlign value, superscript included.
+                this.run.setSubscript(enabled ? VerticalAlign.SUPERSCRIPT : VerticalAlign.BASELINE);
+                break;
+            default:
+                // Tags without a formatting equivalent contribute only their text.
+                break;
+        }
+    }
+}
diff --git a/dmp-backend/web/src/main/java/eu/eudat/logic/utilities/documents/word/WordBuilder.java b/dmp-backend/web/src/main/java/eu/eudat/logic/utilities/documents/word/WordBuilder.java
index 82d71f9ff..abd14e990 100644
--- a/dmp-backend/web/src/main/java/eu/eudat/logic/utilities/documents/word/WordBuilder.java
+++ b/dmp-backend/web/src/main/java/eu/eudat/logic/utilities/documents/word/WordBuilder.java
@@ -13,8 +13,9 @@ import eu.eudat.models.data.user.components.datasetprofile.FieldSet;
 import eu.eudat.models.data.user.components.datasetprofile.Section;
 import eu.eudat.models.data.user.composite.DatasetProfilePage;
 import eu.eudat.models.data.user.composite.PagedDatasetProfile;
-import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
 import org.apache.poi.xwpf.usermodel.*;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTAbstractNum;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDecimalNumber;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTLvl;
@@ -50,7 +51,7 @@ public class WordBuilder {
             return paragraph;
         });
         this.options.put(ParagraphStyle.HTML, (mainDocumentPart, item) -> {
-            try {
+            /*try {
                 XWPFHtmlDocument xwpfHtmlDocument = XWPFHtmlDocument.addHtmlDocument(mainDocumentPart);
                 if (item != null) {
                     xwpfHtmlDocument.setHtml(item);
@@ -60,7 +61,15 @@ public class WordBuilder {
                 logger.error(e.getLocalizedMessage(), e);
             }
 
-            return null;
+            return null;*/
+            // The altChunk code above skipped null items, and Jsoup.parse(String)
+            // cannot take null, so keep skipping them here.
+            if (item != null) {
+                Document htmlDoc = Jsoup.parse(item);
+                HtmlToWorldBuilder.convert(mainDocumentPart, htmlDoc, " ");
+            }
+
+            return null;
         });
         this.options.put(ParagraphStyle.TITLE, (mainDocumentPart, item) -> {
             XWPFParagraph paragraph = mainDocumentPart.createParagraph();