diff --git a/gemeinsamforschen/pom.xml b/gemeinsamforschen/pom.xml index 15a482c2f3fe879d70afb4e03b123ba13f601305..718c97172441539e8dc78d185ffae66c786afbca 100644 --- a/gemeinsamforschen/pom.xml +++ b/gemeinsamforschen/pom.xml @@ -301,6 +301,15 @@ <artifactId>poi-scratchpad</artifactId> <version>3.15</version> </dependency> + <dependency> + <groupId>net.sourceforge.htmlcleaner</groupId> + <artifactId>htmlcleaner</artifactId> + <version>2.22</version> + </dependency> + <dependency> + <groupId>org.jsoup</groupId> + <artifactId>jsoup</artifactId> + <version>1.12.1</version> + </dependency> </dependencies> - </project> \ No newline at end of file diff --git a/gemeinsamforschen/selenium/Automatisierungen.side b/gemeinsamforschen/selenium/Automatisierungen.side index 345ecc514ff79dfd3df4d37092996b9f3a5cf39b..f2161821af0a36c91982782d2608606da51081c8 100644 --- a/gemeinsamforschen/selenium/Automatisierungen.side +++ b/gemeinsamforschen/selenium/Automatisierungen.side @@ -13,13 +13,6 @@ "target": "http://localhost:8080/gemeinsamforschen_war_exploded/", "targets": [], "value": "" - }, { - "id": "4cdd2c67-b330-4ebf-9dad-06784ae81bbe", - "comment": "", - "command": "setWindowSize", - "target": "1024x768", - "targets": [], - "value": "" }, { "id": "e8ca50ce-88c8-4882-8051-b122be32fdc0", "comment": "", diff --git a/gemeinsamforschen/src/main/java/unipotsdam/gf/modules/fileManagement/FileManagementService.java b/gemeinsamforschen/src/main/java/unipotsdam/gf/modules/fileManagement/FileManagementService.java index 736ba575680f8ce2c328fe0030c3858f03c87c9e..7d26a9a38a1b8052d808a46cab45f72a08994286 100644 --- a/gemeinsamforschen/src/main/java/unipotsdam/gf/modules/fileManagement/FileManagementService.java +++ b/gemeinsamforschen/src/main/java/unipotsdam/gf/modules/fileManagement/FileManagementService.java @@ -7,13 +7,18 @@ import com.itextpdf.text.pdf.PdfPCell; import com.itextpdf.text.pdf.PdfPTable; import com.itextpdf.text.pdf.PdfWriter; import com.itextpdf.tool.xml.XMLWorkerHelper; 
-import com.itextpdf.tool.xml.exceptions.CssResolverException; import org.apache.commons.io.IOUtils; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.xslf.usermodel.XMLSlideShow; import org.apache.poi.xslf.usermodel.XSLFSlide; import org.codehaus.plexus.util.FileUtils; import org.glassfish.jersey.media.multipart.FormDataContentDisposition; +import org.htmlcleaner.HtmlCleaner; +import org.htmlcleaner.HtmlSerializer; +import org.htmlcleaner.SimpleHtmlSerializer; +import org.htmlcleaner.TagNode; +import org.jsoup.Jsoup; +import org.jsoup.select.Elements; import unipotsdam.gf.modules.project.Project; import unipotsdam.gf.modules.user.User; @@ -75,7 +80,8 @@ public class FileManagementService { public void saveStringAsPDF(User user, Project project, String fileContent, FormDataContentDisposition fileDetail, FileRole fileRole, FileType fileType) throws IOException, DocumentException { - fileContent = correctingTags(fileContent); + fileContent = cleanHTML(fileContent); + //fileContent = manipulateIndentation(fileContent); InputStream inputStream = IOUtils.toInputStream(fileContent); saveFileAsPDF(user, project, inputStream, fileDetail, fileRole, fileType); @@ -84,28 +90,7 @@ public class FileManagementService { private String saveHTMLAsPDF(InputStream inputStream, String filenameWithoutExtension) throws IOException, DocumentException { String fileName = filenameWithoutExtension + ".pdf"; String path = getFullPath(fileName); - /* - Document document = new Document(); - - - PdfWriter writer = PdfWriter.getInstance(document, new FileOutputStream(filename)); - document.open(); - TODO: css is not applied correctly, that's why an indent, doesn't work, need fix - example: - HTML PDF - 1. 1. - 2. 2. - a. 3. 
- - CSSResolver cssResolver = XMLWorkerHelper.getInstance().getDefaultCssResolver(false); - cssResolver.addCssFile("https://cdnjs.cloudflare.com/ajax/libs/quill/1.3.6/quill.snow.css", true); - HtmlPipelineContext htmlContext = new HtmlPipelineContext(null); - CssResolverPipeline pipeline = new CssResolverPipeline(cssResolver, new HtmlPipeline(htmlContext, new PdfWriterPipeline(document, writer))); - XMLWorker worker = new XMLWorker(pipeline, true); - XMLParser parser = new XMLParser(worker); - parser.parse(inputStream); - */ Document document = new Document(); PdfWriter writer = PdfWriter.getInstance(document, new FileOutputStream(path)); document.open(); @@ -114,11 +99,30 @@ public class FileManagementService { return fileName; } - private String correctingTags(String fileContent) { - String correctedFileContent = fileContent.replaceAll("<br>", "<br/>"); - correctedFileContent = correctedFileContent.replaceAll("\">", "\"/>"); - return correctedFileContent; + private String cleanHTML(String fileContent) { + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = htmlCleaner.clean(fileContent); + HtmlSerializer htmlSerializer = new SimpleHtmlSerializer(htmlCleaner.getProperties()); + return htmlSerializer.getAsString(tagNode); + } + String manipulateIndentation(String fileContent) { + /* + Replaces the children of every <ol> with marker-prefixed plain text and returns the modified body HTML. + todo: + ------------------- + - when text comes back, create a TextNode that holds the previous node's text, prefixed with the matching list marker + - at the end remove all <ol> and </ol> (if necessary) + */ + org.jsoup.nodes.Document document = Jsoup.parse(fileContent); + + Elements elements = document.select("ol"); + elements.forEach(element -> { + JsoupConverter converter = new JsoupConverter(); + String fullText = converter.convertElementsToTextNodes(element.childNodes()); + element.text(fullText); + }); + return document.body().html(); } private String getDocumentFromFile(InputStream inputStream, String fileNameWithoutExtension) 
throws IOException { diff --git a/gemeinsamforschen/src/main/java/unipotsdam/gf/modules/fileManagement/JsoupConverter.java b/gemeinsamforschen/src/main/java/unipotsdam/gf/modules/fileManagement/JsoupConverter.java new file mode 100644 index 0000000000000000000000000000000000000000..e9ba3a2e7466dda6b222cb70d908d44b30c5cbc3 --- /dev/null +++ b/gemeinsamforschen/src/main/java/unipotsdam/gf/modules/fileManagement/JsoupConverter.java @@ -0,0 +1,166 @@ +package unipotsdam.gf.modules.fileManagement; + +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; +import unipotsdam.gf.modules.fileManagement.Util.IndentationLetterFormat; +import unipotsdam.gf.modules.fileManagement.Util.RomanConverter; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; + +import static unipotsdam.gf.modules.fileManagement.Util.IndentationLetterFormat.LETTER; +import static unipotsdam.gf.modules.fileManagement.Util.IndentationLetterFormat.NUMBER; +import static unipotsdam.gf.modules.fileManagement.Util.IndentationLetterFormat.ROMAN; + +public class JsoupConverter { + + private HashMap<Integer, String> nextIndentationLetterMap; + private int lastSeenIndentationLevel = 0; + + public JsoupConverter() { + nextIndentationLetterMap = new HashMap<>(); + nextIndentationLetterMap.put(0, "1"); + nextIndentationLetterMap.put(1, "a"); + nextIndentationLetterMap.put(2, "1"); + nextIndentationLetterMap.put(3, "1"); + nextIndentationLetterMap.put(4, "a"); + nextIndentationLetterMap.put(5, "1"); + nextIndentationLetterMap.put(6, "1"); + nextIndentationLetterMap.put(7, "a"); + nextIndentationLetterMap.put(8, "1"); + } + + + public String convertElementsToTextNodes(List<Node> nodes) { + String fullText = ""; + for (Node node : nodes) { + if (!(node instanceof Element)) { + continue; + } + int indentationLevel = extractIndentationLevel(node); + String indentationText; + + 
IndentationLetterFormat letterFormat = getTextFormat(indentationLevel); + if (Objects.isNull(letterFormat)) { + continue; + } + // TODO: + // 1. indentation for roman is not reseted correctly + // 2. everything is encoded, so \t and \n are cut off. Also < is encoded, so <br> is not possible + // to fix this: solution could be to just add "fake" tags(!t! for tab, !n! for newline) and replace them in the string later + // 3. Test indentation again + if (indentationLevel != lastSeenIndentationLevel && indentationLevel > 2) { + resetLevelCounter(indentationLevel); + } + indentationText = generateIndentationText(letterFormat, indentationLevel); + increaseLevelCounter(indentationLevel); + lastSeenIndentationLevel = indentationLevel; + fullText += indentationText + ((Element) node).text() + "\t"; + TextNode textNode = new TextNode(fullText); + node.replaceWith(textNode); + } + return fullText; + } + + /* + todo: helperclass/parser + ------------------ + - fuer roemische zahlen: RomanConverter + - Irgendein helper, der automatisch richtiges Format waehlt (zahl, buchstabe oder römisch) + * bei buchstaben wird a-z, aa-zz etc benoetigt + -> generator generiert a-z, wenn z erreicht + 1 buchstabe von a bis z etc + - Ebenen 1-9 als counter (hashmap? 
mit key css class) + - last ebene seen variable + - handle class method + * wenn selbe Ebene: nummer/buchstabe in format rausgeben, counter + 1 + * wenn tiefere ebene: + -> aktuelle ebene zwischen 1 und 2?: post number, counter + 1 + -> aktuelle ebene groeßer als 2: reset counter, post, counter + 1 + * wenn hoehere ebene: + -> post und counter + 1 + * auf anzahl tabs achten + */ + + private int extractIndentationLevel(Node node) { + String attributes = node.attr("class"); + if (attributes.isEmpty()) { + return 0; + } + List<String> classNames = Arrays.asList(attributes.split(" ")); + List<String> indentClassList = classNames.stream().filter(cssClass -> cssClass.contains("ql-indent-")).collect(Collectors.toList()); + if (indentClassList.isEmpty()) { + return 0; + } + String indentCssClass = indentClassList.get(0); + return Integer.parseInt(Arrays.asList(indentCssClass.split("-")).get(2)); + } + + private IndentationLetterFormat getTextFormat(int indentationLevel) { + IndentationLetterFormat letterFormat; + switch (indentationLevel) { + case 0: + case 3: + case 6: + letterFormat = NUMBER; + break; + case 1: + case 4: + case 7: + letterFormat = LETTER; + break; + case 2: + case 5: + case 8: + letterFormat = ROMAN; + break; + default: + letterFormat = null; + } + return letterFormat; + } + + private void increaseLevelCounter(int indentationLevel) { + String indentationLetter = nextIndentationLetterMap.get(indentationLevel); + String nextIndentationLetter; + if (indentationLetter.matches("[0-9]+")) { + nextIndentationLetter = String.valueOf(Integer.parseInt(indentationLetter) + 1); + } else { + nextIndentationLetter = generateFollowupString(indentationLetter); + } + nextIndentationLetterMap.put(indentationLevel, nextIndentationLetter); + } + + private void resetLevelCounter(int indentationLevel) { + String indentationLetter = nextIndentationLetterMap.get(indentationLevel); + String nextIndentationLetter = indentationLetter.matches("[0-9]+") ? 
"1" : "a"; + nextIndentationLetterMap.put(indentationLevel, nextIndentationLetter); + } + + private String generateFollowupString(String previousString) { + // Bijective base-26 successor like CSS lower-alpha: a..z, aa..az, ba..; assumes a non-empty lowercase a-z string. + String head = previousString.substring(0, previousString.length() - 1); + return previousString.endsWith("z") ? (head.isEmpty() ? "a" : generateFollowupString(head)) + "a" : head + (char) (previousString.charAt(head.length()) + 1); + } + + private String generateIndentationText(IndentationLetterFormat letterFormat, int indentationLevel) { + String indentationText; + String indentationLetter = nextIndentationLetterMap.get(indentationLevel); + switch (letterFormat) { + case ROMAN: + indentationText = RomanConverter.getRomanNumber(Integer.valueOf(indentationLetter)) + ". "; + break; + case NUMBER: + case LETTER: + indentationText = indentationLetter + ". "; + break; + default: + indentationText = ""; + } + + return indentationText; + } +} diff --git a/gemeinsamforschen/src/main/java/unipotsdam/gf/modules/fileManagement/Util/IndentationLetterFormat.java b/gemeinsamforschen/src/main/java/unipotsdam/gf/modules/fileManagement/Util/IndentationLetterFormat.java new file mode 100644 index 0000000000000000000000000000000000000000..8a44eacb3eeca42333b53468f43e6d7efea5dd38 --- /dev/null +++ b/gemeinsamforschen/src/main/java/unipotsdam/gf/modules/fileManagement/Util/IndentationLetterFormat.java @@ -0,0 +1,8 @@ +package unipotsdam.gf.modules.fileManagement.Util; + +public enum IndentationLetterFormat { + + NUMBER, + ROMAN, + LETTER +} diff --git a/gemeinsamforschen/src/main/java/unipotsdam/gf/modules/fileManagement/Util/RomanConverter.java b/gemeinsamforschen/src/main/java/unipotsdam/gf/modules/fileManagement/Util/RomanConverter.java new file mode 100644 index 0000000000000000000000000000000000000000..7363a3fae7fc5247cff842bc4dc5eb3f926fb4a8 --- /dev/null +++ b/gemeinsamforschen/src/main/java/unipotsdam/gf/modules/fileManagement/Util/RomanConverter.java @@ -0,0 +1,24 @@ +package unipotsdam.gf.modules.fileManagement.Util; + +import static java.lang.String.join; +import static 
java.util.Collections.nCopies; + +public class RomanConverter { + + public static String getRomanNumber(int number) { + return join("", nCopies(number, "I")) + .replace("IIIII", "V") + .replace("IIII", "IV") + .replace("VV", "X") + .replace("VIV", "IX") + .replace("XXXXX", "L") + .replace("XXXX", "XL") + .replace("LL", "C") + .replace("LXL", "XC") + .replace("CCCCC", "D") + .replace("CCCC", "CD") + .replace("DD", "M") + .replace("DCD", "CM"); + } + +} diff --git a/gemeinsamforschen/src/main/java/unipotsdam/gf/process/DossierCreationProcess.java b/gemeinsamforschen/src/main/java/unipotsdam/gf/process/DossierCreationProcess.java index ec2111c5685bda94e2521cbbc69022082b57c5e5..bbb620815af1a07e234e98e030b60edd1b07b364 100644 --- a/gemeinsamforschen/src/main/java/unipotsdam/gf/process/DossierCreationProcess.java +++ b/gemeinsamforschen/src/main/java/unipotsdam/gf/process/DossierCreationProcess.java @@ -74,7 +74,7 @@ public class DossierCreationProcess { public FullSubmission addSubmission( FullSubmissionPostRequest fullSubmissionPostRequest, User user, Project project) throws DocumentException, IOException { - FormDataContentDispositionBuilder builder = FormDataContentDisposition.name("dossierUpload").fileName("dossier_" + user.getName() + ".pdf"); + FormDataContentDispositionBuilder builder = FormDataContentDisposition.name("dossierUpload").fileName("dossier_" + user.getEmail() + ".pdf"); fileManagementService.saveStringAsPDF(user, project, fullSubmissionPostRequest.getHtml(), builder.build(), FileRole.DOSSIER, FileType.HTML); diff --git a/gemeinsamforschen/src/test/java/unipotsdam/gf/modules/fileManagement/FileManagementServiceTest.java b/gemeinsamforschen/src/test/java/unipotsdam/gf/modules/fileManagement/FileManagementServiceTest.java new file mode 100644 index 0000000000000000000000000000000000000000..e57ec394e47b8867f2a947266463cd68f5f68d8c --- /dev/null +++ b/gemeinsamforschen/src/test/java/unipotsdam/gf/modules/fileManagement/FileManagementServiceTest.java @@ 
-0,0 +1,15 @@ +package unipotsdam.gf.modules.fileManagement; + +import org.junit.Test; + +public class FileManagementServiceTest { + + @Test + public void manipulateIndentation() { + String fileContent = "<h1>kekse</h1><p><br></p><p>Das ist ein beispieltext um zu testen, wie man intentation fixen kann.</p><p>Ich mag zuege sehr abcdefg</p><p><br></p><ol><li>1</li><li class=\"ql-indent-1\">a</li><li class=\"ql-indent-1\">b</li><li>2</li><li class=\"ql-indent-2\">i</li><li class=\"ql-indent-1\">bcd</li><li class=\"ql-indent-1\">abcd</li><li class=\"ql-indent-2\">i</li><li class=\"ql-indent-2\">ii</li><li class=\"ql-indent-2\">iii</li><li class=\"ql-indent-2\">iv</li><li class=\"ql-indent-1\">efgh</li><li class=\"ql-indent-2\">i</li><li class=\"ql-indent-2\">ii</li><li class=\"ql-indent-3\">1innen</li><li class=\"ql-indent-4\">ainnen</li><li class=\"ql-indent-3\">2innen</li><li class=\"ql-indent-4\">binnen</li></ol>"; + FileManagementService fileManagementService = new FileManagementService(); + + fileManagementService.manipulateIndentation(fileContent); + + } +} \ No newline at end of file