Skip to content

Method: JSoupXhtmlNormalizer()

1: /*
2: * #%L
3: * *********************************************************************************************************************
4: *
5: * NorthernWind - lightweight CMS
6: * http://northernwind.tidalwave.it - git clone git@bitbucket.org:tidalwave/northernwind-rca-src.git
7: * %%
8: * Copyright (C) 2013 - 2021 Tidalwave s.a.s. (http://tidalwave.it)
9: * %%
10: * *********************************************************************************************************************
11: *
12: * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
13: * the License. You may obtain a copy of the License at
14: *
15: * http://www.apache.org/licenses/LICENSE-2.0
16: *
17: * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
18: * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
19: * specific language governing permissions and limitations under the License.
20: *
21: * *********************************************************************************************************************
22: *
23: *
24: * *********************************************************************************************************************
25: * #L%
26: */
27: package it.tidalwave.northernwind.rca.ui.contenteditor.impl;
28:
import it.tidalwave.northernwind.rca.ui.contenteditor.spi.XhtmlNormalizer;
import javax.annotation.Nonnull;
import java.nio.charset.StandardCharsets;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Entities;
import lombok.extern.slf4j.Slf4j;
36:
37: /***********************************************************************************************************************
38: *
39: * @author Fabrizio Giudici
40: *
41: **********************************************************************************************************************/
42: @Slf4j
43: public class JSoupXhtmlNormalizer implements XhtmlNormalizer
44: {
45: /*******************************************************************************************************************
46: *
47: *
48: *
49: ******************************************************************************************************************/
50: @Nonnull
51: @Override
52: public String asNormalizedString (@Nonnull final String text)
53: {
54: log.trace("asNormalizedString()\n{}", text);
55: final Document.OutputSettings os = new Document.OutputSettings()
56: .charset(StandardCharsets.UTF_8)
57: .escapeMode(Entities.EscapeMode.xhtml)
58: .indentAmount(2)
59: .prettyPrint(true)
60: .syntax(Document.OutputSettings.Syntax.xml);
61: final String result = finalCleanup(breakLongLines(Jsoup.parse(text).outputSettings(os).outerHtml()));
62: log.trace(">>>> returning:\n{}", result);
63: return result;
64: }
65:
66: /*******************************************************************************************************************
67: *
68: * Jsoup doesn't do everything properly, so we're patching the results a bit.
69: *
70: ******************************************************************************************************************/
71: @Nonnull
72: private static String breakLongLines (@Nonnull final String html)
73: {
74: final Document document = Jsoup.parse(html);
75: document.select("br").after("\n ");
76:
77: // Remove img attributes inserted by Aloha
78: document.select("img")
79: .removeAttr("draggable")
80: .removeAttr("contenteditable")
81: .forEach(element ->
82: {
83: final String style = element.attr("style");
84:
85: if (!"".equals(style))
86: {
87: element.attr("style", style.replaceAll(" *cursor: -webkit-grab;", ""));
88: }
89: });
90:
91: final Document.OutputSettings os = new Document.OutputSettings()
92: .charset(StandardCharsets.UTF_8)
93: .escapeMode(Entities.EscapeMode.xhtml)
94: .indentAmount(2)
95: .prettyPrint(false)
96: .syntax(Document.OutputSettings.Syntax.xml);
97: return document.outputSettings(os).outerHtml()
98: .replaceFirst("([^\\n])<html ", "$1\n<html ")
99: .replaceFirst("([^\\n]) *<head>", "$1\n <head>")
100: .replaceFirst("([^\\n])\\n<\\/body>", "$1<\\/body>")
101: .replaceFirst("<\\/body>([^\\n])", "<\\/body>\n$1");
102: }
103:
104: /*******************************************************************************************************************
105: *
106: * Jsoup doesn't do everything properly, so we're patching the results a bit.
107: *
108: ******************************************************************************************************************/
109: @Nonnull
110: private static String finalCleanup (@Nonnull final String string)
111: {
112: final StringBuilder buffer = new StringBuilder();
113:
114: boolean first = true;
115:
116: for (final String line : string.split("\n"))
117: {
118: if (first && !"<!DOCTYPE html>".equals(line))
119: {
120: buffer.append("<!DOCTYPE html>").append("\n");
121: }
122:
123: first = false;
124: buffer.append(line.replaceAll(" *$", "")).append("\n"); // trailing spaces
125: }
126:
127: return buffer.toString();
128: }
129: }