/*
* Copyright 2011 OverZealous Creations, LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.overzealous.remark.convert;
import com.overzealous.remark.Options;
import com.overzealous.remark.util.StringUtils;
import org.apache.commons.lang3.StringEscapeUtils;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import java.util.*;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* This class is used to clean up plain text fields based on the selected set of options.
* It optionally escapes certain special characters, as well as replacing various
* HTML and Unicode entities with their plaintext equivalents.
*
* @author Phil DeJarnett
*/
public class TextCleaner {
/**
* Internal class simply used to hold the various escape regexes.
*/
private class Escape {
final Pattern pattern;
final String replacement;
public Escape(String pattern, String replacement) {
this.pattern = Pattern.compile(pattern);
this.replacement = replacement;
}
}
/** Used to track the replacements based on matched groups. */
private Map<String, String> replacements;
/** Compiled entity replacement pattern. */
private Pattern entityReplacementsPattern;
/** Compiled unicode replacement pattern. */
private Pattern unicodeReplacementsPattern = null;
/** List of possible escapes */
private List<Escape> escapes;
private Pattern unescapeLeadingChars;
private static final Pattern EMPTY_MATCHER = Pattern.compile("\\s+", Pattern.DOTALL);
private static final Pattern LINEBREAK_REMOVER = Pattern.compile("(\\s*\\r?+\\n)+");
private static final Pattern URL_CLEANER = Pattern.compile("([\\(\\) ])");
/**
* Create a new TextCleaner based on the configured options.
* @param options Options that will affect what is cleaned.
*/
public TextCleaner(Options options) {
setupReplacements(options);
setupEscapes(options);
}
/**
* Configures the basic replacements based on the configured options.
* @param options Options that will affect what is replaced.
*/
@SuppressWarnings({"OverlyLongMethod"})
private void setupReplacements(Options options) {
this.replacements = new HashMap<String, String> ();
// build replacement regex
StringBuilder entities = new StringBuilder(replacements.size()*5);
// this is a special case for double-encoded HTML entities.
entities.append("&(?>amp;([#a-z0-9]++;)|(?>");
addRepl(entities, "&", "&");
addRepl(entities, "<", "<");
addRepl(entities, ">", ">");
addRepl(entities, """, "\"");
if(options.reverseHtmlSmartQuotes) {
addRepl(entities, "“", "\"");
addRepl(entities, "”", "\"");
addRepl(entities, "‘", "\'");
addRepl(entities, "’", "\'");
addRepl(entities, "'", "\'");
addRepl(entities, "«", "<<");
addRepl(entities, "»", ">>");
}
if(options.reverseHtmlSmartPunctuation) {
addRepl(entities, "–", "--");
addRepl(entities, "—", "---");
addRepl(entities, "…", "...");
}
entities.replace(entities.length()-1, entities.length(), ");)");
entityReplacementsPattern = Pattern.compile(entities.toString(), Pattern.CASE_INSENSITIVE);
if(options.reverseUnicodeSmartPunctuation || options.reverseUnicodeSmartQuotes) {
StringBuilder unicode = new StringBuilder("[\\Q");
if(options.reverseUnicodeSmartQuotes) {
addRepl(unicode, "\u201c", "\""); // left double quote: “
addRepl(unicode, "\u201d", "\""); // right double quote: ”
addRepl(unicode, "\u2018", "\'"); // left single quote: ‘
addRepl(unicode, "\u2019", "\'"); // right single quote: ’
addRepl(unicode, "\u00ab", "<<"); // left angle quote: «
addRepl(unicode, "\u00bb", ">>"); // right angle quote: »
}
if(options.reverseUnicodeSmartPunctuation) {
addRepl(unicode, "\u2013", "--"); // en-dash: –
addRepl(unicode, "\u2014", "---"); // em-dash: —
addRepl(unicode, "\u2026", "..."); // ellipsis: …
}
unicode.append("\\E]");
unicodeReplacementsPattern = Pattern.compile(unicode.toString());
}
}
/**
* Utility method to make the code above easier to read.
* @param regex A character buffer to append the replacement to
* @param original Original character or string.
* @param replacement Replacement character or string.
*/
private void addRepl(StringBuilder regex, String original, String replacement) {
replacements.put(original, replacement);
if(original.charAt(0) == '&') {
// add entity
regex.append(original.substring(1, original.length() - 1));
regex.append('|');
} else {
// add single character
regex.append(original);
}
}
/**
* Configures the basic escapes based on the configured options.
* @param options Options that will affect what is escaped.
*/
private void setupEscapes(Options options) {
escapes = new ArrayList<Escape>();
// confusingly, this replaces single backslashes with double backslashes.
// Man, I miss Groovy's slashy strings in these moments...
escapes.add(new Escape("\\\\", "\\\\\\\\"));
// creates an set of characters that are universally escaped.
// these characters are wrapped in \Q...\E to ensure they aren't treated as special characters.
StringBuilder chars = new StringBuilder("([\\Q`*_{}[]#");
if(options.tables.isConvertedToText() && !options.tables.isRenderedAsCode()) {
chars.append('|');
}
chars.append("\\E])");
escapes.add(new Escape(chars.toString(), "\\\\$1"));
// finally, escape certain characters only if they are leading characters
StringBuilder leadingChars = new StringBuilder("^( ?+)([\\Q-+");
if(options.definitionLists) {
leadingChars.append(':');
}
leadingChars.append("\\E])");
escapes.add(new Escape(leadingChars.toString(), "$1\\\\$2"));
// setup the leading character reverser
// this is a bit of a hack to undo leading character escapes.
unescapeLeadingChars = Pattern.compile(leadingChars.insert(6, "\\\\").toString());
}
/**
* Clean the given input text based on the original configuration Options.
* Newlines are also replaced with a single space.
*
* @param input The text to be cleaned. Can be any object. JSoup nodes are handled specially.
* @return The cleaned text.
*/
public String clean(Object input) {
return clean(input, true);
}
/**
* Clean the given input text based on the original configuration Options.
* The text is treat as code, so it is not escaped, and newlines are preserved.
*
* @param input The text to be cleaned. Can be any object. JSoup nodes are handled specially.
* @return The cleaned text.
*/
public String cleanCode(Object input) {
return clean(input, false);
}
/**
* Clean the given input text based on the original configuration Options.
* Optionally, don't escape special characters.
*
* @param oinput The text to be cleaned. Can be any object. JSoup nodes are handled specially.
* @param normalText If false, don't escape special characters. This is usually only used for
* inline code or code blocks, because they don't need to be escaped.
* @return The cleaned text.
*/
private String clean(Object oinput, boolean normalText) {
String input;
if(oinput instanceof TextNode) {
input = getTextNodeText((TextNode)oinput, normalText);
} else if(oinput instanceof Element) {
if(normalText) {
input = ((Element)oinput).text();
} else {
input = getPreformattedText((Element)oinput);
}
} else {
input = oinput.toString();
}
String result;
if(input.length() == 0) {
// not seen, so just return an empty string.
result = "";
} else if(normalText) {
// For non-code text, newlines are _never_ allowed.
// Replace one or more set of whitespace chars followed by a newline with a single space.
input = LINEBREAK_REMOVER.matcher(input).replaceAll(" ");
// now escape special characters.
for(final Escape rep : escapes) {
input = rep.pattern.matcher(input).replaceAll(rep.replacement);
}
StringBuffer output = doReplacements(input, entityReplacementsPattern);
if(unicodeReplacementsPattern != null) {
output = doReplacements(output, unicodeReplacementsPattern);
}
result = output.toString();
} else {
// we have to revert ALL HTML entities for code, because they will end up
// double-encoded by markdown
// we also don't need to worry about escaping anything
// note: we have to manually replace ' because it is ignored by StringEscapeUtils for some reason.
result = StringEscapeUtils.unescapeHtml4(input.replace("'", "'"));
}
return result;
}
/**
* Replaces all {@code <br/>} tags with a newline in a copy of the input node, and
* returns the resulting innter text.
* This is necessary to ensure that manual linebreaks are supported in preformatted code.
*
* @param oinput Preformatted node to process
* @return inner text of the node.
*/
private String getPreformattedText(Element oinput) {
Element el = oinput.clone();
fixLineBreaks(el);
return el.text();
}
// recursively processes the element to replace <br>'s with \n
private void fixLineBreaks(Element el) {
for(final Element e : el.children()) {
if(e.tagName().equals("br")) {
e.before("\n");
e.remove();
} else {
fixLineBreaks(e);
}
}
}
/**
* Handles running the regex-based replacements in the input
* @param input String to process
* @param regex Pattern to use
* @return cleaned up input string
*/
private StringBuffer doReplacements(CharSequence input, Pattern regex) {
StringBuffer output = new StringBuffer();
Matcher m = regex.matcher(input);
while (m.find()) {
String repString;
// if we have a hard match, do a simple replacement.
String replacementKey = m.group().toLowerCase(Locale.ENGLISH);
if(replacements.containsKey(replacementKey)) {
repString = replacements.get(replacementKey);
} else {
// special case for escaped HTML entities.
repString = "\\\\&$1";
}
m.appendReplacement(output, repString);
}
m.appendTail(output);
return output;
}
/**
* Method to clean inline code, and, if necessary, add spaces to make sure that internal, leading, or
* trailing {@code '`'} characters don't break the inline code.
* Newlines are also replaced with spaces.
*
* This method also adds the leading and trailing {@code '`'} or {@code '```'} as necessary.
*
* @param input String to clean. Can be any object. JSoup nodes are handled specially.
* @return The cleaned text.
*/
public String cleanInlineCode(Object input) {
String output = clean(input, false).replace('\n', ' ');
if(output.indexOf('`') != -1) {
String prepend = "";
if(output.charAt(0) == '`') {
prepend = " ";
}
String append = "";
if(output.charAt(output.length()-1) == '`') {
append = " ";
}
String delim = getDelimiter(output);
output = String.format("%s%s%s%s%s", delim, prepend, output, append, delim);
} else {
output = String.format("`%s`", output);
}
return output;
}
/**
* Removes the escaping on leading characters, for example, when they are going to be rendered inside
* another node, such as a table.
* @param input String to process
* @return Cleaned string.
*/
public String unescapeLeadingCharacters(String input) {
// removes any leading escapes...
return unescapeLeadingChars.matcher(input).replaceAll("$1$2");
}
/**
* Handles escaping special characters in URLs to avoid issues when they are rendered out
* (ie: spaces, parentheses)
* @param input URL to process
* @return Cleaned URL
*/
public String cleanUrl(String input) {
StringBuffer output = new StringBuffer();
Matcher m = URL_CLEANER.matcher(input);
while (m.find()) {
char c = m.group().charAt(0);
m.appendReplacement(output, String.format("%%%02x", (int)c));
}
m.appendTail(output);
return output.toString();
}
String getDelimiter(String input) {
int max = 0;
int counter = 0;
for(int i=0; i<input.length(); i++) {
if(input.charAt(i) == '`') {
counter++;
} else {
max = Math.max(max, counter);
counter = 0;
}
}
// check in case the last tick was at the end.
max = Math.max(max, counter);
return StringUtils.multiply('`', max + 1);
}
private String getTextNodeText(TextNode tn, boolean normalText) {
String input = normalText ? tn.text() : tn.getWholeText();
Node prev = tn.previousSibling();
Node next = tn.nextSibling();
boolean parentIsBlock = isBlock(tn.parent());
if(isBlock(prev)) {
input = ltrim(input);
} else if(prev == null && parentIsBlock) {
input = ltrim(input);
} else if(normalText && prev instanceof TextNode) {
TextNode tprev = (TextNode)prev;
if(EMPTY_MATCHER.matcher(tprev.text()).matches()) {
input = ltrim(input);
}
}
if(input.length() > 0) {
if(isBlock(next)) {
input = rtrim(input);
} else if(next == null && parentIsBlock) {
input = rtrim(input);
} else if(normalText && next instanceof TextNode) {
TextNode tnext = (TextNode)next;
if(EMPTY_MATCHER.matcher(tnext.text()).matches()) {
input = rtrim(input);
}
}
}
return input;
}
private boolean isBlock(Node n) {
boolean block = false;
if(n != null && n instanceof Element) {
Element el = (Element)n;
block = el.isBlock() || el.tagName().equals("br");
}
return block;
}
private String ltrim(String s) {
int start = 0;
while((start+1 <= s.length()) &&
EMPTY_MATCHER.matcher(s.substring(start, start+1)).matches()) {
start++;
}
String ret = "";
if(start != s.length()) {
ret = s.substring(start);
}
return ret;
}
private String rtrim(String s) {
int end = s.length();
while((end-1 >= 0) &&
EMPTY_MATCHER.matcher(s.substring(end-1, end)).matches()) {
end--;
}
String ret = "";
if(end != 0) {
ret = s.substring(0, end);
}
return ret;
}
}