package com.redhat.ecs.commonutils;
import java.io.ByteArrayInputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.ls.DOMImplementationLS;
import org.w3c.dom.ls.LSSerializer;
import com.redhat.ecs.commonstructures.Pair;
import com.redhat.ecs.commonstructures.StringToNodeCollection;
import com.redhat.ecs.internalsort.EntitySubstitutionBoundaryDataBoundaryStartSort;
import com.redhat.ecs.internaltructures.EntitySubstitutionBoundaryData;
/**
 * A collection of XML related functions. Note to self: See
 * classpathx/jaxp/apidoc/gnu/xml/dom/ls/DomLSSerializer.html for LSSerializer
 * options.
 */
public class XMLUtilities
/** The Docbook elements that contain translatable text */
public static final ArrayList<String> TRANSLATABLE_ELEMENTS = CollectionUtilities.toArrayList(new String[]
{ "ackno", "bridgehead", "caption", "conftitle", "contrib", "entry", "firstname", "glossterm", "indexterm", "jobtitle", "keyword", "label", "lastname", "lineannotation", "lotentry", "member", "orgdiv", "orgname", "othername", "para", "phrase", "productname", "refclass", "refdescriptor", "refentrytitle",
"refmiscinfo", "refname", "refpurpose", "releaseinfo", "revremark", "screeninfo", "secondaryie", "seealsoie", "seeie", "seg", "segtitle", "simpara", "subtitle", "surname", "term", "termdef", "tertiaryie", "title", "titleabbrev", "screen", "programlisting", "literallayout" });
* The Docbook elements that contain translatable text, and need to be kept
* inline
public static final ArrayList<String> INLINE_ELEMENTS = CollectionUtilities.toArrayList(new String[]
{ "footnote", "citerefentry", "indexterm", "productname", "phrase" });
/** The Docbook elements that should not have their text reformatted */
public static final ArrayList<String> VERBATIM_ELEMENTS = CollectionUtilities.toArrayList(new String[]
{ "screen", "programlisting", "literallayout" });
* The Docbook elements that should be translated only if their parent is
public static final ArrayList<String> TRANSLATABLE_IF_STANDALONE_ELEMENTS = CollectionUtilities.toArrayList(new String[]
{ "indexterm", "productname", "phrase" });
public static final String ENCODING_START = "encoding=\"";
public static final String START_CDATA = "<![CDATA[";
public static final String END_CDATA_RE = "\\]\\]>";
public static final String END_CDATA_REPLACE = "]]>";
public static final String XML_ENTITY_NAMED_GROUP = "name";
public static final String XML_ENTITY_RE = "\\&(?<" + XML_ENTITY_NAMED_GROUP + ">.*?);";
public static final String DOCTYPE_START = "<!DOCTYPE";
public static final String DOCTYPE_END = ">";
public static final String PREAMBLE_START = "<?xml";
public static final String PREAMBLE_END = ">";
public static final String TRAILING_WHITESPACE_RE = "^(?<content>.*?)\\s+$";
public static final String TRAILING_WHITESPACE_SIMPLE_RE = ".*?\\s+$";
public static final String PRECEEDING_WHITESPACE_SIMPLE_RE = "^\\s+.*";
public static final Pattern TRAILING_WHITESPACE_RE_PATTERN = Pattern.compile(TRAILING_WHITESPACE_RE, Pattern.MULTILINE | Pattern.DOTALL);
public static String findEncoding(final String xml)
final int encodingIndexStart = xml.indexOf(ENCODING_START);
final int firstLineBreak = xml.indexOf("\n");
// make sure we found the encoding attribute
if (encodingIndexStart != -1)
final int encodingIndexEnd = xml.indexOf("\"", encodingIndexStart + ENCODING_START.length());
// make sure the encoding attribute was found before the first
// line break
if (firstLineBreak == -1 || encodingIndexStart < firstLineBreak)
// make sure we found the end of the attribute
if (encodingIndexEnd != -1)
return xml.substring(encodingIndexStart + ENCODING_START.length(), encodingIndexEnd);
return null;
public static String findDocumentType(final String xml)
final int indexStart = xml.indexOf(DOCTYPE_START);
// make sure we found the encoding attribute
if (indexStart != -1)
final int indexEnd = xml.indexOf(DOCTYPE_END, indexStart + DOCTYPE_START.length());
// make sure we found the end of the attribute
if (indexEnd != -1)
return xml.substring(indexStart, indexEnd + DOCTYPE_END.length());
return null;
public static String findPreamble(final String xml)
final int indexStart = xml.indexOf(PREAMBLE_START);
// make sure we found the encoding attribute
if (indexStart != -1)
final int indexEnd = xml.indexOf(PREAMBLE_END, indexStart + PREAMBLE_START.length());
// make sure we found the end of the attribute
if (indexEnd != -1)
return xml.substring(indexStart, indexEnd + PREAMBLE_END.length());
return null;
/**
 * This function will return a map that contains entity names as keys, and
 * random integer strings as values. The values are guaranteed not to have
 * appeared in the original xml.
 *
 * @param xml
 *            The xml to generate the replacements for
 * @return a map of entity names to unique random strings
 */
private static Map<String, String> calculateEntityReplacements(final String xml)
final Map<String, String> retValue = new HashMap<String, String>();
final Random randomGenerator = new Random();
/* compile the regular expression */
final Pattern injectionSequencePattern = Pattern.compile(XML_ENTITY_RE);
/* find any matches */
final Matcher injectionSequencematcher = injectionSequencePattern.matcher(xml);
/* loop over the regular expression matches */
while (injectionSequencematcher.find())
final String entityName =;
if (!retValue.containsKey(entityName))
String randomReplacement;
randomReplacement = "[" + randomGenerator.nextInt() + "]";
while (xml.indexOf(randomReplacement) != -1);
retValue.put(entityName, randomReplacement);
return retValue;
/**
 * This function takes the Map generated by the calculateEntityReplacements
 * function, and uses those values to replace any entities in the XML string
 * with their unique random integer replacements. The end result is an XML
 * string that contains no entities, but contains identifiable strings that
 * can be used to replace those entities at a later point.
 *
 * @param replacements
 *            The Map generated by the calculateEntityReplacements function
 * @param xml
 *            The XML string to modify
 * @return The modified XML
 */
private static String replaceEntities(final Map<String, String> replacements, final String xml)
String retValue = xml;
for (final String entity : replacements.keySet())
retValue = retValue.replaceAll("\\&" + entity + ";", replacements.get(entity));
return retValue;
/**
 * This function takes a parsed Document, along with the Map generated by
 * the calculateEntityReplacements function, and restores all the entities.
 *
 * @param replacements
 *            The Map generated by the calculateEntityReplacements function
 * @param node
 *            The node to modify
 */
private static void restoreEntities(final Map<String, String> replacements, final Node node)
if (node == null || replacements == null || replacements.size() == 0)
/* make the substitutions for all children nodes */
final NodeList nodeList = node.getChildNodes();
for (int i = 0; i < nodeList.getLength(); i++)
restoreEntities(replacements, nodeList.item(i));
/* cdata sections just use a straight text replace */
if (node.getNodeType() == Node.CDATA_SECTION_NODE)
for (final Entry<String, String> entityReplacement : replacements.entrySet())
final String entity = "&" + entityReplacement.getKey() + ";";
final String markerAsRE = entityReplacement.getValue().replace("[", "\\[").replace("]", "\\]");
final String textContent = node.getTextContent();
final String fixedTextContent = textContent.replaceAll(markerAsRE, entity);
else if (node.getNodeType() == Node.TEXT_NODE)
/* The list of substitution string boundaries */
final List<EntitySubstitutionBoundaryData> boundaries = new ArrayList<EntitySubstitutionBoundaryData>();
* find the start and end indexes of all the substitutions in this
* text node
for (final Entry<String, String> entityReplacement : replacements.entrySet())
final String entityName = entityReplacement.getKey();
final String entityPlaceholder = entityReplacement.getValue();
/* The length of the placeholder string */
final int entityPlaceholderLength = entityPlaceholder.length();
/* The text in this node, with the substitutions */
final String originalText = node.getTextContent();
int startIndex = 0;
while ((startIndex = originalText.indexOf(entityPlaceholder, startIndex)) != -1)
boundaries.add(new EntitySubstitutionBoundaryData(entityName, entityPlaceholder, new Pair<Integer, Integer>(startIndex, startIndex + entityPlaceholderLength - 1)));
startIndex += entityPlaceholderLength;
* if there are no boundaries, there is no need to do any
* substitutions
if (boundaries.size() != 0)
/* Sort based on the start of the boundaries */
Collections.sort(boundaries, new EntitySubstitutionBoundaryDataBoundaryStartSort());
/* get the text content of the text node */
final String originalText = node.getTextContent();
/* the parent of this node holds only this text node. */
final Node parentNode = node.getParentNode();
* loop through all the boundaries that define the position of
* the substitutions, and replace them with entity reference
* nodes.
* this involves adding a new sequence of text and entity
* reference nodes before the existing text node, and then
* removing the existing text node.
for (int i = 0; i < boundaries.size(); ++i)
final EntitySubstitutionBoundaryData boundary = boundaries.get(i);
final EntitySubstitutionBoundaryData lastBoundary = i != 0 ? boundaries.get(i - 1) : null;
/* the entity node */
final Node entityNode = parentNode.getOwnerDocument().createEntityReference(boundary.getEntityName());
/* the first substitution where text proceeds it */
if (i == 0)
if (boundary.getBoundary().getFirst() != 0)
final Node textNode = parentNode.getOwnerDocument().createTextNode(originalText.substring(0, boundary.getBoundary().getFirst()));
parentNode.insertBefore(textNode, node);
/* append an entity node after the initial text node */
parentNode.insertBefore(entityNode, node);
* there is a gap between the last boundary and this
* boundary
if (lastBoundary.getBoundary().getSecond() + 1 != boundary.getBoundary().getFirst())
final Node textNode = parentNode.getOwnerDocument().createTextNode(originalText.substring(lastBoundary.getBoundary().getSecond() + 1, boundary.getBoundary().getFirst()));
parentNode.insertBefore(textNode, node);
* append an entity node after the text node following the
* last substitution
parentNode.insertBefore(entityNode, node);
/* the last substitution where text follows it */
if (i == boundaries.size() - 1)
/* append an entity node before the last text node */
parentNode.insertBefore(entityNode, node);
if (boundary.getBoundary().getSecond() != originalText.length() - 1)
final Node textNode = parentNode.getOwnerDocument().createTextNode(originalText.substring(boundary.getBoundary().getSecond() + 1));
parentNode.insertBefore(textNode, node);
/* finally, remove the existing text node */
// TODO: deal with entities in attributes
/**
 * Converts an XML string to a Document.
 *
 * @param xml
 *            The XML to be converted
 * @return A Document converted from the supplied XML, or null if the
 *         supplied XML was invalid
 */
public static Document convertStringToDocument(final String xml)
if (xml == null)
return null;
// find the encoding, defaulting to UTF-8
String encoding = findEncoding(xml);
if (encoding == null)
encoding = "UTF-8";
* Xerces does not seem to have any way of simply importing entities
* "as is". It will try to expand them, which we don't want. As a
* work around the calculateEntityReplacements() function will map
* entity names to random substitution markers. These markers are
* parsed as plain text (they are in the format "[random_integer]").
* The replaceEntities() function will then replace the entity
* definitions in the source XML text with these substitution
* markers.
* At this point the XML has no entities, and so Xerces will parse
* the string without trying to expand the entities.
* Once we have a Document object, we run the restoreEntities()
* function, which replaces the substitution markers with entity
* reference nodes. Xerces does not try to expand entites when
* serializing a Document object to a string, nor does it try to
* extand entity reference nodes when they are added. In this way we
* can parse any XML and retain the entities without having to link
* to any DTDs or implement any EntityResolvers.
final Map<String, String> replacements = calculateEntityReplacements(xml);
final String fixedXML = replaceEntities(replacements, xml);
final DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
builderFactory.setFeature("", false);
final DocumentBuilder builder = builderFactory.newDocumentBuilder();
final Document document = builder.parse(new ByteArrayInputStream(fixedXML.getBytes(encoding)));
restoreEntities(replacements, document.getDocumentElement());
return document;
catch (Exception ex)
return null;
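/**
 * Converts a Document to a String, declaring the supplied encoding in the
 * XML declaration
 *
 * @param doc
 *            The Document to be converted
 * @param encoding
 *            The encoding name to write into the XML declaration
 * @return The String representation of the Document
 */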
public static String convertDocumentToString(final Document doc, final String encoding)
String retValue = convertDocumentToString(doc);
* The encoding used is the encoding of the DOMString type, i.e. UTF-16
* (
* writeToString). However, we need to use UTF-8
* ( So do a simple
* text replacement.
final String docEncoding = findEncoding(retValue);
if (docEncoding != null)
retValue = retValue.replace(docEncoding, encoding);
return retValue;
/**
 * Converts a Document to a String
 *
 * @param doc
 *            The Document to be converted
 * @return The String representation of the Document
 */
public static String convertDocumentToString(final Document doc)
final DOMImplementationLS domImplementation = (DOMImplementationLS) doc.getImplementation();
final LSSerializer lsSerializer = domImplementation.createLSSerializer();
// lsSerializer.getDomConfig().setParameter("format-pretty-print",
// Boolean.TRUE);
final String xml = lsSerializer.writeToString(doc);
return xml;
private static void appendIndent(final StringBuffer stringBuffer, final boolean tabIndent, final int indentLevel, final int indentCount)
final char indent = tabIndent ? '\t' : ' ';
final int totalIndentCount = indentLevel * indentCount;
for (int i = 0; i < totalIndentCount; ++i)
public static String convertNodeToString(final Node startNode, final boolean includeElementName)
return convertNodeToString(startNode, includeElementName, true, false, new ArrayList<String>(), new ArrayList<String>(), new ArrayList<String>(), true, 0, 0);
public static String convertNodeToString(final Node startNode, final List<String> verbatimElements, final List<String> inlineElements, final List<String> contentsInlineElements, final boolean tabIndent)
return convertNodeToString(startNode, true, false, false, verbatimElements, inlineElements, contentsInlineElements, tabIndent, 1, 0);
/**
 * Converts a Node to a String.
 *
 * @param startNode
 *            The Node to be converted
 * @param includeElementName
 *            true if the string should include the name of the node, or
 *            false if it is just to include the contents of the node
 * @return The String representation of the Node
 */
// Converts a node, and recursively its children, to a string.
//
// Parameters as they are used below:
//   startNode              - the node to convert; a Document is swapped for its document element
//   includeElementName     - guards the code that opens and closes the tag
//   verbatim               - when true no indent is added and text nodes are returned unchanged
//   inline                 - when true no indent is added for this node
//   verbatimElements       - element names whose children are converted with verbatim set
//   inlineElements         - element names that are not indented and whose children are converted inline
//   contentsInlineElements - element names whose children are converted inline
//   tabIndent, indentCount, indentLevel - passed to appendIndent(); children use indentLevel + 1
//
// NOTE(review): in this copy of the file the braces, the block comment
// delimiters and the statements that append to the output buffers appear to
// be missing (several "if" statements below have no visible body). Restore
// them from version control before relying on this method.
public static String convertNodeToString(final Node startNode, final boolean includeElementName, final boolean verbatim, final boolean inline, final List<String> verbatimElements, final List<String> inlineElements, final List<String> contentsInlineElements, final boolean tabIndent, final int indentCount,
final int indentLevel)
/* Find out if this node is a document */
final Node node = startNode instanceof Document ? ((Document) startNode).getDocumentElement() : startNode;
final String nodeName = node.getNodeName();
final short nodeType = node.getNodeType();
final StringBuffer stringBuffer = new StringBuffer();
* Find out if the previous node was a comment (excluding any empty text
* nodes). Also find out if this is the first node in the parent.
boolean previousNodeWasComment = false;
// note that the sibling scan starts from startNode, not from node
Node previousNode = startNode.getPreviousSibling();
while (previousNode != null)
// whitespace-only text nodes are skipped when looking for the previous node
if ((previousNode.getNodeType() == Node.TEXT_NODE && previousNode.getNodeValue().trim().isEmpty()))
previousNode = previousNode.getPreviousSibling();
if (previousNode.getNodeType() == Node.COMMENT_NODE)
previousNodeWasComment = true;
/* Find out of this node is the document root node */
final boolean documentRoot = node.getOwnerDocument().getDocumentElement() == node;
final boolean firstNode = node.getPreviousSibling() == null;
// CDATA sections: indented unless verbatim or inline
if (Node.CDATA_SECTION_NODE == nodeType)
final StringBuffer retValue = new StringBuffer();
if (!verbatim && !inline)
appendIndent(retValue, tabIndent, indentLevel, indentCount);
// NOTE(review): the bodies of the next two conditions are not visible here
if (includeElementName)
if (includeElementName)
return retValue.toString();
// comments: indented unless verbatim or inline
if (Node.COMMENT_NODE == nodeType)
final StringBuffer retValue = new StringBuffer();
if (!verbatim && !inline)
appendIndent(retValue, tabIndent, indentLevel, indentCount);
// NOTE(review): the bodies of the next two conditions are not visible here
if (includeElementName)
if (includeElementName)
return retValue.toString();
// text nodes: cleaned of insignificant whitespace unless verbatim
if (Node.TEXT_NODE == nodeType)
if (!verbatim)
String trimmedNodeValue = cleanText(node.getNodeValue());
if (!trimmedNodeValue.trim().isEmpty())
final StringBuffer retValue = new StringBuffer();
* if this is the first text node, remove all preceeding
* whitespace, and then add the indent
final boolean firstNotInlinedTextNode = !inline && firstNode;
if (firstNotInlinedTextNode)
appendIndent(retValue, tabIndent, indentLevel, indentCount);
* Remove any white space at the begining and end of the
* text, save for one space
// remember whether there was whitespace before it is stripped below
final boolean startedWithWhiteSpace = StringUtilities.startsWithWhitespace(trimmedNodeValue);
final boolean endedWithWhitespace = StringUtilities.endsWithWhitespace(trimmedNodeValue);
while (StringUtilities.startsWithWhitespace(trimmedNodeValue))
trimmedNodeValue = trimmedNodeValue.substring(1);
while (StringUtilities.endsWithWhitespace(trimmedNodeValue))
trimmedNodeValue = trimmedNodeValue.substring(0, trimmedNodeValue.length() - 1);
// put back a single space where whitespace was removed; the leading space
// is not restored for a first, non-inlined text node, which gets an indent instead
if (startedWithWhiteSpace && !firstNotInlinedTextNode)
trimmedNodeValue = " " + trimmedNodeValue;
if (endedWithWhitespace)
trimmedNodeValue += " ";
// NOTE(review): no statement that appends trimmedNodeValue to retValue is visible here
return retValue.toString();
return new String();
// verbatim text is returned exactly as stored in the node
return node.getNodeValue();
if (Node.ENTITY_REFERENCE_NODE == nodeType)
final StringBuffer retValue = new StringBuffer();
// NOTE(review): the bodies of the next two conditions are not visible here
if (includeElementName)
if (includeElementName)
return retValue.toString();
// every other kind of node is handled as an element from here on
/* open the tag */
if (includeElementName)
// the document root is never indented; other nodes are indented when they are
// not inline, when they follow a comment, or when they are a first, non-inline child
if (!verbatim && !documentRoot && ((!inline && !inlineElements.contains(nodeName)) || previousNodeWasComment || (firstNode && !inline)))
appendIndent(stringBuffer, tabIndent, indentLevel, indentCount);
/* add attributes */
final NamedNodeMap attrs = node.getAttributes();
if (attrs != null)
for (int i = 0; i < attrs.getLength(); i++)
final Node attr = attrs.item(i);
// attribute values are written as they are, without any escaping
stringBuffer.append(' ').append(attr.getNodeName()).append("=\"").append(attr.getNodeValue()).append("\"");
/* deal with children */
final NodeList children = node.getChildNodes();
if (children.getLength() == 0)
final String nodeTextContent = node.getTextContent();
if (nodeTextContent.length() == 0)
if (includeElementName)
/* indent */
if (!verbatim && !inline && !inlineElements.contains(nodeName))
appendIndent(stringBuffer, tabIndent, indentLevel, indentCount);
/* close that tag */
if (includeElementName)
if (includeElementName)
// children inherit the inline and verbatim state of this node, or gain it
// when this element's name is in one of the supplied lists
final boolean inlineMyChildren = inline || inlineElements.contains(nodeName) || contentsInlineElements.contains(nodeName);
final boolean verbatimMyChildren = verbatim || verbatimElements.contains(nodeName);
for (int i = 0; i < children.getLength(); ++i)
// children are always converted with their element names, one indent level deeper
final String childToString = convertNodeToString(children.item(i), true, verbatimMyChildren, inlineMyChildren, verbatimElements, inlineElements, contentsInlineElements, tabIndent, indentCount, indentLevel + 1);
if (childToString.length() != 0)
/* close that tag */
if (includeElementName)
/* indent */
if (!verbatimMyChildren && !inlineMyChildren)
appendIndent(stringBuffer, tabIndent, indentLevel, indentCount);
return stringBuffer.toString();
/**
 * Scans a node and all of its children for nodes of a particular type.
 *
 * @param parent
 *            The parent node
 * @param nodeName
 *            The node name to search for
 * @return a List of all the nodes found matching the nodeName under the
 *         parent
 */
public static List<Node> getNodes(final Node parent, final String nodeName)
final List<Node> nodes = new ArrayList<Node>();
final NodeList children = parent.getChildNodes();
for (int i = 0; i < children.getLength(); ++i)
final Node child = children.item(i);
if (child.getNodeName().equals(nodeName))
nodes.addAll(getNodes(child, nodeName));
return nodes;
public static List<Node> getComments(final Node parent)
return getNodes(parent, "#comment");
public static List<StringToNodeCollection> getTranslatableStrings(final Document xml, final boolean allowDuplicates)
if (xml == null)
return null;
final List<StringToNodeCollection> retValue = new ArrayList<StringToNodeCollection>();
final NodeList nodes = xml.getDocumentElement().getChildNodes();
for (int i = 0; i < nodes.getLength(); ++i)
final Node node = nodes.item(i);
getTranslatableStringsFromNode(node, retValue, allowDuplicates, new XMLProperties());
return retValue;
private static boolean doesElementContainTranslatableContent(final Node node)
final NodeList children = node.getChildNodes();
if (children != null)
/* check to see if any of the children are translatable nodes */
for (int j = 0; j < children.getLength(); ++j)
final Node child = children.item(j);
final String childName = child.getNodeName();
/* this child node is itself translatable, so return true */
if (TRANSLATABLE_ELEMENTS.contains(childName))
return true;
* now check to see if any of the child have children that are
* translatable
for (int j = 0; j < children.getLength(); ++j)
final Node child = children.item(j);
final NodeList grandChildren = child.getChildNodes();
for (int k = 0; k < grandChildren.getLength(); ++k)
final Node grandChild = grandChildren.item(k);
final boolean result = doesElementContainTranslatableContent(grandChild);
if (result)
return true;
return false;
// Walks a node and records the translatable strings found in it, together
// with the nodes each string was built from, in translationStrings.
//
//   node               - the node to process
//   translationStrings - the collection that the strings are added to
//   allowDuplicates    - passed through to addTranslationToNodeDetailsToCollection()
//   props              - the inline / verbatim state inherited from the caller
//
// NOTE(review): in this copy of the file the braces, "else" keywords, block
// comment delimiters and some statements appear to be missing (for example
// nothing visible adds a child to the "nodes" list, and the assignment after
// matcher.matches() is truncated). Restore them from version control before
// relying on this method.
private static void getTranslatableStringsFromNode(final Node node, final List<StringToNodeCollection> translationStrings, final boolean allowDuplicates, final XMLProperties props)
if (node == null || translationStrings == null)
// work on a copy, so the flags set for this node do not change the caller's properties
XMLProperties xmlProperites = new XMLProperties(props);
final String nodeName = node.getNodeName();
final String nodeParentName = node.getParentNode() != null ? node.getParentNode().getNodeName() : null;
final boolean textElement = node.getNodeType() == Node.TEXT_NODE;
final boolean translatableElement = TRANSLATABLE_ELEMENTS.contains(nodeName);
final boolean standaloneElement = TRANSLATABLE_IF_STANDALONE_ELEMENTS.contains(nodeName);
final boolean translatableParentElement = TRANSLATABLE_ELEMENTS.contains(nodeParentName);
// the inline and verbatim flags are only ever switched on, never off
if (!xmlProperites.isInline() && INLINE_ELEMENTS.contains(nodeName)) xmlProperites.setInline(true);
if (!xmlProperites.isVerbatim() && VERBATIM_ELEMENTS.contains(nodeName)) xmlProperites.setVerbatim(true);
* this element has translatable strings if:
* 1. a text node
* OR
* 2. a translatableElement
* 3. a standaloneElement without a translatableParentElement
* 4. not a standaloneElement and not an inlineElement
if (textElement || (translatableElement && ((standaloneElement && !translatableParentElement) || (!standaloneElement && !xmlProperites.isInline()))))
final NodeList children = node.getChildNodes();
// NOTE(review): a null child list counts as "has children" here, yet the list
// is dereferenced further down - confirm whether "children != null &&" was intended
final boolean hasChildren = children == null || children.getLength() != 0;
/* dump the node if it has no children */
if (!hasChildren)
final String nodeText = convertNodeToString(node, false);
final String cleanedNodeText = cleanTranslationText(nodeText, true, true);
// verbatim text is recorded as is; other text is cleaned first and skipped when empty
if (xmlProperites.isVerbatim())
addTranslationToNodeDetailsToCollection(nodeText, node, allowDuplicates, translationStrings);
else if (!cleanedNodeText.isEmpty())
addTranslationToNodeDetailsToCollection(cleanedNodeText, node, allowDuplicates, translationStrings);
* dump all child nodes until we hit one that itself contains a
* translatable element. in effect the translation strings can
* contain up to one level of xml elements.
// the nodes that make up the string being built, and the string itself
ArrayList<Node> nodes = new ArrayList<Node>();
String translatableString = "";
final int childrenLength = children.getLength();
for (int i = 0; i < childrenLength; ++i)
final Node child = children.item(i);
* does this child have another level of translatable tags?
boolean containsTranslatableTags = doesElementContainTranslatableContent(child);
* if so, save the string we have been building up, process
* the child, and start building up a new string
if (containsTranslatableTags)
if (nodes.size() != 0)
* We have found a child node that itself contains
* some translatable children. In this case we
* create a new translatable string. It is possible
* that the translatableString has some
* insignificant trailing whitespace, because the
* call to the cleanTranslationText function in the
* else statement below has assumed that the node
* being processed was not the last one in the
* translatable string, making the trailing
* whitespace important. So we clean up the trailing
* whitespace here.
final Matcher matcher = TRAILING_WHITESPACE_RE_PATTERN.matcher(translatableString);
if (matcher.matches())
// NOTE(review): truncated statement; presumably it assigned the "content" group of the matcher - confirm
translatableString ="content");
addTranslationToNodeDetailsToCollection(translatableString, nodes, allowDuplicates, translationStrings);
translatableString = "";
nodes = new ArrayList<Node>();
// the child is processed on its own, inheriting this node's properties
getTranslatableStringsFromNode(child, translationStrings, allowDuplicates, xmlProperites);
final String childName = child.getNodeName();
final String childText = convertNodeToString(child, true);
// leading whitespace is only dropped for the first child, trailing whitespace only for the last
final String cleanedChildText = cleanTranslationText(childText, i == 0, i == childrenLength - 1);
final boolean isVerbatimNode = VERBATIM_ELEMENTS.contains(childName);
final String thisTranslatableString = isVerbatimNode || xmlProperites.isVerbatim() ? childText : cleanedChildText;
translatableString += thisTranslatableString;
/* save the last translated string */
if (nodes.size() != 0)
addTranslationToNodeDetailsToCollection(translatableString, nodes, allowDuplicates, translationStrings);
translatableString = "";
nodes = new ArrayList<Node>();
/* if we hit a non-translatable element, process its children */
final NodeList nodeList = node.getChildNodes();
for (int i = 0; i < nodeList.getLength(); ++i)
final Node child = nodeList.item(i);
getTranslatableStringsFromNode(child, translationStrings, allowDuplicates, xmlProperites);
public static void replaceTranslatedStrings(final Document xml, final Map<String, String> translations)
if (xml == null || translations == null || translations.size() == 0)
* Get the translation strings and the nodes that the string maps to. We
* assume that the xml being provided here is an exact match for the
* xml that was supplied to getTranslatableStrings originally, which we
* then assume matches the strings supplied as the keys in the
* translations parameter.
final List<StringToNodeCollection> stringToNodeCollections = getTranslatableStrings(xml, false);
if (stringToNodeCollections == null || stringToNodeCollections.size() == 0)
for (final StringToNodeCollection stringToNodeCollection : stringToNodeCollections)
final String originalString = stringToNodeCollection.getTranslationString();
final ArrayList<ArrayList<Node>> nodeCollections = stringToNodeCollection.getNodeCollections();
if (nodeCollections != null && nodeCollections.size() != 0)
if (translations.containsKey(originalString))
final String translation = translations.get(originalString);
/* wrap the returned translation in a root element */
final String wrappedTranslation = "<tempRoot>" + translation + "</tempRoot>";
/* convert the wrapped translation into an XML document */
final Document translationDocument = convertStringToDocument(wrappedTranslation);
/* was the conversion successful */
if (translationDocument != null)
for (final ArrayList<Node> nodes : nodeCollections)
if (nodes != null && nodes.size() != 0)
* All nodes in a collection should share the
* same parent
final Node parent = nodes.get(0).getParentNode();
if (parent != null)
* Start by inserting the nodes created when
* we converted the translated text into
* XML. Do it in reverse order, because
* that's the easiest solution for appending
* to the start of the element in the
* original order.
final NodeList translatedChildren = translationDocument.getDocumentElement().getChildNodes();
for (int i = translatedChildren.getLength() - 1; i >= 0; --i)
* import the node from the translated
* xml "fragment"
final Node translatedNode = xml.importNode(translatedChildren.item(i), true);
* insert it into the xml doc to be
* translated
parent.insertBefore(translatedNode, parent.getFirstChild());
* remove the original nodes that the
* translated text came from
for (final Node node : nodes)
if (parent == node.getParentNode())
private static StringToNodeCollection findExistingText(final String text, final List<StringToNodeCollection> translationStrings)
for (final StringToNodeCollection stringToNodeCollection : translationStrings)
if (stringToNodeCollection.getTranslationString().equals(text))
return stringToNodeCollection;
return null;
private static void addTranslationToNodeDetailsToCollection(final String text, final Node node, final boolean allowDuplicates, final List<StringToNodeCollection> translationStrings)
final ArrayList<Node> nodes = new ArrayList<Node>();
addTranslationToNodeDetailsToCollection(text, nodes, allowDuplicates, translationStrings);
private static void addTranslationToNodeDetailsToCollection(final String text, final ArrayList<Node> nodes, final boolean allowDuplicates, final List<StringToNodeCollection> translationStrings)
if (allowDuplicates)
translationStrings.add(new StringToNodeCollection(text).addNodeCollection(nodes));
final StringToNodeCollection stringToNodeCollection = findExistingText(text, translationStrings);
if (stringToNodeCollection == null)
translationStrings.add(new StringToNodeCollection(text).addNodeCollection(nodes));
/** Cleans a string for presentation to a translator */
private static String cleanTranslationText(final String input, final boolean removeWhitespaceFromStart, final boolean removeWhitespaceFromEnd)
String retValue = cleanText(input);
final boolean hasStartWhiteSpace = PRECEEDING_WHITESPACE_SIMPLE_RE_PATTERN.matcher(input).matches();
final boolean hasEndWhiteSpace = TRAILING_WHITESPACE_SIMPLE_RE_PATTERN.matcher(input).matches();
retValue = retValue.trim();
* When presenting the contents of a childless XML node to the
* translator, there is no need for white space padding. When building
* up a translatable string from a succession of text nodes, whitespace
* becomes important.
if (!removeWhitespaceFromStart)
if (hasStartWhiteSpace)
retValue = " " + retValue;
if (!removeWhitespaceFromEnd)
if (hasEndWhiteSpace)
retValue += " ";
return retValue;
/** Cleans a string of insignificant whitespace */
private static String cleanText(final String input)
/* get rid of line breaks */
String retValue = input.replaceAll("\\r\\n|\\r|\\n|\\t", " ");
/* get rid of double spaces */
while (retValue.indexOf(" ") != -1)
retValue = retValue.replaceAll(" ", " ");
return retValue;
/**
 * CDATA sections can not have a "]]>" in them. This method takes the input
 * and wraps it up in one or more CDATA sections, so that any "]]>" in the
 * input does not end the CDATA section early.
 */
public static String wrapStringInCDATA(final String input)
final StringBuffer retValue = new StringBuffer("<![CDATA[");
retValue.append(input.replaceAll(END_CDATA_RE, END_CDATA_RE + END_CDATA_REPLACE + START_CDATA));
return retValue.toString();
/**
 * The formatting state that is passed down while the nodes of a document are
 * processed: whether the current node is inside a verbatim element, and
 * whether it is inside an inline element. Both flags start out false.
 */
class XMLProperties {
    private boolean verbatim = false;
    private boolean inline = false;

    /** Creates properties with both flags set to false. */
    public XMLProperties() {
    }

    /**
     * Creates a copy of the supplied properties.
     *
     * @param props
     *            The properties to copy; if null both flags are left false
     */
    public XMLProperties(XMLProperties props) {
        if (props != null) {
            this.inline = props.isInline();
            this.verbatim = props.isVerbatim();
        }
    }

    /** @return true if the verbatim flag is set */
    public boolean isVerbatim() {
        return verbatim;
    }

    /**
     * @param verbatim
     *            The new value of the verbatim flag
     */
    public void setVerbatim(boolean verbatim) {
        this.verbatim = verbatim;
    }

    /** @return true if the inline flag is set */
    public boolean isInline() {
        return inline;
    }

    /**
     * @param inline
     *            The new value of the inline flag
     */
    public void setInline(boolean inline) {
        this.inline = inline;
    }
}