// Source code of com.overzealous.remark.convert.DocumentConverter

/*
 * Copyright 2011 OverZealous Creations, LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package com.overzealous.remark.convert;


import com.overzealous.remark.IgnoredHtmlElement;
import com.overzealous.remark.Options;
import com.overzealous.remark.util.BlockWriter;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;


import java.io.OutputStream;
import java.io.Writer;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * The class that does the heavy lifting for converting a JSoup Document into
 * valid Markdown
 *
 * @author Phil DeJarnett
 */
public class DocumentConverter {


  // These properties do not change for the life of this converter

  // The options supplied to the constructor.
  final Options options;
  // Text cleaner built from the options; used here to escape abbreviation
  // definitions and to undo leading escapes of inline content.
  final TextCleaner cleaner;
  // Tag names taken from options.getIgnoredHtmlElements(); elements with these
  // names are passed to the current handler's handleIgnoredHTMLElement.
  private final Set<String> ignoredHtmlTags;
  // Handlers by tag name that apply at block level. Inline handlers are
  // registered here as well (see addInlineNode).
  final Map<String,NodeHandler> blockNodes;
  // Handlers by tag name that apply when walking inline content.
  final Map<String,NodeHandler> inlineNodes;


  // These properties change for each conversion
  private Map<String,String> linkUrls; // for looking up links via URL
  // Counters used to build the generic "Link N" / "Image N" IDs.
  private int genericLinkUrlCounter;
  private int genericImageUrlCounter;
  private Map<String,String> linkIds; // an inverse of linkUrls, for looking up links via ID
  private Map<String,String> abbreviations; // a cache of abbreviations mapped by abbreviated form
  BlockWriter output; // the output writer, which may change during recursion


  // The handler map passed to the walkNodes call currently in progress; the
  // two-argument walkNodes overload reuses it.
  private Map<String,NodeHandler> lastNodeset;


  // Splits the comma-delimited tag name lists given to addInlineNode/addBlockNode.
  private static final Pattern COMMA = Pattern.compile(",");
  // Runs of two or more spaces.
  private static final Pattern LINK_MULTIPLE_SPACES = Pattern.compile(" {2,}", Pattern.DOTALL);
  // Runs of characters other than hyphen, word characters, space, and period.
  private static final Pattern LINK_SAFE_CHARS = Pattern.compile("[^-\\w \\.]+", Pattern.DOTALL);
  // Replacement for each run of unsafe characters in a link ID.
  private static final String LINK_REPLACEMENT = "_";
  // Replacement characters at the very start or very end of a link ID.
  private static final Pattern LINK_EDGE_REPLACE = Pattern.compile(String.format("(^%1$s++)|(%1$s++$)", LINK_REPLACEMENT));
  // Two or more consecutive replacement characters.
  private static final Pattern LINK_MULTIPLE_REPLACE = Pattern.compile(String.format("%1$s{2,}", LINK_REPLACEMENT));
  // Captures whatever follows the last slash of a URL.
  private static final Pattern LINK_FILENAME = Pattern.compile("/([^/]++)$");




  /**
   * Creates a DocumentConverted with the given options.
   * @param options Options for this converter.
   */
  public DocumentConverter(Options options) {
    // configure final properties
    this.options = options;
    cleaner = new TextCleaner(options);
    ignoredHtmlTags = new HashSet<String>();
    blockNodes = new HashMap<String, NodeHandler>();
    inlineNodes = new HashMap<String, NodeHandler>();


    // configure ignored tags
    for(final IgnoredHtmlElement ihe : options.getIgnoredHtmlElements()) {
      ignoredHtmlTags.add(ihe.getTagName());
    }
    
    configureNodes();
  }


  private void configureNodes() {
    addInlineNode(new InlineStyle(),  "i,em,b,strong,font,span");
    addInlineNode(new InlineCode(),    "code,tt");
    addInlineNode(new Image(),      "img");
    addInlineNode(new Anchor(),      "a");
    addInlineNode(new Break(),      "br");
    addBlockNode (new Header(),      "h1,h2,h3,h4,h5,h6");
    addBlockNode (new Paragraph(),    "p");
    addBlockNode (new Codeblock(),    "pre");
    addBlockNode (new BlockQuote(),    "blockquote");
    addBlockNode (new HorizontalRule(),  "hr");
    addBlockNode (new List(),      "ol,ul");


    if(options.abbreviations) {
      addInlineNode(new Abbr(),    "abbr,acronym");
    }


    if(options.definitionLists) {
      addBlockNode(new Definitions(),  "dl");
    }




    // TABLES
    if(options.getTables().isConvertedToText()) {
      // if we are going to process it, add the handler
      addBlockNode(new Table(),    "table");


    } else if(options.getTables().isRemoved()) {
      addBlockNode(NodeRemover.getInstance(), "table");


    } // else, it's being added directly
  }


  @SuppressWarnings({"UnusedDeclaration"})
  public Options getOptions() {
    return options;
  }


  @SuppressWarnings({"UnusedDeclaration"})
  public TextCleaner getCleaner() {
    return cleaner;
  }


  @SuppressWarnings({"UnusedDeclaration"})
  public Map<String, NodeHandler> getBlockNodes() {
    return Collections.unmodifiableMap(blockNodes);
  }


  @SuppressWarnings({"UnusedDeclaration"})
  public Map<String, NodeHandler> getInlineNodes() {
    return Collections.unmodifiableMap(inlineNodes);
  }


  @SuppressWarnings({"UnusedDeclaration"})
  public BlockWriter getOutput() {
    return output;
  }


  /**
   * Replaces the writer that receives the converted output.
   *
   * @param output The writer to use from now on.
   */
  @SuppressWarnings({"UnusedDeclaration"})
  public void setOutput(BlockWriter output) {
    this.output = output;
  }


  /**
   * Customize the processing for a node.  This node is added to the
   * inline list and the block list.  The inline list is used for nodes
   * that do not contain linebreaks, such as {@code <em>} or {@code <strong>}.
   *
   * The tagnames is a comma-delimited list of tagnames for
   * which this handler should be applied.
   *
   * @param handler The handler for the nodes
   * @param tagnames One or more tagnames
   */
  @SuppressWarnings({"WeakerAccess"})
  public void addInlineNode(NodeHandler handler, String tagnames) {
    for(final String key : COMMA.split(tagnames)) {
      if(key.length() > 0) {
        inlineNodes.put(key, handler);
        blockNodes.put(key, handler);
      }
    }
  }




  /**
   * Customize the processing for a node.  This node is added to the
   * block list only.  The node handler should properly use the
   * {@link com.overzealous.remark.util.BlockWriter#startBlock()} and
   * {@link com.overzealous.remark.util.BlockWriter#endBlock()} methods as
   * appropriate.
   *
   * The tagnames is a comma-delimited list of tagnames for
   * which this handler should be applied.
   *
   * @param handler The handler for the nodes
   * @param tagnames One or more tagnames
   */
  @SuppressWarnings({"WeakerAccess"})
  public void addBlockNode(NodeHandler handler, String tagnames) {
    for(final String key : COMMA.split(tagnames)) {
      if(key.length() > 0) {
        blockNodes.put(key, handler);
      }
    }
  }


  /**
   * Convert a document to the given writer.
   *
   * <p><strong>Note: It is up to the calling class to handle closing the writer!</strong></p>
   *
   * @param doc Document to convert
   * @param out Writer to receive the final output
   */
  public void convert(Document doc, Writer out) {
    this.output = new BlockWriter(out, true);
    this.convertImpl(doc);
  }


  /**
   * Convert a document to the given output stream.
   *
   * <p><strong>Note: It is up to the calling class to handle closing the stream!</strong></p>
   *
   * @param doc Document to convert
   * @param out OutputStream to receive the final output
   */
  public void convert(Document doc, OutputStream out) {
    this.output = new BlockWriter(out, true);
    this.convertImpl(doc);
  }


  /**
   * Convert a document and return a string.
   * When wanting a final string, this method should always be used.
   * It will attempt to calculate the size of the buffer necessary to hold the entire output.
   *
   * @param doc Document to convert
   * @return The Markdown-formatted string.
   */
  public String convert(Document doc) {
    // estimate the size necessary to handle the final output
    BlockWriter bw = BlockWriter.create(DocumentConverter.calculateLength(doc, 0));
    this.output = bw;
    this.convertImpl(doc);
    return bw.toString();
  }


  // Utility method to quickly walk the DOM tree and estimate the size of the
  // buffer necessary to hold the result.
  private static int calculateLength(Element el, int depth) {
    int result = 0;
    for(final Node n : el.childNodes()) {
      if(n instanceof Element) {
        result += (4 * depth) + calculateLength((Element)n, depth+1);
      } else if(n instanceof TextNode) {
        result += ((TextNode)n).text().length();
      }
    }
    return result;
  }


  // implementation of the convert method.  Basically handles setting up the
  private void convertImpl(Document doc) {
    
    // linked, because we want the resulting list of links in order they were added
    linkIds = new LinkedHashMap<String, String>();
    // To keep track of already added URLs
    linkUrls = new HashMap<String, String>();
    genericImageUrlCounter = 0;
    genericLinkUrlCounter = 0;
    // linked, to keep abbreviations in the order they were added
    abbreviations = new LinkedHashMap<String, String>();


    lastNodeset = blockNodes;


    // walk the DOM
    walkNodes(DefaultNodeHandler.getInstance(), doc.body(), blockNodes);


    if(!linkIds.isEmpty()) {
      // Add links
      output.startBlock();
      for(final Map.Entry<String,String> link : linkIds.entrySet()) {
        output.printf("\n[%s]: %s", link.getKey(), link.getValue());
      }
      output.endBlock();
    }
    if(!abbreviations.isEmpty()) {
      // Add abbreviations
      output.startBlock();
      for(final Map.Entry<String,String> abbr : abbreviations.entrySet()) {
        output.printf("\n*[%s]: %s", abbr.getKey(), cleaner.clean(abbr.getValue()));
      }
      output.endBlock();
    }


    // free up unused properties
    linkIds = null;
    linkUrls = null;
    abbreviations = null;
    output = null;
  }


  /**
   * Loops over the children of an HTML Element, handling TextNode and child Elements.
   *
   * @param currentNode The default node handler for TextNodes and IgnoredHTMLElements.
   * @param el The parent HTML Element whose children are being looked at.
   */
  public void walkNodes(NodeHandler currentNode, Element el) {
    walkNodes(currentNode, el, lastNodeset);
  }


  /**
   * Loops over the children of an HTML Element, handling TextNode and child Elements.
   *
   * @param currentNodeHandler The default node handler for TextNodes and IgnoredHTMLElements.
   * @param el The parent HTML Element whose children are being looked at.
   * @param nodeList The list of valid nodes at this level.  Should be one of <b>blockNodes</b> or <b>inlineNodes</b>
   */
  public void walkNodes(NodeHandler currentNodeHandler, Element el, Map<String, NodeHandler> nodeList) {
    Map<String, NodeHandler> backupLastNodeset = lastNodeset;
    lastNodeset = nodeList;
    for(final Node n : el.childNodes()) {
      if(n instanceof TextNode) {
        // It's just text!
        currentNodeHandler.handleTextNode((TextNode) n, this);


      } else if(n instanceof Element) {
        // figure out who can handle this
        Element node = (Element)n;
        String tagName = node.tagName();


        if(nodeList.containsKey(tagName)) {
          // OK, we know how to handle this node
          nodeList.get(tagName).handleNode(currentNodeHandler, node, this);


        } else if(ignoredHtmlTags.contains(tagName)) {
          // User wants to leave this tag in the output.  Naughty user.
          currentNodeHandler.handleIgnoredHTMLElement(node, this);


        } else {
          // No-can-do, just remove the node, and keep on walkin'
          // The only thing we'll do is add block status in if the unknown node
          // usually renders as a block.
          // Due to BlockWriter's intelligent tracking, we shouldn't get a whole bunch
          // of empty lines for empty nodes.
          if(node.isBlock()) {
            output.startBlock();
          }
          walkNodes(currentNodeHandler, node, nodeList);
          if(node.isBlock()) {
            output.endBlock();
          }          
        }
      } // else: not a node we care about (e.g.: comment nodes)
    }
    lastNodeset = backupLastNodeset;
  }


  /**
   * Recursively processes child nodes and returns the potential output string.
   * @param currentNode The default node handler for TextNodes and IgnoredHTMLElements.
   * @param el The parent HTML Element whose children are being looked at.
   * @return The potential output string.
   */
  public String getInlineContent(NodeHandler currentNode, Element el) {
    return this.getInlineContent(currentNode, el, false);
  }


  /**
   * Recursively processes child nodes and returns the potential output string.
   * @param currentNode The default node handler for TextNodes and IgnoredHTMLElements.
   * @param el The parent HTML Element whose children are being looked at.
   * @param undoLeadingEscapes If true, leading escapes are removed
   * @return The potential output string.
   */
  public String getInlineContent(NodeHandler currentNode, Element el, boolean undoLeadingEscapes) {
    BlockWriter oldOutput = output;
    output = BlockWriter.create(1000);
    walkNodes(currentNode, el, inlineNodes);
    String ret = output.toString();
    output = oldOutput;
    if(undoLeadingEscapes) {
      ret = cleaner.unescapeLeadingCharacters(ret);
    }
    return ret;
  }


  /**
   * Adds a link to the link set, and returns the actual ID for the link.
   *
   * @param url URL for link
   * @param recommendedName A recommended name for non-simple link IDs. This might be modified.
   * @param image If true, use "img-" instead of "link-" for simple link IDs.
   * @return The actual link ID for this URL.
   */
  public String addLink(String url, String recommendedName, boolean image) {
    String linkId;
    if(linkUrls.containsKey(url)) {
      linkId = linkUrls.get(url);
    } else {
      if(options.simpleLinkIds) {
        linkId = (image ? "image-" : "") + String.valueOf(linkUrls.size()+1);
      } else {
        recommendedName = cleanLinkId(url, recommendedName, image);
        if(linkIds.containsKey(recommendedName)) {
          int incr = 1;
          while(linkIds.containsKey(String.format("%s %d", recommendedName, incr))) {
            incr++;
          }
          recommendedName = String.format("%s %d", recommendedName, incr);
        }
        linkId = recommendedName;
      }
      linkUrls.put(url, linkId);
      linkIds.put(linkId, url);
    }
    return linkId;
  }


  /**
   * Adds an abbreviation to the abbreviation set.
   * @param abbr The abbreviation to be used
   * @param definition The definition for the abbreviation, should NOT be pre-escaped.
   */
  void addAbbreviation(String abbr, String definition) {
    if(!abbreviations.containsKey(abbr)) {
      abbreviations.put(abbr, definition);
    }
  }


  String cleanLinkId(String url, String linkId, boolean image) {
    // no newlines
    String ret = linkId.replace('\n', ' ');
    // multiple spaces should be a single space
    ret = LINK_MULTIPLE_SPACES.matcher(ret).replaceAll(" ");
    // remove all characters except letters, numbers, spaces, and some basic punctuation
    ret = LINK_SAFE_CHARS.matcher(ret).replaceAll(LINK_REPLACEMENT);
    // replace multiple underscores with a single underscore
    ret = LINK_MULTIPLE_REPLACE.matcher(ret).replaceAll(LINK_REPLACEMENT);
    // replace underscores on the left or right with nothing
    ret = LINK_EDGE_REPLACE.matcher(ret).replaceAll("");
    // trim any leading or trailing spaces
    ret = ret.trim();
    if(ret.length() == 0 || ret.equals(LINK_REPLACEMENT)) {
      // if we have nothing usable left, use a generic ID
      if(image) {
        if(url != null) {
          Matcher m = LINK_FILENAME.matcher(url);
          if(m.find()) {
            ret = cleanLinkId(null, m.group(1), true);
          } else {
            genericImageUrlCounter++;
            ret = "Image " + genericImageUrlCounter;
          }
        } else {
          genericImageUrlCounter++;
          ret = "Image " + genericImageUrlCounter;
        }
      } else {
        genericLinkUrlCounter++;
        ret = "Link " + genericLinkUrlCounter;
      }
    } // else, use the cleaned id
    return ret;
  }
}
// Source code of com.overzealous.remark.convert.DocumentConverter
//
// Related classes of com.overzealous.remark.convert.DocumentConverter