// Source code of org.jboss.elasticsearch.river.remote.sitemap.SiteMapParser

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.jboss.elasticsearch.river.remote.sitemap;


import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import java.util.zip.GZIPInputStream;


import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;


import org.apache.commons.io.input.BOMInputStream;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.jboss.elasticsearch.river.remote.sitemap.AbstractSiteMap.SitemapType;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;


/**
 * @author http://code.google.com/p/crawler-commons
 */
public class SiteMapParser {


  /** Class-wide logger used for debug/trace/warn output of the parsing steps. */
  private static final ESLogger logger = Loggers.getLogger(SiteMapParser.class);


  /**
   * According to the specs, 50K URLs per Sitemap is the max. Used as the upper bound by the parsing loops in this
   * class that reference it.
   */
  private static final int MAX_URLS = 50000;


  /**
   * Sitemap docs must be limited to 10MB (10,485,760 bytes).
   * <p>
   * NOTE(review): this value is not read anywhere in this class; presumably callers use it to cap downloads - confirm.
   * It is public and non-final, so external code may reassign it.
   */
  public static int MAX_BYTES_ALLOWED = 10485760;


  /** True (by default) if invalid URLs should be rejected. Assigned once, in the constructor. */
  private boolean strict;


  /**
   * Creates a parser in strict mode, i.e. equivalent to {@code new SiteMapParser(true)}.
   */
  public SiteMapParser() {
    this(true);
  }


  /**
   * Creates a parser.
   *
   * @param strict {@code true} to drop URLs that fail the {@code urlIsLegal} check; {@code false} to keep them (they
   *          are still flagged as not valid when added to the sitemap)
   */
  public SiteMapParser(boolean strict) {
    this.strict = strict;
  }


  /**
   * Reports the mode this parser was constructed with.
   *
   * @return whether invalid URLs will be rejected
   */
  public boolean isStrict() {
    return strict;
  }


  /**
   * Returns a SiteMap or SiteMapIndex given a content type, byte content and the URL of a sitemap
   */
  public AbstractSiteMap parseSiteMap(String contentType, byte[] content, URL url) throws UnknownFormatException,
      IOException {


    // Use extension or MIME type to determine how we should try
    // to process the response
    if (url.getPath().endsWith(".xml") || contentType.contains("text/xml") || contentType.contains("application/xml")
        || contentType.contains("application/x-xml") || contentType.contains("application/atom+xml")
        || contentType.contains("application/rss+xml")) {


      // Try parsing the XML which could be in a number of formats
      return processXml(url, content);
    } else if (url.getPath().endsWith(".txt") || contentType.contains("text/plain")) {
      // plain text
      return (AbstractSiteMap) processText(content, url.toString());
    } else if (url.getPath().endsWith(".gz") || contentType.contains("application/gzip")
        || contentType.contains("application/x-gzip") || contentType.contains("application/x-gunzip")
        || contentType.contains("application/gzipped") || contentType.contains("application/gzip-compressed")
        || contentType.contains("application/x-compress") || contentType.contains("gzip/document")
        || contentType.contains("application/octet-stream")) {
      return processGzip(url, content);
    }
    throw new UnknownFormatException("Unknown format " + contentType + " at " + url);
  }


  /**
   * Parse the given XML content.
   * 
   * @param sitemapUrl
   * @param xmlContent
   * @return
   * @throws UnknownFormatException
   */
  private AbstractSiteMap processXml(URL sitemapUrl, byte[] xmlContent) throws UnknownFormatException {


    BOMInputStream bomIs = new BOMInputStream(new ByteArrayInputStream(xmlContent));
    InputSource is = new InputSource();
    is.setCharacterStream(new BufferedReader(new InputStreamReader(bomIs)));
    return processXml(sitemapUrl, is);
  }


  /**
   * Process a text-based Sitemap. Text sitemaps only list URLs but no priorities, last mods, etc.
   * 
   * @param content
   * @throws IOException
   */
  private SiteMap processText(byte[] content, String sitemapUrl) throws IOException {


    logger.debug("Processing textual Sitemap");


    SiteMap textSiteMap = new SiteMap(sitemapUrl);
    textSiteMap.setType(SitemapType.TEXT);


    BOMInputStream bomIs = new BOMInputStream(new ByteArrayInputStream(content));
    @SuppressWarnings("resource")
    BufferedReader reader = new BufferedReader(new InputStreamReader(bomIs));


    String line;


    int i = 1;
    while ((line = reader.readLine()) != null) {
      if (line.length() > 0 && i <= MAX_URLS) {
        try {
          URL url = new URL(line);
          boolean valid = urlIsLegal(textSiteMap.getBaseUrl(), url.toString());


          if (valid || !strict) {
            if (logger.isDebugEnabled()) {
              StringBuffer sb = new StringBuffer("  ");
              sb.append(i).append(". ").append(url);
              logger.debug(sb.toString());
            }
            i++;
            SiteMapURL surl = new SiteMapURL(url, valid);
            textSiteMap.addSiteMapUrl(surl);
          }
        } catch (MalformedURLException e) {
          logger.debug("Bad URL [" + line + "].");
        }
      }
    }
    textSiteMap.setProcessed(true);
    return textSiteMap;
  }


  /**
   * Decompress the gzipped content and process the resulting XML Sitemap.
   * 
   * @param url - URL of the gzipped content
   * @param response - Gzipped content
   * @throws MalformedURLException
   * @throws IOException
   * @throws UnknownFormatException
   */
  private AbstractSiteMap processGzip(URL url, byte[] response) throws MalformedURLException, IOException,
      UnknownFormatException {


    logger.debug("Processing gzip");


    AbstractSiteMap smi;


    InputStream is = new ByteArrayInputStream(response);


    // Remove .gz ending
    String xmlUrl = url.toString().replaceFirst("\\.gz$", "");


    logger.debug("XML url = " + xmlUrl);


    BOMInputStream decompressed = new BOMInputStream(new GZIPInputStream(is));
    InputSource in = new InputSource(decompressed);
    in.setSystemId(xmlUrl);
    smi = processXml(url, in);
    decompressed.close();
    return smi;
  }


  /**
   * Parse the given XML content.
   * 
   * @param sitemapUrl
   * @param is
   * @throws UnknownFormatException
   */
  private AbstractSiteMap processXml(URL sitemapUrl, InputSource is) throws UnknownFormatException {


    Document doc = null;


    try {
      DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
      DocumentBuilder db = dbf.newDocumentBuilder();
      db.setErrorHandler(new ErrorHandler() {


        @Override
        public void warning(SAXParseException exception) throws SAXException {
          logger.warn("Sitemap XML warning: " + exception.getMessage());
        }


        @Override
        public void fatalError(SAXParseException exception) throws SAXException {
          logger.warn("Sitemap XML fatalError: " + exception.getMessage());
        }


        @Override
        public void error(SAXParseException exception) throws SAXException {
          logger.warn("Sitemap XML error: " + exception.getMessage());


        }
      });
      doc = db.parse(is);
    } catch (Exception e) {
      throw new UnknownFormatException("Error parsing XML for " + sitemapUrl);
    }


    // See if this is a sitemap index
    NodeList nodeList = doc.getElementsByTagName("sitemapindex");
    if (nodeList.getLength() > 0) {
      nodeList = doc.getElementsByTagName("sitemap");
      return parseSitemapIndex(sitemapUrl, nodeList);
    } else if (doc.getElementsByTagName("urlset").getLength() > 0) {
      // This is a regular Sitemap
      return parseXmlSitemap(sitemapUrl, doc);
    } else if (doc.getElementsByTagName("link").getLength() > 0) {
      // Could be RSS or Atom
      return parseSyndicationFormat(sitemapUrl, doc);
    }
    throw new UnknownFormatException("Unknown XML format for " + sitemapUrl);
  }


  /**
   * Parse XML that contains a valid Sitemap. Example of a Sitemap: <?xml version="1.0" encoding="UTF-8"?> <urlset
   * xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> <url> <loc>http://www.example.com/</loc>
   * <lastmod>2005-01-01</lastmod> <changefreq>monthly</changefreq> <priority>0.8</priority> </url> <url> <loc
   * >http://www.example.com/catalog?item=12&amp;desc=vacation_hawaii</loc> <changefreq>weekly</changefreq> </url>
   * </urlset>
   * 
   * @param doc
   */
  private SiteMap parseXmlSitemap(URL sitemapUrl, Document doc) {


    SiteMap sitemap = new SiteMap(sitemapUrl);
    sitemap.setType(SitemapType.XML);


    NodeList list = doc.getElementsByTagName("url");


    // Loop through the <url>s
    for (int i = 0; i < list.getLength(); i++) {


      Node n = list.item(i);


      if (n.getNodeType() == Node.ELEMENT_NODE) {
        Element elem = (Element) n;


        String loc = getElementValue(elem, "loc");


        URL url = null;
        try {
          url = new URL(loc);
          String lastMod = getElementValue(elem, "lastmod");
          String changeFreq = getElementValue(elem, "changefreq");
          String priority = getElementValue(elem, "priority");
          boolean valid = urlIsLegal(sitemap.getBaseUrl(), url.toString());


          if (valid || !strict) {
            SiteMapURL sUrl = new SiteMapURL(url.toString(), lastMod, changeFreq, priority, valid);
            sitemap.addSiteMapUrl(sUrl);
            if (logger.isDebugEnabled()) {
              StringBuffer sb = new StringBuffer("  ");
              sb.append(i + 1).append(". ").append(sUrl);
              logger.debug(sb.toString());
            }
          }
        } catch (MalformedURLException e) {
          // e.printStackTrace();


          // Can't create an entry with a bad URL
          logger.debug("Bad url: [" + loc + "]");
        }
      }
    }
    sitemap.setProcessed(true);
    return sitemap;
  }


  /**
   * Parse XML that contains a Sitemap Index. Example Sitemap Index:
   * 
   * <?xml version="1.0" encoding="UTF-8"?> <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> <sitemap>
   * <loc>http://www.example.com/sitemap1.xml.gz</loc> <lastmod>2004-10-01T18:23:17+00:00</lastmod> </sitemap> <sitemap>
   * <loc>http://www.example.com/sitemap2.xml.gz</loc> <lastmod>2005-01-01</lastmod> </sitemap> </sitemapindex>
   * 
   * @param url - URL of Sitemap Index
   * @param nodeList
   */
  private SiteMapIndex parseSitemapIndex(URL url, NodeList nodeList) {


    logger.debug("Parsing Sitemap Index");


    SiteMapIndex sitemapIndex = new SiteMapIndex(url);
    sitemapIndex.setType(SitemapType.INDEX);


    // Loop through the <sitemap>s
    for (int i = 0; i < nodeList.getLength() && i < MAX_URLS; i++) {


      Node firstNode = nodeList.item(i);


      URL sitemapUrl = null;
      Date lastModified = null;


      if (firstNode.getNodeType() == Node.ELEMENT_NODE) {
        Element elem = (Element) firstNode;
        String loc = getElementValue(elem, "loc");


        // try the text content when no loc element
        // has been specified
        if (loc == null) {
          loc = elem.getTextContent().trim();
        }


        try {
          sitemapUrl = new URL(loc);
          String lastmod = getElementValue(elem, "lastmod");
          lastModified = SiteMap.convertToDate(lastmod);


          // Right now we are not worried about sitemapUrls that point
          // to different websites.


          SiteMap s = new SiteMap(sitemapUrl, lastModified);
          sitemapIndex.addSitemap(s);
          if (logger.isDebugEnabled()) {
            StringBuffer sb = new StringBuffer("  ");
            sb.append(i + 1).append(". ").append(s);
            logger.debug(sb.toString());
          }
        } catch (MalformedURLException e) {
          // e.printStackTrace();


          // Don't create an entry for a bad URL
          logger.debug("Bad url: [" + loc + "]");
        }
      }
    }
    sitemapIndex.setProcessed(true);
    return sitemapIndex;
  }


  /**
   * Parse the XML document, looking for "feed" element to determine if it's an Atom doc and "rss" to determine if it's
   * an RSS doc.
   * 
   * @param sitemapUrl
   * @param doc - XML document to parse
   * @throws UnknownFormatException if XML does not appear to be Arom or RSS
   */
  private SiteMap parseSyndicationFormat(URL sitemapUrl, Document doc) throws UnknownFormatException {


    SiteMap sitemap = new SiteMap(sitemapUrl);


    // See if this is an Atom feed by looking for "feed" element
    NodeList list = doc.getElementsByTagName("feed");
    if (list.getLength() > 0) {
      parseAtom(sitemap, (Element) list.item(0), doc);
      sitemap.setProcessed(true);
      return sitemap;
    } else {
      // See if RSS feed by looking for "rss" element
      list = doc.getElementsByTagName("rss");
      if (list.getLength() > 0) {
        parseRSS(sitemap, doc);
        sitemap.setProcessed(true);
        return sitemap;
      } else {
        throw new UnknownFormatException("Unknown syndication format at " + sitemapUrl);
      }
    }
  }


  /**
   * Parse the XML document which is assumed to be in Atom format. Atom 1.0 example:
   * 
   * <?xml version="1.0" encoding="utf-8"?> <feed xmlns="http://www.w3.org/2005/Atom">
   * 
   * <title>Example Feed</title> <subtitle>A subtitle.</subtitle> <link href="http://example.org/feed/" rel="self"/>
   * <link href="http://example.org/"/> <modified>2003-12-13T18:30:02Z</modified> <author> <name>John Doe</name>
   * <email>johndoe@example.com</email> </author> <id>urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6</id>
   * 
   * <entry> <title>Atom-Powered Robots Run Amok</title> <link href="http://example.org/2003/12/13/atom03"/>
   * <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id> <updated>2003-12-13T18:30:02Z</updated> <summary>Some
   * text.</summary> </entry>
   * 
   * </feed>
   * 
   * @param elem
   * @param doc
   */
  private void parseAtom(SiteMap sitemap, Element elem, Document doc) {


    // Grab items from <feed><entry><link href="URL" /></entry></feed>
    // Use lastmod date from <feed><modified>DATE</modified></feed>


    logger.debug("Parsing Atom XML");


    sitemap.setType(SitemapType.ATOM);


    String lastMod = getElementValue(elem, "modified");
    logger.debug("lastMod=" + lastMod);


    NodeList list = doc.getElementsByTagName("entry");


    // Loop through the <entry>s
    for (int i = 0; i < list.getLength() && i < MAX_URLS; i++) {


      Node n = list.item(i);


      if (n.getNodeType() == Node.ELEMENT_NODE) {
        elem = (Element) n;


        String href = getElementAttributeValue(elem, "link", "href");
        logger.debug("href=" + href);


        URL url = null;
        try {
          url = new URL(href);
          boolean valid = urlIsLegal(sitemap.getBaseUrl(), url.toString());


          if (valid || !strict) {
            SiteMapURL sUrl = new SiteMapURL(url.toString(), lastMod, null, null, valid);
            sitemap.addSiteMapUrl(sUrl);
            if (logger.isDebugEnabled()) {
              StringBuffer sb = new StringBuffer("  ");
              sb.append(i + 1).append(". ").append(sUrl);
              logger.debug(sb.toString());
            }
          }
        } catch (MalformedURLException e) {
          // Can't create an entry with a bad URL
          logger.debug("Bad url: [" + href + "]");
        }


      }
    }
  }


  /**
   * Parse XML document which is assumed to be in RSS format. RSS 2.0 example:
   * 
   * <?xml version="1.0"?> <rss version="2.0"> <channel> <title>Lift Off News</title>
   * <link>http://liftoff.msfc.nasa.gov/</link> <description>Liftoff to Space Exploration.</description>
   * <language>en-us</language> <pubDate>Tue, 10 Jun 2003 04:00:00 GMT</pubDate> <lastBuildDate>Tue, 10 Jun 2003
   * 09:41:01 GMT</lastBuildDate> <docs>http://blogs.law.harvard.edu/tech/rss</docs> <generator>Weblog Editor
   * 2.0</generator> <managingEditor>editor@example.com</managingEditor> <webMaster>webmaster@example.com</webMaster>
   * <ttl>5</ttl>
   * 
   * <item> <title>Star City</title> <link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link>
   * <description>How do Americans get ready to work with Russians aboard the International Space Station? They take a
   * crash course in culture, language and protocol at Russia's Star City.</description> <pubDate>Tue, 03 Jun 2003
   * 09:39:21 GMT</pubDate> <guid>http://liftoff.msfc.nasa.gov/2003/06/03.html#item573</guid> </item>
   * 
   * <item> <title>Space Exploration</title> <link>http://liftoff.msfc.nasa.gov/</link> <description>Sky watchers in
   * Europe, Asia, and parts of Alaska and Canada will experience a partial eclipse of the Sun on Saturday, May
   * 31.</description> <pubDate>Fri, 30 May 2003 11:06:42 GMT</pubDate>
   * <guid>http://liftoff.msfc.nasa.gov/2003/05/30.html#item572</guid> </item>
   * 
   * </channel> </rss>
   * 
   * @param sitemap
   * @param doc
   */
  private void parseRSS(SiteMap sitemap, Document doc) {


    // Grab items from <item><link>URL</link></item>
    // and last modified date from <pubDate>DATE</pubDate>


    logger.debug("Parsing RSS doc");
    sitemap.setType(SitemapType.RSS);
    NodeList list = doc.getElementsByTagName("channel");
    Element elem = (Element) list.item(0);


    // Treat publication date as last mod (Tue, 10 Jun 2003 04:00:00 GMT)
    String lastMod = getElementValue(elem, "pubDate");


    logger.debug("lastMod=" + lastMod);


    list = doc.getElementsByTagName("item");


    // Loop through the <item>s
    for (int i = 0; i < list.getLength() && i < MAX_URLS; i++) {


      Node n = list.item(i);


      if (n.getNodeType() == Node.ELEMENT_NODE) {
        elem = (Element) n;


        String link = getElementValue(elem, "link");
        logger.debug("link=" + link);


        try {
          URL url = new URL(link);
          boolean valid = urlIsLegal(sitemap.getBaseUrl(), url.toString());


          if (valid || !strict) {
            SiteMapURL sUrl = new SiteMapURL(url.toString(), lastMod, null, null, valid);
            sitemap.addSiteMapUrl(sUrl);
            if (logger.isDebugEnabled()) {
              StringBuffer sb = new StringBuffer("  ");
              sb.append(i + 1).append(". ").append(sUrl);
              logger.debug(sb.toString());
            }
          }
        } catch (MalformedURLException e) {
          // Can't create an entry with a bad URL
          logger.debug("Bad url: [" + link + "]");
        }
      }
    }
  }


  /**
   * Get the element's textual content.
   * 
   * @param elem
   * @param elementName
   * @return
   */
  private String getElementValue(Element elem, String elementName) {


    NodeList list = elem.getElementsByTagName(elementName);
    Element e = (Element) list.item(0);
    if (e != null) {
      NodeList children = e.getChildNodes();
      if (children.item(0) != null) {
        return ((Node) children.item(0)).getNodeValue().trim();
      }
    }


    return null;
  }


  /**
   * Get the element's attribute value.
   * 
   * @param elem
   * @param elementName
   * @param attributeName
   * @return
   */
  private String getElementAttributeValue(Element elem, String elementName, String attributeName) {


    NodeList list = elem.getElementsByTagName(elementName);
    Element e = (Element) list.item(0);
    if (e != null) {
      return e.getAttribute(attributeName);
    }


    return null;
  }


  /**
   * See if testUrl is under sitemapUrl. Only URLs under sitemapUrl are legal. Both URLs are first converted to
   * lowercase before the comparison is made (this could be an issue on web servers that are case sensitive).
   * 
   * @param sitemapUrl
   * @param testUrl
   * @return true if testUrl is under sitemapUrl, false otherwise
   */
  private boolean urlIsLegal(String sitemapBaseUrl, String testUrl) {


    boolean ret = false;


    // Don't try a comparison if the URL is too short to match
    if (sitemapBaseUrl != null && sitemapBaseUrl.length() <= testUrl.length()) {
      String u = testUrl.substring(0, sitemapBaseUrl.length()).toLowerCase();
      ret = sitemapBaseUrl.equals(u);
    }
    if (logger.isTraceEnabled()) {
      StringBuffer sb = new StringBuffer("urlIsLegal: ");
      sb.append(sitemapBaseUrl).append(" <= ").append(testUrl);
      sb.append(" ? ").append(ret);
      logger.trace(sb.toString());
    }


    return ret;
  }


}
// Source code of org.jboss.elasticsearch.river.remote.sitemap.SiteMapParser
//
// Related classes of org.jboss.elasticsearch.river.remote.sitemap.SiteMapParser