Package org.jboss.elasticsearch.river.remote

Source Code of org.jboss.elasticsearch.river.remote.GetSitemapHtmlClient

/*
* JBoss, Home of Professional Open Source
* Copyright 2014 Red Hat Inc. and/or its affiliates and other contributors
* as indicated by the @authors tag. All rights reserved.
*/
package org.jboss.elasticsearch.river.remote;

import java.io.ByteArrayInputStream;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.settings.SettingsException;
import org.elasticsearch.common.xcontent.support.XContentMapValues;
import org.jboss.elasticsearch.river.remote.exception.RemoteDocumentNotFoundException;
import org.jboss.elasticsearch.river.remote.sitemap.AbstractSiteMap;
import org.jboss.elasticsearch.river.remote.sitemap.SiteMap;
import org.jboss.elasticsearch.river.remote.sitemap.SiteMapParser;
import org.jboss.elasticsearch.river.remote.sitemap.SiteMapURL;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;

/**
* Class used to HTTP GET data from <a href="http://www.sitemaps.org">sitemap</a> and then download and process HTML for
* them.
* <p>
* Document structure returned from {@link #getChangedDocuments(String, int, Date)} contains fields defined in
* <code>DOC_FIELD_xx</code> constants.
*
* @author Vlastimil Elias (velias at redhat dot com)
*/
public class GetSitemapHtmlClient extends HttpRemoteSystemClientBase {

  protected static final String CFG_HM_STRIP_HTML = "stripHtml";
  protected static final String CFG_HM_CSS_SELECTOR = "cssSelector";
  protected static final String CFG_HM_VALUE_ATTRIBUTE = "valueAttribute";
  protected static final String CFG_URL_GET_SITEMAP = "urlGetSitemap";
  protected static final String CFG_HTML_MAPPING = "htmlMapping";

  private static final ESLogger logger = Loggers.getLogger(GetSitemapHtmlClient.class);

  public static final String DOC_FIELD_ID = "id";
  public static final String DOC_FIELD_URL = "url";
  public static final String DOC_FIELD_LAST_MODIFIED = "last_modified";
  public static final String DOC_FIELD_PRIORITY = "priority";

  /**
   * Extensions of sitemap provided URLs which are always ignored, so no detail document is downloaded for them
   */
  private static final Set<String> IGNORED_EXTENSIONS = new HashSet<>();
  static {
    IGNORED_EXTENSIONS.add("txt");
    IGNORED_EXTENSIONS.add("jpg");
    IGNORED_EXTENSIONS.add("jpeg");
    IGNORED_EXTENSIONS.add("tiff");
    IGNORED_EXTENSIONS.add("gif");
    IGNORED_EXTENSIONS.add("json");
    IGNORED_EXTENSIONS.add("otf");
    IGNORED_EXTENSIONS.add("eot");
    IGNORED_EXTENSIONS.add("svg");
    IGNORED_EXTENSIONS.add("ttf");
    IGNORED_EXTENSIONS.add("woff");
    IGNORED_EXTENSIONS.add("gz");
    IGNORED_EXTENSIONS.add("zip");
    IGNORED_EXTENSIONS.add("exe");
    IGNORED_EXTENSIONS.add("rar");
  }

  protected String urlGetSitemap;

  protected Map<String, Map<String, Object>> htmlMapping;

  protected SiteMapParser sitemapParser = new SiteMapParser();

  @SuppressWarnings("unchecked")
  @Override
  public void init(Map<String, Object> config, boolean spaceListLoadingEnabled, IPwdLoader pwdLoader) {
    urlGetSitemap = getUrlFromConfig(config, CFG_URL_GET_SITEMAP, true);

    try {
      htmlMapping = (Map<String, Map<String, Object>>) config.get(CFG_HTML_MAPPING);
    } catch (ClassCastException e) {
      throw new SettingsException("'remote/" + CFG_HTML_MAPPING + "' configuration section is invalid");
    }

    if (spaceListLoadingEnabled) {
      throw new SettingsException(
          "Dynamic Spaces obtaining is not supported, use 'remote/spacesIndexed' to configure one space or static list");
    }

    String remoteUsername = initHttpClient(logger, config, pwdLoader, urlGetSitemap);

    logger.info("Configured sitemap.xml HTML client for URL '{}', remote system user '{}'.", urlGetSitemap,
        remoteUsername != null ? remoteUsername : "Anonymous access");
  }

  @Override
  public List<String> getAllSpaces() throws Exception {
    throw new UnsupportedOperationException(
        "Dynamic Spaces obtaining is not supported, use 'remote/spacesIndexed' to configure one space or static list");
  }

  @Override
  public ChangedDocumentsResults getChangedDocuments(String spaceKey, int startAt, Date updatedAfter) throws Exception {
    HttpResponseContent responseData = performHttpGetCall(urlGetSitemap, null);

    logger.debug("HTTP GET sitemap response data: {}", responseData);

    List<Map<String, Object>> documents = processSitemap(responseData, urlGetSitemap);

    return new ChangedDocumentsResults(documents, 0, documents.size());
  }

  protected List<Map<String, Object>> processSitemap(HttpResponseContent response, String url) throws Exception {
    AbstractSiteMap asm = sitemapParser.parseSiteMap(response.contentType, response.content, new URL(url));

    if (asm.isIndex()) {
      throw new Exception("Sitemap index format is not supported by this river!");
    }

    SiteMap sm = (SiteMap) asm;

    List<Map<String, Object>> documents = new ArrayList<Map<String, Object>>();
    for (SiteMapURL smurl : sm.getSiteMapUrls()) {

      String durl = smurl.getUrl().toExternalForm();
      String ext = Utils.getFileExtensionLowercase(durl);
      if (ext == null || !IGNORED_EXTENSIONS.contains(ext)) {
        Map<String, Object> document = new HashMap<String, Object>();
        document.put(DOC_FIELD_ID, createIdFromUrl(durl));
        document.put(DOC_FIELD_URL, durl);
        document.put(DOC_FIELD_LAST_MODIFIED, DateTimeUtils.formatISODateTime(smurl.getLastModified()));
        document.put(DOC_FIELD_PRIORITY, new Double(smurl.getPriority()));
        documents.add(document);
      } else {
        logger.debug("Ignored URL as it contains ignored file extension: " + durl);
      }
    }
    return documents;
  }

  /**
   * Create document id from URL by replacing strange/problematic characters.
   *
   * @param url to crete id from
   * @return id
   */
  protected static String createIdFromUrl(String url) {
    if (url == null)
      return null;
    url = url.replace("://", "_");
    url = url.replace(":", "_");
    url = url.replace(".", "_");
    url = url.replace("=", "_");
    url = url.replace("\\", "_");
    url = url.replace("/", "_");
    url = url.replace("?", "_");
    url = url.replace("&", "_");
    url = url.replace("%", "_");
    url = url.replace("*", "_");
    url = url.replace("$", "_");
    url = url.replace("#", "_");
    url = url.replace("@", "_");
    url = url.replace("+", "_");
    url = url.replace("<", "_");
    url = url.replace(">", "_");
    return url;
  }

  @Override
  public Object getChangedDocumentDetails(String spaceKey, String documentId, Map<String, Object> document)
      throws Exception, RemoteDocumentNotFoundException {
    try {
      String url = (String) document.get(DOC_FIELD_URL);
      if (url == null) {
        return null;
      }

      HttpResponseContent response = performHttpGetCall(url, null);

      if (response.contentType != null && response.contentType.contains("text/html")) {

        try {
          Document doc = Jsoup.parse(new ByteArrayInputStream(response.content), null, url);

          if (htmlMapping == null) {
            return doc.html();
          } else {
            Map<String, String> ret = new HashMap<>();
            for (String dataField : htmlMapping.keySet()) {
              String value = null;
              Map<String, Object> fieldMappingConfig = htmlMapping.get(dataField);
              String cssSelector = Utils.trimToNull((String) fieldMappingConfig.get(CFG_HM_CSS_SELECTOR));
              boolean stripHtml = XContentMapValues.nodeBooleanValue(fieldMappingConfig.get(CFG_HM_STRIP_HTML), false);
              if (cssSelector != null) {
                Elements elements = doc.select(cssSelector);
                if (elements != null && !elements.isEmpty()) {
                  String valueAttribute = Utils.trimToNull((String) fieldMappingConfig.get(CFG_HM_VALUE_ATTRIBUTE));
                  if (valueAttribute != null) {
                    StringBuilder sb = new StringBuilder();
                    for (Element e : elements) {
                      String v = Utils.trimToNull(e.attr(valueAttribute));
                      if (v != null) {
                        if (sb.length() > 0) {
                          sb.append(" ");
                        }
                        sb.append(v);
                      }
                    }
                    value = Utils.trimToNull(sb.toString());
                  } else {
                    if (stripHtml) {
                      value = convertElementsToText(elements);
                    } else {
                      if (elements.size() == 1) {
                        value = elements.html();
                      } else {
                        value = elements.outerHtml();
                      }
                    }
                  }
                }
              } else {
                if (stripHtml) {
                  value = convertNodeToText(doc);
                } else {
                  value = doc.html();
                }
              }
              ret.put(dataField, value);
            }
            return ret;
          }
        } catch (ClassCastException e) {
          throw new SettingsException("'remote/" + CFG_HTML_MAPPING + "' configuration section is invalid");
        } catch (Exception e) {
          throw new RemoteDocumentNotFoundException("HTML document can't be processed: " + e.getMessage(), e);
        }
      } else {
        throw new RemoteDocumentNotFoundException("HTML document can't be processed as it is not html but: "
            + response.contentType);
      }
    } catch (ClientProtocolException e) {
      if (e.getCause() != null)
        throw new RemoteDocumentNotFoundException(e.getCause());
      else
        throw new RemoteDocumentNotFoundException(e);
    } catch (HttpCallException e) {
      if (e.getStatusCode() == HttpStatus.SC_NOT_FOUND) {
        throw new RemoteDocumentNotFoundException(e);
      } else {
        throw e;
      }
    } catch (URISyntaxException e) {
      throw new RemoteDocumentNotFoundException("URL of sitemap is invalid: " + e.getMessage(), e);
    }
  }

  protected static String convertNodeToText(Node node) {
    if (node == null)
      return "";
    StringBuilder buffer = new StringBuilder();
    new NodeTraversor(new ToTextNodeVisitor(buffer)).traverse(node);
    return buffer.toString().trim();
  }

  protected static String convertElementsToText(Elements elements) {
    if (elements == null || elements.isEmpty())
      return "";
    StringBuilder buffer = new StringBuilder();
    NodeTraversor nt = new NodeTraversor(new ToTextNodeVisitor(buffer));
    for (Element element : elements) {
      nt.traverse(element);
    }
    return buffer.toString().trim();
  }

  private static final class ToTextNodeVisitor implements NodeVisitor {
    final StringBuilder buffer;

    ToTextNodeVisitor(StringBuilder buffer) {
      this.buffer = buffer;
    }

    @Override
    public void head(Node node, int depth) {
      if (node instanceof TextNode) {
        TextNode textNode = (TextNode) node;
        String text = textNode.text().replace('\u00A0', ' ').trim(); // non breaking space
        if (!text.isEmpty()) {
          buffer.append(text);
          if (!text.endsWith(" ")) {
            buffer.append(" ");
          }
        }
      }
    }

    @Override
    public void tail(Node node, int depth) {
    }

  }

}
TOP

Related Classes of org.jboss.elasticsearch.river.remote.GetSitemapHtmlClient

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.