Package com.ontometrics.scraper.util

Source Code of com.ontometrics.scraper.util.ScraperUtil

package com.ontometrics.scraper.util;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;

import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.ontometrics.scraper.TagOccurrence;
import com.ontometrics.scraper.extraction.ElementIdentifierType;
import com.ontometrics.scraper.extraction.Field;
import com.ontometrics.scraper.extraction.Link;
import com.ontometrics.scraper.extraction.ScrapedField;

public class ScraperUtil {

  private static final Logger log = LoggerFactory.getLogger(ScraperUtil.class);

  private ScraperUtil() {
  };

  /**
   * Get first occurrence of field value
   *
   * @param fields
   * @param label
   * @return
   */
  public static String getFieldValue(List<Field> fields, String label) {
    return getFieldValue(fields, label, 1);
  }

  /**
   * Occurrence starts at index 1
   *
   * @param fields
   * @param label
   * @param occurrence
   * @return
   */
  public static String getFieldValue(List<Field> fields, String label, int occurrence) {
    int foundOccurrenceCount = 0;
    String foundValue = null;

    if (fields != null && fields.size() > 0) {
      for (Field field : fields) {
        if (field.getLabel().equalsIgnoreCase(label)) {
          foundOccurrenceCount++;
          if (foundOccurrenceCount == occurrence) {
            foundValue = field.getValue();

            // TODO: Need a better method of replacing strange whitespace
            if (foundValue != null && foundValue.contains("\u00CA")) {
              foundValue = foundValue.replace("\u00CA", "");
            }

            break;
          }
        }
      }
    }
    return foundValue;
  }

  public static void setFieldValue(List<Field> fields, String label, String value, int occurrence) {
    int foundOccurrenceCount = 0;
    int foundIndex = -1;

    if (fields != null && fields.size() > 0) {
      for (int i = 0; i < fields.size(); i++) {
        Field field = fields.get(i);
        if (field.getLabel().equalsIgnoreCase(label)) {
          foundOccurrenceCount++;
          if (foundOccurrenceCount == occurrence) {
            foundIndex = i;
            break;
          }
        }
      }

      if (foundIndex != -1) {
        fields.remove(foundIndex);
        Field newField = new ScrapedField(label, value);
        fields.add(foundIndex, newField);
      }
    }
  }

  public static List<Link> extractLinks(String sourceToParse) {
    Source source = new Source(sourceToParse);
    source.fullSequentialParse();
    List<Link> links = new ArrayList<Link>();
    List<Element> as = source.getAllElements(HTMLElementName.A);
    for (Element linkElement : as) {
      links.add(new Link(linkElement.getTextExtractor().toString(), linkElement.getAttributeValue("href")));
    }
    return links;
  }

  public static String extractParameter(String uri, String parameter) {
    String paramDelimiter = (uri.contains("?")) ? "?" : ";";
    String finalUri = uri.substring(uri.indexOf(paramDelimiter) + 1);
    String found = null;
    String[] parameterSets = finalUri.split("&");
    for (String parameterSet : parameterSets) {
      String[] pnv = parameterSet.split("=");
      if (pnv.length == 2) {
        if (pnv[0].equals(parameter)) {
          found = pnv[1];
        }
      }
    }
    log.debug("Extracted value '{}' for '{}'", found, parameter);
    return found;
  }

  public static String extract(String source, String sourceTag, int occurrence) {
    log.debug("extracting occurrence {} of tag: {} from: {}", new Object[] { occurrence, sourceTag, source });
    String tag = sourceTag.startsWith("<") ? sourceTag : "<" + sourceTag;
    tag = (tag.endsWith(">")) ? tag.substring(0, tag.length() - 1) : tag;
    String endTag = "</" + tag.substring(1) + ">";
    log.debug("extracting using tags: {} and {}", tag, endTag);
    String[] tags = source.split(endTag);

    for (int i = 0; i < tags.length; i++) {
      tags[i] = tags[i] + endTag;
    }
    int begin = tags[occurrence].indexOf(tag);
    int length = tags[occurrence].length();
    log.debug("occurrence {} at {} to {}", new Object[] { occurrence, begin, length });
    return tags[occurrence].substring(begin, length);
  }

  public static String extract(String source, TagOccurrence tagOccurrence) {
    log.debug("extracting: {}", tagOccurrence);
    int occurrence = tagOccurrence.getOccurrence();
    String tag = cleanupTag(tagOccurrence.getTag());
    String endTag = "</" + tag.substring(1) + ">";
    String[] tags = source.split(endTag);

    for (int i = 0; i < tags.length; i++) {
      tags[i] = tags[i] + endTag;
    }
    int begin = tags[occurrence].indexOf(tag);
    int length = tags[occurrence].length();
    log.debug("occurrence {} at {} to {}", new Object[] { occurrence, begin, length });
    return tags[occurrence].substring(begin, length);
  }

  public static Element extract(Source source, TagOccurrence tagOccurrence) {
    Element result = null;
    if (tagOccurrence.getElementIdentifierType() == ElementIdentifierType.cssClass) {
      List<Element> elements = source.getAllElementsByClass(tagOccurrence.getIdentifier());
      if(elements != null && !elements.isEmpty())
        result = elements.get(0);
    } else if (tagOccurrence.getElementIdentifierType() == ElementIdentifierType.ID) {
      result = source.getElementById(tagOccurrence.getIdentifier());
    } else {
      List<Element> elements = source.getAllElements(tagOccurrence.getTag());
      if(elements.size() > tagOccurrence.getOccurrence())
        result = elements.get(tagOccurrence.getOccurrence());
    }
    return result;
  }

  public static String extractUsingIdentifier(String html, TagOccurrence tagOccurrence) {
    String result = null;
    Source source = new Source(html);
    source.fullSequentialParse();
    if (tagOccurrence.getElementIdentifierType() == ElementIdentifierType.ID) {
      log.debug("extracting tag by id: {}", tagOccurrence.getIdentifier());
      Element idElement = source.getElementById(tagOccurrence.getIdentifier());
      if (idElement != null) {
        result = idElement.toString();
      } else {
        result = "";
      }
    } else if (tagOccurrence.getElementIdentifierType() == ElementIdentifierType.cssClass) {
      log.debug("extracting: {}", tagOccurrence);
      List<Element> elements = source.getAllElementsByClass(tagOccurrence.getIdentifier());
      if(elements.size() > tagOccurrence.getOccurrence())
        result = elements.get(tagOccurrence.getOccurrence()).toString();
    }
    log.debug("identifier: {}/{} result: {}",
        new Object[] { tagOccurrence.getIdentifier(), tagOccurrence.getElementIdentifierType(), result });
    return result;
  }

  public static String extractTagMatching(String html, TagOccurrence toGet) {
    log.debug("looking for {} in tags: {}", toGet.getMatching(), toGet.getTag());
    String found = null;
    Source source = new Source(html);
    source.fullSequentialParse();
    log.debug("source = {}", source);
    List<Element> elements = source.getAllElements(HTMLElementName.TABLE);
    for (Element element : elements) {
      log.debug("this element = {}", element);
      String elementText = element.getTextExtractor().toString();
      if (elementText.contains(toGet.getMatching())) {
        found = element.toString();
        log.debug("found element text containing matching text, found = {}", found);
        break;
      }
    }
    log.debug("found = {}", found);
    return found;
  }

  private static String cleanupTag(String sourceTag) {
    String tag = sourceTag.startsWith("<") ? sourceTag : "<".concat(sourceTag);
    tag = (tag.endsWith(">")) ? tag.substring(0, tag.length() - 1) : tag;
    return tag;
  }

  public static String extractSessionId(URL url, String sessionIDName) throws IOException {
    String sessionID = null;
    Source source = new Source(url);
    source.fullSequentialParse();
    List<Element> links = source.getAllElements(HTMLElementName.A);
    for (Element link : links) {
      // log.info("link: {}", link.toString());
      String href = link.getAttributeValue("href");
      if (href != null && href.contains(sessionIDName)) {
        sessionID = extractParameter(href, sessionIDName);
        if (sessionID != null) {
          break;
        }
      }
    }
    return sessionID;
  }

  public static Map<String, String> createFieldMap(List<Field> fields) {
    Map<String, String> fieldMap = new HashMap<String, String>();
    for (Field field : fields) {
      fieldMap.put(field.getLabel(), field.getValue());
    }
    return fieldMap;
  }

  public static URL getBaseUrl(URL nextUrl) {
    String baseUrlString = nextUrl.getProtocol() + "://" + nextUrl.getHost();

    String path = nextUrl.getPath();
    if (!StringUtils.isEmpty(path)) {
      int indexOfLastSlash = path.lastIndexOf('/');
      if (indexOfLastSlash != -1) {
        baseUrlString += path.substring(0, indexOfLastSlash);
      }
    }
    if (!baseUrlString.endsWith("/")) {
      baseUrlString += "/";
    }

    URL result = null;
    try {
      result = new URL(baseUrlString);
    } catch (MalformedURLException e) {
      log.error("malformed base url", e);
    }
    return result;
  }

  public static String safeReplaceSemicolonsWithNewLinesWithTrim(String text) {
    if (text == null) {
      return null;
    }

    String[] splitString = text.split(";");

    StringBuilder stringBuilder = new StringBuilder();
    for (int i = 0; i < splitString.length; i++) {
      stringBuilder.append(splitString[i].trim());
      if (i != splitString.length - 1) {
        stringBuilder.append(System.getProperty("line.separator"));
      }
    }

    return stringBuilder.toString();
  }

  public static boolean isAbsoluteURLString(String urlString) {
    String findResult = getRegexFirstGroupCaseInsensitive("^http[a-z]*:", urlString);
    if (findResult != null) {
      return true;
    }
    return false;
  }

  public static String getBaseUrlString(URL url) {
    String result = MessageFormat.format("{0}://{1}", url.getProtocol(), url.getHost());
    return result;
  }

  public static String getRegexFirstGroupCaseInsensitive(String regex, String string) {
    String result = null;

    Pattern p = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
    Matcher m = p.matcher(string);
    if (m.find()) {
      result = m.group(0);
    }

    return result;
  }
}
TOP

Related Classes of com.ontometrics.scraper.util.ScraperUtil

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.