Package net.htmlparser.jericho

Examples of net.htmlparser.jericho.Source


   * @throws IOException
   */
  public List<URL> getLinks() throws IOException {
    List<URL> links = new ArrayList<URL>();

    Source source = new Source(url);
    source.fullSequentialParse();
    List<Element> linkElements = source.getAllElements(HTMLElementName.A);
    for (Element linkElement : linkElements) {
      String href = linkElement.getAttributeValue("href");
      if (href == null) {
        continue;
      }
View Full Code Here


   *         various manipulators
   * @throws IOException
   */
  public String execute() throws IOException {
    String result = "";
    Source source = new Source(url);
    source.fullSequentialParse();
    // log.debug("parsed source: {}", source.toString());

    if (idToGet != null) {
      result = source.getElementById(idToGet).getTextExtractor().toString();
    } else if (classToGet != null) {
      List<Element> classElements = source.getAllElementsByClass(classToGet);
      if (occurrence > 0) {
        result = classElements.get(occurrence).getTextExtractor().toString();
      } else {
        for (Element element : classElements) {
          result += element.getTextExtractor().toString();
        }
        log.debug("extracted: {} from class: {}", result, classToGet);
      }
    } else {
      if (outputFormat == OutputFormats.Text) {
        result = source.getTextExtractor().toString();
      } else if (outputFormat == OutputFormats.Html) {
        result = source.toString();
      }
      for (TagOccurrence toGet : tagsToGet) {
        if (toGet.getMatching() != null) {
          result = extractTagMatching(source.toString(), toGet);
        } else {
          result = extractTagText(source.toString(), toGet);
        }
      }

    }

View Full Code Here

   * @throws IOException
   */
  public List<String> getResults() throws IOException {
    log.debug("extracting results from url: {}", url);
    List<String> results = new ArrayList<String>();
    Source source = new Source(url);
    source.fullSequentialParse();
    String content = source.toString();
    List<Element> currentElements = null;
    for (TagOccurrence toGet : tagsToGet) {
      log.debug("toGet = {}", toGet);
      if (toGet.getOccurrence() > 0) {
        content = extractTagText(content, toGet);
      } else if (toGet.getMatching() != null) {
        content = extractTagMatching(source.toString(), toGet);
      } else if (toGet.getTag().equals("a")) {
        source = new Source(content);
        currentElements = source.getAllElements(HTMLElementName.A);
        for (Element element : currentElements) {
          String href = element.getAttributeValue("href");
          if (href != null) {
            if (matchingPattern == null || href.contains(matchingPattern)) {
              results.add(href);
View Full Code Here

    String endAfterTag = "</" + afterTagOccurrence.getTag() + ">";
    for (int i = 0; i < afterTagOccurrence.getOccurrence(); i++) {
      sourceHtml = sourceHtml.substring(sourceHtml.indexOf(endAfterTag) + 1);
    }
    String afterSource = sourceHtml;
    Source newSource = new Source(afterSource);
    newSource.fullSequentialParse();
    return newSource;
  }
View Full Code Here

    if (isUsingDefaultFieldExtractor()) {
      extractedFields.addAll(defaultFieldExtractions());
    }

    if (this.classToGet != null) {
      Source source = new Source(url);
      source.fullSequentialParse();
      List<Element> elements = source.getAllElementsByClass(classToGet);
      String text = elements.get(0).toString();
      String[] fields = text.split("<br>");
      log.debug("fields: {}", fields);
      for (String field : fields) {
        Source fieldSource = new Source(field);
        field = fieldSource.getTextExtractor().toString();
        String[] fieldParts = field.split(":");
        log.debug("{} : {}", fieldParts[0], fieldParts[1]);
        extractedFields.add(new ScrapedField(fieldParts[0], fieldParts[1]));
      }
    }

    Source source = new Source(url);
    for (TagOccurrence tagOccurrence : this.tagsToGet) {
      // log.debug("extracting fields using tag: {}", tagOccurrence);
      source.fullSequentialParse();
      if (!(tagOccurrence.getTag().contains(HTMLElementName.TABLE) || tagOccurrence.getTag().contains(
          HTMLElementName.A))) {
        throw new IllegalStateException(MessageFormat.format(
            "Asked to extract tag: {0}, only know how to extract fields from tables.",
            tagOccurrence.getTag()));
      } else {

        if (isAttemptingToMatchSpecificTable(tagOccurrence)) {
          source = new Source(extractTagText(source.toString(), tagOccurrence));
          extractedFields.addAll(extractFieldsFromTable(source.toString()));
        } else if (tagOccurrence.getTag().equals(HTMLElementName.TABLE)) {
          extractedFields.addAll(extractFieldsFromTable(source.toString()));
        } else {
          extractedFields = extractLinksFromList(source.toString());
        }

      }
    }
    source = new Source(url);
    source.fullSequentialParse();
    if (this.afterTagOccurrence != null) {
      source = pruneFrom(source, afterTagOccurrence);
    }
    for (FieldToGet fieldToGet : fieldsToGet) {
      String value = "";
      if (fieldToGet.getSearchType() == FieldSearchType.Tag) {
        value = source.getAllElements(fieldToGet.getLabel()).get(0).getTextExtractor().toString();
      }
      extractedFields.add(new ScrapedField(fieldToGet.getFieldname(), value));
    }
    for (PairedTags tagPair : this.fieldPairs) {
      List<Element> labels = source.getAllElements(tagPair.getLabelTag());
      List<Element> fields = source.getAllElements(tagPair.getFieldTag());

      removeInvalidFields(fields);

      int fieldCount = Math.min(labels.size(), fields.size());
      for (int i = 0; i < fieldCount; i++) {
View Full Code Here

  }

  private List<Field> extractLinksFromList(String html) {
    log.debug("extracting links...");
    List<Field> fields = new ArrayList<Field>();
    Source source = new Source(html);
    source.fullSequentialParse();
    List<Element> links = source.getAllElements(HTMLElementName.A);
    for (Element a : links) {
      String label = a.getTextExtractor().toString();
      String href = a.getAttributeValue("href");
      if (matchingPattern == null || href.contains(matchingPattern)) {
        fields.add(new Link(label, href));
View Full Code Here

    this.useDefaultFieldExtractor = useDefaultExtractor;
    return this;
  }

  private boolean fieldHasMultipleValues(String fieldValue) {
    Source source = new Source(fieldValue);
    source.fullSequentialParse();
    return source.getAllElements(HTMLElementName.BR).size() > 1;
  }
View Full Code Here

        return this;
    }

    private List<Link> extractLinks() {
    List<Link> links = new ArrayList<Link>();
        Source source = getSource();
    List<Element> as = source.getAllElements(HTMLElementName.A);
    for (Element linkElement : as) {
      if(styleClass != null && !styleClass.isEmpty()) {
        String classValue = linkElement.getAttributeValue("class");
        if(classValue == null || !classValue.contains(styleClass))
          continue;
View Full Code Here

     * @throws IOException if i/o operation(s) fails
     */
    private void fetchSourceFromUrl() throws IOException {
        InputStream is = null;
        try {
            source = new Source(is = urlContentProvider.getContent(url));
        } finally {
            IOUtils.closeQuietly(is);
        }
    }
View Full Code Here

  private Logger log = LoggerFactory.getLogger(SimpleSourceExtractor.class);

  @Override
  public Source getSource(URL url) {
    Source source = null;
    try {
      source = new Source(url);
    } catch (IOException e) {
      log.error("Error extracting source: {}", e.toString());
    }
    return source;
  }
View Full Code Here

TOP

Related Classes of net.htmlparser.jericho.Source

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact: coftware#gmail.com.