Package net.htmlparser.jericho

Examples of net.htmlparser.jericho.Source


    return result;
  }

  private List<Field> extractFieldsFromDL(String html) {
    List<Field> extractedFields = new ArrayList<Field>();
    Source source = new Source(html);
    source.fullSequentialParse();
    List<Element> labels = source.getAllElements(HTMLElementName.DT);
    List<Element> values = source.getAllElements(HTMLElementName.DD);
    int cellCount = Math.min(labels.size(), values.size());
    for (int i = 0; i < cellCount; i++) {
      String label = labels.get(i).getTextExtractor().toString().trim().replaceAll(":$", "");
      Element valueElement = values.get(i);
      log.debug("looking at value element: {}", valueElement);
View Full Code Here


     * @return list of extracted fields
     */
    private List<Field> extractFieldsFromTagAttributes() {
        if (!tagAttributesToGet.isEmpty()) {
            List<Field> extractedFields = new ArrayList<Field>();
            Source source = getSource();
            for (TagAttributeFieldExtractor extractor : this.tagAttributesToGet) {
                List<Element> elements = source.getAllElements(extractor.getTagName());
                if (elements != null && !elements.isEmpty()) {
                    for (Element element : elements) {
                        Field field = extractor.extractField(element);
                        if (field != null) {
                            extractedFields.add(field);
View Full Code Here

  }

  private List<Field> extractFieldsFromTable(String html) {
    log.debug("extracting fields from table: {}", html);
    List<Field> extractedFields = new ArrayList<Field>();
    Source source = new Source(html);
    source.fullSequentialParse();
    int cellCount = source.getAllElements(HTMLElementName.TD).size();
    int rowCount = source.getAllElements(HTMLElementName.TR).size();
    log.debug("found {} cells in {} rows", cellCount, rowCount);
    if (cellCount == (rowCount * 2)) {
      Field lastField = null;
      log.debug("cells.size: {}", cellCount);
      List<Element> cells = source.getAllElements(HTMLElementName.TD);
      for (int i = 0; i < cellCount; i++) {
        Element labelElement = cells.get(i);
        Element valueElement = cells.get(++i);
        String label = labelElement.getTextExtractor().toString().trim().replaceAll(":$", "");
        String value = getValueFieldText(valueElement);
        log.debug("found field: {}={}", label, value);
        if (StringUtils.isEmpty(label) && lastField != null) {
          lastField.addValue(value);
        } else {
          lastField = new ScrapedField(label, value);
          extractedFields.add(lastField);
        }
      }
    } else {
      List<String> headers = new ArrayList<String>();
      List<Element> rows = source.getAllElements(HTMLElementName.TR);
      for (Element row : rows) {
        List<Element> headerElements = row.getAllElements(HTMLElementName.TH);
        if (headerElements.size() > 0) {
          headers.clear();
        }
View Full Code Here

    return extractedFields;
  }

  private List<Field> extractFieldsFromDL(String html) {
    List<Field> extractedFields = new ArrayList<Field>();
    Source source = new Source(html);
    source.fullSequentialParse();
    List<Element> labels = source.getAllElements(HTMLElementName.DT);
    List<Element> values = source.getAllElements(HTMLElementName.DD);
    int cellCount = Math.min(labels.size(), values.size());
    for (int i = 0; i < cellCount; i++) {
      String label = labels.get(i).getTextExtractor().toString().trim().replaceAll(":$", "");
      Element valueElement = values.get(i);
      log.debug("looking at value element: {}", valueElement);
View Full Code Here

    return extractedFields;
  }

  private List<Field> extractFieldsFromUL(String html) {
    List<Field> extractedFields = new ArrayList<Field>();
    Source source = new Source(html);
    source.fullSequentialParse();
    List<Element> lis = source.getAllElements(HTMLElementName.LI);
    for (Element li : lis) {
      log.debug("looking at li: {} w/text: {}", li, li.getTextExtractor().toString());
      String[] parts = li.getTextExtractor().toString().split(":");
      if (parts.length == 2) {
        Field field = new ScrapedField(parts[0], parts[1]);
View Full Code Here

                                .toString()
                                .replace("<br>", ";")
                                .replace("<br/>", ";")
                                .replace("<br />", ";");
                        log.debug("delimited text: {}", delimitedText);
                        Source newElement = new Source(delimitedText);
                        fieldText = newElement.getTextExtractor().toString();
                        if (fieldText.endsWith(";")) {
                            fieldText = fieldText.substring(0, fieldText.length() - 1).trim();
                        }
                    } else {
                        fieldText = valueText.getTextExtractor().toString();
View Full Code Here

  private boolean isAField(String extract) {
    return !extract.contains(HTMLElementName.TABLE);
  }

  private boolean fieldHasMultipleValues(String fieldValue) {
    Source source = new Source(fieldValue);
    source.fullSequentialParse();
    return source.getAllElements(HTMLElementName.BR).size() > 1;
  }
View Full Code Here

    source.fullSequentialParse();
    return source.getAllElements(HTMLElementName.BR).size() > 1;
  }

  private String delimitFieldValues(String source) {
    Source result = new Source(source.replace("<br>", ";").replace("<br/>", ";"));
    return getValueFieldText(result.getFirstElement());
  }
View Full Code Here

        if (extractedSource==null){
            StringBuffer accumulatedSource = new StringBuffer();
            for (HtmlExtractor extractor : htmlExtractors) {
                accumulatedSource.append(extractor.getSource().toString());
            }
            Source combinedSource = new Source(accumulatedSource);
            combinedSource.fullSequentialParse();
            extractedSource = combinedSource;
        }
        return extractedSource;
  }
View Full Code Here

          .getStatuspage("http://www.ns.nl/actuele-vertrektijden/main.link?station="
              + station);
      if (response.getHttpStatusCode() != 200) {
        return null;
      }
      Source source = new Source(response.getPageContent());

      source.fullSequentialParse();

      List<Train> newTrains = new ArrayList<Train>();
      List<Element> tableRows = source.getAllElements(HTMLElementName.TR);
      for (Element tableRow : tableRows) {
        if (tableRow.getParentElement().getName().equals(
            HTMLElementName.TBODY)) {
          Train train = new Train();
          int index = 0;
View Full Code Here

TOP

Related Classes of net.htmlparser.jericho.Source

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.