Package net.sf.regain.crawler.preparator.html

Examples of net.sf.regain.crawler.preparator.html.HtmlContentExtractor


      String contentStartRegex = (String) sectionArr[i].get("startRegex");
      String contentEndRegex = (String) sectionArr[i].get("endRegex");
      String headlineRegex = (String) sectionArr[i].get("headlineRegex");
      int headlineRegexGroup = getIntParam(sectionArr[i], "headlineRegex.group");

      mContentExtractorArr[i] = new HtmlContentExtractor(prefix,
        contentStartRegex, contentEndRegex, headlineRegex, headlineRegexGroup);
    }

    // Read the path extractors
    sectionArr = config.getSectionsWithName("pathExtractor");
View Full Code Here


    // Extract the HTML title from the full document content and store it
    // via setTitle().
    String title = extractHtmlTitle(rawDocument.getContentAsString());
    setTitle(title);

    // Find the content extractor that is responsible for this document.
    // The loop does not stop at the first hit: if several extractors accept
    // the document, the LAST accepting entry of mContentExtractorArr is used.
    // contentExtractor stays null when the array is null or nothing accepts.
    HtmlContentExtractor contentExtractor = null;
    if (mContentExtractorArr != null) {
      for (int i = 0; i < mContentExtractorArr.length; i++) {
        if (mContentExtractorArr[i].accepts(rawDocument)) {
          contentExtractor = mContentExtractorArr[i];
        }
      }
    }

    // Cut the content and extract the headlines.
    // isContentCutted records whether the extractor actually changed the
    // content; it is only ever set to true in the else-branch below.
    String cuttedContent;
    String headlines;
    boolean isContentCutted = false;
    if (contentExtractor == null) {
      // There is no HtmlContentExtractor responsible for this document:
      // use the whole content unchanged and provide no headlines.
      if (mLog.isDebugEnabled()) {
        mLog.debug("No HTML content extractor is responsible for " + rawDocument.getUrl());
      }

      cuttedContent = rawDocument.getContentAsString();
      headlines = null;
    } else {
      // Headlines are extracted from the already cut content, not from the
      // full document.
      cuttedContent = contentExtractor.extractContent(rawDocument);
      headlines = contentExtractor.extractHeadlines(cuttedContent);
      // NOTE(review): this equals() call assumes extractContent() never
      // returns null - confirm against HtmlContentExtractor.
      if (!cuttedContent.equals(rawDocument.getContentAsString())) {
        isContentCutted = true;
      }
    }
View Full Code Here

TOP

Related Classes of net.sf.regain.crawler.preparator.html.HtmlContentExtractor

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.