Examples of ParseData

cn.edu.hfut.dmic.webcollector.parser.ParseData
@author hu
edu.uci.ics.crawler4j.parser.ParseData
info.bliki.api.ParseData
wikimedia.org/w/api.php">Wikimedia API
net.nutch.parse.ParseData
Data extracted from a page's content. @see Parse#getData()
org.apache.nutch.parse.ParseData
Data extracted from a page's content. @see Parse#getData()

Examples of org.apache.nutch.parse.ParseData

    reader = new SequenceFile.Reader(fs, parseData, conf);
    
    READ_PARSE_DATA:
      do {
      Text key = new Text();
      ParseData value = new ParseData();
      if(!reader.next(key, value)) break READ_PARSE_DATA;
      // make sure they all contain "nutch.segment.name" and "nutch.content.digest" 
      // keys in parse metadata
      Metadata contentMeta = value.getContentMeta();
      if (contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null 
            && contentMeta.get(Nutch.SIGNATURE_KEY) != null) {
        handledurls.add(key.toString());
      }
    } while(true);

View Full Code Here

Examples of org.apache.nutch.parse.ParseData

    if (metaTags.getRefresh()) {
      status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
      status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
          Integer.toString(metaTags.getRefreshTime()) });
    }
    ParseData parseData = new ParseData(status, title, outlinks, content
        .getMetadata(), nutchMetadata);
    ParseResult parseResult = ParseResult.createParseResult(content
        .getUrl(), new ParseImpl(text, parseData));


    // run filters on parse

View Full Code Here

Examples of org.apache.nutch.parse.ParseData

    conf.setBoolean("anchorIndexingFilter.deduplicate", true);
    AnchorIndexingFilter filter = new AnchorIndexingFilter();
    filter.setConf(conf);
    Assert.assertNotNull(filter);
    NutchDocument doc = new NutchDocument();
    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
    Inlinks inlinks = new Inlinks();
    inlinks.add(new Inlink("http://test1.com/", "text1"));
    inlinks.add(new Inlink("http://test2.com/", "text2"));
    inlinks.add(new Inlink("http://test3.com/", "text2"));
    try {

View Full Code Here

Examples of org.apache.nutch.parse.ParseData

    }


    String feedDesc = stripTags(feed.getDescriptionEx());
    String feedTitle = stripTags(feed.getTitleEx());


    parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(
        new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0],
        content.getMetadata()));


    return parseResult;
  }

View Full Code Here

Examples of org.apache.nutch.parse.ParseData

              conf)).get(link);
    } catch (ParserNotFound e) { /* ignore */
    }


    if (parse != null) {
      ParseData data = parse.getData();
      data.getContentMeta().remove(Response.CONTENT_TYPE);
      mergeMetadata(data.getParseMeta(), parseMeta);
      parseResult.put(link, new ParseText(parse.getText()), new ParseData(
          ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(), data
              .getContentMeta(), data.getParseMeta()));
    } else {
      contentMeta.remove(Response.CONTENT_TYPE);
      parseResult.put(link, new ParseText(text), new ParseData(
          ParseStatus.STATUS_FAILURE, title, new Outlink[0], contentMeta,
          parseMeta));
    }


  }

View Full Code Here

Examples of org.apache.nutch.parse.ParseData

    if (resultTitle == null) {
      resultTitle = "";
    }


    outlinks = (Outlink[]) outLinksList.toArray(new Outlink[0]);
    final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
                                              resultTitle, outlinks,
                                              content.getMetadata());


    if (LOG.isTraceEnabled()) { LOG.trace("Zip file parsed sucessfully !!"); }
    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(resultText, parseData));

View Full Code Here

Examples of org.apache.nutch.parse.ParseData

            Metadata metadata = new Metadata();
            metadata.set(Response.CONTENT_LENGTH, Long.toString(entry.getSize()));
            metadata.set(Response.CONTENT_TYPE, contentType);
            Content content = new Content(newurl, base, b, contentType, metadata, this.conf);
            Parse parse = new ParseUtil(this.conf).parse(content).get(content.getUrl());
            ParseData theParseData = parse.getData();
            Outlink[] theOutlinks = theParseData.getOutlinks();
            
            for(int count = 0; count < theOutlinks.length; count++) {
              outLinksList.add(new Outlink(theOutlinks[count].getToUrl(), theOutlinks[count].getAnchor()));
            }

View Full Code Here

Examples of org.apache.nutch.parse.ParseData


    String title = "The Foo Page";
    Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
    Metadata metaData = new Metadata();
    metaData.add("Language", "en/us");
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
    ParseImpl parse = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", parseData);


    CrawlDatum crawlDatum = new CrawlDatum();
    crawlDatum.setFetchTime(100L);

View Full Code Here

Examples of org.apache.nutch.parse.ParseData

      OutputCollector<Text, MetaWrapper> output, Reporter reporter) throws IOException {
    CrawlDatum lastG = null;
    CrawlDatum lastF = null;
    CrawlDatum lastSig = null;
    Content lastC = null;
    ParseData lastPD = null;
    ParseText lastPT = null;
    String lastGname = null;
    String lastFname = null;
    String lastSigname = null;
    String lastCname = null;

View Full Code Here

Examples of org.apache.nutch.parse.ParseData

                     OutputCollector<Text, NutchIndexAction> output, Reporter reporter)
    throws IOException {
    Inlinks inlinks = null;
    CrawlDatum dbDatum = null;
    CrawlDatum fetchDatum = null;
    ParseData parseData = null;
    ParseText parseText = null;


    while (values.hasNext()) {
      final Writable value = values.next().get(); // unwrap
      if (value instanceof Inlinks) {
        inlinks = (Inlinks)value;
      } else if (value instanceof CrawlDatum) {
        final CrawlDatum datum = (CrawlDatum)value;
        if (CrawlDatum.hasDbStatus(datum)) {
          dbDatum = datum;
        }
        else if (CrawlDatum.hasFetchStatus(datum)) {
          // don't index unmodified (empty) pages
          if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
            fetchDatum = datum;
          }
        } else if (CrawlDatum.STATUS_LINKED == datum.getStatus() ||
                   CrawlDatum.STATUS_SIGNATURE == datum.getStatus() ||
                   CrawlDatum.STATUS_PARSE_META == datum.getStatus()) {
          continue;
        } else {
          throw new RuntimeException("Unexpected status: "+datum.getStatus());
        }
      } else if (value instanceof ParseData) {
        parseData = (ParseData)value;


        // Handle robots meta? https://issues.apache.org/jira/browse/NUTCH-1434
        if (deleteRobotsNoIndex) {
          // Get the robots meta data
          String robotsMeta = parseData.getMeta("robots");


          // Has it a noindex for this url?
          if (robotsMeta != null && robotsMeta.toLowerCase().indexOf("noindex") != -1) {
            // Delete it!
            NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
            output.collect(key, action);
            return;
          }
        }
      } else if (value instanceof ParseText) {
        parseText = (ParseText)value;
      } else if (LOG.isWarnEnabled()) {
        LOG.warn("Unrecognized type: "+value.getClass());
      }
    }
    
    // Whether to delete GONE or REDIRECTS
    if (delete && fetchDatum != null && dbDatum != null) {    
      if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE || dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) {
        reporter.incrCounter("IndexerStatus", "Documents deleted", 1);


        NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
        output.collect(key, action);
        return;
      }
      
      if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM ||
          fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP ||
          dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM ||
          dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
        reporter.incrCounter("IndexerStatus", "Deleted redirects", 1);
        reporter.incrCounter("IndexerStatus", "Perm redirects deleted", 1);


        NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
        output.collect(key, action);
        return;
      }
    }


    if (fetchDatum == null || dbDatum == null
        || parseText == null || parseData == null) {
      return;                                     // only have inlinks
    }


    // Whether to delete pages marked as duplicates
    if (delete && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) {
      reporter.incrCounter("IndexerStatus", "Duplicates deleted", 1);
      NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
      output.collect(key, action);
      return;
    }
    
    // Whether to skip DB_NOTMODIFIED pages
    if (skip && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
      reporter.incrCounter("IndexerStatus", "Skipped", 1);
      return;
    }


    if (!parseData.getStatus().isSuccess() ||
        fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS) {
      return;
    }


    NutchDocument doc = new NutchDocument();
    doc.add("id", key.toString());


    final Metadata metadata = parseData.getContentMeta();


    // add segment, used to map from merged index back to segment files
    doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));


    // add digest, used by dedup

View Full Code Here

0 1 2 3 4 5

TOP

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.