Package org.apache.nutch.parse

Examples of org.apache.nutch.parse.Parse


    try {
      ParseUtil parser = new ParseUtil(NutchConfiguration.create());
      /* loop through the test documents and validate result */
      for (int t = 0; t < docs.length; t++) {
        Content content = getContent(docs[t]);
        Parse parse = parser.parse(content).get(content.getUrl());
        assertEquals(metalanguages[t], (String) parse.getData().getParseMeta().get(Metadata.LANGUAGE));
      }
    } catch (Exception e) {
      e.printStackTrace(System.out);
      fail(e.toString());
    }
View Full Code Here


    doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));

    // add digest, used by dedup
    doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY));

    final Parse parse = new ParseImpl(parseText, parseData);
    try {
      // extract information from dbDatum and pass it to
      // fetchDatum so that indexing filters can use it
      final Text url = (Text) dbDatum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
      if (url != null) {
View Full Code Here

        }
      }

      // get parse metadata
      Metadata metadata = parseData.getContentMeta();
      Parse parse = new ParseImpl(parseText, parseData);

      // handle redirect urls
      Text reprUrlText = (Text)fetchDatum.getMetaData().get(
        Nutch.WRITABLE_REPR_URL_KEY);
      String reprUrl = reprUrlText != null ? reprUrlText.toString() : null;
      String url = key.toString();
      String fieldUrl = (reprUrl != null) ? reprUrl : url;
      String host = URLUtil.getHost(fieldUrl);

      // add segment, used to map from merged index back to segment files
      FieldWritable segField = new FieldWritable(Fields.SEGMENT,
        metadata.get(Nutch.SEGMENT_NAME_KEY), FieldType.CONTENT, false, true,
        false);
      fieldsList.add(segField);

      // add digest, used by dedup
      FieldWritable digestField = new FieldWritable(Fields.DIGEST,
        metadata.get(Nutch.SIGNATURE_KEY), FieldType.CONTENT, false, true,
        false);
      fieldsList.add(digestField);

      // url is both stored and indexed, so it's both searchable and returned
      fieldsList.add(new FieldWritable(Fields.URL, fieldUrl, FieldType.CONTENT,
        true, true, true));
      fieldsList.add(new FieldWritable(Fields.SEG_URL, url, FieldType.CONTENT,
        false, true, false));

      if (reprUrl != null) {
        // also store original url as both stored and indexes
        fieldsList.add(new FieldWritable(Fields.ORIG_URL, url,
          FieldType.CONTENT, true, true, true));
      }

      if (host != null) {
        // add host as un-stored, indexed and tokenized
        FieldWritable hostField = new FieldWritable(Fields.HOST, host,
          FieldType.CONTENT, true, false, true);
        fieldsList.add(hostField);

        // add site as un-stored, indexed and un-tokenized
        FieldWritable siteField = new FieldWritable(Fields.SITE, host,
          FieldType.CONTENT, true, false, false);
        fieldsList.add(siteField);
      }

      // content is indexed, so that it's searchable, but not stored in index
      fieldsList.add(new FieldWritable(Fields.CONTENT, parse.getText(),
        FieldType.CONTENT, true, false, true));

      // title
      String title = parse.getData().getTitle();
      if (title.length() > MAX_TITLE_LENGTH) { // truncate title if needed
        title = title.substring(0, MAX_TITLE_LENGTH);
      }
      // add title indexed and stored so that it can be displayed
      fieldsList.add(new FieldWritable(Fields.TITLE, title, FieldType.CONTENT,
        true, true, true));

      // add cached content/summary display policy, if available
      String caching = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN_KEY);
      if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
        fieldsList.add(new FieldWritable(Fields.CACHE, caching,
          FieldType.CONTENT, false, true, false));
      }
View Full Code Here

  public String getTextContent(String fileName) throws ProtocolException, ParseException {
    String urlString = "file:" + sampleDir + fileSeparator + fileName;
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-msword", content).get(content.getUrl());
    return parse.getText();
  }
View Full Code Here

     */
    public void testIt() throws ProtocolException, ParseException {
        String urlString;
        Protocol protocol;
        Content content;
        Parse parse;

        Configuration conf = NutchConfiguration.create();
        for (int i = 0; i < sampleFiles.length; i++) {
            urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

            protocol = new ProtocolFactory(conf).getProtocol(urlString);
            content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
            parse = new ParseUtil(conf).parseByExtensionId("parse-rss", content).get(content.getUrl());

            //check that there are 3 outlinks:
            //http://test.channel.com
            //http://www-scf.usc.edu/~mattmann/
            //http://www.nutch.org

            ParseData theParseData = parse.getData();

            Outlink[] theOutlinks = theParseData.getOutlinks();

            assertTrue("There aren't 3 outlinks read!", theOutlinks.length == 3);

View Full Code Here

        output.collect(key, new NutchWritable(content));

        if (parseResult != null) {
          for (Entry <Text, Parse> entry : parseResult) {
            Text url = entry.getKey();
            Parse parse = entry.getValue();
            ParseStatus parseStatus = parse.getData().getStatus();

            if (!parseStatus.isSuccess()) {
              LOG.warn("Error parsing: " + key + ": " + parseStatus);
              parse = parseStatus.getEmptyParse(getConf());
            }

            // Calculate page signature.
            byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
              content, parse);
            // Ensure segment name and score are in parseData metadata
            parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
              segmentName);
            parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
              StringUtil.toHexString(signature));
            // Pass fetch time to content meta
            parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
              Long.toString(datum.getFetchTime()));
            if (url.equals(key))
              datum.setSignature(signature);
            try {
              scfilters.passScoreAfterParsing(url, content, parse);
            }
            catch (Exception e) {
              if (LOG.isWarnEnabled()) {
                e.printStackTrace(LogUtil.getWarnStream(LOG));
                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
              }
            }
            output.collect(url, new NutchWritable(new ParseImpl(new ParseText(
              parse.getText()), parse.getData(), parse.isCanonical())));
          }
        }
      }
      catch (IOException e) {
        if (LOG.isFatalEnabled()) {
          LOG.fatal("ArcSegmentCreator caught:" + StringUtils.stringifyException(e));
        }
      }

      // return parse status if it exits
      if (parseResult != null && !parseResult.isEmpty()) {
        Parse p = parseResult.get(content.getUrl());
        if (p != null) {
          return p.getData().getStatus();
        }
      }
    }
   
    return null;
View Full Code Here

   * @see #SAMPLE_DIR
   * @throws Exception
   */
  public void testContent() throws Exception {

    Parse parse = new ParseUtil(NutchConfiguration.create())
                        .parseByExtensionId("parse-mspowerpoint", this.content)
                        .get(this.content.getUrl());

    ParseData data = parse.getData();
    String text = parse.getText();

    assertTrue("No content extracted length ==0", text.length() > 0);
   
    this.dumpToFile(this.testFile.getName(), data, text);

View Full Code Here

   * @see #SAMPLE_DIR
   * @throws Exception
   */
  public void testMeta() throws Exception {

    Parse parse = new ParseUtil(NutchConfiguration.create())
                        .parseByExtensionId("parse-mspowerpoint", content)
                        .get(content.getUrl());
   
    ParseData data = parse.getData();

    final FileExtensionFilter titleFilter = new FileExtensionFilter(
        this.testFile.getName() + ".meta");
    final File[] titleFiles = this.sampleDir.listFiles(titleFilter);

View Full Code Here

    doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));

    // add digest, used by dedup
    doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY));

    final Parse parse = new ParseImpl(parseText, parseData);
    try {
      // extract information from dbDatum and pass it to
      // fetchDatum so that indexing filters can use it
      final Text url = (Text) dbDatum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
      if (url != null) {
View Full Code Here

   
    NutchDocument doc = new NutchDocument();
    Text urlText = new Text(url);

    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    try {
      indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
      e.printStackTrace();
    }
View Full Code Here

TOP

Related Classes of org.apache.nutch.parse.Parse

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.