Package org.apache.nutch.metadata

Examples of org.apache.nutch.metadata.Metadata


   
    private ParseData data = null;
   
    public EmptyParseImpl(ParseStatus status, Configuration conf) {
      data = new ParseData(status, "", new Outlink[0],
                           new Metadata(), new Metadata());
      data.setConf(conf);
    }
View Full Code Here


                             "Can't be handled as Microsoft document. " + e)
                             .getEmptyParse(this.conf);
    }
   
    // collect meta data
    Metadata metadata = new Metadata();
    if (properties != null) {
      title = properties.getProperty(DublinCore.TITLE);
      properties.remove(DublinCore.TITLE);
      metadata.setAll(properties);
    }

    if (text == null) { text = ""; }
    if (title == null) { title = ""; }
View Full Code Here

    }

    String file = args[0];
    byte[] raw = getRawBytes(new File(file));

    Metadata meta = new Metadata();
    meta.set(Response.CONTENT_LENGTH, "" + raw.length);
    Content content = new Content(file, file, raw, mime, meta,
                                  NutchConfiguration.create());

    System.out.println(parser.getParse(content).getText());
  }
View Full Code Here

    // raw bytes
    byte[] bytes = bean.getContent(details);

    // pass all original headers? only these for now.
    Metadata metadata = bean.getParseData(details).getContentMeta();
    String contentType = metadata.get(Response.CONTENT_TYPE);
    //String lastModified = metadata.get(Metadata.LAST_MODIFIED);
    //String contentLength = metadata.get(Metadata.CONTENT_LENGTH);
    // better use this, since it may have been truncated during fetch
    // or give warning if they don't match?
    int contentLength = bytes.length;
View Full Code Here

   
    // Trying to find the document's rel-tags
    Parser parser = new Parser(doc);
    Set tags = parser.getRelTags();
    Iterator iter = tags.iterator();
    Metadata metadata = parse.getData().getParseMeta();
    while (iter.hasNext()) {
      metadata.add(REL_TAG, (String) iter.next());
    }
    return parse;
  }
View Full Code Here

        int i = fname.lastIndexOf('.');
        if (i != -1) {
          // Trying to resolve the Mime-Type
          String contentType = MIME.getMimeType(fname).getName();
          try {
            Metadata metadata = new Metadata();
            metadata.set(Response.CONTENT_LENGTH, Long.toString(entry.getSize()));
            metadata.set(Response.CONTENT_TYPE, contentType);
            Content content = new Content(newurl, base, b, contentType, metadata, this.conf);
            Parse parse = new ParseUtil(this.conf).parse(content);
            ParseData theParseData = parse.getData();
            Outlink[] theOutlinks = theParseData.getOutlinks();
           
View Full Code Here

    String class2 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);

    IndexingFilters filters = new IndexingFilters(conf);
    filters.filter(new Document(), new ParseImpl("text", new ParseData(
        new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
  }
View Full Code Here

    Outlink[] outlinks = new Outlink[] {
      new Outlink("http://foo.com/", "Foo", conf),
      new Outlink("http://bar.com/", "Bar", conf)
    };

    Metadata metaData = new Metadata();
    metaData.add("Language", "en/us");
    metaData.add("Charset", "UTF-8");

    ParseData r = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
    r.setConf(conf);
                       
    WritableTestUtils.testWritable(r, conf);
View Full Code Here

      outlinks[i] = new Outlink("http://outlink.com/" + i, "Outlink" + i, conf);
    }
    ParseData original = new ParseData(ParseStatus.STATUS_SUCCESS,
                                       "Max Outlinks Title",
                                       outlinks,
                                       new Metadata());
    Configuration conf = NutchConfiguration.create();
    // No Outlinks
    conf.setInt("db.max.outlinks.per.page", 0);
    ParseData data = (ParseData) WritableTestUtils.writeRead(original, conf);
    assertEquals(0, data.getOutlinks().length);
View Full Code Here

    parse = new ParseUtil(conf).parseByExtensionId("parse-rtf", content);
    String text = parse.getText();
    assertEquals("The quick brown fox jumps over the lazy dog", text.trim());

    String title = parse.getData().getTitle();
    Metadata meta = parse.getData().getParseMeta();
    assertEquals("test rft document", title);
    assertEquals("tests", meta.get(DublinCore.SUBJECT));



  }
View Full Code Here

TOP

Related Classes of org.apache.nutch.metadata.Metadata

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by Oracle Inc. Contact coftware#gmail.com.