Examples of org.apache.nutch.metadata.Metadata

org.apache.nutch.metadata.Metadata
A multi-valued metadata container.

    
    private ParseData data = null;
    
    public EmptyParseImpl(ParseStatus status, Configuration conf) {
      data = new ParseData(status, "", new Outlink[0],
                           new Metadata(), new Metadata());
      data.setConf(conf);
    }

View Full Code Here

                             "Can't be handled as Microsoft document. " + e)
                             .getEmptyParse(this.conf);
    }
    
    // collect meta data
    Metadata metadata = new Metadata();
    if (properties != null) {
      title = properties.getProperty(DublinCore.TITLE);
      properties.remove(DublinCore.TITLE);
      metadata.setAll(properties);
    }


    if (text == null) { text = ""; }
    if (title == null) { title = ""; }

View Full Code Here

    }


    String file = args[0];
    byte[] raw = getRawBytes(new File(file));


    Metadata meta = new Metadata();
    meta.set(Response.CONTENT_LENGTH, "" + raw.length);
    Content content = new Content(file, file, raw, mime, meta,
                                  NutchConfiguration.create());


    System.out.println(parser.getParse(content).getText());
  }

View Full Code Here


    // raw bytes
    byte[] bytes = bean.getContent(details);


    // pass all original headers? only these for now.
    Metadata metadata = bean.getParseData(details).getContentMeta();
    String contentType = metadata.get(Response.CONTENT_TYPE);
    //String lastModified = metadata.get(Metadata.LAST_MODIFIED);
    //String contentLength = metadata.get(Metadata.CONTENT_LENGTH);
    // better use this, since it may have been truncated during fetch
    // or give warning if they don't match?
    int contentLength = bytes.length;

View Full Code Here

    
    // Trying to find the document's rel-tags
    Parser parser = new Parser(doc);
    Set tags = parser.getRelTags();
    Iterator iter = tags.iterator();
    Metadata metadata = parse.getData().getParseMeta();
    while (iter.hasNext()) {
      metadata.add(REL_TAG, (String) iter.next());
    }
    return parse;
  }

View Full Code Here

        int i = fname.lastIndexOf('.');
        if (i != -1) {
          // Trying to resolve the Mime-Type
          String contentType = MIME.getMimeType(fname).getName();
          try {
            Metadata metadata = new Metadata();
            metadata.set(Response.CONTENT_LENGTH, Long.toString(entry.getSize()));
            metadata.set(Response.CONTENT_TYPE, contentType);
            Content content = new Content(newurl, base, b, contentType, metadata, this.conf);
            Parse parse = new ParseUtil(this.conf).parse(content);
            ParseData theParseData = parse.getData();
            Outlink[] theOutlinks = theParseData.getOutlinks();

View Full Code Here

    String class2 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);


    IndexingFilters filters = new IndexingFilters(conf);
    filters.filter(new Document(), new ParseImpl("text", new ParseData(
        new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
  }

View Full Code Here

    Outlink[] outlinks = new Outlink[] {
      new Outlink("http://foo.com/", "Foo", conf),
      new Outlink("http://bar.com/", "Bar", conf)
    };


    Metadata metaData = new Metadata();
    metaData.add("Language", "en/us");
    metaData.add("Charset", "UTF-8");


    ParseData r = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
    r.setConf(conf);
                        
    WritableTestUtils.testWritable(r, conf);

View Full Code Here

      outlinks[i] = new Outlink("http://outlink.com/" + i, "Outlink" + i, conf);
    }
    ParseData original = new ParseData(ParseStatus.STATUS_SUCCESS,
                                       "Max Outlinks Title",
                                       outlinks,
                                       new Metadata());
    Configuration conf = NutchConfiguration.create();
    // No Outlinks
    conf.setInt("db.max.outlinks.per.page", 0);
    ParseData data = (ParseData) WritableTestUtils.writeRead(original, conf);
    assertEquals(0, data.getOutlinks().length);

View Full Code Here

    parse = new ParseUtil(conf).parseByExtensionId("parse-rtf", content);
    String text = parse.getText();
    assertEquals("The quick brown fox jumps over the lazy dog", text.trim());


    String title = parse.getData().getTitle();
    Metadata meta = parse.getData().getParseMeta();
    assertEquals("test rft document", title);
    assertEquals("tests", meta.get(DublinCore.SUBJECT));






  }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.nutch.metadata.Metadata

com.atlantbh.nutch.filter.index.omit.OmitIndexingFilter

com.atlantbh.nutch.filter.xpath.XPathHtmlParserFilter

com.atlantbh.nutch.filter.xpath.XPathHtmlParserFilterTest

com.atlantbh.nutch.filter.xpath.XPathIndexingFilter

com.atlantbh.nutch.filter.xpath.XPathIndexingFilterTest

com.atlantbh.nutch.index.alternativedataflow.flow.CsvDataFlow

com.flaptor.hounder.crawler.Nutch9Fetcher$NutchSegment$SegmentIterator

org.apache.nutch.analysis.lang.TestHTMLLanguageParser

org.apache.nutch.fetcher.Fetcher$FetcherThread

org.apache.nutch.fetcher.Fetcher2$FetcherThread

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact: coftware#gmail.com.