Examples of org.apache.nutch.parse.Parse

org.apache.nutch.parse.Parse
The result of parsing a page's raw content. See Parser#getParse(Content).


  public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
    this.doc = doc;


    String heading;
    Parse parse = parseResult.get(content.getUrl());


    for (int i = 0 ; headings != null && i < headings.length ; i++ ) {
      heading = getElement(headings[i]);


      if (heading != null) {
        parse.getData().getParseMeta().set(headings[i], heading.trim());
      }
    }


    return parseResult;
  }

View Full Code Here

  
  @Test
  public void testRelTagParser() throws ProtocolException, ParseException, IOException {
  conf = NutchConfiguration.create();
  conf.set("file.content.limit", "-1");
  Parse parse;
  String urlString = "file:" + sampleDir + fileSeparator + sampleFile;


  File file = new File(sampleDir + fileSeparator + sampleFile);
  byte[] bytes = new byte[(int) file.length()];
  DataInputStream in = new DataInputStream(new FileInputStream(file));

View Full Code Here

  /** Empty teardown hook; the body performs no cleanup. */
  protected void tearDown() {
  }  
  
  public Outlink[] getOutlinks(String[] sampleFiles) throws ProtocolException, ParseException, IOException {
  String urlString;
  Parse parse;
  
  urlString = "file:" + sampleDir + fileSeparator + sampleFiles;
  File file = new File(urlString);
  byte[] bytes = new byte[(int) file.length()];
  DataInputStream dip = new DataInputStream(new FileInputStream(file));
  dip.readFully(bytes);
  dip.close();
    
  WebPage page = new WebPage();
  page.setBaseUrl(new Utf8(urlString));
  page.setContent(ByteBuffer.wrap(bytes));
  MimeUtil mutil = new MimeUtil(conf);
  String mime = mutil.getMimeType(file);
  page.setContentType(new Utf8(mime));
  
  parse = new ParseUtil(conf).parse(urlString, page);
  return parse.getOutlinks();
  }

View Full Code Here

    }


    public void testIt() throws ProtocolException, ParseException, IOException {


  String urlString;
  Parse parse;
  Configuration conf = NutchConfiguration.create();
  MimeUtil mimeutil = new MimeUtil(conf);


  urlString = "file:" + sampleDir + fileSeparator + rtfFile;


  File file = new File(sampleDir + fileSeparator + rtfFile);
  byte[] bytes = new byte[(int) file.length()];
  DataInputStream in = new DataInputStream(new FileInputStream(file));
  in.readFully(bytes);
  in.close();


  WebPage page = new WebPage();
  page.setBaseUrl(new Utf8(urlString));
  page.setContent(ByteBuffer.wrap(bytes));
  String mtype = mimeutil.getMimeType(file);
  page.setContentType(new Utf8(mtype));


  parse = new ParseUtil(conf).parse(urlString, page);


  String title = parse.getTitle();
  String text = parse.getText();
  assertEquals("test rft document", title);
  //assertEquals("The quick brown fox jumps over the lazy dog", text.trim());


  
  // HOW DO WE GET THE PARSE METADATA?

View Full Code Here

      List<Outlink> list = Arrays.asList(old);
      outlinks.addAll(list);
      ParseStatus status = parse.getParseStatus();
      String text = parse.getText();
      Outlink[] newlinks = outlinks.toArray(new Outlink[outlinks.size()]);
      return new Parse(text, title, newlinks, status);
    }
    return parse;
  }

View Full Code Here

      title = script.substring(0, idx);
    } else {
      idx = Math.min(MAX_TITLE_LEN, script.length());
      title = script.substring(0, idx);
    }
    Parse parse =
      new Parse(script, title, outlinks, ParseStatusUtils.STATUS_SUCCESS);
    return parse;
  }

View Full Code Here

      status.setMinorCode(ParseStatusCodes.SUCCESS_REDIRECT);
      status.addToArgs(new Utf8(metaTags.getRefreshHref().toString()));
      status.addToArgs(new Utf8(Integer.toString(metaTags.getRefreshTime())));
    }


    Parse parse = new Parse(text, title, outlinks, status);
    parse = htmlParseFilters.filter(url, page, parse, metaTags, root);


    if (metaTags.getNoCache()) {             // not okay to cache
      page.putToMetadata(new Utf8(Nutch.CACHING_FORBIDDEN_KEY),
          ByteBuffer.wrap(Bytes.toBytes(cachingPolicy)));

View Full Code Here

    parser.setConf(conf);
    WebPage page = new WebPage();
    page.setBaseUrl(new Utf8(url));
    page.setContent(ByteBuffer.wrap(bytes));
    page.setContentType(new Utf8("text/html"));
    Parse parse = parser.getParse(url, page);
    System.out.println("title: "+parse.getTitle());
    System.out.println("text: "+parse.getText());
    System.out.println("outlinks: " + Arrays.toString(parse.getOutlinks()));


  }

View Full Code Here

      status.setMinorCode(ParseStatusCodes.SUCCESS_REDIRECT);
      status.addToArgs(new Utf8(metaTags.getRefreshHref().toString()));
      status.addToArgs(new Utf8(Integer.toString(metaTags.getRefreshTime())));
    }


    Parse parse = new Parse(text, title, outlinks, status);
    parse = htmlParseFilters.filter(url, page, parse, metaTags, root);


    if (metaTags.getNoCache()) { // not okay to cache
      page.putToMetadata(new Utf8(Nutch.CACHING_FORBIDDEN_KEY), ByteBuffer.wrap(Bytes
          .toBytes(cachingPolicy)));

View Full Code Here

    MimeUtil mimeutil = new MimeUtil(conf);
    String mtype = mimeutil.getMimeType(file);
    page.setContentType(new Utf8(mtype));
    // Parse parse = parser.getParse(url, page);


    Parse parse = new ParseUtil(conf).parse(url, page);


    System.out.println("content type: " + mtype);
    System.out.println("title: " + parse.getTitle());
    System.out.println("text: " + parse.getText());
    System.out.println("outlinks: " + Arrays.toString(parse.getOutlinks()));
  }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.nutch.parse.Parse

com.atlantbh.nutch.filter.index.omit.OmitIndexingFilterTest

com.atlantbh.nutch.filter.xpath.XPathHtmlParserFilterTest

com.atlantbh.nutch.filter.xpath.XPathIndexingFilterTest

com.atlantbh.nutch.index.alternativedataflow.AlternativeDataFlowIndexingFilterTest

com.atlantbh.nutch.index.alternativedataflow.flow.CsvDataFlowTest

org.apache.nutch.analysis.lang.HTMLLanguageParser

org.apache.nutch.analysis.lang.LanguageIdentifier

org.apache.nutch.analysis.lang.TestHTMLLanguageParser

org.apache.nutch.indexer.field.BasicFields$Extractor

org.apache.nutch.indexer.IndexerMapReduce

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.