Package org.apache.nutch.parse

Examples of org.apache.nutch.parse.Parse


  public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
    this.doc = doc;

    String heading;
    Parse parse = parseResult.get(content.getUrl());

    for (int i = 0 ; headings != null && i < headings.length ; i++ ) {
      heading = getElement(headings[i]);

      if (heading != null) {
        parse.getData().getParseMeta().set(headings[i], heading.trim());
      }
    }

    return parseResult;
  }
View Full Code Here


 
  @Test
  public void testRelTagParser() throws ProtocolException, ParseException, IOException {
  conf = NutchConfiguration.create();
  conf.set("file.content.limit", "-1");
  Parse parse;
  String urlString = "file:" + sampleDir + fileSeparator + sampleFile;

  File file = new File(sampleDir + fileSeparator + sampleFile);
  byte[] bytes = new byte[(int) file.length()];
  DataInputStream in = new DataInputStream(new FileInputStream(file));
View Full Code Here

  /**
   * No-op: the body is empty, so nothing is released or reset here.
   * NOTE(review): presumably the JUnit 3 style tear-down hook of the
   * enclosing test class -- confirm against the class declaration.
   */
  protected void tearDown() {
  } 
 
  public Outlink[] getOutlinks(String[] sampleFiles) throws ProtocolException, ParseException, IOException {
  String urlString;
  Parse parse;
 
  urlString = "file:" + sampleDir + fileSeparator + sampleFiles;
  File file = new File(urlString);
  byte[] bytes = new byte[(int) file.length()];
  DataInputStream dip = new DataInputStream(new FileInputStream(file));
  dip.readFully(bytes);
  dip.close();
   
  WebPage page = new WebPage();
  page.setBaseUrl(new Utf8(urlString));
  page.setContent(ByteBuffer.wrap(bytes));
  MimeUtil mutil = new MimeUtil(conf);
  String mime = mutil.getMimeType(file);
  page.setContentType(new Utf8(mime));
 
  parse = new ParseUtil(conf).parse(urlString, page);
  return parse.getOutlinks();
  }
View Full Code Here

    }

    public void testIt() throws ProtocolException, ParseException, IOException {

  String urlString;
  Parse parse;
  Configuration conf = NutchConfiguration.create();
  MimeUtil mimeutil = new MimeUtil(conf);

  urlString = "file:" + sampleDir + fileSeparator + rtfFile;

  File file = new File(sampleDir + fileSeparator + rtfFile);
  byte[] bytes = new byte[(int) file.length()];
  DataInputStream in = new DataInputStream(new FileInputStream(file));
  in.readFully(bytes);
  in.close();

  WebPage page = new WebPage();
  page.setBaseUrl(new Utf8(urlString));
  page.setContent(ByteBuffer.wrap(bytes));
  String mtype = mimeutil.getMimeType(file);
  page.setContentType(new Utf8(mtype));

  parse = new ParseUtil(conf).parse(urlString, page);

  String title = parse.getTitle();
  String text = parse.getText();
  assertEquals("test rft document", title);
  //assertEquals("The quick brown fox jumps over the lazy dog", text.trim());

 
  // HOW DO WE GET THE PARSE METADATA?
View Full Code Here

      List<Outlink> list = Arrays.asList(old);
      outlinks.addAll(list);
      ParseStatus status = parse.getParseStatus();
      String text = parse.getText();
      Outlink[] newlinks = outlinks.toArray(new Outlink[outlinks.size()]);
      return new Parse(text, title, newlinks, status);
    }
    return parse;
  }
View Full Code Here

      title = script.substring(0, idx);
    } else {
      idx = Math.min(MAX_TITLE_LEN, script.length());
      title = script.substring(0, idx);
    }
    Parse parse =
      new Parse(script, title, outlinks, ParseStatusUtils.STATUS_SUCCESS);
    return parse;
  }
View Full Code Here

      status.setMinorCode(ParseStatusCodes.SUCCESS_REDIRECT);
      status.addToArgs(new Utf8(metaTags.getRefreshHref().toString()));
      status.addToArgs(new Utf8(Integer.toString(metaTags.getRefreshTime())));
    }

    Parse parse = new Parse(text, title, outlinks, status);
    parse = htmlParseFilters.filter(url, page, parse, metaTags, root);

    if (metaTags.getNoCache()) {             // not okay to cache
      page.putToMetadata(new Utf8(Nutch.CACHING_FORBIDDEN_KEY),
          ByteBuffer.wrap(Bytes.toBytes(cachingPolicy)));
View Full Code Here

    parser.setConf(conf);
    WebPage page = new WebPage();
    page.setBaseUrl(new Utf8(url));
    page.setContent(ByteBuffer.wrap(bytes));
    page.setContentType(new Utf8("text/html"));
    Parse parse = parser.getParse(url, page);
    System.out.println("title: "+parse.getTitle());
    System.out.println("text: "+parse.getText());
    System.out.println("outlinks: " + Arrays.toString(parse.getOutlinks()));

  }
View Full Code Here

      status.setMinorCode(ParseStatusCodes.SUCCESS_REDIRECT);
      status.addToArgs(new Utf8(metaTags.getRefreshHref().toString()));
      status.addToArgs(new Utf8(Integer.toString(metaTags.getRefreshTime())));
    }

    Parse parse = new Parse(text, title, outlinks, status);
    parse = htmlParseFilters.filter(url, page, parse, metaTags, root);

    if (metaTags.getNoCache()) { // not okay to cache
      page.putToMetadata(new Utf8(Nutch.CACHING_FORBIDDEN_KEY), ByteBuffer.wrap(Bytes
          .toBytes(cachingPolicy)));
View Full Code Here

    MimeUtil mimeutil = new MimeUtil(conf);
    String mtype = mimeutil.getMimeType(file);
    page.setContentType(new Utf8(mtype));
    // Parse parse = parser.getParse(url, page);

    Parse parse = new ParseUtil(conf).parse(url, page);

    System.out.println("content type: " + mtype);
    System.out.println("title: " + parse.getTitle());
    System.out.println("text: " + parse.getText());
    System.out.println("outlinks: " + Arrays.toString(parse.getOutlinks()));
  }
View Full Code Here

TOP

Related Classes of org.apache.nutch.parse.Parse

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.