Package org.apache.nutch.protocol

Examples of org.apache.nutch.protocol.Content


    }

    public void map(Writable key, Content value,
        OutputCollector<BytesWritable, BytesWritable> output, Reporter reporter)
        throws IOException {
        Content content = (Content) value;

        if (content.getContentType().toLowerCase().startsWith("image/jpeg")) {
            try {
                //MessageDigest md = MessageDigest.getInstance("SHA-1");
                output.collect(new BytesWritable(content.getUrl().getBytes()),
                            new BytesWritable(content.getContent()));
            } catch (Exception e) {
            }
        }
    }
View Full Code Here


    if (LOG.isInfoEnabled()) { LOG.info("fetching: "+url); }

    Configuration conf = NutchConfiguration.create();
    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    Content content = protocol.getProtocolOutput(new Text(url), new CrawlDatum()).getContent();

    if (force) {
      content.setContentType(contentType);
    } else {
      contentType = content.getContentType();
    }

    if (contentType == null) {
      System.err.println("");
      System.exit(-1);
View Full Code Here

    String file = args[0];
    byte[] raw = getRawBytes(new File(file));

    Metadata meta = new Metadata();
    meta.set(Response.CONTENT_LENGTH, "" + raw.length);
    Content content = new Content(file, file, raw, mime, meta,
                                  NutchConfiguration.create());

    System.out.println(parser.getParse(content).getText());
  }
View Full Code Here

  /** Teardown hook with an empty body: no cleanup is performed here. */
  protected void tearDown() {}

  public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;

    for (int i=0; i<sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
View Full Code Here

          String contentType = MIME.getMimeType(fname).getName();
          try {
            Metadata metadata = new Metadata();
            metadata.set(Response.CONTENT_LENGTH, Long.toString(entry.getSize()));
            metadata.set(Response.CONTENT_TYPE, contentType);
            Content content = new Content(newurl, base, b, contentType, metadata, this.conf);
            Parse parse = new ParseUtil(this.conf).parse(content);
            ParseData theParseData = parse.getData();
            Outlink[] theOutlinks = theParseData.getOutlinks();
           
            for(int count = 0; count < theOutlinks.length; count++) {
View Full Code Here

    String url = args[0];
    Configuration conf = NutchConfiguration.create();
    RSSParser parser = new RSSParser();
    parser.setConf(conf);
    Protocol protocol = new ProtocolFactory(conf).getProtocol(url);
    Content content = protocol.getProtocolOutput(new Text(url), new CrawlDatum()).getContent();
    Parse parse = parser.getParse(content);
    System.out.println("data: "+ parse.getData());
    System.out.println("text: "+parse.getText());
  }
View Full Code Here

  /** Teardown hook with an empty body: no cleanup is performed here. */
  protected void tearDown() {}

  public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;

    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
View Full Code Here

    ArrayList<String> handledurls=new ArrayList<String>();
   
    READ:
      do {
      Text key=new Text();
      Content value=new Content();
      if(!reader.next(key, value)) break READ;
      String contentString=new String(value.getContent());
      if(contentString.indexOf("Nutch fetcher test page")!=-1) {
        handledurls.add(key.toString());
      }
    } while(true);
View Full Code Here

  public void testIt() throws ProtocolException, ParseException {

    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;

    Configuration conf = NutchConfiguration.create();
    ParseUtil parser = new ParseUtil(conf);
    ProtocolFactory factory = new ProtocolFactory(conf);
View Full Code Here

     * </ul>
     */
    public void testIt() throws ProtocolException, ParseException {
        String urlString;
        Protocol protocol;
        Content content;
        Parse parse;

        Configuration conf = NutchConfiguration.create();
        for (int i = 0; i < sampleFiles.length; i++) {
            urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
View Full Code Here

TOP

Related Classes of org.apache.nutch.protocol.Content

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.