Package org.apache.nutch.parse

Examples of org.apache.nutch.parse.ParseImpl


      title = "";
    }

    String text = delegate.getText();

    return new ParseImpl(text,
                         new ParseData(ParseStatus.STATUS_SUCCESS,
                                       title,
                                       OutlinkExtractor
        .                              getOutlinks(text, this.conf),
                                       content.getMetadata(),
View Full Code Here


      Outlink[] newlinks = (Outlink[])outlinks.toArray(new Outlink[outlinks.size()]);
      ParseData parseData = new ParseData(status, title, newlinks,
                                          parse.getData().getContentMeta(),
                                          parse.getData().getParseMeta());
      parseData.setConf(this.conf);
      parse = new ParseImpl(text, parseData);
    }
    return parse;
  }
View Full Code Here

      title = script.substring(0, idx);
    }
    ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks,
                                 c.getMetadata());
    pd.setConf(this.conf);
    Parse parse = new ParseImpl(script, pd);
    return parse;
  }
View Full Code Here

    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
                                        outlinks, content.getMetadata(),
                                        metadata);
    parseData.setConf(this.conf);
    return new ParseImpl(text, parseData);
    // any filter?
    //return HtmlParseFilters.filter(content, parse, root);
  }
View Full Code Here

      while ((line = br.readLine()) != null) {
        if (text.length() > 0) text.append("\n");
        text.append(line);
      }
      br.close();
      byte[] signature = sig.calculate(null, new ParseImpl(text.toString(), null));
      res.put(files[i].toString(), signature);
    }
    Iterator it = res.keySet().iterator();
    while (it.hasNext()) {
      String name = (String)it.next();
View Full Code Here

  conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
  MoreIndexingFilter filter = new MoreIndexingFilter();
  filter.setConf(conf);
  assertNotNull(filter);
  NutchDocument doc = new NutchDocument();
        ParseImpl parse = new ParseImpl("foo bar", new ParseData());
       
  try{
      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), new Inlinks());
  }
        catch(Exception e){
View Full Code Here

  private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
    Metadata metadata = new Metadata();
    metadata.add(Response.CONTENT_TYPE, source);
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
        new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
    assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
  }
View Full Code Here

        //   LOG.info("Outlinks: "+outlinks);
        // }

        ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
                contentTitle.toString(), outlinks, content.getMetadata());
        return ParseResult.createParseResult(content.getUrl(), new ParseImpl(indexText.toString(), parseData));
    }
View Full Code Here

      List list = Arrays.asList(old);
      outlinks.addAll(list);
      ParseStatus status = parse.getData().getStatus();
      String text = parse.getText();
      Outlink[] newlinks = (Outlink[])outlinks.toArray(new Outlink[outlinks.size()]);
      parse = new ParseImpl(text, new ParseData(status, title, newlinks, metadata));
    }
    return parse;
  }
View Full Code Here

    }
    Properties metadata = new Properties();
    metadata.putAll(c.getMetadata());
    ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title,
            outlinks, metadata);
    Parse parse = new ParseImpl(script, pd);
    return parse;
  }
View Full Code Here

TOP

Related Classes of org.apache.nutch.parse.ParseImpl

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.