Package org.apache.nutch.util

Examples of org.apache.nutch.util.MimeUtil


    public void testIt() throws ProtocolException, ParseException, IOException {
  String urlString;
  Parse parse;
  Configuration conf = NutchConfiguration.create();
  MimeUtil mimeutil = new MimeUtil(conf);

  System.out.println("Expected : " + expectedText);

  for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      if (sampleFiles[i].startsWith("ootest") == false)
    continue;

      File file = new File(sampleDir + fileSeparator + sampleFiles[i]);
      byte[] bytes = new byte[(int) file.length()];
      DataInputStream in = new DataInputStream(new FileInputStream(file));
      in.readFully(bytes);
      in.close();

      WebPage page = new WebPage();
      page.setBaseUrl(new Utf8(urlString));
      page.setContent(ByteBuffer.wrap(bytes));
      String mtype = mimeutil.getMimeType(file);
      page.setContentType(new Utf8(mtype));

      parse = new ParseUtil(conf).parse(urlString, page);

      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
View Full Code Here


  /**
   * No-op in this class: the method body is empty, so {@code conf} is neither
   * read nor modified.
   *
   * <p>NOTE(review): presumably a hook for registering index-backend options
   * (inferred from the method name only; the declaring interface is not
   * visible in this snippet) — confirm before relying on it.
   *
   * @param conf configuration passed in by the caller; unused here
   */
  public void addIndexBackendOptions(Configuration conf) {
  }

  public void setConf(Configuration conf) {
    this.conf = conf;
    MIME = new MimeUtil(conf);
  }
View Full Code Here

    public void testIt() throws ProtocolException, ParseException, IOException {
  String urlString;
  Parse parse;
  Configuration conf = NutchConfiguration.create();
  MimeUtil mimeutil = new MimeUtil(conf);

  for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      File file = new File(sampleDir + fileSeparator + sampleFiles[i]);
      byte[] bytes = new byte[(int) file.length()];
      DataInputStream in = new DataInputStream(new FileInputStream(file));
      in.readFully(bytes);
      in.close();

      WebPage page = new WebPage();
      page.setBaseUrl(new Utf8(urlString));
      page.setContent(ByteBuffer.wrap(bytes));
      String mtype = mimeutil.getMimeType(file);
      page.setContentType(new Utf8(mtype));

      parse = new ParseUtil(conf).parse(urlString, page);

      int index = parse.getText().indexOf(expectedText);
View Full Code Here

    String urlString;
    Protocol protocol;
    Parse parse;

    Configuration conf = NutchConfiguration.create();
    MimeUtil mimeutil = new MimeUtil(conf);
    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      File file = new File(sampleDir + fileSeparator + sampleFiles[i]);
      byte[] bytes = new byte[(int) file.length()];
      DataInputStream in = new DataInputStream(new FileInputStream(file));
      in.readFully(bytes);
      in.close();

      WebPage page = new WebPage();
      page.setBaseUrl(new Utf8(urlString));
      page.setContent(ByteBuffer.wrap(bytes));
      String mtype = mimeutil.getMimeType(file);
      page.setContentType(new Utf8(mtype));

      parse = new ParseUtil(conf).parse(urlString, page);

      // check that there are 2 outlinks:
View Full Code Here

    // TikaParser parser = new TikaParser();
    // parser.setConf(conf);
    WebPage page = new WebPage();
    page.setBaseUrl(new Utf8(url));
    page.setContent(ByteBuffer.wrap(bytes));
    MimeUtil mimeutil = new MimeUtil(conf);
    String mtype = mimeutil.getMimeType(file);
    page.setContentType(new Utf8(mtype));
    // Parse parse = parser.getParse(url, page);

    Parse parse = new ParseUtil(conf).parse(url, page);
View Full Code Here

    this.userAgent = getAgentString(conf.get("http.agent.name"), conf.get("http.agent.version"), conf
        .get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email"));
    this.acceptLanguage = conf.get("http.accept.language", acceptLanguage);
    this.accept = conf.get("http.accept", accept);
    this.ip_header = conf.getBoolean("http.store.ip.address", false);
    this.mimeTypes = new MimeUtil(conf);
    this.useHttp11 = conf.getBoolean("http.useHttp11", false);
    this.robots.setConf(conf);
    logConf();
  }
View Full Code Here

 
 
  /** Creates a new instance of ZipTextExtractor */
  public ZipTextExtractor(Configuration conf) {
    this.conf = conf;
    this.MIME = new MimeUtil(conf);
  }
View Full Code Here

  /**
   * No-op in this class: the method body is empty, so {@code conf} is neither
   * read nor modified.
   *
   * <p>NOTE(review): presumably a hook for registering index-backend options
   * (inferred from the method name only; the declaring interface is not
   * visible in this snippet) — confirm before relying on it.
   *
   * @param conf configuration passed in by the caller; unused here
   */
  public void addIndexBackendOptions(Configuration conf) {
  }

  public void setConf(Configuration conf) {
    this.conf = conf;
    MIME = new MimeUtil(conf);
  }
View Full Code Here

    in.close();

    WebPage page = new WebPage();
    page.setBaseUrl(new Utf8(urlString));
    page.setContent(ByteBuffer.wrap(bytes));
    MimeUtil mimeutil = new MimeUtil(conf);
    String mtype = mimeutil.getMimeType(file);
    page.setContentType(new Utf8(mtype));
    parse = new ParseUtil(conf).parse(urlString, page);
    //begin assertion for tests
    ByteBuffer bbuf = page.getFromMetadata(new Utf8("Rel-Tag"));
    byte[] byteArray = new byte[bbuf.remaining()];
View Full Code Here

    this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
    this.userAgent = getAgentString(conf.get("http.agent.name"), conf.get("http.agent.version"), conf
        .get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email"));
    this.acceptLanguage = conf.get("http.accept.language", acceptLanguage);
    this.accept = conf.get("http.accept", accept);
    this.mimeTypes = new MimeUtil(conf);
    this.useHttp11 = conf.getBoolean("http.useHttp11", false);
    this.robots.setConf(conf);
    logConf();
  }
View Full Code Here

TOP

Related Classes of org.apache.nutch.util.MimeUtil

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.