Package org.apache.nutch.util

Examples of org.apache.nutch.util.MimeUtil


    this.url = url;
    this.base = base;
    this.content = content;
    this.metadata = metadata;

    this.mimeTypes = new MimeUtil(conf);
    this.contentType = getContentType(contentType, url, content);
  }
View Full Code Here


    this.orig = url.toString();
    this.base = url.toString();
    this.file = file;
    this.conf = conf;
   
    MIME = new MimeUtil(conf);

    if (!"file".equals(url.getProtocol()))
      throw new FileException("Not a file url:" + url);

    if (File.LOG.isTraceEnabled()) {
View Full Code Here

    public void testIt() throws ProtocolException, ParseException, IOException {

  String urlString;
  Parse parse;
  Configuration conf = NutchConfiguration.create();
  MimeUtil mimeutil = new MimeUtil(conf);

  urlString = "file:" + sampleDir + fileSeparator + rtfFile;

  File file = new File(sampleDir + fileSeparator + rtfFile);
  byte[] bytes = new byte[(int) file.length()];
  DataInputStream in = new DataInputStream(new FileInputStream(file));
  in.readFully(bytes);
  in.close();

  WebPage page = new WebPage();
  page.setBaseUrl(new Utf8(urlString));
  page.setContent(ByteBuffer.wrap(bytes));
  String mtype = mimeutil.getMimeType(file);
  page.setContentType(new Utf8(mtype));

  parse = new ParseUtil(conf).parse(urlString, page);

  String title = parse.getTitle();
View Full Code Here

    @Test
    public void testIt() throws ProtocolException, ParseException, IOException {
  String urlString;
  Parse parse;
  Configuration conf = NutchConfiguration.create();
  MimeUtil mimeutil = new MimeUtil(conf);

  for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      File file = new File(sampleDir + fileSeparator + sampleFiles[i]);
      byte[] bytes = new byte[(int) file.length()];
      DataInputStream in = new DataInputStream(new FileInputStream(file));
      in.readFully(bytes);
      in.close();

      WebPage page = new WebPage();
      page.setBaseUrl(new Utf8(urlString));
      page.setContent(ByteBuffer.wrap(bytes));
      String mtype = mimeutil.getMimeType(file);
      page.setContentType(new Utf8(mtype));

      parse = new ParseUtil(conf).parse(urlString, page);

      int index = parse.getText().indexOf(expectedText);
View Full Code Here

  public void testIt()throws ProtocolException, ParseException, IOException {
    String urlString;
    Parse parse;

    Configuration conf = NutchConfiguration.create();
    MimeUtil mimeutil = new MimeUtil(conf);
    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      File file = new File(sampleDir + fileSeparator + sampleFiles[i]);
      byte[] bytes = new byte[(int) file.length()];
      DataInputStream in = new DataInputStream(new FileInputStream(file));
      in.readFully(bytes);
      in.close();

      WebPage page = new WebPage();
      page.setBaseUrl(new Utf8(urlString));
      page.setContent(ByteBuffer.wrap(bytes));
      String mtype = mimeutil.getMimeType(file);
      page.setContentType(new Utf8(mtype));

      parse = new ParseUtil(conf).parse(urlString, page);

      // check that there are 2 outlinks:
View Full Code Here

    // TikaParser parser = new TikaParser();
    // parser.setConf(conf);
    WebPage page = new WebPage();
    page.setBaseUrl(new Utf8(url));
    page.setContent(ByteBuffer.wrap(bytes));
    MimeUtil mimeutil = new MimeUtil(conf);
    String mtype = mimeutil.getMimeType(file);
    page.setContentType(new Utf8(mtype));
    // Parse parse = parser.getParse(url, page);

    Parse parse = new ParseUtil(conf).parse(url, page);
View Full Code Here

    Configuration conf = NutchConfiguration.create();

    WebPage page = new WebPage();
    page.setBaseUrl(new Utf8(url));
    page.setContent(ByteBuffer.wrap(bytes));
    MimeUtil mimeutil = new MimeUtil(conf);
    String mtype = mimeutil.getMimeType(file);
    page.setContentType(new Utf8(mtype));

    new ParseUtil(conf).parse(url, page);

    ByteBuffer bb = page.getFromMetadata(new Utf8("License-Url"));
View Full Code Here

  Parse parse;
  WebPage page = new WebPage();
  page.setBaseUrl(new Utf8("file:"+urlString));
  page.setContent(ByteBuffer.wrap(bytes));
  // set the content type?
  MimeUtil mimeutil = new MimeUtil(conf);
  String mtype = mimeutil.getMimeType(file);
  page.setContentType(new Utf8(mtype));
   
  parse = new ParseUtil(conf).parse("file:"+urlString, page);
  return parse.getText();
    }
View Full Code Here

  @Test
  public void testIt() throws ProtocolException, ParseException, IOException {
    String urlString;
    Parse parse;
    Configuration conf = NutchConfiguration.create();
    MimeUtil mimeutil = new MimeUtil(conf);
 
    try {
      // read the test string
      FileInputStream fis = new FileInputStream(sampleDir + fileSeparator
        + sampleText);
      StringBuffer sb = new StringBuffer();
      int len = 0;
      InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
      char[] buf = new char[1024];
      while ((len = isr.read(buf)) > 0) {
        sb.append(buf, 0, len);
      }
      isr.close();
      expectedText = sb.toString();
      // normalize space
      expectedText = expectedText.replaceAll("[ \t\r\n]+", " ");
    } catch (Exception e) {
      e.printStackTrace();
    }

    System.out.println("Expected : " + expectedText);

    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      if (sampleFiles[i].startsWith("ootest") == false)
      continue;

      File file = new File(sampleDir + fileSeparator + sampleFiles[i]);
      byte[] bytes = new byte[(int) file.length()];
      DataInputStream in = new DataInputStream(new FileInputStream(file));
      in.readFully(bytes);
      in.close();

      WebPage page = new WebPage();
      page.setBaseUrl(new Utf8(urlString));
      page.setContent(ByteBuffer.wrap(bytes));
      String mtype = mimeutil.getMimeType(file);
      page.setContentType(new Utf8(mtype));

      parse = new ParseUtil(conf).parse(urlString, page);

      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
View Full Code Here

    dip.close();
   
    WebPage page = new WebPage();
    page.setBaseUrl(new Utf8(urlString));
    page.setContent(ByteBuffer.wrap(bytes));
    MimeUtil mutil = new MimeUtil(conf);
    String mime = mutil.getMimeType(file);
    page.setContentType(new Utf8(mime));
 
    parse = new ParseUtil(conf).parse(urlString, page);
    return parse.getOutlinks();
  }
View Full Code Here

TOP

Related Classes of org.apache.nutch.util.MimeUtil

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.