Package fr.eolya.extraction.tika

Examples of fr.eolya.extraction.tika.TikaWrapper


                  params.put("originalContentType", "application/x-shockwave-flash");
                  contentType = "text/html; charset=utf-8";

                  String swfToHtmlPath = Utils.getValidPropertyPath(config.getProperty("/crawler/param[@name='swfToHtmlPath']", ""), null, "HOME");

                  TikaWrapper tikaWrapper = new TikaWrapper(TikaWrapper.OUTPUT_FORMAT_HTML);
                  tikaWrapper.setSwfToHtmlPath(swfToHtmlPath);
                  tikaWrapper.process(urlLoader.getStream(), TikaWrapper.CONTENT_TYPE_SWF);
                  rawPage = tikaWrapper.getText();
                 
                  //MultiFormatTextExtractor extractor = new MultiFormatTextExtractor();
                  //extractor.setSwfToHtmlPath(swfToHtmlPath);
                  //rawPage = extractor.swfInputStreamToHtml(urlLoader.getStream());
View Full Code Here


        return parserText;
      }

      HtmlParser htmlParser = new HtmlParser();

      TikaWrapper wrapper = null;
      String tikaContentType = null;

      // application/pdf
      if (contentType.startsWith("application/pdf")) {
        wrapper = new TikaWrapper(TikaWrapper.OUTPUT_FORMAT_TEXT);
        tikaContentType = TikaWrapper.CONTENT_TYPE_PDF;
        wrapper.setPdfToTextPath(pdfToTextPath);
      } else {
        // text/html
        if (contentType.startsWith("text/html")) {
         
          if (htmlParser.parse(rawData, contentType, url, scriptName)) {
            //cleanMethod = ""; // if the parse script was used, disable any clean algorithm
            rawData = htmlParser.getBestHtml(rawData);
          }
         
          if (input==null && rawData!=null) input = new ByteArrayInputStream(rawData.getBytes());
          if (input==null) return "";

          String outputFormat = TikaWrapper.OUTPUT_FORMAT_TEXT;
          if ("boilerpipe_article".equals(cleanMethod))
            outputFormat = TikaWrapper.OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_ARTICLE;
          if ("boilerpipe_default".equals(cleanMethod))
            outputFormat = TikaWrapper.OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_DEFAULT;
          if ("boilerpipe_canola".equals(cleanMethod))
            outputFormat = TikaWrapper.OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_CANOLA;
          if ("snacktory".equals(cleanMethod))
            outputFormat = TikaWrapper.OUTPUT_FORMAT_TEXT_MAIN_SNACKTORY;
         
          wrapper = new TikaWrapper(outputFormat);
          tikaContentType = TikaWrapper.CONTENT_TYPE_HTML;

         
        } else {
          wrapper = new TikaWrapper(TikaWrapper.OUTPUT_FORMAT_TEXT);
        }
      }
     
      wrapper.setTempPath(tmpPath);
      wrapper.process(input, tikaContentType);
     
      parserText = wrapper.getText();
      parserContentType = wrapper.getMetaContentType();
      if (contentType.startsWith("text/html")) {
        parserTitle = htmlParser.getBestTitle(wrapper.getMetaTitle());
        parserDate = htmlParser.getBestDate(wrapper.getMetaCreated());
      } else {
        parserTitle = wrapper.getMetaTitle();
        parserDate = wrapper.getMetaCreated();       
      }

      //parserContentSize = Long.toString(extractor.getContentSize());

      return parserText;
View Full Code Here

                //String data = ws.getString();
                //ws.clear();
               
                //String rawPage = extractor.htmlPageToText(data, page, "");
                //String title = extractor.getTitle();
        TikaWrapper tikaWrapper = new TikaWrapper(TikaWrapper.OUTPUT_FORMAT_HTML);
        tikaWrapper.process(urlLoader.getStream());
        String rawPage = tikaWrapper.getText();
        String title = tikaWrapper.getMetaTitle();

                ret += "<page_0><![CDATA[" + rawPage + "]]>" + "</page_0>";
                ret += "<title_0><![CDATA[" + title + "]]>" + "</title_0>";
               
                //rawPage = extractor.htmlPageToText(data, page, "boilerpipe_article");
                //title = extractor.getTitle();
        tikaWrapper = new TikaWrapper(TikaWrapper.OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_ARTICLE, TikaWrapper.CONTENT_TYPE_HTML);
        tikaWrapper.process(urlLoader.getStream());
        rawPage = tikaWrapper.getText();
        title = tikaWrapper.getMetaTitle();
                ret += "<page_1><![CDATA[" + rawPage + "]]>" + "</page_1>";
                ret += "<title_1><![CDATA[" + title + "]]>" + "</title_1>";
                //rawPage = extractor.htmlPageToText(data, page, "boilerpipe_default");
                //title = extractor.getTitle();
        tikaWrapper = new TikaWrapper(TikaWrapper.OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_DEFAULT, TikaWrapper.CONTENT_TYPE_HTML);
        tikaWrapper.process(urlLoader.getStream());
        rawPage = tikaWrapper.getText();
        title = tikaWrapper.getMetaTitle();
                ret += "<page_2><![CDATA[" + rawPage + "]]>" + "</page_2>";
                ret += "<title_2><![CDATA[" + title + "]]>" + "</title_2>";
                //rawPage = extractor.htmlPageToText(data, page, "boilerpipe_canola");
                //title = extractor.getTitle();
        tikaWrapper = new TikaWrapper(TikaWrapper.OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_CANOLA, TikaWrapper.CONTENT_TYPE_HTML);
        tikaWrapper.process(urlLoader.getStream());
        rawPage = tikaWrapper.getText();
        title = tikaWrapper.getMetaTitle();
                ret += "<page_3><![CDATA[" + rawPage + "]]>" + "</page_3>";
                ret += "<title_3><![CDATA[" + title + "]]>" + "</title_3>";

                //rawPage = extractor.htmlPageToText(data, page, "snacktory");
                //title = extractor.getTitle();
        tikaWrapper = new TikaWrapper(TikaWrapper.OUTPUT_FORMAT_TEXT_MAIN_SNACKTORY, TikaWrapper.CONTENT_TYPE_HTML);
        tikaWrapper.process(urlLoader.getStream());
        rawPage = tikaWrapper.getText();
        title = tikaWrapper.getMetaTitle();
                ret += "<page_4><![CDATA[" + rawPage + "]]>" + "</page_4>";
                ret += "<title_4><![CDATA[" + title + "]]>" + "</title_4>";
               
                ret += "</result>";
                urlLoader.close();
View Full Code Here

                    page = m.get("page");
                }
               
                // Get page text
                //MultiFormatTextExtractor extractor = new MultiFormatTextExtractor();
        TikaWrapper tikaWrapper = new TikaWrapper(TikaWrapper.OUTPUT_FORMAT_TEXT);

                String text = "";
                InputStream in = null;
                if (page==null || "".equals(page)) {
                    //text = extractor.htmlPageToText(rawPage, "", "");
                  in = IOUtils.toInputStream(rawPage);
                } else {
                    //text = extractor.htmlPageToText(page, "", "");
                  in = IOUtils.toInputStream(page);
                }
               
        tikaWrapper.process(in, TikaWrapper.CONTENT_TYPE_HTML);
        text = tikaWrapper.getText();
       
                if (title==null || "".equals(title))
                    title = tikaWrapper.getMetaTitle();
               
                System.out.println("Title = "+ title);
                System.out.println("Date  = " + d);
                System.out.println("Text  = " + text);
                System.out.println("Page  = " + page);
View Full Code Here

TOP

Related Classes of fr.eolya.extraction.tika.TikaWrapper

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.