Package com.tamingtext.tika

Source Code of com.tamingtext.tika.TikaTest

/*
* Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris
*
*    Licensed under the Apache License, Version 2.0 (the "License");
*    you may not use this file except in compliance with the License.
*    You may obtain a copy of the License at
*
*        http://www.apache.org/licenses/LICENSE-2.0
*
*    Unless required by applicable law or agreed to in writing, software
*    distributed under the License is distributed on an "AS IS" BASIS,
*    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*    See the License for the specific language governing permissions and
*    limitations under the License.
* -------------------
* To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit
* http://www.manning.com/ingersoll
*/

package com.tamingtext.tika;

import com.tamingtext.TamingTextTestJ4;
import junit.framework.TestCase;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.FileInputStream;
import java.io.File;
import java.nio.charset.Charset;

import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.LinkContentHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.ToHTMLContentHandler;
import org.junit.*;
import org.xml.sax.ContentHandler;


/**
* Demonstrate basic Tika usage
*
**/
public class TikaTest extends TamingTextTestJ4 {
  @Test
  public void testTika() throws Exception {
    //<start id="tika"/>
    InputStream input = new FileInputStream(
            new File("src/test/resources/pdfBox-sample.pdf"));//<co id="tika.is"/>
    ContentHandler textHandler = new BodyContentHandler();//<co id="tika.handler"/>
    Metadata metadata = new Metadata();//<co id="tika.metadata"/>
    Parser parser = new AutoDetectParser();//<co id="tika.parser"/>
    ParseContext context = new ParseContext();
    parser.parse(input, textHandler, metadata, context);//<co id="tika.parse"/>
    System.out.println("Title: " + metadata.get(Metadata.TITLE));//<co id="tika.title"/>
    System.out.println("Body: " + textHandler.toString());//<co id="tika.body"/>
    /*
<calloutlist>
    <callout arearefs="tika.is"><para>Create the <classname>InputStream</classname> to read in the content</para></callout>
    <callout arearefs="tika.handler"><para>The <classname>BodyContentHandler</classname> is a Tika-provided <classname>ContentHandler</classname> that extracts just the "body" of the InputStream</para></callout>
  <callout arearefs="tika.metadata"><para>The <classname>Metadata</classname> object will hold metadata like author, title, etc. about the content in a map.</para></callout>
  <callout arearefs="tika.parser"><para>The <classname>AutoDetectParser</classname> will figure out the MIME type of the document automatically when parse is called.  Since we know our input is a PDF file, we could have used the <classname>PDFParser</classname> instead.</para></callout>
  <callout arearefs="tika.parse"><para>Execute the parse</para></callout>
  <callout arearefs="tika.title"><para>Get the title from the <classname>Metadata</classname> instance</para></callout>
  <callout arearefs="tika.body"><para>Print out the body from the <classname>ContentHandler</classname></para></callout>

</calloutlist>
*/
    //<end id="tika"/>
  }

  @Test
  public void testHtml() throws Exception {
    String html = "<html><head><title>The Big Brown Shoe</title></head><body><p>The best pizza place " +
            "in the US is <a href=\"http://antoniospizzas.com/\">Antonio's Pizza</a>.</p>" +
            "<p>It is located in Amherst, MA.</p></body></html>";
    //<start id="tika-html"/>
    InputStream input = new ByteArrayInputStream(html.getBytes(Charset.forName("UTF-8")));
    ContentHandler text = new BodyContentHandler();//<co id="html.text.co"/>
    LinkContentHandler links = new LinkContentHandler();//<co id="html.link.co"/>
    ContentHandler handler = new TeeContentHandler(links, text);//<co id="html.merge"/>
    Metadata metadata = new Metadata();//<co id="html.store"/>
    Parser parser = new HtmlParser();//<co id="html.parser"/>
    ParseContext context = new ParseContext();
    parser.parse(input, handler, metadata, context);//<co id="html.parse"/>
    System.out.println("Title: " + metadata.get(Metadata.TITLE));
    System.out.println("Body: " + text.toString());
    System.out.println("Links: " + links.getLinks());
    /*
    <calloutlist>
        <callout arearefs="html.text.co"><para>Construct a ContentHandler that will just extract between the body tags.</para></callout>
        <callout arearefs="html.link.co"><para>Construct ContentHandler that knows about HTML links</para></callout>
        <callout arearefs="html.merge"><para>Wrap up our ContentHandlers into one</para></callout>
        <callout arearefs="html.store"><para>Metadata is a simple storage mechanism where the extracted metadata gets stored</para></callout>
        <callout arearefs="html.parser"><para>We know the input is HTML, so construct a Parser to parse it</para></callout>
        <callout arearefs="html.parse"><para>Do the parse</para></callout>
    </calloutlist>
    */
    //<end id="tika-html"/>
  }

}
TOP

Related Classes of com.tamingtext.tika.TikaTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.