Package org.apache.tika.parser

Examples of org.apache.tika.parser.Parser


import org.xml.sax.ContentHandler;

public class ArParserTest extends AbstractPkgTest {
    @Test
  public void testArParsing() throws Exception {
    Parser parser = new AutoDetectParser();

    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();

    InputStream stream = ArParserTest.class
        .getResourceAsStream("/test-documents/testARofText.ar");
    try {
      parser.parse(stream, handler, metadata, recursingContext);
    } finally {
      stream.close();
    }

    assertEquals("application/x-archive",
        metadata.get(Metadata.CONTENT_TYPE));
    String content = handler.toString();
    assertTrue(content.contains("testTXT.txt"));
    assertTrue(content.contains("Test d'indexation de Txt"));
    assertTrue(content.contains("http://www.apache.org"));

    stream = ArParserTest.class
        .getResourceAsStream("/test-documents/testARofSND.ar");
    try {
      parser.parse(stream, handler, metadata, recursingContext);
    } finally {
      stream.close();
    }

    assertEquals("application/x-archive",
View Full Code Here


   * Tests that the ParseContext parser is correctly fired for all the
   * embedded entries.
   */
    @Test
  public void testEmbedded() throws Exception {
    Parser parser = new AutoDetectParser(); // Should auto-detect!
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();

    InputStream stream = ArParserTest.class
        .getResourceAsStream("/test-documents/testARofText.ar");
    try {
      parser.parse(stream, handler, metadata, trackingContext);
    } finally {
      stream.close();
    }

    assertEquals(1, tracker.filenames.size());
    assertEquals(1, tracker.mediatypes.size());

    assertEquals("testTXT.txt", tracker.filenames.get(0));

    for (String type : tracker.mediatypes) {
      assertNull(type);
    }

    tracker.reset();
    stream = ArParserTest.class
        .getResourceAsStream("/test-documents/testARofSND.ar");
    try {
      parser.parse(stream, handler, metadata, trackingContext);
    } finally {
      stream.close();
    }

    assertEquals(1, tracker.filenames.size());
View Full Code Here

        for (int i=0; i<extensions.length; i++) {
            String extension = extensions[i];
            String filename = "testPPT." + extension;

            Parser parser = new AutoDetectParser();
            Metadata metadata = new Metadata();
            // TODO: should auto-detect without the resource name
            metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
            ContentHandler handler = new BodyContentHandler();
            ParseContext context = new ParseContext();
   
            InputStream input = getTestDocument(filename);
            try {
                parser.parse(input, handler, metadata, context);
   
                assertEquals(
                        "Mime-type checking for " + filename,
                        mimeTypes[i],
                        metadata.get(Metadata.CONTENT_TYPE));
View Full Code Here

        for (int i=0; i<extensions.length; i++) {
            String extension = extensions[i];
            final String filename = "testPPT." + extension;

            Parser parser = new AutoDetectParser();
            final Metadata metadata = new Metadata();
            // TODO: should auto-detect without the resource name
            metadata.set(Metadata.RESOURCE_NAME_KEY, filename);

      // Allow the value to be accessed from the inner class
      final int currentI = i;
            ContentHandler handler = new BodyContentHandler()
    {
        public void startDocument ()
        {
      assertEquals(
             "Mime-type checking for " + filename,
             mimeTypes[currentI],
             metadata.get(Metadata.CONTENT_TYPE));
      assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE));
      assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR));
      assertEquals("Rajiv", metadata.get(Metadata.AUTHOR));

        }

    };
            ParseContext context = new ParseContext();
   
            InputStream input = getTestDocument(filename);
            try {
                parser.parse(input, handler, metadata, context);
            } finally {
                input.close();
            }
        }
    }
View Full Code Here

      
       for (int i=0; i<extensions.length; i++) {
          String extension = extensions[i];
          String filename = "testPPT." + extension;

          Parser parser = new AutoDetectParser();
          Metadata metadata = new Metadata();
          metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
          ContentHandler handler = new BodyContentHandler();
          ParseContext context = new ParseContext();
 
          InputStream input = getTestDocument(filename);
          try {
              parser.parse(input, handler, metadata, context);

              // Should get the metadata
              assertEquals(
                    "Mime-type checking for " + filename,
                    mimeTypes[i],
View Full Code Here

    @Test
    public void testProtectedExcelSheets() throws Exception {
        InputStream input = OOXMLParserTest.class
                .getResourceAsStream("/test-documents/protectedSheets.xlsx");

        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();

        try {
            parser.parse(input, handler, metadata, context);

            assertEquals(
                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    metadata.get(Metadata.CONTENT_TYPE));
View Full Code Here

     * See TIKA-437.
     */
    @Test
    public void testProtectedExcelFile() throws Exception {

        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();

        InputStream input = getTestDocument("protectedFile.xlsx");
        try {
            parser.parse(input, handler, metadata, context);

            assertEquals(
                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    metadata.get(Metadata.CONTENT_TYPE));

View Full Code Here

     * Test docx without headers
     * TIKA-633
     */
    @Test
    public void testNullHeaders() throws Exception {
        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();

        InputStream input = getTestDocument("NullHeader.docx");
        try {
            parser.parse(input, handler, metadata, context);
            assertFalse(handler.toString().length()==0);
        } finally {
            input.close();
        }
    }
View Full Code Here

*/
public class TarParserTest extends AbstractPkgTest {

    @Test
    public void testTarParsing() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        InputStream stream = TarParserTest.class.getResourceAsStream(
                "/test-documents/test-documents.tar");
        try {
            parser.parse(stream, handler, metadata, recursingContext);
        } finally {
            stream.close();
        }

        assertEquals("application/x-tar", metadata.get(Metadata.CONTENT_TYPE));
View Full Code Here

     * Tests that the ParseContext parser is correctly
     *  fired for all the embedded entries.
     */
    @Test
    public void testEmbedded() throws Exception {
       Parser parser = new AutoDetectParser(); // Should auto-detect!
       ContentHandler handler = new BodyContentHandler();
       Metadata metadata = new Metadata();

       InputStream stream = ZipParserTest.class.getResourceAsStream(
               "/test-documents/test-documents.tar");
       try {
           parser.parse(stream, handler, metadata, trackingContext);
       } finally {
           stream.close();
       }
      
       // Should have found all 9 documents, but not the directory
View Full Code Here

TOP

Related Classes of org.apache.tika.parser.Parser

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.