Examples of org.apache.tika.parser.AutoDetectParser

org.apache.tika.parser.AutoDetectParser

     * Test that with only ID3v2 tags, of version 2.4, we get the full
     *  set of information out.
     */
    @Test
    public void testMp3ParsingID3v24() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();


        InputStream stream = Mp3ParserTest.class.getResourceAsStream(
                "/test-documents/testMP3id3v24.mp3");
        try {
            parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }


        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));

View Full Code Here

          @Context UriInfo info,
          boolean saveAll
  ) throws Exception {
    Metadata metadata = new Metadata();


    AutoDetectParser parser = TikaResource.createParser();


    TikaResource.fillMetadata(parser, metadata, httpHeaders.getRequestHeaders());
    TikaResource.logRequest(logger, info, metadata);


    ContentHandler ch;
    ByteArrayOutputStream text = new ByteArrayOutputStream();


    if (saveAll) {
      ch = new BodyContentHandler(new RichTextContentHandler(new OutputStreamWriter(text, "UTF-8")));
    } else {
      ch = new DefaultHandler();
    }


    ParseContext pc = new ParseContext();


    Map<String, byte[]> files = new HashMap<String, byte[]>();
    MutableInt count = new MutableInt();


    pc.set(EmbeddedDocumentExtractor.class, new MyEmbeddedDocumentExtractor(count, files));


    try {
      parser.parse(is, ch, metadata, pc);
    } catch (TikaException ex) {
      logger.warn(String.format(
              "%s: Unpacker failed",
              info.getPath()
      ), ex);

View Full Code Here


        for (int i=0; i<extensions.length; i++) {
            String extension = extensions[i];
            String filename = "testPPT." + extension;


            Parser parser = new AutoDetectParser();
            Metadata metadata = new Metadata();
            // TODO: should auto-detect without the resource name
            metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
            ContentHandler handler = new BodyContentHandler();
            ParseContext context = new ParseContext();
    
            InputStream input = getTestDocument(filename);
            try {
                parser.parse(input, handler, metadata, context);
    
                assertEquals(
                        "Mime-type checking for " + filename,
                        mimeTypes[i],
                        metadata.get(Metadata.CONTENT_TYPE));

View Full Code Here

     * Tests that a file with characters not in the ISO 8859-1
     *  range is correctly handled
     */
    @Test
    public void testMp3ParsingID3i18n() throws Exception {
       Parser parser = new AutoDetectParser(); // Should auto-detect!
       ContentHandler handler = new BodyContentHandler();
       Metadata metadata = new Metadata();


       InputStream stream = Mp3ParserTest.class.getResourceAsStream(
               "/test-documents/testMP3i18n.mp3");
       try {
           parser.parse(stream, handler, metadata, new ParseContext());
       } finally {
           stream.close();
       }


       assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));

View Full Code Here


        for (int i=0; i<extensions.length; i++) {
            String extension = extensions[i];
            final String filename = "testPPT." + extension;


            Parser parser = new AutoDetectParser();
            final Metadata metadata = new Metadata();
            // TODO: should auto-detect without the resource name
            metadata.set(Metadata.RESOURCE_NAME_KEY, filename);


      // Allow the value to be accessed from the inner class
      final int currentI = i;
            ContentHandler handler = new BodyContentHandler()
    {
        // Runs the metadata assertions from inside the handler's startDocument
        // callback, so they are evaluated when this callback is invoked rather
        // than after parse() has returned.
        // NOTE(review): presumably this verifies the metadata is already
        // populated before any body content is emitted -- confirm against the
        // parser's callback ordering.
        public void startDocument ()
        {
      // The detected content type must match the MIME type expected for the
      // extension at the current loop index (captured as currentI).
      assertEquals(
             "Mime-type checking for " + filename,
             mimeTypes[currentI],
             metadata.get(Metadata.CONTENT_TYPE));
      // Title and creator/author values expected for the testPPT.* documents.
      assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE));
      assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR));
      assertEquals("Rajiv", metadata.get(Metadata.AUTHOR));


        }


    };
            ParseContext context = new ParseContext();
    
            InputStream input = getTestDocument(filename);
            try {
                parser.parse(input, handler, metadata, context);
            } finally {
                input.close();
            }
        }
    }

View Full Code Here

     * Tests that a file with both lyrics and
     *  ID3v2 tags gets both extracted correctly
     */
    @Test
    public void testMp3ParsingLyrics() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();


        // Note - our test file has a lyrics tag, but lacks any
        //  lyrics in the tags, so we can't test that bit
        // TODO Find a better sample file
        
        InputStream stream = Mp3ParserTest.class.getResourceAsStream(
                "/test-documents/testMP3lyrics.mp3");
        try {
            parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }


        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));

View Full Code Here

       
       for (int i=0; i<extensions.length; i++) {
          String extension = extensions[i];
          String filename = "testPPT." + extension;


          Parser parser = new AutoDetectParser();
          Metadata metadata = new Metadata();
          metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
          ContentHandler handler = new BodyContentHandler();
          ParseContext context = new ParseContext();
  
          InputStream input = getTestDocument(filename);
          try {
              parser.parse(input, handler, metadata, context);


              // Should get the metadata
              assertEquals(
                    "Mime-type checking for " + filename,
                    mimeTypes[i],

View Full Code Here

    @Test
    public void testProtectedExcelSheets() throws Exception {
        InputStream input = OOXMLParserTest.class
                .getResourceAsStream("/test-documents/protectedSheets.xlsx");


        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();


        try {
            parser.parse(input, handler, metadata, context);


            assertEquals(
                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    metadata.get(Metadata.CONTENT_TYPE));

View Full Code Here

     * This test will check for the complicated set of ID3v2.4
     *  tags.
     */
    @Test
    public void testTIKA424() throws Exception {
       Parser parser = new AutoDetectParser(); // Should auto-detect!
       ContentHandler handler = new BodyContentHandler();
       Metadata metadata = new Metadata();


       InputStream stream = Mp3ParserTest.class.getResourceAsStream(
               "/test-documents/test2.mp3");
       if(stream == null) {
          // You haven't downloaded the file
          // Skip the test
          return;
       }
       
       try {
           parser.parse(stream, handler, metadata, new ParseContext());
       } finally {
           stream.close();
       }


       assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));

View Full Code Here

     * In this case, it is a file with JPEG data in the ID3, which
     *  is truncated before the end of the JPEG bit of the ID3 frame.
     */
    @Test
    public void testTIKA474() throws Exception {
       Parser parser = new AutoDetectParser(); // Should auto-detect!
       ContentHandler handler = new BodyContentHandler();
       Metadata metadata = new Metadata();


       InputStream stream = Mp3ParserTest.class.getResourceAsStream(
               "/test-documents/testMP3truncated.mp3");
       
       
       try {
           parser.parse(stream, handler, metadata, new ParseContext());
       } finally {
           stream.close();
       }


       // Check we could get the headers from the start

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.tika.parser.AutoDetectParser

bixo.parser.SimpleParser

com.baasbox.controllers.File

com.findwise.utils.tika.InputStreamParser

com.ikanow.infinit.e.harvest.extraction.document.file.FileHarvester

com.ikanow.infinit.e.harvest.extraction.text.legacy.TextExtractorTika

com.nidhinova.tika.server.TikaService

com.tamingtext.tika.TikaTest

edu.isi.karma.rdf.GenericRDFGenerator

fr.inra.lipm.jezlucene.Main

fr.inra.lipm.jezlucene.parser.Parser

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.