Package org.apache.tika.parser

Examples of org.apache.tika.parser.AutoDetectParser


     * Test that with only ID3v2 tags, of version 2.4, we get the full
     *  set of information out.
     */
    @Test
    public void testMp3ParsingID3v24() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        InputStream stream = Mp3ParserTest.class.getResourceAsStream(
                "/test-documents/testMP3id3v24.mp3");
        try {
            parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }

        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
View Full Code Here


          @Context UriInfo info,
          boolean saveAll
  ) throws Exception {
    Metadata metadata = new Metadata();

    AutoDetectParser parser = TikaResource.createParser();

    TikaResource.fillMetadata(parser, metadata, httpHeaders.getRequestHeaders());
    TikaResource.logRequest(logger, info, metadata);

    ContentHandler ch;
    ByteArrayOutputStream text = new ByteArrayOutputStream();

    if (saveAll) {
      ch = new BodyContentHandler(new RichTextContentHandler(new OutputStreamWriter(text, "UTF-8")));
    } else {
      ch = new DefaultHandler();
    }

    ParseContext pc = new ParseContext();

    Map<String, byte[]> files = new HashMap<String, byte[]>();
    MutableInt count = new MutableInt();

    pc.set(EmbeddedDocumentExtractor.class, new MyEmbeddedDocumentExtractor(count, files));

    try {
      parser.parse(is, ch, metadata, pc);
    } catch (TikaException ex) {
      logger.warn(String.format(
              "%s: Unpacker failed",
              info.getPath()
      ), ex);
View Full Code Here

        for (int i=0; i<extensions.length; i++) {
            String extension = extensions[i];
            String filename = "testPPT." + extension;

            Parser parser = new AutoDetectParser();
            Metadata metadata = new Metadata();
            // TODO: should auto-detect without the resource name
            metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
            ContentHandler handler = new BodyContentHandler();
            ParseContext context = new ParseContext();
   
            InputStream input = getTestDocument(filename);
            try {
                parser.parse(input, handler, metadata, context);
   
                assertEquals(
                        "Mime-type checking for " + filename,
                        mimeTypes[i],
                        metadata.get(Metadata.CONTENT_TYPE));
View Full Code Here

     * Tests that a file with characters not in the ISO 8859-1
     *  range is correctly handled
     */
    @Test
    public void testMp3ParsingID3i18n() throws Exception {
       Parser parser = new AutoDetectParser(); // Should auto-detect!
       ContentHandler handler = new BodyContentHandler();
       Metadata metadata = new Metadata();

       InputStream stream = Mp3ParserTest.class.getResourceAsStream(
               "/test-documents/testMP3i18n.mp3");
       try {
           parser.parse(stream, handler, metadata, new ParseContext());
       } finally {
           stream.close();
       }

       assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
View Full Code Here

        for (int i=0; i<extensions.length; i++) {
            String extension = extensions[i];
            final String filename = "testPPT." + extension;

            Parser parser = new AutoDetectParser();
            final Metadata metadata = new Metadata();
            // TODO: should auto-detect without the resource name
            metadata.set(Metadata.RESOURCE_NAME_KEY, filename);

      // Allow the value to be accessed from the inner class
      final int currentI = i;
            ContentHandler handler = new BodyContentHandler()
    {
        /**
         * Verifies the metadata from inside the handler's
         * {@code startDocument} callback rather than after parsing returns.
         * It checks the content type expected for the current file
         * extension, plus the title, creator and author values.
         *
         * NOTE(review): presumably the parser invokes this when it begins
         * emitting document content, so the assertions confirm the metadata
         * is already populated at that point - confirm against the parser.
         */
        public void startDocument ()
        {
      // Content type must match the expectation for this extension
      assertEquals(
             "Mime-type checking for " + filename,
             mimeTypes[currentI],
             metadata.get(Metadata.CONTENT_TYPE));
      // Document properties read from the same metadata object
      assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE));
      assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR));
      assertEquals("Rajiv", metadata.get(Metadata.AUTHOR));

        }

    };
            ParseContext context = new ParseContext();
   
            InputStream input = getTestDocument(filename);
            try {
                parser.parse(input, handler, metadata, context);
            } finally {
                input.close();
            }
        }
    }
View Full Code Here

     * Tests that a file with both lyrics and
     *  ID3v2 tags gets both extracted correctly
     */
    @Test
    public void testMp3ParsingLyrics() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        // Note - our test file has a lyrics tag, but lacks any
        //  lyrics in the tags, so we can't test that bit
        // TODO Find a better sample file
       
        InputStream stream = Mp3ParserTest.class.getResourceAsStream(
                "/test-documents/testMP3lyrics.mp3");
        try {
            parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }

        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
View Full Code Here

      
       for (int i=0; i<extensions.length; i++) {
          String extension = extensions[i];
          String filename = "testPPT." + extension;

          Parser parser = new AutoDetectParser();
          Metadata metadata = new Metadata();
          metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
          ContentHandler handler = new BodyContentHandler();
          ParseContext context = new ParseContext();
 
          InputStream input = getTestDocument(filename);
          try {
              parser.parse(input, handler, metadata, context);

              // Should get the metadata
              assertEquals(
                    "Mime-type checking for " + filename,
                    mimeTypes[i],
View Full Code Here

    @Test
    public void testProtectedExcelSheets() throws Exception {
        InputStream input = OOXMLParserTest.class
                .getResourceAsStream("/test-documents/protectedSheets.xlsx");

        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();

        try {
            parser.parse(input, handler, metadata, context);

            assertEquals(
                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    metadata.get(Metadata.CONTENT_TYPE));
View Full Code Here

     * This test will check for the complicated set of ID3v2.4
     *  tags.
     */
    @Test
    public void testTIKA424() throws Exception {
       Parser parser = new AutoDetectParser(); // Should auto-detect!
       ContentHandler handler = new BodyContentHandler();
       Metadata metadata = new Metadata();

       InputStream stream = Mp3ParserTest.class.getResourceAsStream(
               "/test-documents/test2.mp3");
       if(stream == null) {
          // You haven't downloaded the file
          // Skip the test
          return;
       }
      
       try {
           parser.parse(stream, handler, metadata, new ParseContext());
       } finally {
           stream.close();
       }

       assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
View Full Code Here

     * In this case, it is a file with JPEG data in the ID3, which
     *  is truncated before the end of the JPEG bit of the ID3 frame.
     */
    @Test
    public void testTIKA474() throws Exception {
       Parser parser = new AutoDetectParser(); // Should auto-detect!
       ContentHandler handler = new BodyContentHandler();
       Metadata metadata = new Metadata();

       InputStream stream = Mp3ParserTest.class.getResourceAsStream(
               "/test-documents/testMP3truncated.mp3");
      
      
       try {
           parser.parse(stream, handler, metadata, new ParseContext());
       } finally {
           stream.close();
       }

       // Check we could get the headers from the start
View Full Code Here

TOP

Related Classes of org.apache.tika.parser.AutoDetectParser

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.