Package org.apache.tika.parser

Examples of org.apache.tika.parser.AutoDetectParser


        InputStream input = ODFParserTest.class.getResourceAsStream(
            "/test-documents/testFooter.ods");
        try {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            new AutoDetectParser().parse(input, handler, metadata);
 
            String content = handler.toString();
            assertContains("Here is a footer in the center area", content);
        } finally {
            input.close();
View Full Code Here


     * Test that we can extract information from
     *  a M4A MP4 Audio file
     */
    @Test
    public void testMP4ParsingAudio() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        InputStream stream = MP4ParserTest.class.getResourceAsStream(
                "/test-documents/testMP4.m4a");
        try {
            parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }

        // Check core properties
        assertEquals("audio/mp4", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
        assertEquals("2012-01-28T18:39:18Z", metadata.get(TikaCoreProperties.CREATED));
        assertEquals("2012-01-28T18:39:18Z", metadata.get(Metadata.CREATION_DATE));
        assertEquals("2012-01-28T18:40:25Z", metadata.get(TikaCoreProperties.MODIFIED));
        assertEquals("2012-01-28T18:40:25Z", metadata.get(Metadata.DATE));

        // Check the textual contents
        String content = handler.toString();
        assertTrue(content.contains("Test Title"));
        assertTrue(content.contains("Test Artist"));
        assertTrue(content.contains("Test Album"));
        assertTrue(content.contains("2008"));
        assertTrue(content.contains("Test Comment"));
        assertTrue(content.contains("Test Genre"));
       
        // Check XMPDM-typed audio properties
        assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
        assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
        assertEquals("Test Composer", metadata.get(XMPDM.COMPOSER));
        assertEquals("2008", metadata.get(XMPDM.RELEASE_DATE));
        assertEquals("Test Genre", metadata.get(XMPDM.GENRE));
        assertEquals("Test Comments", metadata.get(XMPDM.LOG_COMMENT.getName()));
        assertEquals("1", metadata.get(XMPDM.TRACK_NUMBER));
       
        assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE));
        //assertEquals("Stereo", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE)); // TODO Extract
        assertEquals("M4A", metadata.get(XMPDM.AUDIO_COMPRESSOR));
       
       
        // Check again by file, rather than stream
        TikaInputStream tstream = TikaInputStream.get(
              MP4ParserTest.class.getResourceAsStream("/test-documents/testMP4.m4a"));
        tstream.getFile();
        try {
           parser.parse(tstream, handler, metadata, new ParseContext());
        } finally {
           tstream.close();
        }
    }
View Full Code Here

     *  shouldn't break on these files either (TIKA-826)
     */
    @Test
    public void testExcelXLSB() throws Exception {
       Detector detector = new DefaultDetector();
       AutoDetectParser parser = new AutoDetectParser();
      
       InputStream input = ExcelParserTest.class.getResourceAsStream(
             "/test-documents/testEXCEL.xlsb");
       Metadata m = new Metadata();
       m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");
      
       // Should be detected correctly
       MediaType type = null;
       try {
          type = detector.detect(input, m);
          assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString());
       } finally {
          input.close();
       }
      
       // OfficeParser won't handle it
       assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
      
       // OOXMLParser won't handle it
       assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
      
       // AutoDetectParser doesn't break on it
       input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb");

       try {
          ContentHandler handler = new BodyContentHandler(-1);
          ParseContext context = new ParseContext();
          context.set(Locale.class, Locale.US);
          parser.parse(input, handler, m, context);

          String content = handler.toString();
          assertEquals("", content);
       } finally {
          input.close();
View Full Code Here

     *  but we shouldn't break on these files either (TIKA-976)
     */
    @Test
    public void testExcel95() throws Exception {
       Detector detector = new DefaultDetector();
       AutoDetectParser parser = new AutoDetectParser();
      
       InputStream input = ExcelParserTest.class.getResourceAsStream(
             "/test-documents/testEXCEL_95.xls");
       Metadata m = new Metadata();
       m.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
      
       // Should be detected correctly
       MediaType type = null;
       try {
          type = detector.detect(input, m);
          assertEquals("application/vnd.ms-excel", type.toString());
       } finally {
          input.close();
       }
      
       // OfficeParser will claim to handle it
       assertEquals(true, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
      
       // OOXMLParser won't handle it
       assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
      
       // AutoDetectParser doesn't break on it
       input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls");

       try {
          ContentHandler handler = new BodyContentHandler(-1);
          ParseContext context = new ParseContext();
          context.set(Locale.class, Locale.US);
          parser.parse(input, handler, m, context);

          String content = handler.toString();
          assertEquals("", content);
       } finally {
          input.close();
View Full Code Here

    return GREETING;
  }

  @SuppressWarnings("serial")
  public static AutoDetectParser createParser() {
    final AutoDetectParser parser = new AutoDetectParser();

    Map<MediaType,Parser> parsers = parser.getParsers();
    parsers.put(MediaType.APPLICATION_XML, new HtmlParser());
    parser.setParsers(parsers);

    parser.setFallback(new Parser() {
      public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
        return parser.getSupportedTypes(parseContext);
      }

      public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) {
        throw new WebApplicationException(Response.Status.UNSUPPORTED_MEDIA_TYPE);
      }
View Full Code Here

  @Produces("text/plain")
  public StreamingOutput getText(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) {
    return produceText(is, httpHeaders.getRequestHeaders(), info);
  }
  public StreamingOutput produceText(final InputStream is, MultivaluedMap<String, String> httpHeaders, final UriInfo info) {   
    final AutoDetectParser parser = createParser();
    final Metadata metadata = new Metadata();

    fillMetadata(parser, metadata, httpHeaders);

    logRequest(logger, info, metadata);

    return new StreamingOutput() {
      public void write(OutputStream outputStream) throws IOException, WebApplicationException {
        Writer writer = new OutputStreamWriter(outputStream, "UTF-8");

        BodyContentHandler body = new BodyContentHandler(new RichTextContentHandler(writer));

        TikaInputStream tis = TikaInputStream.get(is);

        try {
            parser.parse(tis, body, metadata);
        } catch (SAXException e) {
          throw new WebApplicationException(e);
        } catch (EncryptedDocumentException e) {
          logger.warn(String.format(
                  "%s: Encrypted document",
View Full Code Here

    return produceOutput(is, httpHeaders.getRequestHeaders(), info, "xml");
  }
 
  private StreamingOutput produceOutput(final InputStream is, final MultivaluedMap<String, String> httpHeaders,
        final UriInfo info, final String format) {
    final AutoDetectParser parser = createParser();
    final Metadata metadata = new Metadata();

    fillMetadata(parser, metadata, httpHeaders);

    logRequest(logger, info, metadata);

    return new StreamingOutput() {
      public void write(OutputStream outputStream)
        throws IOException, WebApplicationException {
        Writer writer = new OutputStreamWriter(outputStream, "UTF-8");
        ContentHandler content;

        try {
          SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance( );
          TransformerHandler handler = factory.newTransformerHandler( );
          handler.getTransformer().setOutputProperty(OutputKeys.METHOD, format);
          handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
          handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
          handler.setResult(new StreamResult(writer));
          content = new ExpandedTitleContentHandler( handler );
        }
        catch ( TransformerConfigurationException e ) {
          throw new WebApplicationException( e );
        }

        TikaInputStream tis = TikaInputStream.get(is);

        try {
          parser.parse(tis, content, metadata);
        }
        catch (SAXException e) {
          throw new WebApplicationException(e);
        }
        catch (EncryptedDocumentException e) {
View Full Code Here

    /**
     * Test that with only ID3v1 tags, we get some information out  
     */
    @Test
    public void testMp3ParsingID3v1() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        InputStream stream = Mp3ParserTest.class.getResourceAsStream(
                "/test-documents/testMP3id3v1.mp3");
        try {
            parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }

        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
View Full Code Here

     * Test that with only ID3v2 tags, we get the full
     *  set of information out.
     */
    @Test
    public void testMp3ParsingID3v2() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        InputStream stream = Mp3ParserTest.class.getResourceAsStream(
                "/test-documents/testMP3id3v2.mp3");
        try {
            parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }

        // Check core properties
View Full Code Here

     * Test that with both id3v2 and id3v1, we prefer the
     *  details from id3v2
     */
    @Test
    public void testMp3ParsingID3v1v2() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        InputStream stream = Mp3ParserTest.class.getResourceAsStream(
                "/test-documents/testMP3id3v1_v2.mp3");
        try {
            parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }

        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
View Full Code Here

TOP

Related Classes of org.apache.tika.parser.AutoDetectParser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.