Package org.apache.tika.parser

Examples of org.apache.tika.parser.Parser


        String s2 = ParseUtils.getStringContent(file, tc,
        "application/vnd.ms-excel");
        assertEquals(s1, s2);
        assertTrue("Text does not contain '" + expected + "'", s1
                .contains(expected));
        Parser parser = tc.getParser("application/vnd.ms-excel");
        Metadata metadata = new Metadata();
        InputStream stream = new FileInputStream(file);
        try {
            parser.parse(stream, new DefaultHandler(), metadata);
        } finally {
            stream.close();
        }
        assertEquals("Simple Excel document", metadata.get(Metadata.TITLE));
    }
View Full Code Here


        File file = getTestFile("testHTML.html");
        String s1 = ParseUtils.getStringContent(file, tc);
        String s2 = ParseUtils.getStringContent(file, tc, "text/html");
        assertEquals(s1, s2);

        Parser parser = tc.getParser("text/html");
        assertNotNull(parser);
    }
View Full Code Here

    public void testZipExtraction() throws Exception {
        File zip = getTestFile("test-documents.zip");
        List<Parser> parsers = ParseUtils.getParsersFromZip(zip, tc);
        List<File> zipFiles = Utils.unzip(new FileInputStream(zip));
        for (int i = 0; i < parsers.size(); i++) {
            Parser zipEntryParser = parsers.get(i);
            assertNotNull(zipEntryParser);
            for (int j = 0; j < zipFiles.size(); j++) {
                /* FIXME: Doesn't work with the new Parser interface
                ParserConfig config = tc.getParserConfig(
                        zipEntryParser.getMimeType());
View Full Code Here

     */
    public static String getStringContent(
            InputStream stream, TikaConfig config, String mimeType)
            throws TikaException, IOException {
        try {
            Parser parser = config.getParser(mimeType);
            StringWriter writer = new StringWriter();
            parser.parse(
                    stream, new WriteOutContentHandler(writer), new Metadata());
            return writer.toString();
        } catch (SAXException e) {
            throw new TikaException("Unexpected SAX error", e);
        }
View Full Code Here

        File file = getResourceAsFile("/test-documents/testPPT.ppt");
        String s1 = ParseUtils.getStringContent(file, tc);
        String s2 = ParseUtils.getStringContent(file, tc,
                "application/vnd.ms-powerpoint");
        assertEquals(s1, s2);
        Parser parser = tc.getParser("application/vnd.ms-powerpoint");
        Metadata metadata = new Metadata();
        InputStream stream = new FileInputStream(file);
        try {
            parser.parse(stream, new DefaultHandler(), metadata);
        } finally {
            stream.close();
        }
        assertEquals("Sample Powerpoint Slide", metadata.get(Metadata.TITLE));
    }
View Full Code Here

    public void testWORDxtraction() throws Exception {
        File file = getResourceAsFile("/test-documents/testWORD.doc");
        String s1 = ParseUtils.getStringContent(file, tc);
        String s2 = ParseUtils.getStringContent(file, tc, "application/msword");
        assertEquals(s1, s2);
        Parser parser = tc.getParser("application/msword");
        Metadata metadata = new Metadata();
        InputStream stream = new FileInputStream(file);
        try {
            parser.parse(stream, new DefaultHandler(), metadata);
        } finally {
            stream.close();
        }
        assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
    }
View Full Code Here

        String s2 = ParseUtils.getStringContent(file, tc,
                "application/vnd.ms-excel");
        assertEquals(s1, s2);
        assertTrue("Text does not contain '" + expected + "'", s1
                .contains(expected));
        Parser parser = tc.getParser("application/vnd.ms-excel");
        Metadata metadata = new Metadata();
        InputStream stream = new FileInputStream(file);
        try {
            parser.parse(stream, new DefaultHandler(), metadata);
        } finally {
            stream.close();
        }
        assertEquals("Simple Excel document", metadata.get(Metadata.TITLE));
    }
View Full Code Here

        File file = getResourceAsFile("/test-documents/testHTML.html");
        String s1 = ParseUtils.getStringContent(file, tc);
        String s2 = ParseUtils.getStringContent(file, tc, "text/html");
        assertEquals(s1, s2);

        Parser parser = tc.getParser("text/html");
        assertNotNull(parser);
    }
View Full Code Here

        File file = getResourceAsFile("/test-documents/test-documents.zip");
        String s1 = ParseUtils.getStringContent(file, tc);
        String s2 = ParseUtils.getStringContent(file, tc, "application/zip");
        assertEquals(s1, s2);

        Parser parser = tc.getParser("application/zip");
        assertNotNull(parser);
    }
View Full Code Here

        File file = getResourceAsFile("/test-documents/testMP3id3v1.mp3");
        String s1 = ParseUtils.getStringContent(file, tc);
        String s2 = ParseUtils.getStringContent(file, tc, "audio/mpeg");
        assertEquals(s1, s2);

        Parser parser = tc.getParser("audio/mpeg");
        assertNotNull(parser);
    }
View Full Code Here

TOP

Related Classes of org.apache.tika.parser.Parser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.