Package org.apache.tika.sax

Examples of org.apache.tika.sax.WriteOutContentHandler


    public void testEmptyText() throws Exception {
        Metadata metadata = new Metadata();
        StringWriter writer = new StringWriter();
        parser.parse(
                new ByteArrayInputStream(new byte[0]),
                new WriteOutContentHandler(writer),
                metadata);
        String content = writer.toString();
        assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("", content);
    }
View Full Code Here


        InputStream input = ExcelParserTest.class.getResourceAsStream(
                "/test-documents/testEXCEL.xls");
        try {
            Metadata metadata = new Metadata();
            StringWriter writer = new StringWriter();
            ContentHandler handler = new WriteOutContentHandler(writer);
            new ExcelParser().parse(input, handler, metadata);

            assertEquals(
                    "application/vnd.ms-excel",
                    metadata.get(Metadata.CONTENT_TYPE));
View Full Code Here

        StringWriter writer = new StringWriter();
        Metadata metadata = new Metadata();

        parser.parse(getStream("test-documents/testHTML.html"),
                new WriteOutContentHandler(writer), metadata);
        String content = writer.toString();

        assertTrue("Did not contain expected text:"
                + "Title : Test Indexation Html", content
                .contains("Title : Test Indexation Html"));
View Full Code Here

        StringWriter writer = new StringWriter();
        Metadata metadata = new Metadata();

        parser.parse(getStream("test-documents/testHTML_utf8.html"),
                new WriteOutContentHandler(writer), metadata);
        String content = writer.toString();

        assertTrue("Did not contain expected text:"
                + "Title : Tilte with UTF-8 chars öäå", content
                .contains("Title : Tilte with UTF-8 chars öäå"));
View Full Code Here

    public void testParseEmpty() throws Exception {
        Metadata metadata = new Metadata();
        StringWriter writer = new StringWriter();
        parser.parse(new ByteArrayInputStream(new byte[0]),
                new WriteOutContentHandler(writer), metadata);
        String content = writer.toString();
        assertEquals("", content);
    }
View Full Code Here

        InputStream input = WordParserTest.class.getResourceAsStream(
                "/test-documents/testWORD.doc");
        try {
            Metadata metadata = new Metadata();
            StringWriter writer = new StringWriter();
            ContentHandler handler = new WriteOutContentHandler(writer);
            new WordParser().parse(input, handler, metadata);

            assertEquals(
                    "application/msword",
                    metadata.get(Metadata.CONTENT_TYPE));
View Full Code Here

        InputStream input = PowerPointParserTest.class.getResourceAsStream(
                "/test-documents/testPPT.ppt");
        try {
            Metadata metadata = new Metadata();
            StringWriter writer = new StringWriter();
            ContentHandler handler = new WriteOutContentHandler(writer);
            new PowerPointParser().parse(input, handler, metadata);

            assertEquals(
                    "application/vnd.ms-powerpoint",
                    metadata.get(Metadata.CONTENT_TYPE));
View Full Code Here

            throws TikaException, IOException {
        try {
            Parser parser = config.getParser(mimeType);
            StringWriter writer = new StringWriter();
            parser.parse(
                    stream, new WriteOutContentHandler(writer), new Metadata());
            return writer.toString();
        } catch (SAXException e) {
            throw new TikaException("Unexpected SAX error", e);
        }
    }
View Full Code Here

    public void parse(
            InputStream stream, ContentHandler handler, Metadata metadata)
            throws IOException, SAXException, TikaException {
        StringWriter writer = new StringWriter();
        handler = new TeeContentHandler(
                handler, new WriteOutContentHandler(writer));
        super.parse(stream, handler, metadata);

        String content = writer.toString();
        metadata.set("fulltext", content);
View Full Code Here

     * @throws IOException if the document can not be read
     * @throws TikaException if the document can not be parsed
     */
    public String parseToString(InputStream stream, Metadata metadata)
            throws IOException, TikaException {
        WriteOutContentHandler handler =
            new WriteOutContentHandler(maxStringLength);
        try {
            ParseContext context = new ParseContext();
            context.set(Parser.class, parser);
            parser.parse(
                    stream, new BodyContentHandler(handler), metadata, context);
        } catch (SAXException e) {
            if (!handler.isWriteLimitReached(e)) {
                // This should never happen with BodyContentHandler...
                throw new TikaException("Unexpected SAX processing failure", e);
            }
        } finally {
            stream.close();
        }
        return handler.toString();
    }
View Full Code Here

TOP

Related Classes of org.apache.tika.sax.WriteOutContentHandler

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.