Examples of BodyContentHandler


Examples of org.apache.tika.sax.BodyContentHandler

        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
    }

    private void assertExtractText(String msg, String expected, byte[] input)
            throws Exception {
        ContentHandler handler = new BodyContentHandler() {
            public void ignorableWhitespace(char[] ch, int off, int len) {
                // Ignore the whitespace added by XHTMLContentHandler
            }
        };
        Metadata metadata = new Metadata();
        parser.parse(new ByteArrayInputStream(input), handler, metadata, new ParseContext());
        assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals(msg, expected, handler.toString());
    }
View Full Code Here

Examples of org.apache.tika.sax.BodyContentHandler

        Metadata metadata = new Metadata();
        metadata.set(Metadata.LANGUAGE, "en");

        parser.parse(
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());

        assertEquals("en", metadata.get(Metadata.LANGUAGE));
    }
View Full Code Here

Examples of org.apache.tika.sax.BodyContentHandler

        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        // TODO: should auto-detect without the resource name
        metadata.set(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xlsx");
        ContentHandler handler = new BodyContentHandler();

        try {
            parser.parse(input, handler, metadata);

            assertEquals(
                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    metadata.get(Metadata.CONTENT_TYPE));
            assertEquals("Simple Excel document", metadata.get(Metadata.TITLE));
            assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
            String content = handler.toString();
            assertTrue(content.contains("Sample Excel Worksheet"));
            assertTrue(content.contains("Numbers and their Squares"));
            assertTrue(content.contains("9"));
            assertFalse(content.contains("9.0"));
            assertTrue(content.contains("196"));
View Full Code Here

Examples of org.apache.tika.sax.BodyContentHandler

        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        // TODO: should auto-detect without the resource name
        metadata.set(Metadata.RESOURCE_NAME_KEY, "testEXCEL-formats.xlsx");
        ContentHandler handler = new BodyContentHandler();

        try {
            parser.parse(input, handler, metadata);

            assertEquals(
                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    metadata.get(Metadata.CONTENT_TYPE));

            String content = handler.toString();

            // Number #,##0.00
            assertTrue(content.contains("1,599.99"));
            assertTrue(content.contains("-1,599.99"));
View Full Code Here

Examples of org.apache.tika.sax.BodyContentHandler

        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        // TODO: should auto-detect without the resource name
        metadata.set(Metadata.RESOURCE_NAME_KEY, "testPPT.pptx");
        ContentHandler handler = new BodyContentHandler();

        try {
            parser.parse(input, handler, metadata);

            assertEquals(
                    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
                    metadata.get(Metadata.CONTENT_TYPE));
            assertEquals("Sample Powerpoint Slide", metadata.get(Metadata.TITLE));
            assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
            String content = handler.toString();
            assertTrue(content.contains("Sample Powerpoint Slide"));
            assertTrue(content.contains("Powerpoint X for Mac"));
        } finally {
            input.close();
        }
View Full Code Here

Examples of org.apache.tika.sax.BodyContentHandler

        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        // TODO: should auto-detect without the resource name
        metadata.set(Metadata.RESOURCE_NAME_KEY, "testWORD.docx");
        ContentHandler handler = new BodyContentHandler();

        try {
            parser.parse(input, handler, metadata);

            assertEquals(
                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                    metadata.get(Metadata.CONTENT_TYPE));
            assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
            assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
            assertTrue(handler.toString().contains("Sample Word Document"));
        } finally {
            input.close();
        }
    }
View Full Code Here

Examples of org.apache.tika.sax.BodyContentHandler

    public void testParseAscii() throws Exception {
        String path = "/test-documents/testHTML.html";
        final StringWriter href = new StringWriter();
        final StringWriter name = new StringWriter();
        ContentHandler body = new BodyContentHandler();
        Metadata metadata = new Metadata();
        InputStream stream = HtmlParserTest.class.getResourceAsStream(path);
        try {
            ContentHandler link = new DefaultHandler() {
                @Override
                public void startElement(
                        String u, String l, String n, Attributes a)
                        throws SAXException {
                    if ("a".equals(l)) {
                        if (a.getValue("href") != null) {
                            href.append(a.getValue("href"));
                        } else if (a.getValue("name") != null) {
                            name.append(a.getValue("name"));
                        }
                    }
                }
            };
            new HtmlParser().parse(
                    stream, new TeeContentHandler(body, link),
                    metadata, new ParseContext());
        } finally {
            stream.close();
        }

        assertEquals(
                "Title : Test Indexation Html", metadata.get(Metadata.TITLE));
        assertEquals("Tika Developers", metadata.get("Author"));
        assertEquals("5", metadata.get("refresh"));

        assertEquals("http://www.apache.org/", href.toString());
        assertEquals("test-anchor", name.toString());

        String content = body.toString();
        assertTrue(
                "Did not contain expected text:" + "Test Indexation Html",
                content.contains("Test Indexation Html"));
        assertTrue(
                "Did not contain expected text:" + "Indexation du fichier",
View Full Code Here

Examples of org.apache.tika.sax.BodyContentHandler

        assertTrue(content.contains("extract content"));
        assertTrue(content.contains("an XHTML document"));
    }

    public void testParseEmpty() throws Exception {
        ContentHandler handler = new BodyContentHandler();
        new HtmlParser().parse(
                new ByteArrayInputStream(new byte[0]),
                handler,  new Metadata(), new ParseContext());
        assertEquals("", handler.toString());
    }
View Full Code Here

Examples of org.apache.tika.sax.BodyContentHandler

            + "<title>the name is \u00e1ndre</title>"
            + "</head><body></body></html>";
        Metadata metadata = new Metadata();
        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
    }
View Full Code Here

Examples of org.apache.tika.sax.BodyContentHandler

        String test =
            "<html><head><title>\u017d</title></head><body></body></html>";
        Metadata metadata = new Metadata();
        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("\u017d", metadata.get(Metadata.TITLE));
    }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.