Package org.apache.tika.parser

Examples of org.apache.tika.parser.ParseContext


            + "<body></body></html>";

        Metadata metadata = new Metadata();
        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));

        metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("ISO-8859-1")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
    }
View Full Code Here


        String test = "<html><title>Simple Content</title><body></body></html>";
        Metadata metadata = new Metadata();
        metadata.add(Metadata.CONTENT_LANGUAGE, "en");
        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());

        assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
    }
View Full Code Here

            + "<title>the name is \u00e1ndre</title>"
            + "</head><body></body></html>";
        Metadata metadata = new Metadata();
        new HtmlParser().parse (
                new ByteArrayInputStream(test1.getBytes("ISO-8859-1")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));

        // Some HTML pages have errors like ';;' versus '; ' as separator
        String test2 =
            "<html><head><meta http-equiv=\"content-type\""
            + " content=\"text/html;;charset=ISO-8859-15\" />"
            + "<title>the name is \u00e1ndre</title>"
            + "</head><body></body></html>";
        metadata = new Metadata();
        new HtmlParser().parse (
                new ByteArrayInputStream(test2.getBytes("ISO-8859-1")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
    }
View Full Code Here

            + "<body></body></html>";

        Metadata metadata = new Metadata();
        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));

        metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, "charset=ISO-8859-1;text/html");
        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("ISO-8859-1")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
    }
View Full Code Here

    public void testMetaHttpEquivWithLotsOfPreambleText() throws Exception {
        String path = "/test-documents/big-preamble.html";
        Metadata metadata = new Metadata();
        new HtmlParser().parse(
                HtmlParserTest.class.getResourceAsStream(path),
                new BodyContentHandler(),  metadata, new ParseContext());

        assertEquals("windows-1251", metadata.get(Metadata.CONTENT_ENCODING));
    }
View Full Code Here

        Metadata metadata = new Metadata();
        BodyContentHandler handler = new BodyContentHandler();
        new HtmlParser().parse(
                HtmlParserTest.class.getResourceAsStream(path),
                new BoilerpipeContentHandler(handler),  metadata, new ParseContext());

        String content = handler.toString();
        assertTrue(content.startsWith("This is the real meat"));
        assertTrue(content.endsWith("This is the end of the text.\n"));
        assertFalse(content.contains("boilerplate"));
View Full Code Here

        "</head><body><p>Simple Content</p></body></html>";

        StringWriter sw = new StringWriter();
        new HtmlParser().parse(
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                makeHtmlTransformer(sw), new Metadata(), new ParseContext());

        String result = sw.toString();

        // Title element in <head> section
        assertTrue(Pattern.matches("(?s)<html.*<head>.*<title>Title</title>.*</head>.*$", result));
View Full Code Here

        "</head><body><img src=\"image.jpg\" /></body></html>";

        StringWriter sw = new StringWriter();
        new HtmlParser().parse(
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                makeHtmlTransformer(sw), new Metadata(), new ParseContext());

        String result = sw.toString();

        // <img> tag should exist, with fully resolved URL
        assertTrue(Pattern.matches("(?s).*src=\"http://domain.com/image.jpg\".*$", result));
View Full Code Here

        "</head><frameset><frame src=\"frame.html\" /></frameset></html>";

        StringWriter sw = new StringWriter();
        new HtmlParser().parse(
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                makeHtmlTransformer(sw), new Metadata(), new ParseContext());

        String result = sw.toString();

        // <frame> tag should exist, with fully resolved URL
        assertTrue(Pattern.matches("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$", result));
View Full Code Here

        "<p>Your browser doesn't support iframes!</p></body></html>";

        StringWriter sw = new StringWriter();
        new HtmlParser().parse(
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                makeHtmlTransformer(sw), new Metadata(), new ParseContext());

        String result = sw.toString();

        // <iframe> tag should exist, with fully resolved URL
        assertTrue(Pattern.matches("(?s).*<iframe .* src=\"http://domain.com/framed.html\".*$", result));
View Full Code Here

TOP

Related Classes of org.apache.tika.parser.ParseContext

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.