Package org.apache.tika.parser

Examples of org.apache.tika.parser.ParseContext


        "</map></p></body></html>";

        StringWriter sw = new StringWriter();
        new HtmlParser().parse(
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                makeHtmlTransformer(sw), new Metadata(), new ParseContext());

        String result = sw.toString();

        // <map> tag should exist, with <area> tag with fully resolved URL
        assertTrue(Pattern.matches("(?s).*<map .*<area .* href=\"http://domain.com/map.html\".*</map>.*$", result));
View Full Code Here


        "</object></p></body></html>";

        StringWriter sw = new StringWriter();
        new HtmlParser().parse(
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                makeHtmlTransformer(sw), new Metadata(), new ParseContext());

        String result = sw.toString();

        // <object> tag should exist with fully resolved URLs
        assertTrue(
View Full Code Here

        metadata.add("Language", null);

        StringWriter sw = new StringWriter();
        new HtmlParser().parse(
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                makeHtmlTransformer(sw), metadata, new ParseContext());

        String result = sw.toString();

        // <meta> tag for Content-Type should exist, but nothing for Language
        assertTrue(Pattern.matches("(?s).*<meta name=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>.*$", result));
View Full Code Here

        "</head><body><frameset><frame src=\"frame.html\" /></frameset></body></html>";

        StringWriter sw1 = new StringWriter();
        new HtmlParser().parse(
                new ByteArrayInputStream(test1.getBytes("UTF-8")),
                makeHtmlTransformer(sw1), new Metadata(), new ParseContext());

        String result = sw1.toString();

        // <frame> tag should exist, with fully resolved URL
        assertTrue(Pattern.matches("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$", result));

        // <body> tag should not exist.
        assertFalse(Pattern.matches("(?s).*<body>.*$", result));

        // Test the example from the Nutch project.
        final String test2 = "<html><head><title> my title </title></head><body>" +
        "<frameset rows=\"20,*\"><frame src=\"top.html\"></frame>" +
        "<frameset cols=\"20,*\"><frame src=\"left.html\"></frame>" +
        "<frame src=\"invalid.html\"/></frame>" +
        "<frame src=\"right.html\"></frame>" +
        "</frameset></frameset></body></html>";

        StringWriter sw2 = new StringWriter();
        new HtmlParser().parse(
                new ByteArrayInputStream(test2.getBytes("UTF-8")),
                makeHtmlTransformer(sw2), new Metadata(), new ParseContext());

        result = sw2.toString();

        // <frame> tags should exist, with relative URL (no base element specified)
        assertTrue(Pattern.matches("(?s).*<frame .* src=\"top.html\"/>.*$", result));
View Full Code Here

        Metadata metadata = new Metadata();
        StringWriter sw = new StringWriter();
        new HtmlParser().parse(
                HtmlParserTest.class.getResourceAsStream(path),
                makeHtmlTransformer(sw),  metadata, new ParseContext());

        String content = sw.toString();

        // Should have <html>, <head>, <title>, <body> elements
        assertTrue(Pattern.matches("(?s).*<html xmlns=\"http://www.w3.org/1999/xhtml\">.*</html>.*$", content));
View Full Code Here

        "</head><body></body></html>";

        StringWriter sw = new StringWriter();
        new HtmlParser().parse(
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                makeHtmlTransformer(sw), new Metadata(), new ParseContext());

        String result = sw.toString();

        // <link> tag should exist in <head>, with fully resolved URL
        assertTrue(Pattern.matches("(?s).*<head>.*<link rel=\"next\" href=\"http://domain.com/next.html\"/>.*</head>.*$", result));
View Full Code Here

        BoilerpipeContentHandler bpch = new BoilerpipeContentHandler(ch);
        bpch.setIncludeMarkup(true);

        new HtmlParser().parse(
                HtmlParserTest.class.getResourceAsStream(path),
                bpch,  metadata, new ParseContext());

        String content = sw.toString();
        assertTrue("Has empty table elements", content.contains("<body><table><tr><td><table><tr><td>"));
        assertTrue("Has empty a element", content.contains("<a shape=\"rect\" href=\"Main.php\"/>"));
        assertTrue("Has real content", content.contains("<p>This is the real meat"));
View Full Code Here

    @Test
    public void testIdentityMapper() throws Exception {
        final String html = "<html><head><title>Title</title></head>" +
                "<body></body></html>";
        Metadata metadata = new Metadata();
        ParseContext parseContext = new ParseContext();
        parseContext.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);

        StringWriter sw = new StringWriter();

        new HtmlParser().parse (
                new ByteArrayInputStream(html.getBytes("UTF-8")),
View Full Code Here

                "<body><ul><li>one</li></ul></body></html>";

        BodyContentHandler handler = new BodyContentHandler();
        new HtmlParser().parse(
                new ByteArrayInputStream(html.getBytes("UTF-8")),
                handler,  new Metadata(), new ParseContext());

        // Make sure we get <tab>, "one", newline, newline
        String result = handler.toString();

        assertTrue(Pattern.matches("\tone\n\n", result));
View Full Code Here

        BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(handler);
        bpHandler.setIncludeMarkup(true);
       
        new HtmlParser().parse(
                HtmlParserTest.class.getResourceAsStream(path),
                bpHandler,  metadata, new ParseContext());
       
        String content = handler.toString();

        // Should not contain item_aitem_b
        assertFalse(content.contains("item_aitem_b"));
View Full Code Here

TOP

Related Classes of org.apache.tika.parser.ParseContext

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.