Package org.apache.tika.metadata

Examples of org.apache.tika.metadata.Metadata


                        if (atts.getValue("", "href") != null) {
                            links.add(atts.getValue("", "href"));
                        }
                    }
                },
                new Metadata(),
                new ParseContext());
        assertEquals(1, links.size());
        assertEquals(url, links.get(0));
    }
View Full Code Here


        String test =
            "<html><head><meta http-equiv=\"content-type\""
            + " content=\"text/html; charset=ISO-8859-1\" />"
            + "<title>the name is \u00e1ndre</title>"
            + "</head><body></body></html>";
        Metadata metadata = new Metadata();
        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
    }
View Full Code Here

     * @see <a href="https://issues.apache.org/jira/browse/TIKA-334">TIKA-334</a>
     */
    public void testDetectOfCharset() throws Exception {
        String test =
            "<html><head><title>\u017d</title></head><body></body></html>";
        Metadata metadata = new Metadata();
        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("\u017d", metadata.get(Metadata.TITLE));
    }
View Full Code Here

    public void testUsingCharsetInContentTypeHeader() throws Exception {
        final String test =
            "<html><head><title>the name is \u00e1ndre</title></head>"
            + "<body></body></html>";

        Metadata metadata = new Metadata();
        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));

        metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
    }
View Full Code Here

     * Test case for TIKA-339: Don't use language returned by CharsetDetector
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-339">TIKA-339</a>
     */
    public void testIgnoreCharsetDetectorLanguage() throws Exception {
        String test = "<html><title>Simple Content</title><body></body></html>";
        Metadata metadata = new Metadata();
        metadata.add(Metadata.CONTENT_LANGUAGE, "en");
        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());

        assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
    }
View Full Code Here

        String test1 =
            "<html><head><meta http-equiv=\"content-type\""
            + " content=\"text/html; charset=ISO-8859-1; charset=iso-8859-1\" />"
            + "<title>the name is \u00e1ndre</title>"
            + "</head><body></body></html>";
        Metadata metadata = new Metadata();
        new HtmlParser().parse (
                new ByteArrayInputStream(test1.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));

        // Some HTML pages have errors like ';;' versus '; ' as separator
        String test2 =
            "<html><head><meta http-equiv=\"content-type\""
            + " content=\"text/html;;charset=ISO-8859-1\" />"
            + "<title>the name is \u00e1ndre</title>"
            + "</head><body></body></html>";
        metadata = new Metadata();
        new HtmlParser().parse (
                new ByteArrayInputStream(test2.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
    }
View Full Code Here

    public void testUsingFunkyCharsetInContentTypeHeader() throws Exception {
        final String test =
            "<html><head><title>the name is \u00e1ndre</title></head>"
            + "<body></body></html>";

        Metadata metadata = new Metadata();
        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));

        metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, "charset=ISO-8859-1;text/html");
        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
    }
View Full Code Here

     * Test case for TIKA-357
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-357">TIKA-357</a>
     */
    public void testMetaHttpEquivWithLotsOfPreambleText() throws Exception {
        String path = "/test-documents/big-preamble.html";
        Metadata metadata = new Metadata();
        new HtmlParser().parse(
                HtmlParserTest.class.getResourceAsStream(path),
                new BodyContentHandler(),  metadata, new ParseContext());

        assertEquals("windows-1251", metadata.get(Metadata.CONTENT_ENCODING));
    }
View Full Code Here

            fail("Could not open stream from specified resource: "
                    + tp.resourceRealName);
        }

        try {
            Metadata metadata = new Metadata();
            metadata.set(Metadata.RESOURCE_NAME_KEY, tp.resourceStatedName);
            metadata.set(Metadata.CONTENT_TYPE, tp.statedType);
            StringWriter writer = new StringWriter();
            ContentHandler handler = new WriteOutContentHandler(writer);
            new AutoDetectParser().parse(input, handler, metadata);

            assertEquals("Bad content type: " + tp,
                    tp.realType, metadata.get(Metadata.CONTENT_TYPE));

            assertTrue("Expected content not found: " + tp,
                    writer.toString().contains(tp.expectedContentFragment));
        } finally {
            input.close();
View Full Code Here

        String text =
            "Hello, World! This is simple UTF-8 text content written"
            + " in English to test autodetection of both the character"
            + " encoding and the language of the input stream.";

        Metadata metadata = new Metadata();
        StringWriter writer = new StringWriter();
        parser.parse(
                new ByteArrayInputStream(text.getBytes("UTF-8")),
                new WriteOutContentHandler(writer),
                metadata);
        String content = writer.toString();

        assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
        assertEquals("en", metadata.get(Metadata.LANGUAGE));
        // TODO: ICU reports the content encoding as ISO-8859-1, even though
        // it could just as well be ASCII or UTF-8, so  for now we won't
        // test for the Metadata.CONTENT_ENCODING field

        assertTrue(content.contains("Hello"));
View Full Code Here

TOP

Related Classes of org.apache.tika.metadata.Metadata

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.