Package org.apache.tika.metadata

Examples of org.apache.tika.metadata.Metadata


        return shorter;
    }

    public String getType(String typeName, String url, byte[] data) {
        try {
            Metadata metadata = new Metadata();
            if (url != null) {
                metadata.set(Metadata.RESOURCE_NAME_KEY, url);
            }
            if (typeName != null) {
                metadata.set(Metadata.CONTENT_TYPE, typeName);
            }
            return detect(new ByteArrayInputStream(data), metadata).toString();
        } catch (IOException e) {
            throw new IllegalStateException(
                    "ByteArrayInputStream throws an IOException!", e);
View Full Code Here


     * @throws IOException if the document can not be accessed
     */
    public String getType(URL url) throws IOException {
        InputStream stream = url.openStream();
        try {
            Metadata metadata = new Metadata();
            metadata.set(Metadata.RESOURCE_NAME_KEY, url.toString());
            return detect(stream, metadata).toString();
        } finally {
            stream.close();
        }
    }
View Full Code Here

            type = TEXT;
        } else if (arg.equals("-m") || arg.equals("--metadata")) {
            type = METADATA;
        } else {
            pipeMode = false;
            metadata = new Metadata();
            if (arg.equals("-")) {
                parser.parse(
                        System.in, type.getContentHandler(),
                        metadata, context);
            } else {
View Full Code Here

    public void testExcelParser() throws Exception {
        InputStream input = ExcelParserTest.class.getResourceAsStream(
                "/test-documents/testEXCEL.xls");
        try {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            new OfficeParser().parse(input, handler, metadata);

            assertEquals(
                    "application/vnd.ms-excel",
                    metadata.get(Metadata.CONTENT_TYPE));
            assertEquals("Simple Excel document", metadata.get(Metadata.TITLE));
            assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
            String content = handler.toString();
            assertTrue(content.contains("Sample Excel Worksheet"));
            assertTrue(content.contains("Numbers and their Squares"));
            assertTrue(content.contains("\t\tNumber\tSquare"));
            assertTrue(content.contains("9"));
View Full Code Here

    public void testExcelParserFormatting() throws Exception {
        InputStream input = ExcelParserTest.class.getResourceAsStream(
                "/test-documents/testEXCEL-formats.xls");
        try {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            new OfficeParser().parse(input, handler, metadata);

            assertEquals(
                    "application/vnd.ms-excel",
                    metadata.get(Metadata.CONTENT_TYPE));

            String content = handler.toString();

            // Number #,##0.00
            assertTrue(content.contains("1,599.99"));
View Full Code Here

        String text =
            "Hello, World! This is simple UTF-8 text content written"
            + " in English to test autodetection of both the character"
            + " encoding and the language of the input stream.";

        Metadata metadata = new Metadata();
        StringWriter writer = new StringWriter();
        parser.parse(
                new ByteArrayInputStream(text.getBytes("UTF-8")),
                new WriteOutContentHandler(writer),
                metadata,
                new ParseContext());
        String content = writer.toString();

        assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
        assertEquals("en", metadata.get(Metadata.LANGUAGE));
        // TODO: ICU reports the content encoding as ISO-8859-1, even though
        // it could just as well be ASCII or UTF-8, so  for now we won't
        // test for the Metadata.CONTENT_ENCODING field

        assertTrue(content.contains("Hello"));
View Full Code Here

    public void testUTF8Text() throws Exception {
        String text = "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n";

        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        parser.parse(
                new ByteArrayInputStream(text.getBytes("UTF-8")),
                handler, metadata, new ParseContext());
        assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));

        assertTrue(handler.toString().contains(text));
    }
View Full Code Here

        assertTrue(handler.toString().contains(text));
    }

    public void testEmptyText() throws Exception {
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        parser.parse(
                new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
        assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("\n", handler.toString());
    }
View Full Code Here

    public void testUseIncomingCharsetAsHint() throws Exception {
        // Could be UTF-8 or ISO 8859-1 or ...
        // u00e1 is latin small letter a with acute
        final String test2 = "the name is \u00e1ndre";

        Metadata metadata = new Metadata();
        parser.parse(
                new ByteArrayInputStream(test2.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());
       
        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));

        metadata.set(Metadata.CONTENT_ENCODING, "ISO-8859-1");
        parser.parse(
                new ByteArrayInputStream(test2.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());
       
        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
    }
View Full Code Here

    public void testUsingCharsetInContentTypeHeader() throws Exception {
        // Could be UTF-8 or ISO 8859-1 or ...
        // u00e1 is latin small letter a with acute
        final String test2 = "the name is \u00e1ndre";

        Metadata metadata = new Metadata();
        parser.parse(
                new ByteArrayInputStream(test2.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());

        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));

        metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
        parser.parse(
                new ByteArrayInputStream(test2.getBytes("UTF-8")),
                new BodyContentHandler(),  metadata, new ParseContext());

        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
    }
View Full Code Here

TOP

Related Classes of org.apache.tika.metadata.Metadata

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.