Package org.apache.tika.sax

Examples of org.apache.tika.sax.ToXMLContentHandler


    @Test
    public void testParser() throws Exception {

        AutoDetectParser parser = new AutoDetectParser();
        ToXMLContentHandler handler = new ToXMLContentHandler();
        Metadata metadata = new Metadata();
        String path = "/test-documents/breidamerkurjokull_radar_profiles_2009.mat";

        InputStream stream = MatParser.class.getResourceAsStream(path);

        try {
            parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }

        //Check Metadata
        assertEquals("PCWIN64", metadata.get("platform"));
        assertEquals("MATLAB 5.0 MAT-file", metadata.get("fileType"));
        assertEquals("IM", metadata.get("endian"));
        assertEquals("Thu Feb 21 15:52:49 2013", metadata.get("createdOn"));

        //Check Content
        String content = handler.toString();

        assertTrue(content.contains("<li>[1x909  double array]</li>"));
        assertTrue(content.contains("<p>c1:[1x1  struct array]</p>"));
        assertTrue(content.contains("<li>[1024x1  double array]</li>"));
        assertTrue(content.contains("<p>b1:[1x1  struct array]</p>"));
View Full Code Here


    @Test
    public void testParserForText() throws Exception {

        Parser parser = new MatParser();
        ToXMLContentHandler handler = new ToXMLContentHandler();
        Metadata metadata = new Metadata();
        String path = "/test-documents/test_mat_text.mat";

        InputStream stream = MatParser.class.getResourceAsStream(path);

        try {
            parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }

        //Check Content
        String content = handler.toString();
        assertTrue(content.contains("<p>double:[2x2  double array]</p>"));
    }
View Full Code Here

                    "writing the text/plain version of the parsed content",e);
            }
            final Writer plainTextWriter = new OutputStreamWriter(plainTextSink.getOutputStream(), UTF8);
            final ContentHandler textHandler = new BodyContentHandler( //only the Body
                new PlainTextHandler(plainTextWriter, false,skipLinebreaks)); //skip ignoreable
            final ToXMLContentHandler xhtmlHandler;
            final ContentHandler mainHandler;
            ContentSink xhtmlSink = null;
            try {
                if(!plainMediaType.equals(XHTML)){ //do not parse XHTML from XHTML
                    try {
                        xhtmlSink = ciFactory.createContentSink(XHTML +"; charset="+UTF8.name());
                    } catch (IOException e) {
                        throw new EngineException("Error while initialising Blob for" +
                                "writing the application/xhtml+xml version of the parsed content",e);
                    }
                    try {
                        xhtmlHandler = new ToXMLContentHandler(xhtmlSink.getOutputStream(),UTF8.name());
                    } catch (UnsupportedEncodingException e) {
                        throw new EngineException("This system does not support the encoding "+UTF8,e);
                    }
                    mainHandler = new MultiHandler(textHandler,xhtmlHandler);
                } else {
View Full Code Here

TOP

Related Classes of org.apache.tika.sax.ToXMLContentHandler

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.