Examples of ToXMLContentHandler


Examples of org.apache.tika.sax.ToXMLContentHandler

    protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata) throws Exception {
      ParseContext context = new ParseContext();
      context.set(Parser.class, parser);

      try {
          ContentHandler handler = new ToXMLContentHandler();
          parser.parse(input, handler, metadata, context);
          return new XMLResult(handler.toString(), metadata);
      } finally {
          input.close();
      }
  }
View Full Code Here

Examples of org.apache.tika.sax.ToXMLContentHandler

        config.setExtractUniqueInlineImagesOnly(false);
        context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config);


        Metadata metadata = new Metadata();
        ContentHandler handler = new ToXMLContentHandler();
        String path = "/test-documents/testPDF_childAttachments.pdf";
        InputStream stream = null;
        try {
            stream = TikaInputStream.get(this.getClass().getResource(path));
            parser.parse(stream, handler, metadata, context);
        } finally {
            IOUtils.closeQuietly(stream);
        }

        String xml = handler.toString();
        //regular attachment
        assertContains("<div class=\"embedded\" id=\"Unit10.doc\" />", xml);
        //inline image
        assertContains("<img src=\"embedded:image1.tif\" alt=\"image1.tif\" />", xml);
View Full Code Here

Examples of org.apache.tika.sax.ToXMLContentHandler

    /**
     * Example of extracting the contents as HTML, as a string.
     */
    public String parseToHTML() throws IOException, SAXException, TikaException {
        ContentHandler handler = new ToXMLContentHandler();
       
        InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc");
        AutoDetectParser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        try {
            parser.parse(stream, handler, metadata);
            return handler.toString();
        } finally {
            stream.close();
        }
    }
View Full Code Here

Examples of org.apache.tika.sax.ToXMLContentHandler

     * Example of extracting just the body as HTML, without the
     *  head part, as a string
     */
    public String parseBodyToHTML() throws IOException, SAXException, TikaException {
        ContentHandler handler = new BodyContentHandler(
                new ToXMLContentHandler());
       
        InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc");
        AutoDetectParser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        try {
View Full Code Here

Examples of org.apache.tika.sax.ToXMLContentHandler

        // Only get things under html -> body -> div (class=header)
        XPathParser xhtmlParser = new XPathParser("xhtml", XHTMLContentHandler.XHTML);
        Matcher divContentMatcher = xhtmlParser.parse(
                "/xhtml:html/xhtml:body/xhtml:div/descendant::node()");       
        ContentHandler handler = new MatchingContentHandler(
                new ToXMLContentHandler(), divContentMatcher);
       
        InputStream stream = ContentHandlerExample.class.getResourceAsStream("test2.doc");
        AutoDetectParser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        try {
View Full Code Here

Examples of org.apache.tika.sax.ToXMLContentHandler

    if (System.getProperty("java.version").startsWith("1.5")) {
      return;
    }

    Parser parser = new EnviHeaderParser();
    ToXMLContentHandler handler = new ToXMLContentHandler();
    Metadata metadata = new Metadata();

    InputStream stream = EnviHeaderParser.class
        .getResourceAsStream("/test-documents/envi_test_header.hdr");
    assertNotNull("Test ENVI file not found", stream);
    try {
      parser.parse(stream, handler, metadata, new ParseContext());
    } finally {
      stream.close();
    }

    // Check content of test file
    String content = handler.toString();
        assertTrue(content.contains("<body><p>ENVI</p>"));
    assertTrue(content.contains("<p>samples = 2400</p>"));
    assertTrue(content.contains("<p>lines   = 2400</p>"));
    assertTrue(content.contains("<p>map info = {Sinusoidal, 1.5000, 1.5000, -10007091.3643, 5559289.2856, 4.6331271653e+02, 4.6331271653e+02, , units=Meters}</p>"));
    assertTrue(content.contains("content=\"application/envi.hdr\""));
View Full Code Here

Examples of org.apache.tika.sax.ToXMLContentHandler

    @Test
    public void testParser() throws Exception {

        AutoDetectParser parser = new AutoDetectParser();
        ToXMLContentHandler handler = new ToXMLContentHandler();
        Metadata metadata = new Metadata();
        String path = "/test-documents/breidamerkurjokull_radar_profiles_2009.mat";

        InputStream stream = MatParser.class.getResourceAsStream(path);

        try {
            parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }

        //Check Metadata
        assertEquals("PCWIN64", metadata.get("platform"));
        assertEquals("MATLAB 5.0 MAT-file", metadata.get("fileType"));
        assertEquals("IM", metadata.get("endian"));
        assertEquals("Thu Feb 21 15:52:49 2013", metadata.get("createdOn"));

        //Check Content
        String content = handler.toString();

        assertTrue(content.contains("<li>[1x909  double array]</li>"));
        assertTrue(content.contains("<p>c1:[1x1  struct array]</p>"));
        assertTrue(content.contains("<li>[1024x1  double array]</li>"));
        assertTrue(content.contains("<p>b1:[1x1  struct array]</p>"));
View Full Code Here

Examples of org.apache.tika.sax.ToXMLContentHandler

    @Test
    public void testParserForText() throws Exception {

        Parser parser = new MatParser();
        ToXMLContentHandler handler = new ToXMLContentHandler();
        Metadata metadata = new Metadata();
        String path = "/test-documents/test_mat_text.mat";

        InputStream stream = MatParser.class.getResourceAsStream(path);

        try {
            parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }

        //Check Content
        String content = handler.toString();
        assertTrue(content.contains("<p>double:[2x2  double array]</p>"));
    }
View Full Code Here

Examples of org.apache.tika.sax.ToXMLContentHandler

                    "writing the text/plain version of the parsed content",e);
            }
            final Writer plainTextWriter = new OutputStreamWriter(plainTextSink.getOutputStream(), UTF8);
            final ContentHandler textHandler = new BodyContentHandler( //only the Body
                new PlainTextHandler(plainTextWriter, false,skipLinebreaks)); //skip ignoreable
            final ToXMLContentHandler xhtmlHandler;
            final ContentHandler mainHandler;
            ContentSink xhtmlSink = null;
            try {
                if(!plainMediaType.equals(XHTML)){ //do not parse XHTML from XHTML
                    try {
                        xhtmlSink = ciFactory.createContentSink(XHTML +"; charset="+UTF8.name());
                    } catch (IOException e) {
                        throw new EngineException("Error while initialising Blob for" +
                                "writing the application/xhtml+xml version of the parsed content",e);
                    }
                    try {
                        xhtmlHandler = new ToXMLContentHandler(xhtmlSink.getOutputStream(),UTF8.name());
                    } catch (UnsupportedEncodingException e) {
                        throw new EngineException("This system does not support the encoding "+UTF8,e);
                    }
                    mainHandler = new MultiHandler(textHandler,xhtmlHandler);
                } else {
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.