Package org.apache.tika.sax.xpath

Examples of org.apache.tika.sax.xpath.XPathParser


        stream = new CloseShieldInputStream(stream);

        // Prepare the HTML content handler that generates proper
        // XHTML events to records relevant document metadata
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        XPathParser xpath = new XPathParser(null, "");
        Matcher body = xpath.parse("/HTML/BODY//node()");
        Matcher title = xpath.parse("/HTML/HEAD/TITLE//node()");
        Matcher meta = xpath.parse("/HTML/HEAD/META//node()");
        handler = new TeeContentHandler(
                new MatchingContentHandler(getBodyHandler(xhtml), body),
                new MatchingContentHandler(getTitleHandler(metadata), title),
                new MatchingContentHandler(getMetaHandler(metadata), meta));
View Full Code Here


        stream = new CloseShieldInputStream(stream);

        // Prepare the HTML content handler that generates proper
        // XHTML events to records relevant document metadata
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        XPathParser xpath = new XPathParser(null, "");
        Matcher body = xpath.parse("/HTML/BODY//node()");
        Matcher title = xpath.parse("/HTML/HEAD/TITLE//node()");
        Matcher meta = xpath.parse("/HTML/HEAD/META//node()");
        handler = new TeeContentHandler(
                new MatchingContentHandler(getBodyHandler(xhtml), body),
                new MatchingContentHandler(getTitleHandler(metadata), title),
                new MatchingContentHandler(getMetaHandler(metadata), meta));
View Full Code Here

     * Example of extracting just one part of the document's body,
     *  as HTML as a string, excluding the rest
     */
    public String parseOnePartToHTML() throws IOException, SAXException, TikaException {
        // Only get things under html -> body -> div (class=header)
        XPathParser xhtmlParser = new XPathParser("xhtml", XHTMLContentHandler.XHTML);
        Matcher divContentMatcher = xhtmlParser.parse(
                "/xhtml:html/xhtml:body/xhtml:div/descendant::node()");       
        ContentHandler handler = new MatchingContentHandler(
                new ToXMLContentHandler(), divContentMatcher);
       
        InputStream stream = ContentHandlerExample.class.getResourceAsStream("test2.doc");
View Full Code Here

TOP

Related Classes of org.apache.tika.sax.xpath.XPathParser

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.