Package org.apache.tika.sax

Examples of org.apache.tika.sax.TeeContentHandler


     */
    public void parse(
            InputStream stream, ContentHandler handler, Metadata metadata)
            throws IOException, SAXException, TikaException {
        ContentHandler body = new BodyContentHandler();
        super.parse(stream, new TeeContentHandler(handler, body), metadata);

        String content = body.toString();
        metadata.set("fulltext", content);

        int length = Math.min(content.length(), 500);
View Full Code Here


        Matcher matcher = new CompositeMatcher(
                DC_XPATH.parse("//dc:" + element),
                DC_XPATH.parse("//dc:" + element + "//text()"));
        ContentHandler branch =
            new MatchingContentHandler(new MetadataHandler(md, name), matcher);
        return new TeeContentHandler(ch, branch);
    }
View Full Code Here

        Matcher matcher = new CompositeMatcher(
                META_XPATH.parse("//meta:" + element),
                META_XPATH.parse("//meta:" + element + "//text()"));
        ContentHandler branch =
            new MatchingContentHandler(new MetadataHandler(md, name), matcher);
        return new TeeContentHandler(ch, branch);
    }
View Full Code Here

            ContentHandler ch, Metadata md, String name, String attribute) {
        Matcher matcher =
            META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
        ContentHandler branch =
            new MatchingContentHandler(new MetadataHandler(md, name), matcher);
        return new TeeContentHandler(ch, branch);
    }
View Full Code Here

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        XPathParser xpath = new XPathParser(null, "");
        Matcher body = xpath.parse("/HTML/BODY//node()");
        Matcher title = xpath.parse("/HTML/HEAD/TITLE//node()");
        Matcher meta = xpath.parse("/HTML/HEAD/META//node()");
        handler = new TeeContentHandler(
                new MatchingContentHandler(getBodyHandler(xhtml), body),
                new MatchingContentHandler(getTitleHandler(metadata), title),
                new MatchingContentHandler(getMetaHandler(metadata), meta));

        // Parse the HTML document
View Full Code Here

            }
        };
        Metadata metadata = new Metadata();
        InputStream stream = getStream("test-documents/testHTML.html");
        try {
            parser.parse(stream, new TeeContentHandler(body, link), metadata);
        } finally {
            stream.close();
        }

        assertEquals(
View Full Code Here

                        }
                    }
                }
            };
            new HtmlParser().parse(
                    stream, new TeeContentHandler(body, link),
                    metadata, new ParseContext());
        } finally {
            stream.close();
        }
View Full Code Here

   
    BoilerpipeContentHandler mainContentHandler = new BoilerpipeContentHandler(mainContentBuffer);
    BodyContentHandler bodyHandler = new BodyContentHandler(bodyBuffer);
    LinkContentHandler linkHandler = new LinkContentHandler();
   
    TeeContentHandler parallelHandler = new TeeContentHandler(xmlHandler, mainContentHandler, bodyHandler, linkHandler );

    InputStream instream = entity.obtainContent();
    try {
      parser.parse(instream, parallelHandler, metadata, new ParseContext());
     
View Full Code Here

   
    BoilerpipeContentHandler mainContentHandler = new BoilerpipeContentHandler(mainContentBuffer);
    BodyContentHandler bodyHandler = new BodyContentHandler(bodyBuffer);
    LinkContentHandler linkHandler = new LinkContentHandler();
   
    TeeContentHandler parallelHandler = new TeeContentHandler(xmlHandler, mainContentHandler, bodyHandler, linkHandler );

    InputStream instream = entity.obtainContent();
    try {
      parser.parse(instream, parallelHandler, metadata, new ParseContext());
     
View Full Code Here

        ContentHandler parsingHandler = handler;
        StringWriter debugWriter = null;
        if (LOG.isTraceEnabled()) {
          debugWriter = new StringWriter();
          ContentHandler serializer = new XMLSerializer(debugWriter, new OutputFormat("XML", "UTF-8", true));
          parsingHandler = new TeeContentHandler(parsingHandler, serializer);
        }

        // String xpathExpr = "/xhtml:html/xhtml:body/xhtml:div/descendant:node()";
        if (xpathExpr != null) {
          Matcher matcher = PARSER.parse(xpathExpr);
View Full Code Here

TOP

Related Classes of org.apache.tika.sax.TeeContentHandler

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.