Examples of org.apache.tika.sax.WriteOutContentHandler

org.apache.tika.sax.WriteOutContentHandler
SAX event handler that writes all character content out to a {@link Writer} character stream.


        Metadata metadata = new Metadata();
        StringWriter writer = new StringWriter();
        parser.parse(
                new ByteArrayInputStream(text.getBytes("UTF-8")),
                new WriteOutContentHandler(writer),
                metadata,
                new ParseContext());
        String content = writer.toString();


        assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));

View Full Code Here

     * the given writer.
     *
     * @param writer writer
     */
    public BoilerpipeContentHandler(Writer writer) {
        this(new WriteOutContentHandler(writer));
    }

View Full Code Here

        
        Metadata metadata = new Metadata();
        StringWriter writer = new StringWriter();
        tika.getParser().parse(
                     new FileInputStream(file),
                     new WriteOutContentHandler(writer),
                     metadata,
                     new ParseContext());
        String content = writer.toString();


        assertEquals("application/rtf", metadata.get(Metadata.CONTENT_TYPE));

View Full Code Here

       
        Metadata metadata = new Metadata();
        StringWriter writer = new StringWriter();
        tika.getParser().parse(
                     new FileInputStream(file),
                     new WriteOutContentHandler(writer),
                     metadata,
                     new ParseContext());
        String content = writer.toString();
        return new Result(content, metadata);
    }

View Full Code Here

        }
        return supportedMediaTypes.contains(MediaType.parse(type));
    }


    private String parseStringValue(Blob v, Metadata metadata) {
        WriteOutContentHandler handler = new WriteOutContentHandler();
        try {
            InputStream stream = v.getNewStream();
            try {
                parser.parse(stream, handler, metadata, new ParseContext());
            } finally {
                stream.close();
            }
        } catch (LinkageError e) {
            // Capture and ignore errors caused by extraction libraries
            // not being present. This is equivalent to disabling
            // selected media types in configuration, so we can simply
            // ignore these errors.
        } catch (Throwable t) {
            // Capture and report any other full text extraction problems.
            // The special STOP exception is used for normal termination.
            if (!handler.isWriteLimitReached(t)) {
                log.debug("Failed to extract text from a binary property."
                        + " This is a fairly common case, and nothing to"
                        + " worry about. The stack trace is included to"
                        + " help improve the text extraction feature.", t);
                return "TextExtractionError";
            }
        }
        return handler.toString();
    }

View Full Code Here

            throws IOException, SAXException, TikaException {
        long startTime = System.currentTimeMillis();
        if (logger.isDebugEnabled()) {
            logger.debug("Start text extraction using metadata: " + metadata);
        }
        WriteOutContentHandler handler = new WriteOutContentHandler(characterLimit);
        try {
            parser.parse(stream, new BodyContentHandler(handler), metadata, new ParseContext());
        } catch (SAXException e) {
            if (handler.isWriteLimitReached(e)) {
                if (characterLimit > 0) {
                    logger.info("Document content length exceeded the configured limit. Extracted first "
                            + characterLimit + " characters.");
                }
            } else {
                throw e;
            }
        }
        String extractedText = handler.toString();
        if (logger.isDebugEnabled()) {
            logger.debug("Text extraction finished in " + (System.currentTimeMillis() - startTime) + " ms. Extracted "
                    + extractedText.length() + " characters.");
            logger.debug("Extracted metadata: " + metadata);
            if (logger.isTraceEnabled()) {

View Full Code Here

     * @throws IOException if the document can not be read
     * @throws TikaException if the document can not be parsed
     */
    public String parseToString(InputStream stream, Metadata metadata)
            throws IOException, TikaException {
        WriteOutContentHandler handler =
            new WriteOutContentHandler(maxStringLength);
        try {
            ParseContext context = new ParseContext();
            context.set(Parser.class, parser);
            parser.parse(
                    stream, new BodyContentHandler(handler), metadata, context);
        } catch (SAXException e) {
            if (!handler.isWriteLimitReached(e)) {
                // This should never happen with BodyContentHandler...
                throw new TikaException("Unexpected SAX processing failure", e);
            }
        } finally {
            stream.close();
        }
        return handler.toString();
    }

View Full Code Here

            doc.add(newFulltextField(parseStringValue(v, metadata)));
        }
    }


    private String parseStringValue(Blob v, Metadata metadata) {
        WriteOutContentHandler handler = new WriteOutContentHandler();
        try {
            InputStream stream = v.getNewStream();
            try {
                context.getParser().parse(stream, handler, metadata, new ParseContext());
            } finally {
                stream.close();
            }
        } catch (LinkageError e) {
            // Capture and ignore errors caused by extraction libraries
            // not being present. This is equivalent to disabling
            // selected media types in configuration, so we can simply
            // ignore these errors.
        } catch (Throwable t) {
            // Capture and report any other full text extraction problems.
            // The special STOP exception is used for normal termination.
            if (!handler.isWriteLimitReached(t)) {
                log.debug("Failed to extract text from a binary property."
                        + " This is a fairly common case, and nothing to"
                        + " worry about. The stack trace is included to"
                        + " help improve the text extraction feature.", t);
                return "TextExtractionError";
            }
        }
        return handler.toString();
    }

View Full Code Here


        private final WriteOutContentHandler writeOutContentHandler;


        public ParsingTask(Parser parser, InternalValue value,
                Metadata metadata, int maxFieldLength) {
            this(new WriteOutContentHandler(maxFieldLength), parser, value,
                    metadata);
        }

View Full Code Here

    try {
      Metadata metadata = metadataProcessor.prepareMetadata();
      ParseContext parseContext = parseContextProvider.getParseContext( name, value );


      StringWriter writer = new StringWriter();
      WriteOutContentHandler contentHandler = new WriteOutContentHandler( writer );


      Parser parser = new AutoDetectParser();
      parser.parse( in, contentHandler, metadata, parseContext );
      luceneOptions.addFieldToDocument( name, writer.toString(), document );

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.tika.sax.WriteOutContentHandler

bixo.examples.crawl.SimpleBodyContentHandler

org.apache.jackrabbit.core.query.lucene.LazyTextExtractorField$ParsingTask

org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexEditor

org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexUpdate

org.apache.tika.parser.AutoDetectParserTest

org.apache.tika.parser.html.BoilerpipeContentHandler

org.apache.tika.parser.html.HtmlParser

org.apache.tika.parser.html.HtmlParserTest

org.apache.tika.parser.microsoft.ExcelParserTest

org.apache.tika.parser.microsoft.PowerPointParserTest

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.