Package org.apache.tika.sax

Examples of org.apache.tika.sax.WriteOutContentHandler


     * the given writer.
     *
     * @param writer writer
     */
    public BoilerpipeContentHandler(Writer writer) {
        // Wrap the writer in a WriteOutContentHandler and delegate to another
        // constructor of this class, so all set-up stays in a single place.
        this(new WriteOutContentHandler(writer));
    }
View Full Code Here


        Metadata metadata = new Metadata();
        StringWriter writer = new StringWriter();
        parser.parse(
                new ByteArrayInputStream(text.getBytes("ISO-8859-1")),
                new WriteOutContentHandler(writer),
                metadata,
                new ParseContext());
        String content = writer.toString();

        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
View Full Code Here

    public void testCP866() throws Exception {
        Metadata metadata = new Metadata();
        StringWriter writer = new StringWriter();
        parser.parse(
                TXTParserTest.class.getResourceAsStream("/test-documents/russian.cp866.txt"),
                new WriteOutContentHandler(writer),
                metadata,
                new ParseContext());

        assertEquals("text/plain; charset=IBM866", metadata.get(Metadata.CONTENT_TYPE));
    }
View Full Code Here

    public void testEBCDIC_CP500() throws Exception {
        Metadata metadata = new Metadata();
        StringWriter writer = new StringWriter();
        parser.parse(
                TXTParserTest.class.getResourceAsStream("/test-documents/english.cp500.txt"),
                new WriteOutContentHandler(writer),
                metadata,
                new ParseContext());

        assertEquals("text/plain; charset=IBM500", metadata.get(Metadata.CONTENT_TYPE));

        // Additional check that it isn't too eager on short blocks of text
        metadata = new Metadata();
        writer = new StringWriter();
        parser.parse(
                new ByteArrayInputStream("<html><body>hello world</body></html>".getBytes("ISO-8859-1")),
                new WriteOutContentHandler(writer),
                metadata,
                new ParseContext());

        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
    }
View Full Code Here

        }
        return supportedMediaTypes.contains(MediaType.parse(type));
    }

    private String parseStringValue(Blob v, Metadata metadata) {
        WriteOutContentHandler handler = new WriteOutContentHandler();
        try {
            InputStream stream = v.getNewStream();
            try {
                parser.parse(stream, handler, metadata, new ParseContext());
            } finally {
                stream.close();
            }
        } catch (LinkageError e) {
            // Capture and ignore errors caused by extraction libraries
            // not being present. This is equivalent to disabling
            // selected media types in configuration, so we can simply
            // ignore these errors.
        } catch (Throwable t) {
            // Capture and report any other full text extraction problems.
            // The special STOP exception is used for normal termination.
            if (!handler.isWriteLimitReached(t)) {
                log.debug("Failed to extract text from a binary property."
                        + " This is a fairly common case, and nothing to"
                        + " worry about. The stack trace is included to"
                        + " help improve the text extraction feature.", t);
                return "TextExtractionError";
            }
        }
        return handler.toString();
    }
View Full Code Here

               try
               {
                  Metadata metadata = new Metadata();
                  metadata.set(Metadata.CONTENT_TYPE, mimeType);

                  ContentHandler handler = new WriteOutContentHandler(MAX_READED_SIZE);
                  ParseContext context = new ParseContext();
                  context.set(Parser.class, parser);
                  try
                  {
                     parser.parse(is, handler, metadata, context);
View Full Code Here

TOP

Related Classes of org.apache.tika.sax.WriteOutContentHandler

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.