Package org.apache.tika.sax

Examples of org.apache.tika.sax.WriteOutContentHandler


        parser.parse(new InputSource(Utils.getUTF8Reader(stream, metadata)));
        xhtml.endDocument();
    }

    private ContentHandler getTitleHandler(final Metadata metadata) {
        return new WriteOutContentHandler() {
            @Override
            public void endElement(String u, String l, String n) {
                metadata.set(Metadata.TITLE, toString());
            }
        };
View Full Code Here


            }
        };
    }

    private ContentHandler getMetaHandler(final Metadata metadata) {
        return new WriteOutContentHandler() {
            @Override
            public void startElement(
                    String uri, String local, String name, Attributes atts)
                    throws SAXException {
                    if (atts.getValue("http-equiv") != null) {
View Full Code Here

       
        Metadata metadata = new Metadata();
        StringWriter writer = new StringWriter();
        parser.parse(
                     new FileInputStream(file),
                     new WriteOutContentHandler(writer),
                     metadata,
                     new ParseContext());
        String content = writer.toString();

        assertEquals("application/rtf", metadata.get(Metadata.CONTENT_TYPE));
View Full Code Here

      
        Metadata metadata = new Metadata();
        StringWriter writer = new StringWriter();
        parser.parse(
                     new FileInputStream(file),
                     new WriteOutContentHandler(writer),
                     metadata,
                     new ParseContext());
        String content = writer.toString();
        return new Result(content, metadata);
    }
View Full Code Here

        Blob blob = (Blob) value;
        InputStream is = null;

        // TODO make write limit configurable
        WriteOutContentHandler woh = new WriteOutContentHandler(500 * 1000); // 500K limit (Tika default: 100K)
        BodyContentHandler ch = new BodyContentHandler(woh);

        try {
            is = repository.getTable(table).getInputStream(record, fieldType.getName(), indexes);

            Metadata metadata = new Metadata();
            metadata.add(Metadata.CONTENT_TYPE, blob.getMediaType());
            if (blob.getName() != null) {
                metadata.add(Metadata.RESOURCE_NAME_KEY, blob.getName());
            }

            ParseContext parseContext = new ParseContext();

            tikaParser.parse(is, ch, metadata, parseContext);
        } catch (Throwable t) {
            if (woh.isWriteLimitReached(t)) {
                // ok, we'll just use the partial result
                if (log.isInfoEnabled()) {
                    log.info("Blob extraction: write limit reached. Field '" + fieldType.getName() + "', record '"
                            + record.getId() + "'.");
                }
View Full Code Here

     * @throws IOException if the document can not be read
     * @throws TikaException if the document can not be parsed
     */
    public String parseToString(InputStream stream, Metadata metadata)
            throws IOException, TikaException {
        WriteOutContentHandler handler =
            new WriteOutContentHandler(maxStringLength);
        try {
            ParseContext context = new ParseContext();
            context.set(Parser.class, parser);
            parser.parse(
                    stream, new BodyContentHandler(handler), metadata, context);
        } catch (SAXException e) {
            if (!handler.isWriteLimitReached(e)) {
                // This should never happen with BodyContentHandler...
                throw new TikaException("Unexpected SAX processing failure", e);
            }
        } finally {
            stream.close();
        }
        return handler.toString();
    }
View Full Code Here

        parser.setContentHandler(new XHTMLDowngradeHandler(handler));
        parser.parse(new InputSource(Utils.getUTF8Reader(stream, metadata)));
    }

    private ContentHandler getTitleHandler(final Metadata metadata) {
        return new WriteOutContentHandler() {
            @Override
            public void endElement(String u, String l, String n) {
                metadata.set(Metadata.TITLE, toString());
            }
        };
View Full Code Here

            }
        };
    }

    private ContentHandler getMetaHandler(final Metadata metadata) {
        return new WriteOutContentHandler() {
            @Override
            public void startElement(
                    String uri, String local, String name, Attributes atts)
                    throws SAXException {
                    if (atts.getValue("http-equiv") != null) {
View Full Code Here

        Metadata metadata = new Metadata();
        StringWriter writer = new StringWriter();
        parser.parse(
                new ByteArrayInputStream(text.getBytes("UTF-8")),
                new WriteOutContentHandler(writer),
                metadata);
        String content = writer.toString();

        assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
View Full Code Here

        Metadata metadata = new Metadata();
        StringWriter writer = new StringWriter();
        parser.parse(
                new ByteArrayInputStream(text.getBytes("ISO-8859-1")),
                new WriteOutContentHandler(writer),
                metadata,
                new ParseContext());
        String content = writer.toString();

        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
View Full Code Here

TOP

Related Classes of org.apache.tika.sax.WriteOutContentHandler

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.