Package org.apache.tika.sax

Examples of org.apache.tika.sax.XHTMLContentHandler


            throws IOException, SAXException, TikaException {
        try {
            DefaultStyledDocument sd = new DefaultStyledDocument();
            new RTFEditorKit().read(stream, sd, 0);

            XHTMLContentHandler xhtml =
                new XHTMLContentHandler(handler, metadata);
            xhtml.startDocument();
            xhtml.element("p", sd.getText(0, sd.getLength()));
            xhtml.endDocument();
        } catch (BadLocationException e) {
            throw new TikaException("Error parsing an RTF document", e);
        } catch (InternalError e) {
            throw new TikaException(
                    "Internal error parsing an RTF document, see TIKA-282", e);
View Full Code Here


        metadata.set(Metadata.CONTENT_TYPE, "video/x-flv");
        metadata.set("hasVideo", Boolean.toString((typeFlags & MASK_VIDEO) != 0));
        metadata.set("hasAudio", Boolean.toString((typeFlags & MASK_AUDIO) != 0));

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        // flv tag stream follows...
        while (true) {
            int type = datainput.read();
            if (type == -1) {
                // EOF
                break;
            }

            int datalen = readUInt24(datainput); //body length
            stream.skip(4); // timestamp
            stream.skip(3); // streamid

            if (type == TYPE_METADATA) {
                // found metadata Tag, read content to buffer
                byte[] metaBytes = new byte[datalen];
                for (int readCount = 0; readCount < datalen;) {
                    int r = stream.read(metaBytes, readCount, datalen - readCount);
                    if(r!=-1) {
                        readCount += r;

                    } else {
                        break;
                    }
                }

                ByteArrayInputStream is = new ByteArrayInputStream(metaBytes);

                DataInputStream dis = new DataInputStream(is);

                Object data = null;

                for (int i = 0; i < 2; i++) {
                    data = readAMFData(dis, -1);
                }

                if (data instanceof Map) {
                    // TODO if there are multiple metadata values with the same key (in
                    // separate AMF blocks), we currently lose the previous values
                    Map<String, Object> extractedMetadata = (Map<String, Object>) data;
                    for (Entry<String, Object> entry : extractedMetadata.entrySet()) {
                        metadata.set(entry.getKey(), entry.getValue().toString());
                    }
                }

            } else {
                // Tag was not metadata, skip over data we cannot handle
                for (int skiplen = 0; skiplen < datalen;) {
                    long currentSkipLen = datainput.skip(datalen - skiplen);
                    skiplen += currentSkipLen;
                }
            }

            sizePrev = readUInt32(datainput); // previous block size
            if (sizePrev != datalen + 11) {
                // file was corrupt or we could not parse it...
                break;
            }
        }

        xhtml.endDocument();
    }
View Full Code Here

            } catch (IIOException e) {
                throw new TikaException(type + " parse error", e);
            }
        }

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.endDocument();
    }
View Full Code Here

        }
    }

    /**
     * Creates a handler that writes to the given content handler through a
     * newly created {@link XHTMLContentHandler}.
     * <p>
     * The supplied handler and metadata are wrapped in an
     * {@code XHTMLContentHandler}, and construction is then delegated to the
     * constructor that takes the mapper, that wrapper and the metadata.
     *
     * @param mapper   HTML mapper passed through to the delegated constructor
     * @param handler  content handler wrapped by the XHTML content handler
     * @param metadata metadata passed both to the XHTML wrapper and to the
     *                 delegated constructor
     */
    public HtmlHandler(
            HtmlMapper mapper, ContentHandler handler, Metadata metadata) {
        this(mapper, new XHTMLContentHandler(handler, metadata), metadata);
    }
View Full Code Here

     */
    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        POIFSFileSystem filesystem = new POIFSFileSystem(stream);

        // Parse summary entries first, to make metadata available early
        parseSummaryEntryIfExists(
                filesystem, SUMMARY_INFORMATION, metadata);
        parseSummaryEntryIfExists(
                filesystem, DOCUMENT_SUMMARY_INFORMATION, metadata);

        // Parse remaining document entries
        boolean outlookExtracted = false;
        Iterator<?> entries = filesystem.getRoot().getEntries();
        while (entries.hasNext()) {
            Entry entry = (Entry) entries.next();
            String name = entry.getName();
            if (!(entry instanceof DocumentEntry)) {
                // Skip directory entries
            } else if ("WordDocument".equals(name)) {
                setType(metadata, "application/msword");
                WordExtractor extractor = new WordExtractor(filesystem);

                addTextIfAny(xhtml, "header", extractor.getHeaderText());

                for (String paragraph : extractor.getParagraphText()) {
                    xhtml.element("p", paragraph);
                }

                for (String paragraph : extractor.getFootnoteText()) {
                    xhtml.element("p", paragraph);
                }

                for (String paragraph : extractor.getCommentsText()) {
                    xhtml.element("p", paragraph);
                }

                for (String paragraph : extractor.getEndnoteText()) {
                    xhtml.element("p", paragraph);
                }

                addTextIfAny(xhtml, "footer", extractor.getFooterText());
            } else if ("PowerPoint Document".equals(name)) {
                setType(metadata, "application/vnd.ms-powerpoint");
                PowerPointExtractor extractor =
                    new PowerPointExtractor(filesystem);
                xhtml.element("p", extractor.getText(true, true));
            } else if ("Workbook".equals(name)) {
                setType(metadata, "application/vnd.ms-excel");
                Locale locale = context.get(Locale.class, Locale.getDefault());
                new ExcelExtractor().parse(filesystem, xhtml, locale);
            } else if ("VisioDocument".equals(name)) {
                setType(metadata, "application/vnd.visio");
                VisioTextExtractor extractor =
                    new VisioTextExtractor(filesystem);
                for (String text : extractor.getAllText()) {
                    xhtml.element("p", text);
                }
            } else if (!outlookExtracted && name.startsWith("__substg1.0_")) {
                // TODO: Cleaner mechanism for detecting Outlook
                outlookExtracted = true;
                setType(metadata, "application/vnd.ms-outlook");
                new OutlookExtractor(filesystem).parse(xhtml, metadata);
            }
        }

        xhtml.endDocument();
    }
View Full Code Here

            int bom = reader.read();
            if (bom != '\ufeff') { // zero-width no-break space
                reader.reset();
            }

            XHTMLContentHandler xhtml =
                new XHTMLContentHandler(handler, metadata);
            xhtml.startDocument();

            xhtml.startElement("p");
            char[] buffer = new char[4096];
            int n = reader.read(buffer);
            while (n != -1) {
                xhtml.characters(buffer, 0, n);
                n = reader.read(buffer);
            }
            xhtml.endElement("p");

            xhtml.endDocument();
        } catch (UnsupportedEncodingException e) {
            throw new TikaException(
                    "Unsupported text encoding: " + encoding, e);
        }
    }
View Full Code Here

    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        final XHTMLContentHandler xhtml =
            new XHTMLContentHandler(handler,metadata);

        try {
            SAXParserFactory factory = SAXParserFactory.newInstance();
            factory.setValidating(false);
            factory.setNamespaceAware(true);
View Full Code Here

    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        final XHTMLContentHandler xhtml =
            new XHTMLContentHandler(handler,metadata);
        DefaultHandler dh = new ElementMappingContentHandler(xhtml, MAPPINGS) {

            private final BitSet textNodeStack = new BitSet();

            private int nodeDepth = 0;

            private int completelyFiltered = 0;

            private Stack<String> headingStack = new Stack<String>();

            @Override
            public void characters(char[] ch, int start, int length)
                    throws SAXException {
                // only forward content of tags from text:-namespace
                if (completelyFiltered == 0 && nodeDepth > 0
                        && textNodeStack.get(nodeDepth - 1)) {
                    super.characters(ch,start,length);
                }
            }

            // helper for checking tags which need complete filtering
            // (with sub-tags)
            private boolean needsCompleteFiltering(
                    String namespaceURI, String localName) {
                if (TEXT_NS.equals(namespaceURI)) {
                    return localName.endsWith("-template")
                        || localName.endsWith("-style");
                } else if (TABLE_NS.equals(namespaceURI)) {
                    return "covered-table-cell".equals(localName);
                } else {
                    return false;
                }
            }

            // map the heading level to <hX> HTML tags
            private String getXHTMLHeaderTagName(Attributes atts) {
                String depthStr = atts.getValue(TEXT_NS, "outline-level");
                if (depthStr == null) {
                    return "h1";
                }

                int depth = Integer.parseInt(depthStr);
                if (depth >= 6) {
                    return "h6";
                } else if (depth <= 1) {
                    return "h1";
                } else {
                    return "h" + depth;
                }
            }

            /**
             * Check if a node is a text node
             */
            private boolean isTextNode(String namespaceURI, String localName) {
                if (TEXT_NS.equals(namespaceURI)) {
                    return true;
                }
                if (SVG_NS.equals(namespaceURI)) {
                    return "title".equals(localName) ||
                            "desc".equals(localName);
                }
                return false;
            }

            @Override
            public void startElement(
                    String namespaceURI, String localName, String qName,
                    Attributes atts) throws SAXException {
                // keep track of current node type. If it is a text node,
                // a bit at the current depth ist set in textNodeStack.
                // characters() checks the top bit to determine, if the
                // actual node is a text node to print out nodeDepth contains
                // the depth of the current node and also marks top of stack.
                assert nodeDepth >= 0;

                textNodeStack.set(nodeDepth++,
                        isTextNode(namespaceURI, localName));
                // filter *all* content of some tags
                assert completelyFiltered >= 0;

                if (needsCompleteFiltering(namespaceURI, localName)) {
                    completelyFiltered++;
                }
                // call next handler if no filtering
                if (completelyFiltered == 0) {
                    // special handling of text:h, that are directly passed
                    // to xhtml handler
                    if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
                        xhtml.startElement(headingStack.push(
                                getXHTMLHeaderTagName(atts)));
                    } else {
                        super.startElement(
                                namespaceURI, localName, qName, atts);
                    }
                }
            }

            @Override
            public void endElement(
                    String namespaceURI, String localName, String qName)
                    throws SAXException {
                // call next handler if no filtering
                if (completelyFiltered == 0) {
                    // special handling of text:h, that are directly passed
                    // to xhtml handler
                    if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
                        xhtml.endElement(headingStack.pop());
                    } else {
                        super.endElement(namespaceURI,localName,qName);
                    }

                    // special handling of tabulators
View Full Code Here

                filesystem, SummaryInformation.DEFAULT_STREAM_NAME, metadata);
        getMetadata(
                filesystem, DocumentSummaryInformation.DEFAULT_STREAM_NAME,
                metadata);

        XHTMLContentHandler xhtml =
            new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.startElement("p");
        extractText(filesystem, new AppendableAdaptor(xhtml));
        xhtml.endElement("p");
        xhtml.endDocument();
    }
View Full Code Here

public class EmptyParser implements Parser {

    public void parse(
            InputStream stream, ContentHandler handler, Metadata metadata)
            throws IOException, SAXException, TikaException {
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.endDocument();
    }
View Full Code Here

TOP

Related Classes of org.apache.tika.sax.XHTMLContentHandler

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.