Package org.apache.tika.sax

Examples of org.apache.tika.sax.XHTMLContentHandler


     */
    protected void parseArchive(
            ArchiveInputStream archive, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException {
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        ArchiveEntry entry = archive.getNextEntry();
        while (entry != null) {
            if (!entry.isDirectory()) {
                xhtml.startElement("div", "class", "package-entry");
                Metadata entrydata = new Metadata();
                String name = entry.getName();
                if (name != null && name.length() > 0) {
                    entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
                    xhtml.element("h1", name);
                }
                try {
                    // Use the delegate parser to parse this entry
                    super.parse(
                            new CloseShieldInputStream(archive),
                            new EmbeddedContentHandler(
                                    new BodyContentHandler(xhtml)),
                            entrydata, context);
                } catch (TikaException e) {
                    // Could not parse the entry, just skip the content
                }
                xhtml.endElement("div");
            }
            entry = archive.getNextEntry();
        }

        xhtml.endDocument();
    }
View Full Code Here


    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        // AudioSystem expects the stream to support the mark feature
        InputStream buffered = new BufferedInputStream(stream);
        try {
            AudioFormat format =
                AudioSystem.getAudioFileFormat(buffered).getFormat();

            float rate = format.getSampleRate();
            if (rate != AudioSystem.NOT_SPECIFIED) {
                metadata.set("samplerate", String.valueOf(rate));
            }

            int channels = format.getChannels();
            if (channels != AudioSystem.NOT_SPECIFIED) {
                metadata.set("channels", String.valueOf(channels));
            }

            int bits = format.getSampleSizeInBits();
            if (bits != AudioSystem.NOT_SPECIFIED) {
                metadata.set("bits", String.valueOf(bits));
            }

            metadata.set("encoding", format.getEncoding().toString());

            // Javadoc suggests that some of the following properties might
            // be available, but I had no success in finding any:

            // "duration" Long playback duration of the file in microseconds
            // "author" String name of the author of this file
            // "title" String title of this file
            // "copyright" String copyright message
            // "date" Date date of the recording or release
            // "comment" String an arbitrary text

            for (Entry<String, Object> entry : format.properties().entrySet()) {
                metadata.set(entry.getKey(), entry.getValue().toString());
            }
        } catch (UnsupportedAudioFileException e) {
            // There is no way to know whether this exception was
            // caused by the document being corrupted or by the format
            // just being unsupported. So we do nothing.
        }

        xhtml.endDocument();
    }
View Full Code Here

        BufferedReader reader = new BufferedReader(isr);

        metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
        metadata.set(Metadata.CONTENT_ENCODING, "us-ascii");

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        ParseStates parseState = ParseStates.START;
        String multiLine = null;
        boolean inQuote = false;
        int numEmails = 0;

        // We're going to scan, line-by-line, for a line that starts with
        // "From "
        for (String curLine = reader.readLine(); curLine != null; curLine = reader.readLine()) {
            boolean newMessage = curLine.startsWith(MBOX_RECORD_DIVIDER);
            if (newMessage) {
                numEmails += 1;
            }

            switch (parseState) {
            case START:
                if (newMessage) {
                    parseState = ParseStates.IN_HEADER;
                    newMessage = false;
                    // Fall through to IN_HEADER
                } else {
                    break;
                }

            case IN_HEADER:
                if (newMessage) {
                    saveHeaderInMetadata(numEmails, metadata, multiLine);
                    multiLine = curLine;
                } else if (curLine.length() == 0) {
                    // Blank line is signal that we're transitioning to the content.
                    saveHeaderInMetadata(numEmails, metadata, multiLine);
                    parseState = ParseStates.IN_CONTENT;

                    // Mimic what PackageParser does between entries.
                    xhtml.startElement("div", "class", "email-entry");
                    xhtml.startElement("p");
                    inQuote = false;
                } else if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
                    multiLine += " " + curLine.trim();
                } else {
                    saveHeaderInMetadata(numEmails, metadata, multiLine);
                    multiLine = curLine;
                }

                break;

                // TODO - use real email parsing support so we can correctly handle
                // things like multipart messages and quoted-printable encoding.
                // We'd also want this for charset handling, where content isn't 7-bit
                // ascii.
            case IN_CONTENT:
                if (newMessage) {
                    endMessage(xhtml, inQuote);
                    parseState = ParseStates.IN_HEADER;
                    multiLine = curLine;
                } else {
                    boolean quoted = curLine.startsWith(">");
                    if (inQuote) {
                        if (!quoted) {
                            xhtml.endElement("q");
                            inQuote = false;
                        }
                    } else if (quoted) {
                        xhtml.startElement("q");
                        inQuote = true;
                    }

                    xhtml.characters(curLine);

                    // For plain text email, each line is a real break position.
                    xhtml.element("br", "");
                }
            }
        }

        if (parseState == ParseStates.IN_HEADER) {
            saveHeaderInMetadata(numEmails, metadata, multiLine);
        } else if (parseState == ParseStates.IN_CONTENT) {
            endMessage(xhtml, inQuote);
        }

        xhtml.endDocument();
    }
View Full Code Here

    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        // MidiSystem expects the stream to support the mark feature
        InputStream buffered = new BufferedInputStream(stream);
        try {
            Sequence sequence = MidiSystem.getSequence(buffered);

            Track[] tracks = sequence.getTracks();
            metadata.set("tracks", String.valueOf(tracks.length));

            Patch[] patches = sequence.getPatchList();
            metadata.set("patches", String.valueOf(patches.length));

            float type = sequence.getDivisionType();
            if (type == Sequence.PPQ) {
                metadata.set("divisionType", "PPQ");
            } else if (type == Sequence.SMPTE_24) {
                metadata.set("divisionType", "SMPTE_24");
            } else if (type == Sequence.SMPTE_25) {
                metadata.set("divisionType", "SMPTE_25");
            } else if (type == Sequence.SMPTE_30) {
                metadata.set("divisionType", "SMPTE_30");
            } else if (type == Sequence.SMPTE_30DROP) {
                metadata.set("divisionType", "SMPTE_30DROP");
            } else if (type == Sequence.SMPTE_24) {
                metadata.set("divisionType", String.valueOf(type));
            }

            for (Track track : tracks) {
                xhtml.startElement("p");
                for (int i = 0; i < track.size(); i++) {
                    MidiMessage message = track.get(i).getMessage();
                    if (message instanceof MetaMessage) {
                        MetaMessage meta = (MetaMessage) message;
                        // Types 1-15 are reserved for text events
                        if (meta.getType() >= 1 && meta.getType() <= 15) {
                            // FIXME: What's the encoding?
                            xhtml.characters(
                                    new String(meta.getData(), "ISO-8859-1"));
                        }
                    }
                }
                xhtml.endElement("p");
            }
        } catch (InvalidMidiDataException ignore) {
            // There is no way to know whether this exception was
            // caused by the document being corrupted or by the format
            // just being unsupported. So we do nothing.
        }

        xhtml.endDocument();
    }
View Full Code Here

            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        metadata.set(Metadata.CONTENT_TYPE, "application/x-bzip");

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        // At the end we want to close the bzip2 stream to release any associated
        // resources, but the underlying document stream should not be closed
        InputStream bzip2 =
            new BZip2CompressorInputStream(new CloseShieldInputStream(stream));
        try {
            Metadata entrydata = new Metadata();
            String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
            if (name != null) {
                if (name.endsWith(".tbz")) {
                    name = name.substring(0, name.length() - 4) + ".tar";
                } else if (name.endsWith(".tbz2")) {
                    name = name.substring(0, name.length() - 5) + ".tar";
                } else if (name.endsWith(".bz")) {
                    name = name.substring(0, name.length() - 3);
                } else if (name.endsWith(".bz2")) {
                    name = name.substring(0, name.length() - 4);
                }
                entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
            }
            // Use the delegate parser to parse the compressed document
            super.parse(
                    new CloseShieldInputStream(bzip2),
                    new EmbeddedContentHandler(
                            new BodyContentHandler(xhtml)),
                    entrydata, context);
        } finally {
            bzip2.close();
        }

        xhtml.endDocument();
    }
View Full Code Here

     */
    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        POIFSFileSystem filesystem = new POIFSFileSystem(stream);

        // Parse summary entries first, to make metadata available early
        parseSummaryEntryIfExists(
                filesystem, SUMMARY_INFORMATION, metadata);
        parseSummaryEntryIfExists(
                filesystem, DOCUMENT_SUMMARY_INFORMATION, metadata);

        // Parse remaining document entries
        boolean outlookExtracted = false;
        Iterator<?> entries = filesystem.getRoot().getEntries();
        while (entries.hasNext()) {
            Entry entry = (Entry) entries.next();
            String name = entry.getName();
            if (!(entry instanceof DocumentEntry)) {
                // Skip directory entries
            } else if ("WordDocument".equals(name)) {
                setType(metadata, "application/msword");
                WordExtractor extractor = new WordExtractor(filesystem);

                addTextIfAny(xhtml, "header", extractor.getHeaderText());

                for (String paragraph : extractor.getParagraphText()) {
                    xhtml.element("p", paragraph);
                }

                for (String paragraph : extractor.getFootnoteText()) {
                    xhtml.element("p", paragraph);
                }

                for (String paragraph : extractor.getCommentsText()) {
                    xhtml.element("p", paragraph);
                }

                for (String paragraph : extractor.getEndnoteText()) {
                    xhtml.element("p", paragraph);
                }

                addTextIfAny(xhtml, "footer", extractor.getFooterText());
            } else if ("PowerPoint Document".equals(name)) {
                setType(metadata, "application/vnd.ms-powerpoint");
                PowerPointExtractor extractor =
                    new PowerPointExtractor(filesystem);
                xhtml.element("p", extractor.getText(true, true));
            } else if ("Workbook".equals(name)) {
                setType(metadata, "application/vnd.ms-excel");
                new ExcelExtractor().parse(filesystem, xhtml);
            } else if ("VisioDocument".equals(name)) {
                setType(metadata, "application/vnd.visio");
                VisioTextExtractor extractor =
                    new VisioTextExtractor(filesystem);
                for (String text : extractor.getAllText()) {
                    xhtml.element("p", text);
                }
            } else if (!outlookExtracted && name.startsWith("__substg1.0_")) {
                // TODO: Cleaner mechanism for detecting Outlook
                outlookExtracted = true;
                setType(metadata, "application/vnd.ms-outlook");
                new OutlookExtractor(filesystem).parse(xhtml, metadata);
            }
        }

        xhtml.endDocument();
    }
View Full Code Here

    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        final XHTMLContentHandler xhtml =
            new XHTMLContentHandler(handler,metadata);
        DefaultHandler dh = new ElementMappingContentHandler(xhtml, MAPPINGS) {

            private final BitSet textNodeStack = new BitSet();

            private int nodeDepth = 0;

            private int completelyFiltered = 0;

            private Stack<String> headingStack = new Stack<String>();

            @Override
            public void characters(char[] ch, int start, int length)
                    throws SAXException {
                // only forward content of tags from text:-namespace
                if (completelyFiltered == 0 && nodeDepth > 0
                        && textNodeStack.get(nodeDepth - 1)) {
                    super.characters(ch,start,length);
                }
            }

            // helper for checking tags which need complete filtering
            // (with sub-tags)
            private boolean needsCompleteFiltering(
                    String namespaceURI, String localName) {
                if (TEXT_NS.equals(namespaceURI)) {
                    return localName.endsWith("-template")
                        || localName.endsWith("-style");
                } else if (TABLE_NS.equals(namespaceURI)) {
                    return "covered-table-cell".equals(localName);
                } else {
                    return false;
                }
            }

            // map the heading level to <hX> HTML tags
            private String getXHTMLHeaderTagName(Attributes atts) {
                String depthStr = atts.getValue(TEXT_NS, "outline-level");
                if (depthStr == null) {
                    return "h1";
                }

                int depth = Integer.parseInt(depthStr);
                if (depth >= 6) {
                    return "h6";
                } else if (depth <= 1) {
                    return "h1";
                } else {
                    return "h" + depth;
                }
            }

            /**
             * Check if a node is a text node
             */
            private boolean isTextNode(String namespaceURI, String localName) {
                if (TEXT_NS.equals(namespaceURI)) {
                    return true;
                }
                if (SVG_NS.equals(namespaceURI)) {
                    return "title".equals(localName) ||
                            "desc".equals(localName);
                }
                return false;
            }

            @Override
            public void startElement(
                    String namespaceURI, String localName, String qName,
                    Attributes atts) throws SAXException {
                // keep track of current node type. If it is a text node,
                // a bit at the current depth ist set in textNodeStack.
                // characters() checks the top bit to determine, if the
                // actual node is a text node to print out nodeDepth contains
                // the depth of the current node and also marks top of stack.
                assert nodeDepth >= 0;

                textNodeStack.set(nodeDepth++,
                        isTextNode(namespaceURI, localName));
                // filter *all* content of some tags
                assert completelyFiltered >= 0;

                if (needsCompleteFiltering(namespaceURI, localName)) {
                    completelyFiltered++;
                }
                // call next handler if no filtering
                if (completelyFiltered == 0) {
                    // special handling of text:h, that are directly passed
                    // to xhtml handler
                    if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
                        xhtml.startElement(headingStack.push(
                                getXHTMLHeaderTagName(atts)));
                    } else {
                        super.startElement(
                                namespaceURI, localName, qName, atts);
                    }
                }
            }

            @Override
            public void endElement(
                    String namespaceURI, String localName, String qName)
                    throws SAXException {
                // call next handler if no filtering
                if (completelyFiltered == 0) {
                    // special handling of text:h, that are directly passed
                    // to xhtml handler
                    if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
                        xhtml.endElement(headingStack.pop());
                    } else {
                        super.endElement(namespaceURI,localName,qName);
                    }

                    // special handling of tabulators
View Full Code Here

            throws IOException, SAXException, TikaException {
        try {
            DefaultStyledDocument sd = new DefaultStyledDocument();
            new RTFEditorKit().read(stream, sd, 0);

            XHTMLContentHandler xhtml =
                new XHTMLContentHandler(handler, metadata);
            xhtml.startDocument();
            xhtml.element("p", sd.getText(0, sd.getLength()));
            xhtml.endDocument();
        } catch (BadLocationException e) {
            throw new TikaException("Error parsing an RTF document", e);
        }
    }
View Full Code Here

            throws IOException, SAXException, TikaException {
        if (metadata.get(Metadata.CONTENT_TYPE) == null) {
            metadata.set(Metadata.CONTENT_TYPE, "application/xml");
        }

        final XHTMLContentHandler xhtml =
            new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.startElement("p");

        getSAXParser(context).parse(
                new CloseShieldInputStream(stream),
                new OfflineContentHandler(
                        getContentHandler(handler, metadata)));

        xhtml.endElement("p");
        xhtml.endDocument();
    }
View Full Code Here

            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        metadata.set(Metadata.CONTENT_TYPE, "audio/mpeg");

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        byte[] tag = getSuffix(stream, 128);
        if (tag.length == 128
                && tag[0] == 'T' && tag[1] == 'A' && tag[2] == 'G') {
            String title = getString(tag, 3, 33);
            String artist = getString(tag, 33, 63);
            String album = getString(tag, 63, 93);
            String year = getString(tag, 93, 97);
            String comment = getString(tag, 97, 127);
            int genre = (int) tag[127] & 0xff; // unsigned byte

            metadata.set(Metadata.TITLE, title);
            metadata.set(Metadata.AUTHOR, artist);

            xhtml.element("h1", title);
            xhtml.element("p", artist);
            // ID3v1.1 Track addition
            // If the last two bytes of the comment field are zero and
            // non-zero, then the last byte is the track number
            if (tag[125] == 0 && tag[126] != 0) {
                int track = (int) tag[126] & 0xff;
                xhtml.element("p", album + ", track " + track);
            } else {
                xhtml.element("p", album);
            }
            xhtml.element("p", year);
            xhtml.element("p", comment);
            xhtml.element("p", GENRES[Math.min(genre, GENRES.length - 1)]);
        }

        xhtml.endDocument();
    }
View Full Code Here

TOP

Related Classes of org.apache.tika.sax.XHTMLContentHandler

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.