Examples of XHTMLContentHandler

org.apache.tika.sax.XHTMLContentHandler
Content handler decorator that simplifies the task of producing XHTML events for Tika content parsers.

Examples of org.apache.tika.sax.XHTMLContentHandler

        BufferedReader reader = new BufferedReader(isr);


        metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
        metadata.set(Metadata.CONTENT_ENCODING, "us-ascii");


        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();


        ParseStates parseState = ParseStates.START;
        String multiLine = null;
        boolean inQuote = false;
        int numEmails = 0;


        // We're going to scan, line-by-line, for a line that starts with
        // "From "
        for (String curLine = reader.readLine(); curLine != null; curLine = reader.readLine()) {
            boolean newMessage = curLine.startsWith(MBOX_RECORD_DIVIDER);
            if (newMessage) {
                numEmails += 1;
            }


            switch (parseState) {
            case START:
                if (newMessage) {
                    parseState = ParseStates.IN_HEADER;
                    newMessage = false;
                    // Fall through to IN_HEADER
                } else {
                    break;
                }


            case IN_HEADER:
                if (newMessage) {
                    saveHeaderInMetadata(numEmails, metadata, multiLine);
                    multiLine = curLine;
                } else if (curLine.length() == 0) {
                    // Blank line is signal that we're transitioning to the content.
                    saveHeaderInMetadata(numEmails, metadata, multiLine);
                    parseState = ParseStates.IN_CONTENT;


                    // Mimic what PackageParser does between entries.
                    xhtml.startElement("div", "class", "email-entry");
                    xhtml.startElement("p");
                    inQuote = false;
                } else if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
                    multiLine += " " + curLine.trim();
                } else {
                    saveHeaderInMetadata(numEmails, metadata, multiLine);
                    multiLine = curLine;
                }


                break;


                // TODO - use real email parsing support so we can correctly handle
                // things like multipart messages and quoted-printable encoding.
                // We'd also want this for charset handling, where content isn't 7-bit
                // ascii.
            case IN_CONTENT:
                if (newMessage) {
                    endMessage(xhtml, inQuote);
                    parseState = ParseStates.IN_HEADER;
                    multiLine = curLine;
                } else {
                    boolean quoted = curLine.startsWith(">");
                    if (inQuote) {
                        if (!quoted) {
                            xhtml.endElement("q");
                            inQuote = false;
                        }
                    } else if (quoted) {
                        xhtml.startElement("q");
                        inQuote = true;
                    }


                    xhtml.characters(curLine);


                    // For plain text email, each line is a real break position.
                    xhtml.element("br", "");
                }
            }
        }


        if (parseState == ParseStates.IN_HEADER) {
            saveHeaderInMetadata(numEmails, metadata, multiLine);
        } else if (parseState == ParseStates.IN_CONTENT) {
            endMessage(xhtml, inQuote);
        }


        xhtml.endDocument();
    }

View Full Code Here

Examples of org.apache.tika.sax.XHTMLContentHandler


    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws SAXException {
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.endDocument();
    }

View Full Code Here

Examples of org.apache.tika.sax.XHTMLContentHandler

        extractContent(xmlDoc, Metadata.FORMAT, "//dc:format", metadata);
        extractContent(xmlDoc, Metadata.IDENTIFIER, "//dc:identifier", metadata);
        extractContent(xmlDoc, Metadata.LANGUAGE, "//dc:language", metadata);
        extractContent(xmlDoc, Metadata.RIGHTS, "//dc:rights", metadata);


        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.startElement("p");
        concatOccurrence(xmlDoc, "//*", " ", new AppendableAdaptor(xhtml));
        xhtml.endElement("p");
        xhtml.endDocument();
    }

View Full Code Here

Examples of org.apache.tika.sax.XHTMLContentHandler


        metadata.set(Metadata.CONTENT_TYPE, "video/x-flv");
        metadata.set("hasVideo", Boolean.toString((typeFlags & MASK_VIDEO) != 0));
        metadata.set("hasAudio", Boolean.toString((typeFlags & MASK_AUDIO) != 0));


        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();


        // flv tag stream follows...
        while (true) {
            int type = datainput.read();
            if (type == -1) {
                // EOF
                break;
            }


            int datalen = readUInt24(datainput); //body length
            readUInt32(datainput); // timestamp
            readUInt24(datainput); // streamid


            if (type == TYPE_METADATA) {
                // found metadata Tag, read content to buffer
                byte[] metaBytes = new byte[datalen];
                for (int readCount = 0; readCount < datalen;) {
                    int r = stream.read(metaBytes, readCount, datalen - readCount);
                    if(r!=-1) {
                        readCount += r;


                    } else {
                        break;
                    }
                }


                ByteArrayInputStream is = new ByteArrayInputStream(metaBytes);


                DataInputStream dis = new DataInputStream(is);


                Object data = null;


                for (int i = 0; i < 2; i++) {
                    data = readAMFData(dis, -1);
                }


                if (data instanceof Map) {
                    // TODO if there are multiple metadata values with the same key (in
                    // separate AMF blocks), we currently lose the previous values
                    Map<String, Object> extractedMetadata = (Map<String, Object>) data;
                    for (Entry<String, Object> entry : extractedMetadata.entrySet()) {
                        if (entry.getValue() == null) {
                            continue;
                        }
                        metadata.set(entry.getKey(), entry.getValue().toString());
                    }
                }


            } else {
                // Tag was not metadata, skip over data we cannot handle
                for (int i = 0; i < datalen; i++) {
                    datainput.readByte();
                }
            }


            sizePrev = readUInt32(datainput); // previous block size
            if (sizePrev != datalen + 11) {
                // file was corrupt or we could not parse it...
                break;
            }
        }


        xhtml.endDocument();
    }

View Full Code Here

Examples of org.apache.tika.sax.XHTMLContentHandler

     */
    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();


        final DirectoryNode root;
        TikaInputStream tstream = TikaInputStream.cast(stream);
        if (tstream == null) {
            root = new NPOIFSFileSystem(new CloseShieldInputStream(stream)).getRoot();
        } else {
            final Object container = tstream.getOpenContainer();
            if (container instanceof NPOIFSFileSystem) {
                root = ((NPOIFSFileSystem) container).getRoot();
            } else if (container instanceof DirectoryNode) {
                root = (DirectoryNode) container;
            } else if (tstream.hasFile()) {
                root = new NPOIFSFileSystem(tstream.getFileChannel()).getRoot();
            } else {
                root = new NPOIFSFileSystem(new CloseShieldInputStream(tstream)).getRoot();
            }
        }
        parse(root, context, metadata, xhtml);
        xhtml.endDocument();
    }

View Full Code Here

Examples of org.apache.tika.sax.XHTMLContentHandler

        //source of config (derives from context or PDFParser?) is
        //already determined in PDFParser.  No need to check context here.
        this.config = config;
        this.originalHandler = handler;
        this.context = context;
        this.handler = new XHTMLContentHandler(handler, metadata);
        setForceParsing(true);
        setSortByPosition(config.getSortByPosition());
        if (config.getEnableAutoSpace()) {
            setWordSeparator(" ");
        } else {

View Full Code Here

Examples of org.apache.tika.sax.XHTMLContentHandler


        // metadata
        metadata.set(Metadata.CONTENT_TYPE, "application/vnd.ms-htmlhelp");


        // content
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();


        Iterator<DirectoryListingEntry> it =
                chmExtractor.getChmDirList().getDirectoryListingEntryList().iterator();
        while (it.hasNext()) {
            DirectoryListingEntry entry = it.next();
            if (entry.getName().endsWith(".html") || entry.getName().endsWith(".htm")) {
                xhtml.characters(extract(chmExtractor.extractChmEntry(entry)));
            }
        }


        xhtml.endDocument();
    }

View Full Code Here

Examples of org.apache.tika.sax.XHTMLContentHandler

        metadata.set(TikaCoreProperties.CREATED, font.getHeader().getCreated().getTime());
        metadata.set(
                TikaCoreProperties.MODIFIED,
                font.getHeader().getModified().getTime());


        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.endDocument();
    }

View Full Code Here

Examples of org.apache.tika.sax.XHTMLContentHandler


    private final XHTMLContentHandler handler;


    private PDF2XHTML(ContentHandler handler, Metadata metadata)
            throws IOException {
        this.handler = new XHTMLContentHandler(handler, metadata);
        setForceParsing(true);
        setSortByPosition(true);
    }

View Full Code Here

Examples of org.apache.tika.sax.XHTMLContentHandler

        try {
            File file = TikaInputStream.get(stream).getFile();
            Archive archive = new Archive(file);


            metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
            XHTMLContentHandler xhtml =
                new XHTMLContentHandler(handler, metadata);
            xhtml.startDocument();
            for (FileHeader header : archive.getFileHeaders()) {
                Metadata entrydata = new Metadata();
                entrydata.set(
                        Metadata.RESOURCE_NAME_KEY,
                        header.getFileNameString());
                if (extractor.shouldParseEmbedded(entrydata)) {
                    extractor.parseEmbedded(stream, xhtml, entrydata, true);
                }
            }
            xhtml.endDocument();
        } catch (RarException e) {
            throw new TikaException("Unable to parse a RAR archive", e);
        }
    }

View Full Code Here

0 1 2 3 4 5

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.