Package org.apache.tika.extractor

Examples of org.apache.tika.extractor.EmbeddedDocumentExtractor


    private final EmbeddedDocumentExtractor embeddedExtractor;

    public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor extractor) {
        this.extractor = extractor;

        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);

        if (ex==null) {
            embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
        } else {
            embeddedExtractor = ex;
View Full Code Here


    public PackageExtractor(
            ContentHandler handler, Metadata metadata, ParseContext context) {
        this.handler = handler;
        this.metadata = metadata;

        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);

        if (ex==null) {
            this.extractor = new ParsingEmbeddedDocumentExtractor(context);
        } else {
            this.extractor = ex;
View Full Code Here

    public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor extractor, String type) {
        this.extractor = extractor;
        this.type = type;

        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);

        if (ex==null) {
            embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
        } else {
            embeddedExtractor = ex;
View Full Code Here

abstract class AbstractPOIFSExtractor {

    private final EmbeddedDocumentExtractor extractor;

    protected AbstractPOIFSExtractor(ParseContext context) {
        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);

        if (ex==null) {
            this.extractor = new ParsingEmbeddedDocumentExtractor(context);
        } else {
            this.extractor = ex;
View Full Code Here

    private final EmbeddedDocumentExtractor embeddedExtractor;

    public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor extractor) {
        this.extractor = extractor;

        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);

        if (ex==null) {
            embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
        } else {
            embeddedExtractor = ex;
View Full Code Here

  }

  public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, TikaException, SAXException {

    EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class,
        new ParsingEmbeddedDocumentExtractor(context));

    String charsetName = "windows-1252";

    metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
    metadata.set(Metadata.CONTENT_ENCODING, charsetName);

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    InputStreamReader isr = new InputStreamReader(stream, charsetName);
    BufferedReader reader = new BufferedReader(isr);
    try {
      String curLine = reader.readLine();
      int mailItem = 0;
      do {
        if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
          Metadata mailMetadata = new Metadata();
          Queue<String> multiline = new LinkedList<String>();
          mailMetadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
          mailMetadata.set(Metadata.CONTENT_TYPE, "message/rfc822");
          curLine = reader.readLine();

          ByteArrayOutputStream message = new ByteArrayOutputStream(100000);
          do {
            if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
              String latestLine = multiline.poll();
              latestLine += " " + curLine.trim();
              multiline.add(latestLine);
            } else {
              multiline.add(curLine);
            }

            message.write(curLine.getBytes(charsetName));
            message.write(0x0A);
            curLine = reader.readLine();
          } while (curLine != null && !curLine.startsWith(MBOX_RECORD_DIVIDER) && message.size() < MAIL_MAX_SIZE);

          for (String item : multiline) {
            saveHeaderInMetadata(mailMetadata, item);
          }

          ByteArrayInputStream messageStream = new ByteArrayInputStream(message.toByteArray());
          message = null;

          if (extractor.shouldParseEmbedded(mailMetadata)) {
            extractor.parseEmbedded(messageStream, xhtml, mailMetadata, true);
          }

          if (tracking) {
            getTrackingMetadata().put(mailItem++, mailMetadata);
          }
View Full Code Here

  public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {

    // Use the delegate parser to parse the contained document
    EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class,
        new ParsingEmbeddedDocumentExtractor(context));

    metadata.set(Metadata.CONTENT_TYPE, MS_OUTLOOK_PST_MIMETYPE.toString());

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
View Full Code Here

        if (!type.equals(MediaType.OCTET_STREAM)) {
            metadata.set(CONTENT_TYPE, type.toString());
        }

        // Use the delegate parser to parse the contained document
        EmbeddedDocumentExtractor extractor = context.get(
                EmbeddedDocumentExtractor.class,
                new ParsingEmbeddedDocumentExtractor(context));

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
View Full Code Here

                }
                entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
            }

            // Use the delegate parser to parse the compressed document
            EmbeddedDocumentExtractor extractor = context.get(
                    EmbeddedDocumentExtractor.class,
                    new ParsingEmbeddedDocumentExtractor(context));
            if (extractor.shouldParseEmbedded(entrydata)) {
                extractor.parseEmbedded(cis, xhtml, entrydata, true);
            }
        } finally {
            cis.close();
        }
View Full Code Here

     * @throws IOException
     * @throws SAXException
     * @throws TikaException
     */
    protected void handleCompletedObject() throws IOException, SAXException, TikaException {
       EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class);
      
       if (embeddedExtractor == null) {
           embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
       }
      
View Full Code Here

TOP

Related Classes of org.apache.tika.extractor.EmbeddedDocumentExtractor

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.