Examples of org.apache.tika.io.TikaInputStream

org.apache.tika.io.TikaInputStream
Input stream with extended capabilities. The purpose of this class is to allow files and other resources and information to be associated with the {@link InputStream} instance passed through the {@link org.apache.tika.parser.Parser} interface and other similar APIs.
TikaInputStream instances can be created using the various static get() factory methods. Most of these methods take an optional {@link Metadata} argument that is then filled with the available input metadata from the given resource. The created TikaInputStream instance keeps track of the original resource used to create it, while behaving otherwise just like a normal, buffered {@link InputStream}. A TikaInputStream instance is also guaranteed to support the {@link #mark(int)} feature.
Code that wants to access the underlying file or other resources associated with a TikaInputStream should first use the {@link #get(InputStream)} factory method to cast or wrap a given {@link InputStream} into a TikaInputStream instance. @since Apache Tika 0.8

  @Override
  public void processBean(final CRResolvableBean bean) throws CRException {
    if (this.contentAttributeField != null) {
      Object obj = bean.get(this.contentAttributeField);
      if (obj != null) {
        TikaInputStream inputStream = null;
        if (obj instanceof byte[]) {
          inputStream = TikaInputStream.get((byte[]) obj);
        } else {
          throw new IllegalArgumentException("Parameter must be instance of byte[]");
        }


        ContentHandler textHandler = new BodyContentHandler(fileLengthLimit);
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();


        try {
          metadata.set(Metadata.CONTENT_TYPE, tika.detect(inputStream));


          parser.parse(inputStream, textHandler, metadata, context);


          bean.set(headingField, metadata.get(TikaCoreProperties.TITLE));


          if (bean.get(createTimestampField) == null) {
            bean.set(createTimestampField, metadata.get(TikaCoreProperties.CREATED));
          }
          if (bean.get(editTimestampField) == null) {
            bean.set(editTimestampField, metadata.get(TikaCoreProperties.MODIFIED));
          }
          if (bean.get(keywordsField) == null) {
            bean.set(keywordsField, metadata.get(TikaCoreProperties.KEYWORDS));
          }
          if (bean.get(publishTimestampField) == null) {
            bean.set(publishTimestampField, metadata.get(TikaCoreProperties.PRINT_DATE));
          }
          if (bean.get(mimetypeField) == null) {
            //HttpHeaders.CONTENT_TYPE
            bean.set(mimetypeField, metadata.get(Metadata.CONTENT_TYPE));
          }


          String content = prepareContent(bean, textHandler);
          bean.set(this.targetAttributeField, content);


        } catch (IOException e) {
          LOGGER.error("Error reading inputstream from bean: " + bean.getContentid(), e);
        } catch (SAXException e) {
          LOGGER.error("Sax Parser Exception while reading inputstream from bean: " + bean.getContentid(), e);
        } catch (TikaException e) {
          LOGGER.error("Tika Parser Exception while reading inputstream  from bean: " + bean.getContentid(), e);
        } catch (Exception e) {
          LOGGER.error("Exception occured while indexing file at bean: " + bean.getContentid(), e);
        } finally {
          try {
            if (inputStream != null) {
              inputStream.close();
            }
          } catch (IOException e) {
            LOGGER.error("Could not close inputstream of bean: " + bean.getContentid(), e);
          }
        }

View Full Code Here

               attributes.addAttribute("", "class", "class", "CDATA", "embedded");
               attributes.addAttribute("", "id", "id", "CDATA", objID);
               xhtml.startElement("div", attributes);
               xhtml.endElement("div");


               TikaInputStream stream =
                    TikaInputStream.get(data.getData());
               try {
                  String mediaType = null;
                  if ("Excel.Chart.8".equals(oleShape.getProgID())) {
                     mediaType = "application/vnd.ms-excel";
                  }
                  handleEmbeddedResource(
                        stream, objID, objID,
                        mediaType, xhtml, false);
               } finally {
                  stream.close();
               }
            }
         }
      }
   }

View Full Code Here

        }


        // If this is a TikaInputStream wrapping an already
        // parsed NPOIFileSystem/DirectoryNode, just get the
        // names from the root:
        TikaInputStream tis = TikaInputStream.cast(input);
        Set<String> names = null;
        if (tis != null) {
            Object container = tis.getOpenContainer();
            if (container instanceof NPOIFSFileSystem) {
                names = getTopLevelNames(((NPOIFSFileSystem) container).getRoot());
            } else if (container instanceof DirectoryNode) {
                names = getTopLevelNames((DirectoryNode) container);
            }
        }


        if (names == null) {
            // Check if the document starts with the OLE header
            input.mark(8);
            try {
                if (input.read() != 0xd0 || input.read() != 0xcf
                    || input.read() != 0x11 || input.read() != 0xe0
                    || input.read() != 0xa1 || input.read() != 0xb1
                    || input.read() != 0x1a || input.read() != 0xe1) {
                    return MediaType.OCTET_STREAM;
                }
            } finally {
                input.reset();
            }
        }


        // We can only detect the exact type when given a TikaInputStream
        if (names == null && tis != null) {
            // Look for known top level entry names to detect the document type
            names = getTopLevelNames(tis);
        }
        
        // Detect based on the names (as available)
        if (tis != null && 
            tis.getOpenContainer() != null && 
            tis.getOpenContainer() instanceof NPOIFSFileSystem) {
            return detect(names, ((NPOIFSFileSystem)tis.getOpenContainer()).getRoot());
        } else {
            return detect(names, null);
        }
    }

View Full Code Here

            throws IOException, SAXException, TikaException {
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();


        final DirectoryNode root;
        TikaInputStream tstream = TikaInputStream.cast(stream);
        if (tstream == null) {
            root = new NPOIFSFileSystem(new CloseShieldInputStream(stream)).getRoot();
        } else {
            final Object container = tstream.getOpenContainer();
            if (container instanceof NPOIFSFileSystem) {
                root = ((NPOIFSFileSystem) container).getRoot();
            } else if (container instanceof DirectoryNode) {
                root = (DirectoryNode) container;
            } else if (tstream.hasFile()) {
                root = new NPOIFSFileSystem(tstream.getFileChannel()).getRoot();
            } else {
                root = new NPOIFSFileSystem(new CloseShieldInputStream(tstream)).getRoot();
            }
        }
        parse(root, context, metadata, xhtml);

View Full Code Here

            metadata.set(Metadata.RESOURCE_NAME_KEY, ent.getKey());
            metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
            metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));


            if (embeddedExtractor.shouldParseEmbedded(metadata)) {
                TikaInputStream stream = TikaInputStream.get(file.createInputStream());
                try {
                    embeddedExtractor.parseEmbedded(
                            stream,
                            new EmbeddedContentHandler(handler),
                            metadata, false);
                } finally {
                    stream.close();
                }
            }
        }
    }

View Full Code Here


            try {
                os = new FileOutputStream(outputFile);


                if (inputStream instanceof TikaInputStream) {
                    TikaInputStream tin = (TikaInputStream) inputStream;


                    if (tin.getOpenContainer() != null && tin.getOpenContainer() instanceof DirectoryEntry) {
                        POIFSFileSystem fs = new POIFSFileSystem();
                        copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot());
                        fs.writeFilesystem(os);
                    } else {
                        IOUtils.copy(inputStream, os);
                    }
                } else {

View Full Code Here

       xhtml.endElement("img");


       // Have we already output this one?
       // (Only expose each individual image once) 
       if(! pictures.hasOutput(picture)) {
          TikaInputStream stream = TikaInputStream.get(picture.getContent());
          handleEmbeddedResource(stream, filename, null, mimeType, xhtml, false);
          pictures.recordOutput(picture);
       }
    }

View Full Code Here

        PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
        try {
            // PDFBox can process entirely in memory, or can use a temp file
            //  for unpacked / processed resources
            // Decide which to do based on if we're reading from a file or not already
            TikaInputStream tstream = TikaInputStream.cast(stream);
            if (tstream != null && tstream.hasFile()) {
                // File based, take that as a cue to use a temporary file
                RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw");
                if (localConfig.getUseNonSequentialParser() == true){
                    pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), scratchFile);
                } else {

View Full Code Here

            return MediaType.OCTET_STREAM;
        }


        TemporaryResources tmp = new TemporaryResources();
        try {
            TikaInputStream tis = TikaInputStream.get(input, tmp);


            byte[] prefix = new byte[1024]; // enough for all known formats
            int length = tis.peek(prefix);


            MediaType type = detectArchiveFormat(prefix, length);
            if (PackageParser.isZipArchive(type)
                    && TikaInputStream.isTikaInputStream(input)) {
                return detectZipFormat(tis);

View Full Code Here


    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        TikaInputStream tis = TikaInputStream.cast(stream);
        
        // Until PDFBOX-1749 is fixed, if we can, use AWT to verify
        //  that the file is valid (otherwise FontBox could hang)
        // See TIKA-1182 for details
        if (tis != null) {
            try {
                if (tis.hasFile()) {
                    Font.createFont(Font.TRUETYPE_FONT, tis.getFile());
                } else {
                    tis.mark(0);
                    Font.createFont(Font.TRUETYPE_FONT, stream);
                    tis.reset();
                }
            } catch (FontFormatException ex) {
                throw new TikaException("Bad TrueType font.");
            }
        }
        
        // Ask FontBox to parse the file for us
        TrueTypeFont font;
        TTFParser parser = new TTFParser();
        if (tis != null && tis.hasFile()) {
            font = parser.parseTTF(tis.getFile());
        } else {
            font = parser.parseTTF(stream);
        }


        // Report the details of the font

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.tika.io.TikaInputStream

com.cloudera.cdk.morphline.tika.decompress.EmbeddedExtractor

com.cloudera.cdk.morphline.tika.decompress.UnpackBuilder$Unpack

com.gentics.cr.lucene.indexer.transformer.tika.TikaParserTransformer

org.apache.tika.cli.TikaCLI$FileEmbeddedDocumentExtractor

org.apache.tika.detect.POIFSContainerDetector

org.apache.tika.detect.TestContainerAwareDetector

org.apache.tika.embedder.ExternalEmbedder

org.apache.tika.extractor.ParserContainerExtractor$RecursiveParser

org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor

org.apache.tika.gui.TikaGUI

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.