Package org.apache.tika.io

Examples of org.apache.tika.io.TikaInputStream


  @Override
  public void processBean(final CRResolvableBean bean) throws CRException {
    if (this.contentAttributeField != null) {
      Object obj = bean.get(this.contentAttributeField);
      if (obj != null) {
        TikaInputStream inputStream = null;
        if (obj instanceof byte[]) {
          inputStream = TikaInputStream.get((byte[]) obj);
        } else {
          throw new IllegalArgumentException("Parameter must be instance of byte[]");
        }

        ContentHandler textHandler = new BodyContentHandler(fileLengthLimit);
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();

        try {
          metadata.set(Metadata.CONTENT_TYPE, tika.detect(inputStream));

          parser.parse(inputStream, textHandler, metadata, context);

          bean.set(headingField, metadata.get(TikaCoreProperties.TITLE));

          if (bean.get(createTimestampField) == null) {
            bean.set(createTimestampField, metadata.get(TikaCoreProperties.CREATED));
          }
          if (bean.get(editTimestampField) == null) {
            bean.set(editTimestampField, metadata.get(TikaCoreProperties.MODIFIED));
          }
          if (bean.get(keywordsField) == null) {
            bean.set(keywordsField, metadata.get(TikaCoreProperties.KEYWORDS));
          }
          if (bean.get(publishTimestampField) == null) {
            bean.set(publishTimestampField, metadata.get(TikaCoreProperties.PRINT_DATE));
          }
          if (bean.get(mimetypeField) == null) {
            //HttpHeaders.CONTENT_TYPE
            bean.set(mimetypeField, metadata.get(Metadata.CONTENT_TYPE));
          }

          String content = prepareContent(bean, textHandler);
          bean.set(this.targetAttributeField, content);

        } catch (IOException e) {
          LOGGER.error("Error reading inputstream from bean: " + bean.getContentid(), e);
        } catch (SAXException e) {
          LOGGER.error("Sax Parser Exception while reading inputstream from bean: " + bean.getContentid(), e);
        } catch (TikaException e) {
          LOGGER.error("Tika Parser Exception while reading inputstream  from bean: " + bean.getContentid(), e);
        } catch (Exception e) {
          LOGGER.error("Exception occured while indexing file at bean: " + bean.getContentid(), e);
        } finally {
          try {
            if (inputStream != null) {
              inputStream.close();
            }
          } catch (IOException e) {
            LOGGER.error("Could not close inputstream of bean: " + bean.getContentid(), e);
          }
        }
View Full Code Here


               attributes.addAttribute("", "class", "class", "CDATA", "embedded");
               attributes.addAttribute("", "id", "id", "CDATA", objID);
               xhtml.startElement("div", attributes);
               xhtml.endElement("div");

               TikaInputStream stream =
                    TikaInputStream.get(data.getData());
               try {
                  String mediaType = null;
                  if ("Excel.Chart.8".equals(oleShape.getProgID())) {
                     mediaType = "application/vnd.ms-excel";
                  }
                  handleEmbeddedResource(
                        stream, objID, objID,
                        mediaType, xhtml, false);
               } finally {
                  stream.close();
               }
            }
         }
      }
   }
View Full Code Here

        }

        // If this is a TikaInputStream wrapping an already
        // parsed NPOIFileSystem/DirectoryNode, just get the
        // names from the root:
        TikaInputStream tis = TikaInputStream.cast(input);
        Set<String> names = null;
        if (tis != null) {
            Object container = tis.getOpenContainer();
            if (container instanceof NPOIFSFileSystem) {
                names = getTopLevelNames(((NPOIFSFileSystem) container).getRoot());
            } else if (container instanceof DirectoryNode) {
                names = getTopLevelNames((DirectoryNode) container);
            }
        }

        if (names == null) {
            // Check if the document starts with the OLE header
            input.mark(8);
            try {
                if (input.read() != 0xd0 || input.read() != 0xcf
                    || input.read() != 0x11 || input.read() != 0xe0
                    || input.read() != 0xa1 || input.read() != 0xb1
                    || input.read() != 0x1a || input.read() != 0xe1) {
                    return MediaType.OCTET_STREAM;
                }
            } finally {
                input.reset();
            }
        }

        // We can only detect the exact type when given a TikaInputStream
        if (names == null && tis != null) {
            // Look for known top level entry names to detect the document type
            names = getTopLevelNames(tis);
        }
       
        // Detect based on the names (as available)
        if (tis != null &&
            tis.getOpenContainer() != null &&
            tis.getOpenContainer() instanceof NPOIFSFileSystem) {
            return detect(names, ((NPOIFSFileSystem)tis.getOpenContainer()).getRoot());
        } else {
            return detect(names, null);
        }
    }
View Full Code Here

            throws IOException, SAXException, TikaException {
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        final DirectoryNode root;
        TikaInputStream tstream = TikaInputStream.cast(stream);
        if (tstream == null) {
            root = new NPOIFSFileSystem(new CloseShieldInputStream(stream)).getRoot();
        } else {
            final Object container = tstream.getOpenContainer();
            if (container instanceof NPOIFSFileSystem) {
                root = ((NPOIFSFileSystem) container).getRoot();
            } else if (container instanceof DirectoryNode) {
                root = (DirectoryNode) container;
            } else if (tstream.hasFile()) {
                root = new NPOIFSFileSystem(tstream.getFileChannel()).getRoot();
            } else {
                root = new NPOIFSFileSystem(new CloseShieldInputStream(tstream)).getRoot();
            }
        }
        parse(root, context, metadata, xhtml);
View Full Code Here

            metadata.set(Metadata.RESOURCE_NAME_KEY, ent.getKey());
            metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
            metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));

            if (embeddedExtractor.shouldParseEmbedded(metadata)) {
                TikaInputStream stream = TikaInputStream.get(file.createInputStream());
                try {
                    embeddedExtractor.parseEmbedded(
                            stream,
                            new EmbeddedContentHandler(handler),
                            metadata, false);
                } finally {
                    stream.close();
                }
            }
        }
    }
View Full Code Here

            try {
                os = new FileOutputStream(outputFile);

                if (inputStream instanceof TikaInputStream) {
                    TikaInputStream tin = (TikaInputStream) inputStream;

                    if (tin.getOpenContainer() != null && tin.getOpenContainer() instanceof DirectoryEntry) {
                        POIFSFileSystem fs = new POIFSFileSystem();
                        copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot());
                        fs.writeFilesystem(os);
                    } else {
                        IOUtils.copy(inputStream, os);
                    }
                } else {
View Full Code Here

       xhtml.endElement("img");

       // Have we already output this one?
       // (Only expose each individual image once)
       if(! pictures.hasOutput(picture)) {
          TikaInputStream stream = TikaInputStream.get(picture.getContent());
          handleEmbeddedResource(stream, filename, null, mimeType, xhtml, false);
          pictures.recordOutput(picture);
       }
    }
View Full Code Here

        PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
        try {
            // PDFBox can process entirely in memory, or can use a temp file
            //  for unpacked / processed resources
            // Decide which to do based on if we're reading from a file or not already
            TikaInputStream tstream = TikaInputStream.cast(stream);
            if (tstream != null && tstream.hasFile()) {
                // File based, take that as a cue to use a temporary file
                RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw");
                if (localConfig.getUseNonSequentialParser() == true){
                    pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), scratchFile);
                } else {
View Full Code Here

            return MediaType.OCTET_STREAM;
        }

        TemporaryResources tmp = new TemporaryResources();
        try {
            TikaInputStream tis = TikaInputStream.get(input, tmp);

            byte[] prefix = new byte[1024]; // enough for all known formats
            int length = tis.peek(prefix);

            MediaType type = detectArchiveFormat(prefix, length);
            if (PackageParser.isZipArchive(type)
                    && TikaInputStream.isTikaInputStream(input)) {
                return detectZipFormat(tis);
View Full Code Here

    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        TikaInputStream tis = TikaInputStream.cast(stream);
       
        // Until PDFBOX-1749 is fixed, if we can, use AWT to verify
        //  that the file is valid (otherwise FontBox could hang)
        // See TIKA-1182 for details
        if (tis != null) {
            try {
                if (tis.hasFile()) {
                    Font.createFont(Font.TRUETYPE_FONT, tis.getFile());
                } else {
                    tis.mark(0);
                    Font.createFont(Font.TRUETYPE_FONT, stream);
                    tis.reset();
                }
            } catch (FontFormatException ex) {
                throw new TikaException("Bad TrueType font.");
            }
        }
       
        // Ask FontBox to parse the file for us
        TrueTypeFont font;
        TTFParser parser = new TTFParser();
        if (tis != null && tis.hasFile()) {
            font = parser.parseTTF(tis.getFile());
        } else {
            font = parser.parseTTF(stream);
        }

        // Report the details of the font
View Full Code Here

TOP

Related Classes of org.apache.tika.io.TikaInputStream

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle Inc. Contact coftware#gmail.com.