Package org.apache.tika.mime

Examples of org.apache.tika.mime.MimeType


   * @param data
   * @param url
   * @return
   */
  private NutchDocument addType(NutchDocument doc, ParseData data, String url) {
    MimeType mimeType = null;
    String contentType = data.getMeta(Response.CONTENT_TYPE);
    if (contentType == null) {
      // Note by Jerome Charron on 20050415:
      // Content Type not solved by a previous plugin
      // Or unable to solve it... Trying to find it
      // Should be better to use the doc content too
      // (using MimeTypes.getMimeType(byte[], String), but I don't know
      // which field it is?
      // if (MAGIC) {
      //   contentType = MIME.getMimeType(url, content);
      // } else {
      //   contentType = MIME.getMimeType(url);
      // }
      mimeType = MIME.getMimeType(url);
    } else {
      mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType));
    }
       
    // Checks if we solved the content-type.
    if (mimeType == null) {
      return doc;
    }

    contentType = mimeType.getName();
   
    doc.add("type", contentType);

    // Check if we need to split the content type in sub parts
    if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {
View Full Code Here


   * @param data
   *          The byte data, returned from the crawl, if any.
   * @return The correctly, automatically guessed {@link MimeType} name.
   */
  public String autoResolveContentType(String typeName, String url, byte[] data) {
    MimeType type = null;
    String cleanedMimeType = null;

    try {
      cleanedMimeType = MimeUtil.cleanMimeType(typeName) != null ? this.mimeTypes
          .forName(MimeUtil.cleanMimeType(typeName)).getName()
          : null;
    } catch (MimeTypeException mte) {
      // Seems to be a malformed mime type name...
    }

    // first try to get the type from the cleaned type name
    try {
      type = cleanedMimeType != null ? this.mimeTypes.forName(cleanedMimeType)
          : null;
    } catch (MimeTypeException e) {
      type = null;
    }

    // if returned null, or if it's the default type then try url resolution
    if (type == null
        || (type != null && type.getName().equals(MimeTypes.OCTET_STREAM))) {
      // If no mime-type header, or cannot find a corresponding registered
      // mime-type, then guess a mime-type from the url pattern
      type = this.mimeTypes.getMimeType(url) != null ? this.mimeTypes
          .getMimeType(url) : type;
    }

    // if magic is enabled use mime magic to guess if the mime type returned
    // from the magic guess is different than the one that's already set so far
    // if it is, and it's not the default mime type, then go with the mime type
    // returned by the magic
    if (this.mimeMagic) {
      MimeType magicType = this.mimeTypes.getMimeType(data);
      if (magicType != null && !magicType.getName().equals(MimeTypes.OCTET_STREAM)
          && !magicType.getName().equals(MimeTypes.PLAIN_TEXT)
          && type != null && !type.getName().equals(magicType.getName())) {
        // If magic enabled and the current mime type differs from that of the
        // one returned from the magic, take the magic mimeType
        type = magicType;
      }

View Full Code Here

            File file = entry.getKey();

            byte[] buf = IOUtils.toByteArray(new FileInputStream(file));
            MediaType mediaType = mimeTypes.
                    detect(new ByteArrayInputStream(buf), new Metadata());
            MimeType mimeType = mimeTypes.forName(mediaType.toString());
            FileBody fb = new FileBody(file, name + mimeType.getExtension(),
                    mimeType.getName(), Consts.UTF_8.name());

            multipartEntity.addPart("files[" + x + "]", fb);
            x++;
        }
View Full Code Here

   * @param data
   *          The byte data, returned from the crawl, if any.
   * @return The correctly, automatically guessed {@link MimeType} name.
   */
  public String autoResolveContentType(String typeName, String url, byte[] data) {
    MimeType type = null;
    String cleanedMimeType = null;

    try {
      cleanedMimeType = MimeUtil.cleanMimeType(typeName) != null ? this.mimeTypes
          .forName(MimeUtil.cleanMimeType(typeName)).getName()
          : null;
    } catch (MimeTypeException mte) {
      // Seems to be a malformed mime type name...
    }

    // first try to get the type from the cleaned type name
    try {
      type = cleanedMimeType != null ? this.mimeTypes.forName(cleanedMimeType)
          : null;
    } catch (MimeTypeException e) {
      type = null;
    }

    // if returned null, or if it's the default type then try url resolution
    if (type == null
        || (type != null && type.getName().equals(MimeTypes.OCTET_STREAM))) {
      // If no mime-type header, or cannot find a corresponding registered
      // mime-type, then guess a mime-type from the url pattern
      type = this.mimeTypes.getMimeType(url) != null ? this.mimeTypes
          .getMimeType(url) : type;
    }

    // if magic is enabled use mime magic to guess if the mime type returned
    // from the magic guess is different than the one that's already set so far
    // if it is, and it's not the default mime type, then go with the mime type
    // returned by the magic
    if (this.mimeMagic) {
      MimeType magicType = this.mimeTypes.getMimeType(data);
      if (magicType != null && !magicType.getName().equals(MimeTypes.OCTET_STREAM)
          && !magicType.getName().equals(MimeTypes.PLAIN_TEXT)
          && type != null && !type.getName().equals(magicType.getName())) {
        // If magic enabled and the current mime type differs from that of the
        // one returned from the magic, take the magic mimeType
        type = magicType;
      }

View Full Code Here

   * @param data
   * @param url
   * @return
   */
  private NutchDocument addType(NutchDocument doc, ParseData data, String url) {
    MimeType mimeType = null;
    String contentType = data.getMeta(Response.CONTENT_TYPE);
    if (contentType == null) {
      // Note by Jerome Charron on 20050415:
      // Content Type not solved by a previous plugin
      // Or unable to solve it... Trying to find it
      // Should be better to use the doc content too
      // (using MimeTypes.getMimeType(byte[], String), but I don't know
      // which field it is?
      // if (MAGIC) {
      //   contentType = MIME.getMimeType(url, content);
      // } else {
      //   contentType = MIME.getMimeType(url);
      // }
      mimeType = MIME.getMimeType(url);
    } else {
      mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType));
    }
       
    // Checks if we solved the content-type.
    if (mimeType == null) {
      return doc;
    }

    contentType = mimeType.getName();
   
    doc.add("type", contentType);

    // Check if we need to split the content type in sub parts
    if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {
View Full Code Here

   * @return The correctly, automatically guessed {@link MimeType} name.
   */
  public String autoResolveContentType(String typeName, String url, byte[] data) {
    String retType = null;
    String magicType = null;
    MimeType type = null;
    String cleanedMimeType = null;

    try {
      cleanedMimeType = MimeUtil.cleanMimeType(typeName) != null ? this.mimeTypes
          .forName(MimeUtil.cleanMimeType(typeName)).getName()
          : null;
    } catch (MimeTypeException mte) {
      // Seems to be a malformed mime type name...
    }

    // first try to get the type from the cleaned type name
    try {
      type = cleanedMimeType != null ? this.mimeTypes.forName(cleanedMimeType)
          : null;
    } catch (MimeTypeException e) {
      type = null;
    }

    // if returned null, or if it's the default type then try url resolution
    if (type == null
        || (type != null && type.getName().equals(MimeTypes.OCTET_STREAM))) {
      // If no mime-type header, or cannot find a corresponding registered
      // mime-type, then guess a mime-type from the url pattern
      type = this.mimeTypes.getMimeType(url) != null ? this.mimeTypes
          .getMimeType(url) : type;
    }

    retType= type.getName();

    // if magic is enabled use mime magic to guess if the mime type returned
    // from the magic guess is different than the one that's already set so far
    // if it is, and it's not the default mime type, then go with the mime type
    // returned by the magic
View Full Code Here

                  
                   // Try to work out what it is
                   MediaType mediaType = getDetector().detect(embedded, new Metadata());
                   String extension = type.getExtension();
                   try {
                      MimeType mimeType = getMimeTypes().forName(mediaType.toString());
                      extension = mimeType.getExtension();
                   } catch(MimeTypeException mte) {
                      // No details on this type are known
                   }
                  
                   // Record what we can do about it
View Full Code Here

   * @return The correctly, automatically guessed {@link MimeType} name.
   */
  public String autoResolveContentType(String typeName, String url, byte[] data) {
    String retType = null;
    String magicType = null;
    MimeType type = null;
    String cleanedMimeType = null;

    try {
      cleanedMimeType = MimeUtil.cleanMimeType(typeName) != null ? this.mimeTypes
          .forName(MimeUtil.cleanMimeType(typeName)).getName()
          : null;
    } catch (MimeTypeException mte) {
      // Seems to be a malformed mime type name...
    }

    // first try to get the type from the cleaned type name
    try {
      type = cleanedMimeType != null ? this.mimeTypes.forName(cleanedMimeType)
          : null;
    } catch (MimeTypeException e) {
      type = null;
    }

    // if returned null, or if it's the default type then try url resolution
    if (type == null
        || (type != null && type.getName().equals(MimeTypes.OCTET_STREAM))) {
      // If no mime-type header, or cannot find a corresponding registered
      // mime-type, then guess a mime-type from the url pattern

      try {
        TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
        Tika tika = new Tika(tikaConfig);
        retType = tika.detect(url) != null ? tika.detect(url) : null;
      } catch (Exception e) {
        String message = "Problem loading default Tika configuration";
        LOG.error(message, e);
        throw new RuntimeException(e);
      }
    } else {
        retType = type.getName();
    }

    // if magic is enabled use mime magic to guess if the mime type returned
    // from the magic guess is different than the one that's already set so far
    // if it is, and it's not the default mime type, then go with the mime type
View Full Code Here

            // using the original content
            if (mimeType == null | forceMTDetection) {
                if (inputDoc.getContent() != null) {
                    Metadata meta = new Metadata();
                    meta.set(Metadata.RESOURCE_NAME_KEY, inputDoc.getUrl());
                    MimeType mimetype = null;
                    try {
                        MediaType mediaType = detector
                                .detect(new ByteArrayInputStream(inputDoc
                                        .getContent()), meta);
                        mimetype = mimetypes.forName(mediaType.getType() + "/"
                                + mediaType.getSubtype());
                    } catch (IOException e) {
                        LOG.error("Exception", e);
                    } catch (MimeTypeException e) {
                        LOG.error("Exception", e);
                    }
                    mt = mimetype.getName();
                } else if (mimeType == null && inputDoc.getText() != null) {
                    // force it to text
                    mt = "text/plain";
                }
            } else {
View Full Code Here

                  
                   // Try to work out what it is
                   MediaType mediaType = getDetector().detect(embedded, new Metadata());
                   String extension = type.getExtension();
                   try {
                      MimeType mimeType = getMimeTypes().forName(mediaType.toString());
                      extension = mimeType.getExtension();
                   } catch(MimeTypeException mte) {
                      // No details on this type are known
                   }
                  
                   // Record what we can do about it
View Full Code Here

TOP

Related Classes of org.apache.tika.mime.MimeType

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.