Package utils

Source Code of utils.FileUtil

/**
* Yobi, Project Hosting SW
*
* Copyright 2012 NAVER Corp.
* http://yobi.io
*
* @Author Ahn Hyeok Jun
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package utils;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.mozilla.universalchardet.UniversalDetector;

import java.io.*;
import java.nio.charset.Charset;

public class FileUtil {

    public static final int MAX_SIZE_FOR_BINARY_DETECTION = 512;

    public static void rm_rf(File file){
        if(file.isDirectory()){
            File[] list = file.listFiles();
            for (File f : list) {
                rm_rf(f);
            }
            file.delete();
        } else {
            file.delete();
        }
    }

    static private String or(String a, String b) {
        return StringUtils.isNotEmpty(a) ? a : b;
    }

    /**
     * Detect the charset used for the given byte array.
     *
     * Return "UTF-8" if it fails.
     *
     * We use mozilla's UniversalDetector instead of Tika's CharsetDetector
     * which raises "mark/reset not supported" IOException for FileInputStream.
     *
     * @param bytes - a byte array to be checked
     * @return charset - the charset used
     * @throws IOException
     */
    public static String detectCharset(byte bytes[]) {
        UniversalDetector detector = new UniversalDetector(null);
        int offset = 0;

        do {
            int blockSize = Math.min(4096, bytes.length - offset);
            detector.handleData(bytes, offset, blockSize);
            offset += blockSize;
        } while(offset < bytes.length);
        detector.dataEnd();

        return or(detector.getDetectedCharset(), "UTF-8");
    }

    /**
     * Detect the charset used for the given stream.
     *
     * Return "UTF-8" if it fails.
     *
     * We use mozilla's UniversalDetector instead of Tika's CharsetDetector
     * which raises "mark/reset not supported" IOException for FileInputStream.
     *
     * @param is - an input stream to be checked
     * @return charset - the charset used
     * @throws IOException
     */
    public static String detectCharset(InputStream is) throws IOException {
        UniversalDetector detector = new UniversalDetector(null);
        byte[] buf = new byte[4096];
        int nRead;

        while ((nRead = is.read(buf)) > 0 && !detector.isDone()) {
            detector.handleData(buf, 0, nRead);
        }
        detector.dataEnd();

        return or(detector.getDetectedCharset(), "UTF-8");
    }

    public static MediaType detectMediaType(File file, String name) throws IOException {
        return detectMediaType(new FileInputStream(file), name);
    }

    public static MediaType detectMediaType(byte[] bytes, String name) throws IOException {
        return detectMediaType(new ByteArrayInputStream(bytes), name);
    }

    /**
     * Detects media type of the given resource, by using Apache Tika.
     *
     * This method does following additional tasks besides Tika:
     * 1. Adds a charset parameter for text resources.
     * 2. Fixes Tika's misjudge of media type for ogg videos
     *
     * @param is    the input stream to read the resource
     * @param name  the filename of the resource
     * @return the detected media type which optionally includes a charset parameter
     *         e.g. "text/plain; charset=utf-8"
     * @throws IOException
     */
    public static MediaType detectMediaType(InputStream is, String name)
        throws IOException {
        Metadata meta = new Metadata();
        meta.add(Metadata.RESOURCE_NAME_KEY, name);
        MediaType mediaType = new Tika().getDetector().detect(
                new BufferedInputStream(is), meta);

        if (mediaType.getType().toLowerCase().equals("text")) {
            return new MediaType(mediaType, Charset.forName(FileUtil.detectCharset(is)));
        } else if (mediaType.equals(MediaType.audio("ogg"))
                && FilenameUtils.getExtension(name).toLowerCase().equals("ogv")) {
            // This fixes Tika's misjudge of media type for ogg videos.
            return new MediaType("video", "ogg");
        }

        return mediaType;
    }

    public static String getCharset(MediaType mediaType) {
        return mediaType.hasParameters()
                ? mediaType.getParameters().get("charset") : null;
    }
}
TOP

Related Classes of utils.FileUtil

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.