Package edu.isi.karma.util

Source Code of edu.isi.karma.util.EncodingDetector

package edu.isi.karma.util;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.BOMInputStream;
import org.mozilla.universalchardet.UniversalDetector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class EncodingDetector {
   
  private static Logger logger = LoggerFactory.getLogger(EncodingDetector.class);
    public final static String DEFAULT_ENCODING = StandardCharsets.UTF_8.name();

    public static String detect(InputStream is) throws IOException {

        byte[] buf = new byte[4096];
        // (1)
        UniversalDetector detector = new UniversalDetector(null);

        // (2)
        int nread;
        while ((nread = is.read(buf)) > 0 && !detector.isDone()) {
            detector.handleData(buf, 0, nread);
        }
        // (3)
        detector.dataEnd();

        // (4)
        String encoding = detector.getDetectedCharset();
        if (encoding != null) {
            logger.debug("Detected encoding = " + encoding);
        } else {
            logger.debug("No encoding detected, using default: " + DEFAULT_ENCODING);
            encoding = DEFAULT_ENCODING;
        }

        // (5)
        detector.reset();
       
        return encoding;
    }
   
    public static String detect(File file) {
        try {
          FileInputStream fis = new FileInputStream(file);
         
          String encoding = EncodingDetector.detect(fis);
         
          logger.info("Detected encoding for file: " + file.getName() + ": " + encoding);
          if (encoding == null) {
              encoding = DEFAULT_ENCODING;
          }
          return encoding;
        } catch(Exception e) {
          logger.debug("Exception detecting encoding, using default: " + DEFAULT_ENCODING);
        }
        return DEFAULT_ENCODING;
    }

    public static InputStreamReader getInputStreamReader(InputStream is, String encoding) throws IOException {
       
        logger.info("Reading stream: using encoding: " + encoding);
        BOMInputStream bis = new BOMInputStream(is); //So that we can remove the BOM
        return new InputStreamReader(bis, encoding);
    }
    public static InputStreamReader getInputStreamReader(File file, String encoding) throws IOException {
       
        FileInputStream fis = new FileInputStream(file);
        logger.info("Reading file: " + file + " using encoding: " + encoding);
        BOMInputStream bis = new BOMInputStream(fis); //So that we can remove the BOM
        return new InputStreamReader(bis, encoding);
    }
   
    public static String getString(File file, String encoding) throws IOException {
        StringWriter sw = new StringWriter();
       
        FileInputStream fis = new FileInputStream(file);
        logger.info("Reading file: " + file + " using encoding: " + encoding);
        IOUtils.copy(fis, sw, encoding);

        return sw.toString();
    }
}
TOP

Related Classes of edu.isi.karma.util.EncodingDetector

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.