Package com.ibm.icu.text

Examples of com.ibm.icu.text.CharsetDetector.detect()


        ltrStrBuf = ltrStrBuf.reverse();
        byte [] bytes = ltrStrBuf.toString().getBytes("IBM424");
       
        CharsetDetector det = new CharsetDetector();
        det.setText(bytes);
        CharsetMatch m = det.detect();
        return m;
    }
}
View Full Code Here


        CharsetDetector detector = new CharsetDetector();
        if (encoding != null) {
            detector.setDeclaredEncoding(encoding);
        }
        detector.setText(in);
        CharsetMatch found = detector.detect();
        result = found.getName();
        LOG.debug("Encoding: " + result);
        return result;
    }
View Full Code Here

            stream = new BufferedInputStream(stream);
        }
   
        detector.setText(stream);
   
        CharsetMatch match = detector.detect();
        if (match == null) {
            throw new TikaException("Unable to detect character encoding");
        }
       
        metadata.set(Metadata.CONTENT_ENCODING, match.getName());
View Full Code Here

   *             occurs when the detection fails.
   */
  public static String detectEncoding(byte[] data, String defaultEncoding) throws IOException {
    CharsetDetector detector = new CharsetDetector();
    detector.setText(data);
    CharsetMatch cm = detector.detect();
    String estimatedEncoding = cm.getName();
    boolean isReliable = Charset.isSupported(estimatedEncoding) && cm.getConfidence() >= MINIMAL_CONFIDENCE_LEVEL;
    return isReliable ? estimatedEncoding : defaultEncoding;
  }
}
View Full Code Here

      //encoding detection
    try {
      BufferedInputStream bis = new BufferedInputStream(new FileInputStream(file));
      CharsetDetector cd = new CharsetDetector();
      cd.setText(bis);
      CharsetMatch cm = cd.detect();
      if (cm != null) {
        format += "; charset=" + cm.getName();
      }
    } catch (IOException e) {
      log.error("Error detecting charset for '{}': {}", fileName, e.getMessage());
View Full Code Here

  public static class FallbackEncodingDetector {
    public Charset detectEncoding(byte[] input) {
      // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
      CharsetDetector detector = new CharsetDetector();
      detector.setText(input);
      CharsetMatch match = detector.detect();
      return Charset.forName(match.getName().toUpperCase());
    }
  }

  /**
 
View Full Code Here

        // encoding detection
        // FIXME: is this required?
        try (BufferedInputStream bis = new BufferedInputStream(openStream(file))) {
            CharsetDetector cd = new CharsetDetector();
            cd.setText(bis);
            CharsetMatch cm = cd.detect();
            if (cm != null) {
                log.trace("Detected charset {} in {}", cm.getName(), file);
                format += "; charset=" + cm.getName();
            }
            bis.close();
View Full Code Here

    }

    // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
    CharsetDetector detector = new CharsetDetector();
    detector.setText(input);
    CharsetMatch match = detector.detect();
    return match.getName().toUpperCase();
  }

  /**
   * A pretty good test that something is UTF-8. There are many sequences that will pass here that
View Full Code Here

    public ICSVFetcherResult read(InputStream stream) throws Exception {

        CharsetDetector detector = new CharsetDetector();
        detector.setText(new BufferedInputStream(stream));

        CSVReader reader = new CSVReader(detector.detect().getReader(), separator);

        String[] keys = reader.readNext();
        String[] nextLine;

        callback.onStart();
View Full Code Here

  public static class FallbackEncodingDetector {
    public Charset detectEncoding(byte[] input) {
      // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
      CharsetDetector detector = new CharsetDetector();
      detector.setText(input);
      CharsetMatch match = detector.detect();
      return Charset.forName(match.getName().toUpperCase());
    }
  }

  /**
 
View Full Code Here

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.