Package org.mozilla.intl.chardet

Examples of org.mozilla.intl.chardet.nsDetector


   * @return
   * @throws IOException
   */
  public static String getEncoding(final String filename) throws IOException {
    // System.out.println("getEncoding: " + filename);
    final nsDetector det = new nsDetector();

    final BufferedInputStream imp = new BufferedInputStream(new FileInputStream(filename));

    final byte[] buf = new byte[1024];
    int len;
    boolean done = false;
    boolean isAscii = true;

    while ((len = imp.read(buf, 0, buf.length)) != -1) {
      // Check if the stream is only ascii.
      if (isAscii) {
        isAscii = det.isAscii(buf, len);
      }

      // DoIt if non-ascii and not done yet.
      if (!isAscii && !done) {
        done = det.DoIt(buf, len, false);
      }
    }
    det.DataEnd();

    if (isAscii) {
      return "ASCII";
    } else {
      final String prob[] = det.getProbableCharsets();
      if (prob.length == 0) {
        throw new IllegalStateException("cannot determine file encoding for : " + filename);
      }

      for (int i = 0; i < prob.length; i++) {
View Full Code Here


    private boolean found = false;

    public FileCharsetDetector( File detectedFile )
        throws FileNotFoundException, IOException
    {
        nsDetector det = new nsDetector( nsPSMDetector.ALL );

        det.Init( new nsICharsetDetectionObserver()
        {
            public void Notify( String charset )
            {
                FileCharsetDetector.this.charset = charset;
                FileCharsetDetector.this.found = true;
            }
        } );

        FileInputStream fileInputStream = new FileInputStream( detectedFile );
        BufferedInputStream imp = new BufferedInputStream( fileInputStream );
        try
        {

            byte[] buf = new byte[1024];
            int len;
            boolean done = false;
            boolean isAscii = true;

            while ( ( len = imp.read( buf, 0, buf.length ) ) != -1 )
            {
                // Check if the stream is only ascii.
                if ( isAscii )
                {
                    isAscii = det.isAscii( buf, len );
                }

                // DoIt if non-ascii and not done yet.
                if ( !isAscii && !done )
                {
                    done = det.DoIt( buf, len, false );
                    found = done;
                }
            }
            det.DataEnd();

            if ( !isFound() )
            {
                String[] prob = det.getProbableCharsets();

                if ( prob.length > 0 )
                {
                    charset = prob[0];
                }
View Full Code Here


    public FileCharsetDetector( InputStream detectedStream )
        throws FileNotFoundException, IOException
    {
        nsDetector det = new nsDetector( nsPSMDetector.ALL );

        det.Init( new nsICharsetDetectionObserver()
        {
            public void Notify( String charset )
            {
                FileCharsetDetector.this.charset = charset;
                FileCharsetDetector.this.found = true;
            }
        } );

        BufferedInputStream imp = new BufferedInputStream( detectedStream );

        byte[] buf = new byte[1024];
        int len;
        boolean done = false;
        boolean isAscii = true;

        while ( ( len = imp.read( buf, 0, buf.length ) ) != -1 )
        {
            // Check if the stream is only ascii.
            if ( isAscii )
            {
                isAscii = det.isAscii( buf, len );
            }

            // DoIt if non-ascii and not done yet.
            if ( !isAscii && !done )
            {
                done = det.DoIt( buf, len, false );
                found = done;
            }
        }
        det.DataEnd();

        if ( !isFound() )
        {
            String[] prob = det.getProbableCharsets();

            if ( prob.length > 0 )
            {
                charset = prob[0];
            }
View Full Code Here

    } else if (hasUtf1BOM(buf, len)) {
      charset = UTF1;
      buffered.write(buf, 3, len - 3);
    } else {
      // Use jchardet which tries a variety of heuristics to choose an encoding.
      nsDetector det = new nsDetector(nsPSMDetector.ALL);
      // The below is adapted from the main method in HtmlCharsetDetector.
      Observer observer = new Observer();
      det.Init(observer);
      do {
        buffered.write(buf, 0, len);
        if (isAscii) { isAscii = det.isAscii(buf, len); }
        if (!isAscii) {
          if (det.DoIt(buf, len, false)) { break; }
        }
      } while ((len = in.read(buf)) > 0);
      det.DataEnd();
      charset = observer.charset;
    }
    if (charset != null) { charset = supportedCharsetName(charset); }
    if (charset == null) { charset = UTF8; }
    return Pair.pair(
View Full Code Here

*/
public class CharsetDetectorJcd implements ICharsetDetector {

  public Collection<String> detectCharset(byte[] bytes) {
   
    nsDetector det = new nsDetector(nsDetector.ALL) ;

    final Collection<String> charsets = new LinkedHashSet<String>();
    det.Init(new nsICharsetDetectionObserver() {
            public void Notify(String charset) {
              charsets.add(charset);
            }
    });

    boolean isAscii = det.isAscii(bytes, bytes.length);
    if ( !isAscii && charsets.size() == 0 ) {
      det.DoIt(bytes, bytes.length, false);
    }
    det.DataEnd();

    if ( isAscii ) {
      charsets.add("ASCII");
    }
    else if ( charsets.size() == 0 ) {
      String[] pcs = det.getProbableCharsets();
      if ( pcs != null ) {
        charsets.addAll(Arrays.asList(pcs));
      }
    }
    return charsets;
View Full Code Here

    if (content != null && content.length != 0) {

      DetectorState state = new DetectorState();

      nsDetector detector = new nsDetector(nsPSMDetector.ALL);

      detector.Init(state);

      if (offset != 0) {
        byte[] contentCopy = new byte[Math.min(length, MAX_CHARS_TO_DETECT)];
        System.arraycopy(content,offset, contentCopy,0,length);
        content = contentCopy;
      }
     
      boolean isAscii = detector.isAscii(content, content.length);

      if (!isAscii) {
        isAscii = detector.DoIt(content, Math.min(content.length,
            MAX_CHARS_TO_DETECT), false);
      }
      detector.DataEnd();

      if (isAscii) {
        return "ASCII";
      } else if (state._detectedCharset != null) {
        return state._detectedCharset;
      } else {
        String prob[] = detector.getProbableCharsets();
        if (prob != null && prob.length != 0) {
          return prob[0];
        }
      }
    }
View Full Code Here

    private CharsetDecoder usedDecoder = null;
    private ByteBuffer remaining = null;

    protected Decoder() {
      super(JChardetCharset.this, 1.0f, 2.0f);
      detector = new nsDetector(languageFlag);
      detector.Init(this);
    }
View Full Code Here

      return result;
    }

    @Override
    protected void implReset() {
      detector = new nsDetector(languageFlag);
      buffer = new ByteArrayOutputStream();
      usedDecoder = null;
      remaining = null;
      isASCII = true;
    }
View Full Code Here

*
*/
public class CharsetUtils {
   
    public static String getCharset(byte[] bytes) {
        nsDetector det = new nsDetector(nsPSMDetector.ALL);
        CharsetListener listener = new CharsetListener();
        det.Init(listener);
       
        boolean isAscii = det.isAscii(bytes,bytes.length);
        // DoIt if non-ascii and not done yet.
        if (!isAscii)
            det.DoIt(bytes,bytes.length, false);
        det.DataEnd();
        if (isAscii) return "ASCII";
       
        return listener.getCharset();
    }
View Full Code Here

        // we could do things like make buttons visible and invisible here
    }

    private String getCharset(String contentType, byte[] bytes) {
        String[] charsets;
        nsDetector det = new nsDetector(nsPSMDetector.ALL);
       
        boolean isAscii = det.isAscii(bytes,bytes.length);
        // DoIt if non-ascii and not done yet.
        if (!isAscii)
            det.DoIt(bytes,bytes.length, false);
        charsets = det.getProbableCharsets();
        det.DataEnd();
       
        if (isAscii) return "ASCII";
        if (charsets.length == 0) return null;
        if (charsets.length == 1 && charsets[0].equals("nomatch")) return null;
       
View Full Code Here

TOP

Related Classes of org.mozilla.intl.chardet.nsDetector

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.