Examples of com.ibm.icu.text.CharsetDetector

com.ibm.icu.text.CharsetDetector
CharsetDetector provides a facility for detecting the charset or encoding of character data in an unknown format. The input data can either be from an input stream or an array of bytes. The result of the detection operation is a list of possibly matching charsets, or, for simple use, you can just ask for a Java Reader that will work over the input data.
Character set detection is at best an imprecise operation. The detection process will attempt to identify the charset that best matches the characteristics of the byte data, but the process is partly statistical in nature, and the results cannot be guaranteed to always be correct.
For best accuracy in charset detection, the input data should be primarily in a single language, and a minimum of a few hundred bytes' worth of plain text in that language is needed. The detection process will attempt to ignore HTML- or XML-style markup that could otherwise obscure the content.
@stable ICU 3.4

            }
        }


        // the author didn't tell us the encoding, try the mozilla-heuristic
        if (charset == null) {
          CharsetDetector det = new CharsetDetector();
          det.enableInputFilter(true);
          InputStream detStream = new BufferedInputStream(sourceStream);
          det.setText(detStream);
          charset = det.detect().getName();
          sourceStream = detStream;
        }
        
        // wtf? still nothing, just take system-standard
        if (charset == null) {

View Full Code Here

    String defaultCharset = Charset.defaultCharset().name();
    setSelectedItem(defaultCharset);
  }


  public String autoDetectEncoding(byte[] bytes) {
    CharsetDetector cd = new CharsetDetector();
    cd.setText(bytes);
    CharsetMatch charsetMatch = cd.detect();
    String charSet = charsetMatch.getName();


    int confidence = charsetMatch.getConfidence();
    logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence);
    setSelectedItem(charSet);

View Full Code Here

     * and content language ({@link HttpHeaders#CONTENT_LANGUAGE}).
     * 
     * @return Reader to utf8 encoded reader.
     */
    public static Reader getUTF8Reader(InputStream stream, Metadata metadata) throws TikaException, IOException{
        CharsetDetector detector = new CharsetDetector();
    
        // Use the declared character encoding, if available
        String encoding = metadata.get(Metadata.CONTENT_ENCODING);
        if (encoding != null) {
            detector.setDeclaredEncoding(encoding);
        }
    
        // CharsetDetector expects a stream to support marks
        if (!stream.markSupported()) {
            stream = new BufferedInputStream(stream);
        }
    
        detector.setText(stream);
    
        CharsetMatch match = detector.detect();
        if (match == null) {
            throw new TikaException("Unable to detect character encoding");
        }
        
        metadata.set(Metadata.CONTENT_ENCODING, match.getName());

View Full Code Here

                reader = ReaderFactory.newXmlReader( f );
                return ( (XmlStreamReader) reader ).getEncoding();
            }


            is = new BufferedInputStream( new FileInputStream( f ) );
            CharsetDetector detector = new CharsetDetector();
            detector.setText( is );
            CharsetMatch match = detector.detect();


            return match.getName().toUpperCase( Locale.ENGLISH );
        }
        catch ( IOException e )
        {

View Full Code Here

    if (assume88591IfNotUtf8) {
      return ISO_8859_1;
    }


    // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
    CharsetDetector detector = new CharsetDetector();
    detector.setText(input);
    CharsetMatch match = detector.detect();
    return Charset.forName(match.getName().toUpperCase());
  }

View Full Code Here


 
  public static class FallbackEncodingDetector {
    public Charset detectEncoding(byte[] input) {
      // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
      CharsetDetector detector = new CharsetDetector();
      detector.setText(input);
      CharsetMatch match = detector.detect();
      return Charset.forName(match.getName().toUpperCase());
    }

View Full Code Here

        if (!acceptsMimeType(response.getLastHeader("Content-Type"))) {
          return new RejectedMimeTypePage(url, status, response.getLastHeader("Content-Type").getValue());
        }


        if (Status.OK.equals(status)) {
          CharsetDetector detector = new CharsetDetector();
          detector.setText(read(response.getEntity().getContent()));
          CharsetMatch match = detector.detect();


          log.debug("Detected charset: " + match.getName());


          String content = match.getString();
          CharBuffer buffer = CharBuffer.wrap(content.toCharArray());

View Full Code Here

 */
public class CharsetDetectorIcu implements ICharsetDetector {


  public Collection<String> detectCharset(byte[] bytes) {
    
    CharsetDetector detector = new CharsetDetector();
    detector.setText(bytes);
    
    CharsetMatch[] matches = detector.detectAll();
    if ( matches == null || matches.length == 0 ) {
      return null;
    }
    
    Collection<String> charsets = new LinkedHashSet<String>();

View Full Code Here

        return filtered;
    }
    
    private CharsetMatch[] detect(byte[] bytes)
    {
        CharsetDetector det = new CharsetDetector();
        
        det.setText(bytes);
        
        return det.detectAll();
    }

View Full Code Here

        return det.detectAll();
    }
    
    private CharsetMatch[] detect(BufferedInputStream inputStream)
    {
        CharsetDetector det    = new CharsetDetector();
        
        try {
            det.setText(inputStream);
            
            return det.detectAll();
        } catch (Exception e) {
            // TODO: error message?
            return null;
        }
    }

View Full Code Here

0 1 2 3 4

TOP

Related Classes of com.ibm.icu.text.CharsetDetector

com.ibm.icu.dev.demo.charsetdet.DetectingViewer

com.ibm.icu.dev.test.charsetdet.TestCharsetDetector

com.nardoz.restopengov.utils.CSVFetcher

fr.eolya.utils.http.HttpStream

fr.eolya.utils.http.WebStream

net.geco.basics.GecoResources

net.sf.jmatchparser.util.charset.icu4jchardet.ICU4JChardetCharset$Decoder

net.vidageek.crawler.component.WebDownloader

net.yacy.document.parser.htmlParser

nu.validator.htmlparser.extra.IcuDetectorSniffer

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.