Examples of CharsetMatch


Examples of com.ibm.icu.text.CharsetMatch

      if (usedDecoder == null) {
        CharsetDetector detector = new CharsetDetector();
        detector.enableInputFilter(filtered);
        byte[] data = buffer.toByteArray();
        detector.setText(data);
        CharsetMatch cm = detector.detect();
        try {
          usedDecoder = Charset.forName(cm == null ? "ISO-8859-1" : cm.getName()).newDecoder();
        } catch (UnsupportedCharsetException ex) {
          usedDecoder = Charset.forName("ISO-8859-1").newDecoder();
        }
        usedDecoder.onUnmappableCharacter(unmappableCharacterAction());
        usedDecoder.onMalformedInput(malformedInputAction());
View Full Code Here

Examples of com.ibm.icu.text.CharsetMatch

    }

    public static Reader readerWithCharsetDetect(InputStream is) {
        CharsetDetector detector = new CharsetDetector();
        try {
            CharsetMatch match = detector.setText(is).detect();
            is.reset();
            return new InputStreamReader(is, match.getName());
        } catch (IOException e) {
            e.printStackTrace();
            try {
                is.reset();
            } catch (IOException e1) {
View Full Code Here

Examples of com.ibm.icu.text.CharsetMatch

   
    public Encoding sniff() throws IOException {
        try {
            CharsetDetector detector = new CharsetDetector();
            detector.setText(this);
            CharsetMatch match = detector.detect();
            Encoding enc = Encoding.forName(match.getName());
            Encoding actual = enc.getActualHtmlEncoding();
            if (actual != null) {
                enc = actual;
            }
            if (enc != Encoding.WINDOWS1252 && enc.isAsciiSuperset()) {
View Full Code Here

Examples of org.apache.tika.parser.txt.CharsetMatch

    public String guessEncoding(InputStream is) throws IOException {
        CharsetDetector charsetDetector = new CharsetDetector();
        charsetDetector.setText( is instanceof BufferedInputStream ? is : new BufferedInputStream(is) );
        charsetDetector.enableInputFilter(true);
        CharsetMatch cm = charsetDetector.detect();
        return cm.getName();
    }
View Full Code Here

Examples of org.apache.tika.parser.txt.CharsetMatch

    public String guessEncoding(InputStream is) throws IOException {
        CharsetDetector charsetDetector = new CharsetDetector();
        charsetDetector.setText( is instanceof BufferedInputStream ? is : new BufferedInputStream(is) );
        charsetDetector.enableInputFilter(true);
        CharsetMatch cm = charsetDetector.detect();
        return cm.getName();
    }
View Full Code Here

Examples of org.apache.tika.parser.txt.CharsetMatch

                 //  on the message body
                 StringChunk text = msg.getMainChunks().textBodyChunk;
                 if(text != null) {
                    CharsetDetector detector = new CharsetDetector();
                    detector.setText( text.getRawValue() );
                    CharsetMatch match = detector.detect();
                    if(match.getConfidence() > 35) {
                       msg.set7BitEncoding( match.getName() );
                    }
                 }
              }
           }
          
View Full Code Here

Examples of org.apache.tika.parser.txt.CharsetMatch

                 //  on the message body
                 StringChunk text = msg.getMainChunks().textBodyChunk;
                 if(text != null) {
                    CharsetDetector detector = new CharsetDetector();
                    detector.setText( text.getRawValue() );
                    CharsetMatch match = detector.detect();
                    if(match.getConfidence() > 35) {
                       msg.set7BitEncoding( match.getName() );
                    }
                 }
              }
           }
          
View Full Code Here

Examples of org.apache.tika.parser.txt.CharsetMatch

        if (!"text/html".equals(mime_type)) {
          return;
        }
                              
        // try to determine character encoding for content
        CharsetMatch charset = new CharsetDetector().setText(item.getContent().getBytes()).detect();               
       
        // fetch http response as decoded by CharsetDetector
        String decodedHttpResponse;       
        try {
          decodedHttpResponse = charset.getString();
        }
        catch (NullPointerException e) {
          // unexplainable cases of CharsetMatch throwing null pointer for what looks to be sane text ?
          // just use string as best bet
          decodedHttpResponse = new String(item.getContent().getReadOnlyBytes());
View Full Code Here

Examples of org.apache.tika.parser.txt.CharsetMatch

        int headerLength = locateEndOfHeader(v.getBytes());
                              
        // try to determine character encoding for rest of response   
        // (4 bytes for the header/body separator: 0d 0a 0d 0a)
        InputStream responseBytes = new ByteArrayInputStream(v.getBytes(), headerLength + 4, v.getLength() - headerLength - 4);
        CharsetMatch charset = new CharsetDetector().setText(responseBytes).detect();               
       
        // fetch http response as decoded by CharsetDetector
        String decodedHttpResponse;       
        try {
          decodedHttpResponse = charset.getString();
        }
        catch (NullPointerException e) {
          // unexplainable cases of CharsetMatch throwing null pointer for what looks to be sane text ?
          // just use string as best bet
          decodedHttpResponse = new String(v.getBytes(), headerLength + 4, v.getLength() - headerLength - 4);
View Full Code Here

Examples of org.apache.tika.parser.txt.CharsetMatch

                 //  on the message body
                 StringChunk text = msg.getMainChunks().textBodyChunk;
                 if(text != null) {
                    CharsetDetector detector = new CharsetDetector();
                    detector.setText( text.getRawValue() );
                    CharsetMatch match = detector.detect();
                    if(match.getConfidence() > 35) {
                       msg.set7BitEncoding( match.getName() );
                    }
                 }
              }
           }
          
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.