Package org.apache.tika.parser.txt

Examples of org.apache.tika.parser.txt.CharsetMatch


    public String guessEncoding(InputStream is) throws IOException {
        CharsetDetector charsetDetector = new CharsetDetector();
        charsetDetector.setText( is instanceof BufferedInputStream ? is : new BufferedInputStream(is) );
        charsetDetector.enableInputFilter(true);
        CharsetMatch cm = charsetDetector.detect();
        return cm.getName();
    }
View Full Code Here


    public String guessEncoding(InputStream is) throws IOException {
        CharsetDetector charsetDetector = new CharsetDetector();
        charsetDetector.setText( is instanceof BufferedInputStream ? is : new BufferedInputStream(is) );
        charsetDetector.enableInputFilter(true);
        CharsetMatch cm = charsetDetector.detect();
        return cm.getName();
    }
View Full Code Here

                 //  on the message body
                 StringChunk text = msg.getMainChunks().textBodyChunk;
                 if(text != null) {
                    CharsetDetector detector = new CharsetDetector();
                    detector.setText( text.getRawValue() );
                    CharsetMatch match = detector.detect();
                    if(match.getConfidence() > 35) {
                       msg.set7BitEncoding( match.getName() );
                    }
                 }
              }
           }
          
View Full Code Here

                 //  on the message body
                 StringChunk text = msg.getMainChunks().textBodyChunk;
                 if(text != null) {
                    CharsetDetector detector = new CharsetDetector();
                    detector.setText( text.getRawValue() );
                    CharsetMatch match = detector.detect();
                    if(match.getConfidence() > 35) {
                       msg.set7BitEncoding( match.getName() );
                    }
                 }
              }
           }
          
View Full Code Here

        if (!"text/html".equals(mime_type)) {
          return;
        }
                              
        // try to determine character encoding for content
        CharsetMatch charset = new CharsetDetector().setText(item.getContent().getBytes()).detect();               
       
        // fetch http response as decoded by CharsetDetector
        String decodedHttpResponse;       
        try {
          decodedHttpResponse = charset.getString();
        }
        catch (NullPointerException e) {
          // unexplainable cases of CharsetMatch throwing null pointer for what looks to be sane text ?
          // just use string as best bet
          decodedHttpResponse = new String(item.getContent().getReadOnlyBytes());
View Full Code Here

        int headerLength = locateEndOfHeader(v.getBytes());
                              
        // try to determine character encoding for rest of response   
        // ( 4 bytes for the header/body seperator 0d 0a 0d 0a
        InputStream responseBytes = new ByteArrayInputStream(v.getBytes(), headerLength + 4, v.getLength() - headerLength - 4);
        CharsetMatch charset = new CharsetDetector().setText(responseBytes).detect();               
       
        // fetch http response as decoded by CharsetDetector
        String decodedHttpResponse;       
        try {
          decodedHttpResponse = charset.getString();
        }
        catch (NullPointerException e) {
          // unexplainable cases of CharsetMatch throwing null pointer for what looks to be sane text ?
          // just use string as best bet
          decodedHttpResponse = new String(v.getBytes(), headerLength + 4, v.getLength() - headerLength - 4);
View Full Code Here

                 //  on the message body
                 StringChunk text = msg.getMainChunks().textBodyChunk;
                 if(text != null) {
                    CharsetDetector detector = new CharsetDetector();
                    detector.setText( text.getRawValue() );
                    CharsetMatch match = detector.detect();
                    if(match.getConfidence() > 35) {
                       msg.set7BitEncoding( match.getName() );
                    }
                 }
              }
           }
          
View Full Code Here

                 //  on the message body
                 StringChunk text = msg.getMainChunks().textBodyChunk;
                 if(text != null) {
                    CharsetDetector detector = new CharsetDetector();
                    detector.setText( text.getRawValue() );
                    CharsetMatch match = detector.detect();
                    if(match.getConfidence() > 35) {
                       msg.set7BitEncoding( match.getName() );
                    }
                 }
              }
           }
          
View Full Code Here

                 //  on the message body
                 StringChunk text = msg.getMainChunks().textBodyChunk;
                 if(text != null) {
                    CharsetDetector detector = new CharsetDetector();
                    detector.setText( text.getRawValue() );
                    CharsetMatch match = detector.detect();
                    if(match.getConfidence() > 35) {
                       msg.set7BitEncoding( match.getName() );
                    }
                 }
              }
           }
          
View Full Code Here

                 //  on the message body
                 StringChunk text = msg.getMainChunks().textBodyChunk;
                 if(text != null) {
                    CharsetDetector detector = new CharsetDetector();
                    detector.setText( text.getRawValue() );
                    CharsetMatch match = detector.detect();
                    if(match.getConfidence() > 35) {
                       msg.set7BitEncoding( match.getName() );
                    }
                 }
              }
           }
          
View Full Code Here

TOP

Related Classes of org.apache.tika.parser.txt.CharsetMatch

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.