Examples of org.apache.tika.parser.txt.CharsetDetector

org.apache.tika.parser.txt.CharsetDetector
CharsetDetector provides a facility for detecting the charset or encoding of character data in an unknown format. The input data can either be from an input stream or an array of bytes. The result of the detection operation is a list of possibly matching charsets, or, for simple use, you can just ask for a Java Reader that will will work over the input data.
Character set detection is at best an imprecise operation. The detection process will attempt to identify the charset that best matches the characteristics of the byte data, but the process is partly statistical in nature, and the results can not be guaranteed to always be correct.
For best accuracy in charset detection, the input data should be primarily in a single language, and a minimum of a few hundred bytes worth of plain text in the language are needed. The detection process will attempt to ignore html or xml style markup that could otherwise obscure the content.
@stable ICU 3.4

 * @version $Id$
 */
public class TikaEncodingDetector implements EncodingDetector {


    public String guessEncoding(InputStream is) throws IOException {
        CharsetDetector charsetDetector = new CharsetDetector();
        charsetDetector.setText( is instanceof BufferedInputStream ? is : new BufferedInputStream(is) );
        charsetDetector.enableInputFilter(true);
        CharsetMatch cm = charsetDetector.detect();
        return cm.getName();
    }

View Full Code Here

            }
        }


        // No charset in a meta http-equiv tag, see if it's in the passed content-encoding
        // hint, or the passed content-type hint.
        CharsetDetector detector = new CharsetDetector();
        String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
        String incomingType = metadata.get(Metadata.CONTENT_TYPE);
        if (incomingCharset == null && incomingType != null) {
            // TIKA-341: Use charset in content-type
            MediaType mt = MediaType.parse(incomingType);
            if (mt != null) {
                String charset = mt.getParameters().get("charset");
                if ((charset != null) && Charset.isSupported(charset)) {
                    incomingCharset = charset;
                }
            }
        }


        if (incomingCharset != null) {
            detector.setDeclaredEncoding(incomingCharset);
        }


        // TIKA-341 without enabling input filtering (stripping of tags) the
        // short HTML tests don't work well.
        detector.enableInputFilter(true);
        detector.setText(stream);
        for (CharsetMatch match : detector.detectAll()) {
            if (Charset.isSupported(match.getName())) {
                metadata.set(Metadata.CONTENT_ENCODING, match.getName());


                // TIKA-339: Don't set language, as it's typically not a very good
                // guess, and it can create ambiguity if another (better) language

View Full Code Here

            }
        }


        // No charset in a meta http-equiv tag, see if it's in the passed content-encoding
        // hint, or the passed content-type hint.
        CharsetDetector detector = new CharsetDetector();
        String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
        String incomingType = metadata.get(Metadata.CONTENT_TYPE);
        if (incomingCharset == null && incomingType != null) {
            // TIKA-341: Use charset in content-type
            MediaType mt = MediaType.parse(incomingType);
            if (mt != null) {
                String charset = mt.getParameters().get("charset");
                if ((charset != null) && Charset.isSupported(charset)) {
                    incomingCharset = charset;
                }
            }
        }


        if (incomingCharset != null) {
            detector.setDeclaredEncoding(incomingCharset);
        }


        // TIKA-341 without enabling input filtering (stripping of tags) the
        // short HTML tests don't work well.
        detector.enableInputFilter(true);
        detector.setText(stream);
        for (CharsetMatch match : detector.detectAll()) {
            if (Charset.isSupported(match.getName())) {
                metadata.set(Metadata.CONTENT_ENCODING, match.getName());


                // TIKA-339: Don't set language, as it's typically not a very good
                // guess, and it can create ambiguity if another (better) language

View Full Code Here

 * @version $Id: TikaEncodingDetector.java 1231556 2012-01-14 18:47:58Z mostarda $
 */
public class TikaEncodingDetector implements EncodingDetector {


    public String guessEncoding(InputStream is) throws IOException {
        CharsetDetector charsetDetector = new CharsetDetector();
        charsetDetector.setText( is instanceof BufferedInputStream ? is : new BufferedInputStream(is) );
        charsetDetector.enableInputFilter(true);
        CharsetMatch cm = charsetDetector.detect();
        return cm.getName();
    }

View Full Code Here

            }
        }


        // No (valid) charset in a meta http-equiv tag, see if it's in the passed content-encoding
        // hint, or the passed content-type hint.
        CharsetDetector detector = new CharsetDetector();
        String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
        String incomingType = metadata.get(Metadata.CONTENT_TYPE);
        if (incomingCharset == null && incomingType != null) {
            // TIKA-341: Use charset in content-type
            MediaType mt = MediaType.parse(incomingType);
            if (mt != null) {
                String charset = mt.getParameters().get("charset");
                if ((charset != null) && Charset.isSupported(charset)) {
                    incomingCharset = charset;
                }
            }
        }


        if (incomingCharset != null) {
            detector.setDeclaredEncoding(incomingCharset);
        }


        // TIKA-341 without enabling input filtering (stripping of tags) the
        // short HTML tests don't work well.
        detector.enableInputFilter(true);
        detector.setText(stream);
        for (CharsetMatch match : detector.detectAll()) {
            if (Charset.isSupported(match.getName())) {
                metadata.set(Metadata.CONTENT_ENCODING, match.getName());


                // TIKA-339: Don't set language, as it's typically not a very good
                // guess, and it can create ambiguity if another (better) language

View Full Code Here

              } else {
                 // Nothing in the header, try encoding detection
                 //  on the message body
                 StringChunk text = msg.getMainChunks().textBodyChunk; 
                 if(text != null) {
                    CharsetDetector detector = new CharsetDetector();
                    detector.setText( text.getRawValue() );
                    CharsetMatch match = detector.detect();
                    if(match.getConfidence() > 35) {
                       msg.set7BitEncoding( match.getName() );
                    }
                 }
              }

View Full Code Here

              } else {
                 // Nothing in the header, try encoding detection
                 //  on the message body
                 StringChunk text = msg.getMainChunks().textBodyChunk; 
                 if(text != null) {
                    CharsetDetector detector = new CharsetDetector();
                    detector.setText( text.getRawValue() );
                    CharsetMatch match = detector.detect();
                    if(match.getConfidence() > 35) {
                       msg.set7BitEncoding( match.getName() );
                    }
                 }
              }

View Full Code Here

            }
        }


        // No (valid) charset in a meta http-equiv tag, see if it's in the passed content-encoding
        // hint, or the passed content-type hint.
        CharsetDetector detector = new CharsetDetector();
        String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
        String incomingType = metadata.get(Metadata.CONTENT_TYPE);
        if (incomingCharset == null && incomingType != null) {
            // TIKA-341: Use charset in content-type
            MediaType mt = MediaType.parse(incomingType);
            if (mt != null) {
                String charset = mt.getParameters().get("charset");
                if ((charset != null) && Charset.isSupported(charset)) {
                    incomingCharset = charset;
                }
            }
        }


        if (incomingCharset != null) {
            detector.setDeclaredEncoding(incomingCharset);
        }


        // TIKA-341 without enabling input filtering (stripping of tags) the
        // short HTML tests don't work well.
        detector.enableInputFilter(true);
        detector.setText(stream);
        for (CharsetMatch match : detector.detectAll()) {
            if (Charset.isSupported(match.getName())) {
                metadata.set(Metadata.CONTENT_ENCODING, match.getName());


                // TIKA-339: Don't set language, as it's typically not a very good
                // guess, and it can create ambiguity if another (better) language

View Full Code Here

        if (!"text/html".equals(mime_type)) {
          return;
        }
                               
        // try to determine character encoding for content
        CharsetMatch charset = new CharsetDetector().setText(item.getContent().getBytes()).detect();                
        
        // fetch http response as decoded by CharsetDetector 
        String decodedHttpResponse;        
        try {
          decodedHttpResponse = charset.getString();

View Full Code Here

        int headerLength = locateEndOfHeader(v.getBytes());
                               
        // try to determine character encoding for rest of response    
        // ( 4 bytes for the header/body seperator 0d 0a 0d 0a
        InputStream responseBytes = new ByteArrayInputStream(v.getBytes(), headerLength + 4, v.getLength() - headerLength - 4);
        CharsetMatch charset = new CharsetDetector().setText(responseBytes).detect();                
        
        // fetch http response as decoded by CharsetDetector 
        String decodedHttpResponse;        
        try {
          decodedHttpResponse = charset.getString();

View Full Code Here

0 1

TOP

Related Classes of org.apache.tika.parser.txt.CharsetDetector

cc.FilterTextHtml$FilterTextHtmlMapper

cc.FilterTextHtml2$FilterTextHtmlMapper

org.apache.any23.encoding.TikaEncodingDetector

org.apache.tika.parser.html.HtmlParser

org.apache.tika.parser.microsoft.OutlookExtractor

org.sleuthkit.autopsy.thunderbirdparser.MboxParser

java.util.ArrayList

java.nio.charset.Charset

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.