Package org.apache.tika.parser.txt

Examples of org.apache.tika.parser.txt.CharsetDetector


              } else {
                 // Nothing in the header, try encoding detection
                 //  on the message body
                 StringChunk text = msg.getMainChunks().textBodyChunk;
                 if(text != null) {
                    CharsetDetector detector = new CharsetDetector();
                    detector.setText( text.getRawValue() );
                    CharsetMatch match = detector.detect();
                    if(match.getConfidence() > 35) {
                       msg.set7BitEncoding( match.getName() );
                    }
                 }
              }
View Full Code Here


              } else {
                 // Nothing in the header, try encoding detection
                 //  on the message body
                 StringChunk text = msg.getMainChunks().textBodyChunk;
                 if(text != null) {
                    CharsetDetector detector = new CharsetDetector();
                    detector.setText( text.getRawValue() );
                    CharsetMatch match = detector.detect();
                    if(match.getConfidence() > 35) {
                       msg.set7BitEncoding( match.getName() );
                    }
                 }
              }
View Full Code Here

              } else {
                 // Nothing in the header, try encoding detection
                 //  on the message body
                 StringChunk text = msg.getMainChunks().textBodyChunk;
                 if(text != null) {
                    CharsetDetector detector = new CharsetDetector();
                    detector.setText( text.getRawValue() );
                    CharsetMatch match = detector.detect();
                    if(match.getConfidence() > 35) {
                       msg.set7BitEncoding( match.getName() );
                    }
                 }
              }
View Full Code Here

            }
        }

        // No (valid) charset in a meta http-equiv tag, see if it's in the passed content-encoding
        // hint, or the passed content-type hint.
        CharsetDetector detector = new CharsetDetector();
        String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
        String incomingType = metadata.get(Metadata.CONTENT_TYPE);
        if (incomingCharset == null && incomingType != null) {
            // TIKA-341: Use charset in content-type
            MediaType mt = MediaType.parse(incomingType);
            if (mt != null) {
                String charset = mt.getParameters().get("charset");
                if ((charset != null) && Charset.isSupported(charset)) {
                    incomingCharset = charset;
                }
            }
        }

        if (incomingCharset != null) {
            detector.setDeclaredEncoding(incomingCharset);
        }

        // TIKA-341 without enabling input filtering (stripping of tags) the
        // short HTML tests don't work well.
        detector.enableInputFilter(true);
        detector.setText(stream);
        for (CharsetMatch match : detector.detectAll()) {
            if (Charset.isSupported(match.getName())) {
                metadata.set(Metadata.CONTENT_ENCODING, match.getName());

                // TIKA-339: Don't set language, as it's typically not a very good
                // guess, and it can create ambiguity if another (better) language
View Full Code Here

              } else {
                 // Nothing in the header, try encoding detection
                 //  on the message body
                 StringChunk text = msg.getMainChunks().textBodyChunk;
                 if(text != null) {
                    CharsetDetector detector = new CharsetDetector();
                    detector.setText( text.getRawValue() );
                    CharsetMatch match = detector.detect();
                    if(match.getConfidence() > 35) {
                       msg.set7BitEncoding( match.getName() );
                    }
                 }
              }
View Full Code Here

              } else {
                 // Nothing in the header, try encoding detection
                 //  on the message body
                 StringChunk text = msg.getMainChunks().textBodyChunk;
                 if(text != null) {
                    CharsetDetector detector = new CharsetDetector();
                    detector.setText( text.getRawValue() );
                    CharsetMatch match = detector.detect();
                    if(match.getConfidence() > 35) {
                       msg.set7BitEncoding( match.getName() );
                    }
                 }
              }
View Full Code Here

            logger.log(Level.WARNING, "Failed to find mbox file while detecting charset"); //NON-NLS
            return possibleEncoders;
        }
       
        try {
            CharsetDetector detector = new CharsetDetector();
            detector.setText(is);
            CharsetMatch[] matches = detector.detectAll();
            for (CharsetMatch match : matches) {
                try {
                    possibleEncoders.add(Charset.forName(match.getName()).newEncoder());
                } catch (UnsupportedCharsetException | IllegalCharsetNameException ex) {
                    // Don't add unsupported charsets to the list
View Full Code Here

TOP

Related Classes of org.apache.tika.parser.txt.CharsetDetector

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.