Package net.yacy.document.parser.html

Examples of net.yacy.document.parser.html.ScraperInputStream


                            // detect charset of html-files
                            if((path.endsWith("html") || path.endsWith("htm"))) {
                                // save position
                                fis.mark(1000);
                                // scrape document to look up charset
                                final ScraperInputStream htmlFilter = new ScraperInputStream(fis,"UTF-8",new DigestURI("http://localhost"),null,false);
                                final String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
                                if(charset != null)
                                    mimeType = mimeType + "; charset="+charset;
                                // reset position
                                fis.reset();
                            }
View Full Code Here


        }
       
        // nothing found: try to find a meta-tag
        if (charset == null) {
            try {
                final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream,documentCharset,location,null,false);
                sourceStream = htmlFilter;
                charset = htmlFilter.detectCharset();
            } catch (IOException e1) {
                throw new Parser.Failure("Charset error:" + e1.getMessage(), location);
            }
        }
View Full Code Here

                            // detect charset of html-files
                            if((path.endsWith("html") || path.endsWith("htm"))) {
                                // save position
                                fis.mark(1000);
                                // scrape document to look up charset
                                final ScraperInputStream htmlFilter = new ScraperInputStream(fis,"UTF-8",new DigestURI("http://localhost"),null,false);
                                final String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
                                if(charset != null)
                                    mimeType = mimeType + "; charset="+charset;
                                // reset position
                                fis.reset();
                            }
View Full Code Here

        }

        // nothing found: try to find a meta-tag
        if (charset == null) {
            try {
                final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream,documentCharset,location,null,false);
                sourceStream = htmlFilter;
                charset = htmlFilter.detectCharset();
            } catch (final IOException e1) {
                throw new Parser.Failure("Charset error:" + e1.getMessage(), location);
            }
        }
View Full Code Here

TOP

Related Classes of net.yacy.document.parser.html.ScraperInputStream

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.