Source Code of org.archive.wayback.replay.charset.CharsetDetector

/*
 *  This file is part of the Wayback archival access software
 *   (http://archive-access.sourceforge.net/projects/wayback/).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.wayback.replay.charset;


import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.util.Iterator;
import java.util.Map;


import org.archive.wayback.core.Resource;
import org.archive.wayback.core.WaybackRequest;
import org.archive.wayback.replay.TagMagix;
import org.mozilla.universalchardet.UniversalDetector;


/**
 * Abstract class containing common methods for determining the character 
 * encoding of a text Resource, most of which should be refactored into a
 * Util package.
 * @author brad
 *
 */
public abstract class CharsetDetector {
  // hand off this many bytes to the chardet library
  protected final static int MAX_CHARSET_READAHEAD = 65536;
  // ...if it also includes "charset="
  protected final static String CHARSET_TOKEN = "charset=";
  // ...and if the chardet library fails, use the Content-Type header
  protected final static String HTTP_CONTENT_TYPE_HEADER = "Content-Type";
  /** the default charset name to use when giving up */
  public final static String DEFAULT_CHARSET = "UTF-8";


  protected boolean isCharsetSupported(String charsetName) {
    // can you believe that this throws a runtime? Just asking if it's
    // supported!!?! They coulda just said "no"...
    if(charsetName == null) {
      return false;
    }
    try {
      return Charset.isSupported(charsetName);
    } catch(IllegalCharsetNameException e) {
      return false;
    }
  }
  protected String mapCharset(String orig) {
    String lc = orig.toLowerCase();
    if(lc.contains("iso8859-1") || lc.contains("iso-8859-1")) {
      return "cp1252";
    }
    if(lc.contains("unicode")) {
      return DEFAULT_CHARSET;
    }
    return orig;
  }
  protected String contentTypeToCharset(final String contentType) {
    int offset = 
        contentType.toUpperCase().indexOf(CHARSET_TOKEN.toUpperCase());


    if (offset != -1) {
      String cs = contentType.substring(offset + CHARSET_TOKEN.length());
      if(isCharsetSupported(cs)) {
        return mapCharset(cs);
      }
      // test for extra spaces... there's at least one page out there that
      // indicates it's charset with:


      //  <meta http-equiv="Content-type" content="text/html; charset=i so-8859-1">


      // bad web page!
      if(isCharsetSupported(cs.replace(" ", ""))) {
        return mapCharset(cs.replace(" ", ""));
      }
    }
    return null;
  }


  /**
   * Attempt to divine the character encoding of the document from the 
   * Content-Type HTTP header (with a "charset=")
   * 
   * @param resource
   * @return String character set found or null if the header was not present
   * @throws IOException 
   */
  protected String getCharsetFromHeaders(Resource resource) 
      throws IOException {


    String charsetName = null;


    Map<String,String> httpHeaders = resource.getHttpHeaders();
    Iterator<String> keys = httpHeaders.keySet().iterator();
    String ctype = null;
    while(keys.hasNext()) {
      String headerKey = keys.next();
      String keyCmp = headerKey.toUpperCase().trim();
      if(keyCmp.equals(HTTP_CONTENT_TYPE_HEADER.toUpperCase())) {
        ctype = httpHeaders.get(headerKey);
        break;
      }
    }
    if (ctype != null) {
      charsetName = contentTypeToCharset(ctype);
    }
    return charsetName;
  }


  /**
   * Attempt to find a META tag in the HTML that hints at the character set
   * used to write the document.
   * 
   * @param resource
   * @return String character set found from META tags in the HTML
   * @throws IOException
   */
  protected String getCharsetFromMeta(InputStream resource) throws IOException {
    String charsetName = null;


    byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD];
    resource.mark(MAX_CHARSET_READAHEAD);
    resource.read(bbuffer, 0, MAX_CHARSET_READAHEAD);
    resource.reset();
    // convert to UTF-8 String -- which hopefully will not mess up the
    // characters we're interested in...
    StringBuilder sb = new StringBuilder(new String(bbuffer,DEFAULT_CHARSET));
    String metaContentType = TagMagix.getTagAttrWhere(sb, "META",
        "content", "http-equiv", "Content-Type");
    if(metaContentType != null) {
      charsetName = contentTypeToCharset(metaContentType);
    }
    return charsetName;
  }


  /**
   * Attempts to figure out the character set of the document using
   * the excellent juniversalchardet library.
   * 
   * @param resource
   * @return String character encoding found, or null if nothing looked good.
   * @throws IOException
   */
  protected String getCharsetFromBytes(InputStream resource) throws IOException {
    String charsetName;


    byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD];
    // (1)
    UniversalDetector detector = new UniversalDetector(null);


    // (2)
    resource.mark(MAX_CHARSET_READAHEAD);
    int len = resource.read(bbuffer, 0, MAX_CHARSET_READAHEAD);
    resource.reset();
    detector.handleData(bbuffer, 0, len);
    // (3)
    detector.dataEnd();
    // (4)
    charsetName = detector.getDetectedCharset();


    // (5)
    detector.reset();
    if(isCharsetSupported(charsetName)) {
      return charsetName;
    }
    return null;
  }


  public String getCharset(Resource resource, WaybackRequest request)
      throws IOException {
    return getCharset(resource, resource, request);
  }


  /**
   * @param httpHeadersResource resource with http headers to consider 
   * @param payloadResource resource with payload to consider (presumably text)
         * @param wbRequest WaybackRequest which may contain additional hints to processing
   * @return String charset name for the Resource
   * @throws IOException if there are problems reading the Resource
   */
  public abstract String getCharset(Resource httpHeadersResource,
      Resource payloadResource, WaybackRequest wbRequest)
          throws IOException;
}
Source Code of org.archive.wayback.replay.charset.CharsetDetector

Related Classes of org.archive.wayback.replay.charset.CharsetDetector