Package org.archive.wayback.replay.charset

Source Code of org.archive.wayback.replay.charset.CharsetDetector

/*
*  This file is part of the Wayback archival access software
*   (http://archive-access.sourceforge.net/projects/wayback/).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/
package org.archive.wayback.replay.charset;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;

import org.archive.wayback.core.Resource;
import org.archive.wayback.core.WaybackRequest;
import org.archive.wayback.replay.TagMagix;
import org.mozilla.universalchardet.UniversalDetector;

/**
* Abstract class containing common methods for determining the character
* encoding of a text Resource, most of which should be refactored into a
* Util package.
* @author brad
*
*/
public abstract class CharsetDetector {
  // hand off this many bytes to the chardet library
  protected final static int MAX_CHARSET_READAHEAD = 65536;
  // ...if it also includes "charset="
  protected final static String CHARSET_TOKEN = "charset=";
  // ...and if the chardet library fails, use the Content-Type header
  protected final static String HTTP_CONTENT_TYPE_HEADER = "Content-Type";
  /** the default charset name to use when giving up */
  public final static String DEFAULT_CHARSET = "UTF-8";

  protected boolean isCharsetSupported(String charsetName) {
    // can you believe that this throws a runtime? Just asking if it's
    // supported!!?! They coulda just said "no"...
    if(charsetName == null) {
      return false;
    }
    try {
      return Charset.isSupported(charsetName);
    } catch(IllegalCharsetNameException e) {
      return false;
    }
  }
  protected String mapCharset(String orig) {
    String lc = orig.toLowerCase();
    if(lc.contains("iso8859-1") || lc.contains("iso-8859-1")) {
      return "cp1252";
    }
    if(lc.contains("unicode")) {
      return DEFAULT_CHARSET;
    }
    return orig;
  }
  protected String contentTypeToCharset(final String contentType) {
    int offset =
        contentType.toUpperCase().indexOf(CHARSET_TOKEN.toUpperCase());

    if (offset != -1) {
      String cs = contentType.substring(offset + CHARSET_TOKEN.length());
      if(isCharsetSupported(cs)) {
        return mapCharset(cs);
      }
      // test for extra spaces... there's at least one page out there that
      // indicates it's charset with:

      //  <meta http-equiv="Content-type" content="text/html; charset=i so-8859-1">

      // bad web page!
      if(isCharsetSupported(cs.replace(" ", ""))) {
        return mapCharset(cs.replace(" ", ""));
      }
    }
    return null;
  }

  /**
   * Attempt to divine the character encoding of the document from the
   * Content-Type HTTP header (with a "charset=")
   *
   * @param resource
   * @return String character set found or null if the header was not present
   * @throws IOException
   */
  protected String getCharsetFromHeaders(Resource resource)
      throws IOException {

    String charsetName = null;

    Map<String,String> httpHeaders = resource.getHttpHeaders();
    Iterator<String> keys = httpHeaders.keySet().iterator();
    String ctype = null;
    while(keys.hasNext()) {
      String headerKey = keys.next();
      String keyCmp = headerKey.toUpperCase().trim();
      if(keyCmp.equals(HTTP_CONTENT_TYPE_HEADER.toUpperCase())) {
        ctype = httpHeaders.get(headerKey);
        break;
      }
    }
    if (ctype != null) {
      charsetName = contentTypeToCharset(ctype);
    }
    return charsetName;
  }

  /**
   * Attempt to find a META tag in the HTML that hints at the character set
   * used to write the document.
   *
   * @param resource
   * @return String character set found from META tags in the HTML
   * @throws IOException
   */
  protected String getCharsetFromMeta(InputStream resource) throws IOException {
    String charsetName = null;

    byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD];
    resource.mark(MAX_CHARSET_READAHEAD);
    resource.read(bbuffer, 0, MAX_CHARSET_READAHEAD);
    resource.reset();
    // convert to UTF-8 String -- which hopefully will not mess up the
    // characters we're interested in...
    StringBuilder sb = new StringBuilder(new String(bbuffer,DEFAULT_CHARSET));
    String metaContentType = TagMagix.getTagAttrWhere(sb, "META",
        "content", "http-equiv", "Content-Type");
    if(metaContentType != null) {
      charsetName = contentTypeToCharset(metaContentType);
    }
    return charsetName;
  }

  /**
   * Attempts to figure out the character set of the document using
   * the excellent juniversalchardet library.
   *
   * @param resource
   * @return String character encoding found, or null if nothing looked good.
   * @throws IOException
   */
  protected String getCharsetFromBytes(InputStream resource) throws IOException {
    String charsetName;

    byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD];
    // (1)
    UniversalDetector detector = new UniversalDetector(null);

    // (2)
    resource.mark(MAX_CHARSET_READAHEAD);
    int len = resource.read(bbuffer, 0, MAX_CHARSET_READAHEAD);
    resource.reset();
    detector.handleData(bbuffer, 0, len);
    // (3)
    detector.dataEnd();
    // (4)
    charsetName = detector.getDetectedCharset();

    // (5)
    detector.reset();
    if(isCharsetSupported(charsetName)) {
      return charsetName;
    }
    return null;
  }

  public String getCharset(Resource resource, WaybackRequest request)
      throws IOException {
    return getCharset(resource, resource, request);
  }

  /**
   * @param httpHeadersResource resource with http headers to consider
   * @param payloadResource resource with payload to consider (presumably text)
         * @param wbRequest WaybackRequest which may contain additional hints to processing
   * @return String charset name for the Resource
   * @throws IOException if there are problems reading the Resource
   */
  public abstract String getCharset(Resource httpHeadersResource,
      Resource payloadResource, WaybackRequest wbRequest)
          throws IOException;
}
TOP

Related Classes of org.archive.wayback.replay.charset.CharsetDetector

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.