// Source Code of org.eclipse.jst.jsp.core.internal.contenttype.JSPResourceEncodingDetector

/*******************************************************************************
 * Copyright (c) 2004, 2009 IBM Corporation and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     IBM Corporation - initial API and implementation
 *******************************************************************************/
package org.eclipse.jst.jsp.core.internal.contenttype;


import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.regex.Pattern;


import org.eclipse.core.resources.IStorage;
import org.eclipse.core.runtime.CoreException;
import org.eclipse.core.runtime.content.IContentDescription;
import org.eclipse.jst.jsp.core.internal.Logger;
import org.eclipse.wst.sse.core.internal.encoding.CodedIO;
import org.eclipse.wst.sse.core.internal.encoding.EncodingMemento;
import org.eclipse.wst.sse.core.internal.encoding.IResourceCharsetDetector;
import org.eclipse.wst.sse.core.internal.encoding.NonContentBasedEncodingRules;
import org.eclipse.wst.sse.core.utils.StringUtils;
import org.eclipse.wst.xml.core.internal.contenttype.EncodingParserConstants;
import org.eclipse.wst.xml.core.internal.contenttype.XMLHeadTokenizerConstants;


import com.ibm.icu.util.StringTokenizer;


public class JSPResourceEncodingDetector implements IResourceCharsetDetector {


  private String fCharset;


  private String fContentType;


  private String fContentTypeValue;


  private String fLanguage;


  private String fPageEncodingValue;


  private JSPHeadTokenizer fTokenizer;


  private String fXMLDecEncodingName;


  private boolean unicodeCase;


  private EncodingMemento fEncodingMemento;


  private boolean fHeaderParsed;


  private Reader fReader;


  private boolean fXHTML;


  private boolean fWML;




  /**
   * No Arg constructor.
   */
  public JSPResourceEncodingDetector() {
    super();
  }


  class NullMemento extends EncodingMemento {
    /**
     * 
     */
    public NullMemento() {
      super();
      String defaultCharset = NonContentBasedEncodingRules.useDefaultNameRules(null);
      setJavaCharsetName(defaultCharset);
      setAppropriateDefault(defaultCharset);
      setDetectedCharsetName(null);
    }


  }


  /**
   * @return Returns the contentType.
   */
  public String getContentType() throws IOException {
    ensureInputSet();
    if (!fHeaderParsed) {
      parseInput();
      // we keep track of if header's already been parse, so can make
      // multiple 'get' calls, without causing reparsing.
      fHeaderParsed = true;
      // Note: there is a "hidden assumption" here that an empty
      // string in content should be treated same as not present.
    }
    return fContentType;
  }


  public String getEncoding() throws IOException {
    return getEncodingMemento().getDetectedCharsetName();
  }


  // to ensure consist overall rules used, we'll mark as
  // final,
  // and require subclasses to provide certain pieces of
  // the
  // implementation
  public EncodingMemento getEncodingMemento() throws IOException {
    ensureInputSet();
    if (!fHeaderParsed) {
      parseInput();
      // we keep track of if header's already been
      // parse, so can make
      // multiple 'get' calls, without causing
      // reparsing.
      fHeaderParsed = true;
      // Note: there is a "hidden assumption" here
      // that an empty
      // string in content should be treated same as
      // not present.
    }
    if (fEncodingMemento == null) {
      handleSpecDefault();
    }
    if (fEncodingMemento == null) {
      // safty net
      fEncodingMemento = new NullMemento();
    }
    return fEncodingMemento;
  }


  public String getLanguage() throws IOException {
    ensureInputSet();
    if (!fHeaderParsed) {
      parseInput();
      fHeaderParsed = true;
    }
    return fLanguage;
  }


  public String getSpecDefaultEncoding() {
    // by JSP Spec
    final String enc = "ISO-8859-1"; //$NON-NLS-1$
    return enc;
  }


  public EncodingMemento getSpecDefaultEncodingMemento() {
    resetAll();
    EncodingMemento result = null;
    String enc = getSpecDefaultEncoding();
    if (enc != null) {
      createEncodingMemento(enc, EncodingMemento.DEFAULTS_ASSUMED_FOR_EMPTY_INPUT);
      fEncodingMemento.setAppropriateDefault(enc);
      result = fEncodingMemento;
    }
    return result;
  }


  /**
   * 
   */
  public void set(InputStream inputStream) {
    resetAll();
    fReader = new ByteReader(inputStream);
    try {
      fReader.mark(CodedIO.MAX_MARK_SIZE);
    }
    catch (IOException e) {
      // impossible, since we know ByteReader
      // supports marking
      throw new Error(e);
    }
  }


  /**
   * 
   */
  public void set(IStorage iStorage) throws CoreException {
    resetAll();
    InputStream inputStream = iStorage.getContents();
    InputStream resettableStream = new BufferedInputStream(inputStream, CodedIO.MAX_BUF_SIZE);
    resettableStream.mark(CodedIO.MAX_MARK_SIZE);
    set(resettableStream);
    // TODO we'll need to "remember" IFile, or
    // get its (or its project's) settings, in case
    // those are needed to handle cases when the
    // encoding is not in the file stream.
  }


  /**
   * Note: this is not part of interface to help avoid confusion ... it
   * expected this Reader is a well formed character reader ... that is, its
   * all ready been determined to not be a unicode marked input stream. And,
   * its assumed to be in the correct position, at position zero, ready to
   * read first character.
   */
  public void set(Reader reader) {
    resetAll();
    fReader = reader;
    if (!fReader.markSupported()) {
      fReader = new BufferedReader(fReader);
    }
    try {
      fReader.mark(CodedIO.MAX_MARK_SIZE);
    }
    catch (IOException e) {
      // impossble, since we just checked if markable
      throw new Error(e);
    }
  }


  private boolean canHandleAsUnicodeStream(String tokenType) {
    boolean canHandleAsUnicode = false;
    if (tokenType == EncodingParserConstants.UTF83ByteBOM) {
      canHandleAsUnicode = true;
      String enc = "UTF-8"; //$NON-NLS-1$
      createEncodingMemento(enc, EncodingMemento.DETECTED_STANDARD_UNICODE_BYTES);
      fEncodingMemento.setUTF83ByteBOMUsed(true);
    }
    else if (tokenType == EncodingParserConstants.UTF16BE || tokenType == EncodingParserConstants.UTF16LE) {
      canHandleAsUnicode = true;
      String enc = "UTF-16"; //$NON-NLS-1$
      byte[] bom = (tokenType == EncodingParserConstants.UTF16BE) ? IContentDescription.BOM_UTF_16BE : IContentDescription.BOM_UTF_16LE;
      createEncodingMemento(enc, EncodingMemento.DETECTED_STANDARD_UNICODE_BYTES);
      fEncodingMemento.setUnicodeStream(true);
      fEncodingMemento.setUnicodeBOM(bom);
    }
    return canHandleAsUnicode;
  }


  /**
   * Note: once this instance is created, trace info still needs to be
   * appended by caller, depending on the context its created.
   */
  private void createEncodingMemento(String detectedCharsetName) {
    fEncodingMemento = new EncodingMemento();
    fEncodingMemento.setJavaCharsetName(getAppropriateJavaCharset(detectedCharsetName));
    fEncodingMemento.setDetectedCharsetName(detectedCharsetName);
    // TODO: if detectedCharset and spec default is
    // null, need to use "work
    // bench based" defaults.
    fEncodingMemento.setAppropriateDefault(getSpecDefaultEncoding());
  }


  /**
   * There can sometimes be mulitple 'encodings' specified in a file. This
   * is an attempt to centralize the rules for deciding between them.
   * Returns encoding according to priority: 1. XML Declaration 2. page
   * directive pageEncoding name 3. page directive contentType charset name
   */
  private String getAppropriateEncoding() {
    String result = null;
    if (fXMLDecEncodingName != null)
      result = fXMLDecEncodingName;
    else if (fPageEncodingValue != null)
      result = fPageEncodingValue;
    else if (fCharset != null)
      result = fCharset;
    return result;
  }


  /**
   * This method can return null, if invalid charset name (in which case
   * "appropriateDefault" should be used, if a name is really need for some
   * "save anyway" cases).
   * 
   * @param detectedCharsetName
   * @return
   */
  private String getAppropriateJavaCharset(String detectedCharsetName) {
    String result = null;
    // 1. Check explicit mapping overrides from
    // property file -- its here we pick up "rules" for cases
    // that are not even in Java
    result = CodedIO.checkMappingOverrides(detectedCharsetName);
    // 2. Use the "canonical" name from JRE mappings
    // Note: see Charset JavaDoc, the name you get one
    // with can be alias,
    // the name you get back is "standard" name.
    Charset javaCharset = null;
    try {
      javaCharset = Charset.forName(detectedCharsetName);
    }
    catch (UnsupportedCharsetException e) {
      // only set invalid, if result is same as detected -- they won't
      // be equal if
      // overridden
      if (result != null && result.equals(detectedCharsetName)) {
        fEncodingMemento.setInvalidEncoding(detectedCharsetName);
      }
    }
    catch (IllegalCharsetNameException e) {
      // only set invalid, if result is same as detected -- they won't
      // be equal if
      // overridden
      if (result != null && result.equals(detectedCharsetName)) {
        fEncodingMemento.setInvalidEncoding(detectedCharsetName);
      }
    }
    // give priority to java cononical name, if present
    if (javaCharset != null) {
      result = javaCharset.name();
      // but still allow overrides
      result = CodedIO.checkMappingOverrides(result);
    }
    return result;
  }


  private JSPHeadTokenizer getTokinizer() {
    if (fTokenizer == null) {
      fTokenizer = new JSPHeadTokenizer();
    }
    return fTokenizer;
  }


  private void handleSpecDefault() {
    String encodingName;
    encodingName = getSpecDefaultEncoding();
    if (encodingName != null) {
      // createEncodingMemento(encodingName,
      // EncodingMemento.USED_CONTENT_TYPE_DEFAULT);
      fEncodingMemento = new EncodingMemento();
      fEncodingMemento.setJavaCharsetName(encodingName);
      fEncodingMemento.setAppropriateDefault(encodingName);
    }
  }


  private boolean isLegalString(String valueTokenType) {
    boolean result = false;
    if (valueTokenType != null) {
      result = valueTokenType.equals(EncodingParserConstants.StringValue) || valueTokenType.equals(EncodingParserConstants.UnDelimitedStringValue) || valueTokenType.equals(EncodingParserConstants.InvalidTerminatedStringValue) || valueTokenType.equals(EncodingParserConstants.InvalidTermintatedUnDelimitedStringValue);
    }
    return result;
  }




  /**
   * This method should be exactly the same as what is in 
   * JSPHeadTokenizerTester
   * @param contentType
   */
  private void parseContentTypeValue(String contentType) {
    /*
     * Based partially on
     * org.eclipse.jst.jsp.core.internal.document.PageDirectiveAdapterImpl
     * .getMimeTypeFromContentTypeValue(String) , divides the full value
     * into segments according to ';', assumes the first specifies the
     * content type itself if it has no '=', and that the remainder are
     * parameters which may specify a charset
     */
    
    String cleanContentTypeValue = StringUtils.stripNonLetterDigits(contentType);
    /* Break the mime header into the main value and its parameters, separated by ';' */
    StringTokenizer tokenizer = new StringTokenizer(cleanContentTypeValue, ";"); //$NON-NLS-1$
    int tLen = tokenizer.countTokens();
    if (tLen == 0)
      return;
    String[] tokens = new String[tLen];
    int j = 0;
    while (tokenizer.hasMoreTokens()) {
      tokens[j] = tokenizer.nextToken();
      j++;
    }
    
    int firstParameter = 0;
    if (tokens[0].indexOf('=') == -1) {
      /*
       * no equal sign in the first segment, so assume it indicates a
       * content type properly
       */
      fContentType = tokens[0].trim();
      firstParameter = 1;
    }
    /*
     * now handle parameters as name=value pairs, looking for "charset"
     * specifically
     */
    Pattern equalPattern = Pattern.compile("\\s*=\\s*"); //$NON-NLS-1$
    for (int i = firstParameter; i < tokens.length; i++) {
      String[] pair = equalPattern.split(tokens[i]);
      if (pair.length < 2)
        continue;
      if (pair[0].trim().equals("charset")) { //$NON-NLS-1$
        fCharset = pair[1].trim();
      }
    }
  }




  /**
   * Looks for what ever encoding properties the tokenizer returns. Its the
   * responsibility of the tokenizer to stop when appropriate and not go too
   * far.
   */
  private void parseHeader(JSPHeadTokenizer tokenizer) throws Exception {
    fPageEncodingValue = null;
    fCharset = null;


    HeadParserToken token = null;
    do {
      // don't use 'get' here (at least until reset issue fixed)
      token = tokenizer.getNextToken();
      String tokenType = token.getType();
      if (canHandleAsUnicodeStream(tokenType))
        unicodeCase = true;
      else {
        if (tokenType == XMLHeadTokenizerConstants.XMLDelEncoding) {
          if (tokenizer.hasMoreTokens()) {
            HeadParserToken valueToken = tokenizer.getNextToken();
            String valueTokenType = valueToken.getType();
            if (isLegalString(valueTokenType)) {
              fXMLDecEncodingName = valueToken.getText();
            }
          }
        }
        else if (tokenType == JSPHeadTokenizerConstants.PageEncoding) {
          if (tokenizer.hasMoreTokens()) {
            HeadParserToken valueToken = tokenizer.getNextToken();
            String valueTokenType = valueToken.getType();
            if (isLegalString(valueTokenType)) {
              fPageEncodingValue = valueToken.getText();
            }
          }
        }
        else if (tokenType == JSPHeadTokenizerConstants.PageContentType) {
          if (tokenizer.hasMoreTokens()) {
            HeadParserToken valueToken = tokenizer.getNextToken();
            String valueTokenType = valueToken.getType();
            if (isLegalString(valueTokenType)) {
              fContentTypeValue = valueToken.getText();
            }
          }
        }
        else if (tokenType == JSPHeadTokenizerConstants.PageLanguage) {
          if (tokenizer.hasMoreTokens()) {
            HeadParserToken valueToken = tokenizer.getNextToken();
            String valueTokenType = valueToken.getType();
            if (isLegalString(valueTokenType)) {
              fLanguage = valueToken.getText();
            }
          }
        }
      }
    }
    while (tokenizer.hasMoreTokens());
    if (fContentTypeValue != null) {
      parseContentTypeValue(fContentTypeValue);
    }
    if (tokenizer.isXHTML()) {
      fXHTML = true;
    }
    if (tokenizer.isWML() ) {
      fWML = true;
    }
  }


  private void parseInput() throws IOException {
    JSPHeadTokenizer tokenizer = getTokinizer();
    fReader.reset();
    tokenizer.reset(fReader);
    try {
      parseHeader(tokenizer);
      // unicode stream cases are created directly in parseHeader
      if (!unicodeCase) {
        String enc = getAppropriateEncoding();
        if (enc != null && enc.length() > 0) {
          createEncodingMemento(enc, EncodingMemento.FOUND_ENCODING_IN_CONTENT);
        }
      }
    } catch (Exception e) {
      Logger.log(Logger.ERROR_DEBUG, e.getMessage());
    }
  }


  /**
   * 
   */
  private void resetAll() {
    fReader = null;
    fHeaderParsed = false;
    fEncodingMemento = null;
    fCharset = null;
    fContentTypeValue = null;
    fPageEncodingValue = null;
    fXMLDecEncodingName = null;
    unicodeCase = false;
    fXHTML=false;
    fWML=false;
  }






  /**
   * convience method all subclasses can use (but not override)
   * 
   * @param detectedCharsetName
   * @param reason
   */
  private void createEncodingMemento(String detectedCharsetName, String reason) {
    createEncodingMemento(detectedCharsetName);
  }


  /**
   * convience method all subclasses can use (but not override)
   */
  private void ensureInputSet() {
    if (fReader == null) {
      throw new IllegalStateException("input must be set before use"); //$NON-NLS-1$
    }
  }


  public boolean isWML() throws IOException {
    ensureInputSet();
    if (!fHeaderParsed) {
      parseInput();
      // we keep track of if header's already been parse, so can make
      // multiple 'get' calls, without causing reparsing.
      fHeaderParsed = true;
      // Note: there is a "hidden assumption" here that an empty
      // string in content should be treated same as not present.
    }
    return fWML;
  }


  public boolean isXHTML() throws IOException {
    ensureInputSet();
    if (!fHeaderParsed) {
      parseInput();
      // we keep track of if header's already been parse, so can make
      // multiple 'get' calls, without causing reparsing.
      fHeaderParsed = true;
      // Note: there is a "hidden assumption" here that an empty
      // string in content should be treated same as not present.
    }
    return fXHTML;
  }
}
// Source Code of org.eclipse.jst.jsp.core.internal.contenttype.JSPResourceEncodingDetector

// Related Classes of org.eclipse.jst.jsp.core.internal.contenttype.JSPResourceEncodingDetector