// Package: com.ibm.icu.text
//
// Source code of com.ibm.icu.text.CollationRuleParser (including the IndirectBoundaries inner class)

/**
*******************************************************************************
* Copyright (C) 1996-2011, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*/
package com.ibm.icu.text;

import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import com.ibm.icu.impl.ICUResourceBundle;
import com.ibm.icu.impl.PatternProps;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Collator.ReorderCodes;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.UResourceBundle;

/**
* Class for parsing collation rules, produces a list of tokens that will be turned into collation elements
*
* @author Syn Wee Quek
* @since release 2.2, June 7 2002
*/
final class CollationRuleParser {
  // public data members ---------------------------------------------------

  // package private constructors ------------------------------------------

  /**
   * <p>
   * RuleBasedCollator constructor that takes the rules. Please see RuleBasedCollator class description for more details on the collation
   * rule syntax.
   * </p>
   *
   * @see java.util.Locale
   * @param rules
   *            the collation rules to build the collation table from.
   * @exception ParseException
   *                thrown when argument rules have an invalid syntax.
   */
  CollationRuleParser(String rules) throws ParseException {
    // Prepares m_copySet_ and m_removeSet_.
    rules = preprocessRules(rules);

    // Save the rules as a long string.  The StringBuilder object is
    // used to store the result of token parsing as well.
    m_source_ = new StringBuilder(Normalizer.decompose(rules, false).trim());
    m_rules_ = m_source_.toString();

    // Index of the next unparsed character.
    m_current_ = 0;

    // Index of the next unwritten character in the parsed result.
    m_extraCurrent_ = m_source_.length();

    m_variableTop_ = null;
    m_parsedToken_ = new ParsedToken();
    m_hashTable_ = new HashMap<Token, Token>();
    m_options_ = new OptionSet(RuleBasedCollator.UCA_);
    m_listHeader_ = new TokenListHeader[512];
    m_resultLength_ = 0;
    // call assembleTokenList() manually, so that we can
    // init a parser and manually parse tokens
    //assembleTokenList();
  }

  // package private inner classes -----------------------------------------

  /**
   * Collation options set
   */
  static class OptionSet {
    // package private constructor ---------------------------------------

    /**
     * Initializes the option set with the argument collators
     *
     * @param collator
     *            option to use
     */
    OptionSet(RuleBasedCollator collator) {
      m_variableTopValue_ = collator.m_variableTopValue_;
      m_isFrenchCollation_ = collator.isFrenchCollation();
      m_isAlternateHandlingShifted_ = collator.isAlternateHandlingShifted();
      m_caseFirst_ = collator.m_caseFirst_;
      m_isCaseLevel_ = collator.isCaseLevel();
      m_decomposition_ = collator.getDecomposition();
      m_strength_ = collator.getStrength();
      m_isHiragana4_ = collator.m_isHiragana4_;

      if (collator.m_reorderCodes_ != null) {
        m_scriptOrder_ = new int[collator.m_reorderCodes_.length];
        for (int i = 0; i < m_scriptOrder_.length; i++) {
          m_scriptOrder_[i] = collator.m_reorderCodes_[i];
        }
      }

    }

    // package private data members --------------------------------------

    int m_variableTopValue_;
    boolean m_isFrenchCollation_;
    /**
     * Attribute for handling variable elements
     */
    boolean m_isAlternateHandlingShifted_;
    /**
     * who goes first, lower case or uppercase
     */
    int m_caseFirst_;
    /**
     * do we have an extra case level
     */
    boolean m_isCaseLevel_;
    /**
     * attribute for normalization
     */
    int m_decomposition_;
    /**
     * attribute for strength
     */
    int m_strength_;
    /**
     * attribute for special Hiragana
     */
    boolean m_isHiragana4_;

    /**
     * the ordering of the scripts
     */
    int[] m_scriptOrder_;
  }

  /**
   * List of tokens used by the collation rules
   */
  static class TokenListHeader {
    Token m_first_;
    Token m_last_;
    Token m_reset_;
    boolean m_indirect_;
    int m_baseCE_;
    int m_baseContCE_;
    int m_nextCE_;
    int m_nextContCE_;
    int m_previousCE_;
    int m_previousContCE_;
    int m_pos_[] = new int[Collator.IDENTICAL + 1];
    int m_gapsLo_[] = new int[3 * (Collator.TERTIARY + 1)];
    int m_gapsHi_[] = new int[3 * (Collator.TERTIARY + 1)];
    int m_numStr_[] = new int[3 * (Collator.TERTIARY + 1)];
    Token m_fStrToken_[] = new Token[Collator.TERTIARY + 1];
    Token m_lStrToken_[] = new Token[Collator.TERTIARY + 1];
  }

  /**
   * Token wrapper for collation rules
   */
  static class Token {
    // package private data members ---------------------------------------

    int m_CE_[];
    int m_CELength_;
    int m_expCE_[];
    int m_expCELength_;
    int m_source_;
    int m_expansion_;
    int m_prefix_;
    int m_strength_;
    int m_toInsert_;
    int m_polarity_; // 1 for <, <<, <<<, , ; and 0 for >, >>, >>>
    TokenListHeader m_listHeader_;
    Token m_previous_;
    Token m_next_;
    StringBuilder m_rules_;
    char m_flags_;

    // package private constructors ---------------------------------------

    Token() {
      m_CE_ = new int[128];
      m_expCE_ = new int[128];
      // TODO: this should also handle reverse
      m_polarity_ = TOKEN_POLARITY_POSITIVE_;
      m_next_ = null;
      m_previous_ = null;
      m_CELength_ = 0;
      m_expCELength_ = 0;
    }

    // package private methods --------------------------------------------

    /**
     * Hashcode calculation for token
     *
     * @return the hashcode
     */
    @Override
    public int hashCode() {
      int result = 0;
      int len = (m_source_ & 0xFF000000) >>> 24;
      int inc = ((len - 32) / 32) + 1;

      int start = m_source_ & 0x00FFFFFF;
      int limit = start + len;

      while (start < limit) {
        result = (result * 37) + m_rules_.charAt(start);
        start += inc;
      }
      return result;
    }

    /**
     * Equals calculation
     *
     * @param target
     *            object to compare
     * @return true if target is the same as this object
     */
    @Override
    public boolean equals(Object target) {
      if (target == this) {
        return true;
      }
      if (target instanceof Token) {
        Token t = (Token) target;
        int sstart = m_source_ & 0x00FFFFFF;
        int tstart = t.m_source_ & 0x00FFFFFF;
        int slimit = (m_source_ & 0xFF000000) >> 24;
        int tlimit = (m_source_ & 0xFF000000) >> 24;

        int end = sstart + slimit - 1;

        if (m_source_ == 0 || t.m_source_ == 0) {
          return false;
        }
        if (slimit != tlimit) {
          return false;
        }
        if (m_source_ == t.m_source_) {
          return true;
        }

        while (sstart < end && m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart)) {
          ++sstart;
          ++tstart;
        }
        if (m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart)) {
          return true;
        }
      }
      return false;
    }
  }

  // package private data member -------------------------------------------

  /**
   * Indicator that the token is a reset, i.e. an &amp; in the rules
   */
  static final int TOKEN_RESET_ = 0xDEADBEEF;

  /**
   * Size of the number of tokens
   */
  int m_resultLength_;
  /**
   * List of parsed tokens
   */
  TokenListHeader m_listHeader_[];
  /**
   * Variable top token
   */
  Token m_variableTop_;
  /**
   * Collation options
   */
  OptionSet m_options_;
  /**
   * Normalized collation rules with some extra characters
   */
  StringBuilder m_source_;
  /**
   * Hash table to keep all tokens
   */
  Map<Token, Token> m_hashTable_;

  // package private method ------------------------------------------------

  void setDefaultOptionsInCollator(RuleBasedCollator collator) {
    collator.m_defaultStrength_ = m_options_.m_strength_;
    collator.m_defaultDecomposition_ = m_options_.m_decomposition_;
    collator.m_defaultIsFrenchCollation_ = m_options_.m_isFrenchCollation_;
    collator.m_defaultIsAlternateHandlingShifted_ = m_options_.m_isAlternateHandlingShifted_;
    collator.m_defaultIsCaseLevel_ = m_options_.m_isCaseLevel_;
    collator.m_defaultCaseFirst_ = m_options_.m_caseFirst_;
    collator.m_defaultIsHiragana4_ = m_options_.m_isHiragana4_;
    collator.m_defaultVariableTopValue_ = m_options_.m_variableTopValue_;
    if (m_options_.m_scriptOrder_ != null) {
      collator.m_defaultReorderCodes_ = m_options_.m_scriptOrder_.clone();
    } else {
      collator.m_defaultReorderCodes_ = null;
    }
  }

  // private inner classes -------------------------------------------------

  /**
   * This is a token that has been parsed but not yet processed. Used to reduce the number of arguments in the parser
   */
  private static class ParsedToken {
    // private constructor ----------------------------------------------

    /**
     * Empty constructor
     */
    ParsedToken() {
      m_charsLen_ = 0;
      m_charsOffset_ = 0;
      m_extensionLen_ = 0;
      m_extensionOffset_ = 0;
      m_prefixLen_ = 0;
      m_prefixOffset_ = 0;
      m_flags_ = 0;
      m_strength_ = TOKEN_UNSET_;
    }

    // private data members ---------------------------------------------

    int m_strength_;
    int m_charsOffset_;
    int m_charsLen_;
    int m_extensionOffset_;
    int m_extensionLen_;
    int m_prefixOffset_;
    int m_prefixLen_;
    char m_flags_;
    char m_indirectIndex_;
  }

  /**
   * Boundary wrappers
   */
  private static class IndirectBoundaries {
    // package private constructor ---------------------------------------

    IndirectBoundaries(int startce[], int limitce[]) {
      // Set values for the top - TODO: once we have values for all the
      // indirects, we are going to initalize here.
      m_startCE_ = startce[0];
      m_startContCE_ = startce[1];
      if (limitce != null) {
        m_limitCE_ = limitce[0];
        m_limitContCE_ = limitce[1];
      } else {
        m_limitCE_ = 0;
        m_limitContCE_ = 0;
      }
    }

    // package private data members --------------------------------------

    int m_startCE_;
    int m_startContCE_;
    int m_limitCE_;
    int m_limitContCE_;
  }

  /**
   * Collation option rule tag
   */
  private static class TokenOption {
    // package private constructor ---------------------------------------

    TokenOption(String name, int attribute, String suboptions[], int suboptionattributevalue[]) {
      m_name_ = name;
      m_attribute_ = attribute;
      m_subOptions_ = suboptions;
      m_subOptionAttributeValues_ = suboptionattributevalue;
    }

    // package private data member ---------------------------------------

    private String m_name_;
    private int m_attribute_;
    private String m_subOptions_[];
    private int m_subOptionAttributeValues_[];
  }

  // private variables -----------------------------------------------------

  /**
   * Current parsed token
   */
  private ParsedToken m_parsedToken_;
  /**
   * Collation rule
   */
  private String m_rules_;
  private int m_current_;
  /**
   * End of the option while reading. Need it for UnicodeSet reading support.
   */
  private int m_optionEnd_;
  /*
   * Current offset in m_source
   */
  //private int m_sourceLimit_;
  /**
   * Offset into m_source_ for the extra expansion characters
   */
  private int m_extraCurrent_;

  /**
   * UnicodeSet that contains code points to be copied from the UCA
   */
  UnicodeSet m_copySet_;

  /**
   * UnicodeSet that contains code points for which we want to remove UCA contractions. It implies copying of these code points from the
   * UCA.
   */
  UnicodeSet m_removeSet_;

  /*
   * This is space for the extra strings that need to be unquoted during the
   * parsing of the rules
   */
  //private static final int TOKEN_EXTRA_RULE_SPACE_SIZE_ = 2048;
  /**
   * Indicator that the token is not set yet
   */
  private static final int TOKEN_UNSET_ = 0xFFFFFFFF;
  /*
   * Indicator that the rule is in the > polarity, ie everything on the
   * right of the rule is less than
   */
  //private static final int TOKEN_POLARITY_NEGATIVE_ = 0;
  /**
   * Indicator that the rule is in the < polarity, ie everything on the right of the rule is greater than
   */
  private static final int TOKEN_POLARITY_POSITIVE_ = 1;
  /**
   * Flag mask to determine if top is set
   */
  private static final int TOKEN_TOP_MASK_ = 0x04;
  /**
   * Flag mask to determine if variable top is set
   */
  private static final int TOKEN_VARIABLE_TOP_MASK_ = 0x08;
  /**
   * Flag mask to determine if a before attribute is set
   */
  private static final int TOKEN_BEFORE_ = 0x03;
  /**
   * For use in parsing token options
   */
  private static final int TOKEN_SUCCESS_MASK_ = 0x10;

  /**
   * These values are used for finding CE values for indirect positioning. Indirect positioning is a mechanism for allowing resets on
   * symbolic values. It only works for resets and you cannot tailor indirect names. An indirect name can define either an anchor point or
   * a range. An anchor point behaves in exactly the same way as a code point in reset would, except that it cannot be tailored. A range
   * (currently we only know of the [top] range) will explicitly set the upper bound for generated CEs, thus allowing for better control
   * over how many CEs can be squeezed between in the range without performance penalty. In that respect, we use [top] for tailoring of
   * locales that use CJK characters. Other indirect values are currently a pure convenience, they can be used to assure that the CEs will
   * be always positioned in the same place relative to a point with known properties (e.g. first primary ignorable).
   */
  private static final IndirectBoundaries INDIRECT_BOUNDARIES_[];

  //    /**
  //     * Inverse UCA constants
  //     */
  //    private static final int INVERSE_SIZE_MASK_ = 0xFFF00000;
  //    private static final int INVERSE_OFFSET_MASK_ = 0x000FFFFF;
  //    private static final int INVERSE_SHIFT_VALUE_ = 20;

  /**
   * Collation option tags [last variable] last variable value [last primary ignorable] largest CE for primary ignorable [last secondary
   * ignorable] largest CE for secondary ignorable [last tertiary ignorable] largest CE for tertiary ignorable [top] guaranteed to be
   * above all implicit CEs, for now and in the future (in 1.8)
   */
  private static final TokenOption RULES_OPTIONS_[];

  static {
    INDIRECT_BOUNDARIES_ = new IndirectBoundaries[15];
    // UCOL_RESET_TOP_VALUE
    INDIRECT_BOUNDARIES_[0] = new IndirectBoundaries(RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_,
        RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_);
    // UCOL_FIRST_PRIMARY_IGNORABLE
    INDIRECT_BOUNDARIES_[1] = new IndirectBoundaries(RuleBasedCollator.UCA_CONSTANTS_.FIRST_PRIMARY_IGNORABLE_, null);
    // UCOL_LAST_PRIMARY_IGNORABLE
    INDIRECT_BOUNDARIES_[2] = new IndirectBoundaries(RuleBasedCollator.UCA_CONSTANTS_.LAST_PRIMARY_IGNORABLE_, null);

    // UCOL_FIRST_SECONDARY_IGNORABLE
    INDIRECT_BOUNDARIES_[3] = new IndirectBoundaries(RuleBasedCollator.UCA_CONSTANTS_.FIRST_SECONDARY_IGNORABLE_, null);
    // UCOL_LAST_SECONDARY_IGNORABLE
    INDIRECT_BOUNDARIES_[4] = new IndirectBoundaries(RuleBasedCollator.UCA_CONSTANTS_.LAST_SECONDARY_IGNORABLE_, null);
    // UCOL_FIRST_TERTIARY_IGNORABLE
    INDIRECT_BOUNDARIES_[5] = new IndirectBoundaries(RuleBasedCollator.UCA_CONSTANTS_.FIRST_TERTIARY_IGNORABLE_, null);
    // UCOL_LAST_TERTIARY_IGNORABLE
    INDIRECT_BOUNDARIES_[6] = new IndirectBoundaries(RuleBasedCollator.UCA_CONSTANTS_.LAST_TERTIARY_IGNORABLE_, null);
    // UCOL_FIRST_VARIABLE;
    INDIRECT_BOUNDARIES_[7] = new IndirectBoundaries(RuleBasedCollator.UCA_CONSTANTS_.FIRST_VARIABLE_, null);
    // UCOL_LAST_VARIABLE
    INDIRECT_BOUNDARIES_[8] = new IndirectBoundaries(RuleBasedCollator.UCA_CONSTANTS_.LAST_VARIABLE_, null);
    // UCOL_FIRST_NON_VARIABLE
    INDIRECT_BOUNDARIES_[9] = new IndirectBoundaries(RuleBasedCollator.UCA_CONSTANTS_.FIRST_NON_VARIABLE_, null);
    // UCOL_LAST_NON_VARIABLE
    INDIRECT_BOUNDARIES_[10] = new IndirectBoundaries(RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_,
        RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_);
    // UCOL_FIRST_IMPLICIT
    INDIRECT_BOUNDARIES_[11] = new IndirectBoundaries(RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_, null);
    // UCOL_LAST_IMPLICIT
    INDIRECT_BOUNDARIES_[12] = new IndirectBoundaries(RuleBasedCollator.UCA_CONSTANTS_.LAST_IMPLICIT_,
        RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_);
    // UCOL_FIRST_TRAILING
    INDIRECT_BOUNDARIES_[13] = new IndirectBoundaries(RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_, null);
    // UCOL_LAST_TRAILING
    INDIRECT_BOUNDARIES_[14] = new IndirectBoundaries(RuleBasedCollator.UCA_CONSTANTS_.LAST_TRAILING_, null);
    INDIRECT_BOUNDARIES_[14].m_limitCE_ = RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_SPECIAL_MIN_ << 24;

    RULES_OPTIONS_ = new TokenOption[20];
    String option[] = { "non-ignorable", "shifted" };
    int value[] = { RuleBasedCollator.AttributeValue.NON_IGNORABLE_, RuleBasedCollator.AttributeValue.SHIFTED_ };
    RULES_OPTIONS_[0] = new TokenOption("alternate", RuleBasedCollator.Attribute.ALTERNATE_HANDLING_, option, value);
    option = new String[1];
    option[0] = "2";
    value = new int[1];
    value[0] = RuleBasedCollator.AttributeValue.ON_;
    RULES_OPTIONS_[1] = new TokenOption("backwards", RuleBasedCollator.Attribute.FRENCH_COLLATION_, option, value);
    String offonoption[] = new String[2];
    offonoption[0] = "off";
    offonoption[1] = "on";
    int offonvalue[] = new int[2];
    offonvalue[0] = RuleBasedCollator.AttributeValue.OFF_;
    offonvalue[1] = RuleBasedCollator.AttributeValue.ON_;
    RULES_OPTIONS_[2] = new TokenOption("caseLevel", RuleBasedCollator.Attribute.CASE_LEVEL_, offonoption, offonvalue);
    option = new String[3];
    option[0] = "lower";
    option[1] = "upper";
    option[2] = "off";
    value = new int[3];
    value[0] = RuleBasedCollator.AttributeValue.LOWER_FIRST_;
    value[1] = RuleBasedCollator.AttributeValue.UPPER_FIRST_;
    value[2] = RuleBasedCollator.AttributeValue.OFF_;
    RULES_OPTIONS_[3] = new TokenOption("caseFirst", RuleBasedCollator.Attribute.CASE_FIRST_, option, value);
    RULES_OPTIONS_[4] = new TokenOption("normalization", RuleBasedCollator.Attribute.NORMALIZATION_MODE_, offonoption, offonvalue);
    RULES_OPTIONS_[5] = new TokenOption("hiraganaQ", RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_, offonoption, offonvalue);
    option = new String[5];
    option[0] = "1";
    option[1] = "2";
    option[2] = "3";
    option[3] = "4";
    option[4] = "I";
    value = new int[5];
    value[0] = RuleBasedCollator.AttributeValue.PRIMARY_;
    value[1] = RuleBasedCollator.AttributeValue.SECONDARY_;
    value[2] = RuleBasedCollator.AttributeValue.TERTIARY_;
    value[3] = RuleBasedCollator.AttributeValue.QUATERNARY_;
    value[4] = RuleBasedCollator.AttributeValue.IDENTICAL_;
    RULES_OPTIONS_[6] = new TokenOption("strength", RuleBasedCollator.Attribute.STRENGTH_, option, value);
    RULES_OPTIONS_[7] = new TokenOption("variable top", RuleBasedCollator.Attribute.LIMIT_, null, null);
    RULES_OPTIONS_[8] = new TokenOption("rearrange", RuleBasedCollator.Attribute.LIMIT_, null, null);
    option = new String[3];
    option[0] = "1";
    option[1] = "2";
    option[2] = "3";
    value = new int[3];
    value[0] = RuleBasedCollator.AttributeValue.PRIMARY_;
    value[1] = RuleBasedCollator.AttributeValue.SECONDARY_;
    value[2] = RuleBasedCollator.AttributeValue.TERTIARY_;
    RULES_OPTIONS_[9] = new TokenOption("before", RuleBasedCollator.Attribute.LIMIT_, option, value);
    RULES_OPTIONS_[10] = new TokenOption("top", RuleBasedCollator.Attribute.LIMIT_, null, null);
    String firstlastoption[] = new String[7];
    firstlastoption[0] = "primary";
    firstlastoption[1] = "secondary";
    firstlastoption[2] = "tertiary";
    firstlastoption[3] = "variable";
    firstlastoption[4] = "regular";
    firstlastoption[5] = "implicit";
    firstlastoption[6] = "trailing";

    int firstlastvalue[] = new int[7];
    Arrays.fill(firstlastvalue, RuleBasedCollator.AttributeValue.PRIMARY_);

    RULES_OPTIONS_[11] = new TokenOption("first", RuleBasedCollator.Attribute.LIMIT_, firstlastoption, firstlastvalue);
    RULES_OPTIONS_[12] = new TokenOption("last", RuleBasedCollator.Attribute.LIMIT_, firstlastoption, firstlastvalue);
    RULES_OPTIONS_[13] = new TokenOption("optimize", RuleBasedCollator.Attribute.LIMIT_, null, null);
    RULES_OPTIONS_[14] = new TokenOption("suppressContractions", RuleBasedCollator.Attribute.LIMIT_, null, null);
    RULES_OPTIONS_[15] = new TokenOption("undefined", RuleBasedCollator.Attribute.LIMIT_, null, null);
    RULES_OPTIONS_[16] = new TokenOption("reorder", RuleBasedCollator.Attribute.LIMIT_, null, null);
    RULES_OPTIONS_[17] = new TokenOption("charsetname", RuleBasedCollator.Attribute.LIMIT_, null, null);
    RULES_OPTIONS_[18] = new TokenOption("charset", RuleBasedCollator.Attribute.LIMIT_, null, null);
    RULES_OPTIONS_[19] = new TokenOption("import", RuleBasedCollator.Attribute.LIMIT_, null, null);
  }

  /**
   * Utility data members
   */
  private Token m_utilToken_ = new Token();
  private CollationElementIterator m_UCAColEIter_ = RuleBasedCollator.UCA_.getCollationElementIterator("");
  private int m_utilCEBuffer_[] = new int[2];

  private boolean m_isStarred_;

  private int m_currentStarredCharIndex_;

  private int m_lastStarredCharIndex_;

  private int m_currentRangeCp_;

  private int m_lastRangeCp_;

  private boolean m_inRange_;

  private int m_previousCp_;

  private boolean m_savedIsStarred_;

  // private methods -------------------------------------------------------

  /**
   * Assembles the token list
   *
   * @exception ParseException
   *                thrown when rules syntax fails
   */
  int assembleTokenList() throws ParseException {
    Token lastToken = null;
    m_parsedToken_.m_strength_ = TOKEN_UNSET_;
    int sourcelimit = m_source_.length();
    int expandNext = 0;

    m_isStarred_ = false;

    while (m_current_ < sourcelimit || m_isStarred_) {
      m_parsedToken_.m_prefixOffset_ = 0;
      if (parseNextToken(lastToken == null) < 0) {
        // we have reached the end
        continue;
      }
      char specs = m_parsedToken_.m_flags_;
      boolean variableTop = ((specs & TOKEN_VARIABLE_TOP_MASK_) != 0);
      boolean top = ((specs & TOKEN_TOP_MASK_) != 0);
      int lastStrength = TOKEN_UNSET_;
      if (lastToken != null) {
        lastStrength = lastToken.m_strength_;
      }
      m_utilToken_.m_source_ = m_parsedToken_.m_charsLen_ << 24 | m_parsedToken_.m_charsOffset_;
      m_utilToken_.m_rules_ = m_source_;
      // 4 Lookup each source in the CharsToToken map, and find a
      // sourcetoken
      Token sourceToken = m_hashTable_.get(m_utilToken_);
      if (m_parsedToken_.m_strength_ != TOKEN_RESET_) {
        if (lastToken == null) {
          // this means that rules haven't started properly
          throwParseException(m_source_.toString(), 0);
        }
        //  6 Otherwise (when relation != reset)
        if (sourceToken == null) {
          // If sourceToken is null, create new one
          sourceToken = new Token();
          sourceToken.m_rules_ = m_source_;
          sourceToken.m_source_ = m_parsedToken_.m_charsLen_ << 24 | m_parsedToken_.m_charsOffset_;
          sourceToken.m_prefix_ = m_parsedToken_.m_prefixLen_ << 24 | m_parsedToken_.m_prefixOffset_;
          // TODO: this should also handle reverse
          sourceToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_;
          sourceToken.m_next_ = null;
          sourceToken.m_previous_ = null;
          sourceToken.m_CELength_ = 0;
          sourceToken.m_expCELength_ = 0;
          m_hashTable_.put(sourceToken, sourceToken);
        } else {
          // we could have fished out a reset here
          if (sourceToken.m_strength_ != TOKEN_RESET_ && lastToken != sourceToken) {
            // otherwise remove sourceToken from where it was.

            // Take care of the next node
            if (sourceToken.m_next_ != null) {
              if (sourceToken.m_next_.m_strength_ > sourceToken.m_strength_) {
                sourceToken.m_next_.m_strength_ = sourceToken.m_strength_;
              }
              sourceToken.m_next_.m_previous_ = sourceToken.m_previous_;
            } else {
              // sourcetoken is the last token.
              // Redefine the tail token.
              sourceToken.m_listHeader_.m_last_ = sourceToken.m_previous_;
            }

            // Take care of the previous node.
            if (sourceToken.m_previous_ != null) {
              sourceToken.m_previous_.m_next_ = sourceToken.m_next_;
            } else {
              // sourcetoken is the first token.
              // Redefine the head node.
              sourceToken.m_listHeader_.m_first_ = sourceToken.m_next_;
            }
            sourceToken.m_next_ = null;
            sourceToken.m_previous_ = null;
          }
        }
        sourceToken.m_strength_ = m_parsedToken_.m_strength_;
        sourceToken.m_listHeader_ = lastToken.m_listHeader_;

        // 1.  Find the strongest strength in each list, and set
        // strongestP and strongestN accordingly in the headers.
        if (lastStrength == TOKEN_RESET_ || sourceToken.m_listHeader_.m_first_ == null) {
          // If LAST is a reset insert sourceToken in the list.
          if (sourceToken.m_listHeader_.m_first_ == null) {
            sourceToken.m_listHeader_.m_first_ = sourceToken;
            sourceToken.m_listHeader_.m_last_ = sourceToken;
          } else { // we need to find a place for us
                // and we'll get in front of the same strength
            if (sourceToken.m_listHeader_.m_first_.m_strength_ <= sourceToken.m_strength_) {
              sourceToken.m_next_ = sourceToken.m_listHeader_.m_first_;
              sourceToken.m_next_.m_previous_ = sourceToken;
              sourceToken.m_listHeader_.m_first_ = sourceToken;
              sourceToken.m_previous_ = null;
            } else {
              lastToken = sourceToken.m_listHeader_.m_first_;
              while (lastToken.m_next_ != null && lastToken.m_next_.m_strength_ > sourceToken.m_strength_) {
                lastToken = lastToken.m_next_;
              }
              if (lastToken.m_next_ != null) {
                lastToken.m_next_.m_previous_ = sourceToken;
              } else {
                sourceToken.m_listHeader_.m_last_ = sourceToken;
              }
              sourceToken.m_previous_ = lastToken;
              sourceToken.m_next_ = lastToken.m_next_;
              lastToken.m_next_ = sourceToken;
            }
          }
        } else {
          // Otherwise (when LAST is not a reset)
          // if polarity (LAST) == polarity(relation), insert
          // sourceToken after LAST, otherwise insert before.
          // when inserting after or before, search to the next
          // position with the same strength in that direction.
          // (This is called postpone insertion).
          if (sourceToken != lastToken) {
            if (lastToken.m_polarity_ == sourceToken.m_polarity_) {
              while (lastToken.m_next_ != null && lastToken.m_next_.m_strength_ > sourceToken.m_strength_) {
                lastToken = lastToken.m_next_;
              }
              sourceToken.m_previous_ = lastToken;
              if (lastToken.m_next_ != null) {
                lastToken.m_next_.m_previous_ = sourceToken;
              } else {
                sourceToken.m_listHeader_.m_last_ = sourceToken;
              }
              sourceToken.m_next_ = lastToken.m_next_;
              lastToken.m_next_ = sourceToken;
            } else {
              while (lastToken.m_previous_ != null && lastToken.m_previous_.m_strength_ > sourceToken.m_strength_) {
                lastToken = lastToken.m_previous_;
              }
              sourceToken.m_next_ = lastToken;
              if (lastToken.m_previous_ != null) {
                lastToken.m_previous_.m_next_ = sourceToken;
              } else {
                sourceToken.m_listHeader_.m_first_ = sourceToken;
              }
              sourceToken.m_previous_ = lastToken.m_previous_;
              lastToken.m_previous_ = sourceToken;
            }
          } else { // repeated one thing twice in rules, stay with the
                // stronger strength
            if (lastStrength < sourceToken.m_strength_) {
              sourceToken.m_strength_ = lastStrength;
            }
          }
        }
        // if the token was a variable top, we're gonna put it in
        if (variableTop == true && m_variableTop_ == null) {
          variableTop = false;
          m_variableTop_ = sourceToken;
        }
        // Treat the expansions.
        // There are two types of expansions: explicit (x / y) and
        // reset based propagating expansions
        // (&abc * d * e <=> &ab * d / c * e / c)
        // if both of them are in effect for a token, they are combined.
        sourceToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24 | m_parsedToken_.m_extensionOffset_;
        if (expandNext != 0) {
          if (sourceToken.m_strength_ == Collator.PRIMARY) {
            // primary strength kills off the implicit expansion
            expandNext = 0;
          } else if (sourceToken.m_expansion_ == 0) {
            // if there is no expansion, implicit is just added to
            // the token
            sourceToken.m_expansion_ = expandNext;
          } else {
            // there is both explicit and implicit expansion.
            // We need to make a combination
            int start = expandNext & 0xFFFFFF;
            int size = expandNext >>> 24;
            if (size > 0) {
              m_source_.append(m_source_.substring(start, start + size));
            }
            start = m_parsedToken_.m_extensionOffset_;
            m_source_.append(m_source_.substring(start, start + m_parsedToken_.m_extensionLen_));
            sourceToken.m_expansion_ = (size + m_parsedToken_.m_extensionLen_) << 24 | m_extraCurrent_;
            m_extraCurrent_ += size + m_parsedToken_.m_extensionLen_;
          }
        }
        // if the previous token was a reset before, the strength of this
        // token must match the strength of before. Otherwise we have an
        // undefined situation.
        // In other words, we currently have a cludge which we use to
        // represent &a >> x. This is written as &[before 2]a << x.
        if ((lastToken.m_flags_ & TOKEN_BEFORE_) != 0) {
          int beforeStrength = (lastToken.m_flags_ & TOKEN_BEFORE_) - 1;
          if (beforeStrength != sourceToken.m_strength_) {
            throwParseException(m_source_.toString(), m_current_);
          }
        }

      } else {
        if (lastToken != null && lastStrength == TOKEN_RESET_) {
          // if the previous token was also a reset, this means that
          // we have two consecutive resets and we want to remove the
          // previous one if empty
          if (m_resultLength_ > 0 && m_listHeader_[m_resultLength_ - 1].m_first_ == null) {
            m_resultLength_--;
          }
        }
        if (sourceToken == null) {
          // this is a reset, but it might still be somewhere in the
          // tailoring, in shorter form
          int searchCharsLen = m_parsedToken_.m_charsLen_;
          while (searchCharsLen > 1 && sourceToken == null) {
            searchCharsLen--;
            // key = searchCharsLen << 24 | charsOffset;
            m_utilToken_.m_source_ = searchCharsLen << 24 | m_parsedToken_.m_charsOffset_;
            m_utilToken_.m_rules_ = m_source_;
            sourceToken = m_hashTable_.get(m_utilToken_);
          }
          if (sourceToken != null) {
            expandNext = (m_parsedToken_.m_charsLen_ - searchCharsLen) << 24 | (m_parsedToken_.m_charsOffset_ + searchCharsLen);
          }
        }
        if ((specs & TOKEN_BEFORE_) != 0) {
          if (top == false) {
            // we're doing before & there is no indirection
            int strength = (specs & TOKEN_BEFORE_) - 1;
            if (sourceToken != null && sourceToken.m_strength_ != TOKEN_RESET_) {
              // this is a before that is already ordered in the UCA
              // - so we need to get the previous with good strength
              while (sourceToken.m_strength_ > strength && sourceToken.m_previous_ != null) {
                sourceToken = sourceToken.m_previous_;
              }
              // here, either we hit the strength or NULL
              if (sourceToken.m_strength_ == strength) {
                if (sourceToken.m_previous_ != null) {
                  sourceToken = sourceToken.m_previous_;
                } else { // start of list
                  sourceToken = sourceToken.m_listHeader_.m_reset_;
                }
              } else { // we hit NULL, we should be doing the else part
                sourceToken = sourceToken.m_listHeader_.m_reset_;
                sourceToken = getVirginBefore(sourceToken, strength);
              }
            } else {
              sourceToken = getVirginBefore(sourceToken, strength);
            }
          } else {
            // this is both before and indirection
            top = false;
            m_listHeader_[m_resultLength_] = new TokenListHeader();
            m_listHeader_[m_resultLength_].m_previousCE_ = 0;
            m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
            m_listHeader_[m_resultLength_].m_indirect_ = true;
            // we need to do slightly more work. we need to get the
            // baseCE using the inverse UCA & getPrevious. The next
            // bound is not set, and will be decided in ucol_bld
            int strength = (specs & TOKEN_BEFORE_) - 1;
            int baseCE = INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_].m_startCE_;
            int baseContCE = INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_].m_startContCE_;
            int ce[] = new int[2];
            if ((baseCE >>> 24 >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_)
                && (baseCE >>> 24 <= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_)) { /* implicits - */
              int primary = baseCE & RuleBasedCollator.CE_PRIMARY_MASK_
                  | (baseContCE & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16;
              int raw = RuleBasedCollator.impCEGen_.getRawFromImplicit(primary);
              int primaryCE = RuleBasedCollator.impCEGen_.getImplicitFromRaw(raw - 1);
              ce[0] = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505;
              ce[1] = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_ | RuleBasedCollator.CE_CONTINUATION_MARKER_;
            } else {
              CollationParsedRuleBuilder.InverseUCA invuca = CollationParsedRuleBuilder.INVERSE_UCA_;
              invuca.getInversePrevCE(baseCE, baseContCE, strength, ce);
            }
            m_listHeader_[m_resultLength_].m_baseCE_ = ce[0];
            m_listHeader_[m_resultLength_].m_baseContCE_ = ce[1];
            m_listHeader_[m_resultLength_].m_nextCE_ = 0;
            m_listHeader_[m_resultLength_].m_nextContCE_ = 0;

            sourceToken = new Token();
            expandNext = initAReset(0, sourceToken);
          }
        }
        // 5 If the relation is a reset:
        // If sourceToken is null
        // Create new list, create new sourceToken, make the baseCE
        // from source, put the sourceToken in ListHeader of the new
        // list
        if (sourceToken == null) {
          if (m_listHeader_[m_resultLength_] == null) {
            m_listHeader_[m_resultLength_] = new TokenListHeader();
          }
          // 3 Consider each item: relation, source, and expansion:
          // e.g. ...< x / y ...
          // First convert all expansions into normal form.
          // Examples:
          // If "xy" doesn't occur earlier in the list or in the UCA,
          // convert &xy * c * d * ... into &x * c/y * d * ...
          // Note: reset values can never have expansions, although
          // they can cause the very next item to have one. They may
          // be contractions, if they are found earlier in the list.
          if (top == false) {
            CollationElementIterator coleiter = RuleBasedCollator.UCA_.getCollationElementIterator(m_source_.substring(
                m_parsedToken_.m_charsOffset_, m_parsedToken_.m_charsOffset_ + m_parsedToken_.m_charsLen_));

            int CE = coleiter.next();
            // offset to the character in the full rule string
            int expand = coleiter.getOffset() + m_parsedToken_.m_charsOffset_;
            int SecondCE = coleiter.next();

            m_listHeader_[m_resultLength_].m_baseCE_ = CE & 0xFFFFFF3F;
            if (RuleBasedCollator.isContinuation(SecondCE)) {
              m_listHeader_[m_resultLength_].m_baseContCE_ = SecondCE;
            } else {
              m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
            }
            m_listHeader_[m_resultLength_].m_nextCE_ = 0;
            m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
            m_listHeader_[m_resultLength_].m_previousCE_ = 0;
            m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
            m_listHeader_[m_resultLength_].m_indirect_ = false;
            sourceToken = new Token();
            expandNext = initAReset(expand, sourceToken);
          } else { // top == TRUE
            top = false;
            m_listHeader_[m_resultLength_].m_previousCE_ = 0;
            m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
            m_listHeader_[m_resultLength_].m_indirect_ = true;
            IndirectBoundaries ib = INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_];
            m_listHeader_[m_resultLength_].m_baseCE_ = ib.m_startCE_;
            m_listHeader_[m_resultLength_].m_baseContCE_ = ib.m_startContCE_;
            m_listHeader_[m_resultLength_].m_nextCE_ = ib.m_limitCE_;
            m_listHeader_[m_resultLength_].m_nextContCE_ = ib.m_limitContCE_;
            sourceToken = new Token();
            expandNext = initAReset(0, sourceToken);
          }
        } else { // reset to something already in rules
          top = false;
        }
      }
      // 7 After all this, set LAST to point to sourceToken, and goto
      // step 3.
      lastToken = sourceToken;
    }

    if (m_resultLength_ > 0 && m_listHeader_[m_resultLength_ - 1].m_first_ == null) {
      m_resultLength_--;
    }
    return m_resultLength_;
  }

  /**
   * Formats and throws a ParseException
   *
   * @param rules
   *            collation rule that failed
   * @param offset
   *            failed offset in rules
   * @throws ParseException
   *             with failure information
   */
  private static final void throwParseException(String rules, int offset) throws ParseException {
    // for pre-context
    String precontext = rules.substring(0, offset);
    String postcontext = rules.substring(offset, rules.length());
    StringBuilder error = new StringBuilder("Parse error occurred in rule at offset ");
    error.append(offset);
    error.append("\n after the prefix \"");
    error.append(precontext);
    error.append("\" before the suffix \"");
    error.append(postcontext);
    throw new ParseException(error.toString(), offset);
  }

  private final boolean doSetTop() {
    m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
    m_source_.append((char) 0xFFFE);
    IndirectBoundaries ib = INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_];
    m_source_.append((char) (ib.m_startCE_ >> 16));
    m_source_.append((char) (ib.m_startCE_ & 0xFFFF));
    m_extraCurrent_ += 3;
    if (INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_].m_startContCE_ == 0) {
      m_parsedToken_.m_charsLen_ = 3;
    } else {
      m_source_.append((char) (INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_].m_startContCE_ >> 16));
      m_source_.append((char) (INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_].m_startContCE_ & 0xFFFF));
      m_extraCurrent_ += 2;
      m_parsedToken_.m_charsLen_ = 5;
    }
    return true;
  }

  private static boolean isCharNewLine(char c) {
    switch (c) {
    case 0x000A: /* LF */
    case 0x000D: /* CR */
    case 0x000C: /* FF */
    case 0x0085: /* NEL */
    case 0x2028: /* LS */
    case 0x2029: /* PS */
      return true;
    default:
      return false;
    }
  }

  /**
   * Parses the next token.
   *
   * It updates/accesses the following member variables: m_current_: Index to the next unparsed character (not code point) in the
   * character array (a StringBuilder object) m_source_. m_parsedToken_: The parsed token. The following of the token are updated.
   * .m_strength: The strength of the token. .m_charsOffset, m_charsLen_: Index to the first character (after operators), and number of
   * characters in the token. This may be in the main string, or in the appended string. .m_extensionOffset_, .m_extensionLen_: .m_flags:
   * .m_prefixOffset, .m_prefixLen: Used when "|" is used to specify "context before". .m_indirectIndex:
   *
   * @param startofrules
   *            flag indicating if we are at the start of rules
   * @return the offset of the next unparsed char
   * @exception ParseException
   *                thrown when rule parsing fails
   */
  private int parseNextToken(boolean startofrules) throws ParseException {

    if (m_inRange_) {
      // We are not done processing a range.  Continue it.
      return processNextCodePointInRange();
    } else if (m_isStarred_) {
      // We are not done processing a starred token.  Continue it.
      return processNextTokenInTheStarredList();
    }

    // Get the next token.
    int nextOffset = parseNextTokenInternal(startofrules);

    // If the next token is starred and/or in range, we need to handle it here.
    if (m_inRange_) {
      // A new range has started.
      // Check whether it is a chain of ranges with more than one hyphen.
      if (m_lastRangeCp_ > 0 && m_lastRangeCp_ == m_previousCp_) {
        throw new ParseException("Chained range syntax", m_current_);
      }

      // The current token is the first character of the second code point of the range.
      // Process just that, and then proceed with the star.
      m_lastRangeCp_ = m_source_.codePointAt(this.m_parsedToken_.m_charsOffset_);
      if (m_lastRangeCp_ <= m_previousCp_) {
        throw new ParseException("Invalid range", m_current_);
      }

      // Set current range code point to process the range loop
      m_currentRangeCp_ = m_previousCp_ + 1;

      // Set current starred char index to continue processing the starred
      // expression after the range is done.
      m_currentStarredCharIndex_ = m_parsedToken_.m_charsOffset_ + Character.charCount(m_lastRangeCp_);
      m_lastStarredCharIndex_ = m_parsedToken_.m_charsOffset_ + m_parsedToken_.m_charsLen_ - 1;

      return processNextCodePointInRange();
    } else if (m_isStarred_) {
      // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharIndex_ so that
      // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be
      // separated into several tokens and returned.
      m_currentStarredCharIndex_ = m_parsedToken_.m_charsOffset_;
      m_lastStarredCharIndex_ = m_parsedToken_.m_charsOffset_ + m_parsedToken_.m_charsLen_ - 1;

      return processNextTokenInTheStarredList();
    }
    return nextOffset;
  }

  private int processNextCodePointInRange() throws ParseException {
    int nChars = Character.charCount(m_currentRangeCp_);
    m_source_.appendCodePoint(m_currentRangeCp_);

    m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
    m_parsedToken_.m_charsLen_ = nChars;

    m_extraCurrent_ += nChars;
    ++m_currentRangeCp_;
    if (m_currentRangeCp_ > m_lastRangeCp_) {
      // All the code points in the range are processed.
      // Turn the range flag off.
      m_inRange_ = false;

      // If there is a starred portion remaining in the current
      // parsed token, resume the starred operation.
      if (m_currentStarredCharIndex_ <= m_lastStarredCharIndex_) {
        m_isStarred_ = true;
      } else {
        m_isStarred_ = false;
      }
    } else {
      m_previousCp_ = m_currentRangeCp_;
    }
    return m_current_;
  }

  /**
   * Extracts the next token from the starred token from m_currentStarredCharIndex_ and returns it.
   *
   * @return the offset of the next unparsed char
   * @throws ParseException
   */
  private int processNextTokenInTheStarredList() throws ParseException {
    // Extract the characters corresponding to the next code point.
    int cp = m_source_.codePointAt(m_currentStarredCharIndex_);
    int nChars = Character.charCount(cp);

    m_parsedToken_.m_charsLen_ = nChars;
    m_parsedToken_.m_charsOffset_ = m_currentStarredCharIndex_;
    m_currentStarredCharIndex_ += nChars;

    // When we are done parsing the starred string, turn the flag off so that
    // the normal processing is restored.
    if (m_currentStarredCharIndex_ > m_lastStarredCharIndex_) {
      m_isStarred_ = false;
    }
    m_previousCp_ = cp;
    return m_current_;
  }

  private int resetToTop(boolean top, boolean variableTop, int extensionOffset, int newExtensionLen, byte byteBefore)
      throws ParseException {
    m_parsedToken_.m_indirectIndex_ = 5;
    top = doSetTop();
    return doEndParseNextToken(TOKEN_RESET_, top, extensionOffset, newExtensionLen, variableTop, byteBefore);
  }

  /**
   * Gets the next token and sets the necessary internal variables. This function parses a starred string as a single token, which will be
   * separated in the calling function.
   *
   * Scans from m_current_: first relation/strength operators ('=', ',', ';',
   * '&lt;', '&amp;'), then options in '[...]', quoted text, backslash escapes,
   * expansions after '/', prefix context before '|', range '-', '#' comments
   * and ordinary characters. The result is accumulated in m_parsedToken_;
   * quoted or synthesized characters are appended to m_source_ and tracked
   * via m_extraCurrent_.
   *
   * @param startofrules
   *            Boolean value indicating whether this is the first rule
   * @return the offset of the next unparsed char
   * @throws ParseException
   */
  @SuppressWarnings("fallthrough")
  private int parseNextTokenInternal(boolean startofrules) throws ParseException {
    boolean variabletop = false;
    boolean top = false;
    boolean inchars = true; // true: collecting token chars; false: collecting expansion chars
    boolean inquote = false;
    boolean wasinquote = false;
    byte before = 0; // strength from a preceding [before n] option, 0 if none
    boolean isescaped = false;
    int /*newcharslen = 0,*/newextensionlen = 0;
    int /*charsoffset = 0,*/extensionoffset = 0;
    int newstrength = TOKEN_UNSET_;

    initializeParsedToken();

    int limit = m_rules_.length();
    while (m_current_ < limit) {
      char ch = m_source_.charAt(m_current_);
      if (inquote) {
        // Inside a quoted section: everything except the closing quote is a
        // literal character of the token (or of the expansion).
        if (ch == 0x0027) { // '\''
          inquote = false;
        } else {
          if ((m_parsedToken_.m_charsLen_ == 0) || inchars) {
            if (m_parsedToken_.m_charsLen_ == 0) {
              // Quoted chars live in the appended region of m_source_.
              m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
            }
            m_parsedToken_.m_charsLen_++;
          } else {
            if (newextensionlen == 0) {
              extensionoffset = m_extraCurrent_;
            }
            newextensionlen++;
          }
        }
      } else if (isescaped) {
        // Character following a backslash: taken literally.
        isescaped = false;
        if (newstrength == TOKEN_UNSET_) {
          throwParseException(m_rules_, m_current_);
        }
        if (ch != 0 && m_current_ != limit) {
          if (inchars) {
            if (m_parsedToken_.m_charsLen_ == 0) {
              m_parsedToken_.m_charsOffset_ = m_current_;
            }
            m_parsedToken_.m_charsLen_++;
          } else {
            if (newextensionlen == 0) {
              extensionoffset = m_current_;
            }
            newextensionlen++;
          }
        }
      } else {
        if (!PatternProps.isWhiteSpace(ch)) {
          // Sets the strength for this entry
          switch (ch) {
          case 0x003D: // '='
            // A new operator while a strength is pending ends the current token.
            if (newstrength != TOKEN_UNSET_) {
              return doEndParseNextToken(newstrength, top, extensionoffset, newextensionlen, variabletop, before);
            }
            // if we start with strength, we'll reset to top
            if (startofrules == true) {
              return resetToTop(top, variabletop, extensionoffset, newextensionlen, before);
            }
            newstrength = Collator.IDENTICAL;
            if (m_source_.charAt(m_current_ + 1) == 0x002A) { // '*'
              m_current_++;
              m_isStarred_ = true;
            }
            break;
          case 0x002C: // ','
            if (newstrength != TOKEN_UNSET_) {
              return doEndParseNextToken(newstrength, top, extensionoffset, newextensionlen, variabletop, before);
            }
            // if we start with strength, we'll reset to top
            if (startofrules == true) {
              return resetToTop(top, variabletop, extensionoffset, newextensionlen, before);
            }
            newstrength = Collator.TERTIARY;
            break;
          case 0x003B: // ';'
            if (newstrength != TOKEN_UNSET_) {
              return doEndParseNextToken(newstrength, top, extensionoffset, newextensionlen, variabletop, before);
            }
            //if we start with strength, we'll reset to top
            if (startofrules == true) {
              return resetToTop(top, variabletop, extensionoffset, newextensionlen, before);
            }
            newstrength = Collator.SECONDARY;
            break;
          case 0x003C: // '<'
            if (newstrength != TOKEN_UNSET_) {
              return doEndParseNextToken(newstrength, top, extensionoffset, newextensionlen, variabletop, before);
            }
            // if we start with strength, we'll reset to top
            if (startofrules == true) {
              return resetToTop(top, variabletop, extensionoffset, newextensionlen, before);
            }
            // before this, do a scan to verify whether this is
            // another strength: "<<" is secondary, "<<<" is tertiary
            if (m_source_.charAt(m_current_ + 1) == 0x003C) {
              m_current_++;
              if (m_source_.charAt(m_current_ + 1) == 0x003C) {
                m_current_++; // three in a row!
                newstrength = Collator.TERTIARY;
              } else { // two in a row
                newstrength = Collator.SECONDARY;
              }
            } else { // just one
              newstrength = Collator.PRIMARY;
            }
            if (m_source_.charAt(m_current_ + 1) == 0x002A) { // '*'
              m_current_++;
              m_isStarred_ = true;
            }
            break;

          case 0x0026: // '&'
            if (newstrength != TOKEN_UNSET_) {
              return doEndParseNextToken(newstrength, top, extensionoffset, newextensionlen, variabletop, before);
            }
            newstrength = TOKEN_RESET_; // PatternEntry::RESET = 0
            break;
          case 0x005b: // '['
            // options - read an option, analyze it
            m_optionEnd_ = m_rules_.indexOf(0x005d, m_current_);
            if (m_optionEnd_ != -1) { // ']'
              byte result = readAndSetOption();
              m_current_ = m_optionEnd_;
              if ((result & TOKEN_TOP_MASK_) != 0) {
                // indirect position option, e.g. [first regular]; only legal
                // right after a reset
                if (newstrength == TOKEN_RESET_) {
                  doSetTop();
                  if (before != 0) {
                    // This is a combination of before and
                    // indirection like
                    // '&[before 2][first regular]<b'
                    m_source_.append((char) 0x002d);
                    m_source_.append((char) before);
                    m_extraCurrent_ += 2;
                    m_parsedToken_.m_charsLen_ += 2;
                  }
                  m_current_++;
                  return doEndParseNextToken(newstrength, true, extensionoffset, newextensionlen, variabletop, before);
                } else {
                  throwParseException(m_rules_, m_current_);
                }
              } else if ((result & TOKEN_VARIABLE_TOP_MASK_) != 0) {
                // [variable top]: only legal after a relation, not after a reset
                if (newstrength != TOKEN_RESET_ && newstrength != TOKEN_UNSET_) {
                  variabletop = true;
                  m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
                  m_source_.append((char) 0xFFFF);
                  m_extraCurrent_++;
                  m_current_++;
                  m_parsedToken_.m_charsLen_ = 1;
                  return doEndParseNextToken(newstrength, top, extensionoffset, newextensionlen, variabletop, before);
                } else {
                  throwParseException(m_rules_, m_current_);
                }
              } else if ((result & TOKEN_BEFORE_) != 0) {
                // [before n]: only legal right after a reset
                if (newstrength == TOKEN_RESET_) {
                  before = (byte) (result & TOKEN_BEFORE_);
                } else {
                  throwParseException(m_rules_, m_current_);
                }
              }
            }
            break;
          case 0x002F: // '/'
            wasinquote = false; // if we were copying source
                      // characters, we want to stop now
            inchars = false; // we're now processing expansion
            break;
          case 0x005C: // back slash for escaped chars
            isescaped = true;
            break;
          // found a quote, we're gonna start copying
          case 0x0027: //'\''
            if (newstrength == TOKEN_UNSET_) {
              // quote is illegal until we have a strength
              throwParseException(m_rules_, m_current_);
            }
            inquote = true;
            if (inchars) { // we're doing characters
              if (wasinquote == false) {
                m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
              }
              if (m_parsedToken_.m_charsLen_ != 0) {
                // We are processing characters in quote together.
                // Copy whatever is in the current token, so that
                // the unquoted string can be appended to that.
                m_source_.append(m_source_.substring(m_current_ - m_parsedToken_.m_charsLen_, m_current_));
                m_extraCurrent_ += m_parsedToken_.m_charsLen_;
              }
              m_parsedToken_.m_charsLen_++;
            } else { // we're doing an expansion
              if (wasinquote == false) {
                extensionoffset = m_extraCurrent_;
              }
              if (newextensionlen != 0) {
                m_source_.append(m_source_.substring(m_current_ - newextensionlen, m_current_));
                m_extraCurrent_ += newextensionlen;
              }
              newextensionlen++;
            }
            wasinquote = true;
            m_current_++;
            ch = m_source_.charAt(m_current_);
            if (ch == 0x0027) { // copy the double quote
              m_source_.append(ch);
              m_extraCurrent_++;
              inquote = false;
            }
            break;
          // '@' is french only if the strength is not currently set
          // if it is, it's just a regular character in collation
          case 0x0040: // '@'
            if (newstrength == TOKEN_UNSET_) {
              m_options_.m_isFrenchCollation_ = true;
              break;
            }
            // fall through
          case 0x007C: //|
            // this means we have actually been reading prefix part
            // we want to store read characters to the prefix part
            // and continue reading the characters (proper way
            // would be to restart reading the chars, but in that
            // case we would have to complicate the token hasher,
            // which I do not intend to play with. Instead, we will
            // do prefixes when prefixes are due (before adding the
            // elements).
            m_parsedToken_.m_prefixOffset_ = m_parsedToken_.m_charsOffset_;
            m_parsedToken_.m_prefixLen_ = m_parsedToken_.m_charsLen_;
            if (inchars) { // we're doing characters
              if (wasinquote == false) {
                m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
              }
              if (m_parsedToken_.m_charsLen_ != 0) {
                // Copy the chars read so far into the extra region so the
                // remainder can be appended contiguously.
                String prefix = m_source_.substring(m_current_ - m_parsedToken_.m_charsLen_, m_current_);
                m_source_.append(prefix);
                m_extraCurrent_ += m_parsedToken_.m_charsLen_;
              }
              m_parsedToken_.m_charsLen_++;
            }
            wasinquote = true;
            do {
              m_current_++;
              ch = m_source_.charAt(m_current_);
              // skip whitespace between '|' and the character
            } while (PatternProps.isWhiteSpace(ch));
            break;
          case 0x002D: // '-', indicates a range.
            if (newstrength != TOKEN_UNSET_) {
              // A pending strength means '-' ends the previous token; remember
              // the starred state so the range can resume it.
              m_savedIsStarred_ = m_isStarred_;
              return doEndParseNextToken(newstrength, top, extensionoffset, newextensionlen, variabletop, before);
            }

            m_isStarred_ = m_savedIsStarred_;
            // Ranges are valid only in starred tokens.
            if (!m_isStarred_) {
              throwParseException(m_rules_, m_current_);
            }

            newstrength = m_parsedToken_.m_strength_;
            m_inRange_ = true;
            break;

          case 0x0023: // '#' // this is a comment, skip everything through the end of line
            do {
              m_current_++;
              ch = m_source_.charAt(m_current_);
            } while (!isCharNewLine(ch));
            break;
          case 0x0021: // '!' // ignoring java set thai reordering
            break;
          default:
            // An ordinary character of the token (or of the expansion).
            if (newstrength == TOKEN_UNSET_) {
              throwParseException(m_rules_, m_current_);
            }
            if (isSpecialChar(ch) && (inquote == false)) {
              throwParseException(m_rules_, m_current_);
            }
            if (ch == 0x0000 && m_current_ + 1 == limit) {
              break;
            }
            if (inchars) {
              if (m_parsedToken_.m_charsLen_ == 0) {
                m_parsedToken_.m_charsOffset_ = m_current_;
              }
              m_parsedToken_.m_charsLen_++;
            } else {
              if (newextensionlen == 0) {
                extensionoffset = m_current_;
              }
              newextensionlen++;
            }
            break;
          }
        }
      }
      if (wasinquote) {
        // While copying quoted/prefixed text, mirror each consumed char
        // into the appended region (quotes themselves are skipped).
        if (ch != 0x27) {
          m_source_.append(ch);
          m_extraCurrent_++;
        }
      }
      m_current_++;
    }
    return doEndParseNextToken(newstrength, top, extensionoffset, newextensionlen, variabletop, before);
  }

  /**
     *
     */
  private void initializeParsedToken() {
    m_parsedToken_.m_charsLen_ = 0;
    m_parsedToken_.m_charsOffset_ = 0;
    m_parsedToken_.m_prefixOffset_ = 0;
    m_parsedToken_.m_prefixLen_ = 0;
    m_parsedToken_.m_indirectIndex_ = 0;
  }

  /**
   * End the next parse token
   *
   * @param newstrength
   *            new strength
   * @return offset in rules, -1 for end of rules
   */
  private int doEndParseNextToken(int newstrength, /*int newcharslen,*/
      boolean top, /*int charsoffset,*/
      int extensionoffset, int newextensionlen, boolean variabletop, int before) throws ParseException {
    if (newstrength == TOKEN_UNSET_) {
      return -1;
    }
    if (m_parsedToken_.m_charsLen_ == 0 && top == false) {
      throwParseException(m_rules_, m_current_);
    }

    m_parsedToken_.m_strength_ = newstrength;
    //m_parsedToken_.m_charsOffset_ = charsoffset;
    //m_parsedToken_.m_charsLen_ = newcharslen;
    m_parsedToken_.m_extensionOffset_ = extensionoffset;
    m_parsedToken_.m_extensionLen_ = newextensionlen;
    m_parsedToken_.m_flags_ = (char) ((variabletop ? TOKEN_VARIABLE_TOP_MASK_ : 0) | (top ? TOKEN_TOP_MASK_ : 0) | before);
    return m_current_;
  }

  /**
   * Token before this element
   *
   * @param sourcetoken
   * @param strength
   *            collation strength
   * @return the token before source token
   * @exception ParseException
   *                thrown when rules have the wrong syntax
   */
  private Token getVirginBefore(Token sourcetoken, int strength) throws ParseException {
    // this is a virgin before - we need to fish the anchor from the UCA
    if (sourcetoken != null) {
      int offset = sourcetoken.m_source_ & 0xFFFFFF;
      m_UCAColEIter_.setText(m_source_.substring(offset, offset + 1));
    } else {
      m_UCAColEIter_.setText(m_source_.substring(m_parsedToken_.m_charsOffset_, m_parsedToken_.m_charsOffset_ + 1));
    }

    int basece = m_UCAColEIter_.next() & 0xFFFFFF3F;
    int basecontce = m_UCAColEIter_.next();
    if (basecontce == CollationElementIterator.NULLORDER) {
      basecontce = 0;
    }

    int ch = 0;

    if ((basece >>> 24 >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_)
        && (basece >>> 24 <= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_)) { /* implicits - */

      int primary = basece & RuleBasedCollator.CE_PRIMARY_MASK_ | (basecontce & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16;
      int raw = RuleBasedCollator.impCEGen_.getRawFromImplicit(primary);
      ch = RuleBasedCollator.impCEGen_.getCodePointFromRaw(raw - 1);
      int primaryCE = RuleBasedCollator.impCEGen_.getImplicitFromRaw(raw - 1);
      m_utilCEBuffer_[0] = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505;
      m_utilCEBuffer_[1] = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_ | RuleBasedCollator.CE_CONTINUATION_MARKER_;

      m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
      m_source_.append('\uFFFE');
      m_source_.append((char) ch);
      m_extraCurrent_ += 2;
      m_parsedToken_.m_charsLen_++;

      m_utilToken_.m_source_ = (m_parsedToken_.m_charsLen_ << 24) | m_parsedToken_.m_charsOffset_;
      m_utilToken_.m_rules_ = m_source_;
      sourcetoken = m_hashTable_.get(m_utilToken_);

      if (sourcetoken == null) {
        m_listHeader_[m_resultLength_] = new TokenListHeader();
        m_listHeader_[m_resultLength_].m_baseCE_ = m_utilCEBuffer_[0] & 0xFFFFFF3F;
        if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) {
          m_listHeader_[m_resultLength_].m_baseContCE_ = m_utilCEBuffer_[1];
        } else {
          m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
        }
        m_listHeader_[m_resultLength_].m_nextCE_ = 0;
        m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
        m_listHeader_[m_resultLength_].m_previousCE_ = 0;
        m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
        m_listHeader_[m_resultLength_].m_indirect_ = false;

        sourcetoken = new Token();
        initAReset(-1, sourcetoken);
      }

    } else {

      // first ce and second ce m_utilCEBuffer_
      /*int invpos = */CollationParsedRuleBuilder.INVERSE_UCA_.getInversePrevCE(basece, basecontce, strength, m_utilCEBuffer_);
      // we got the previous CE. Now we need to see if the difference between
      // the two CEs is really of the requested strength.
      // if it's a bigger difference (we asked for secondary and got primary), we
      // need to modify the CE.
      if (CollationParsedRuleBuilder.INVERSE_UCA_.getCEStrengthDifference(basece, basecontce, m_utilCEBuffer_[0], m_utilCEBuffer_[1]) < strength) {
        // adjust the strength
        // now we are in the situation where our baseCE should actually be modified in
        // order to get the CE in the right position.
        if (strength == Collator.SECONDARY) {
          m_utilCEBuffer_[0] = basece - 0x0200;
        } else { // strength == UCOL_TERTIARY
          m_utilCEBuffer_[0] = basece - 0x02;
        }
        if (RuleBasedCollator.isContinuation(basecontce)) {
          if (strength == Collator.SECONDARY) {
            m_utilCEBuffer_[1] = basecontce - 0x0200;
          } else { // strength == UCOL_TERTIARY
            m_utilCEBuffer_[1] = basecontce - 0x02;
          }
        }
      }

      /*
                  // the code below relies on getting a code point from the inverse table, in order to be
                  // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
                  // 1. There are many code points that have the same CE
                  // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
                  // Also, in case when there is no equivalent strength before an element, we have to actually
                  // construct one. For example, &[before 2]a << x won't result in x << a, because the element
                  // before a is a primary difference.
                  ch = CollationParsedRuleBuilder.INVERSE_UCA_.m_table_[3 * invpos
                                                                            + 2];
                  if ((ch &  INVERSE_SIZE_MASK_) != 0) {
                      int offset = ch & INVERSE_OFFSET_MASK_;
                      ch = CollationParsedRuleBuilder.INVERSE_UCA_.m_continuations_[
                                                                                 offset];
                  }
                  m_source_.append((char)ch);
                  m_extraCurrent_ ++;
                  m_parsedToken_.m_charsOffset_ = m_extraCurrent_ - 1;
                  m_parsedToken_.m_charsLen_ = 1;

                  // We got an UCA before. However, this might have been tailored.
                  // example:
                  // &\u30ca = \u306a
                  // &[before 3]\u306a<<<\u306a|\u309d

                  m_utilToken_.m_source_ = (m_parsedToken_.m_charsLen_ << 24)
                                                       | m_parsedToken_.m_charsOffset_;
                  m_utilToken_.m_rules_ = m_source_;
                  sourcetoken = (Token)m_hashTable_.get(m_utilToken_);
      */

      // here is how it should be. The situation such as &[before 1]a < x, should be
      // resolved exactly as if we wrote &a > x.
      // therefore, I don't really care if the UCA value before a has been changed.
      // However, I do care if the strength between my element and the previous element
      // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
      // have to construct the base CE.

      // if we found a tailored thing, we have to use the UCA value and
      // construct a new reset token with constructed name
      //if (sourcetoken != null && sourcetoken.m_strength_ != TOKEN_RESET_) {
      // character to which we want to anchor is already tailored.
      // We need to construct a new token which will be the anchor point
      //m_source_.setCharAt(m_extraCurrent_ - 1, '\uFFFE');
      //m_source_.append(ch);
      //m_extraCurrent_ ++;
      //m_parsedToken_.m_charsLen_ ++;
      // grab before
      m_parsedToken_.m_charsOffset_ -= 10;
      m_parsedToken_.m_charsLen_ += 10;
      m_listHeader_[m_resultLength_] = new TokenListHeader();
      m_listHeader_[m_resultLength_].m_baseCE_ = m_utilCEBuffer_[0] & 0xFFFFFF3F;
      if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) {
        m_listHeader_[m_resultLength_].m_baseContCE_ = m_utilCEBuffer_[1];
      } else {
        m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
      }
      m_listHeader_[m_resultLength_].m_nextCE_ = 0;
      m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
      m_listHeader_[m_resultLength_].m_previousCE_ = 0;
      m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
      m_listHeader_[m_resultLength_].m_indirect_ = false;
      sourcetoken = new Token();
      initAReset(-1, sourcetoken);
      //}
    }
    return sourcetoken;
  }

  /**
   * Processing Description. 1. Build a m_listHeader_. Each list has a header, which contains two lists (positive and negative), a reset
   * token, a baseCE, nextCE, and previousCE. The lists and reset may be null. 2. As you process, you keep a LAST pointer that points to
   * the last token you handled.
   *
   * @param expand
   *            string offset, -1 for null strings
   * @param targetToken
   *            token to update
   * @return expandnext offset
   * @throws ParseException
   *             thrown when rules syntax failed
   */
  private int initAReset(int expand, Token targetToken) throws ParseException {
    if (m_resultLength_ == m_listHeader_.length - 1) {
      // Unfortunately, this won't work, as we store addresses of lhs in
      // token
      TokenListHeader temp[] = new TokenListHeader[m_resultLength_ << 1];
      System.arraycopy(m_listHeader_, 0, temp, 0, m_resultLength_ + 1);
      m_listHeader_ = temp;
    }
    // do the reset thing
    targetToken.m_rules_ = m_source_;
    targetToken.m_source_ = m_parsedToken_.m_charsLen_ << 24 | m_parsedToken_.m_charsOffset_;
    targetToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24 | m_parsedToken_.m_extensionOffset_;
    // keep the flags around so that we know about before
    targetToken.m_flags_ = m_parsedToken_.m_flags_;

    if (m_parsedToken_.m_prefixOffset_ != 0) {
      throwParseException(m_rules_, m_parsedToken_.m_charsOffset_ - 1);
    }

    targetToken.m_prefix_ = 0;
    // TODO: this should also handle reverse
    targetToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_;
    targetToken.m_strength_ = TOKEN_RESET_;
    targetToken.m_next_ = null;
    targetToken.m_previous_ = null;
    targetToken.m_CELength_ = 0;
    targetToken.m_expCELength_ = 0;
    targetToken.m_listHeader_ = m_listHeader_[m_resultLength_];
    m_listHeader_[m_resultLength_].m_first_ = null;
    m_listHeader_[m_resultLength_].m_last_ = null;
    m_listHeader_[m_resultLength_].m_first_ = null;
    m_listHeader_[m_resultLength_].m_last_ = null;
    m_listHeader_[m_resultLength_].m_reset_ = targetToken;

    /* 3 Consider each item: relation, source, and expansion:
     * e.g. ...< x / y ...
     * First convert all expansions into normal form. Examples:
     * If "xy" doesn't occur earlier in the list or in the UCA, convert
     * &xy * c * d * ... into &x * c/y * d * ...
     * Note: reset values can never have expansions, although they can
     * cause the very next item to have one. They may be contractions, if
     * they are found earlier in the list.
     */
    int result = 0;
    if (expand > 0) {
      // check to see if there is an expansion
      if (m_parsedToken_.m_charsLen_ > 1) {
        targetToken.m_source_ = ((expand - m_parsedToken_.m_charsOffset_) << 24) | m_parsedToken_.m_charsOffset_;
        result = ((m_parsedToken_.m_charsLen_ + m_parsedToken_.m_charsOffset_ - expand) << 24) | expand;
      }
    }

    m_resultLength_++;
    m_hashTable_.put(targetToken, targetToken);
    return result;
  }

  /**
   * Checks if an character is special
   *
   * @param ch
   *            character to test
   * @return true if the character is special
   */
  private static final boolean isSpecialChar(char ch) {
    return (ch <= 0x002F && ch >= 0x0020) || (ch <= 0x003F && ch >= 0x003A) || (ch <= 0x0060 && ch >= 0x005B)
        || (ch <= 0x007E && ch >= 0x007D) || ch == 0x007B;
  }

  private UnicodeSet readAndSetUnicodeSet(String source, int start) throws ParseException {
    while (source.charAt(start) != '[') { /* advance while we find the first '[' */
      start++;
    }
    // now we need to get a balanced set of '[]'. The problem is that a set can have
    // many, and *end point to the first closing '['
    int noOpenBraces = 1;
    int current = 1; // skip the opening brace
    while (start + current < source.length() && noOpenBraces != 0) {
      if (source.charAt(start + current) == '[') {
        noOpenBraces++;
      } else if (source.charAt(start + current) == ']') { // closing brace
        noOpenBraces--;
      }
      current++;
    }
    //int nextBrace = -1;

    if (noOpenBraces != 0 || (/*nextBrace =*/source.indexOf("]", start + current) /*']'*/) == -1) {
      throwParseException(m_rules_, start);
    }
    return new UnicodeSet(source.substring(start, start + current)); //uset_openPattern(start, current);
  }

  /**
   * Scratch output of {@link #readOption}: offset of the matched option's argument
   * within the rules, or 0 when the option has no argument. In the C code this is
   * an out-parameter passed by reference; a private field simulates that here.
   */
  private int m_optionarg_ = 0;

  private int readOption(String rules, int start, int optionend) {
    m_optionarg_ = 0;
    int i = 0;
    while (i < RULES_OPTIONS_.length) {
      String option = RULES_OPTIONS_[i].m_name_;
      int optionlength = option.length();
      if (rules.length() > start + optionlength && option.equalsIgnoreCase(rules.substring(start, start + optionlength))) {
        if (optionend - start > optionlength) {
          m_optionarg_ = start + optionlength;
          // start of the options, skip space
          while (m_optionarg_ < optionend && PatternProps.isWhiteSpace(rules.charAt(m_optionarg_))) {   // eat whitespace
            m_optionarg_++;
          }
        }
        break;
      }
      i++;
    }
    if (i == RULES_OPTIONS_.length) {
      i = -1;
    }
    return i;
  }

  /**
   * Reads and set collation options
   *
   * @return TOKEN_SUCCESS if option is set correct, 0 otherwise
   * @exception ParseException
   *                thrown when options in rules are wrong
   */
  private byte readAndSetOption() throws ParseException {
    int start = m_current_ + 1; // skip opening '['
    int i = readOption(m_rules_, start, m_optionEnd_);

    int optionarg = m_optionarg_;

    if (i < 0) {
      throwParseException(m_rules_, start);
    }

    if (i < 7) {
      if (optionarg != 0) {
        for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length; j++) {
          String subname = RULES_OPTIONS_[i].m_subOptions_[j];
          int size = optionarg + subname.length();
          if (m_rules_.length() > size && subname.equalsIgnoreCase(m_rules_.substring(optionarg, size))) {
            setOptions(m_options_, RULES_OPTIONS_[i].m_attribute_, RULES_OPTIONS_[i].m_subOptionAttributeValues_[j]);
            return TOKEN_SUCCESS_MASK_;
          }
        }
      }
      throwParseException(m_rules_, optionarg);
    } else if (i == 7) { // variable top
      return TOKEN_SUCCESS_MASK_ | TOKEN_VARIABLE_TOP_MASK_;
    } else if (i == 8) { // rearrange
      return TOKEN_SUCCESS_MASK_;
    } else if (i == 9) { // before
      if (optionarg != 0) {
        for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length; j++) {
          String subname = RULES_OPTIONS_[i].m_subOptions_[j];
          int size = optionarg + subname.length();
          if (m_rules_.length() > size && subname.equalsIgnoreCase(m_rules_.substring(optionarg, optionarg + subname.length()))) {
            return (byte) (TOKEN_SUCCESS_MASK_ | RULES_OPTIONS_[i].m_subOptionAttributeValues_[j] + 1);
          }
        }
      }
      throwParseException(m_rules_, optionarg);
    } else if (i == 10) {  // top, we are going to have an array with
      // structures of limit CEs index to this array will be
      // src->parsedToken.indirectIndex
      m_parsedToken_.m_indirectIndex_ = 0;
      return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_;
    } else if (i < 13) { // first, last
      for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length; j++) {
        String subname = RULES_OPTIONS_[i].m_subOptions_[j];
        int size = optionarg + subname.length();
        if (m_rules_.length() > size && subname.equalsIgnoreCase(m_rules_.substring(optionarg, size))) {
          m_parsedToken_.m_indirectIndex_ = (char) (i - 10 + (j << 1));
          return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_;
        }
      }
      throwParseException(m_rules_, optionarg);
    } else if (i == 13 || i == 14) { // copy and remove are handled before normalization
      // we need to move end here
      int noOpenBraces = 1;
      m_current_++; // skip opening brace
      while (m_current_ < m_source_.length() && noOpenBraces != 0) {
        if (m_source_.charAt(m_current_) == '[') {
          noOpenBraces++;
        } else if (m_source_.charAt(m_current_) == ']') { // closing brace
          noOpenBraces--;
        }
        m_current_++;
      }
      m_optionEnd_ = m_current_ - 1;
      return TOKEN_SUCCESS_MASK_;
    } else if (i == 16) {
      m_current_ = m_optionarg_; // skip opening brace and name
      parseScriptReorder();
      return TOKEN_SUCCESS_MASK_;
    } else {
      throwParseException(m_rules_, optionarg);
    }
    return TOKEN_SUCCESS_MASK_; // we will never reach here.
  }

  /**
   * Set collation option
   *
   * @param optionset
   *            option set to set
   * @param attribute
   *            type to set
   * @param value
   *            attribute value
   */
  private void setOptions(OptionSet optionset, int attribute, int value) {
    switch (attribute) {
    case RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_:
      optionset.m_isHiragana4_ = (value == RuleBasedCollator.AttributeValue.ON_);
      break;
    case RuleBasedCollator.Attribute.FRENCH_COLLATION_:
      optionset.m_isFrenchCollation_ = (value == RuleBasedCollator.AttributeValue.ON_);
      break;
    case RuleBasedCollator.Attribute.ALTERNATE_HANDLING_:
      optionset.m_isAlternateHandlingShifted_ = (value == RuleBasedCollator.AttributeValue.SHIFTED_);
      break;
    case RuleBasedCollator.Attribute.CASE_FIRST_:
      optionset.m_caseFirst_ = value;
      break;
    case RuleBasedCollator.Attribute.CASE_LEVEL_:
      optionset.m_isCaseLevel_ = (value == RuleBasedCollator.AttributeValue.ON_);
      break;
    case RuleBasedCollator.Attribute.NORMALIZATION_MODE_:
      if (value == RuleBasedCollator.AttributeValue.ON_) {
        value = Collator.CANONICAL_DECOMPOSITION;
      }
      optionset.m_decomposition_ = value;
      break;
    case RuleBasedCollator.Attribute.STRENGTH_:
      optionset.m_strength_ = value;
      break;
    default:
      break;
    }
  }

  UnicodeSet getTailoredSet() throws ParseException {
    boolean startOfRules = true;
    UnicodeSet tailored = new UnicodeSet();
    String pattern;
    CanonicalIterator it = new CanonicalIterator("");

    m_parsedToken_.m_strength_ = TOKEN_UNSET_;
    int sourcelimit = m_source_.length();
    //int expandNext = 0;

    while (m_current_ < sourcelimit) {
      m_parsedToken_.m_prefixOffset_ = 0;
      if (parseNextToken(startOfRules) < 0) {
        // we have reached the end
        continue;
      }
      startOfRules = false;
      // The idea is to tokenize the rule set. For each non-reset token,
      // we add all the canonicaly equivalent FCD sequences
      if (m_parsedToken_.m_strength_ != TOKEN_RESET_) {
        it.setSource(m_source_.substring(m_parsedToken_.m_charsOffset_, m_parsedToken_.m_charsOffset_ + m_parsedToken_.m_charsLen_));
        pattern = it.next();
        while (pattern != null) {
          if (Normalizer.quickCheck(pattern, Normalizer.FCD, 0) != Normalizer.NO) {
            tailored.add(pattern);
          }
          pattern = it.next();
        }
      }
    }
    return tailored;
  }

  /**
   * Pre-pass over the raw rules, run before normalization (see the constructor).
   * Accumulates [copy ...] sets into m_copySet_, [remove ...] sets into
   * m_removeSet_, and splices [import ...] rules inline, returning the
   * possibly-rewritten rule string.
   */
  final private String preprocessRules(String rules) throws ParseException {
    int optionNumber = -1;
    int setStart = 0;
    int i = 0;
    while (i < rules.length()) {
      if (rules.charAt(i) == 0x005B) { // '[' starts an option
        // readOption sets m_optionarg_ to the start of the option's argument.
        optionNumber = readOption(rules, i + 1, rules.length());
        setStart = m_optionarg_;
        if (optionNumber == 13) { /* copy - parts of UCA to tailoring */
          UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart);
          if (m_copySet_ == null) {
            m_copySet_ = newSet;
          } else {
            m_copySet_.addAll(newSet);
          }
        } else if (optionNumber == 14) { /* remove */
          UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart);
          if (m_removeSet_ == null) {
            m_removeSet_ = newSet;
          } else {
            m_removeSet_.addAll(newSet);
          }
        } else if (optionNumber == 19) { /* import: splice in another tailoring */
          int optionEndOffset = rules.indexOf(']', i) + 1;
          // The argument is a language tag, optionally carrying a
          // "collation" keyword selecting the tailoring type.
          ULocale locale = ULocale.forLanguageTag(rules.substring(setStart, optionEndOffset - 1));
          UResourceBundle bundle = UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_BASE_NAME + "/coll",
              locale.getBaseName());

          String type = locale.getKeywordValue("collation");
          if (type == null) {
            type = "standard";
          }

          String importRules = bundle.get("collations").get(type).get("Sequence").getString();

          // Replace the whole [import ...] option with the imported rules.
          // i is NOT advanced past the splice, so the imported text is
          // scanned on subsequent iterations (allowing nested options).
          rules = rules.substring(0, i) + importRules + rules.substring(optionEndOffset);
        }
      }
      i++;
    }
    return rules;
  }

  /* This is the data that is used for non-script reordering codes. These _must_ be kept
   * in order that they are to be applied as defaults and in synch with the Collator.ReorderCodes statics.
   */
  static final String ReorderingTokensArray[] = { "SPACE", "PUNCT", "SYMBOL", "CURRENCY", "DIGIT", };

  int findReorderingEntry(String name) {
    for (int tokenIndex = 0; tokenIndex < ReorderingTokensArray.length; tokenIndex++) {
      if (name.equalsIgnoreCase(ReorderingTokensArray[tokenIndex])) {
        return tokenIndex + ReorderCodes.FIRST;
      }
    }
    return UScript.INVALID_CODE;
  }

  private void parseScriptReorder() throws ParseException {
    ArrayList<Integer> tempOrder = new ArrayList<Integer>();
    int end = m_rules_.indexOf(']', m_current_);
    if (end == -1) {
      return;
    }
    String tokenString = m_rules_.substring(m_current_, end);
    String[] tokens = tokenString.split("\\s+", 0);
    String token;
    for (int tokenIndex = 0; tokenIndex < tokens.length; tokenIndex++) {
      token = tokens[tokenIndex];
      int reorderCode = findReorderingEntry(token);
      if (reorderCode == UScript.INVALID_CODE) {
        reorderCode = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, token);
        if (reorderCode < 0) {
          throw new ParseException(m_rules_, tokenIndex);
        }
      }
      tempOrder.add(reorderCode);
    }
    m_options_.m_scriptOrder_ = new int[tempOrder.size()];
    for (int i = 0; i < tempOrder.size(); i++) {
      m_options_.m_scriptOrder_[i] = tempOrder.get(i);
    }
  }
}
TOP

Related Classes of com.ibm.icu.text.CollationRuleParser$IndirectBoundaries

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.