Package com.ibm.icu.text

Source Code of com.ibm.icu.text.TransliteratorParser$RuleHalf

/*
**********************************************************************
*   Copyright (c) 2001-2011, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*/
package com.ibm.icu.text;

import java.text.ParsePosition;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.ibm.icu.impl.IllegalIcuArgumentException;
import com.ibm.icu.impl.PatternProps;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.RuleBasedTransliterator.Data;

@SuppressWarnings("deprecation")
class TransliteratorParser {

  //----------------------------------------------------------------------
  // Data members
  //----------------------------------------------------------------------

  /**
   * PUBLIC data member. A list of RuleBasedTransliterator.Data objects, one for each discrete group of rules in the rule set.
   * parseRules() replaces it with a fresh list each time it runs.
   */
  public List<Data> dataVector;

  /**
   * PUBLIC data member. A list of Strings containing all of the ID blocks in the rule set. parseRules() replaces it with a fresh
   * list each time it runs.
   */
  public List<String> idBlockVector;

  /**
   * The current data object for which we are parsing rules. Reset to null at the start of parseRules().
   */
  private Data curData;

  /**
   * PUBLIC data member containing the parsed compound filter, if any. Reset to null at the start of parseRules().
   */
  public UnicodeSet compoundFilter;

  /**
   * The direction argument given to parseRules(). It is compared against Transliterator.FORWARD while parsing.
   */
  private int direction;

  /**
   * Temporary symbol table used during parsing.
   */
  private ParseData parseData;

  /**
   * Temporary list of set variables. When parsing is complete, this is copied into the array data.variables. As with data.variables,
   * element 0 corresponds to character data.variablesBase.
   */
  private List<Object> variablesVector;

  /**
   * Temporary table of variable names. When parsing is complete, this is copied into data.variableNames.
   */
  private Map<String, char[]> variableNames;

  /**
   * String of standins for segments. Used during the parsing of a single rule. segmentStandins.charAt(0) is the standin for "$1" and
   * corresponds to StringMatcher object segmentObjects.get(0), etc.
   */
  private StringBuffer segmentStandins;

  /**
   * List of StringMatcher objects for segments. Used during the parsing of a single rule. segmentStandins.charAt(0) is the standin for
   * "$1" and corresponds to StringMatcher object segmentObjects.get(0), etc.
   */
  private List<StringMatcher> segmentObjects;

  /**
   * The next available stand-in for variables. This starts at some point in the private use area (discovered dynamically) and increments
   * up toward <code>variableLimit</code>. At any point during parsing, available variables are
   * <code>variableNext..variableLimit-1</code>.
   */
  private char variableNext;

  /**
   * The last available stand-in for variables. This is discovered dynamically. At any point during parsing, available variables are
   * <code>variableNext..variableLimit-1</code>. During variable definition we use the special value variableLimit-1 as a placeholder.
   */
  private char variableLimit;

  /**
   * When we encounter an undefined variable, we do not immediately signal an error, in case we are defining this variable, e.g.,
   * "$a = [a-z];". Instead, we save the name of the undefined variable, and substitute in the placeholder char variableLimit - 1, and
   * decrement variableLimit.
   */
  private String undefinedVariableName;

  /**
   * The stand-in character for the 'dot' set, represented by '.' in patterns. This is allocated the first time it is needed, and reused
   * thereafter. The initial value -1 means that no stand-in has been allocated yet.
   */
  private int dotStandIn = -1;

  //----------------------------------------------------------------------
  // Constants
  //----------------------------------------------------------------------

  // Indicator for ID blocks: a rule that starts with this token is parsed as a transliterator ID.
  private static final String ID_TOKEN = "::";
  // Number of characters in ID_TOKEN.
  private static final int ID_TOKEN_LEN = 2;

  /*
  (reserved for future expansion)
      // markers for beginning and end of rule groups
      private static final String BEGIN_TOKEN = "BEGIN";
      private static final String END_TOKEN = "END";
  */

  // Operators
  private static final char VARIABLE_DEF_OP = '=';
  private static final char FORWARD_RULE_OP = '>';
  private static final char REVERSE_RULE_OP = '<';
  private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op

  // Every character that can begin an operator: '=', '>', '<' and the three arrow aliases declared below.
  private static final String OPERATORS = "=><\u2190\u2192\u2194";
  // The OPERATORS characters plus END_OF_RULE; any of these terminates one half of a rule.
  private static final String HALF_ENDERS = "=><\u2190\u2192\u2194;";

  // Other special characters
  private static final char QUOTE = '\'';
  private static final char ESCAPE = '\\';
  private static final char END_OF_RULE = ';';
  private static final char RULE_COMMENT_CHAR = '#';

  private static final char CONTEXT_ANTE = '{'; // ante{key
  private static final char CONTEXT_POST = '}'; // key}post
  private static final char CURSOR_POS = '|';
  private static final char CURSOR_OFFSET = '@';
  private static final char ANCHOR_START = '^';

  // Quantifier suffixes
  private static final char KLEENE_STAR = '*'; // zero or more
  private static final char ONE_OR_MORE = '+';
  private static final char ZERO_OR_ONE = '?';

  // The 'dot' wildcard and the set pattern it stands for.
  private static final char DOT = '.';
  private static final String DOT_SET = "[^[:Zp:][:Zl:]\\r\\n$]";

  // By definition, the ANCHOR_END special character is a
  // trailing SymbolTable.SYMBOL_REF character.
  // private static final char ANCHOR_END       = '$';

  // Segments of the input string are delimited by "(" and ")".  In the
  // output string these segments are referenced as "$1", "$2", etc.
  private static final char SEGMENT_OPEN = '(';
  private static final char SEGMENT_CLOSE = ')';

  // A function is denoted &Source-Target/Variant(text)
  private static final char FUNCTION = '&';

  // Aliases for some of the syntax characters. These are provided so
  // transliteration rules can be expressed in XML without clashing with
  // XML syntax characters '<', '>', and '&'.
  private static final char ALT_REVERSE_RULE_OP = '\u2190'; // Left Arrow
  private static final char ALT_FORWARD_RULE_OP = '\u2192'; // Right Arrow
  private static final char ALT_FWDREV_RULE_OP = '\u2194'; // Left Right Arrow
  private static final char ALT_FUNCTION = '\u2206'; // Increment (~Greek Capital Delta)

  // Special characters disallowed at the top level
  private static UnicodeSet ILLEGAL_TOP = new UnicodeSet("[\\)]");

  // Special characters disallowed within a segment
  private static UnicodeSet ILLEGAL_SEG = new UnicodeSet("[\\{\\}\\|\\@]");

  // Special characters disallowed within a function argument
  private static UnicodeSet ILLEGAL_FUNC = new UnicodeSet("[\\^\\(\\.\\*\\+\\?\\{\\}\\|\\@]");

  //----------------------------------------------------------------------
  // class ParseData
  //----------------------------------------------------------------------

  /**
   * This class implements the SymbolTable interface. It is used during parsing to give UnicodeSet access to variables that have been
   * defined so far. Note that it uses variablesVector, _not_ data.variables.
   */
  private class ParseData implements SymbolTable {

    /**
     * Implement SymbolTable API.
     */
    public char[] lookup(final String name) {
      return variableNames.get(name);
    }

    /**
     * Implement SymbolTable API.
     */
    public UnicodeMatcher lookupMatcher(final int ch) {
      // Note that we cannot use data.lookup() because the
      // set array has not been constructed yet.
      int i = ch - curData.variablesBase;
      if (i >= 0 && i < variablesVector.size()) {
        return (UnicodeMatcher) variablesVector.get(i);
      }
      return null;
    }

    /**
     * Implement SymbolTable API. Parse out a symbol reference name.
     */
    public String parseReference(final String text, final ParsePosition pos, final int limit) {
      int start = pos.getIndex();
      int i = start;
      while (i < limit) {
        char c = text.charAt(i);
        if ((i == start && !UCharacter.isUnicodeIdentifierStart(c)) || !UCharacter.isUnicodeIdentifierPart(c)) {
          break;
        }
        ++i;
      }
      if (i == start) { // No valid name chars
        return null;
      }
      pos.setIndex(i);
      return text.substring(start, i);
    }

    /**
     * Return true if the given character is a matcher standin or a plain character (non standin).
     */
    public boolean isMatcher(final int ch) {
      // Note that we cannot use data.lookup() because the
      // set array has not been constructed yet.
      int i = ch - curData.variablesBase;
      if (i >= 0 && i < variablesVector.size()) {
        return variablesVector.get(i) instanceof UnicodeMatcher;
      }
      return true;
    }

    /**
     * Return true if the given character is a replacer standin or a plain character (non standin).
     */
    public boolean isReplacer(final int ch) {
      // Note that we cannot use data.lookup() because the
      // set array has not been constructed yet.
      int i = ch - curData.variablesBase;
      if (i >= 0 && i < variablesVector.size()) {
        return variablesVector.get(i) instanceof UnicodeReplacer;
      }
      return true;
    }
  }

  //----------------------------------------------------------------------
  // classes RuleBody, RuleArray, and RuleReader
  //----------------------------------------------------------------------

  /**
   * A private abstract class representing the interface to rule source code that is broken up into lines. Handles the folding of lines
   * terminated by a backslash. This folding is limited; it does not account for comments, quotes, or escapes, so its use to be limited.
   */
  private static abstract class RuleBody {

    /**
     * Retrieve the next line of the source, or return null if none. Folds lines terminated by a backslash into the next line, without
     * regard for comments, quotes, or escapes.
     */
    String nextLine() {
      String s = handleNextLine();
      if (s != null && s.length() > 0 && s.charAt(s.length() - 1) == '\\') {
        StringBuilder b = new StringBuilder(s);
        do {
          b.deleteCharAt(b.length() - 1);
          s = handleNextLine();
          if (s == null) {
            break;
          }
          b.append(s);
        } while (s.length() > 0 && s.charAt(s.length() - 1) == '\\');
        s = b.toString();
      }
      return s;
    }

    /**
     * Reset to the first line of the source.
     */
    abstract void reset();

    /**
     * Subclass method to return the next line of the source.
     */
    abstract String handleNextLine();
  }

  /**
   * RuleBody subclass for a String[] array.
   */
  private static class RuleArray extends RuleBody {
    String[] array;
    int i;

    public RuleArray(final String[] array) {
      this.array = array;
      i = 0;
    }

    @Override
    public String handleNextLine() {
      return (i < array.length) ? array[i++] : null;
    }

    @Override
    public void reset() {
      i = 0;
    }
  }

  /*
   * RuleBody subclass for a ResourceReader.
   */
  /*    private static class RuleReader extends RuleBody {
          ResourceReader reader;
          public RuleReader(ResourceReader reader) { this.reader = reader; }
          public String handleNextLine() {
              try {
                  return reader.readLine();
              } catch (java.io.IOException e) {}
              return null;
          }
          public void reset() {
              reader.reset();
          }
      }*/

  //----------------------------------------------------------------------
  // class RuleHalf
  //----------------------------------------------------------------------

  /**
   * A class representing one side of a rule. This class knows how to parse half of a rule. It is tightly coupled to the method
   * TransliteratorParser.parseRule().
   */
  private static class RuleHalf {

    // The parsed pattern text of this half.  parse() sets it to the contents
    // of the buffer filled by parseSection().
    public String text;

    // Each of the following is an offset into the pattern buffer, or -1
    // if the corresponding marker has not been seen.
    public int cursor = -1; // position of cursor in text
    public int ante = -1;   // position of ante context marker '{' in text
    public int post = -1;   // position of post context marker '}' in text

    // Record the offset to the cursor either to the left or to the
    // right of the key.  This is indicated by characters on the output
    // side that allow the cursor to be positioned arbitrarily within
    // the matching text.  For example, abc{def} > | @@@ xyz; changes
    // def to xyz and moves the cursor to before abc.  Offset characters
    // must be at the start or end, and they cannot move the cursor past
    // the ante- or postcontext text.  Placeholders are only valid in
    // output text.  The length of the ante and post context is
    // determined at runtime, because of supplementals and quantifiers.
    public int cursorOffset = 0; // only nonzero on output side

    // Position of first CURSOR_OFFSET on _right_.  This will be -1
    // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
    // NOTE(review): in parseSection() this field is only ever assigned
    // buf.length(); the signed counts described above are what
    // cursorOffset holds -- confirm which field this comment describes.
    private int cursorOffsetPos = 0;

    // Set when a start anchor '^' is accepted while parsing.
    public boolean anchorStart = false;
    // Set when a '$' is interpreted as an end anchor while parsing.
    public boolean anchorEnd = false;

    /**
     * The segment number from 1..n of the next '(' we see during parsing; 1-based.
     */
    private int nextSegmentNumber = 1;

    /**
     * Parse one side of a rule, stopping at either the limit, the END_OF_RULE character, or an operator.
     *
     * @return the index after the terminating character, or if limit was reached, limit
     */
    public int parse(final String rule, int pos, final int limit, final TransliteratorParser parser) {
      int start = pos;
      StringBuffer buf = new StringBuffer();
      pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_TOP, false);
      text = buf.toString();

      if (cursorOffset > 0 && cursor != cursorOffsetPos) {
        syntaxError("Misplaced " + CURSOR_POS, rule, start);
      }

      return pos;
    }

    /**
     * Parse a section of one side of a rule, stopping at either the limit, the END_OF_RULE character, an operator, or a segment close
     * character. This method parses both a top-level rule half and a segment within such a rule half. It calls itself recursively to
     * parse segments, nested segments, and function arguments.
     *
     * @param rule
     *            the rule text being parsed
     * @param pos
     *            index in rule at which to start parsing
     * @param limit
     *            index in rule at which parsing must stop
     * @param parser
     *            the parser that supplies stand-ins, variable definitions and range checks
     * @param buf
     *            buffer into which to accumulate the rule pattern characters, either literal characters from the rule or standins for
     *            UnicodeMatcher objects including segments.
     * @param illegal
     *            the set of special characters that is illegal during this parse.
     * @param isSegment
     *            if true, then we've already seen a '(' and pos on entry points right after it. Within this method the flag only
     *            enables the "Unclosed segment" and "Misplaced quantifier" checks; the caller that saw the '(' takes the text
     *            accumulated in buf, wraps it in a matcher or replacer object, and replaces it in buf with a standin. This works
     *            recursively for nested segments. If isSegment is false, just accumulate characters into buf.
     * @return the index after the terminating character, or if limit was reached, limit
     */
    private int parseSection(final String rule, int pos, final int limit, final TransliteratorParser parser, final StringBuffer buf,
        final UnicodeSet illegal, final boolean isSegment) {
      int start = pos;
      ParsePosition pp = null;
      int quoteStart = -1; // Most recent 'single quoted string'
      int quoteLimit = -1;
      int varStart = -1; // Most recent $variableReference
      int varLimit = -1;
      // One-element array used as an in/out position argument for the
      // Utility and TransliteratorIDParser helpers called below.
      int[] iref = new int[1];
      // Length of buf on entry; lets the quantifier case detect that
      // nothing has been accumulated yet in this section.
      int bufStart = buf.length();

      main: while (pos < limit) {
        // Since all syntax characters are in the BMP, fetching
        // 16-bit code units suffices here.
        char c = rule.charAt(pos++);
        if (PatternProps.isWhiteSpace(c)) {
          continue;
        }
        // HALF_ENDERS is all chars that end a rule half: "<>=;"
        if (HALF_ENDERS.indexOf(c) >= 0) {
          ///CLOVER:OFF
          // NOTE(review): this comment used to say "isSegment is always
          // false", but the recursive calls below pass true.  When a
          // half-ender is met inside a segment, the '(' was never closed.
          if (isSegment) {
            syntaxError("Unclosed segment", rule, start);
          }
          ///CLOVER:ON
          break main;
        }
        if (anchorEnd) {
          // Text after a presumed end anchor is a syntax err
          syntaxError("Malformed variable reference", rule, start);
        }
        if (UnicodeSet.resemblesPattern(rule, pos - 1)) {
          if (pp == null) {
            pp = new ParsePosition(0);
          }
          pp.setIndex(pos - 1); // Backup to opening '['
          // Whatever parseSet() returns is appended to buf
          // (presumably a standin for the parsed set -- parseSet is
          // defined outside this section of the file).
          buf.append(parser.parseSet(rule, pp));
          pos = pp.getIndex();
          continue;
        }
        // Handle escapes
        if (c == ESCAPE) {
          if (pos == limit) {
            syntaxError("Trailing backslash", rule, start);
          }
          iref[0] = pos;
          int escaped = Utility.unescapeAt(rule, iref);
          pos = iref[0];
          if (escaped == -1) {
            syntaxError("Malformed escape", rule, start);
          }
          parser.checkVariableRange(escaped, rule, start);
          UTF16.append(buf, escaped);
          continue;
        }
        // Handle quoted matter
        if (c == QUOTE) {
          int iq = rule.indexOf(QUOTE, pos);
          if (iq == pos) {
            buf.append(c); // Parse [''] outside quotes as [']
            ++pos;
          } else {
            /* This loop picks up a run of quoted text of the
             * form 'aaaa' each time through.  If this run
             * hasn't really ended ('aaaa''bbbb') then it keeps
             * looping, each time adding on a new run.  When it
             * reaches the final quote it breaks.
             */
            quoteStart = buf.length();
            for (;;) {
              if (iq < 0) {
                syntaxError("Unterminated quote", rule, start);
              }
              buf.append(rule.substring(pos, iq));
              pos = iq + 1;
              if (pos < limit && rule.charAt(pos) == QUOTE) {
                // Parse [''] inside quotes as [']
                iq = rule.indexOf(QUOTE, pos + 1);
                // Continue looping
              } else {
                break;
              }
            }
            quoteLimit = buf.length();

            // Every quoted character must also pass the variable
            // range check.
            for (iq = quoteStart; iq < quoteLimit; ++iq) {
              parser.checkVariableRange(buf.charAt(iq), rule, start);
            }
          }
          continue;
        }

        parser.checkVariableRange(c, rule, start);

        if (illegal.contains(c)) {
          syntaxError("Illegal character '" + c + '\'', rule, start);
        }

        switch (c) {

        //------------------------------------------------------
        // Elements allowed within and out of segments
        //------------------------------------------------------
        case ANCHOR_START:
          // Accepted only once, and only before anything has been
          // accumulated in buf.
          if (buf.length() == 0 && !anchorStart) {
            anchorStart = true;
          } else {
            syntaxError("Misplaced anchor start", rule, start);
          }
          break;
        case SEGMENT_OPEN: {
          // bufSegStart is the offset in buf to the first
          // character of the segment we are parsing.
          int bufSegStart = buf.length();

          // Record segment number now, since nextSegmentNumber
          // will be incremented during the call to parseSection
          // if there are nested segments.
          int segmentNumber = nextSegmentNumber++; // 1-based

          // Parse the segment
          pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_SEG, true);

          // After parsing a segment, the relevant characters are
          // in buf, starting at offset bufSegStart.  Extract them
          // into a string matcher, and replace them with a
          // standin for that matcher.
          StringMatcher m = new StringMatcher(buf.substring(bufSegStart), segmentNumber, parser.curData);

          // Record and associate object and segment number
          parser.setSegmentObject(segmentNumber, m);
          buf.setLength(bufSegStart);
          buf.append(parser.getSegmentStandin(segmentNumber));
        }
          break;
        case FUNCTION:
        case ALT_FUNCTION: {
          iref[0] = pos;
          TransliteratorIDParser.SingleID single = TransliteratorIDParser.parseFilterID(rule, iref);
          // The next character MUST be a segment open
          if (single == null || !Utility.parseChar(rule, iref, SEGMENT_OPEN)) {
            syntaxError("Invalid function", rule, start);
          }

          Transliterator t = single.getInstance();
          if (t == null) {
            syntaxError("Invalid function ID", rule, start);
          }

          // bufSegStart is the offset in buf to the first
          // character of the segment we are parsing.
          int bufSegStart = buf.length();

          // Parse the segment.  iref[0] is the position left by the
          // ID parse and the parseChar() call above.
          pos = parseSection(rule, iref[0], limit, parser, buf, ILLEGAL_FUNC, true);

          // After parsing a segment, the relevant characters are
          // in buf, starting at offset bufSegStart.
          FunctionReplacer r = new FunctionReplacer(t, new StringReplacer(buf.substring(bufSegStart), parser.curData));

          // Replace the buffer contents with a stand-in
          buf.setLength(bufSegStart);
          buf.append(parser.generateStandInFor(r));
        }
          break;
        case SymbolTable.SYMBOL_REF:
        // Handle variable references and segment references "$1" .. "$9"
        {
          // A variable reference must be followed immediately
          // by a Unicode identifier start and zero or more
          // Unicode identifier part characters, or by a digit
          // 1..9 if it is a segment reference.
          if (pos == limit) {
            // A variable ref character at the end acts as
            // an anchor to the context limit, as in perl.
            anchorEnd = true;
            break;
          }
          // Parse "$1" "$2" .. "$9" .. (no upper limit)
          c = rule.charAt(pos);
          int r = UCharacter.digit(c, 10);
          if (r >= 1 && r <= 9) {
            iref[0] = pos;
            r = Utility.parseNumber(rule, iref, 10);
            if (r < 0) {
              syntaxError("Undefined segment reference", rule, start);
            }
            pos = iref[0];
            buf.append(parser.getSegmentStandin(r));
          } else {
            if (pp == null) { // Lazy create
              pp = new ParsePosition(0);
            }
            pp.setIndex(pos);
            String name = parser.parseData.parseReference(rule, pp, limit);
            if (name == null) {
              // This means the '$' was not followed by a
              // valid name.  Try to interpret it as an
              // end anchor then.  If this also doesn't work
              // (if we see a following character) then signal
              // an error.
              anchorEnd = true;
              break;
            }
            pos = pp.getIndex();
            // If this is a variable definition statement,
            // then the LHS variable will be undefined.  In
            // that case appendVariableDef() will append the
            // special placeholder char variableLimit-1.
            varStart = buf.length();
            parser.appendVariableDef(name, buf);
            varLimit = buf.length();
          }
        }
          break;
        case DOT:
          buf.append(parser.getDotStandIn());
          break;
        case KLEENE_STAR:
        case ONE_OR_MORE:
        case ZERO_OR_ONE:
        // Quantifiers.  We handle single characters, quoted strings,
        // variable references, and segments.
        //  a+      matches  aaa
        //  'foo'+  matches  foofoofoo
        //  $v+     matches  xyxyxy if $v == xy
        //  (seg)+  matches  segsegseg
        {
          ///CLOVER:OFF
          // NOTE(review): this comment used to say "isSegment is always
          // false", but the recursive calls above pass true.
          if (isSegment && buf.length() == bufStart) {
            // The */+ immediately follows '('
            syntaxError("Misplaced quantifier", rule, start);
            break;
          }
          ///CLOVER:ON

          int qstart, qlimit;
          // The */+ follows an isolated character or quote
          // or variable reference
          if (buf.length() == quoteLimit) {
            // The */+ follows a 'quoted string'
            qstart = quoteStart;
            qlimit = quoteLimit;
          } else if (buf.length() == varLimit) {
            // The */+ follows a $variableReference
            qstart = varStart;
            qlimit = varLimit;
          } else {
            // The */+ follows a single character, possibly
            // a segment standin
            qstart = buf.length() - 1;
            qlimit = qstart + 1;
          }

          UnicodeMatcher m;
          try {
            m = new StringMatcher(buf.toString(), qstart, qlimit, 0, parser.curData);
          } catch (RuntimeException e) {
            // Rethrow with up to 50 characters of rule text on each
            // side of the current position, which is marked "$$$".
            final String precontext = pos < 50 ? rule.substring(0, pos) : "..." + rule.substring(pos - 50, pos);
            final String postContext = limit - pos <= 50 ? rule.substring(pos, limit) : rule.substring(pos, pos + 50) + "...";
            throw (RuntimeException) new IllegalIcuArgumentException("Failure in rule: " + precontext + "$$$" + postContext)
                .initCause(e);
          }
          int min = 0;
          int max = Quantifier.MAX;
          switch (c) {
          case ONE_OR_MORE:
            min = 1;
            break;
          case ZERO_OR_ONE:
            min = 0;
            max = 1;
            break;
          // case KLEENE_STAR:
          //    do nothing -- min, max already set
          }
          m = new Quantifier(m, min, max);
          // Replace the quantified text in buf with a standin for
          // the quantifier.
          buf.setLength(qstart);
          buf.append(parser.generateStandInFor(m));
        }
          break;

        //------------------------------------------------------
        // Elements allowed ONLY WITHIN segments
        //------------------------------------------------------
        case SEGMENT_CLOSE:
          // assert(isSegment);
          // We're done parsing a segment.  (At the top level ')' is a
          // member of ILLEGAL_TOP and is rejected by the
          // illegal-character check before the switch.)
          break main;

        //------------------------------------------------------
        // Elements allowed ONLY OUTSIDE segments
        //------------------------------------------------------
        case CONTEXT_ANTE:
          if (ante >= 0) {
            syntaxError("Multiple ante contexts", rule, start);
          }
          ante = buf.length();
          break;
        case CONTEXT_POST:
          if (post >= 0) {
            syntaxError("Multiple post contexts", rule, start);
          }
          post = buf.length();
          break;
        case CURSOR_POS:
          if (cursor >= 0) {
            syntaxError("Multiple cursors", rule, start);
          }
          cursor = buf.length();
          break;
        case CURSOR_OFFSET:
          if (cursorOffset < 0) {
            // Continuing a run of '@' that began right after a
            // leading cursor: nothing else may have been accumulated.
            if (buf.length() > 0) {
              syntaxError("Misplaced " + c, rule, start);
            }
            --cursorOffset;
          } else if (cursorOffset > 0) {
            // Continuing a run of '@' that began before any cursor:
            // the run must be contiguous and the cursor not yet seen.
            if (buf.length() != cursorOffsetPos || cursor >= 0) {
              syntaxError("Misplaced " + c, rule, start);
            }
            ++cursorOffset;
          } else {
            // First '@' of this rule half.
            if (cursor == 0 && buf.length() == 0) {
              cursorOffset = -1;
            } else if (cursor < 0) {
              cursorOffsetPos = buf.length();
              cursorOffset = 1;
            } else {
              syntaxError("Misplaced " + c, rule, start);
            }
          }
          break;

        //------------------------------------------------------
        // Non-special characters
        //------------------------------------------------------
        default:
          // Disallow unquoted characters other than [0-9A-Za-z]
          // in the printable ASCII range.  These characters are
          // reserved for possible future use.
          if (c >= 0x0021 && c <= 0x007E && !((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
            syntaxError("Unquoted " + c, rule, start);
          }
          buf.append(c);
          break;
        }
      }
      return pos;
    }

    /**
     * Remove context.
     */
    void removeContext() {
      text = text.substring(ante < 0 ? 0 : ante, post < 0 ? text.length() : post);
      ante = post = -1;
      anchorStart = anchorEnd = false;
    }

    /**
     * Return true if this half looks like valid output, that is, does not contain quantifiers or other special input-only elements.
     */
    public boolean isValidOutput(final TransliteratorParser parser) {
      for (int i = 0; i < text.length();) {
        int c = UTF16.charAt(text, i);
        i += UTF16.getCharCount(c);
        if (!parser.parseData.isReplacer(c)) {
          return false;
        }
      }
      return true;
    }

    /**
     * Return true if this half looks like valid input, that is, does not contain functions or other special output-only elements.
     */
    public boolean isValidInput(final TransliteratorParser parser) {
      for (int i = 0; i < text.length();) {
        int c = UTF16.charAt(text, i);
        i += UTF16.getCharCount(c);
        if (!parser.parseData.isMatcher(c)) {
          return false;
        }
      }
      return true;
    }
  }

  //----------------------------------------------------------------------
  // PUBLIC methods
  //----------------------------------------------------------------------

  /**
   * Constructor.
   */
  public TransliteratorParser() {
  }

  /**
   * Parse a set of rules. After the parse completes, examine the public data members for results.
   */
  public void parse(final String rules, final int dir) {
    parseRules(new RuleArray(new String[] { rules }), dir);
  }

  /*
   * Parse a set of rules.  After the parse completes, examine the public
   * data members for results.
   */
  /*    public void parse(ResourceReader rules, int direction) {
          parseRules(new RuleReader(rules), direction);
      }*/

  //----------------------------------------------------------------------
  // PRIVATE methods
  //----------------------------------------------------------------------

  /**
   * Parse an array of zero or more rules. The strings in the array are treated as if they were concatenated together, with rule
   * terminators inserted between array elements if not present already.
   *
   * Any previous rules are discarded. Typically this method is called exactly once, during construction.
   *
   * The member this.data will be set to null if there are no rules.
   *
   * @exception IllegalIcuArgumentException
   *                if there is a syntax error in the rules
   */
  void parseRules(final RuleBody ruleArray, final int dir) {
    boolean parsingIDs = true;
    int ruleCount = 0;

    dataVector = new ArrayList<Data>();
    idBlockVector = new ArrayList<String>();
    curData = null;
    direction = dir;
    compoundFilter = null;
    variablesVector = new ArrayList<Object>();
    variableNames = new HashMap<String, char[]>();
    parseData = new ParseData();

    List<RuntimeException> errors = new ArrayList<RuntimeException>();
    int errorCount = 0;

    ruleArray.reset();

    StringBuilder idBlockResult = new StringBuilder();

    // The compound filter offset is an index into idBlockResult.
    // If it is 0, then the compound filter occurred at the start,
    // and it is the offset to the _start_ of the compound filter
    // pattern.  Otherwise it is the offset to the _limit_ of the
    // compound filter pattern within idBlockResult.
    this.compoundFilter = null;
    int compoundFilterOffset = -1;

    main: for (;;) {
      String rule = ruleArray.nextLine();
      if (rule == null) {
        break;
      }
      int pos = 0;
      int limit = rule.length();
      while (pos < limit) {
        char c = rule.charAt(pos++);
        if (PatternProps.isWhiteSpace(c)) {
          continue;
        }
        // Skip lines starting with the comment character
        if (c == RULE_COMMENT_CHAR) {
          pos = rule.indexOf("\n", pos) + 1;
          if (pos == 0) {
            break; // No "\n" found; rest of rule is a commnet
          }
          continue; // Either fall out or restart with next line
        }

        // skip empty rules
        if (c == END_OF_RULE)
          continue;

        // Often a rule file contains multiple errors.  It's
        // convenient to the rule author if these are all reported
        // at once.  We keep parsing rules even after a failure, up
        // to a specified limit, and report all errors at once.
        try {
          ++ruleCount;

          // We've found the start of a rule or ID.  c is its first
          // character, and pos points past c.
          --pos;
          // Look for an ID token.  Must have at least ID_TOKEN_LEN + 1
          // chars left.
          if ((pos + ID_TOKEN_LEN + 1) <= limit && rule.regionMatches(pos, ID_TOKEN, 0, ID_TOKEN_LEN)) {
            pos += ID_TOKEN_LEN;
            c = rule.charAt(pos);
            while (PatternProps.isWhiteSpace(c) && pos < limit) {
              ++pos;
              c = rule.charAt(pos);
            }
            int[] p = new int[] { pos };

            if (!parsingIDs) {
              if (curData != null) {
                if (direction == Transliterator.FORWARD)
                  dataVector.add(curData);
                else
                  dataVector.add(0, curData);
                curData = null;
              }
              parsingIDs = true;
            }

            TransliteratorIDParser.SingleID id = TransliteratorIDParser.parseSingleID(rule, p, direction);
            if (p[0] != pos && Utility.parseChar(rule, p, END_OF_RULE)) {
              // Successful ::ID parse.

              if (direction == Transliterator.FORWARD) {
                idBlockResult.append(id.canonID).append(END_OF_RULE);
              } else {
                idBlockResult.insert(0, id.canonID + END_OF_RULE);
              }

            } else {
              // Couldn't parse an ID.  Try to parse a global filter
              int[] withParens = new int[] { -1 };
              UnicodeSet f = TransliteratorIDParser.parseGlobalFilter(rule, p, direction, withParens, null);
              if (f != null && Utility.parseChar(rule, p, END_OF_RULE)) {
                if ((direction == Transliterator.FORWARD) == (withParens[0] == 0)) {
                  if (compoundFilter != null) {
                    // Multiple compound filters
                    syntaxError("Multiple global filters", rule, pos);
                  }
                  compoundFilter = f;
                  compoundFilterOffset = ruleCount;
                }
              } else {
                // Invalid ::id
                // Can be parsed as neither an ID nor a global filter
                syntaxError("Invalid ::ID", rule, pos);
              }
            }

            pos = p[0];
          } else {
            if (parsingIDs) {
              if (direction == Transliterator.FORWARD)
                idBlockVector.add(idBlockResult.toString());
              else
                idBlockVector.add(0, idBlockResult.toString());
              idBlockResult.delete(0, idBlockResult.length());
              parsingIDs = false;
              curData = new RuleBasedTransliterator.Data();

              // By default, rules use part of the private use area
              // E000..F8FF for variables and other stand-ins.  Currently
              // the range F000..F8FF is typically sufficient.  The 'use
              // variable range' pragma allows rule sets to modify this.
              setVariableRange(0xF000, 0xF8FF);
            }

            if (resemblesPragma(rule, pos, limit)) {
              int ppp = parsePragma(rule, pos, limit);
              if (ppp < 0) {
                syntaxError("Unrecognized pragma", rule, pos);
              }
              pos = ppp;
              // Parse a rule
            } else {
              pos = parseRule(rule, pos, limit);
            }
          }
        } catch (IllegalArgumentException e) {
          if (errorCount == 30) {
            IllegalIcuArgumentException icuEx = new IllegalIcuArgumentException(
                "\nMore than 30 errors; further messages squelched");
            icuEx.initCause(e);
            errors.add(icuEx);
            break main;
          }
          e.fillInStackTrace();
          errors.add(e);
          ++errorCount;
          pos = ruleEnd(rule, pos, limit) + 1; // +1 advances past ';'
        }
      }
    }
    if (parsingIDs && idBlockResult.length() > 0) {
      if (direction == Transliterator.FORWARD)
        idBlockVector.add(idBlockResult.toString());
      else
        idBlockVector.add(0, idBlockResult.toString());
    } else if (!parsingIDs && curData != null) {
      if (direction == Transliterator.FORWARD)
        dataVector.add(curData);
      else
        dataVector.add(0, curData);
    }

    // Convert the set vector to an array
    for (int i = 0; i < dataVector.size(); i++) {
      Data data = dataVector.get(i);
      data.variables = new Object[variablesVector.size()];
      variablesVector.toArray(data.variables);
      data.variableNames = new HashMap<String, char[]>();
      data.variableNames.putAll(variableNames);
    }
    variablesVector = null;

    // Do more syntax checking and index the rules
    try {
      if (compoundFilter != null) {
        if ((direction == Transliterator.FORWARD && compoundFilterOffset != 1)
            || (direction == Transliterator.REVERSE && compoundFilterOffset != ruleCount)) {
          throw new IllegalIcuArgumentException("Compound filters misplaced");
        }
      }

      for (int i = 0; i < dataVector.size(); i++) {
        Data data = dataVector.get(i);
        data.ruleSet.freeze();
      }

      if (idBlockVector.size() == 1 && (idBlockVector.get(0)).length() == 0)
        idBlockVector.remove(0);

    } catch (IllegalArgumentException e) {
      e.fillInStackTrace();
      errors.add(e);
    }

    if (errors.size() != 0) {
      for (int i = errors.size() - 1; i > 0; --i) {
        RuntimeException previous = errors.get(i - 1);
        while (previous.getCause() != null) {
          previous = (RuntimeException) previous.getCause(); // chain specially
        }
        previous.initCause(errors.get(i));
      }
      throw errors.get(0);
      // if initCause not supported: throw new IllegalArgumentException(errors.toString());
    }
  }

  /**
   * MAIN PARSER. Parse the next rule in the given rule string, starting at pos. Return the index after the last character parsed. Do not
   * parse characters at or after limit.
   *
   * Important: The character at pos must be a non-whitespace character that is not the comment character.
   *
   * This method handles quoting, escaping, and whitespace removal. It parses the end-of-rule character. It recognizes context and cursor
   * indicators. Once it does a lexical breakdown of the rule at pos, it creates a rule object and adds it to our rule list.
   *
   * This method is tightly coupled to the inner class RuleHalf.
   */
  private int parseRule(final String rule, int pos, final int limit) {
    // Locate the left side, operator, and right side
    int start = pos;
    char operator = 0;

    // Set up segments data
    segmentStandins = new StringBuffer();
    segmentObjects = new ArrayList<StringMatcher>();

    RuleHalf left = new RuleHalf();
    RuleHalf right = new RuleHalf();

    undefinedVariableName = null;
    pos = left.parse(rule, pos, limit, this);

    if (pos == limit || OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) {
      syntaxError("No operator pos=" + pos, rule, start);
    }
    ++pos;

    // Found an operator char.  Check for forward-reverse operator.
    if (operator == REVERSE_RULE_OP && (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
      ++pos;
      operator = FWDREV_RULE_OP;
    }

    // Translate alternate op characters.
    switch (operator) {
    case ALT_FORWARD_RULE_OP:
      operator = FORWARD_RULE_OP;
      break;
    case ALT_REVERSE_RULE_OP:
      operator = REVERSE_RULE_OP;
      break;
    case ALT_FWDREV_RULE_OP:
      operator = FWDREV_RULE_OP;
      break;
    }

    pos = right.parse(rule, pos, limit, this);

    if (pos < limit) {
      if (rule.charAt(--pos) == END_OF_RULE) {
        ++pos;
      } else {
        // RuleHalf parser must have terminated at an operator
        syntaxError("Unquoted operator", rule, start);
      }
    }

    if (operator == VARIABLE_DEF_OP) {
      // LHS is the name.  RHS is a single character, either a literal
      // or a set (already parsed).  If RHS is longer than one
      // character, it is either a multi-character string, or multiple
      // sets, or a mixture of chars and sets -- syntax error.

      // We expect to see a single undefined variable (the one being
      // defined).
      if (undefinedVariableName == null) {
        syntaxError("Missing '$' or duplicate definition", rule, start);
      }
      if (left.text.length() != 1 || left.text.charAt(0) != variableLimit) {
        syntaxError("Malformed LHS", rule, start);
      }
      if (left.anchorStart || left.anchorEnd || right.anchorStart || right.anchorEnd) {
        syntaxError("Malformed variable def", rule, start);
      }
      // We allow anything on the right, including an empty string.
      int n = right.text.length();
      char[] value = new char[n];
      right.text.getChars(0, n, value, 0);
      variableNames.put(undefinedVariableName, value);

      ++variableLimit;
      return pos;
    }

    // If this is not a variable definition rule, we shouldn't have
    // any undefined variable names.
    if (undefinedVariableName != null) {
      syntaxError("Undefined variable $" + undefinedVariableName, rule, start);
    }

    // Verify segments
    if (segmentStandins.length() > segmentObjects.size()) {
      syntaxError("Undefined segment reference", rule, start);
    }
    for (int i = 0; i < segmentStandins.length(); ++i) {
      if (segmentStandins.charAt(i) == 0) {
        syntaxError("Internal error", rule, start); // will never happen
      }
    }
    for (int i = 0; i < segmentObjects.size(); ++i) {
      if (segmentObjects.get(i) == null) {
        syntaxError("Internal error", rule, start); // will never happen
      }
    }

    // If the direction we want doesn't match the rule
    // direction, do nothing.
    if (operator != FWDREV_RULE_OP && ((direction == Transliterator.FORWARD) != (operator == FORWARD_RULE_OP))) {
      return pos;
    }

    // Transform the rule into a forward rule by swapping the
    // sides if necessary.
    if (direction == Transliterator.REVERSE) {
      RuleHalf temp = left;
      left = right;
      right = temp;
    }

    // Remove non-applicable elements in forward-reverse
    // rules.  Bidirectional rules ignore elements that do not
    // apply.
    if (operator == FWDREV_RULE_OP) {
      right.removeContext();
      left.cursor = -1;
      left.cursorOffset = 0;
    }

    // Normalize context
    if (left.ante < 0) {
      left.ante = 0;
    }
    if (left.post < 0) {
      left.post = left.text.length();
    }

    // Context is only allowed on the input side.  Cursors are only
    // allowed on the output side.  Segment delimiters can only appear
    // on the left, and references on the right.  Cursor offset
    // cannot appear without an explicit cursor.  Cursor offset
    // cannot place the cursor outside the limits of the context.
    // Anchors are only allowed on the input side.
    if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 || (right.cursorOffset != 0 && right.cursor < 0) ||
    // - The following two checks were used to ensure that the
    // - the cursor offset stayed within the ante- or postcontext.
    // - However, with the addition of quantifiers, we have to
    // - allow arbitrary cursor offsets and do runtime checking.
    //(right.cursorOffset > (left.text.length() - left.post)) ||
    //(-right.cursorOffset > left.ante) ||
        right.anchorStart || right.anchorEnd || !left.isValidInput(this) || !right.isValidOutput(this) || left.ante > left.post) {
      syntaxError("Malformed rule", rule, start);
    }

    // Flatten segment objects vector to an array
    UnicodeMatcher[] segmentsArray = null;
    if (segmentObjects.size() > 0) {
      segmentsArray = new UnicodeMatcher[segmentObjects.size()];
      segmentObjects.toArray(segmentsArray);
    }

    curData.ruleSet.addRule(new TransliterationRule(left.text, left.ante, left.post, right.text, right.cursor, right.cursorOffset,
        segmentsArray, left.anchorStart, left.anchorEnd, curData));

    return pos;
  }

  /**
   * Set the variable range to [start, end] (inclusive).
   */
  private void setVariableRange(final int start, final int end) {
    if (start > end || start < 0 || end > 0xFFFF) {
      throw new IllegalIcuArgumentException("Invalid variable range " + start + ", " + end);
    }

    curData.variablesBase = (char) start; // first private use

    if (dataVector.size() == 0) {
      variableNext = (char) start;
      variableLimit = (char) (end + 1);
    }
  }

  /**
   * Assert that the given character is NOT within the variable range. If it is, signal an error. This is necessary to ensure that the
   * variable range does not overlap characters used in a rule.
   */
  private void checkVariableRange(final int ch, final String rule, final int start) {
    if (ch >= curData.variablesBase && ch < variableLimit) {
      syntaxError("Variable range character in rule", rule, start);
    }
  }

  // (The following method is part of an unimplemented feature.
  // Remove this clover pragma after the feature is implemented.
  // 2003-06-11 ICU 2.6 Alan)
  ///CLOVER:OFF
  /**
   * Set the maximum backup to 'backup', in response to a pragma statement.
   */
  private void pragmaMaximumBackup(final int backup) {
    //TODO Finish
    throw new IllegalIcuArgumentException("use maximum backup pragma not implemented yet");
  }

  ///CLOVER:ON

  // (The following method is part of an unimplemented feature.
  // Remove this clover pragma after the feature is implemented.
  // 2003-06-11 ICU 2.6 Alan)
  ///CLOVER:OFF
  /**
   * Begin normalizing all rules using the given mode, in response to a pragma statement.
   */
  private void pragmaNormalizeRules(final Normalizer.Mode mode) {
    //TODO Finish
    throw new IllegalIcuArgumentException("use normalize rules pragma not implemented yet");
  }

  ///CLOVER:ON

  /**
   * Return true if the given rule looks like a pragma.
   *
   * @param pos
   *            offset to the first non-whitespace character of the rule.
   * @param limit
   *            pointer past the last character of the rule.
   */
  static boolean resemblesPragma(final String rule, final int pos, final int limit) {
    // Must start with /use\s/i
    return Utility.parsePattern(rule, pos, limit, "use ", null) >= 0;
  }

  /**
   * Parse a pragma. This method assumes resemblesPragma() has already returned true.
   *
   * @param pos
   *            offset to the first non-whitespace character of the rule.
   * @param limit
   *            pointer past the last character of the rule.
   * @return the position index after the final ';' of the pragma, or -1 on failure.
   */
  private int parsePragma(final String rule, int pos, final int limit) {
    int[] array = new int[2];

    // resemblesPragma() has already returned true, so we
    // know that pos points to /use\s/i; we can skip 4 characters
    // immediately
    pos += 4;

    // Here are the pragmas we recognize:
    // use variable range 0xE000 0xEFFF;
    // use maximum backup 16;
    // use nfd rules;
    int p = Utility.parsePattern(rule, pos, limit, "~variable range # #~;", array);
    if (p >= 0) {
      setVariableRange(array[0], array[1]);
      return p;
    }

    p = Utility.parsePattern(rule, pos, limit, "~maximum backup #~;", array);
    if (p >= 0) {
      pragmaMaximumBackup(array[0]);
      return p;
    }

    p = Utility.parsePattern(rule, pos, limit, "~nfd rules~;", null);
    if (p >= 0) {
      pragmaNormalizeRules(Normalizer.NFD);
      return p;
    }

    p = Utility.parsePattern(rule, pos, limit, "~nfc rules~;", null);
    if (p >= 0) {
      pragmaNormalizeRules(Normalizer.NFC);
      return p;
    }

    // Syntax error: unable to parse pragma
    return -1;
  }

  /**
   * Throw an exception indicating a syntax error. Search the rule string for the probable end of the rule. Of course, if the error is
   * that the end of rule marker is missing, then the rule end will not be found. In any case the rule start will be correctly reported.
   *
   * @param msg
   *            error description
   * @param rule
   *            pattern string
   * @param start
   *            position of first character of current rule
   */
  static final void syntaxError(final String msg, final String rule, final int start) {
    int end = ruleEnd(rule, start, rule.length());
    throw new IllegalIcuArgumentException(msg + " in \"" + Utility.escape(rule.substring(start, end)) + '"');
  }

  static final int ruleEnd(final String rule, final int start, final int limit) {
    int end = Utility.quotedIndexOf(rule, start, limit, ";");
    if (end < 0) {
      end = limit;
    }
    return end;
  }

  /**
   * Parse a UnicodeSet out, store it, and return the stand-in character used to represent it.
   */
  private final char parseSet(final String rule, final ParsePosition pos) {
    UnicodeSet set = new UnicodeSet(rule, pos, parseData);
    if (variableNext >= variableLimit) {
      throw new RuntimeException("Private use variables exhausted");
    }
    set.compact();
    return generateStandInFor(set);
  }

  /**
   * Generate and return a stand-in for a new UnicodeMatcher or UnicodeReplacer. Store the object.
   */
  char generateStandInFor(final Object obj) {
    // assert(obj != null);

    // Look up previous stand-in, if any.  This is a short list
    // (typical n is 0, 1, or 2); linear search is optimal.
    for (int i = 0; i < variablesVector.size(); ++i) {
      if (variablesVector.get(i) == obj) { // [sic] pointer comparison
        return (char) (curData.variablesBase + i);
      }
    }

    if (variableNext >= variableLimit) {
      throw new RuntimeException("Variable range exhausted");
    }
    variablesVector.add(obj);
    return variableNext++;
  }

  /**
   * Return the standin for segment seg (1-based).
   */
  public char getSegmentStandin(final int seg) {
    if (segmentStandins.length() < seg) {
      segmentStandins.setLength(seg);
    }
    char c = segmentStandins.charAt(seg - 1);
    if (c == 0) {
      if (variableNext >= variableLimit) {
        throw new RuntimeException("Variable range exhausted");
      }
      c = variableNext++;
      // Set a placeholder in the master variables vector that will be
      // filled in later by setSegmentObject().  We know that we will get
      // called first because setSegmentObject() will call us.
      variablesVector.add(null);
      segmentStandins.setCharAt(seg - 1, c);
    }
    return c;
  }

  /**
   * Set the object for segment seg (1-based).
   */
  public void setSegmentObject(final int seg, final StringMatcher obj) {
    // Since we call parseSection() recursively, nested
    // segments will result in segment i+1 getting parsed
    // and stored before segment i; be careful with the
    // vector handling here.
    while (segmentObjects.size() < seg) {
      segmentObjects.add(null);
    }
    int index = getSegmentStandin(seg) - curData.variablesBase;
    if (segmentObjects.get(seg - 1) != null || variablesVector.get(index) != null) {
      throw new RuntimeException(); // should never happen
    }
    segmentObjects.set(seg - 1, obj);
    variablesVector.set(index, obj);
  }

  /**
   * Return the stand-in for the dot set. It is allocated the first time and reused thereafter.
   */
  char getDotStandIn() {
    if (dotStandIn == -1) {
      dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET));
    }
    return (char) dotStandIn;
  }

  /**
   * Append the value of the given variable name to the given StringBuffer.
   *
   * @exception IllegalIcuArgumentException
   *                if the name is unknown.
   */
  private void appendVariableDef(final String name, final StringBuffer buf) {
    char[] ch = variableNames.get(name);
    if (ch == null) {
      // We allow one undefined variable so that variable definition
      // statements work.  For the first undefined variable we return
      // the special placeholder variableLimit-1, and save the variable
      // name.
      if (undefinedVariableName == null) {
        undefinedVariableName = name;
        if (variableNext >= variableLimit) {
          throw new RuntimeException("Private use variables exhausted");
        }
        buf.append(--variableLimit);
      } else {
        throw new IllegalIcuArgumentException("Undefined variable $" + name);
      }
    } else {
      buf.append(ch);
    }
  }
}

//eof
TOP

Related Classes of com.ibm.icu.text.TransliteratorParser$RuleHalf

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.