Package org.apache.lucene.analysis.cn.smart.hhmm

Examples of org.apache.lucene.analysis.cn.smart.hhmm.SegToken


        return false; // no more sentences, end of stream!
      }
    }
   
    // There are remaining tokens from the current sentence, return the next one.
    SegToken nextWord = (SegToken) tokenIter.next();
    termAtt.setTermBuffer(nextWord.charArray, 0, nextWord.charArray.length);
    offsetAtt.setOffset(nextWord.startOffset, nextWord.endOffset);
    typeAtt.setType("word");
    return true;
  }
View Full Code Here


      }
    }
    // WordTokenFilter must clear attributes, as it is creating new tokens.
    clearAttributes();
    // There are remaining tokens from the current sentence, return the next one.
    SegToken nextWord = tokenIter.next();
    termAtt.copyBuffer(nextWord.charArray, 0, nextWord.charArray.length);
    if (hasIllegalOffsets) {
      offsetAtt.setOffset(tokStart, tokEnd);
    } else {
      offsetAtt.setOffset(nextWord.startOffset, nextWord.endOffset);
View Full Code Here

      }
    }
    // WordTokenFilter must clear attributes, as it is creating new tokens.
    clearAttributes();
    // There are remaining tokens from the current sentence, return the next one.
    SegToken nextWord = tokenIter.next();
    termAtt.setTermBuffer(nextWord.charArray, 0, nextWord.charArray.length);
    offsetAtt.setOffset(nextWord.startOffset, nextWord.endOffset);
    typeAtt.setType("word");
    return true;
  }
View Full Code Here

    int i = 0, j;
    int length = sentence.length();
    int foundIndex;
    int[] charTypeArray = getCharTypes(sentence);
    StringBuilder wordBuf = new StringBuilder();
    SegToken token;
    int frequency = 0; // the number of times word appears.
    boolean hasFullWidth;
    int wordType;
    char[] charArray;

    SegGraph segGraph = new SegGraph();
    while (i < length) {
      hasFullWidth = false;
      switch (charTypeArray[i]) {
        case CharType.SPACE_LIKE:
          i++;
          break;
        case CharType.HANZI:
          j = i + 1;
          wordBuf.delete(0, wordBuf.length());
          // It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not,
          // it will store that single Chinese character (Hanzi) in the SegGraph.  Otherwise, it will
          // cause word division.
          wordBuf.append(sentence.charAt(i));
          charArray = new char[] { sentence.charAt(i) };
          frequency = wordDict.getFrequency(charArray);
          token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
              frequency);
          segGraph.addToken(token);

          foundIndex = wordDict.getPrefixMatch(charArray);
          while (j <= length && foundIndex != -1) {
            if (wordDict.isEqual(charArray, foundIndex) && charArray.length > 1) {
              // It is the phrase we are looking for; In other words, we have found a phrase SegToken
              // from i to j.  It is not a monosyllabic word (single word).
              frequency = wordDict.getFrequency(charArray);
              token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
                  frequency);
              segGraph.addToken(token);
            }

            while (j < length && charTypeArray[j] == CharType.SPACE_LIKE)
              j++;

            if (j < length && charTypeArray[j] == CharType.HANZI) {
              wordBuf.append(sentence.charAt(j));
              charArray = new char[wordBuf.length()];
              wordBuf.getChars(0, charArray.length, charArray, 0);
              // idArray has been found (foundWordIndex!=-1) as a prefix before. 
              // Therefore, idArray after it has been lengthened can only appear after foundWordIndex. 
              // So start searching after foundWordIndex.
              foundIndex = wordDict.getPrefixMatch(charArray, foundIndex);
              j++;
            } else {
              break;
            }
          }
          i++;
          break;
        case CharType.FULLWIDTH_LETTER:
          hasFullWidth = true;
        case CharType.LETTER:
          j = i + 1;
          while (j < length
              && (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER)) {
            if (charTypeArray[j] == CharType.FULLWIDTH_LETTER)
              hasFullWidth = true;
            j++;
          }
          // Found a Token from i to j. Type is LETTER char string.
          charArray = Utility.STRING_CHAR_ARRAY;
          frequency = wordDict.getFrequency(charArray);
          wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING;
          token = new SegToken(charArray, i, j, wordType, frequency);
          segGraph.addToken(token);
          i = j;
          break;
        case CharType.FULLWIDTH_DIGIT:
          hasFullWidth = true;
        case CharType.DIGIT:
          j = i + 1;
          while (j < length
              && (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT)) {
            if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT)
              hasFullWidth = true;
            j++;
          }
          // Found a Token from i to j. Type is NUMBER char string.
          charArray = Utility.NUMBER_CHAR_ARRAY;
          frequency = wordDict.getFrequency(charArray);
          wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER;
          token = new SegToken(charArray, i, j, wordType, frequency);
          segGraph.addToken(token);
          i = j;
          break;
        case CharType.DELIMITER:
          j = i + 1;
          // No need to search the weight for the punctuation.  Picking the highest frequency will work.
          frequency = Utility.MAX_FREQUENCE;
          charArray = new char[] { sentence.charAt(i) };
          token = new SegToken(charArray, i, j, WordType.DELIMITER, frequency);
          segGraph.addToken(token);
          i = j;
          break;
        default:
          j = i + 1;
          // Treat the unrecognized char symbol as unknown string.
          // For example, any symbol not in GB2312 is treated as one of these.
          charArray = Utility.STRING_CHAR_ARRAY;
          frequency = wordDict.getFrequency(charArray);
          token = new SegToken(charArray, i, j, WordType.STRING, frequency);
          segGraph.addToken(token);
          i = j;
          break;
      }
    }

    // Add two more Tokens: "beginning xx beginning"
    charArray = Utility.START_CHAR_ARRAY;
    frequency = wordDict.getFrequency(charArray);
    token = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency);
    segGraph.addToken(token);

    // "end xx end"
    charArray = Utility.END_CHAR_ARRAY;
    frequency = wordDict.getFrequency(charArray);
    token = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END,
        frequency);
    segGraph.addToken(token);

    return segGraph;
  }
View Full Code Here

  @Override
  protected boolean incrementWord() {
    if (tokens == null || !tokens.hasNext()) {
      return false;
    } else {
      SegToken token = tokens.next();
      clearAttributes();
      termAtt.copyBuffer(token.charArray, 0, token.charArray.length);
      offsetAtt.setOffset(correctOffset(token.startOffset), correctOffset(token.endOffset));
      typeAtt.setType("word");
      return true;
View Full Code Here

      }
    }
    // WordTokenFilter must clear attributes, as it is creating new tokens.
    clearAttributes();
    // There are remaining tokens from the current sentence, return the next one.
    SegToken nextWord = (SegToken) tokenIter.next();
    termAtt.setTermBuffer(nextWord.charArray, 0, nextWord.charArray.length);
    offsetAtt.setOffset(nextWord.startOffset, nextWord.endOffset);
    typeAtt.setType("word");
    return true;
  }
View Full Code Here

    int i = 0, j;
    int length = sentence.length();
    int foundIndex;
    int[] charTypeArray = getCharTypes(sentence);
    StringBuffer wordBuf = new StringBuffer();
    SegToken token;
    int frequency = 0; // word的出现次数
    boolean hasFullWidth;
    int wordType;
    char[] charArray;

    SegGraph segGraph = new SegGraph();
    while (i < length) {
      hasFullWidth = false;
      switch (charTypeArray[i]) {
        case CharType.SPACE_LIKE:
          i++;
          break;
        case CharType.HANZI:
          j = i + 1;
          wordBuf.delete(0, wordBuf.length());
          // 不管单个汉字能不能构成词,都将单个汉字存到segGraph中去,否则会造成分此图断字
          wordBuf.append(sentence.charAt(i));
          charArray = new char[] { sentence.charAt(i) };
          frequency = wordDict.getFrequency(charArray);
          token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
              frequency);
          segGraph.addToken(token);

          foundIndex = wordDict.getPrefixMatch(charArray);
          while (j <= length && foundIndex != -1) {
            if (wordDict.isEqual(charArray, foundIndex) && charArray.length > 1) {
              // 就是我们要找的词, 也就是说找到了从i到j的一个成词SegToken,并且不是单字词
              frequency = wordDict.getFrequency(charArray);
              token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
                  frequency);
              segGraph.addToken(token);
            }

            while (j < length && charTypeArray[j] == CharType.SPACE_LIKE)
              j++;

            if (j < length && charTypeArray[j] == CharType.HANZI) {
              wordBuf.append(sentence.charAt(j));
              charArray = new char[wordBuf.length()];
              wordBuf.getChars(0, charArray.length, charArray, 0);
              // idArray作为前缀已经找到过(foundWordIndex!=-1),
              // 因此加长过后的idArray只可能出现在foundWordIndex以后,
              // 故从foundWordIndex之后开始查找
              foundIndex = wordDict.getPrefixMatch(charArray, foundIndex);
              j++;
            } else {
              break;
            }
          }
          i++;
          break;
        case CharType.FULLWIDTH_LETTER:
          hasFullWidth = true;
        case CharType.LETTER:
          j = i + 1;
          while (j < length
              && (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER)) {
            if (charTypeArray[j] == CharType.FULLWIDTH_LETTER)
              hasFullWidth = true;
            j++;
          }
          // 找到了从i到j的一个Token,类型为LETTER的字符串
          charArray = Utility.STRING_CHAR_ARRAY;
          frequency = wordDict.getFrequency(charArray);
          wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING;
          token = new SegToken(charArray, i, j, wordType, frequency);
          segGraph.addToken(token);
          i = j;
          break;
        case CharType.FULLWIDTH_DIGIT:
          hasFullWidth = true;
        case CharType.DIGIT:
          j = i + 1;
          while (j < length
              && (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT)) {
            if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT)
              hasFullWidth = true;
            j++;
          }
          // 找到了从i到j的一个Token,类型为NUMBER的字符串
          charArray = Utility.NUMBER_CHAR_ARRAY;
          frequency = wordDict.getFrequency(charArray);
          wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER;
          token = new SegToken(charArray, i, j, wordType, frequency);
          segGraph.addToken(token);
          i = j;
          break;
        case CharType.DELIMITER:
          j = i + 1;
          // 标点符号的weight不用查了,选个最大的频率即可
          frequency = Utility.MAX_FREQUENCE;
          charArray = new char[] { sentence.charAt(i) };
          token = new SegToken(charArray, i, j, WordType.DELIMITER, frequency);
          segGraph.addToken(token);
          i = j;
          break;
        default:
          j = i + 1;
          // 把不认识的字符当作未知串看待,例如GB2312编码之外的字符,每个字符当作一个
          charArray = Utility.STRING_CHAR_ARRAY;
          frequency = wordDict.getFrequency(charArray);
          token = new SegToken(charArray, i, j, WordType.STRING, frequency);
          segGraph.addToken(token);
          i = j;
          break;
      }
    }

    // 为segGraph增加两个新Token: "始##始","末##末"
    charArray = Utility.START_CHAR_ARRAY;
    frequency = wordDict.getFrequency(charArray);
    token = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency);
    segGraph.addToken(token);

    // "末##末"
    charArray = Utility.END_CHAR_ARRAY;
    frequency = wordDict.getFrequency(charArray);
    token = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END,
        frequency);
    segGraph.addToken(token);

    return segGraph;
  }
View Full Code Here

    int i = 0, j;
    int length = sentence.length();
    int foundIndex;
    int[] charTypeArray = getCharTypes(sentence);
    StringBuilder wordBuf = new StringBuilder();
    SegToken token;
    int frequency = 0; // the number of times word appears.
    boolean hasFullWidth;
    int wordType;
    char[] charArray;

    SegGraph segGraph = new SegGraph();
    while (i < length) {
      hasFullWidth = false;
      switch (charTypeArray[i]) {
        case CharType.SPACE_LIKE:
          i++;
          break;
        case CharType.HANZI:
          j = i + 1;
          wordBuf.delete(0, wordBuf.length());
          // It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not,
          // it will store that single Chinese character (Hanzi) in the SegGraph.  Otherwise, it will
          // cause word division.
          wordBuf.append(sentence.charAt(i));
          charArray = new char[] { sentence.charAt(i) };
          frequency = wordDict.getFrequency(charArray);
          token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
              frequency);
          segGraph.addToken(token);

          foundIndex = wordDict.getPrefixMatch(charArray);
          while (j <= length && foundIndex != -1) {
            if (wordDict.isEqual(charArray, foundIndex) && charArray.length > 1) {
              // It is the phrase we are looking for; In other words, we have found a phrase SegToken
              // from i to j.  It is not a monosyllabic word (single word).
              frequency = wordDict.getFrequency(charArray);
              token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
                  frequency);
              segGraph.addToken(token);
            }

            while (j < length && charTypeArray[j] == CharType.SPACE_LIKE)
              j++;

            if (j < length && charTypeArray[j] == CharType.HANZI) {
              wordBuf.append(sentence.charAt(j));
              charArray = new char[wordBuf.length()];
              wordBuf.getChars(0, charArray.length, charArray, 0);
              // idArray has been found (foundWordIndex!=-1) as a prefix before. 
              // Therefore, idArray after it has been lengthened can only appear after foundWordIndex. 
              // So start searching after foundWordIndex.
              foundIndex = wordDict.getPrefixMatch(charArray, foundIndex);
              j++;
            } else {
              break;
            }
          }
          i++;
          break;
        case CharType.FULLWIDTH_LETTER:
          hasFullWidth = true; /* intentional fallthrough */
        case CharType.LETTER:
          j = i + 1;
          while (j < length
              && (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER)) {
            if (charTypeArray[j] == CharType.FULLWIDTH_LETTER)
              hasFullWidth = true;
            j++;
          }
          // Found a Token from i to j. Type is LETTER char string.
          charArray = Utility.STRING_CHAR_ARRAY;
          frequency = wordDict.getFrequency(charArray);
          wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING;
          token = new SegToken(charArray, i, j, wordType, frequency);
          segGraph.addToken(token);
          i = j;
          break;
        case CharType.FULLWIDTH_DIGIT:
          hasFullWidth = true; /* intentional fallthrough */
        case CharType.DIGIT:
          j = i + 1;
          while (j < length
              && (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT)) {
            if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT)
              hasFullWidth = true;
            j++;
          }
          // Found a Token from i to j. Type is NUMBER char string.
          charArray = Utility.NUMBER_CHAR_ARRAY;
          frequency = wordDict.getFrequency(charArray);
          wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER;
          token = new SegToken(charArray, i, j, wordType, frequency);
          segGraph.addToken(token);
          i = j;
          break;
        case CharType.DELIMITER:
          j = i + 1;
          // No need to search the weight for the punctuation.  Picking the highest frequency will work.
          frequency = Utility.MAX_FREQUENCE;
          charArray = new char[] { sentence.charAt(i) };
          token = new SegToken(charArray, i, j, WordType.DELIMITER, frequency);
          segGraph.addToken(token);
          i = j;
          break;
        default:
          j = i + 1;
          // Treat the unrecognized char symbol as unknown string.
          // For example, any symbol not in GB2312 is treated as one of these.
          charArray = Utility.STRING_CHAR_ARRAY;
          frequency = wordDict.getFrequency(charArray);
          token = new SegToken(charArray, i, j, WordType.STRING, frequency);
          segGraph.addToken(token);
          i = j;
          break;
      }
    }

    // Add two more Tokens: "beginning xx beginning"
    charArray = Utility.START_CHAR_ARRAY;
    frequency = wordDict.getFrequency(charArray);
    token = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency);
    segGraph.addToken(token);

    // "end xx end"
    charArray = Utility.END_CHAR_ARRAY;
    frequency = wordDict.getFrequency(charArray);
    token = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END,
        frequency);
    segGraph.addToken(token);

    return segGraph;
  }
View Full Code Here

  @Override
  protected boolean incrementWord() {
    if (tokens == null || !tokens.hasNext()) {
      return false;
    } else {
      SegToken token = tokens.next();
      clearAttributes();
      termAtt.copyBuffer(token.charArray, 0, token.charArray.length);
      offsetAtt.setOffset(correctOffset(token.startOffset), correctOffset(token.endOffset));
      typeAtt.setType("word");
      return true;
View Full Code Here

      }
    }
    // WordTokenFilter must clear attributes, as it is creating new tokens.
    clearAttributes();
    // There are remaining tokens from the current sentence, return the next one.
    SegToken nextWord = tokenIter.next();
    termAtt.copyBuffer(nextWord.charArray, 0, nextWord.charArray.length);
    if (hasIllegalOffsets) {
      offsetAtt.setOffset(tokStart, tokEnd);
    } else {
      offsetAtt.setOffset(nextWord.startOffset, nextWord.endOffset);
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.cn.smart.hhmm.SegToken

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.