Package org.apache.lucene.util

Examples of org.apache.lucene.util.CharsRef$UTF16SortedAsUTF8Comparator


  CharsRef utf16Result2;
  private final BytesRef scratchBytes = new BytesRef();

  // Currently used only by assert statements
  private boolean initUTF16Results() {
    utf16Result1 = new CharsRef(10);
    utf16Result2 = new CharsRef(10);
    return true;
  }
View Full Code Here


  private void add(String input, String output, boolean keepOrig) {
    if (VERBOSE) {
      System.out.println("  add input=" + input + " output=" + output + " keepOrig=" + keepOrig);
    }
    CharsRef inputCharsRef = new CharsRef();
    SynonymMap.Builder.join(input.split(" +"), inputCharsRef);

    CharsRef outputCharsRef = new CharsRef();
    SynonymMap.Builder.join(output.split(" +"), outputCharsRef);

    b.add(inputCharsRef, outputCharsRef, keepOrig);
  }
View Full Code Here

    while ((line = in.readLine()) != null) {
      if (line.length() == 0 || line.charAt(0) == '#') {
        continue; // ignore empty lines and comments
      }
     
      CharsRef inputs[];
      CharsRef outputs[];
     
      // TODO: we could process this more efficiently.
      String sides[] = split(line, "=>");
      if (sides.length > 1) { // explicit mapping
        if (sides.length != 2) {
          throw new IllegalArgumentException("more than one explicit mapping specified on the same line");
        }
        String inputStrings[] = split(sides[0], ",");
        inputs = new CharsRef[inputStrings.length];
        for (int i = 0; i < inputs.length; i++) {
          inputs[i] = analyze(unescape(inputStrings[i]).trim(), new CharsRef());
        }
       
        String outputStrings[] = split(sides[1], ",");
        outputs = new CharsRef[outputStrings.length];
        for (int i = 0; i < outputs.length; i++) {
          outputs[i] = analyze(unescape(outputStrings[i]).trim(), new CharsRef());
        }
      } else {
        String inputStrings[] = split(line, ",");
        inputs = new CharsRef[inputStrings.length];
        for (int i = 0; i < inputs.length; i++) {
          inputs[i] = analyze(unescape(inputStrings[i]).trim(), new CharsRef());
        }
        if (expand) {
          outputs = inputs;
        } else {
          outputs = new CharsRef[1];
View Full Code Here

  public void testRandomUnicodeStrings() throws Throwable {
    char[] buffer = new char[20];
    char[] expected = new char[20];

    BytesRef utf8 = new BytesRef(20);
    CharsRef utf16 = new CharsRef(20);

    int num = atLeast(100000);
    for (int iter = 0; iter < num; iter++) {
      boolean hasIllegal = fillUnicode(buffer, expected, 0, 20);
View Full Code Here

  // LUCENE-510
  public void testAllUnicodeChars() throws Throwable {

    BytesRef utf8 = new BytesRef(10);
    CharsRef utf16 = new CharsRef(10);
    char[] chars = new char[2];
    for(int ch=0;ch<0x0010FFFF;ch++) {

      if (ch == 0xd800)
        // Skip invalid code points
View Full Code Here

    }

    ArrayList<String> tokens = new ArrayList<String>();
    ArrayList<Number> vals = new ArrayList<Number>();
    BytesRef spare;
    CharsRef charsSpare = new CharsRef();
    while ((spare = tfit.next()) != null) {
      charsSpare.grow(spare.length);
      UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare);
      tokens.add(charsSpare.toString());
      vals.add(Long.valueOf(tfit.weight()));
    }
    autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root);
  }
View Full Code Here

      tfit = new UnsortedInputIterator(tfit);
    }
    trie = new JaspellTernarySearchTrie();
    trie.setMatchAlmostDiff(editDistance);
    BytesRef spare;
    final CharsRef charsSpare = new CharsRef();

    while ((spare = tfit.next()) != null) {
      final long weight = tfit.weight();
      if (spare.length == 0) {
        continue;
      }
      charsSpare.grow(spare.length);
      UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare);
      trie.put(charsSpare.toString(), Long.valueOf(weight));
    }
  }
View Full Code Here

    int maxCnt = Math.min(num, list.size());
    if (onlyMorePopular) {
      LookupPriorityQueue queue = new LookupPriorityQueue(num);
      for (String s : list) {
        long freq = ((Number)trie.get(s)).longValue();
        queue.insertWithOverflow(new LookupResult(new CharsRef(s), freq));
      }
      for (LookupResult lr : queue.getResults()) {
        res.add(lr);
      }
    } else {
      for (int i = 0; i < maxCnt; i++) {
        String s = list.get(i);
        long freq = ((Number)trie.get(s)).longValue();
        res.add(new LookupResult(new CharsRef(s), freq));
      }     
    }
    return res;
  }
View Full Code Here

   * strings in UTF-8. These strings must be binary-sorted.
   */
  public static Automaton build(Collection<BytesRef> input) {
    final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();
   
    CharsRef scratch = new CharsRef();
    for (BytesRef b : input) {
      UnicodeUtil.UTF8toUTF16(b, scratch);
      builder.add(scratch);
    }
   
View Full Code Here

      // loop on the initial node, and 3) determinizing
      // that.  Then we would not have to restart matching
      // at each position.

      int lastMatchLen = -1;
      CharsRef lastMatch = null;

      final int firstCH = buffer.get(inputOff);
      if (firstCH != -1) {
        FST.Arc<CharsRef> arc = cachedRootArcs.get(Character.valueOf((char) firstCH));
        if (arc != null) {
          if (!FST.targetHasArcs(arc)) {
            // Fast pass for single character match:
            assert arc.isFinal();
            lastMatchLen = 1;
            lastMatch = arc.output;
          } else {
            int lookahead = 0;
            CharsRef output = arc.output;
            while (true) {
              lookahead++;

              if (arc.isFinal()) {
                // Match! (to node is final)
View Full Code Here

TOP

Related Classes of org.apache.lucene.util.CharsRef$UTF16SortedAsUTF8Comparator

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.