Package org.apache.lucene.util

Examples of org.apache.lucene.util.CharsRef$UTF16SortedAsUTF8Comparator


    public CharsRef pullNext() {
      assert upto < count;
      lastEndOffset = endOffsets[upto];
      lastPosLength = posLengths[upto];
      final CharsRef result = outputs[upto++];
      posIncr = 0;
      if (upto == count) {
        reset();
      }
      return result;
View Full Code Here


        final int[] next = new int[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_INT)];
        System.arraycopy(posLengths, 0, next, 0, count);
        posLengths = next;
      }
      if (outputs[count] == null) {
        outputs[count] = new CharsRef();
      }
      outputs[count].copyChars(output, offset, len);
      // endOffset can be -1, in which case we should simply
      // use the endOffset of the input token, or X >= 0, in
      // which case we use X as the endOffset for this output
View Full Code Here

      }

      final byte[] spare = new byte[5];
     
      Set<CharsRef> keys = workingSet.keySet();
      CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]);
      Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());

      final IntsRef scratchIntsRef = new IntsRef();
     
      //System.out.println("fmap.build");
      for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) {
        CharsRef input = sortedKeys[keyIdx];
        MapEntry output = workingSet.get(input);

        int numEntries = output.ords.size();
        // output size, assume the worst case
        int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry
View Full Code Here

      }
    }
  }
 
  private void add(SynonymMap.Builder b, String input, String output, boolean keepOrig) {
    b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
          new CharsRef(output.replaceAll(" +", "\u0000")),
          keepOrig);
  }
View Full Code Here

  @Override
  public Bits readLiveDocs(Directory dir, SegmentCommitInfo info, IOContext context) throws IOException {
    assert info.hasDeletions();
    BytesRef scratch = new BytesRef();
    CharsRef scratchUTF16 = new CharsRef();
   
    String fileName = IndexFileNames.fileNameFromGeneration(info.info.name, LIVEDOCS_EXTENSION, info.getDelGen());
    IndexInput in = null;
    boolean success = false;
    try {
View Full Code Here

    if (prefixOutput == null) {
      return Collections.<LookupResult>emptyList();
    }
   
    List<LookupResult> results = new ArrayList<LookupResult>(num);
    CharsRef spare = new CharsRef();
    if (exactFirst && arc.isFinal()) {
      spare.grow(scratch.length);
      UnicodeUtil.UTF8toUTF16(scratch, spare);
      results.add(new LookupResult(spare.toString(), decodeWeight(prefixOutput + arc.nextFinalOutput)));
      if (--num == 0) {
        return results; // that was quick
      }
    }

    // complete top-N
    MinResult<Long> completions[] = null;
    try {
      completions = Util.shortestPaths(fst, arc, prefixOutput, weightComparator, num, !exactFirst);
    } catch (IOException bogus) {
      throw new RuntimeException(bogus);
    }
   
    BytesRef suffix = new BytesRef(8);
    for (MinResult<Long> completion : completions) {
      scratch.length = prefixLength;
      // append suffix
      Util.toBytesRef(completion.input, suffix);
      scratch.append(suffix);
      spare.grow(scratch.length);
      UnicodeUtil.UTF8toUTF16(scratch, spare);
      results.add(new LookupResult(spare.toString(), decodeWeight(completion.output)));
    }
    return results;
  }
View Full Code Here

    final BytesRef utf8Key = new BytesRef(key);
    try {

      Automaton lookupAutomaton = toLookupAutomaton(key);

      final CharsRef spare = new CharsRef();

      //System.out.println("  now intersect exactFirst=" + exactFirst);
   
      // Intersect automaton w/ suggest wFST and get all
      // prefix starting nodes & their outputs:
View Full Code Here

  private void add(String input, String output, boolean keepOrig) {
    if (VERBOSE) {
      System.out.println("  add input=" + input + " output=" + output + " keepOrig=" + keepOrig);
    }
    CharsRef inputCharsRef = new CharsRef();
    SynonymMap.Builder.join(input.split(" +"), inputCharsRef);

    CharsRef outputCharsRef = new CharsRef();
    SynonymMap.Builder.join(output.split(" +"), outputCharsRef);

    b.add(inputCharsRef, outputCharsRef, keepOrig);
  }
View Full Code Here

        } else {
          finalLastToken = BytesRef.deepCopyOf(lastTokenFragment);
        }
        assert finalLastToken.offset == 0;
       
        CharsRef spare = new CharsRef();
       
        // complete top-N
        MinResult<Long> completions[] = null;
        try {
         
          // Because we store multiple models in one FST
          // (1gram, 2gram, 3gram), we must restrict the
          // search so that it only considers the current
          // model.  For highest order model, this is not
          // necessary since all completions in the FST
          // must be from this model, but for lower order
          // models we have to filter out the higher order
          // ones:
         
          // Must do num+seen.size() for queue depth because we may
          // reject up to seen.size() paths in acceptResult():
          Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, num, num+seen.size(), weightComparator) {
           
            BytesRef scratchBytes = new BytesRef();
           
            @Override
            protected void addIfCompetitive(Util.FSTPath<Long> path) {
              if (path.arc.label != separator) {
                //System.out.println("    keep path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
                super.addIfCompetitive(path);
              } else {
                //System.out.println("    prevent path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
              }
            }
           
            @Override
            protected boolean acceptResult(IntsRef input, Long output) {
              Util.toBytesRef(input, scratchBytes);
              finalLastToken.grow(finalLastToken.length + scratchBytes.length);
              int lenSav = finalLastToken.length;
              finalLastToken.append(scratchBytes);
              //System.out.println("    accept? input='" + scratchBytes.utf8ToString() + "'; lastToken='" + finalLastToken.utf8ToString() + "'; return " + (seen.contains(finalLastToken) == false));
              boolean ret = seen.contains(finalLastToken) == false;
             
              finalLastToken.length = lenSav;
              return ret;
            }
          };
         
          // since this search is initialized with a single start node
          // it is okay to start with an empty input path here
          searcher.addStartPaths(arc, prefixOutput, true, new IntsRef());
         
          completions = searcher.search();
        } catch (IOException bogus) {
          throw new RuntimeException(bogus);
        }
       
        int prefixLength = token.length;
       
        BytesRef suffix = new BytesRef(8);
        //System.out.println("    " + completions.length + " completions");
       
        nextCompletion:
          for (MinResult<Long> completion : completions) {
            token.length = prefixLength;
            // append suffix
            Util.toBytesRef(completion.input, suffix);
            token.append(suffix);
           
            //System.out.println("    completion " + token.utf8ToString());
           
            // Skip this path if a higher-order model already
            // saw/predicted its last token:
            BytesRef lastToken = token;
            for(int i=token.length-1;i>=0;i--) {
              if (token.bytes[token.offset+i] == separator) {
                assert token.length-i-1 > 0;
                lastToken = new BytesRef(token.bytes, token.offset+i+1, token.length-i-1);
                break;
              }
            }
            if (seen.contains(lastToken)) {
              //System.out.println("      skip dup " + lastToken.utf8ToString());
              continue nextCompletion;
            }
            seen.add(BytesRef.deepCopyOf(lastToken));
            spare.grow(token.length);
            UnicodeUtil.UTF8toUTF16(token, spare);
            LookupResult result = new LookupResult(spare.toString(), (long) (Long.MAX_VALUE * backoff * ((double) decodeWeight(completion.output)) / contextCount));
            results.add(result);
            assert results.size() == seen.size();
            //System.out.println("  add result=" + result);
          }
        backoff *= ALPHA;
View Full Code Here

    }

    ArrayList<String> tokens = new ArrayList<String>();
    ArrayList<Number> vals = new ArrayList<Number>();
    BytesRef spare;
    CharsRef charsSpare = new CharsRef();
    while ((spare = tfit.next()) != null) {
      charsSpare.grow(spare.length);
      UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare);
      tokens.add(charsSpare.toString());
      vals.add(Long.valueOf(tfit.weight()));
    }
    autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root);
  }
View Full Code Here

TOP

Related Classes of org.apache.lucene.util.CharsRef$UTF16SortedAsUTF8Comparator

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.