Package org.apache.lucene.util

Examples of org.apache.lucene.util.CharsRef$UTF16SortedAsUTF8Comparator


    }

    ArrayList<String> tokens = new ArrayList<String>();
    ArrayList<Number> vals = new ArrayList<Number>();
    BytesRef spare;
    CharsRef charsSpare = new CharsRef();
    while ((spare = tfit.next()) != null) {
      charsSpare.grow(spare.length);
      UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare);
      tokens.add(charsSpare.toString());
      vals.add(Long.valueOf(tfit.weight()));
    }
    autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root);
  }
View Full Code Here


  private void add(String input, String output, boolean keepOrig) {
    if (VERBOSE) {
      System.out.println("  add input=" + input + " output=" + output + " keepOrig=" + keepOrig);
    }
    CharsRef inputCharsRef = new CharsRef();
    SynonymMap.Builder.join(input.split(" +"), inputCharsRef);

    CharsRef outputCharsRef = new CharsRef();
    SynonymMap.Builder.join(output.split(" +"), outputCharsRef);

    b.add(inputCharsRef, outputCharsRef, keepOrig);
  }
View Full Code Here

      }
    }
  }
 
  private void add(SynonymMap.Builder b, String input, String output, boolean keepOrig) {
    b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
          new CharsRef(output.replaceAll(" +", "\u0000")),
          keepOrig);
  }
View Full Code Here

   * @param termFreqMap a Map of terms and their frequencies
   * @param vector List of terms and their frequencies for a doc/field
   */
  private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
    final TermsEnum termsEnum = vector.iterator( null );
    final CharsRef spare = new CharsRef();
    BytesRef text;
    while ( ( text = termsEnum.next() ) != null ) {
      UnicodeUtil.UTF8toUTF16( text, spare );
      final String term = spare.toString();
      if ( isNoiseWord( term ) ) {
        continue;
      }
      final int freq = (int) termsEnum.totalTermFreq();

View Full Code Here

  public void testMaxPosition3WithSynomyms() throws IOException {
    MockTokenizer tokenizer = new MockTokenizer(new StringReader("one two three four five"), MockTokenizer.WHITESPACE, false);
    tokenizer.setEnableChecks(false); // LimitTokenPositionFilter doesn't consume the entire stream that it wraps
   
    SynonymMap.Builder builder = new SynonymMap.Builder(true);
    builder.add(new CharsRef("one"), new CharsRef("first"), true);
    builder.add(new CharsRef("one"), new CharsRef("alpha"), true);
    builder.add(new CharsRef("one"), new CharsRef("beguine"), true);
    CharsRef multiWordCharsRef = new CharsRef();
    SynonymMap.Builder.join(new String[] { "and", "indubitably", "single", "only" }, multiWordCharsRef);
    builder.add(new CharsRef("one"), multiWordCharsRef, true);
    SynonymMap.Builder.join(new String[]{"dopple", "ganger"}, multiWordCharsRef);
    builder.add(new CharsRef("two"), multiWordCharsRef, true);
    SynonymMap synonymMap = builder.build();
    TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true);
    stream = new LimitTokenPositionFilter(stream, 3); // consumeAllTokens defaults to false
   
    // "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3.
View Full Code Here

    this.delegate = delegate;
   
    for (String field : fields) {
      Set<String> stopWords = new HashSet<String>();
      Terms terms = MultiFields.getTerms(indexReader, field);
      CharsRef spare = new CharsRef();
      if (terms != null) {
        TermsEnum te = terms.iterator(null);
        BytesRef text;
        while ((text = te.next()) != null) {
          if (te.docFreq() > maxDocFreq) {
            UnicodeUtil.UTF8toUTF16(text, spare);
            stopWords.add(spare.toString());
          }
        }
      }
      stopWordsPerField.put(field, stopWords);
    }
View Full Code Here

      }

      final byte[] spare = new byte[5];
     
      Set<CharsRef> keys = workingSet.keySet();
      CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]);
      Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());

      final IntsRef scratchIntsRef = new IntsRef();
     
      //System.out.println("fmap.build");
      for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) {
        CharsRef input = sortedKeys[keyIdx];
        MapEntry output = workingSet.get(input);

        int numEntries = output.ords.size();
        // output size, assume the worst case
        int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry
View Full Code Here

    final BytesRef utf8Key = new BytesRef(key);
    try {

      Automaton lookupAutomaton = toLookupAutomaton(key);

      final CharsRef spare = new CharsRef();

      //System.out.println("  now intersect exactFirst=" + exactFirst);
   
      // Intersect automaton w/ suggest wFST and get all
      // prefix starting nodes & their outputs:
View Full Code Here

    if (prefixOutput == null) {
      return Collections.<LookupResult>emptyList();
    }
   
    List<LookupResult> results = new ArrayList<LookupResult>(num);
    CharsRef spare = new CharsRef();
    if (exactFirst && arc.isFinal()) {
      spare.grow(scratch.length);
      UnicodeUtil.UTF8toUTF16(scratch, spare);
      results.add(new LookupResult(spare.toString(), decodeWeight(prefixOutput + arc.nextFinalOutput)));
      if (--num == 0) {
        return results; // that was quick
      }
    }

    // complete top-N
    MinResult<Long> completions[] = null;
    try {
      completions = Util.shortestPaths(fst, arc, prefixOutput, weightComparator, num, !exactFirst);
    } catch (IOException bogus) {
      throw new RuntimeException(bogus);
    }
   
    BytesRef suffix = new BytesRef(8);
    for (MinResult<Long> completion : completions) {
      scratch.length = prefixLength;
      // append suffix
      Util.toBytesRef(completion.input, suffix);
      scratch.append(suffix);
      spare.grow(scratch.length);
      UnicodeUtil.UTF8toUTF16(scratch, spare);
      results.add(new LookupResult(spare.toString(), decodeWeight(completion.output)));
    }
    return results;
  }
View Full Code Here

   * @return sorted list of the suggested words according to the comparator
   * @throws IOException If there is a low-level I/O error.
   */
  public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir,
      SuggestMode suggestMode, float accuracy) throws IOException {
    final CharsRef spare = new CharsRef();
    String text = term.text();
    if (minQueryLength > 0 && text.codePointCount(0, text.length()) < minQueryLength)
      return new SuggestWord[0];
   
    if (lowerCaseTerms) {
      term = new Term(term.field(), text.toLowerCase(Locale.ROOT));
    }
   
    int docfreq = ir.docFreq(term);
   
    if (suggestMode==SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && docfreq > 0) {
      return new SuggestWord[0];
    }
   
    int maxDoc = ir.maxDoc();
   
    if (maxQueryFrequency >= 1f && docfreq > maxQueryFrequency) {
      return new SuggestWord[0];
    } else if (docfreq > (int) Math.ceil(maxQueryFrequency * (float)maxDoc)) {
      return new SuggestWord[0];
    }
   
    if (suggestMode!=SuggestMode.SUGGEST_MORE_POPULAR) docfreq = 0;
   
    if (thresholdFrequency >= 1f) {
      docfreq = Math.max(docfreq, (int) thresholdFrequency);
    } else if (thresholdFrequency > 0f) {
      docfreq = Math.max(docfreq, (int)(thresholdFrequency * (float)maxDoc)-1);
    }
   
    Collection<ScoreTerm> terms = null;
    int inspections = numSug * maxInspections;
   
    // try ed=1 first, in case we get lucky
    terms = suggestSimilar(term, inspections, ir, docfreq, 1, accuracy, spare);
    if (maxEdits > 1 && terms.size() < inspections) {
      HashSet<ScoreTerm> moreTerms = new HashSet<ScoreTerm>();
      moreTerms.addAll(terms);
      moreTerms.addAll(suggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy, spare));
      terms = moreTerms;
    }
   
    // create the suggestword response, sort it, and trim it to size.
   
    SuggestWord suggestions[] = new SuggestWord[terms.size()];
    int index = suggestions.length - 1;
    for (ScoreTerm s : terms) {
      SuggestWord suggestion = new SuggestWord();
      if (s.termAsString == null) {
        UnicodeUtil.UTF8toUTF16(s.term, spare);
        s.termAsString = spare.toString();
      }
      suggestion.string = s.termAsString;
      suggestion.score = s.score;
      suggestion.freq = s.docfreq;
      suggestions[index--] = suggestion;
View Full Code Here

TOP

Related Classes of org.apache.lucene.util.CharsRef$UTF16SortedAsUTF8Comparator

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.