Package org.apache.lucene.util

Examples of org.apache.lucene.util.CharsRef$UTF16SortedAsUTF8Comparator


        final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
        final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
        final IntsRef scratch = new IntsRef();
        for(Map.Entry<String,String> ent : pendingPairs.entrySet()) {
          builder.add(Util.toUTF16(ent.getKey(), scratch),
                      new CharsRef(ent.getValue()));
        }
        map = builder.finish();
        pendingPairs.clear();
      } catch (IOException ioe) {
        // Bogus FST IOExceptions!!  (will never happen)
View Full Code Here


      }

      final byte[] spare = new byte[5];
     
      Set<CharsRef> keys = workingSet.keySet();
      CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]);
      Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());

      final IntsRef scratchIntsRef = new IntsRef();
     
      //System.out.println("fmap.build");
      for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) {
        CharsRef input = sortedKeys[keyIdx];
        MapEntry output = workingSet.get(input);

        int numEntries = output.ords.size();
        // output size, assume the worst case
        int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry
View Full Code Here

    } else {
      this.origStemdict = null;
      // we don't need to ignore case here since we lowercase in this analyzer anyway
      StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false);
      CharArrayMap<String>.EntryIterator iter = stemOverrideDict.entrySet().iterator();
      CharsRef spare = new CharsRef();
      while (iter.hasNext()) {
        char[] nextKey = iter.nextKey();
        spare.copyChars(nextKey, 0, nextKey.length);
        builder.add(spare, iter.currentValue());
      }
      try {
        this.stemdict = builder.build();
      } catch (IOException ex) {
View Full Code Here

    this.delegate = delegate;
   
    for (String field : fields) {
      Set<String> stopWords = new HashSet<>();
      Terms terms = MultiFields.getTerms(indexReader, field);
      CharsRef spare = new CharsRef();
      if (terms != null) {
        TermsEnum te = terms.iterator(null);
        BytesRef text;
        while ((text = te.next()) != null) {
          if (te.docFreq() > maxDocFreq) {
            UnicodeUtil.UTF8toUTF16(text, spare);
            stopWords.add(spare.toString());
          }
        }
      }
      stopWordsPerField.put(field, stopWords);
    }
View Full Code Here

      } catch (IOException bogus) {
        throw new RuntimeException(bogus);
      }
      char cleaned[] = new char[scratchSegment.length()];
      scratchSegment.getChars(0, cleaned.length, cleaned, 0);
      return new CharsRef(cleaned, 0, cleaned.length);
    } else {
      return new CharsRef(buffer, 0, length);
    }
  }
View Full Code Here

    Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
    Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
    IntsRef scratchInts = new IntsRef();
    for (Map.Entry<String,String> entry : mappings.entrySet()) {
      Util.toUTF16(entry.getKey(), scratchInts);
      builder.add(scratchInts, new CharsRef(entry.getValue()));
    }
   
    return builder.finish();
  }
View Full Code Here

 
  // TODO: this could be more efficient!
  static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException {
    final FST.BytesReader bytesReader = fst.getBytesReader();
    final FST.Arc<CharsRef> firstArc = fst.getFirstArc(new FST.Arc<CharsRef>());
    final CharsRef NO_OUTPUT = fst.outputs.getNoOutput();
   
    // temporary stuff
    final FST.Arc<CharsRef> arc = new FST.Arc<>();
    int longestMatch;
    CharsRef longestOutput;
   
    for (int i = 0; i < sb.length(); i++) {
      arc.copyFrom(firstArc);
      CharsRef output = NO_OUTPUT;
      longestMatch = -1;
      longestOutput = null;
     
      for (int j = i; j < sb.length(); j++) {
        char ch = sb.charAt(j);
View Full Code Here

   * @param termFreqMap a Map of terms and their frequencies
   * @param vector List of terms and their frequencies for a doc/field
   */
  private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
    final TermsEnum termsEnum = vector.iterator(null);
    final CharsRef spare = new CharsRef();
    BytesRef text;
    while((text = termsEnum.next()) != null) {
      UnicodeUtil.UTF8toUTF16(text, spare);
      final String term = spare.toString();
      if (isNoiseWord(term)) {
        continue;
      }
      final int freq = (int) termsEnum.totalTermFreq();

View Full Code Here

    while ((line = in.readLine()) != null) {
      if (line.length() == 0 || line.charAt(0) == '#') {
        continue; // ignore empty lines and comments
      }
     
      CharsRef inputs[];
      CharsRef outputs[];
     
      // TODO: we could process this more efficiently.
      String sides[] = split(line, "=>");
      if (sides.length > 1) { // explicit mapping
        if (sides.length != 2) {
          throw new IllegalArgumentException("more than one explicit mapping specified on the same line");
        }
        String inputStrings[] = split(sides[0], ",");
        inputs = new CharsRef[inputStrings.length];
        for (int i = 0; i < inputs.length; i++) {
          inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
        }
       
        String outputStrings[] = split(sides[1], ",");
        outputs = new CharsRef[outputStrings.length];
        for (int i = 0; i < outputs.length; i++) {
          outputs[i] = analyze(analyzer, unescape(outputStrings[i]).trim(), new CharsRef());
        }
      } else {
        String inputStrings[] = split(line, ",");
        inputs = new CharsRef[inputStrings.length];
        for (int i = 0; i < inputs.length; i++) {
          inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
        }
        if (expand) {
          outputs = inputs;
        } else {
          outputs = new CharsRef[1];
View Full Code Here

  public void add(Reader in) throws IOException, ParseException {
    LineNumberReader br = new LineNumberReader(in);
    try {
      String line = null;
      String lastSynSetID = "";
      CharsRef synset[] = new CharsRef[8];
      int synsetSize = 0;
     
      while ((line = br.readLine()) != null) {
        String synSetID = line.substring(2, 11);

        if (!synSetID.equals(lastSynSetID)) {
          addInternal(synset, synsetSize);
          synsetSize = 0;
        }

        if (synset.length <= synsetSize+1) {
          CharsRef larger[] = new CharsRef[synset.length * 2];
          System.arraycopy(synset, 0, larger, 0, synsetSize);
          synset = larger;
        }
       
        synset[synsetSize] = parseSynonym(line, synset[synsetSize]);
View Full Code Here

TOP

Related Classes of org.apache.lucene.util.CharsRef$UTF16SortedAsUTF8Comparator

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle. Contact coftware#gmail.com.