Package org.apache.lucene.util

Examples of org.apache.lucene.util.IntsRef


      final FST<CharsRef> map;
      try {
        final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
        final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs);
        final IntsRef scratch = new IntsRef();
        for(Map.Entry<String,String> ent : pendingPairs.entrySet()) {
          builder.add(Util.toUTF16(ent.getKey(), scratch),
                      new CharsRef(ent.getValue()));
        }
        map = builder.finish();
View Full Code Here


    List<String> data = new ArrayList<String>(featureEntries.size());
    List<int[]> segmentations = new ArrayList<int[]>(featureEntries.size());
   
    PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
    Builder<Long> fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, fstOutput);
    IntsRef scratch = new IntsRef();
    long ord = 0;
   
    for (String[] values : featureEntries) {
      String[] segmentation = values[1].replaceAll("  *", " ").split(" ");
      String[] readings = values[2].replaceAll("  *", " ").split(" ");
      String pos = values[3];
     
      if (segmentation.length != readings.length) {
        throw new RuntimeException("Illegal user dictionary entry " + values[0] +
                                   " - the number of segmentations (" + segmentation.length + ")" +
                                   " does not the match number of readings (" + readings.length + ")");
      }
     
      int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
      wordIdAndLength[0] = wordId;
      for (int i = 0; i < segmentation.length; i++) {
        wordIdAndLength[i + 1] = segmentation[i].length();
        data.add(readings[i] + INTERNAL_SEPARATOR + pos);
        wordId++;
      }
      // add mapping to FST
      String token = values[0];
      scratch.grow(token.length());
      scratch.length = token.length();
      for (int i = 0; i < token.length(); i++) {
        scratch.ints[i] = (int) token.charAt(i);
      }
      fstBuilder.add(scratch, ord);
View Full Code Here

    public StemmerOverrideMap build() throws IOException {
      ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
      org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<BytesRef>(
          FST.INPUT_TYPE.BYTE4, outputs);
      final int[] sort = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
      IntsRef intsSpare = new IntsRef();
      final int size = hash.size();
      for (int i = 0; i < size; i++) {
        int id = sort[i];
        BytesRef bytesRef = hash.get(id, spare);
        UnicodeUtil.UTF8toUTF32(bytesRef, intsSpare);
View Full Code Here

    }
    return new String(buffer, 0, end);
  }

  static IntsRef toIntsRef(String s, int inputMode) {
    return toIntsRef(s, inputMode, new IntsRef(10));
  }
View Full Code Here

      System.out.println("TEST: check valid terms/next()");
    }
    {
      IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
      for(InputOutput<T> pair : pairs) {
        IntsRef term = pair.input;
        if (LuceneTestCase.VERBOSE) {
          System.out.println("TEST: check term=" + inputToString(inputMode, term) + " output=" + fst.outputs.outputToString(pair.output));
        }
        T output = run(fst, term, null);
        assertNotNull("term " + inputToString(inputMode, term) + " is not accepted", output);
        assertTrue(outputsEqual(pair.output, output));

        // verify enum's next
        IntsRefFSTEnum.InputOutput<T> t = fstEnum.next();
        assertNotNull(t);
        assertEquals("expected input=" + inputToString(inputMode, term) + " but fstEnum returned " + inputToString(inputMode, t.input), term, t.input);
        assertTrue(outputsEqual(pair.output, t.output));
      }
      assertNull(fstEnum.next());
    }

    final Map<IntsRef,T> termsMap = new HashMap<IntsRef,T>();
    for(InputOutput<T> pair : pairs) {
      termsMap.put(pair.input, pair.output);
    }

    if (doReverseLookup && maxLong > minLong) {
      // Do random lookups so we test null (output doesn't
      // exist) case:
      assertNull(Util.getByOutput(fstLong, minLong-7));
      assertNull(Util.getByOutput(fstLong, maxLong+7));

      final int num = LuceneTestCase.atLeast(random, 100);
      for(int iter=0;iter<num;iter++) {
        Long v = _TestUtil.nextLong(random, minLong, maxLong);
        IntsRef input = Util.getByOutput(fstLong, v);
        assertTrue(validOutputs.contains(v) || input == null);
      }
    }

    // find random matching word and make sure it's valid
    if (LuceneTestCase.VERBOSE) {
      System.out.println("TEST: verify random accepted terms");
    }
    final IntsRef scratch = new IntsRef(10);
    int num = LuceneTestCase.atLeast(random, 500);
    for(int iter=0;iter<num;iter++) {
      T output = randomAcceptedWord(fst, scratch);
      assertTrue("accepted word " + inputToString(inputMode, scratch) + " is not valid", termsMap.containsKey(scratch));
      assertTrue(outputsEqual(termsMap.get(scratch), output));

      if (doReverseLookup) {
        //System.out.println("lookup output=" + output + " outs=" + fst.outputs);
        IntsRef input = Util.getByOutput(fstLong, (Long) output);
        assertNotNull(input);
        //System.out.println("  got " + Util.toBytesRef(input, new BytesRef()).utf8ToString());
        assertEquals(scratch, input);
      }
    }
   
    // test IntsRefFSTEnum.seek:
    if (LuceneTestCase.VERBOSE) {
      System.out.println("TEST: verify seek");
    }
    IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
    num = LuceneTestCase.atLeast(random, 100);
    for(int iter=0;iter<num;iter++) {
      if (LuceneTestCase.VERBOSE) {
        System.out.println("  iter=" + iter);
      }
      if (random.nextBoolean()) {
        // seek to term that doesn't exist:
        while(true) {
          final IntsRef term = toIntsRef(getRandomString(random), inputMode);
          int pos = Collections.binarySearch(pairs, new InputOutput<T>(term, null));
          if (pos < 0) {
            pos = -(pos+1);
            // ok doesn't exist
            //System.out.println("  seek " + inputToString(inputMode, term));
            final IntsRefFSTEnum.InputOutput<T> seekResult;
            if (random.nextInt(3) == 0) {
              if (LuceneTestCase.VERBOSE) {
                System.out.println("  do non-exist seekExact term=" + inputToString(inputMode, term));
              }
              seekResult = fstEnum.seekExact(term);
              pos = -1;
            } else if (random.nextBoolean()) {
              if (LuceneTestCase.VERBOSE) {
                System.out.println("  do non-exist seekFloor term=" + inputToString(inputMode, term));
              }
              seekResult = fstEnum.seekFloor(term);
              pos--;
            } else {
              if (LuceneTestCase.VERBOSE) {
                System.out.println("  do non-exist seekCeil term=" + inputToString(inputMode, term));
              }
              seekResult = fstEnum.seekCeil(term);
            }

            if (pos != -1 && pos < pairs.size()) {
              //System.out.println("    got " + inputToString(inputMode,seekResult.input) + " output=" + fst.outputs.outputToString(seekResult.output));
              assertNotNull("got null but expected term=" + inputToString(inputMode, pairs.get(pos).input), seekResult);
              if (LuceneTestCase.VERBOSE) {
                System.out.println("    got " + inputToString(inputMode, seekResult.input));
              }
              assertEquals("expected " + inputToString(inputMode, pairs.get(pos).input) + " but got " + inputToString(inputMode, seekResult.input), pairs.get(pos).input, seekResult.input);
              assertTrue(outputsEqual(pairs.get(pos).output, seekResult.output));
            } else {
              // seeked before start or beyond end
              //System.out.println("seek=" + seekTerm);
              assertNull("expected null but got " + (seekResult==null ? "null" : inputToString(inputMode, seekResult.input)), seekResult);
              if (LuceneTestCase.VERBOSE) {
                System.out.println("    got null");
              }
            }

            break;
          }
        }
      } else {
        // seek to term that does exist:
        InputOutput<T> pair = pairs.get(random.nextInt(pairs.size()));
        final IntsRefFSTEnum.InputOutput<T> seekResult;
        if (random.nextInt(3) == 2) {
          if (LuceneTestCase.VERBOSE) {
            System.out.println("  do exists seekExact term=" + inputToString(inputMode, pair.input));
          }
          seekResult = fstEnum.seekExact(pair.input);
        } else if (random.nextBoolean()) {
          if (LuceneTestCase.VERBOSE) {
            System.out.println("  do exists seekFloor " + inputToString(inputMode, pair.input));
          }
          seekResult = fstEnum.seekFloor(pair.input);
        } else {
          if (LuceneTestCase.VERBOSE) {
            System.out.println("  do exists seekCeil " + inputToString(inputMode, pair.input));
          }
          seekResult = fstEnum.seekCeil(pair.input);
        }
        assertNotNull(seekResult);
        assertEquals("got " + inputToString(inputMode, seekResult.input) + " but expected " + inputToString(inputMode, pair.input), pair.input, seekResult.input);
        assertTrue(outputsEqual(pair.output, seekResult.output));
      }
    }

    if (LuceneTestCase.VERBOSE) {
      System.out.println("TEST: mixed next/seek");
    }

    // test mixed next/seek
    num = LuceneTestCase.atLeast(random, 100);
    for(int iter=0;iter<num;iter++) {
      if (LuceneTestCase.VERBOSE) {
        System.out.println("TEST: iter " + iter);
      }
      // reset:
      fstEnum = new IntsRefFSTEnum<T>(fst);
      int upto = -1;
      while(true) {
        boolean isDone = false;
        if (upto == pairs.size()-1 || random.nextBoolean()) {
          // next
          upto++;
          if (LuceneTestCase.VERBOSE) {
            System.out.println("  do next");
          }
          isDone = fstEnum.next() == null;
        } else if (upto != -1 && upto < 0.75 * pairs.size() && random.nextBoolean()) {
          int attempt = 0;
          for(;attempt<10;attempt++) {
            IntsRef term = toIntsRef(getRandomString(random), inputMode);
            if (!termsMap.containsKey(term) && term.compareTo(pairs.get(upto).input) > 0) {
              int pos = Collections.binarySearch(pairs, new InputOutput<T>(term, null));
              assert pos < 0;
              upto = -(pos+1);

              if (random.nextBoolean()) {
View Full Code Here

    //System.out.println("TEST: tally prefixes");

    // build all prefixes
    final Map<IntsRef,CountMinOutput<T>> prefixes = new HashMap<IntsRef,CountMinOutput<T>>();
    final IntsRef scratch = new IntsRef(10);
    for(InputOutput<T> pair: pairs) {
      scratch.copyInts(pair.input);
      for(int idx=0;idx<=pair.input.length;idx++) {
        scratch.length = idx;
        CountMinOutput<T> cmo = prefixes.get(scratch);
        if (cmo == null) {
          cmo = new CountMinOutput<T>();
          cmo.count = 1;
          cmo.output = pair.output;
          prefixes.put(IntsRef.deepCopyOf(scratch), cmo);
        } else {
          cmo.count++;
          T output1 = cmo.output;
          if (output1.equals(outputs.getNoOutput())) {
            output1 = outputs.getNoOutput();
          }
          T output2 = pair.output;
          if (output2.equals(outputs.getNoOutput())) {
            output2 = outputs.getNoOutput();
          }
          cmo.output = outputs.common(output1, output2);
        }
        if (idx == pair.input.length) {
          cmo.isFinal = true;
          cmo.finalOutput = cmo.output;
        }
      }
    }

    if (LuceneTestCase.VERBOSE) {
      System.out.println("TEST: now prune");
    }

    // prune 'em
    final Iterator<Map.Entry<IntsRef,CountMinOutput<T>>> it = prefixes.entrySet().iterator();
    while(it.hasNext()) {
      Map.Entry<IntsRef,CountMinOutput<T>> ent = it.next();
      final IntsRef prefix = ent.getKey();
      final CountMinOutput<T> cmo = ent.getValue();
      if (LuceneTestCase.VERBOSE) {
        System.out.println("  term prefix=" + inputToString(inputMode, prefix, false) + " count=" + cmo.count + " isLeaf=" + cmo.isLeaf + " output=" + outputs.outputToString(cmo.output) + " isFinal=" + cmo.isFinal);
      }
      final boolean keep;
View Full Code Here

      TermsEnum termsEnum = terms.iterator(null);

      Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
      Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);

      IntsRef scratchInts = new IntsRef();
      while (true) {
        BytesRef term = termsEnum.next();
        if (term == null) {
          break;
        }
View Full Code Here

        BytesRef lastTokenFragment = null;
       
        for(int i=token.length-1;i>=0;i--) {
          if (token.bytes[token.offset+i] == separator) {
            BytesRef context = new BytesRef(token.bytes, token.offset, i);
            Long output = Util.get(fst, Util.toIntsRef(context, new IntsRef()));
            assert output != null;
            contextCount = decodeWeight(output);
            lastTokenFragment = new BytesRef(token.bytes, token.offset + i + 1, token.length - i - 1);
            break;
          }
        }
       
        final BytesRef finalLastToken;
       
        if (lastTokenFragment == null) {
          finalLastToken = BytesRef.deepCopyOf(token);
        } else {
          finalLastToken = BytesRef.deepCopyOf(lastTokenFragment);
        }
        assert finalLastToken.offset == 0;
       
        CharsRef spare = new CharsRef();
       
        // complete top-N
        MinResult<Long> completions[] = null;
        try {
         
          // Because we store multiple models in one FST
          // (1gram, 2gram, 3gram), we must restrict the
          // search so that it only considers the current
          // model.  For highest order model, this is not
          // necessary since all completions in the FST
          // must be from this model, but for lower order
          // models we have to filter out the higher order
          // ones:
         
          // Must do num+seen.size() for queue depth because we may
          // reject up to seen.size() paths in acceptResult():
          Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, num, num+seen.size(), weightComparator) {
           
            BytesRef scratchBytes = new BytesRef();
           
            @Override
            protected void addIfCompetitive(Util.FSTPath<Long> path) {
              if (path.arc.label != separator) {
                //System.out.println("    keep path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
                super.addIfCompetitive(path);
              } else {
                //System.out.println("    prevent path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
              }
            }
           
            @Override
            protected boolean acceptResult(IntsRef input, Long output) {
              Util.toBytesRef(input, scratchBytes);
              finalLastToken.grow(finalLastToken.length + scratchBytes.length);
              int lenSav = finalLastToken.length;
              finalLastToken.append(scratchBytes);
              //System.out.println("    accept? input='" + scratchBytes.utf8ToString() + "'; lastToken='" + finalLastToken.utf8ToString() + "'; return " + (seen.contains(finalLastToken) == false));
              boolean ret = seen.contains(finalLastToken) == false;
             
              finalLastToken.length = lenSav;
              return ret;
            }
          };
         
          // since this search is initialized with a single start node
          // it is okay to start with an empty input path here
          searcher.addStartPaths(arc, prefixOutput, true, new IntsRef());
         
          completions = searcher.search();
        } catch (IOException bogus) {
          throw new RuntimeException(bogus);
        }
View Full Code Here

        w.close();
        */

        if (indexDivisor > 1) {
          // subsample
          final IntsRef scratchIntsRef = new IntsRef();
          final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
          final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
          final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<Long>(fst);
          BytesRefFSTEnum.InputOutput<Long> result;
          int count = indexDivisor;
View Full Code Here

      // build the partitionOrdinals map
      final HashMap<String,IntsRef> partitionOrdinals = new HashMap<String,IntsRef>();
      for (int i = 0; i < ordinals.length; i++) {
        int ordinal = ordinals.ints[i];
        final String name = PartitionsUtils.partitionNameByOrdinal(indexingParams, ordinal);
        IntsRef partitionOrds = partitionOrdinals.get(name);
        if (partitionOrds == null) {
          partitionOrds = new IntsRef(32);
          partitionOrdinals.put(name, partitionOrds);
          partitionEncoder.put(name, categoryListParams.createEncoder());
        }
        partitionOrds.ints[partitionOrds.length++] = ordinal % partitionSize;
      }
View Full Code Here

TOP

Related Classes of org.apache.lucene.util.IntsRef

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.