Package org.apache.lucene.analysis.shingle

Examples of org.apache.lucene.analysis.shingle.ShingleFilter$InputWindowToken


    StringTuple value = it.next();

    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size

    if (maxNGramSize >= 2) {
      ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxNGramSize);

      do {
        String term = (sf.getAttribute(CharTermAttribute.class)).toString();
        if (term.length() > 0 && dictionary.containsKey(term)) { // ngram
          int termId = dictionary.get(term);
          vector.setQuick(termId, vector.getQuick(termId) + 1);
        }
      } while (sf.incrementToken());

      sf.end();
      sf.close();
    } else {
      for (String term : value.getEntries()) {
        if (term.length() > 0 && dictionary.containsKey(term)) { // unigram
          int termId = dictionary.get(term);
          vector.setQuick(termId, vector.getQuick(termId) + 1);
View Full Code Here


   *  Tests PositionFilter setting all but the first positionIncrement to zero.
   * @throws java.io.IOException @see Token#next(Token)
   */
  public void test6GramFilterNoPositions() throws Exception {

    ShingleFilter filter = new ShingleFilter(new TestTokenStream(TEST_TOKEN), 6);
    assertTokenStreamContents(new PositionFilter(filter),
               SIX_GRAM_NO_POSITIONS_TOKENS,
               SIX_GRAM_NO_POSITIONS_INCREMENTS);
  }
View Full Code Here

          return other;
        }

        @Override
        protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
          ShingleFilter shingles = new ShingleFilter(components.getTokenStream(), 2, grams);
          shingles.setTokenSeparator(Character.toString((char) separator));
          return new TokenStreamComponents(components.getTokenizer(), shingles);
        }
      };
    }
  }
View Full Code Here

    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 94);
        //TokenStream stream = new SopTokenFilter(tokenizer);
        TokenStream stream = new ShingleFilter(tokenizer, 5);
        //stream = new SopTokenFilter(stream);
        stream = new NGramTokenFilter(TEST_VERSION_CURRENT, stream, 55, 83);
        //stream = new SopTokenFilter(stream);
        return new TokenStreamComponents(tokenizer, stream);
     
View Full Code Here

    checkAnalysisConsistency(random, b, random.nextBoolean(), "");
  }

  public void testGraphs() throws IOException {
    TokenStream tk = new LetterTokenizer(TEST_VERSION_CURRENT, new StringReader("abc d efgh ij klmno p q"));
    tk = new ShingleFilter(tk);
    tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, 7, 10);
    assertTokenStreamContents(tk,
        new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" },
        new int[]    { 6,11,11,14 },
        new int[]    { 13,19,19,21 },
View Full Code Here

   *  Tests PositionFilter setting all but the first positionIncrement to zero.
   * @throws java.io.IOException @see Token#next(Token)
   */
  public void test6GramFilterNoPositions() throws Exception {

    ShingleFilter filter = new ShingleFilter(new TestTokenStream(TEST_TOKEN), 6);
    assertTokenStreamContents(new PositionFilter(filter),
               SIX_GRAM_NO_POSITIONS_TOKENS,
               SIX_GRAM_NO_POSITIONS_INCREMENTS);
  }
View Full Code Here

  public void testShingleFilteredAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    ShingleFilter sf = new ShingleFilter(ts, 3);
    TokenStream f = new BloomTokenFilter(getFilter(shingleKeepTokens),  true, sf);
    validateTokens(expectedShingleTokens, f);
    ts.end();
    ts.close();
  }
View Full Code Here

        TokenStream result = new StandardTokenizer(mMatchVersion, reader);
        result = new StandardFilter(mMatchVersion, result);
        result = new LowerCaseFilter(mMatchVersion, result);            // lowercased only
        result = new StopFilter(mMatchVersion, result, mStopWordSet);   // remove stopwords
        result = new DoubleMetaphoneFilter(result,mMaxCodeLength,true); // store phonetic code
        result = new ShingleFilter(result, 2, 3);                       // create token n-grams
        return result;
    }
View Full Code Here

   * @throws IOException if there's a problem with the ShingleFilter reading data or the collector collecting output.
   */
  @Override
  protected void map(Text key, StringTuple value, final Context context) throws IOException, InterruptedException {

    ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxShingleSize);

    try {
      int count = 0; // ngram count

      OpenObjectIntHashMap<String> ngrams =
              new OpenObjectIntHashMap<String>(value.getEntries().size() * (maxShingleSize - 1));
      OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(value.getEntries().size());

      do {
        String term = sf.getAttribute(CharTermAttribute.class).toString();
        String type = sf.getAttribute(TypeAttribute.class).type();
        if ("shingle".equals(type)) {
          count++;
          ngrams.adjustOrPutValue(term, 1, 1);
        } else if (emitUnigrams && !term.isEmpty()) { // unigram
          unigrams.adjustOrPutValue(term, 1, 1);
        }
      } while (sf.incrementToken());

      final GramKey gramKey = new GramKey();

      ngrams.forEachPair(new ObjectIntProcedure<String>() {
        @Override
        public boolean apply(String term, int frequency) {
          // obtain components, the leading (n-1)gram and the trailing unigram.
          int i = term.lastIndexOf(' '); // TODO: fix for non-whitespace delimited languages.
          if (i != -1) { // bigram, trigram etc

            try {
              Gram ngram = new Gram(term, frequency, Gram.Type.NGRAM);
              Gram head = new Gram(term.substring(0, i), frequency, Gram.Type.HEAD);
              Gram tail = new Gram(term.substring(i + 1), frequency, Gram.Type.TAIL);

              gramKey.set(head, EMPTY);
              context.write(gramKey, head);

              gramKey.set(head, ngram.getBytes());
              context.write(gramKey, ngram);

              gramKey.set(tail, EMPTY);
              context.write(gramKey, tail);

              gramKey.set(tail, ngram.getBytes());
              context.write(gramKey, ngram);

            } catch (IOException e) {
              throw new IllegalStateException(e);
            } catch (InterruptedException e) {
              throw new IllegalStateException(e);
            }
          }
          return true;
        }
      });

      unigrams.forEachPair(new ObjectIntProcedure<String>() {
        @Override
        public boolean apply(String term, int frequency) {
          try {
            Gram unigram = new Gram(term, frequency, Gram.Type.UNIGRAM);
            gramKey.set(unigram, EMPTY);
            context.write(gramKey, unigram);
          } catch (IOException e) {
            throw new IllegalStateException(e);
          } catch (InterruptedException e) {
            throw new IllegalStateException(e);
          }
          return true;
        }
      });

      context.getCounter(Count.NGRAM_TOTAL).increment(count);
      sf.end();
    } finally {
      Closeables.closeQuietly(sf);
    }
  }
View Full Code Here

    StringTuple value = it.next();

    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size

    if (maxNGramSize >= 2) {
      ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxNGramSize);
      try {
        do {
          String term = sf.getAttribute(CharTermAttribute.class).toString();
          if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram
            int termId = dictionary.get(term);
            vector.setQuick(termId, vector.getQuick(termId) + 1);
          }
        } while (sf.incrementToken());

        sf.end();
      } finally {
        Closeables.closeQuietly(sf);
      }
    } else {
      for (String term : value.getEntries()) {
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.shingle.ShingleFilter$InputWindowToken

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.