Package org.apache.lucene.analysis.shingle

Examples of org.apache.lucene.analysis.shingle.ShingleFilter


    checkAnalysisConsistency(random, b, random.nextBoolean(), "");
  }

  public void testGraphs() throws IOException {
    TokenStream tk = new LetterTokenizer(TEST_VERSION_CURRENT, new StringReader("abc d efgh ij klmno p q"));
    tk = new ShingleFilter(tk);
    tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, 7, 10);
    assertTokenStreamContents(tk,
        new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" },
        new int[]    { 6,11,11,14 },
        new int[]    { 13,19,19,21 },
View Full Code Here


    tokenSeparator = args.containsKey("tokenSeparator")
                     ? args.get("tokenSeparator")
                     : ShingleFilter.TOKEN_SEPARATOR;
  }
  public ShingleFilter create(TokenStream input) {
    ShingleFilter r = new ShingleFilter(input, minShingleSize, maxShingleSize);
    r.setOutputUnigrams(outputUnigrams);
    r.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
    r.setTokenSeparator(tokenSeparator);
    return r;
  }
View Full Code Here

   */
  @Override
  public void map(Text key, StringTuple value,
                  final OutputCollector<GramKey,Gram> collector, Reporter reporter) throws IOException {
   
    ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxShingleSize);
    int count = 0; // ngram count
   
    OpenObjectIntHashMap<String> ngrams = new OpenObjectIntHashMap<String>(value.getEntries().size()
                                                                           * (maxShingleSize - 1));
    OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(value.getEntries().size());
   
    do {
      String term = ((TermAttribute) sf.getAttribute(TermAttribute.class)).term();
      String type = ((TypeAttribute) sf.getAttribute(TypeAttribute.class)).type();
      if ("shingle".equals(type)) {
        count++;
        ngrams.adjustOrPutValue(term, 1, 1);
      } else if (emitUnigrams && term.length() > 0) { // unigram
        unigrams.adjustOrPutValue(term, 1, 1);
      }
    } while (sf.incrementToken());
   
    try {
      final GramKey gramKey = new GramKey();
     
      ngrams.forEachPair(new ObjectIntProcedure<String>() {
        @Override
        public boolean apply(String term, int frequency) {
          // obtain components, the leading (n-1)gram and the trailing unigram.
          int i = term.lastIndexOf(' '); // TODO: fix for non-whitespace delimited languages.
          if (i != -1) { // bigram, trigram etc
           
            try {
              Gram ngram = new Gram(term, frequency, Gram.Type.NGRAM);
              Gram head  = new Gram(term.substring(0, i), frequency, Gram.Type.HEAD);
              Gram tail  = new Gram(term.substring(i + 1), frequency, Gram.Type.TAIL);
             
              gramKey.set(head, EMPTY);
              collector.collect(gramKey, head);
             
              gramKey.set(head, ngram.getBytes());
              collector.collect(gramKey, ngram);
             
              gramKey.set(tail, EMPTY);
              collector.collect(gramKey, tail);
             
              gramKey.set(tail, ngram.getBytes());
              collector.collect(gramKey, ngram);
             
            } catch (IOException e) {
              throw new IllegalStateException(e);
            }
          }
          return true;
        }
      });
 
      unigrams.forEachPair(new ObjectIntProcedure<String>() {
        @Override
        public boolean apply(String term, int frequency) {
          try {
            Gram unigram = new Gram(term, frequency, Gram.Type.UNIGRAM);
            gramKey.set(unigram, EMPTY);
            collector.collect(gramKey, unigram);
          } catch (IOException e) {
            throw new IllegalStateException(e);
          }
          return true;
        }
      });
    }
    catch (IllegalStateException ise) {
      // catch an re-throw original exceptions from the procedures.
      if (ise.getCause() instanceof IOException) {
        throw (IOException) ise.getCause();
      }
      else {
        // wasn't what was expected, so re-throw
        throw ise;
      }
    }
   
    reporter.incrCounter(Count.NGRAM_TOTAL, count);
   
    sf.end();
    sf.close();
  }
View Full Code Here

   
    Vector vector = new RandomAccessSparseVector(key.toString(), dimension, value.length()); // guess at
                                                                                             // initial size
   
    if (maxNGramSize >= 2) {
      ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
          maxNGramSize);
     
      do {
        String term = ((TermAttribute) sf.getAttribute(TermAttribute.class)).term();
        if (term.length() > 0) { // ngram
          if (dictionary.containsKey(term) == false) {
            continue;
          }
          int termId = dictionary.get(term);
          vector.setQuick(termId, vector.getQuick(termId) + 1);
        }
      } while (sf.incrementToken());
     
      sf.end();
      sf.close();
    } else {
      for (String term : value.getEntries()) {
        if (term.length() > 0) { // unigram
          if (dictionary.containsKey(term) == false) {
            continue;
View Full Code Here

    final String label = key.toString();
    String[] tokens = SPACE_PATTERN.split(value.toString());
    OpenObjectIntHashMap<String> wordList = new OpenObjectIntHashMap<String>(tokens.length * gramSize);
   
    if (gramSize > 1) {
      ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(new ArrayIterator<String>(tokens)), gramSize);
      do {
        String term = ((TermAttribute) sf.getAttribute(TermAttribute.class)).term();
        if (term.length() > 0) {
          if (wordList.containsKey(term)) {
            wordList.put(term, 1 + wordList.get(term));
          } else {
            wordList.put(term, 1);
          }
        }
      } while (sf.incrementToken());
    } else {
      for (String term : tokens) {
        if (wordList.containsKey(term)) {
          wordList.put(term, 1 + wordList.get(term));
        } else {
View Full Code Here

          return other;
        }

        @Override
        protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
          ShingleFilter shingles = new ShingleFilter(components.getTokenStream(), 2, grams);
          shingles.setTokenSeparator(Character.toString((char) separator));
          return new TokenStreamComponents(components.getTokenizer(), shingles);
        }
      };
    }
  }
View Full Code Here

    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 94);
        //TokenStream stream = new SopTokenFilter(tokenizer);
        TokenStream stream = new ShingleFilter(tokenizer, 5);
        //stream = new SopTokenFilter(stream);
        stream = new NGramTokenFilter(TEST_VERSION_CURRENT, stream, 55, 83);
        //stream = new SopTokenFilter(stream);
        return new TokenStreamComponents(tokenizer, stream);
     
View Full Code Here

    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 94);
        //TokenStream stream = new SopTokenFilter(tokenizer);
        TokenStream stream = new ShingleFilter(tokenizer, 5);
        //stream = new SopTokenFilter(stream);
        stream = new NGramTokenFilter(TEST_VERSION_CURRENT, stream, 55, 83);
        //stream = new SopTokenFilter(stream);
        return new TokenStreamComponents(tokenizer, stream);
     
View Full Code Here

          return other;
        }

        @Override
        protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
          ShingleFilter shingles = new ShingleFilter(components.getTokenStream(), 2, grams);
          shingles.setTokenSeparator(Character.toString((char) separator));
          return new TokenStreamComponents(components.getTokenizer(), shingles);
        }
      };
    }
  }
View Full Code Here

   *  Tests PositionFilter setting all but the first positionIncrement to zero.
   * @throws java.io.IOException @see Token#next(Token)
   */
  public void test6GramFilterNoPositions() throws Exception {

    ShingleFilter filter = new ShingleFilter(new TestTokenStream(TEST_TOKEN), 6);
    assertTokenStreamContents(new PositionFilter(filter),
               SIX_GRAM_NO_POSITIONS_TOKENS,
               SIX_GRAM_NO_POSITIONS_INCREMENTS);
  }
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.shingle.ShingleFilter

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.