Examples of ShingleFilter

org.apache.lucene.analysis.shingle.ShingleFilter

A ShingleFilter constructs shingles (token n-grams) from a token stream. In other words, it creates combinations of tokens as a single token.
For example, the sentence "please divide this sentence into shingles" might be tokenized into shingles "please divide", "divide this", "this sentence", "sentence into", and "into shingles".
This filter handles position increments > 1 by inserting filler tokens (tokens with termtext "_"). It does not handle a position increment of 0.

Examples of org.apache.lucene.analysis.shingle.ShingleFilter

   *           if there's a problem with the ShingleFilter reading data or the collector collecting output.
   */
  @Override
  protected void map(Text key, StringTuple value, final Context context) throws IOException, InterruptedException {


    ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxShingleSize);
    int count = 0; // ngram count


    OpenObjectIntHashMap<String> ngrams =
        new OpenObjectIntHashMap<String>(value.getEntries().size() * (maxShingleSize - 1));
    OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(value.getEntries().size());


    do {
      String term = (sf.getAttribute(TermAttribute.class)).term();
      String type = (sf.getAttribute(TypeAttribute.class)).type();
      if ("shingle".equals(type)) {
        count++;
        ngrams.adjustOrPutValue(term, 1, 1);
      } else if (emitUnigrams && term.length() > 0) { // unigram
        unigrams.adjustOrPutValue(term, 1, 1);
      }
    } while (sf.incrementToken());


    try {
      final GramKey gramKey = new GramKey();


      ngrams.forEachPair(new ObjectIntProcedure<String>() {
        @Override
        public boolean apply(String term, int frequency) {
          // obtain components, the leading (n-1)gram and the trailing unigram.
          int i = term.lastIndexOf(' '); // TODO: fix for non-whitespace delimited languages.
          if (i != -1) { // bigram, trigram etc


            try {
              Gram ngram = new Gram(term, frequency, Gram.Type.NGRAM);
              Gram head = new Gram(term.substring(0, i), frequency, Gram.Type.HEAD);
              Gram tail = new Gram(term.substring(i + 1), frequency, Gram.Type.TAIL);


              gramKey.set(head, EMPTY);
              context.write(gramKey, head);


              gramKey.set(head, ngram.getBytes());
              context.write(gramKey, ngram);


              gramKey.set(tail, EMPTY);
              context.write(gramKey, tail);


              gramKey.set(tail, ngram.getBytes());
              context.write(gramKey, ngram);


            } catch (IOException e) {
              throw new IllegalStateException(e);
            } catch (InterruptedException e) {
              throw new IllegalStateException(e);
            }
          }
          return true;
        }
      });


      unigrams.forEachPair(new ObjectIntProcedure<String>() {
        @Override
        public boolean apply(String term, int frequency) {
          try {
            Gram unigram = new Gram(term, frequency, Gram.Type.UNIGRAM);
            gramKey.set(unigram, EMPTY);
            context.write(gramKey, unigram);
          } catch (IOException e) {
            throw new IllegalStateException(e);
          } catch (InterruptedException e) {
            throw new IllegalStateException(e);
          }
          return true;
        }
      });
    } catch (IllegalStateException ise) {
      // catch an re-throw original exceptions from the procedures.
      if (ise.getCause() instanceof IOException) {
        throw (IOException) ise.getCause();
      } else {
        // wasn't what was expected, so re-throw
        throw ise;
      }
    }


    context.getCounter(Count.NGRAM_TOTAL).increment(count);


    sf.end();
    sf.close();
  }

View Full Code Here

Examples of org.apache.lucene.analysis.shingle.ShingleFilter

    StringTuple value = it.next();


    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size


    if (maxNGramSize >= 2) {
      ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxNGramSize);


      do {
        String term = (sf.getAttribute(TermAttribute.class)).term();
        if (term.length() > 0 && dictionary.containsKey(term)) { // ngram
          int termId = dictionary.get(term);
          vector.setQuick(termId, vector.getQuick(termId) + 1);
        }
      } while (sf.incrementToken());


      sf.end();
      sf.close();
    } else {
      for (String term : value.getEntries()) {
        if (term.length() > 0 && dictionary.containsKey(term)) { // unigram
          int termId = dictionary.get(term);
          vector.setQuick(termId, vector.getQuick(termId) + 1);

View Full Code Here

Examples of org.apache.lucene.analysis.shingle.ShingleFilter

            @Override public String name() {
                return "shingle";
            }


            @Override public TokenStream create(TokenStream tokenStream) {
                return new ShingleFilter(tokenStream, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
            }
        }));


        tokenFilterFactories.put("unique", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
            @Override public String name() {

View Full Code Here

Examples of org.apache.lucene.analysis.shingle.ShingleFilter

        maxShingleSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
        outputUnigrams = settings.getAsBoolean("output_unigrams", true);
    }


    @Override public TokenStream create(TokenStream tokenStream) {
        ShingleFilter filter = new ShingleFilter(tokenStream, maxShingleSize);
        filter.setOutputUnigrams(outputUnigrams);
        return filter;
    }

View Full Code Here

Examples of org.apache.lucene.analysis.shingle.ShingleFilter

    final String label = key.toString();
    String[] tokens = SPACE_TAB.split(value.toString());
    OpenObjectIntHashMap<String> wordList = new OpenObjectIntHashMap<String>(tokens.length * gramSize);
    
    if (gramSize > 1) {
      ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(Iterators.forArray(tokens)), gramSize);
      do {
        String term = sf.getAttribute(CharTermAttribute.class).toString();
        if (!term.isEmpty()) {
          if (wordList.containsKey(term)) {
            wordList.put(term, 1 + wordList.get(term));
          } else {
            wordList.put(term, 1);
          }
        }
      } while (sf.incrementToken());
    } else {
      for (String term : tokens) {
        if (wordList.containsKey(term)) {
          wordList.put(term, 1 + wordList.get(term));
        } else {

View Full Code Here

Examples of org.apache.lucene.analysis.shingle.ShingleFilter

    final String label = key.toString();
    String[] tokens = SPACE_PATTERN.split(value.toString());
    OpenObjectIntHashMap<String> wordList = new OpenObjectIntHashMap<String>(tokens.length * gramSize);
    
    if (gramSize > 1) {
      ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(new ArrayIterator<String>(tokens)), gramSize);
      do {
        String term = (sf.getAttribute(TermAttribute.class)).term();
        if (term.length() > 0) {
          if (wordList.containsKey(term)) {
            wordList.put(term, 1 + wordList.get(term));
          } else {
            wordList.put(term, 1);
          }
        }
      } while (sf.incrementToken());
    } else {
      for (String term : tokens) {
        if (wordList.containsKey(term)) {
          wordList.put(term, 1 + wordList.get(term));
        } else {

View Full Code Here

Examples of org.apache.lucene.analysis.shingle.ShingleFilter

        this(DEFAULT_MAX_TOKEN_LENGTH);
    }
    
    @Override
    public TokenStream tokenStream(String arg0, Reader reader) {
        return new ShingleFilter(new UpperCaseFilter(new WhitespaceTokenizer(Version.LUCENE_31, reader)), 2, maxTokenLength);
    }

View Full Code Here

Examples of org.apache.lucene.analysis.shingle.ShingleFilter

                     ? args.get("tokenSeparator")
                     : ShingleFilter.TOKEN_SEPARATOR;
  }
  @Override
  public ShingleFilter create(TokenStream input) {
    ShingleFilter r = new ShingleFilter(input, minShingleSize, maxShingleSize);
    r.setOutputUnigrams(outputUnigrams);
    r.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
    r.setTokenSeparator(tokenSeparator);
    return r;
  }

View Full Code Here

Examples of org.apache.lucene.analysis.shingle.ShingleFilter

          return other;
        }


        @Override
        protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
          ShingleFilter shingles = new ShingleFilter(components.getTokenStream(), 2, grams);
          shingles.setTokenSeparator(Character.toString((char) separator));
          return new TokenStreamComponents(components.getTokenizer(), shingles);
        }
      };
    }
  }

View Full Code Here

Examples of org.apache.lucene.analysis.shingle.ShingleFilter

    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 94);
        //TokenStream stream = new SopTokenFilter(tokenizer);
        TokenStream stream = new ShingleFilter(tokenizer, 5);
        //stream = new SopTokenFilter(stream);
        stream = new NGramTokenFilter(TEST_VERSION_CURRENT, stream, 55, 83);
        //stream = new SopTokenFilter(stream);
        return new TokenStreamComponents(tokenizer, stream);
      }

View Full Code Here

0 1 2 3 4

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.