Package org.apache.lucene.analysis.shingle

Examples of org.apache.lucene.analysis.shingle.ShingleFilter


   *           if there's a problem with the ShingleFilter reading data or the collector collecting output.
   */
  @Override
  protected void map(Text key, StringTuple value, final Context context) throws IOException, InterruptedException {

    ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxShingleSize);
    int count = 0; // ngram count

    OpenObjectIntHashMap<String> ngrams =
        new OpenObjectIntHashMap<String>(value.getEntries().size() * (maxShingleSize - 1));
    OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(value.getEntries().size());

    do {
      String term = (sf.getAttribute(TermAttribute.class)).term();
      String type = (sf.getAttribute(TypeAttribute.class)).type();
      if ("shingle".equals(type)) {
        count++;
        ngrams.adjustOrPutValue(term, 1, 1);
      } else if (emitUnigrams && term.length() > 0) { // unigram
        unigrams.adjustOrPutValue(term, 1, 1);
      }
    } while (sf.incrementToken());

    try {
      final GramKey gramKey = new GramKey();

      ngrams.forEachPair(new ObjectIntProcedure<String>() {
        @Override
        public boolean apply(String term, int frequency) {
          // obtain components, the leading (n-1)gram and the trailing unigram.
          int i = term.lastIndexOf(' '); // TODO: fix for non-whitespace delimited languages.
          if (i != -1) { // bigram, trigram etc

            try {
              Gram ngram = new Gram(term, frequency, Gram.Type.NGRAM);
              Gram head = new Gram(term.substring(0, i), frequency, Gram.Type.HEAD);
              Gram tail = new Gram(term.substring(i + 1), frequency, Gram.Type.TAIL);

              gramKey.set(head, EMPTY);
              context.write(gramKey, head);

              gramKey.set(head, ngram.getBytes());
              context.write(gramKey, ngram);

              gramKey.set(tail, EMPTY);
              context.write(gramKey, tail);

              gramKey.set(tail, ngram.getBytes());
              context.write(gramKey, ngram);

            } catch (IOException e) {
              throw new IllegalStateException(e);
            } catch (InterruptedException e) {
              throw new IllegalStateException(e);
            }
          }
          return true;
        }
      });

      unigrams.forEachPair(new ObjectIntProcedure<String>() {
        @Override
        public boolean apply(String term, int frequency) {
          try {
            Gram unigram = new Gram(term, frequency, Gram.Type.UNIGRAM);
            gramKey.set(unigram, EMPTY);
            context.write(gramKey, unigram);
          } catch (IOException e) {
            throw new IllegalStateException(e);
          } catch (InterruptedException e) {
            throw new IllegalStateException(e);
          }
          return true;
        }
      });
    } catch (IllegalStateException ise) {
      // catch an re-throw original exceptions from the procedures.
      if (ise.getCause() instanceof IOException) {
        throw (IOException) ise.getCause();
      } else {
        // wasn't what was expected, so re-throw
        throw ise;
      }
    }

    context.getCounter(Count.NGRAM_TOTAL).increment(count);

    sf.end();
    sf.close();
  }
View Full Code Here


    StringTuple value = it.next();

    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size

    if (maxNGramSize >= 2) {
      ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxNGramSize);

      do {
        String term = (sf.getAttribute(TermAttribute.class)).term();
        if (term.length() > 0 && dictionary.containsKey(term)) { // ngram
          int termId = dictionary.get(term);
          vector.setQuick(termId, vector.getQuick(termId) + 1);
        }
      } while (sf.incrementToken());

      sf.end();
      sf.close();
    } else {
      for (String term : value.getEntries()) {
        if (term.length() > 0 && dictionary.containsKey(term)) { // unigram
          int termId = dictionary.get(term);
          vector.setQuick(termId, vector.getQuick(termId) + 1);
View Full Code Here

            @Override public String name() {
                return "shingle";
            }

            @Override public TokenStream create(TokenStream tokenStream) {
                return new ShingleFilter(tokenStream, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
            }
        }));

        tokenFilterFactories.put("unique", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
            @Override public String name() {
View Full Code Here

        maxShingleSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
        outputUnigrams = settings.getAsBoolean("output_unigrams", true);
    }

    @Override public TokenStream create(TokenStream tokenStream) {
        ShingleFilter filter = new ShingleFilter(tokenStream, maxShingleSize);
        filter.setOutputUnigrams(outputUnigrams);
        return filter;
    }
View Full Code Here

    final String label = key.toString();
    String[] tokens = SPACE_TAB.split(value.toString());
    OpenObjectIntHashMap<String> wordList = new OpenObjectIntHashMap<String>(tokens.length * gramSize);
   
    if (gramSize > 1) {
      ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(Iterators.forArray(tokens)), gramSize);
      do {
        String term = sf.getAttribute(CharTermAttribute.class).toString();
        if (!term.isEmpty()) {
          if (wordList.containsKey(term)) {
            wordList.put(term, 1 + wordList.get(term));
          } else {
            wordList.put(term, 1);
          }
        }
      } while (sf.incrementToken());
    } else {
      for (String term : tokens) {
        if (wordList.containsKey(term)) {
          wordList.put(term, 1 + wordList.get(term));
        } else {
View Full Code Here

    final String label = key.toString();
    String[] tokens = SPACE_PATTERN.split(value.toString());
    OpenObjectIntHashMap<String> wordList = new OpenObjectIntHashMap<String>(tokens.length * gramSize);
   
    if (gramSize > 1) {
      ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(new ArrayIterator<String>(tokens)), gramSize);
      do {
        String term = (sf.getAttribute(TermAttribute.class)).term();
        if (term.length() > 0) {
          if (wordList.containsKey(term)) {
            wordList.put(term, 1 + wordList.get(term));
          } else {
            wordList.put(term, 1);
          }
        }
      } while (sf.incrementToken());
    } else {
      for (String term : tokens) {
        if (wordList.containsKey(term)) {
          wordList.put(term, 1 + wordList.get(term));
        } else {
View Full Code Here

        this(DEFAULT_MAX_TOKEN_LENGTH);
    }
   
    @Override
    public TokenStream tokenStream(String arg0, Reader reader) {
        return new ShingleFilter(new UpperCaseFilter(new WhitespaceTokenizer(Version.LUCENE_31, reader)), 2, maxTokenLength);
    }
View Full Code Here

                     ? args.get("tokenSeparator")
                     : ShingleFilter.TOKEN_SEPARATOR;
  }
  @Override
  public ShingleFilter create(TokenStream input) {
    ShingleFilter r = new ShingleFilter(input, minShingleSize, maxShingleSize);
    r.setOutputUnigrams(outputUnigrams);
    r.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
    r.setTokenSeparator(tokenSeparator);
    return r;
  }
View Full Code Here

          return other;
        }

        @Override
        protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
          ShingleFilter shingles = new ShingleFilter(components.getTokenStream(), 2, grams);
          shingles.setTokenSeparator(Character.toString((char) separator));
          return new TokenStreamComponents(components.getTokenizer(), shingles);
        }
      };
    }
  }
View Full Code Here

    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 94);
        //TokenStream stream = new SopTokenFilter(tokenizer);
        TokenStream stream = new ShingleFilter(tokenizer, 5);
        //stream = new SopTokenFilter(stream);
        stream = new NGramTokenFilter(TEST_VERSION_CURRENT, stream, 55, 83);
        //stream = new SopTokenFilter(stream);
        return new TokenStreamComponents(tokenizer, stream);
     
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.shingle.ShingleFilter

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.