Package org.apache.lucene.analysis.shingle

Examples of org.apache.lucene.analysis.shingle.ShingleFilter$InputWindowToken


    checkAnalysisConsistency(random, b, random.nextBoolean(), "");
  }

  public void testGraphs() throws IOException {
    TokenStream tk = new LetterTokenizer(TEST_VERSION_CURRENT, new StringReader("abc d efgh ij klmno p q"));
    tk = new ShingleFilter(tk);
    tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, 7, 10);
    assertTokenStreamContents(tk,
        new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" },
        new int[]    { 6,11,11,14 },
        new int[]    { 13,19,19,21 },
View Full Code Here


    StringTuple value = it.next();

    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size

    if (maxNGramSize >= 2) {
      ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxNGramSize);
      sf.reset();
      try {
        do {
          String term = sf.getAttribute(CharTermAttribute.class).toString();
          if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram
            int termId = dictionary.get(term);
            vector.setQuick(termId, vector.getQuick(termId) + 1);
          }
        } while (sf.incrementToken());

        sf.end();
      } finally {
        Closeables.close(sf, true);
      }
    } else {
      for (String term : value.getEntries()) {
View Full Code Here

   * @throws IOException if there's a problem with the ShingleFilter reading data or the collector collecting output.
   */
  @Override
  protected void map(Text key, StringTuple value, final Context context) throws IOException, InterruptedException {

    ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxShingleSize);
    sf.reset();
    try {
      int count = 0; // ngram count

      OpenObjectIntHashMap<String> ngrams =
              new OpenObjectIntHashMap<String>(value.getEntries().size() * (maxShingleSize - 1));
      OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(value.getEntries().size());

      do {
        String term = sf.getAttribute(CharTermAttribute.class).toString();
        String type = sf.getAttribute(TypeAttribute.class).type();
        if ("shingle".equals(type)) {
          count++;
          ngrams.adjustOrPutValue(term, 1, 1);
        } else if (emitUnigrams && !term.isEmpty()) { // unigram
          unigrams.adjustOrPutValue(term, 1, 1);
        }
      } while (sf.incrementToken());

      final GramKey gramKey = new GramKey();

      ngrams.forEachPair(new ObjectIntProcedure<String>() {
        @Override
        public boolean apply(String term, int frequency) {
          // obtain components, the leading (n-1)gram and the trailing unigram.
          int i = term.lastIndexOf(' '); // TODO: fix for non-whitespace delimited languages.
          if (i != -1) { // bigram, trigram etc

            try {
              Gram ngram = new Gram(term, frequency, Gram.Type.NGRAM);
              Gram head = new Gram(term.substring(0, i), frequency, Gram.Type.HEAD);
              Gram tail = new Gram(term.substring(i + 1), frequency, Gram.Type.TAIL);

              gramKey.set(head, EMPTY);
              context.write(gramKey, head);

              gramKey.set(head, ngram.getBytes());
              context.write(gramKey, ngram);

              gramKey.set(tail, EMPTY);
              context.write(gramKey, tail);

              gramKey.set(tail, ngram.getBytes());
              context.write(gramKey, ngram);

            } catch (IOException e) {
              throw new IllegalStateException(e);
            } catch (InterruptedException e) {
              throw new IllegalStateException(e);
            }
          }
          return true;
        }
      });

      unigrams.forEachPair(new ObjectIntProcedure<String>() {
        @Override
        public boolean apply(String term, int frequency) {
          try {
            Gram unigram = new Gram(term, frequency, Gram.Type.UNIGRAM);
            gramKey.set(unigram, EMPTY);
            context.write(gramKey, unigram);
          } catch (IOException e) {
            throw new IllegalStateException(e);
          } catch (InterruptedException e) {
            throw new IllegalStateException(e);
          }
          return true;
        }
      });

      context.getCounter(Count.NGRAM_TOTAL).increment(count);
      sf.end();
    } finally {
      Closeables.close(sf, true);
    }
  }
View Full Code Here

  public void testShingleFilteredAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_43);
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    ShingleFilter sf = new ShingleFilter(ts, 3);
    TokenStream f = new BloomTokenFilter(getFilter(shingleKeepTokens),  true, sf);
    validateTokens(expectedShingleTokens, f);
    ts.end();
    ts.close();
  }
View Full Code Here

    checkAnalysisConsistency(random, b, random.nextBoolean(), "");
  }

  public void testGraphs() throws IOException {
    TokenStream tk = new LetterTokenizer(TEST_VERSION_CURRENT, new StringReader("abc d efgh ij klmno p q"));
    tk = new ShingleFilter(tk);
    tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, 7, 10);
    tk.reset();
    assertTokenStreamContents(tk,
        new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" },
        new int[]    { 6,11,11,14 },
View Full Code Here

    checkAnalysisConsistency(random, b, random.nextBoolean(), "");
  }

  public void testGraphs() throws IOException {
    TokenStream tk = new LetterTokenizer(TEST_VERSION_CURRENT, new StringReader("abc d efgh ij klmno p q"));
    tk = new ShingleFilter(tk);
    tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, 7, 10);
    tk.reset();
    assertTokenStreamContents(tk,
        new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" },
        new int[]    { 6,11,11,14 },
View Full Code Here

    );
    outputUnigrams = getBoolean( "outputUnigrams", true );
  }

  public ShingleFilter create(TokenStream input) {
    ShingleFilter r = new ShingleFilter( input, maxShingleSize );
    r.setOutputUnigrams( outputUnigrams );
    return r;
  }
View Full Code Here

    Set<?> luceneStopWords = this.stopWords == null ? EnglishAnalyzer.getDefaultStopSet() : StopFilter.makeStopSet(LUCENE_VERSION, stopWords);
    Analyzer analyzer = new EnglishSpecialAnalyzer(LUCENE_VERSION, luceneStopWords, this.stemExclusionsSet);

    TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
    if (this.nGram) {
      tokenStream = new ShingleFilter(tokenStream, this.minNGram, this.maxNGram);
    }

    return tokenStream;
  }
View Full Code Here

    final String label = key.toString();
    String[] tokens = SPACE_PATTERN.split(value.toString());
    OpenObjectIntHashMap<String> wordList = new OpenObjectIntHashMap<String>(tokens.length * gramSize);
   
    if (gramSize > 1) {
      ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(Iterators.forArray(tokens)), gramSize);
      do {
        String term = sf.getAttribute(TermAttribute.class).term();
        if (term.length() > 0) {
          if (wordList.containsKey(term)) {
            wordList.put(term, 1 + wordList.get(term));
          } else {
            wordList.put(term, 1);
          }
        }
      } while (sf.incrementToken());
    } else {
      for (String term : tokens) {
        if (wordList.containsKey(term)) {
          wordList.put(term, 1 + wordList.get(term));
        } else {
View Full Code Here

   * @throws IOException if there's a problem with the ShingleFilter reading data or the collector collecting output.
   */
  @Override
  protected void map(Text key, StringTuple value, final Context context) throws IOException, InterruptedException {

    ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxShingleSize);
    int count = 0; // ngram count

    OpenObjectIntHashMap<String> ngrams =
            new OpenObjectIntHashMap<String>(value.getEntries().size() * (maxShingleSize - 1));
    OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(value.getEntries().size());

    do {
      String term = sf.getAttribute(CharTermAttribute.class).toString();
      String type = sf.getAttribute(TypeAttribute.class).type();
      if ("shingle".equals(type)) {
        count++;
        ngrams.adjustOrPutValue(term, 1, 1);
      } else if (emitUnigrams && term.length() > 0) { // unigram
        unigrams.adjustOrPutValue(term, 1, 1);
      }
    } while (sf.incrementToken());

    final GramKey gramKey = new GramKey();

    ngrams.forEachPair(new ObjectIntProcedure<String>() {
      @Override
      public boolean apply(String term, int frequency) {
        // obtain components, the leading (n-1)gram and the trailing unigram.
        int i = term.lastIndexOf(' '); // TODO: fix for non-whitespace delimited languages.
        if (i != -1) { // bigram, trigram etc

          try {
            Gram ngram = new Gram(term, frequency, Gram.Type.NGRAM);
            Gram head = new Gram(term.substring(0, i), frequency, Gram.Type.HEAD);
            Gram tail = new Gram(term.substring(i + 1), frequency, Gram.Type.TAIL);

            gramKey.set(head, EMPTY);
            context.write(gramKey, head);

            gramKey.set(head, ngram.getBytes());
            context.write(gramKey, ngram);

            gramKey.set(tail, EMPTY);
            context.write(gramKey, tail);

            gramKey.set(tail, ngram.getBytes());
            context.write(gramKey, ngram);

          } catch (IOException e) {
            throw new IllegalStateException(e);
          } catch (InterruptedException e) {
            throw new IllegalStateException(e);
          }
        }
        return true;
      }
    });

    unigrams.forEachPair(new ObjectIntProcedure<String>() {
      @Override
      public boolean apply(String term, int frequency) {
        try {
          Gram unigram = new Gram(term, frequency, Gram.Type.UNIGRAM);
          gramKey.set(unigram, EMPTY);
          context.write(gramKey, unigram);
        } catch (IOException e) {
          throw new IllegalStateException(e);
        } catch (InterruptedException e) {
          throw new IllegalStateException(e);
        }
        return true;
      }
    });

    context.getCounter(Count.NGRAM_TOTAL).increment(count);

    sf.end();
    sf.close();
  }
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.shingle.ShingleFilter$InputWindowToken

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact coftware#gmail.com.