Package com.tamingtext.fuzzy

Source Code of com.tamingtext.fuzzy.OverlapMeasures

/*
* Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris
*
*    Licensed under the Apache License, Version 2.0 (the "License");
*    you may not use this file except in compliance with the License.
*    You may obtain a copy of the License at
*
*        http://www.apache.org/licenses/LICENSE-2.0
*
*    Unless required by applicable law or agreed to in writing, software
*    distributed under the License is distributed on an "AS IS" BASIS,
*    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*    See the License for the specific language governing permissions and
*    limitations under the License.
* -------------------
* To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit
* http://www.manning.com/ingersoll
*/

package com.tamingtext.fuzzy;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.PatternAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.apache.solr.analysis.PatternTokenizer;

import java.io.IOException;
import java.io.Reader;
import java.util.Collections;
import java.util.regex.Pattern;

public class OverlapMeasures {
 
  //<start id="jaccard_end"/>
  public float jaccard(char[] s, char[] t) {
    int intersection = 0;
    int union = s.length+t.length;
    boolean[] sdup = new boolean[s.length];
    union -= findDuplicates(s,sdup);   //<co id="co_fuzzy_jaccard_dups1"/>
    boolean[] tdup = new boolean[t.length];
    union -= findDuplicates(t,tdup);
    for (int si=0;si<s.length;si++) {
      if (!sdup[si]) {   //<co id="co_fuzzy_jaccard_skip1"/>
        for (int ti=0;ti<t.length;ti++) {
          if (!tdup[ti]) { 
            if (s[si] == t[ti]) {   //<co id="co_fuzzy_jaccard_intersection" />
              intersection++;
              break;
            }
          }
        }
      }
    }
    union-=intersection;
    return (float) intersection/union; //<co id="co_fuzzy_jaccard_return"/>
  }
 
  private int findDuplicates(char[] s, boolean[] sdup) {
    int ndup =0;
    for (int si=0;si<s.length;si++) {
      if (sdup[si]) {
        ndup++;
      }
      else {
        for (int si2=si+1;si2<s.length;si2++) {
          if (!sdup[si2]) {
            sdup[si2] = s[si] == s[si2];
          }
        }
      }
    }
    return ndup;
  }
  /*
  <calloutlist>
  <callout arearefs="co_fuzzy_jaccard_dups1"><para>Find duplicates and subtract from union.</para></callout>
  <callout arearefs="co_fuzzy_jaccard_skip1"><para>Skip duplicates.</para></callout>
  <callout arearefs="co_fuzzy_jaccard_intersection"><para>Find intersection.</para></callout>
  <callout arearefs="co_fuzzy_jaccard_return"><para>Return Jaccard distance.</para></callout>
  </calloutlist>
   */
  //<end id="jaccard_end"/>

  public TopDocs cosine(String queryTerm, int n, String... terms) throws IOException, ParseException {
    Directory directory = new RAMDirectory();
    final Pattern pattern = Pattern.compile(".");
    Analyzer analyzer = new Analyzer() {
      @Override
      public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream result = null;
        try {
          result = new PatternTokenizer(reader, pattern, 0);
        } catch (IOException e) {
        }
        return result;
      }
    };
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, analyzer);
    IndexWriter writer = new IndexWriter(directory, conf);
    for (String term : terms) {
      Document doc = new Document();
      doc.add(new Field("chars", term, Field.Store.YES, Field.Index.ANALYZED));
      writer.addDocument(doc);
    }
    writer.close();
    IndexReader reader = IndexReader.open(directory);
    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), terms.length);
    for (int i = 0; i < topDocs.scoreDocs.length; i++){
        System.out.println("Id: " + topDocs.scoreDocs[i].doc + " Val: " + searcher.doc(topDocs.scoreDocs[i].doc).get("chars"));
    }
    QueryParser qp = new QueryParser(Version.LUCENE_36, "chars", analyzer);
    Query query = qp.parse(queryTerm);
    return searcher.search(query, n);
  }
}
TOP

Related Classes of com.tamingtext.fuzzy.OverlapMeasures

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.