Package com.tamingtext.fuzzy

Source Code of com.tamingtext.fuzzy.MovieMatcher$Match

/*
* Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris
*
*    Licensed under the Apache License, Version 2.0 (the "License");
*    you may not use this file except in compliance with the License.
*    You may obtain a copy of the License at
*
*        http://www.apache.org/licenses/LICENSE-2.0
*
*    Unless required by applicable law or agreed to in writing, software
*    distributed under the License is distributed on an "AS IS" BASIS,
*    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*    See the License for the specific language governing permissions and
*    limitations under the License.
* -------------------
* To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit
* http://www.manning.com/ingersoll
*/

package com.tamingtext.fuzzy;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import org.apache.lucene.search.spell.JaroWinklerDistance;
import org.apache.lucene.search.spell.StringDistance;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;

public class MovieMatcher {

 
 
  public MovieMatcher() throws MalformedURLException {
    solr = new CommonsHttpSolrServer(new URL("http://localhost:8983/solr"));
    query = new SolrQuery();
    query.setRows(10);
  }

//<start id="record-matching.candidates"/> 
private SolrServer solr;
private SolrQuery query;

public Iterator<SolrDocument> getCandidates(String title)
    throws SolrServerException {
    String etitle = escape(title); //<co id="co.rm.escape"/>   
    query.setQuery("title:\""+etitle+"\"")//<co id="co.rm.quotes"/>
    QueryResponse response = solr.query(query);
    SolrDocumentList dl = response.getResults();
    return dl.iterator();
}
/*
<calloutlist>
<callout arearefs="co.rm.escape"><para>Escaped the title.</para></callout>
<callout arearefs="co.rm.quotes"><para>Title in quotes to prevent tokenization.</para></callout>
</calloutlist>
*/
//<end id="record-matching.candidates"/>

/** Replaces PrecedenceQueryParser.escape(..) -- there is likely a better source for this logic. */
public String escape(String s) {
  StringBuffer sb = new StringBuffer();
  for (int i = 0; i < s.length(); i++) {
    char c = s.charAt(i);
    // NOTE: keep this in sync with _ESCAPED_CHAR below!
    if (c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':'
      || c == '^' || c == '[' || c == ']' || c == '\"' || c == '{' || c == '}' || c == '~'
        || c == '*' || c == '?') {
      sb.append('\\');
    }
    sb.append(c);
  }
  return sb.toString();
}

//<start id="record-matching.match"/> 
  public String match(String title, int year, Set<String> cast) throws SolrServerException {
    Iterator<SolrDocument> di = getCandidates(title);
    Match bestMatch = null;
    Match secMatch = null;
    while (di.hasNext()) { //<co id="co.rm.docs"/>
      SolrDocument doc = di.next();
      String id = (String) doc.getFieldValue("imdb");
      String ititle = (String) doc.getFieldValue("title");
      Integer iyear = (Integer) doc.getFieldValue("year");
      Set<String> icast = constructCastSet(doc.getFieldValues("cast"));
      float score = score(title,year,cast,ititle,iyear,icast); //<co id="co.rm.score"/>
      if (bestMatch == null || score > bestMatch.score) { //<co id="co.rm.best"/>
        secMatch = bestMatch;
        bestMatch = new Match(score,id,title,year,cast);
      }
      else if (secMatch == null || score > secMatch.score) {//<co id="co.rm.second"/>
        secMatch = new Match(score,id,title,year,cast);
      }
    }
    if (bestMatch == null) {
      return null;
    }
    if (bestMatch.score > 0.75) { //<co id="co.rm.threshold"/>
      if (secMatch != null && secMatch.score >= 0.75) { //<co id="co.rm.second-threshold"/>
        return null;
      }
      return bestMatch.id;
    }
    return null;
  }
 
  private static Set<String> constructCastSet(Collection cast) {
    Set<String> castSet = new HashSet<String>();
    for (Object actor : cast) {
      castSet.add(actor.toString().toLowerCase());
    }
    return castSet;
  }
 
  class Match {
    public float score;
    public String id;
    public String title;
    public int year;
    public Set<String> cast;
   
    public Match(float score, String id, String title, int year, Set<String> cast) {
      this.score = score;
      this.id = id;
      this.title = title;
      this.year = year;
      this.cast = cast;
    }
  }

  /*
  <calloutlist>
  <callout arearefs="co.rm.docs"><para>Iterate through each of the documents.</para></callout> 
  <callout arearefs="co.rm.score"><para>Score each of the documents.</para></callout>   
  <callout arearefs="co.rm.best"><para>Check whether this is the best document.</para></callout>   
  <callout arearefs="co.rm.second"><para>Check whether this is the second best document.</para></callout>     
  <callout arearefs="co.rm.threshold"><para>Verify that the best match's score is larger than the threshold.</para></callout>
  <callout arearefs="co.rm.second-threshold"><para>Check that the second-best match is not larger than the threshold.</para></callout>   
  </calloutlist>
   */
//<end id="record-matching.match"/>

 
//<start id="record-matching.scoring"/> 
private StringDistance sd = new JaroWinklerDistance();
 
private float score(String title1, int year1, Set<String> cast1,
    String title2, int year2, Set<String> cast2) {
    float titleScore = sd.getDistance(title1.toLowerCase(), //<co id="co.rm.score.jaro"/>
        title2.toLowerCase());
   
    float yearScore = (float) 1/(Math.abs(year1-year2)+1); //<co id="co.rm.score.distance"/>
   
    float castScore = (float) intersectionSize(cast1,cast2)/ //<co id="co.rm.score.inter"/>
                        Math.min(cast1.size(),cast2.size());
    return (titleScore*.5f)+ //<co id="co.rm.score.combine"/>
           (yearScore*0.2f)+
           (castScore*0.3f);
}
 
private int intersectionSize(Set<String> cast1, //<co id="co.rm.score.inter-equals"/>
                             Set<String> cast2) {
    int size = 0;
    for (String actor : cast1)
        if (cast2.contains(actor)) size++;
    return size;
}
 
/*
<calloutlist>
<callout arearefs="co.rm.score.jaro"><para>Use the Jaro-Winkler on titles.</para></callout>
<callout arearefs="co.rm.score.distance"><para>Use the reciprocal on years.</para></callout>   
<callout arearefs="co.rm.score.inter"><para>Use cast overlap percentage.</para></callout>     
<callout arearefs="co.rm.score.combine"><para>Combine the scores into a single score.</para></callout>   
<callout arearefs="co.rm.score.inter-equals"><para>Compute intersection using exact string matching.</para></callout>     
</calloutlist>
*/
//<end id="record-matching.scoring"/>

 
 
  public static void main(String[] args) throws IOException, SolrServerException {
    MovieMatcher mm = new MovieMatcher();
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    int matches = 0;
    int lines = 0;
    for (String line = in.readLine();line != null; line = in.readLine()) {
      String[] parts = line.split("\t");
      int pi=0;
      String id = parts[pi++];
      String title = parts[pi++];
      String ys = parts[pi++];
      int year = 0;
      if (!ys.equals("NULL")) {
        year = Integer.parseInt(ys);       
      }
      Set<String> castSet = constructCastSet(Arrays.asList(parts[pi++].split(",")));
      String imdbId = mm.match(title, year, castSet);
      if (imdbId != null) {
        System.out.println(id+","+imdbId);
        matches++;
      }
      lines++;
    }
    //System.err.println(matches+"/"+lines+" "+((float) matches/lines));
  }
}
TOP

Related Classes of com.tamingtext.fuzzy.MovieMatcher$Match

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.