/*
 * Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * -------------------
 * To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit
 * http://www.manning.com/ingersoll
 */
package com.tamingtext.fuzzy;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import org.apache.lucene.search.spell.JaroWinklerDistance;
import org.apache.lucene.search.spell.StringDistance;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
public class MovieMatcher {
/**
 * Creates a matcher that talks to the Solr instance at
 * {@code http://localhost:8983/solr} and allocates the query object that
 * {@code getCandidates} reconfigures on each lookup.
 *
 * @throws MalformedURLException if the hard-coded Solr URL cannot be parsed
 */
public MovieMatcher() throws MalformedURLException {
  solr = new CommonsHttpSolrServer(new URL("http://localhost:8983/solr"));
  query = new SolrQuery();
}
//<start id="record-matching.candidates"/>
/** Solr client that executes the candidate queries; assigned in the constructor. */
private SolrServer solr;
/** Query object assigned in the constructor; its query string is overwritten by each getCandidates call. */
private SolrQuery query;
public Iterator<SolrDocument> getCandidates(String title)
throws SolrServerException {
String etitle = escape(title); //<co id="co.rm.escape"/>
query.setQuery("title:\""+etitle+"\""); //<co id="co.rm.quotes"/>
QueryResponse response = solr.query(query);
SolrDocumentList dl = response.getResults();
return dl.iterator();
/*
<callout arearefs="co.rm.escape"><para>Escape the title.</para></callout>
<callout arearefs="co.rm.quotes"><para>Put the title in quotes to prevent tokenization.</para></callout>
*/
//<end id="record-matching.candidates"/>
/** Replaces PrecedenceQueryParser.escape(..) -- there is likely a better source for this logic. */
public String escape(String s) {
StringBuffer sb = new StringBuffer();
for (int i = 0; i < s.length(); i++) {
char c = s.charAt(i);
// NOTE: keep this in sync with _ESCAPED_CHAR below!
if (c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':'
|| c == '^' || c == '[' || c == ']' || c == '\"' || c == '{' || c == '}' || c == '~'
|| c == '*' || c == '?') {
return sb.toString();
//<start id="record-matching.match"/>
public String match(String title, int year, Set<String> cast) throws SolrServerException {
Iterator<SolrDocument> di = getCandidates(title);
Match bestMatch = null;
Match secMatch = null;
while (di.hasNext()) { //<co id="co.rm.docs"/>
SolrDocument doc = di.next();
String id = (String) doc.getFieldValue("imdb");
String ititle = (String) doc.getFieldValue("title");
Integer iyear = (Integer) doc.getFieldValue("year");
Set<String> icast = constructCastSet(doc.getFieldValues("cast"));
float score = score(title,year,cast,ititle,iyear,icast); //<co id="co.rm.score"/>
if (bestMatch == null || score > bestMatch.score) { //<co id="co.rm.best"/>
secMatch = bestMatch;
bestMatch = new Match(score,id,title,year,cast);
else if (secMatch == null || score > secMatch.score) {//<co id="co.rm.second"/>
secMatch = new Match(score,id,title,year,cast);
if (bestMatch == null) {
return null;
if (bestMatch.score > 0.75) { //<co id="co.rm.threshold"/>
if (secMatch != null && secMatch.score >= 0.75) { //<co id="co.rm.second-threshold"/>
return null;
return bestMatch.id;
return null;
private static Set<String> constructCastSet(Collection cast) {
Set<String> castSet = new HashSet<String>();
for (Object actor : cast) {
return castSet;
/**
 * Simple value holder pairing a score and an id with the title, year and cast
 * supplied at construction time.
 */
class Match {
  /** Score assigned to this match. */
  public float score;
  /** Identifier of the matched document. */
  public String id;
  /** Title supplied when the match was created. */
  public String title;
  /** Year supplied when the match was created. */
  public int year;
  /** Cast supplied when the match was created; stored by reference, not copied. */
  public Set<String> cast;

  /**
   * Stores the given values in the corresponding fields.
   *
   * @param score score of the match
   * @param id    identifier of the matched document
   * @param title title to record
   * @param year  year to record
   * @param cast  cast to record
   */
  public Match(float score, String id, String title, int year, Set<String> cast) {
    this.score = score;
    this.id = id;
    this.title = title;
    this.year = year;
    this.cast = cast;
  }
}
/*
<callout arearefs="co.rm.docs"><para>Iterate through each of the documents.</para></callout>
<callout arearefs="co.rm.score"><para>Score each of the documents.</para></callout>
<callout arearefs="co.rm.best"><para>Check whether this is the best document.</para></callout>
<callout arearefs="co.rm.second"><para>Check whether this is the second-best document.</para></callout>
<callout arearefs="co.rm.threshold"><para>Verify that the best match's score is larger than the threshold.</para></callout>
<callout arearefs="co.rm.second-threshold"><para>Check that the second-best match's score is below the threshold.</para></callout>
*/
//<end id="record-matching.match"/>
//<start id="record-matching.scoring"/>
/** String similarity measure that score() applies to the two lower-cased titles. */
private StringDistance sd = new JaroWinklerDistance();
/**
 * Scores how similar two movie records are by combining a title term, a year
 * term and a cast term into a single float.
 *
 * NOTE(review): in this copy of the file the titleScore, castScore and return
 * statements each end mid-expression (their continuation lines are missing), so
 * the second title argument, the cast denominator and the remaining weights
 * cannot be read from here -- restore them from the original source.
 *
 * @param title1 title of the first record
 * @param year1  year of the first record
 * @param cast1  cast of the first record
 * @param title2 title of the second record
 * @param year2  year of the second record
 * @param cast2  cast of the second record
 * @return the combined score; the title term is weighted by 0.5
 */
private float score(String title1, int year1, Set<String> cast1,
String title2, int year2, Set<String> cast2) {
// Title term: string distance on the lower-cased title (statement is truncated here).
float titleScore = sd.getDistance(title1.toLowerCase(), //<co id="co.rm.score.jaro"/>
// Year term: 1.0 when the years are equal, shrinking as the gap grows; the cast
// on the literal 1 forces floating-point division.
float yearScore = (float) 1/(Math.abs(year1-year2)+1); //<co id="co.rm.score.distance"/>
// Cast term: shared-cast count divided by a denominator that is missing here.
float castScore = (float) intersectionSize(cast1,cast2)/ //<co id="co.rm.score.inter"/>
// Weighted sum; only the 0.5 title weight is visible in this copy.
return (titleScore*.5f)+ //<co id="co.rm.score.combine"/>
private int intersectionSize(Set<String> cast1, //<co id="co.rm.score.inter-equals"/>
Set<String> cast2) {
int size = 0;
for (String actor : cast1)
if (cast2.contains(actor)) size++;
return size;
<callout arearefs="co.rm.score.jaro"><para>Use the Jaro-Winkler on titles.</para></callout>
<callout arearefs="co.rm.score.distance"><para>Use the reciprocal on years.</para></callout>
<callout arearefs="co.rm.score.inter"><para>Use cast overlap percentage.</para></callout>
<callout arearefs="co.rm.score.combine"><para>Combine the scores into a single score.</para></callout>
<callout arearefs="co.rm.score.inter-equals"><para>Compute intersection using exact string matching.</para></callout>
//<end id="record-matching.scoring"/>
public static void main(String[] args) throws IOException, SolrServerException {
MovieMatcher mm = new MovieMatcher();
BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
int matches = 0;
int lines = 0;
for (String line = in.readLine();line != null; line = in.readLine()) {
String[] parts = line.split("\t");
int pi=0;
String id = parts[pi++];
String title = parts[pi++];
String ys = parts[pi++];
int year = 0;
if (!ys.equals("NULL")) {
year = Integer.parseInt(ys);
Set<String> castSet = constructCastSet(Arrays.asList(parts[pi++].split(",")));
String imdbId = mm.match(title, year, castSet);
if (imdbId != null) {
//System.err.println(matches+"/"+lines+" "+((float) matches/lines));