Package no.priv.garshol.duke.matchers

Source Code of no.priv.garshol.duke.matchers.LinkDatabaseMatchListener

package no.priv.garshol.duke.matchers;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;

import no.priv.garshol.duke.Configuration;
import no.priv.garshol.duke.DukeException;
import no.priv.garshol.duke.Link;
import no.priv.garshol.duke.LinkDatabase;
import no.priv.garshol.duke.LinkKind;
import no.priv.garshol.duke.LinkStatus;
import no.priv.garshol.duke.Property;
import no.priv.garshol.duke.Record;

/**
* Maintains a LinkDatabase of the recorded matches. Assumes that the
* same record may be processed several times (for example in
* different runs), and will keep the database correctly updated.
*
* <p><b>WARNING:</b> This class is not thread-safe, so attempting to
* use it with multiple threads will lead to database corruption.
*/
public class LinkDatabaseMatchListener extends AbstractMatchListener {
  private Configuration config;
  private LinkDatabase linkdb;
  private Record current;
  private Collection<Link> curlinks;

  public LinkDatabaseMatchListener(Configuration config, LinkDatabase linkdb) {
    this.config = config;
    this.linkdb = linkdb;
  }

  // the only callbacks we get are matches(), matchesPerhaps(), and
  // noMatchFor(). from these, we need to work out when Duke starts
  // on a new record, and call startRecord_() and endRecord_()
  // accordingly.

  public void matches(Record r1, Record r2, double confidence) {
    if (r1 != current) {
      // we've finished processing the previous record, so make the calls
      if (current != null)
        endRecord_();
      startRecord_(r1);
    }

    String id1 = getIdentity(r1);
    String id2 = getIdentity(r2);
    curlinks.add(new Link(id1, id2, LinkStatus.INFERRED, LinkKind.SAME,
                          confidence));
  }

  public void matchesPerhaps(Record r1, Record r2, double confidence) {
    if (r1 != current) {
      // we've finished processing the previous record, so make the calls
      if (current != null)
        endRecord_();
      startRecord_(r1);
    }

    String id1 = getIdentity(r1);
    String id2 = getIdentity(r2);
    curlinks.add(new Link(id1, id2, LinkStatus.INFERRED, LinkKind.MAYBESAME,
                          confidence));
  }

  public void noMatchFor(Record record) {
    // this is the only call we'll get for this record. it means the
    // previous record has ended, and this one has begun (and will end
    // with the next call, whatever it is)
    if (current != null)
      endRecord_();
    startRecord_(record);
    // next callback will trigger endRecord_()
  }

  // this method is called from the event methods
  public void startRecord_(Record r) {
    current = r;
    curlinks = new ArrayList();
  }

  // this method is called from the event methods
  public void endRecord_() {
    // this is where we actually update the link database. basically,
    // all we need to do is to retract those links which weren't seen
    // this time around, and that can be done via assertLink, since it
    // can override existing links.

    // get all the existing links
    Collection<Link> oldlinks = linkdb.getAllLinksFor(getIdentity(current));

    // build a hashmap so we can look up corresponding old links from
    // new links
    if (oldlinks != null) {
      Map<String, Link> oldmap = new HashMap(oldlinks.size());
      for (Link l : oldlinks)
        oldmap.put(makeKey(l), l);

      // removing all the links we found this time around from the set of
      // old links. any links remaining after this will be stale, and need
      // to be retracted
      for (Link newl : new ArrayList<Link>(curlinks)) {
        String key = makeKey(newl);
        Link oldl = oldmap.get(key);
        if (oldl == null)
          continue;

        if (oldl.overrides(newl))
          // previous information overrides this link, so ignore
          curlinks.remove(newl);
        else if (sameAs(oldl, newl)) {
          // there's no new information here, so just ignore this
          curlinks.remove(newl);
          oldmap.remove(key); // we don't want to retract the old one
        } else
          // the link is out of date, but will be overwritten, so remove
          oldmap.remove(key);
      }

      // all the inferred links left in oldmap are now old links we
      // didn't find on this pass. there is no longer any evidence
      // supporting them, and so we can retract them.
      for (Link oldl : oldmap.values())
        if (oldl.getStatus() == LinkStatus.INFERRED) {
          oldl.retract(); // changes to retracted, updates timestamp
          curlinks.add(oldl);
        }
    }

    // okay, now we write it all to the database
    for (Link l : curlinks)
      linkdb.assertLink(l);
  }

  public void batchReady(int size) {
    linkdb.validateConnection();
  }

  public void batchDone() {
    // clearly, this is the end of the previous record
    endRecord_();
    current = null;
    linkdb.commit();
  }

  private String getIdentity(Record r) {
    for (Property p : config.getIdentityProperties()) {
      Collection<String> vs = r.getValues(p.getName());
      if (vs == null)
        continue;
      for (String v : vs)
        return v;
    }
    throw new DukeException("No identity found in record [" +
                            PrintMatchListener.toString(r) + "]");
  }

  private String makeKey(Link l) {
    return l.getID1() + "\t" + l.getID2();
  }

  private boolean sameAs(Link l1, Link l2) {
    // we know the IDs are the same, so we're not going to check those
    return l1.getStatus() == l2.getStatus() &&
      l1.getKind() == l2.getKind();
    // confidence and timestamp are irrelevant
  }
}
TOP

Related Classes of no.priv.garshol.duke.matchers.LinkDatabaseMatchListener

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.