}
}
/**
 * Runs the command-line tool: parses the options, loads the
 * configuration named by the first remaining argument, wires up the
 * match listeners, and then runs either deduplication or record
 * linkage depending on the configuration.
 *
 * <p>A command-line parse error is reported on stderr, followed by a
 * call to {@code usage()} and {@code System.exit(1)}. Configuration
 * loading errors are reported on stderr and the method returns
 * without processing anything.
 *
 * <p>Once the processor has been created it is always closed before
 * this method returns or throws, as is the link file listener if one
 * was created.
 *
 * @param argv the raw command-line arguments
 * @throws IOException declared for the I/O performed by the calls made here
 * @throws DukeConfigException if --singlematch is requested while the
 *         configuration is in deduplication mode
 */
public static void main_(String[] argv) throws IOException {
  // parse command-line
  CommandLineParser parser = setupParser();
  try {
    argv = parser.parse(argv);
  } catch (CommandLineParser.CommandLineParserException e) {
    System.err.println("ERROR: " + e.getMessage());
    usage();
    System.exit(1);
  }

  // set up some initial options
  boolean datadebug = parser.getOptionState("showdata");
  Logger logger = new CommandLineLogger(parser.getOptionState("verbose") ?
                                        1 : 0);
  boolean progress = parser.getOptionState("progress");
  int batchSize = parser.getOptionInteger("batchsize", 40000);
  int threads = parser.getOptionInteger("threads", 1);

  // load the configuration
  Configuration config;
  try {
    config = ConfigLoader.load(argv[0]);
  } catch (FileNotFoundException e) {
    System.err.println("ERROR: Config file '" + argv[0] + "' not found!");
    return;
  } catch (SAXParseException e) {
    // must be caught before SAXException, which is its superclass
    System.err.println("ERROR: Couldn't parse config file: " + e.getMessage());
    System.err.println("Error in " + e.getSystemId() + ":" +
                       e.getLineNumber() + ":" + e.getColumnNumber());
    return;
  } catch (SAXException e) {
    System.err.println("ERROR: Couldn't parse config file: " + e.getMessage());
    return;
  }

  // in data debug mode (--showdata) we skip validation, show the data,
  // and stop here
  if (datadebug) {
    showdata(config);
    return;
  }

  // validate the configuration
  config.validate();

  // set up the processor
  boolean noreindex = parser.getOptionState("noreindex");
  Processor processor = new Processor(config, !noreindex);

  // declared outside the try block so that the finally clause can see it
  AbstractLinkFileListener linkfile = null;
  try {
    processor.setLogger(logger);
    processor.setThreads(threads);

    // sanity check
    if (noreindex && processor.getDatabase().isInMemory()) {
      System.out.println("Option --noreindex not available with in-memory " +
                         "database");
      return;
    }

    // display lookup properties?
    if (parser.getOptionState("lookups")) {
      System.out.println("Lookup properties:");
      for (Property p : config.getLookupProperties())
        System.out.println(" " + p.getName());
      System.out.println();
    }

    // --interactive implies both --pretty and --showmatches
    boolean interactive = parser.getOptionState("interactive");
    boolean pretty = parser.getOptionState("pretty") || interactive;
    boolean showmatches = parser.getOptionState("showmatches") || interactive;
    PrintMatchListener listener =
      new PrintMatchListener(showmatches,
                             parser.getOptionState("showmaybe"),
                             progress,
                             !config.isDeduplicationMode(),
                             config.getProperties(),
                             pretty);
    processor.addMatchListener(listener);

    // needs to be before the link file handler, in case the link file
    // is the same as the test file
    String testfileName = parser.getOptionValue("testfile");
    if (testfileName != null) {
      TestFileListener testfile =
        new TestFileListener(testfileName,
                             config,
                             parser.getOptionState("testdebug"),
                             processor,
                             showmatches,
                             pretty);
      testfile.setPessimistic(true);
      processor.addMatchListener(testfile);
    }

    // the file name extension decides which link file format is written
    String linkfileName = parser.getOptionValue("linkfile");
    if (linkfileName != null) {
      if (linkfileName.endsWith(".ntriples"))
        linkfile = new NTriplesLinkFileListener(linkfileName,
                                                config.getIdentityProperties());
      else
        linkfile = new LinkFileListener(linkfileName,
                                        config.getIdentityProperties(),
                                        interactive,
                                        testfileName);
      processor.addMatchListener(linkfile);
    }

    // --profile
    if (parser.getOptionState("profile"))
      processor.setPerformanceProfiling(true);

    // --singlematch setting
    boolean matchall = true;
    if (parser.getOptionState("singlematch")) {
      if (config.isDeduplicationMode())
        throw new DukeConfigException("--singlematch only works in record linkage mode");
      matchall = false;
    }

    // this is where we get started for real. the first thing we do
    // is to distinguish between modes.
    if (config.isDeduplicationMode()) {
      // deduplication mode
      processor.deduplicate(config.getDataSources(), batchSize);
    } else if (noreindex) {
      // record linkage mode, but the user has specified that they
      // already have group 1 indexed up, and don't want to do it again,
      // for whatever reason. in that case we just do the linking, and
      // don't touch group 1 at all.
      processor.linkRecords(config.getDataSources(2), matchall);
    } else {
      // ordinary record linkage mode
      processor.link(config.getDataSources(1),
                     config.getDataSources(2),
                     matchall,
                     batchSize);
    }
  } finally {
    // close up shop: link file first, then the processor, on every exit
    // path. the nested finally makes sure the processor is closed even
    // if closing the link file fails. note that an exception thrown
    // while closing replaces any exception already in flight.
    try {
      if (linkfile != null)
        linkfile.close();
    } finally {
      processor.close();
    }
  }
}