}
// Record the input collection path and the output index path in the log.
LOG.info("collection: " + collectionPath);
LOG.info("index: " + indexPath);
// Optionally load the set of deleted tweet ids so they can be skipped while indexing.
// The deletes file is read as bzip2-compressed text, one tweet id per line; anything after
// the first tab on a line is ignored. `deletes` stays null when the option is not given.
LongOpenHashSet deletes = null;
if (cmdline.hasOption(DELETES_OPTION)) {
  deletes = new LongOpenHashSet();
  File deletesFile = new File(cmdline.getOptionValue(DELETES_OPTION));
  if (!deletesFile.exists()) {
    System.err.println("Error: " + deletesFile + " does not exist!");
    System.exit(-1);
  }
  LOG.info("Reading deletes from " + deletesFile);

  FileInputStream fin = new FileInputStream(deletesFile);
  try {
    // Consume the leading "B", "Z" bytes written by the commandline tools before handing the
    // stream to CBZip2InputStream. A short read means the file cannot be a valid archive.
    byte[] ignoreBytes = new byte[2];
    if (fin.read(ignoreBytes) != ignoreBytes.length) {
      System.err.println("Error: could not read the 2-byte bzip2 header from " + deletesFile);
      System.exit(-1);
    }

    // Decode with an explicit charset so parsing does not depend on the platform default.
    BufferedReader br = new BufferedReader(
        new InputStreamReader(new CBZip2InputStream(fin), "UTF-8"));
    String s;
    while ((s = br.readLine()) != null) {
      // The tweet id is the first tab-separated field (or the whole line if there is no tab).
      int tab = s.indexOf('\t');
      String id = (tab >= 0 ? s.substring(0, tab) : s).trim();
      if (id.length() == 0) {
        // Blank line or empty id column: nothing to record.
        continue;
      }
      deletes.add(Long.parseLong(id));
    }
  } finally {
    // The reader and decompressor only wrap fin, so closing it releases the file handle even
    // when reading or parsing fails part-way through.
    fin.close();
  }
  LOG.info("Read " + deletes.size() + " tweetids from deletes file.");
}
// Optional upper bound on tweet ids. Defaults to Long.MAX_VALUE, so that no id can exceed it
// unless the option is supplied on the command line.
long maxId = Long.MAX_VALUE;
if (cmdline.hasOption(MAX_ID_OPTION)) {
  maxId = Long.parseLong(cmdline.getOptionValue(MAX_ID_OPTION));
  // Label the value as the max id (the previous "index: " label duplicated the index path log).
  LOG.info("max id: " + maxId);
}
// Capture the wall-clock start time (presumably for an elapsed-time report later; the use is
// outside this section).
long startTime = System.currentTimeMillis();
// The collection path must exist; otherwise report the problem and terminate.
File file = new File(collectionPath);
if (!file.exists()) {
System.err.println("Error: " + file + " does not exist!");
System.exit(-1);
}
// Source of statuses, read from the collection path.
StatusStream stream = new JsonStatusCorpusReader(file);
// Open the index directory and configure the writer with the shared analyzer.
Directory dir = FSDirectory.open(new File(indexPath));
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, IndexStatuses.ANALYZER);
// NOTE(review): per the Lucene docs, OpenMode.CREATE replaces any existing index at this
// location rather than appending to it; confirm that is intended.
config.setOpenMode(OpenMode.CREATE);
IndexWriter writer = new IndexWriter(dir, config);
// Counter initialised to zero (presumably the number of statuses indexed; it is updated
// outside this section).
int cnt = 0;
// Holds the status currently being processed by the loop below.
Status status;
try {
while ((status = stream.next()) != null) {
if (status.getText() == null) {
continue;
}
// Skip deletes tweetids.
if (deletes != null && deletes.contains(status.getId())) {
continue;
}
if (status.getId() > maxId) {
continue;