// Resolve the input collection and output index locations from the command line.
String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
String indexPath = cmdline.getOptionValue(INDEX_OPTION);
long startTime = System.currentTimeMillis();

// Bail out early when the collection path does not exist.
File file = new File(collectionPath);
if (!file.exists()) {
  System.err.println("Error: " + file + " does not exist!");
  System.exit(-1);
}

// Pick the corpus reader according to the TSV flag; JSON is the default.
StatusStream stream;
if (cmdline.hasOption(TSV_OPTION)) {
  stream = new TSVStatusCorpusReader(file);
} else {
  stream = new JsonStatusCorpusReader(file);
}

// Each resource gets its own try/finally so that a failure while opening or
// closing one of them can no longer leave the previously opened ones unclosed.
// Close order (writer, then directory, then stream) is unchanged.
try {
  Directory dir = FSDirectory.open(new File(indexPath));
  try {
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_41);
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_41, analyzer);
    config.setOpenMode(OpenMode.CREATE);

    LOG.info("collection: " + collectionPath);
    LOG.info("index: " + indexPath);

    IndexWriter writer = new IndexWriter(dir, config);
    int cnt = 0;
    try {
      Status status;
      while ((status = stream.next()) != null) {
        // Statuses without text are skipped and not counted.
        if (status.getText() == null) {
          continue;
        }
        cnt++;

        Document doc = new Document();
        doc.add(new LongField(StatusField.ID.name, status.getId(), Field.Store.YES));
        doc.add(new LongField(StatusField.EPOCH.name, status.getEpoch(), Field.Store.YES));
        doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Field.Store.YES));
        doc.add(new TextField(StatusField.TEXT.name, status.getText(), Field.Store.YES));

        // Each count is stored under the field of the same name.
        doc.add(new IntField(StatusField.FRIENDS_COUNT.name, status.getFriendsCount(), Field.Store.YES));
        doc.add(new IntField(StatusField.FOLLOWERS_COUNT.name, status.getFollowersCount(), Field.Store.YES));
        doc.add(new IntField(StatusField.STATUSES_COUNT.name, status.getStatusesCount(), Field.Store.YES));

        // Reply fields are only added when a positive in-reply-to status id is present.
        long inReplyToStatusId = status.getInReplyToStatusId();
        if (inReplyToStatusId > 0) {
          doc.add(new LongField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId, Field.Store.YES));
          doc.add(new LongField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId(), Field.Store.YES));
        }

        // A missing language is treated like "unknown": no LANG field is added.
        // Without the null guard a single such status would abort the whole run.
        String lang = status.getLang();
        if (lang != null && !"unknown".equals(lang)) {
          doc.add(new TextField(StatusField.LANG.name, lang, Field.Store.YES));
        }

        // Retweet fields are only added when a positive retweeted status id is present.
        long retweetStatusId = status.getRetweetedStatusId();
        if (retweetStatusId > 0) {
          int retweetCount = status.getRetweetCount();
          doc.add(new LongField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId, Field.Store.YES));
          doc.add(new LongField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId(), Field.Store.YES));
          doc.add(new IntField(StatusField.RETWEET_COUNT.name, retweetCount, Field.Store.YES));
          // NOTE(review): retweetStatusId is already > 0 in this branch, so the
          // second half of this test cannot fire; possibly the retweeted user id
          // was meant to be checked instead -- confirm before changing.
          if (retweetCount < 0 || retweetStatusId < 0) {
            LOG.warn("Error parsing retweet fields of " + status.getId());
          }
        }

        writer.addDocument(doc);
        if (cnt % 100000 == 0) {
          LOG.info(cnt + " statuses indexed");
        }
      }

      LOG.info(String.format("Total of %s statuses added", cnt));
      LOG.info("Merging segments...");
      writer.forceMerge(1);
      LOG.info("Done!");
      LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
    } catch (Exception e) {
      // Report through the logger (with the stack trace) instead of raw stderr,
      // and include how far indexing got before the failure.
      LOG.error("Indexing failed after " + cnt + " statuses", e);
    } finally {
      writer.close();
    }
  } finally {
    dir.close();
  }
} finally {
  stream.close();
}
}