// Per-segment input files: the fetch results and the parse results.
File fetchFile = new File(getSegmentPath(), "fetch/info.avro");
File parseFile = new File(getSegmentPath(), "parse_data/info.avro");

// Step 1: feed every CrawlDatum of the existing crawldb file to addToRedis.
// The reader is closed in a finally block so the underlying file is released
// even if reading or addToRedis throws.
DbReader<CrawlDatum> reader = new DbReader<CrawlDatum>(CrawlDatum.class, crawldbFile);
try {
    while (reader.hasNext()) {
        addToRedis(reader.readNext());
    }
} finally {
    reader.close();
}

// Step 2: if this segment produced fetch results, feed them to addToRedis too.
// A fresh reader is opened only after the crawldb reader has been closed;
// previously the first reader was overwritten here and never closed.
if (fetchFile.exists()) {
    reader = new DbReader<CrawlDatum>(CrawlDatum.class, fetchFile);
    try {
        while (reader.hasNext()) {
            addToRedis(reader.readNext());
        }
    } finally {
        reader.close();
    }
}
// Step 3: if this segment produced parse results, register every extracted
// link through addToRedis as a CrawlDatum with status STATUS_DB_UNFETCHED and
// fetch time FETCHTIME_UNDEFINED.
if (parseFile.exists()) {
    DbReader<ParseData> parseReader = new DbReader<ParseData>(ParseData.class, parseFile);
    try {
        while (parseReader.hasNext()) {
            ParseData parseData = parseReader.readNext();
            // Records without a link collection contribute nothing.
            if (parseData.getLinks() == null) {
                continue;
            }
            for (Link link : parseData.getLinks()) {
                CrawlDatum datum = new CrawlDatum();
                datum.setUrl(link.getUrl());
                datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
                datum.setFetchTime(CrawlDatum.FETCHTIME_UNDEFINED);
                addToRedis(datum);
            }
        }
    } finally {
        // Always release the parse-data file, even when iteration fails.
        parseReader.close();
    }
}
// Step 4: write every field of the Redis hash stored under getCrawlPath() as a
// CrawlDatum to the file Config.current_info_path inside the crawl directory.
DbWriter<CrawlDatum> writer = new DbWriter<CrawlDatum>(CrawlDatum.class, new File(getCrawlPath(), Config.current_info_path));
// Field names of the hash; each one is used as the CrawlDatum URL below.
// NOTE(review): raw Set/Iterator types are used here; the elements are only
// consumed through toString().
Set set = jedis.hkeys(getCrawlPath());
Iterator ite = set.iterator();
while (ite.hasNext()) {
String key = ite.next().toString();
// One hget call per field.
// NOTE(review): value is dereferenced without a null check — confirm the field
// cannot disappear between hkeys and hget.
String value = jedis.hget(getCrawlPath(), key);
// The stored value is decoded as: first character = status (a single digit),
// remaining characters = fetch time as a long.
// NOTE(review): presumably this mirrors the encoding written by addToRedis — confirm.
int status = Integer.valueOf(value.charAt(0) + "");
long fetchTime = Long.valueOf(value.substring(1));
CrawlDatum datum = new CrawlDatum();
datum.setUrl(key);
datum.setStatus(status);
datum.setFetchTime(fetchTime);
writer.write(datum);
// Progress log every 5000 written records.
if (writeCount.incrementAndGet() % 5000 == 0) {
LogUtils.getLogger().info(writeCount.get() + " crawlDatum write to crawldb");
}
//LogUtils.getLogger().info("write "+datum.getUrl());