// Merge step: combine the datums read from reader_current with the datums
// read from the fetch file into datums_origin, keeping one slot per URL.
DbReader<CrawlDatum> reader_fetch = new DbReader<CrawlDatum>(CrawlDatum.class, file_fetch);
// Maps a URL to the index of that URL's datum inside datums_origin.
HashMap<String, Integer> indexmap = new HashMap<String, Integer>();
ArrayList<CrawlDatum> datums_origin = new ArrayList<CrawlDatum>();
CrawlDatum datum = null;
try {
    // Load every datum from reader_current and remember where each URL lives.
    // NOTE(review): reader_current is not closed in this span; confirm that it
    // is closed elsewhere in the enclosing method.
    while (reader_current.hasNext()) {
        datum = reader_current.readNext();
        datums_origin.add(datum);
        indexmap.put(datum.getUrl(), datums_origin.size() - 1);
    }

    // Fold the fetch results in on top of the loaded datums.
    while (reader_fetch.hasNext()) {
        datum = reader_fetch.readNext();
        // Values stored in indexmap are never null, so a null result means
        // the URL has not been seen yet.
        Integer preindex = indexmap.get(datum.getUrl());
        if (preindex == null) {
            // New URL: append it and record its position.
            datums_origin.add(datum);
            indexmap.put(datum.getUrl(), datums_origin.size() - 1);
        } else if (datum.getStatus() != CrawlDatum.STATUS_DB_UNFETCHED) {
            // Known URL: overwrite the stored datum in place. The index does
            // not change, so indexmap needs no update. A fetch datum whose
            // status is STATUS_DB_UNFETCHED never replaces an existing entry.
            datums_origin.set(preindex, datum);
        }
    }
} finally {
    // Release the fetch reader even when one of the read loops throws.
    reader_fetch.close();
}
// Merge step: if the segment has a parse_data/info.avro file, register every
// link found in it whose URL is not already present in indexmap.
File file_parse = new File(getSegmentPath(), "parse_data/info.avro");
if (file_parse.exists()) {
    DbReader<ParseData> reader_parse = new DbReader<ParseData>(ParseData.class, file_parse);
    try {
        ParseData parseresult = null;
        while (reader_parse.hasNext()) {
            parseresult = reader_parse.readNext();
            for (Link link : parseresult.getLinks()) {
                // Build a candidate datum for the link, marked with
                // STATUS_DB_UNFETCHED.
                datum = new CrawlDatum();
                datum.setUrl(link.getUrl());
                datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
                // Only URLs that are not yet indexed are appended; an
                // existing entry for the same URL is left untouched.
                if (!indexmap.containsKey(datum.getUrl())) {
                    datums_origin.add(datum);
                    indexmap.put(datum.getUrl(), datums_origin.size() - 1);
                }
            }
        }
    } finally {
        // Release the parse reader even when reading or link iteration throws.
        reader_parse.close();
    }
}