@SuppressWarnings({ "unchecked", "rawtypes" })
public static void importSeedUrls(BasePlatform platform, BasePath crawlDbPath, String fileName) throws Exception {
SimpleUrlNormalizer normalizer = new SimpleUrlNormalizer();
InputStream is = null;
TupleEntryCollector writer = null;
try {
Tap urlSink = platform.makeTap(platform.makeTextScheme(), crawlDbPath, SinkMode.REPLACE);
writer = urlSink.openForWrite(platform.makeFlowProcess());
is = DemoWebMiningWorkflow.class.getResourceAsStream(fileName);
if (is == null) {
throw new FileNotFoundException("The seed urls file doesn't exist");
}
List<String> lines = IOUtils.readLines(is);
for (String line : lines) {
line = line.trim();
if (line.startsWith("#")) {
continue;
}
CrawlDbDatum datum = new CrawlDbDatum(normalizer.normalize(line), 0, UrlStatus.UNFETCHED, 0.0f, 0.0f);
writer.add(datum.getTuple());
}
} catch (IOException e) {
crawlDbPath.delete(true);