SimpleHttpFetcher fetcher = new SimpleHttpFetcher(1, policy, new FirefoxUserAgent());
fetcher.setMaxRetryCount(options.getMaxRetries());
// Give a long timeout for parsing
ParserPolicy parserPolicy = new ParserPolicy(MAX_PARSE_DURATION);
SimpleParser parser = new SimpleParser(parserPolicy);
SimpleParser rawParser = new SimpleParser(parserPolicy, true);
// Create a parser that uses the Boilerpipe content extractor
SimpleParser bpParser = new SimpleParser(new BoilerpipeContentExtractor(), new NullLinkExtractor(), parserPolicy);
if (options.isTraceLogging()) {
Logger.getRootLogger().setLevel(Level.TRACE);
System.setProperty("bixo.root.level", "TRACE");
}
String urls[] = options.getUrls() == null ? null : options.getUrls().split(",");
boolean interactive = (urls == null);
int index = 0;
while (interactive || (index < urls.length)) {
String url;
try {
if (interactive) {
System.out.print("URL to fetch: ");
url = readInputLine();
if (url.length() == 0) {
System.exit(0);
}
} else {
url = args[index++];
}
System.out.println("Fetching " + url);
FetchedDatum result = fetcher.get(new ScoredUrlDatum(url));
System.out.println(String.format("Fetched %s: headers = %s", result.getUrl(), result.getHeaders()));
System.out.flush();
// System.out.println("Result = " + result.toString());
ParsedDatum parsed = parser.parse(result);
System.out.println(String.format("Parsed %s: lang = %s, size = %d", parsed.getUrl(),
parsed.getLanguage(), parsed.getParsedText().length()));
ParsedDatum bpParsed = bpParser.parse(result);
ParsedDatum rawParsed = rawParser.parse(result);
if (interactive) {
while (true) {
System.out.print("Next action - (d)ump regular, dump (b)oilerpipe, dump (r)aw, (e)xit: ");