// Just to be really robust, allow a huge number of redirects and retries.
FetcherPolicy policy = new FetcherPolicy();
policy.setMaxRedirects(options.getMaxRedirects());
policy.setMaxContentSize(options.getMaxSize());
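// A single fetcher thread is enough for this tool; FirefoxUserAgent identifies requests with a Firefox-style user agent string.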
SimpleHttpFetcher fetcher = new SimpleHttpFetcher(1, policy, new FirefoxUserAgent());
fetcher.setMaxRetryCount(options.getMaxRetries());
// Give a long timeout for parsing
ParserPolicy parserPolicy = new ParserPolicy(MAX_PARSE_DURATION);
SimpleParser parser = new SimpleParser(parserPolicy);
SimpleParser rawParser = new SimpleParser(parserPolicy, true);
// Create Boilerpipe content extractor
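// (Boilerpipe keeps only the main article text, stripping navigation and other boilerplate; NullLinkExtractor skips outlink extraction.)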
SimpleParser bpParser = new SimpleParser(new BoilerpipeContentExtractor(), new NullLinkExtractor(), parserPolicy);
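// Optionally crank logging up to TRACE for the log4j root logger, and expose the same level via the bixo.root.level system property.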
if (options.isTraceLogging()) {
Logger.getRootLogger().setLevel(Level.TRACE);
System.setProperty("bixo.root.level", "TRACE");
}
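// URLs can be passed as a comma-separated list; if none were given, fall into an interactive prompt loop.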
String[] urls = options.getUrls() == null ? null : options.getUrls().split(",");
boolean interactive = (urls == null);
int index = 0;
while (interactive || (index < urls.length)) {
String url;
try {
if (interactive) {
System.out.print("URL to fetch: ");
url = readInputLine();
if (url.length() == 0) {
System.exit(0);
}
} else {
url = urls[index++];
}
System.out.println("Fetching " + url);
FetchedDatum result = fetcher.get(new ScoredUrlDatum(url));
System.out.println(String.format("Fetched %s: headers = %s", result.getUrl(), result.getHeaders()));
System.out.flush();
// System.out.println("Result = " + result.toString());
ParsedDatum parsed = parser.parse(result);