Package bixo.tools

Source Code of bixo.tools.FetchAndParseTool$FirefoxUserAgent

/*
* Copyright 2009-2013 Scale Unlimited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package bixo.tools;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;

import bixo.config.FetcherPolicy;
import bixo.config.ParserPolicy;
import bixo.config.UserAgent;
import bixo.datum.FetchedDatum;
import bixo.datum.ParsedDatum;
import bixo.datum.ScoredUrlDatum;
import bixo.fetcher.SimpleHttpFetcher;
import bixo.parser.BoilerpipeContentExtractor;
import bixo.parser.NullLinkExtractor;
import bixo.parser.SimpleParser;

public class FetchAndParseTool {


  @SuppressWarnings("serial")
  private static class FirefoxUserAgent extends UserAgent {
    public FirefoxUserAgent() {
      super("Firefox", "", "");
    }
   
    @Override
    public String getUserAgentString() {
        // Use standard Firefox agent name, as some sites won't work w/non-standard names.
      return "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.0.8) Gecko/2009032608 Firefox/3.0.8";
    }
  }

    private static final int MAX_PARSE_DURATION = 180 * 1000;
 
    private static String readInputLine() throws IOException {
        InputStreamReader isr = new InputStreamReader(System.in);
        BufferedReader br = new BufferedReader(isr);
       
        try {
            return br.readLine();
        } finally {
            // TODO KKr - will this actually close System.in?
            // Should I reuse this buffered reader? Check out password masking code.
            // br.close();
        }
    }

    private static void printUsageAndExit(CmdLineParser parser) {
        parser.printUsage(System.err);
        System.exit(-1);
    }

    /**
     * @param args - URL to fetch
     */
    public static void main(String[] args) {
        FetchAndParseToolOptions options = new FetchAndParseToolOptions();
        CmdLineParser cmdParser = new CmdLineParser(options);
       
        try {
            cmdParser.parseArgument(args);
        } catch(CmdLineException e) {
            System.err.println(e.getMessage());
            printUsageAndExit(cmdParser);
        }

        // Just to be really robust, allow a huge number of redirects and retries.
        FetcherPolicy policy = new FetcherPolicy();
        policy.setMaxRedirects(options.getMaxRedirects());
        policy.setMaxContentSize(options.getMaxSize());
        SimpleHttpFetcher fetcher = new SimpleHttpFetcher(1, policy, new FirefoxUserAgent());
        fetcher.setMaxRetryCount(options.getMaxRetries());
       
        // Give a long timeout for parsing
        ParserPolicy parserPolicy = new ParserPolicy(MAX_PARSE_DURATION);
        SimpleParser parser = new SimpleParser(parserPolicy);

        SimpleParser rawParser = new SimpleParser(parserPolicy, true);
       
        // Create Boilperpipe content extractor
        SimpleParser bpParser = new SimpleParser(new BoilerpipeContentExtractor(), new NullLinkExtractor(), parserPolicy);
       
        if (options.isTraceLogging()) {
            Logger.getRootLogger().setLevel(Level.TRACE);
            System.setProperty("bixo.root.level", "TRACE");
        }
       
        String urls[] = options.getUrls() == null ? null : options.getUrls().split(",");
        boolean interactive = (urls == null);
        int index = 0;
       
        while (interactive || (index < urls.length)) {
          String url;
         
          try {
              if (interactive) {
                System.out.print("URL to fetch: ");
                url = readInputLine();
                if (url.length() == 0) {
                  System.exit(0);
                }
              } else {
                url = args[index++];
              }

              System.out.println("Fetching " + url);
            FetchedDatum result = fetcher.get(new ScoredUrlDatum(url));
            System.out.println(String.format("Fetched %s: headers = %s", result.getUrl(), result.getHeaders()));
            System.out.flush();
           
            // System.out.println("Result = " + result.toString());
            ParsedDatum parsed = parser.parse(result);
            System.out.println(String.format("Parsed %s: lang = %s, size = %d", parsed.getUrl(),
                            parsed.getLanguage(), parsed.getParsedText().length()));
           
            ParsedDatum bpParsed = bpParser.parse(result);
            ParsedDatum rawParsed = rawParser.parse(result);
           
            if (interactive) {
                while (true) {
                    System.out.print("Next action - (d)ump regular, dump (b)oilerpipe, dump (r)aw, (e)xit: ");
                    String action = readInputLine();
                    if (action.startsWith("e") || (action.length() == 0)) {
                        break;
                        } else if (action.startsWith("d")) {
                            System.out.println("=====================================================================");
                            System.out.println(parsed.getParsedText());
                            System.out.println("=====================================================================");
                        } else if (action.startsWith("b")) {
                            System.out.println("=====================================================================");
                            System.out.println(bpParsed.getParsedText());
                            System.out.println("=====================================================================");
                        } else if (action.startsWith("r")) {
                            System.out.println("=====================================================================");
                            System.out.println(rawParsed.getParsedText());
                            System.out.println("=====================================================================");
                    } else {
                        System.out.println("Unknown command - " + action);
                    }
                }
            }
          } catch (Exception e) {
            e.printStackTrace(System.out);
               
            if (interactive) {
                System.out.println();
                System.out.flush();
            } else {
              System.exit(-1);
            }
          }
        }
    }

}
TOP

Related Classes of bixo.tools.FetchAndParseTool$FirefoxUserAgent

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.