Package fr.eolya.utils.http

Examples of fr.eolya.utils.http.HttpLoader


            // Response metadata; these stay null unless the page is loaded successfully below.
            String charSet = null;
//           String declaredLanguage = null;
            String contentType = null;
            String contentEncoding = null;
           
            HttpLoader urlLoader = new HttpLoader();
           
            // Result of open(); compared with HttpLoader.LOAD_SUCCESS further down.
            int ret = -1;
//           int tryCount = 0;
//           String temp = url;
           
            ret = urlLoader.open(url);
           
            // Disabled retry loop, kept for reference: up to three attempts, URL-encoding the
            // address after an "invalid uri" error on the first failure and sleeping otherwise.
//            while (ret == -1 && tryCount < 3) {
//                try {
//                    urlLoader.close();
//                    ret = urlLoader.open("", "", true);
//                }
//                catch (IOException e) {
//                    String msg = e.getMessage();
//                    if (tryCount == 0 && msg!=null && msg.toLowerCase().startsWith("invalid uri")) {
//                        System.out.println(msg);
//                        temp = HttpUtils.urlEncode(temp);
//                        urlLoader.setUrl(temp);
//                    }
//                    else {
//                        Utils.sleep(tryCount * 1000);               
//                    }
//                    tryCount++;
//                    ret = -1;
//                    urlLoader.close();
//                    if (tryCount == 3) throw new IOException(e.getMessage());
//                }
//            }
           
            // On success, read the response through a WebStream: the page body goes into rawPage
            // (declared outside this excerpt) and the character set reported by the stream into charSet.
            if (ret == HttpLoader.LOAD_SUCCESS) {
                contentType = urlLoader.getContentType();
                contentEncoding = urlLoader.getContentEncoding();
                WebStream ws = new WebStream(urlLoader.getStream(), "", contentType, contentEncoding);
                rawPage = ws.getString();
                charSet = ws.getCharSet();
                //String declaredLanguage = ws.getDeclaredLanguage();
                ws.clear();
            }   
           
            if ("links".equals(action)) {
                printVerbose(url, scriptsPath, action, verbose);
                List<String> links = null;
                if (HttpLoader.isRss(contentType, null)) {
                    links = HttpUtils.extractLinksFromFeed(rawPage);
                }
                else {
                    links = HttpUtils.extractAbsoluteLinks(rawPage, url, 2);
                    String [] aLinks = htmlLinks(url, rawPage, links.toArray(new String[]{}), scriptsPath, null);
                    links = Arrays.asList(aLinks)
                }
               
                for (String strLink : links) {
                    try {
                        //strLink = strLink.trim();
                        //strLink = URLUtils.urlGetAbsoluteURL(url, strLink);
                        strLink = HttpUtils.urlNormalize(strLink.trim(), null)
                        System.out.println(strLink);
                    }
                    catch (Exception e) {
                        e.printStackTrace();
                    }
                }
            }
           
            if ("parse".equals(action)) {
                printVerbose(url, scriptsPath, action, verbose);
                String title = "";
                String d = "";
                String page = "";
               
                HashMap<String, String> m = htmlParse(url, rawPage, contentType, scriptsPath, null);
                if (m!=null && m.size()>0) {
                    title = m.get("title");
                    d = m.get("date");
                    page = m.get("page");
                }
               
                // Get page text
                //MultiFormatTextExtractor extractor = new MultiFormatTextExtractor();
        TikaWrapper tikaWrapper = new TikaWrapper(TikaWrapper.OUTPUT_FORMAT_TEXT);

                String text = "";
                InputStream in = null;
                if (page==null || "".equals(page)) {
                    //text = extractor.htmlPageToText(rawPage, "", "");
                  in = IOUtils.toInputStream(rawPage);
                } else {
                    //text = extractor.htmlPageToText(page, "", "");
                  in = IOUtils.toInputStream(page);
                }
               
        tikaWrapper.process(in, TikaWrapper.CONTENT_TYPE_HTML);
        text = tikaWrapper.getText();
       
                if (title==null || "".equals(title))
                    title = tikaWrapper.getMetaTitle();
               
                System.out.println("Title = "+ title);
                System.out.println("Date  = " + d);
                System.out.println("Text  = " + text);
                System.out.println("Page  = " + page);
            }
           
            if ("meta".equals(action)) {
                printVerbose(url, scriptsPath, action, verbose);
                HashMap<String, String> m = extractMeta(url, rawPage, contentType, charSet, scriptsPath, null, false);
                if (m!=null && m.size()>0) {
                    for (Map.Entry<String, String> entry : m.entrySet()) {
                        System.out.println("meta_extracted_" + entry.getKey() + " = " + entry.getValue());
                    }
                }
            }
            urlLoader.close();
            urlLoader = null;
        }
        catch (Exception e) {
            e.printStackTrace();
        }
View Full Code Here


  public GeoLocalisation (String host, String knowedIP, Properties prop) throws URISyntaxException
  {
    this.host = host;
    this.knowedIP = knowedIP;
    httpLoader = new HttpLoader();
    if (prop!=null && !"".equals(prop.getProperty("proxy.host", "")) && !"".equals(prop.getProperty("proxy.port", ""))) {
      httpLoader.setProxyHost(prop.getProperty("proxy.host", ""));
      httpLoader.setProxyPort(Integer.parseInt(prop.getProperty("proxy.port", "")));
      if (!"".equals(prop.getProperty("proxy.exclude", ""))) httpLoader.setProxyExclude(prop.getProperty("proxy.exclude", ""));
      if (!"".equals(prop.getProperty("proxy.username", "")) && !"".equals(prop.getProperty("proxy.password", ""))) {
View Full Code Here

  private int cacheMode = CACHE_NONE;

  public WebPageLoader(int cacheMode, String type, IDBConnection con, String dbName, String dbCollName, String sourceId) {
    this.cacheMode = cacheMode;
    if (this.cacheMode!=CACHE_ONLY) httpLoader = new HttpLoader();
    if (this.cacheMode!=CACHE_NONE) cache = DocumentCacheFactory.getDocumentCacheInstance(type, con, dbName, dbCollName, sourceId);
    cacheItem = null;
  }
View Full Code Here

public class HttpLoaderTest extends TestCase {

  @Test
  public void testHttpLoader(){
    try {
      HttpLoader loader = new HttpLoader();
      assertEquals(HttpLoader.LOAD_SUCCESS, loader.open("http://www.google.fr/"));
     
      assertEquals(200, loader.getResponseStatusCode());
      assertEquals("text/html; charset=ISO-8859-1", loader.getContentType());
      assertEquals("ISO-8859-1", loader.getCharSet());

      loader = new HttpLoader();
      assertEquals(HttpLoader.LOAD_SUCCESS, loader.open("https://www.google.fr/"));
     
      assertEquals(200, loader.getResponseStatusCode());
      assertEquals("text/html; charset=ISO-8859-1", loader.getContentType());
      assertEquals("ISO-8859-1", loader.getCharSet());

      loader = new HttpLoader();
      assertEquals(HttpLoader.LOAD_ERROR, loader.open("https://www.eolya.fr/"));

      loader = new HttpLoader();
      assertEquals(HttpLoader.LOAD_ERROR, loader.open("http://www.googlegooglegoogle.fr/"));

      loader = new HttpLoader();
      assertEquals(HttpLoader.LOAD_ERROR, loader.open("http://www.google.fr/zzzzz/"));
      assertEquals(404, loader.getResponseStatusCode())
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
View Full Code Here

        String authParam = StringUtils.trimToEmpty(req.getParameter("auth_param")).trim();
        Map<String, String> authCookies = null;
        Map<String, String> authBasicLogin = null;
       
        try {
          HttpLoader urlLoader;
      //try {
        urlLoader = new HttpLoader();
      //} catch (URISyntaxException e1) {
      //  e1.printStackTrace();
            //    return XmlResponse.buildErrorXml(10, "Failed load page (bad url : " + page + ")");
      //}
           
            String userAgent = ServletUtils.getSetting(this, xmlConfig, "crawler_user_agent", "CaBot");
            urlLoader.setUserAgent(userAgent);
           
            if (!"0".equals(authMode)) {
                if ("3".equals(authMode)) {
          authBasicLogin = new HashMap<String, String>();
          authBasicLogin.put("login",authLogin);
          authBasicLogin.put("password",authPasswd);   
                    urlLoader.setBasicLogin(authBasicLogin);
                } else {
                    authCookies = HttpUtils.getAuthCookies(Integer.parseInt(authMode), authLogin, authPasswd, authParam,
                            ServletUtils.getSetting(this, xmlConfig, "proxy_host", ""), ServletUtils.getSetting(this, xmlConfig, "proxy_port", ""), ServletUtils.getSetting(this, xmlConfig, "proxy_exclude", ""), ServletUtils.getSetting(this, xmlConfig, "proxy_username", ""), ServletUtils.getSetting(this, xmlConfig, "proxy_password", ""));                 
                    if (authCookies!=null)
                        urlLoader.setCookies(authCookies);
                    else
                        return XmlResponse.buildErrorXml(10, "Failed get authentication cookie");
                }
            }
           
            if (urlLoader.open(page) == HttpLoader.LOAD_SUCCESS) {
                String contentType = urlLoader.getContentType();
                String contentEncoding = urlLoader.getContentEncoding();
                if ((contentType!=null) && (contentType.toLowerCase().startsWith("text/html"))) {
          HttpStream ws = new HttpStream(urlLoader.getStream(), "", contentType, contentEncoding);
                    String rawPage = ws.getString();
                    ws.clear();
                    try {
                        String ret = "<?xml version=\"1.0\" encoding=\"utf-8\"?><result>";
                        ret += "<page><![CDATA[" + rawPage + "]]>" + "</page></result>";
                        urlLoader.close();
                        return ret;
                    }
                    catch(Exception e) {}
                }
                else {
                    if (contentType!=null)
                        return XmlResponse.buildErrorXml(10, "Failed load page (content-type = " + contentType + ")");
                    else
                        return XmlResponse.buildErrorXml(10, "Failed load page (no content-type)");
                   
                }
            }
            else {
                return XmlResponse.buildErrorXml(10, "Failed load page (response code = " + String.valueOf(urlLoader.errorCode));
            }
            urlLoader.close();
           
        } catch (IOException e) {
            e.printStackTrace();
        }
       
View Full Code Here

            return XmlResponse.buildErrorXml(-1, "Invalid URL");
        }
       
        try {
            //MultiFormatTextExtractor extractor = new MultiFormatTextExtractor();
          HttpLoader urlLoader;
      //try {
        urlLoader = new HttpLoader();
      //} catch (URISyntaxException e1) {
      //  e1.printStackTrace();
            //    return XmlResponse.buildErrorXml(10, "Failed load page (bad url : " + page + ")");
      //}
            // Load the URL and, on success, build an XML <result> document holding the output of
            // five TikaWrapper configurations as <page_N>/<title_N> pairs (N = 0..4); otherwise
            // return an error document.
            // NOTE(review): urlLoader.getStream() is passed to process() five times - confirm that
            // each call yields a readable stream rather than one that the first pass has consumed.
            // NOTE(review): rawPage and title are placed in CDATA sections as-is; content containing
            // "]]>" would end the section early - confirm whether this needs escaping.
            if (urlLoader.open(url.toExternalForm()) == HttpLoader.LOAD_SUCCESS) {
                String ret = "<?xml version=\"1.0\" encoding=\"utf-8\"?><result>";
               
                //String contentType = urlLoader.getContentType();
                //String contentEncoding = urlLoader.getContentEncoding();
               
                //HttpStream ws = new HttpStream(urlLoader.getStream(), "", contentType, contentEncoding);
                //String data = ws.getString();
                //ws.clear();
               
                //String rawPage = extractor.htmlPageToText(data, page, "");
                //String title = extractor.getTitle();
        // Pass 0: OUTPUT_FORMAT_HTML.
        TikaWrapper tikaWrapper = new TikaWrapper(TikaWrapper.OUTPUT_FORMAT_HTML);
        tikaWrapper.process(urlLoader.getStream());
        String rawPage = tikaWrapper.getText();
        String title = tikaWrapper.getMetaTitle();

                ret += "<page_0><![CDATA[" + rawPage + "]]>" + "</page_0>";
                ret += "<title_0><![CDATA[" + title + "]]>" + "</title_0>";
               
                //rawPage = extractor.htmlPageToText(data, page, "boilerpipe_article");
                //title = extractor.getTitle();
        // Pass 1: OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_ARTICLE.
        tikaWrapper = new TikaWrapper(TikaWrapper.OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_ARTICLE, TikaWrapper.CONTENT_TYPE_HTML);
        tikaWrapper.process(urlLoader.getStream());
        rawPage = tikaWrapper.getText();
        title = tikaWrapper.getMetaTitle();
                ret += "<page_1><![CDATA[" + rawPage + "]]>" + "</page_1>";
                ret += "<title_1><![CDATA[" + title + "]]>" + "</title_1>";
                //rawPage = extractor.htmlPageToText(data, page, "boilerpipe_default");
                //title = extractor.getTitle();
        // Pass 2: OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_DEFAULT.
        tikaWrapper = new TikaWrapper(TikaWrapper.OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_DEFAULT, TikaWrapper.CONTENT_TYPE_HTML);
        tikaWrapper.process(urlLoader.getStream());
        rawPage = tikaWrapper.getText();
        title = tikaWrapper.getMetaTitle();
                ret += "<page_2><![CDATA[" + rawPage + "]]>" + "</page_2>";
                ret += "<title_2><![CDATA[" + title + "]]>" + "</title_2>";
                //rawPage = extractor.htmlPageToText(data, page, "boilerpipe_canola");
                //title = extractor.getTitle();
        // Pass 3: OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_CANOLA.
        tikaWrapper = new TikaWrapper(TikaWrapper.OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_CANOLA, TikaWrapper.CONTENT_TYPE_HTML);
        tikaWrapper.process(urlLoader.getStream());
        rawPage = tikaWrapper.getText();
        title = tikaWrapper.getMetaTitle();
                ret += "<page_3><![CDATA[" + rawPage + "]]>" + "</page_3>";
                ret += "<title_3><![CDATA[" + title + "]]>" + "</title_3>";

                //rawPage = extractor.htmlPageToText(data, page, "snacktory");
                //title = extractor.getTitle();
        // Pass 4: OUTPUT_FORMAT_TEXT_MAIN_SNACKTORY.
        tikaWrapper = new TikaWrapper(TikaWrapper.OUTPUT_FORMAT_TEXT_MAIN_SNACKTORY, TikaWrapper.CONTENT_TYPE_HTML);
        tikaWrapper.process(urlLoader.getStream());
        rawPage = tikaWrapper.getText();
        title = tikaWrapper.getMetaTitle();
                ret += "<page_4><![CDATA[" + rawPage + "]]>" + "</page_4>";
                ret += "<title_4><![CDATA[" + title + "]]>" + "</title_4>";
               
                ret += "</result>";
                // The loader is closed only on this success path.
                urlLoader.close();
                return ret;
            } else {
                return XmlResponse.buildErrorXml(-1, "Error loading page");
            }
        }
View Full Code Here

TOP

Related Classes of fr.eolya.utils.http.HttpLoader

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.