Package pagelyzer

Source Code of pagelyzer.JPagelyzer

/*
# Andrés Sanoja
# UPMC - LIP6
# pagelyzer
#
# Copyright (C) 2011, 2012, 2013, 2014 Andrés Sanoja, Université Pierre et Marie Curie -
# Laboratoire d'informatique de Paris 6 (LIP6)
#
# Authors
# Andrés Sanoja andres.sanoja@lip6.fr
# Alexis Lechervy alexis.lechervy@lip6.fr
# Zeynep Pehlivan zeynep.pehlivan@lip6.fr
# Myriam Ben Saad myriam.ben-saad@lip6.fr
# Marc Law marc.law@lip6.fr
# Carlos Sureda carlos.sureda@lip6.fr
# Jordi Creus jordi.creus@lip6.fr
# LIP6 / Université Pierre et Marie Curie

# Responsables WP
# Matthieu CORD/UPMC
# Stéphane GANÇARSKI/UPMC
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Some parts of this package are adapted from the BrowserShot project developed by IM, France.
# https://github.com/sbarton/browser-shot-tool-mapred
*/

package pagelyzer;


import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URISyntaxException;
import java.util.Scanner;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.imageio.ImageIO;

import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.XMLConfiguration;

import Scape.MarcAlizer;
import Scape.ScapeTrain;

/**
* Class to calculate the change detection between two web pages
* @author sanojaa
*/
public class JPagelyzer {

     /** Configuration loaded from the file named by the "config" command-line option. */
     public XMLConfiguration config;

     /** Command-line options registered by the constructor; also handed to the usage help. */
     Options displayoptions = new Options();
     /** Comparison mode read from "pagelyzer.run.default.comparison.mode" (public for use in tests). */
     public String comparemode ;
     /** Comparison parameter file: the configured comparison subdir followed by "ex_" + mode + ".xml". */
     public String cfile;
     /** Value of "pagelyzer.debug.screenshots.active"; when true, debug files are saved after a comparison. */
     Boolean isDebugActive = false;
     /** Value of "pagelyzer.debug.screenshots.filepattern"; its "#{n}" placeholder is replaced when saving. */
     String debugfilePattern;
     /** Value of "pagelyzer.debug.screenshots.path": directory that receives the debug files. */
     String debugPathtoSave ;
     /** Value of "pagelyzer.run.default.parameter.outputfile"; its "#{ext}" placeholder is replaced by get(). */
     String outputfile;
     /** Whether captures take a screenshot; derived from the comparison mode in the constructor. */
     boolean screenshot;
     /** Whether captures run the segmentation; derived from the comparison mode in the constructor. */
     boolean segmentation;
     /** True when the instance was created for training rather than scoring. */
     boolean isTrain;
     /** Trainer, initialised by the constructor only in training mode. */
     ScapeTrain sc ;
     /** Comparator, initialised by the constructor only when not in training mode. */
     public MarcAlizer marcalizer;
     /** Value of "pagelyzer.run.default.parameter.browser1": browser used for the first capture. */
     public String browser1;
     /** Value of "pagelyzer.run.default.parameter.browser2" (public for use in tests). */
     public  String browser2;
     /**
      * Number of changeDetection calls so far. Intended as an id for debug files;
      * the code visible in this class names those files by timestamp instead.
      */
     int idcounter = 0;
     /** Value of "selenium.run.mode" that selects the local WebDriver. */
     public static final String LOCAL = "local";
     /** Comparison mode that uses screenshots only. */
     public static final String MODE_IMAGE = "image";
     /** Comparison mode that uses the segmentation output only. */
     public static final String MODE_CONTENT = "content";
     /** Comparison mode that uses both screenshots and segmentation output. */
     public static final String MODE_HYBRID = "hybrid";
     /** Constant remote (not referenced elsewhere in this class). */
     public static final String REMOTE = "remote";
     /** Target "score": compare two URLs. */
     public static final String SCORE = "score";
     /** Target "screenshot": save a screenshot of one URL. */
     public static final String SCREENSHOT = "screenshot";
     /** Target "source": save the HTML source of one URL. */
     public static final String SOURCE = "source";
     /** Target "segmentation": save the segmentation XML of one URL. */
     public static final String SEGMENTATION = "segmentation";

     /** URLs taken from the command line: url1/url2 for scoring, url for the single-page targets. */
     public String url1, url2,url;
    /**
     * get the current configuration
     * @return the current configuration
     */
    public XMLConfiguration getConfig() {
        return(this.config);
    }

 
    public JPagelyzer(String[] args, boolean isTrain)
    {
      this.isTrain = isTrain;
        // no need any more cpath browser etc. they are all in config file
      // not to change the usage of options I am adding display options to send to usage.
      displayoptions.addOption("url1",true,"First URL to compra");
      displayoptions.addOption("url2",true,"Second URL to compare");
      displayoptions.addOption("config",true,"Global configuration file for an example of file: https://github.com/openplanets/pagelyzer/blob/master/config.xml");
      //displayoptions.addOption("mode",true,"hybrid/content/image it can be also set in config file");

        CommandLineParser parser = new BasicParser();
        CommandLine cmd;
       
        /* Parsing comandline parameters*/
        try {
            cmd = parser.parse(displayoptions, args);
        } catch (ParseException pe) {
            usage(displayoptions); return;
        }
        if (!cmd.hasOption("config")) {usage(displayoptions);System.exit(0);}

    try {
      config = new XMLConfiguration(cmd.getOptionValue("config"));
     
    } catch (ConfigurationException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
       
    comparemode = this.config.getString("pagelyzer.run.default.comparison.mode");
   
   
    cfile = config.getString("pagelyzer.run.default.comparison.subdir")+ "ex_" + config.getString("pagelyzer.run.default.comparison.mode") +".xml";
    this.config.setProperty("pagelyzer.run.default.comparison.file","ex_"+comparemode+".xml");
   
    isDebugActive =  config.getBoolean("pagelyzer.debug.screenshots.active");
    debugfilePattern = config.getString("pagelyzer.debug.screenshots.filepattern");
    debugPathtoSave = config.getString("pagelyzer.debug.screenshots.path");
    outputfile = config.getString("pagelyzer.run.default.parameter.outputfile");
    browser1 = config.getString("pagelyzer.run.default.parameter.browser1");
    browser2 = config.getString("pagelyzer.run.default.parameter.browser2");
   
   
    if(isTrain)
    {
      sc = new ScapeTrain();
      try {
        sc.init(new File(cfile));
      } catch (Exception ex) {
        System.err.println("Marcalize could not be initialized");
        System.exit(0);
      }
       
    }
    else
    {
      url1 = displayoptions.getOption("url1").getValue();
      url2 = displayoptions.getOption("url2").getValue();
     
    }
        /* Validate program intrinsic input parameters and configuration */
        if (this.config.getString("pagelyzer.run.default.parameter.get")==null) {usage(displayoptions);System.exit(0);}
        if (this.config.getString("selenium.run.mode").equals(LOCAL)) {
           System.out.println("Selenium: local WebDriver");
        } else {
           System.out.println("Selenium: remote " + this.config.getString("selenium.server.url"));
        }
        if (( this.config.getBoolean("pagelyzer.debug.screenshots.active")) && (this.config.getString("pagelyzer.debug.screenshots.path") == null)) {
            System.out.println("Debug was activated, but no path is specified to put the files. Use -debugpath path or change the configuration file");
            System.exit(0);
        }
       
        if (this.config.getString("pagelyzer.run.default.comparison.path") ==null) {
          this.config.setProperty("pagelyzer.run.default.comparison.path"this.config.getString("pagelyzer.run.default.comparison.subdir"));
              //this.getClass().getResource(this.getConfig().get("pagelyzer.run.default.comparison.subdir")).getPath());
        }
       
        if(!isTrain)
        {
         
          if (this.config.getString("pagelyzer.run.default.parameter.get").equals(SCORE) ) {
              if (!cmd.hasOption("url1")) {
                  System.out.println("URL1 parameter missing");
                  System.exit(0);
              } else url1 = cmd.getOptionValue("url1");
              if (!cmd.hasOption("url2")) {
                  System.out.println("URL2 parameter missing");
                  System.exit(0);
              } else url2 = cmd.getOptionValue("url2");
             
              /*
              * assure that if the comparison configuration is not passed neither as parameter in the commandline
              * nor in the configuration file (commented), use the one included as resource with the jar file
              */
             
          } else {
              if (!cmd.hasOption("url")) {
                  System.out.println("URL parameter missing");
                  System.exit(0);
              } else url = cmd.getOptionValue("url");
          }
         
          marcalizer = new MarcAlizer();
         
          try {
                marcalizer.init(new File(cfile));
                } catch (Exception ex) {
                    System.err.println("Marcalize could not be initialized");
                    System.exit(0);
                }
        }
       
        //Capture settings // I removerd them from changedetection function but it can be global
        switch (comparemode) {
        case MODE_IMAGE    : screenshot = true;
                   segmentation = false;
                   break;
        case MODE_CONTENT :
             screenshot = false;
         segmentation = true;
         break;
        case MODE_HYBRID    :
            screenshot = true;
         segmentation = true;
         break;  
        }
   
     
     
    }
   
    /**
     * This function returns Capture object that contains image and/or xml file obtained from url1
     * based on browser used as selenium browser type
     *
     * @param url1 : Url to get the capture ( to get screenshot, to do web segmentation depending on comparison mode)
     * @param browser: Browser name (ex: "firefox", "chrome")
     * @return Capture object
     */
   
   
    public Capture GetCapture(String url1, String browser)
    {
       Capture capture = new Capture(this.config);
       boolean done = capture.setup(browser);
       if(done)
         capture.run(url1, screenshot, segmentation,isDebugActive);
       else
      {
         System.out.println("Capture GetCapture error : Can not get capture for page " + url1);
         capture = null;
      }
       return capture;
    }


    /**
     * This function takes two capture for two different Urls and annotation for this pair to train the system.
     * @param capture1: Capture for url1 can be obtained by calling GetCapture
     * @param capture2: Capture for url2 can be obtained by calling GetCapture
     * @param label: annotation "0" dissimilar "1" similar
     */
   
    public void CallTrain(Capture capture1, Capture capture2, String label)
    {

      if(capture1.result!=null)
        {
              switch (comparemode) {
              case MODE_IMAGE  :
              sc.addExampleOfTrain_Img(capture1.result.getBufferedImage(), capture2.result.getBufferedImage(), Integer.parseInt(label));
              break;
              case MODE_CONTENT :
              sc.addExampleOfTrain(capture1.result.viXML, capture2.result.viXML, Integer.parseInt(label));
            break;
              case MODE_HYBRID    :
              sc.addExampleOfTrain(capture1.result.viXML, capture2.result.viXML, capture1.result.getBufferedImage(), capture2.result.getBufferedImage(), Integer.parseInt(label));
              break;
  
              }
        }
     
     
    }
    /**
     * This function takes two capture for two different Urls and returns the similarity score between them
     * @param capture1: Capture for url1 can be obtained by calling GetCapture
     * @param capture2: Capture for url2 can be obtained by calling GetCapture
     *
     */
   
   
    public double CallMarcalizerResult(Capture capture1, Capture capture2) throws FileNotFoundException
    {
          double result=-100; // error code
     
       switch (comparemode) {
       case MODE_IMAGE     :
       
            result = marcalizer.run(capture1.result.getBufferedImage(), capture2.result.getBufferedImage());
             break;
       case MODE_CONTENT  :
        // result = marcalizer.run(new Scanner(new File("/home/pehlivanz/SCAPE_ZP/Roc/page3_1.png.xml")).useDelimiter("\\Z").next(),new Scanner(new File("/home/pehlivanz/SCAPE_ZP/Roc/page3_2.png.xml")).useDelimiter("\\Z").next());
             result = marcalizer.run(capture1.result.viXML,capture2.result.viXML);
         break;
       case MODE_HYBRID     :
             result = marcalizer.run(capture1.result.viXML,capture2.result.viXML,capture1.result.getBufferedImage(), capture2.result.getBufferedImage());
         break;  
     }
  
      System.out.println("Distance between the two web-pages:: " + result);
     
      return result;
    }
   
    /**
    * Method to detect the changes on two web pages versions. It returns the score
    * @param url1 the first web page URL
    * @param url2 the second web page URL
    * @return score
    * @throws FileNotFoundException
    **/
    public double changeDetection(String url1,String url2, String label) throws FileNotFoundException {
     
      double result = -100; // train or if it is not train run error code
      idcounter++;
      long startTime = System.currentTimeMillis();
     
      Capture capture1 = GetCapture(url1,browser1 );
     
      long endTime   = System.currentTimeMillis();
      System.out.println("Capture 1 : " + (endTime - startTime));
      startTime = System.currentTimeMillis();
      Capture capture2 = GetCapture(url2,browser2);
      endTime   = System.currentTimeMillis();
      System.out.println("Capture 2 : " + (endTime - startTime));
     
      startTime = System.currentTimeMillis();
          if(capture1!=null && capture2!=null && capture1.result!=null &&  capture2.result!=null)

          {
            if(isTrain)
            {
              CallTrain(capture1,capture2,label);
             
            }
            else
            {
              result = CallMarcalizerResult(capture1, capture2);
               
            }
       
            endTime   = System.currentTimeMillis();  
          
        if (isDebugActive && capture1.result!=null && capture2.result!=null) {
          long timestamp = System.currentTimeMillis();
              capture1.result.saveDebugFile(debugPathtoSave + "/" + debugfilePattern.replace("#{n}", timestamp + "_1" ));
              capture2.result.saveDebugFile(debugPathtoSave + "/" + debugfilePattern.replace("#{n}", timestamp + "_2" ));
              System.out.println("timestamp:" + timestamp);
             
          }
          try
         
            capture1.cleanup();
            capture2.cleanup();
         
          }catch (Exception ex) {
              Logger.getLogger(JPagelyzer.class.getName()).log(Level.SEVERE, null, ex);
          }
       
          }
        return result;
    }
   
    /**
     * Method to call different functionalities of pagelyzer. If you want to just get screenshot of the pages
     * or just get web page segmentation result  or just to save source code of the URL.
     * Target is usually set in config file  "pagelyzer.run.default.parameter.get" tag
     * @param target extra functionality. It can be: screenshot, segmentation, source
     * @param url the web page to process
     */
    public void get(String target, String url) {
        Capture capture = new Capture(config);
        capture.setup(this.config.getString("pagelyzer.run.default.parameter.browser"));
        switch(target) {
            case SCREENSHOT   : capture.run(url,true,false,isDebugActive);break;
            case SEGMENTATION : capture.run(url,false,true,isDebugActive);break;
            case SOURCE       : capture.run(url,false,false,isDebugActive);break;
        }
        CaptureResult result = capture.result;

        OutputStream out=null;
        try {
            String ext="";
           
            switch(target) {
                case SCREENSHOT   :ext="png";break;
                case SEGMENTATION :ext="xml";break;
                case SOURCE       : ext="html";break;
            }
             out = new BufferedOutputStream(new FileOutputStream(outputfile.replace("#{ext}", ext)));
             switch(target) {
                case SCREENSHOT   : out.write(result.image);break;
                case SEGMENTATION : out.write(result.viXML.getBytes("UTF-8"));break;
                case SOURCE       : out.write(result.srcHTML.getBytes("UTF-8"));break;
            }
            out.close();
            capture.cleanup();
         } catch (FileNotFoundException ex) {
             Logger.getLogger(JPagelyzer.class.getName()).log(Level.SEVERE, null, ex);
         } catch ( IOException | InterruptedException ex) {
             Logger.getLogger(JPagelyzer.class.getName()).log(Level.SEVERE, null, ex);
         }
    }
   
    /**
     * print the help usage of this application
    * @param options the command line arguments
    **/
    private static void usage(Options options){
    // Use the inbuilt formatter class
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp( "Pagelyzer Help", options );
    }
   
    /**
     * Main function to create JPagelyzer object based on config file settings
    * @param args the command line arguments
     * @throws URISyntaxException
    **/
   
   
    public static void main(String[] args) throws URISyntaxException {

       
        JPagelyzer pagelyzer = new JPagelyzer(args,false);
      
        /*
         * All is validated and fine, we can proceed to call functionalities
         */
         switch(pagelyzer.getConfig().getString("pagelyzer.run.default.parameter.get")) {
             case SCORE:
      try {
        pagelyzer.changeDetection(pagelyzer.url1,pagelyzer.url2,null);
      } catch (FileNotFoundException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
      }
                 break;
             case SCREENSHOT:
               pagelyzer.get(SCREENSHOT,pagelyzer.url);
                 break;
             case SOURCE:
               pagelyzer.get(SOURCE,pagelyzer.url);
                 break;
             case SEGMENTATION:
               pagelyzer.get(SEGMENTATION,pagelyzer.url);
                 break;
         }
     
      
       
    }
   
}
TOP

Related Classes of pagelyzer.JPagelyzer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.