Package DataAcquisition

Source Code of DataAcquisition.WebDataSource

/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/

package DataAcquisition;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.webharvest.definition.ScraperConfiguration;
import org.webharvest.definition.DefinitionResolver;
import org.webharvest.runtime.Scraper;
import org.webharvest.runtime.variables.Variable;

import seekfeel.dataholders.DataUnit;
import seekfeel.dataholders.Review;
import preprocessing.*;
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.net.URL;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;

/**
*
* @author Tarek
*/
public abstract class WebDataSource extends DataSource {
    protected ArrayList<DataUnit> revs;
    protected String configFile;
    protected String URL;
    protected String resFile;
    protected String htmlFile;
    ArrayList<String> paths;

    final String xqExp = "<xq-expression><![CDATA[" + "\n\t" + "declare variable $doc as node() external;" "\n\t" + "for $item in $doc//";
    final String endOfEpin = "\n\t" + "let $name := data($item)" + "\n\t" + "return" + "\n\t" + "<name> {normalize-space($name)} </name>" + "\n\t" + " ]]></xq-expression>" + "\n\t" + " </xQuery>";
    final String endFileConfig =  " </file>" + "\n\t" + "</config>";
    protected WebDataSource(String url)
    {
        String curDir = System.getProperty("user.dir");
        revs = new ArrayList<DataUnit>();
        URL = url;
        configFile =  curDir + "\\conf.txt";
        resFile= curDir + "\\result.txt" ;
        htmlFile = curDir + "\\Downloaded.html";
        downloadfile(url);
    }
    public void executeConfig()
    {
        try
        {
            ScraperConfiguration config = new ScraperConfiguration(configFile);
            Scraper scraper = new Scraper(config,resFile);
            scraper.setDebug(true);
            scraper.execute();
        }
        catch (FileNotFoundException ex)
        {
          
        }
    }
    private void downloadfile(String url)
    {
        try
        {
            URL google = new URL(url);
            ReadableByteChannel rbc = Channels.newChannel(google.openStream());
            FileOutputStream fos = new FileOutputStream(htmlFile);
            fos.getChannel().transferFrom(rbc, 0, 1 << 24);
        }
        catch (IOException ex)
        {
           
        }
    }

}
TOP

Related Classes of DataAcquisition.WebDataSource

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.