/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package DataAcquisition;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.webharvest.definition.ScraperConfiguration;
import org.webharvest.definition.DefinitionResolver;
import org.webharvest.runtime.Scraper;
import org.webharvest.runtime.variables.Variable;
import seekfeel.dataholders.DataUnit;
import seekfeel.dataholders.Review;
import preprocessing.*;
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.net.URL;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
/**
 * Base class for data sources that are backed by a web page.
 *
 * <p>On construction the page at the supplied URL is downloaded to
 * {@code Downloaded.html} in the JVM working directory ({@code user.dir}).
 * {@link #executeConfig()} then runs a Web-Harvest {@code Scraper} built from
 * the configuration file {@code conf.txt} in the same directory.
 *
 * <p>Failures while downloading or while loading the configuration are logged
 * and otherwise tolerated (best effort); they are not propagated to callers.
 *
 * @author Tarek
 */
public abstract class WebDataSource extends DataSource {

    private static final Logger LOGGER = Logger.getLogger(WebDataSource.class.getName());

    /** Maximum number of bytes copied from the remote page in one download (16 MiB). */
    private static final long MAX_DOWNLOAD_BYTES = 1L << 24;

    /** Data units held by this source; initialised to an empty list by the constructor. */
    protected ArrayList<DataUnit> revs;

    /** Absolute path of the scraper configuration file ({@code conf.txt}). */
    protected String configFile;

    /** The URL this source was created for, exactly as passed to the constructor. */
    protected String URL;

    /** Absolute path of {@code result.txt}; handed to the scraper in {@link #executeConfig()}. */
    protected String resFile;

    /** Absolute path of the local copy of the downloaded page ({@code Downloaded.html}). */
    protected String htmlFile;

    /** Not initialised or used by this class. */
    ArrayList<String> paths;

    // XML/XQuery text fragments. They are not used inside this class; presumably
    // subclasses concatenate them to build the scraper configuration file
    // -- TODO confirm against the subclasses.
    final String xqExp = "<xq-expression><![CDATA[" + "\n\t" + "declare variable $doc as node() external;" + "\n\t" + "for $item in $doc//";
    final String endOfEpin = "\n\t" + "let $name := data($item)" + "\n\t" + "return" + "\n\t" + "<name> {normalize-space($name)} </name>" + "\n\t" + " ]]></xq-expression>" + "\n\t" + " </xQuery>";
    final String endFileConfig = " </file>" + "\n\t" + "</config>";

    /**
     * Creates the data source and immediately downloads the page at {@code url}
     * to {@link #htmlFile}.
     *
     * @param url address of the web page to download
     */
    protected WebDataSource(String url)
    {
        // Use the platform separator rather than a hard-coded "\\", which only
        // produces a path inside the working directory on Windows.
        String base = System.getProperty("user.dir") + java.io.File.separator;
        revs = new ArrayList<DataUnit>();
        URL = url;
        configFile = base + "conf.txt";
        resFile = base + "result.txt";
        htmlFile = base + "Downloaded.html";
        downloadfile(url);
    }

    /**
     * Loads the scraper configuration from {@link #configFile} and executes it
     * with debugging enabled.
     *
     * <p>If the configuration file cannot be found the problem is logged and
     * the method returns without running the scraper.
     */
    public void executeConfig()
    {
        try
        {
            ScraperConfiguration config = new ScraperConfiguration(configFile);
            // NOTE(review): Web-Harvest documents the second Scraper argument as a
            // working directory; confirm that passing resFile here is intended.
            Scraper scraper = new Scraper(config, resFile);
            scraper.setDebug(true);
            scraper.execute();
        }
        catch (FileNotFoundException ex)
        {
            LOGGER.log(Level.SEVERE, "Scraper configuration file not found: " + configFile, ex);
        }
    }

    /**
     * Copies the content at {@code url} into {@link #htmlFile}, replacing any
     * existing file. At most {@link #MAX_DOWNLOAD_BYTES} bytes are copied.
     *
     * <p>I/O failures (including a malformed URL) are logged, not thrown.
     *
     * @param url address of the page to download
     */
    private void downloadfile(String url)
    {
        try
        {
            URL source = new URL(url);
            ReadableByteChannel in = Channels.newChannel(source.openStream());
            try
            {
                FileOutputStream out = new FileOutputStream(htmlFile);
                try
                {
                    long copied = out.getChannel().transferFrom(in, 0, MAX_DOWNLOAD_BYTES);
                    if (copied >= MAX_DOWNLOAD_BYTES)
                    {
                        // The cap was reached, so the remote content may be longer
                        // than what was written to disk.
                        LOGGER.log(Level.WARNING,
                                "Download of " + url + " reached the " + MAX_DOWNLOAD_BYTES
                                + "-byte limit; " + htmlFile + " may be truncated");
                    }
                }
                finally
                {
                    out.close();
                }
            }
            finally
            {
                // Closing the channel also closes the underlying URL stream.
                in.close();
            }
        }
        catch (IOException ex)
        {
            LOGGER.log(Level.SEVERE, "Failed to download " + url + " to " + htmlFile, ex);
        }
    }
}