/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package DataAcquisition;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.webharvest.definition.DefinitionResolver;
import org.webharvest.definition.ScraperConfiguration;
import org.webharvest.runtime.Scraper;
import org.webharvest.runtime.variables.Variable;
import preprocessing.*;
import seekfeel.dataholders.DataUnit;
import seekfeel.dataholders.Review;
/**
 * Web data source that extracts review texts from a saved Amazon page.
 *
 * <p>{@link #harvest()} writes a Web-Harvest configuration that reads
 * {@code htmlFile}, converts it to XML, applies an XQuery built around
 * {@link #amazonTag} and stores the result in {@code resFile}; it then runs
 * that configuration through {@code executeConfig()} and parses the result
 * file into {@link Review} objects.
 *
 * <p>{@code configFile}, {@code resFile}, {@code htmlFile}, {@code revs},
 * {@code xqExp}, {@code endOfEpin}, {@code endFileConfig} and
 * {@code executeConfig()} are not declared in this class; they are expected
 * to come from {@link WebDataSource}.
 *
 * @author Tarek
 */
public class AmazonSource extends WebDataSource {

    private static final Logger LOGGER = Logger.getLogger(AmazonSource.class.getName());

    /** Charset declared in the generated configuration, used for all file I/O here. */
    private static final String UTF8 = "UTF-8";

    /** A result line that holds a complete, non-empty {@code <name>} element. */
    private static final Pattern NAME_LINE = Pattern.compile("<name>.+</name>");

    /** Length of the opening {@code <name>} tag. */
    private static final int OPEN_TAG_LENGTH = "<name>".length();

    /** Length of the closing {@code </name>} tag. */
    private static final int CLOSE_TAG_LENGTH = "</name>".length();

    /**
     * Number of empty/opening name lines that must precede a text line for
     * that text line to start a new review.
     */
    private static final int REVIEW_SEPARATOR_LINES = 3;

    /** Start of the configuration: XML prolog and the unfinished write-file element. */
    final String sof = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + "\n" + "<config>" + "\n\t" + "<file action=\"write\"";

    /** Closes the html-to-xml / xq-param section and opens the XQuery expression. */
    String midOfFile = "\n" + "</html-to-xml>" + "\n\t" + "</xq-param>" + "\n\t" + xqExp ;

    /** End of the configuration, written after the tag expression. */
    final String eof = endOfEpin + "\n\t" + endFileConfig;

    /** Opens the xQuery element and its {@code doc} parameter. */
    String text = "\n" + "<xQuery>" + "\n\t\t" + "<xq-param name=\"doc\">" + "\n\t\t" + "<html-to-xml>" +"\n\t\t";

    /** Path expression selecting the text nodes of the review container. */
    String amazonTag = "div[@style=\"margin-left:0.5em;\"]/text()";

    /**
     * Creates a source for the given URL.
     *
     * @param url passed unchanged to the {@link WebDataSource} constructor
     */
    public AmazonSource(String url)
    {
        super(url);
    }

    /**
     * Writes the Web-Harvest configuration to {@code configFile}.
     *
     * <p>The file is encoded as UTF-8 so that it matches the encoding named in
     * its own XML prolog. An I/O failure is logged rather than thrown, so the
     * caller continues with whatever is on disk.
     */
    private void writeAmazonConfigFile()
    {
        BufferedWriter out = null;
        try
        {
            out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(configFile), UTF8));
            out.write(sof);
            out.write(" path=\"" + resFile + "\" charset=\"UTF-8\">");
            out.write(text);
            out.write("<file action=\"read\" path=\"" + htmlFile + "\" charset=\"UTF-8\">" + "\n\t" + "</file>");
            out.write(midOfFile);
            out.write(amazonTag);
            out.write(eof);
            out.flush();
        }
        catch (IOException e)
        {
            LOGGER.log(Level.SEVERE, "Could not write Web-Harvest configuration " + configFile, e);
        }
        finally
        {
            closeLogged(out, "configuration file " + configFile);
        }
    }

    /**
     * Parses {@code resFile} and appends one {@link Review} per detected
     * review to {@code revs}.
     *
     * <p>Lines equal to {@code <name/>} or {@code <name>} are only counted.
     * A line holding a complete {@code <name>...</name>} element is a text
     * line: if at least {@value #REVIEW_SEPARATOR_LINES} counted lines were
     * seen since the previous text line it starts a new review, otherwise it
     * is appended, separated by a space, to the current review. All other
     * lines are ignored. Text found before the first review starts is
     * discarded.
     *
     * <p>The file is decoded as UTF-8, the charset the configuration requests
     * for the result file. An I/O failure is logged; reviews completed before
     * the failure stay in {@code revs}.
     */
    private void deNoiseAmazon()
    {
        BufferedReader in = null;
        try
        {
            in = new BufferedReader(new InputStreamReader(new FileInputStream(resFile), UTF8));
            String line;
            int separatorLines = 0;
            boolean reviewStarted = false;
            Review current = new Review();
            while ((line = in.readLine()) != null)
            {
                if (line.equals("<name/>") || line.equals("<name>"))
                {
                    separatorLines++;
                    continue;
                }
                if (!NAME_LINE.matcher(line).matches())
                {
                    continue;
                }
                String body = line.substring(OPEN_TAG_LENGTH, line.length() - CLOSE_TAG_LENGTH);
                if (separatorLines >= REVIEW_SEPARATOR_LINES)
                {
                    if (reviewStarted)
                    {
                        revs.add(current);
                    }
                    reviewStarted = true;
                    current = new Review();
                    current.Review_Body = body;
                }
                else if (reviewStarted)
                {
                    current.Review_Body += " " + body;
                }
                separatorLines = 0;
            }
            // A review is otherwise only stored when the next one begins, so
            // the last review in the file has to be stored here.
            if (reviewStarted)
            {
                revs.add(current);
            }
        }
        catch (IOException e)
        {
            LOGGER.log(Level.SEVERE, "Could not read harvested result " + resFile, e);
        }
        finally
        {
            closeLogged(in, "result file " + resFile);
        }
    }

    /**
     * Closes a resource, logging instead of propagating a failure.
     *
     * @param resource    the resource to close; {@code null} is ignored
     * @param description text identifying the resource in the log message
     */
    private static void closeLogged(Closeable resource, String description)
    {
        if (resource == null)
        {
            return;
        }
        try
        {
            resource.close();
        }
        catch (IOException e)
        {
            LOGGER.log(Level.WARNING, "Could not close " + description, e);
        }
    }

    /**
     * Extracts the reviews: clears {@code revs}, writes the configuration,
     * calls {@code executeConfig()} and parses the result file.
     *
     * @return the {@code revs} list itself, which is cleared and refilled on
     *         every call
     */
    public ArrayList<DataUnit> harvest()
    {
        revs.clear();
        writeAmazonConfigFile();
        executeConfig();
        deNoiseAmazon();
        return revs;
    }
}