/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package DataAcquisition;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.webharvest.definition.DefinitionResolver;
import org.webharvest.definition.ScraperConfiguration;
import org.webharvest.runtime.Scraper;
import org.webharvest.runtime.variables.Variable;
//import ReviewParser.*;
import preprocessing.*;
import seekfeel.dataholders.DataUnit;
import seekfeel.dataholders.Review;
/**
 * Web data source that harvests reviews from epinions.com.
 *
 * <p>A call to {@link #harvest()} writes a Web-Harvest configuration file for the
 * configured URL, runs it through the inherited {@code executeConfig()}, and then
 * parses the file the configuration produced into {@link Review} objects that are
 * collected in the inherited {@code revs} list.
 *
 * @author Tarek
 */
public class EpinionSource extends WebDataSource
{
    private static final Logger LOGGER = Logger.getLogger(EpinionSource.class.getName());

    /**
     * Charset used for both files handled here. The configuration declares it in its
     * XML prolog and requests it for the result file, so the same charset must be used
     * when writing the former and reading the latter.
     */
    private static final String FILE_CHARSET = "UTF-8";

    /** Matches a {@code <name>text</name>} line with non-empty text; group 1 is the text. */
    private static final Pattern NAME_LINE = Pattern.compile("<name>(.+)</name>");

    /** Text of the {@code <name>} line that closes the review currently being collected. */
    private static final String END_OF_REVIEW_MARKER = "Recommended:";

    /**
     * Prefix of the line after which the next review starts
     * (presumably the URL of the next review page -- TODO confirm).
     */
    private static final String NEXT_REVIEW_PREFIX = "http";

    /** Re-created on every harvest; not consulted by the current parsing logic. */
    private ArrayList<String> epinionSections;

    // ---- Fragments of the generated Web-Harvest configuration, in output order. ----

    /** XML prolog and the opening of the "bolds" variable, up to the {@code url} attribute. */
    final String epinSof = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + "\n\t" + "<config>" + "\n\t" + "<var-def name=\"bolds\">" + "\n\t" + "<xpath expression=\"//a[starts-with(., 'Read the')]/@href\">" + "\n\t" + "<html-to-xml>" + "\n\t" + "<http url= " ;
    /** Closes the elements opened by {@link #epinSof}. */
    final String endtags = "</html-to-xml>" + "\n\t" + "</xpath>" + "\n\t" + "</var-def> ";
    /** Opening of the {@code <file>} element, up to the {@code path} attribute value. */
    final String fileTag = "<file action=\"write\" path=";
    /** Opens the loop over the values collected in the "bolds" variable. */
    final String openLoop = " <loop item=\"item\" index=\"i\">" + "\n\t" + "<list><var name=\"bolds\"/></list>" + "\n\t" + " <body> ";
    /** Defines "linkUrl" as the loop item resolved against the epinions.com base URL. */
    final String varDef = "<var-def name=\"linkUrl\">" + "\t\n" + "<template> ${sys.fullUrl(\"http://www.epinions.com\", item.toString())} </template>" + "</var-def>";
    /** Opening tag of the XQuery element. */
    final String xQuery = "<xQuery>";
    /** XQuery parameter "doc": the page at "linkUrl" converted to XML. */
    final String xqParam = "<xq-param name=\"doc\">" + "\t\n" + "<html-to-xml>" + "\t\n" + "<http url=\"${linkUrl}\"/>" + "\t\n" + "</html-to-xml>" + "\t\n" + "</xq-param>";
    /** Closes the loop body and the loop. */
    final String endLoopBody = "</body>" + "</loop>";
    /** Opening of the XQuery expression, including the declaration of {@code $doc}. */
    final String xqexpression = "<xq-expression><![CDATA[" + "\n\t" + "declare variable $doc as node() external;" + "\n\t";
    /** XQuery {@code for} clause selecting the nodes that are written to the result file. */
    final String epinionTag = "for $item in ($doc//td[@class=\"nav-new-1-pixel\"]/span/b | $doc//td[@class=\"nav-new-1-pixel\"]/span/text())";

    /**
     * Creates a source for the given address.
     *
     * @param url address handed unchanged to the {@code WebDataSource} constructor
     */
    public EpinionSource(String url)
    {
        super(url);
    }

    /**
     * Parses the harvested result file into reviews and appends them to {@code revs}.
     *
     * <p>Only lines of the form {@code <name>text</name>} with non-empty text are used.
     * Each such text is added to the current review's sentences and appended to its body.
     * A text equal to {@code "Recommended:"} closes the current review: the review is
     * added to {@code revs} and all input up to and including the next line starting
     * with {@code "http"} is skipped before a new review is started. Parsing stops when
     * no such line follows.
     *
     * <p>The file is decoded as UTF-8, the charset the generated configuration requests
     * for it. The reader is always closed, and an {@link IOException} is logged instead
     * of being silently discarded; reviews collected before the failure are kept.
     */
    private void deNoiseEpinion()
    {
        BufferedReader in = null;
        try
        {
            in = new BufferedReader(
                    new InputStreamReader(new FileInputStream(resFile), FILE_CHARSET));
            Review current = startEmptyReview();
            String line;
            while ((line = in.readLine()) != null)
            {
                Matcher nameLine = NAME_LINE.matcher(line);
                if (!nameLine.matches())
                {
                    // Empty <name/> / <name> lines and anything that is not a name line.
                    continue;
                }
                String text = nameLine.group(1);
                if (!END_OF_REVIEW_MARKER.equals(text))
                {
                    current.Review_Sentences.add(text);
                    // NOTE(review): texts are joined without any separator -- confirm
                    // that consumers of Review_Body expect this.
                    current.Review_Body += text;
                    continue;
                }
                revs.add(current);
                if (!skipPastNextReviewUrl(in))
                {
                    break;
                }
                current = startEmptyReview();
            }
            // NOTE(review): text collected after the last marker is never added to
            // revs -- confirm that dropping such a trailing review is intended.
        }
        catch (IOException e)
        {
            LOGGER.log(Level.SEVERE, "Failed to read harvested reviews from " + resFile, e);
        }
        finally
        {
            closeEpinionResource(in, "result file " + resFile);
        }
    }

    /** Creates a review whose body is the empty string, ready for text to be appended. */
    private static Review startEmptyReview()
    {
        Review review = new Review();
        review.Review_Body = "";
        return review;
    }

    /**
     * Consumes lines up to and including the next one that starts with {@code "http"}.
     *
     * @return {@code true} if such a line was found, {@code false} at end of input
     */
    private static boolean skipPastNextReviewUrl(BufferedReader in) throws IOException
    {
        String line;
        while ((line = in.readLine()) != null)
        {
            if (line.startsWith(NEXT_REVIEW_PREFIX))
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Writes the Web-Harvest configuration for this source to {@code configFile}.
     *
     * <p>The content is the concatenation of the fragment fields of this class, the
     * source {@code URL}, the {@code resFile} path, and the inherited {@code endOfEpin}
     * and {@code endFileConfig} fragments. The file is encoded as UTF-8 to match the
     * encoding declared in its XML prolog. The writer is always closed, and an
     * {@link IOException} is logged instead of being silently discarded.
     */
    private void writeEpinionConfigFile()
    {
        BufferedWriter out = null;
        try
        {
            out = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(configFile), FILE_CHARSET));
            // NOTE(review): URL and resFile are placed into XML attributes unescaped;
            // a value containing '&', '<' or '"' would yield a malformed configuration
            // unless callers already escape it -- confirm.
            String httpUrlAttribute = "\"" + URL + "\"" + " />";
            String resultPathAttribute =
                    "\"" + resFile + "\" " + "charset=\"" + FILE_CHARSET + "\">";

            out.write(epinSof);
            out.write(httpUrlAttribute);
            out.write("\n\t" + endtags + "\n\t" + fileTag);
            out.write(resultPathAttribute);
            out.write("\n\t" + openLoop + "\n\t" + "\n\t" + varDef + xQuery);
            out.write("\n\t" + xqParam + "\n\t" + xqexpression);
            out.write(epinionTag);
            out.write(endOfEpin + "\n\t" + endLoopBody + "\n\t" + endFileConfig);
            // Flush here so that a write failure is reported as a failed write above
            // rather than as a failed close below.
            out.flush();
        }
        catch (IOException e)
        {
            LOGGER.log(Level.SEVERE,
                    "Failed to write Web-Harvest configuration file " + configFile, e);
        }
        finally
        {
            closeEpinionResource(out, "configuration file " + configFile);
        }
    }

    /** Closes {@code resource} if it is non-null, logging (not propagating) a failure. */
    private static void closeEpinionResource(Closeable resource, String description)
    {
        if (resource == null)
        {
            return;
        }
        try
        {
            resource.close();
        }
        catch (IOException e)
        {
            LOGGER.log(Level.WARNING, "Failed to close " + description, e);
        }
    }

    /**
     * Runs a complete harvest: clears {@code revs}, regenerates the configuration file,
     * executes it via the inherited {@code executeConfig()}, and parses the result file.
     *
     * <p>I/O failures while writing the configuration or reading the result are logged;
     * they do not abort the harvest, so the returned list may be empty or incomplete.
     *
     * @return the inherited {@code revs} list itself (not a copy), holding the reviews
     *         collected by this run
     */
    public ArrayList<DataUnit> harvest()
    {
        revs.clear();
        epinionSections = new ArrayList<String>();
        writeEpinionConfigFile();
        executeConfig();
        deNoiseEpinion();
        return revs;
    }
}