Package DataAcquisition

Source Code of DataAcquisition.EpinionSource

/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/

package DataAcquisition;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.webharvest.definition.ScraperConfiguration;
import org.webharvest.definition.DefinitionResolver;
import org.webharvest.runtime.Scraper;
import org.webharvest.runtime.variables.Variable;
//import ReviewParser.*;
import preprocessing.*;
import java.io.BufferedReader;
import java.io.FileReader;

import seekfeel.dataholders.DataUnit;
import seekfeel.dataholders.Review;
/**
*
* @author Tarek
*/
public class EpinionSource extends WebDataSource
{
    private ArrayList<String> epinionSections;
    final String epinSof = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + "\n\t" + "<config>" + "\n\t" + "<var-def name=\"bolds\">" + "\n\t" + "<xpath expression=\"//a[starts-with(., 'Read the')]/@href\">" + "\n\t" + "<html-to-xml>" + "\n\t" + "<http url= " ;
    final String endtags = "</html-to-xml>" + "\n\t" + "</xpath>" + "\n\t" "</var-def> ";
    final String fileTag = "<file action=\"write\" path=";
    final String openLoop = " <loop item=\"item\" index=\"i\">" + "\n\t" + "<list><var name=\"bolds\"/></list>" + "\n\t" + " <body> ";
    final String varDef =  "<var-def name=\"linkUrl\">" + "\t\n" + "<template> ${sys.fullUrl(\"http://www.epinions.com\", item.toString())} </template>" + "</var-def>";
    final String xQuery = "<xQuery>";
    final String xqParam = "<xq-param name=\"doc\">" + "\t\n" + "<html-to-xml>" + "\t\n" + "<http url=\"${linkUrl}\"/>" + "\t\n" "</html-to-xml>" + "\t\n" + "</xq-param>";
    final String endLoopBody =   "</body>" "</loop>";
    final String xqexpression = "<xq-expression><![CDATA[" + "\n\t" + "declare variable $doc as node() external;" "\n\t";
    //final String epinionTag = "td[@class=\"nav-new-1-pixel\"]/span/b | $doc//td[@class=\"nav-new-1-pixel\"]/span/text()";
    final String epinionTag = "for $item in ($doc//td[@class=\"nav-new-1-pixel\"]/span/b | $doc//td[@class=\"nav-new-1-pixel\"]/span/text())";

    public EpinionSource(String url)
    {
        super(url);
    }
    private void deNoiseEpinion()
    {
        try
        {
            BufferedReader in = new BufferedReader(new FileReader(resFile));
            String str;
            Review r = new Review();
            r.Review_Body = "";
            int start = 1;
            while ((str = in.readLine()) != null)
            {
                if(!(str.equals("<name/>")|| str.equals("<name>")))
                {
                    if(str.matches("<name>" + ".+" + "</name>" ))
                    {
                        String text  = str.substring(6 , str.length() - 7);
                       
                        if(text.equals("Recommended:"))
                        {
                             revs.add(r);
                             while ((str = in.readLine()) != null)
                             {
                                  if(str.startsWith("http"))
                                  {
                                    r = new Review();
                                    r.Review_Body = "";
                                    break;
                                  }
                             }
                             if( str == null)
                                break;
                        }
                       
                        //if(!epinionSections.contains(text))
                        else
                        {
                            r.Review_Sentences.add(text);
                            r.Review_Body += text;
                        }
                       
                    }
                    /*
                    else if(str.startsWith("http"))
                    {
                       
                            revs.add(r);
                            r = new Review();
                            r.Review_Body = "";
                       
                    }
                     *
                     */
                }
                else
                {

                }

            }
            in.close();
        }
        catch (IOException e)
        {
        }
    }
    private void writeEpinionConfigFile()
    {
        FileWriter fstream = null;
        try
        {
            fstream = new FileWriter(configFile);
            BufferedWriter out = new BufferedWriter(fstream);
            out.write(epinSof);
            String url = "\"" + URL + "\"" " />";
            out.write(url);
            out.write ("\n\t" + endtags + "\n\t" + fileTag);
            String path =  "\"" + resFile + "\" " + "charset=\"UTF-8\">";
            out.write(path);
            String loop = "\n\t" + openLoop + "\n\t" + "\n\t" + varDef + xQuery;
            out.write(loop);
            out.write("\n\t" + xqParam + "\n\t" + xqexpression);
            out.write(epinionTag);
            String endOffile = endOfEpin + "\n\t" + endLoopBody + "\n\t" + endFileConfig;
            out.write(endOffile);
            out.flush();
            out.close();

        }
        catch (IOException e)
        {
        }
    }
     public ArrayList<DataUnit> harvest()
     {
            revs.clear();
            epinionSections = new ArrayList<String>();
            writeEpinionConfigFile();
            executeConfig();
            deNoiseEpinion();
            return revs;
     }
}
TOP

Related Classes of DataAcquisition.EpinionSource

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.