Package DataAcquisition

Source Code of DataAcquisition.AmazonSource

/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/

package DataAcquisition;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.webharvest.definition.DefinitionResolver;
import org.webharvest.definition.ScraperConfiguration;
import org.webharvest.runtime.Scraper;
import org.webharvest.runtime.variables.Variable;
import preprocessing.*;
import seekfeel.dataholders.DataUnit;
import seekfeel.dataholders.Review;

/**
*
* @author Tarek
*/
public class AmazonSource extends WebDataSource {

    final String sof = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + "\n" + "<config>" + "\n\t" "<file action=\"write\"";
    String midOfFile = "\n" + "</html-to-xml>" + "\n\t" + "</xq-param>" + "\n\t" + xqExp ;
    final String eof = endOfEpin + "\n\t" + endFileConfig;
    String text = "\n" + "<xQuery>" + "\n\t\t" + "<xq-param name=\"doc\">" + "\n\t\t" + "<html-to-xml>" +"\n\t\t";
    String amazonTag = "div[@style=\"margin-left:0.5em;\"]/text()";

    public AmazonSource(String url)
    {
        super(url);
    }
    private void writeAmazonConfigFile()
    {
        FileWriter fstream = null;
        try
        {
            fstream = new FileWriter(configFile);
            BufferedWriter out = new BufferedWriter(fstream);
            out.write(sof);
            String path = " path=" + "\"" + resFile + "\" " + "charset=\"UTF-8\">";
            out.write(path);
            out.write(text);
            path = "<file action=\"read\" path=" + "\"" + htmlFile + "\"" + " charset=\"UTF-8\">" + "\n\t" + "</file>"  ;
            out.write(path);
            out.write(midOfFile);
            out.write(amazonTag);
            out.write(eof);
            out.flush();
            out.close();
        }
        catch (IOException e)
        {
        }
    }
    private void deNoiseAmazon()
    {
         try
        {
            BufferedReader in = new BufferedReader(new FileReader(resFile));
            String str;
            int count = 0;
            int start = 1;
            Review r = new Review();
            while ((str = in.readLine()) != null)
            {
                if(!(str.equals("<name/>")|| str.equals("<name>")))
                {
                    if(str.matches("<name>" + ".+" + "</name>" ))
                    {

                        String text= str.substring(6 , str.length() - 7);
                        if(count >= 3)
                        {
                           if( start != 1)
                           {
                             revs.add(r);
                           }
                           else
                           {
                                start = 0;
                           }
                           r = new Review();
                           r.Review_Body = text;     
                        }
                        else
                        {
                             r.Review_Body += " " + text;
                        }
                         count =0;
                    }
                }
                else
                {
                     count++;
                }
            }
            in.close();
        }
        catch (IOException e)
        {
        }
    }
    public ArrayList<DataUnit> harvest()
    {
            revs.clear();
            writeAmazonConfigFile();
            executeConfig();
            deNoiseAmazon();
            return revs;
    }

}
TOP

Related Classes of DataAcquisition.AmazonSource

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.