Source Code of NewsComExtractor

import java.io.File;
import java.net.URL;


import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.vietspider.common.io.DataWriter;
import org.vietspider.html.HTMLDocument;
import org.vietspider.html.parser.HTMLParser2;
import org.vietspider.html.path2.HTMLExtractor;
import org.vietspider.html.path2.NodePath;
import org.vietspider.html.path2.NodePathParser;
import org.vietspider.net.client.HttpResponseReader;
import org.vietspider.net.client.WebClient;


/**
 *  Author : Nhu Dinh Thuan
 *          Email:nhudinhthuan@yahoo.com
 * Dec 5, 2006
 */
public class NewsComExtractor {
  
  private static byte[] download(WebClient webClient, String address) throws Exception {
    HttpGet httpGet = null;
    try {
      httpGet = webClient.createGetMethod(address, null);      


      if(httpGet == null) return null;
      HttpHost httpHost = webClient.createHttpHost(address);
      HttpResponse httpResponse = webClient.execute(httpHost, httpGet);


      HttpResponseReader httpResponseReader = new HttpResponseReader();
      return httpResponseReader.readBody(httpResponse);
    } catch(Exception exp) {
      throw exp;
    }
  }
  
  public static void main(String[] args) throws Exception  {
    URL url = new URL("http://news.google.com.vn/");
    
    WebClient webClient = new WebClient();
    webClient.setURL(null, url);
    
    NodePathParser pathParser = new NodePathParser();
    HTMLExtractor htmlExtractor = new HTMLExtractor();
    
    byte  [] bytes = download(webClient, "http://news.google.com.vn/");
    
    HTMLDocument document = new HTMLParser2().createDocument(bytes, null);
    
    String [] paths = {
        "BODY[0].TABLE[2].TBODY[0].TR[0].TD[3].TABLE[1].TBODY[0].TR[0].TD[0].TABLE[0].TBODY[0].TR[0].TD[0].DIV[0].TABLE[0].TBODY[0].TR[0].TD[0].DIV[*]"
    };
    
    NodePath [] nodePaths = new NodePath[paths.length];
    for(int i=0; i<paths.length; i++){
      nodePaths[i] = pathParser.toPath(paths[i]);
    }
    
    HTMLDocument doc = htmlExtractor.extract(document, nodePaths);
    System.out.println(doc.getTextValue());
    
    paths = new String[]{
        "DIV[*].BR[*]",
    };
    
    nodePaths = new NodePath[paths.length];
    for(int i=0; i<paths.length; i++){
      nodePaths[i] = pathParser.toPath(paths[i]);
    }
    
    htmlExtractor.remove(doc.getRoot(), nodePaths);
    
    System.out.println(doc.getRoot().getTextValue());
    
    File file = new File("a.html");
    byte[] data = doc.getTextValue().getBytes();
    new DataWriter().save(file, data);
  }
  
}
Source Code of NewsComExtractor

Related Classes of NewsComExtractor