Package letweb.semanticum.parser

Source Code of letweb.semanticum.parser.Parser

package letweb.semanticum.parser;

import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.logging.Level;
import java.util.logging.Logger;

import letweb.semanticum.DatabaseCalls;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

public class Parser {

  static int timeOut = 10 * 1000;
  private static Logger logger = Logger.getLogger("letweb.semanticum.parser");

  public void run(String url){

//     String url = "http://ocw.mit.edu/courses/aeronautics-and-astronautics/16-660-introduction-to-lean-six-sigma-methods-january-iap-2008/videos/session-1-2-the-start-of-your-lean-journey/";
//     String url = "http://www.ted.com/talks/mark_forsyth_what_s_a_snollygoster_a_short_lesson_in_political_speak.html";
//    String url = "http://videolectures.net/mlss2011_vandenberghe_convex/";
    try {
     
    if (url.contains("videolectures"))
        runVL(url);
    else if (url.contains("mit"))
      runMit(url);
    else if (url.contains("ted"))
      runTed(url);
   
    } catch (IOException e) {
      logger.severe(e.getMessage());
    } catch (SQLException e) {
      logger.severe(e.getMessage());
    }
  }

  public void runVL(String url) throws IOException, SQLException {
    Document doc = Jsoup.connect(url).timeout(timeOut).timeout(timeOut).get();

    ArrayList<String> temp = new ArrayList<String>();
    MetaData vl = new MetaData();

    vl.setKind("VL");
    // Metadata extractor
    Elements tag = doc.select("meta");
    for (int i = 0; i < tag.size(); i++) {
      if (tag.get(i).attr("name").toLowerCase().contains("title")) {
        vl.setLecturesName(tag.get(i).attr("content"));
      }
    }

    tag = doc.select("div");
    for (int i = 0; i < tag.size(); i++) {
      if (tag.get(i).attr("class").toLowerCase().contains("lec_data")) {
        temp.add(tag.get(i).select("a").text());
      }
       if (tag.get(i).attr("id").contentEquals("lec_desc_edit")){
        vl.setDetails(tag.get(i).select("p").text());
//        System.out.println(tag.get(i));
      }
    }

    tag = doc.select("li");
    String tmp = "";
    for (int i = 0; i < tag.size(); i++) {
      Elements a = tag.get(i).select("a");
      for (int j = 0; j < a.size(); j++) {
       
//        int num = StringUtils.countMatches(a.get(j).attr("href"), "/");
       
        // Da testare
        int count = 0;
        for(int h =0; h < a.get(j).attr("href").length(); h++)
            if(a.get(j).attr("href").charAt(h) == '/')
                count++;
       
        if (count == 4) {
          if (!a.get(j).attr("href").contains("authors")) {
            tmp += a.get(j).text() + ", ";
          }
        }
      }
    }
    if (tmp.length() > 3)
      vl.setKeywords(tmp.substring(0, tmp.length() - 2));

    vl.setAuthors(temp);

    // Link Extractor
    Elements links = doc.select("div");
    int videoNum = 0;
    for (int i = 0; i < links.size(); i++) {
      if (links.get(i).attr("class").contains("partlist_thumb")) {
        videoNum++;
      }
    }

    temp = new ArrayList<String>();
    for (int i = 0; i < videoNum; i++) {
      int index = i + 1;
      doc = Jsoup.connect(url + "video/" + index + "/").timeout(timeOut).timeout(timeOut).get();
      // Link Extractor
      links = doc.select("a");
      for (int j = 0; j < links.size(); j++) {
        if (links.get(j).attr("href").contains("mms:")) {
          temp.add(links.get(j).attr("href"));
        }
      }

    }
    vl.setLink(temp);
    DatabaseCalls db = new DatabaseCalls();
    db.insertInfo(url, vl.getLink(), vl.getAuthors(), vl.getLecturesName(), vl.getDetails(), vl.getKeywords(), vl.getKind(), vl.getData());

  }

  public void runTed(String url) throws IOException, SQLException {
    Document doc = Jsoup.connect(url).timeout(timeOut).timeout(timeOut).get();

    ArrayList<String> temp = new ArrayList<String>();
    MetaData ted = new MetaData();

    ted.setKind("TED");
    // Metadata extractor
    Elements tag = doc.select("meta");
    for (int i = 0; i < tag.size(); i++) {
      if (tag.get(i).attr("name").toLowerCase().contains("title")) {
        ted.setLecturesName(tag.get(i).attr("content").substring(tag.get(i).attr("content").indexOf(":") + 2, tag.get(i).attr("content").indexOf("|")));
        temp.add((tag.get(i).attr("content").substring(0, tag.get(i).attr("content").indexOf(":"))));
      }
      if (tag.get(i).attr("name").toLowerCase().contains("description")) {
        ted.setDetails(tag.get(i).attr("content").substring(10, tag.get(i).attr("content").indexOf("<")));
        // System.out.println("Content: " + content);
      }
    }

    ted.setAuthors(temp);
    temp = new ArrayList<String>();

    // Link Extractor
    Elements links = doc.select("input");
    for (int i = 0; i < links.size(); i++) {
      if (links.get(i).attr("id").contentEquals("embedthisvideo")) {
        temp.add(links.get(i).toString().substring(links.get(i).toString().indexOf("vu=") + 3, links.get(i).toString().indexOf("mp4") + 3));
        // System.out.println("Link: " + uriString);
      }
    }

    ted.setLink(temp);
//    print(ted);
    DatabaseCalls db = new DatabaseCalls();
    db.insertInfo(url, ted.getLink(), ted.getAuthors(), ted.getLecturesName(), ted.getDetails(), ted.getKeywords(), ted.getKind(), ted.getData());
   
  }

  public void runMit(String url) throws IOException, SQLException {
    Document doc = Jsoup.connect(url).timeout(timeOut).timeout(timeOut).get();

    ArrayList<String> temp = new ArrayList<String>();
    MetaData mit = new MetaData();

    mit.setKind("MIT");
    // Metadata extractor
    Elements tag = doc.select("meta");
    for (int i = 0; i < tag.size(); i++) {
      if (tag.get(i).attr("name").toLowerCase().contains("author")) {
        temp.add(tag.get(i).attr("content"));
      }
      if (tag.get(i).attr("name").toLowerCase().contains("title")) {
        mit.setLecturesName(tag.get(i).attr("content"));
      }
      if (tag.get(i).attr("name").toLowerCase().contains("keyword")) {
        mit.setKeywords(tag.get(i).attr("content").substring(0, tag.get(i).attr("content").length() - 1));
      }
    }

    mit.setAuthors(temp);
    temp = new ArrayList<String>();

    // LInk extractor
    Elements links = doc.select("li");
    for (int i = 0; i < links.size(); i++) {
      if (links.get(i).text().contains("iTunes") || links.get(i).text().contains("Internet Archive")) {
        i = i + 1;
        int init = links.get(i).outerHtml().indexOf("\"") + 1;
        int end = links.get(i).outerHtml().lastIndexOf("\"");
        temp.add(links.get(i).outerHtml().substring(init, end));
      }
    }
    mit.setLink(temp);
    DatabaseCalls db = new DatabaseCalls();
    db.insertInfo(url, mit.getLink(), mit.getAuthors(), mit.getLecturesName(), mit.getDetails(), mit.getKeywords(), mit.getKind(), mit.getData());
  }

//  public void print(MetaData meta) {
//
//    System.out.println("********************************************************************************************************");
//    System.out.println(meta.getKind());
//    System.out.println(meta.getLecturesName());
//
//    System.out.println("___________________________");
//    ArrayList<String> temp = new ArrayList<String>();
//    temp = meta.getAuthors();
//    for (int i = 0; i < temp.size(); i++)
//      System.out.println(temp.get(i));
//    System.out.println("---------------------------");
//    System.out.println(meta.getDetails());
//    System.out.println(meta.getKeywords());
//    System.out.println(meta.getData());
//
//    System.out.println("___________________________");
//    temp = new ArrayList<String>();
//    temp = meta.getLink();
//    for (int i = 0; i < temp.size(); i++)
//      System.out.println(temp.get(i));
//    System.out.println("___________________________");
//
//  }

}
TOP

Related Classes of letweb.semanticum.parser.Parser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.