package letweb.semanticum.parser;
import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import letweb.semanticum.DatabaseCalls;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
/**
 * Scrapes lecture metadata and video links from a lecture page and passes the result to
 * {@code DatabaseCalls.insertInfo}.
 *
 * <p>Three page layouts are handled, selected by a substring of the URL: "videolectures"
 * ({@link #runVL}), "mit" ({@link #runMit}) and "ted" ({@link #runTed}). All extraction is
 * best-effort: a page that lacks an expected tag or delimiter yields a missing or shortened field
 * instead of an exception.
 */
public class Parser {

    /**
     * Timeout handed to jsoup for every request, in milliseconds. Kept package-visible and
     * mutable because code elsewhere in the package may assign it.
     */
    static int timeOut = 10 * 1000;

    private static final Logger logger = Logger.getLogger("letweb.semanticum.parser");

    /**
     * Picks the site-specific parser for {@code url} and runs it, logging (not rethrowing) I/O
     * and database failures.
     *
     * <p>The checks are plain substring matches over the whole URL and are tried in the order
     * "videolectures", "mit", "ted"; the first match wins. URLs of the following forms are
     * examples of what each parser was written against:
     * <ul>
     *   <li>{@code http://videolectures.net/mlss2011_vandenberghe_convex/}</li>
     *   <li>{@code http://ocw.mit.edu/courses/.../videos/session-1-2-the-start-of-your-lean-journey/}</li>
     *   <li>{@code http://www.ted.com/talks/mark_forsyth_what_s_a_snollygoster_a_short_lesson_in_political_speak.html}</li>
     * </ul>
     *
     * @param url page to parse; a {@code null} or unrecognised URL is logged and ignored
     */
    public void run(String url) {
        if (url == null) {
            logger.warning("Cannot parse a null URL");
            return;
        }
        try {
            if (url.contains("videolectures")) {
                runVL(url);
            } else if (url.contains("mit")) {
                runMit(url);
            } else if (url.contains("ted")) {
                runTed(url);
            } else {
                logger.warning("No parser available for URL: " + url);
            }
        } catch (IOException e) {
            // Pass the exception itself so the stack trace is not lost.
            logger.log(Level.SEVERE, "Could not fetch or parse " + url, e);
        } catch (SQLException e) {
            logger.log(Level.SEVERE, "Could not store metadata for " + url, e);
        }
    }

    /**
     * Parses a "videolectures" page: title, authors, description, keywords and the {@code mms:}
     * links found on each of its {@code video/<n>/} sub-pages.
     *
     * @param url lecture page URL; a trailing slash is added if missing before the
     *     {@code video/<n>/} sub-page URLs are built
     * @throws IOException if the page or one of its sub-pages cannot be fetched
     * @throws SQLException if storing the result fails
     */
    public void runVL(String url) throws IOException, SQLException {
        Document doc = fetch(url);
        MetaData vl = new MetaData();
        vl.setKind("VL");

        // Title: content of any <meta> whose name contains "title" (last match wins).
        Elements metas = doc.select("meta");
        for (int i = 0; i < metas.size(); i++) {
            if (metaNameContains(metas, i, "title")) {
                vl.setLecturesName(metas.get(i).attr("content"));
            }
        }

        // One pass over the <div>s collects authors, the description and the number of parts.
        ArrayList<String> authors = new ArrayList<String>();
        int videoNum = 0;
        Elements divs = doc.select("div");
        for (int i = 0; i < divs.size(); i++) {
            String cssClass = divs.get(i).attr("class");
            if (cssClass.toLowerCase(Locale.ROOT).contains("lec_data")) {
                authors.add(divs.get(i).select("a").text());
            }
            if (divs.get(i).attr("id").contentEquals("lec_desc_edit")) {
                vl.setDetails(divs.get(i).select("p").text());
            }
            if (cssClass.contains("partlist_thumb")) {
                videoNum++;
            }
        }

        // Keywords: text of every link inside an <li> whose href has exactly four slashes and
        // does not point at an "authors" path.
        StringBuilder keywords = new StringBuilder();
        Elements items = doc.select("li");
        for (int i = 0; i < items.size(); i++) {
            Elements anchors = items.get(i).select("a");
            for (int j = 0; j < anchors.size(); j++) {
                String href = anchors.get(j).attr("href");
                if (countChar(href, '/') == 4 && !href.contains("authors")) {
                    keywords.append(anchors.get(j).text()).append(", ");
                }
            }
        }
        // Same threshold as before: only set keywords when more than three characters were
        // collected, then drop the trailing ", ".
        if (keywords.length() > 3) {
            vl.setKeywords(keywords.substring(0, keywords.length() - 2));
        }
        vl.setAuthors(authors);

        // Each part lives on its own sub-page, numbered from 1.
        String base = url.endsWith("/") ? url : url + "/";
        ArrayList<String> links = new ArrayList<String>();
        for (int part = 1; part <= videoNum; part++) {
            Elements anchors = fetch(base + "video/" + part + "/").select("a");
            for (int j = 0; j < anchors.size(); j++) {
                String href = anchors.get(j).attr("href");
                if (href.contains("mms:")) {
                    links.add(href);
                }
            }
        }
        vl.setLink(links);

        store(url, vl);
    }

    /**
     * Parses a "ted" page: speaker and title from the title meta tag, description from the
     * description meta tag, and the video link from the {@code embedthisvideo} input.
     *
     * <p>The title meta content is split as {@code <author>: <title>|...}. When a delimiter is
     * missing the affected field degrades (no author, or title running to the end of the
     * content) instead of throwing.
     *
     * @param url talk page URL
     * @throws IOException if the page cannot be fetched
     * @throws SQLException if storing the result fails
     */
    public void runTed(String url) throws IOException, SQLException {
        Document doc = fetch(url);
        MetaData ted = new MetaData();
        ted.setKind("TED");

        ArrayList<String> authors = new ArrayList<String>();
        Elements metas = doc.select("meta");
        for (int i = 0; i < metas.size(); i++) {
            if (metaNameContains(metas, i, "title")) {
                String content = metas.get(i).attr("content");
                int colon = content.indexOf(':');
                // Skip the colon and the character after it (presumably a space).
                int start = colon < 0 ? 0 : Math.min(colon + 2, content.length());
                int pipe = content.indexOf('|');
                int end = pipe >= start ? pipe : content.length();
                ted.setLecturesName(content.substring(start, end));
                if (colon >= 0) {
                    authors.add(content.substring(0, colon));
                }
            }
            if (metaNameContains(metas, i, "description")) {
                String content = metas.get(i).attr("content");
                int end = content.indexOf('<');
                if (end < 0) {
                    end = content.length();
                }
                // The first 10 characters are skipped as before (presumably a fixed prefix --
                // TODO confirm); content too short to carry them is kept from the start.
                int start = end >= 10 ? 10 : 0;
                ted.setDetails(content.substring(start, end));
            }
        }
        ted.setAuthors(authors);

        // The link is the text between "vu=" and the following "mp4" (inclusive) in the
        // serialized embed input; inputs lacking either marker contribute nothing.
        ArrayList<String> links = new ArrayList<String>();
        Elements inputs = doc.select("input");
        for (int i = 0; i < inputs.size(); i++) {
            if (inputs.get(i).attr("id").contentEquals("embedthisvideo")) {
                String html = inputs.get(i).toString();
                int marker = html.indexOf("vu=");
                if (marker < 0) {
                    continue;
                }
                int start = marker + 3;
                int mp4 = html.indexOf("mp4", start);
                if (mp4 >= 0) {
                    links.add(html.substring(start, mp4 + 3));
                }
            }
        }
        ted.setLink(links);

        store(url, ted);
    }

    /**
     * Parses a "mit" page: authors, title and keywords from meta tags, and download links from
     * the list items that follow an "iTunes" or "Internet Archive" item.
     *
     * @param url lecture page URL
     * @throws IOException if the page cannot be fetched
     * @throws SQLException if storing the result fails
     */
    public void runMit(String url) throws IOException, SQLException {
        Document doc = fetch(url);
        MetaData mit = new MetaData();
        mit.setKind("MIT");

        ArrayList<String> authors = new ArrayList<String>();
        Elements metas = doc.select("meta");
        for (int i = 0; i < metas.size(); i++) {
            String content = metas.get(i).attr("content");
            if (metaNameContains(metas, i, "author")) {
                authors.add(content);
            }
            if (metaNameContains(metas, i, "title")) {
                mit.setLecturesName(content);
            }
            if (metaNameContains(metas, i, "keyword") && !content.isEmpty()) {
                // The last character is dropped as before (presumably a trailing separator --
                // TODO confirm); empty content is skipped rather than failing.
                mit.setKeywords(content.substring(0, content.length() - 1));
            }
        }
        mit.setAuthors(authors);

        ArrayList<String> links = new ArrayList<String>();
        Elements items = doc.select("li");
        for (int i = 0; i < items.size(); i++) {
            String text = items.get(i).text();
            if (!text.contains("iTunes") && !text.contains("Internet Archive")) {
                continue;
            }
            // The link is taken from the item AFTER the marker, which is consumed as well.
            i++;
            if (i >= items.size()) {
                break; // marker was the last item: nothing follows it
            }
            String html = items.get(i).outerHtml();
            int firstQuote = html.indexOf('"');
            int lastQuote = html.lastIndexOf('"');
            if (firstQuote >= 0 && lastQuote > firstQuote) {
                links.add(html.substring(firstQuote + 1, lastQuote));
            }
        }
        mit.setLink(links);

        store(url, mit);
    }

    /** Fetches and parses {@code url} with jsoup, applying {@link #timeOut}. */
    private static Document fetch(String url) throws IOException {
        return Jsoup.connect(url).timeout(timeOut).get();
    }

    /** Hands every field of {@code meta}, together with the source URL, to the database layer. */
    private static void store(String url, MetaData meta) throws SQLException {
        DatabaseCalls db = new DatabaseCalls();
        db.insertInfo(url, meta.getLink(), meta.getAuthors(), meta.getLecturesName(),
                meta.getDetails(), meta.getKeywords(), meta.getKind(), meta.getData());
    }

    /**
     * Tells whether the {@code name} attribute of the i-th element contains {@code needle},
     * ignoring case in a locale-independent way.
     */
    private static boolean metaNameContains(Elements metas, int i, String needle) {
        return metas.get(i).attr("name").toLowerCase(Locale.ROOT).contains(needle);
    }

    /** Counts the occurrences of {@code wanted} in {@code text}. */
    private static int countChar(String text, char wanted) {
        int count = 0;
        for (int i = 0; i < text.length(); i++) {
            if (text.charAt(i) == wanted) {
                count++;
            }
        }
        return count;
    }
}