// Response metadata captured from the download; these stay null when the load does not succeed.
String charSet = null;
String contentType = null;
String contentEncoding = null;

// Fetch the page with a single attempt.
// A retry loop (up to three attempts, re-encoding the URL after an "invalid uri"
// error and sleeping between the other retries) used to follow open(); it is disabled.
HttpLoader urlLoader = new HttpLoader();
int loadStatus = urlLoader.open(url);

if (loadStatus == HttpLoader.LOAD_SUCCESS) {
    contentType = urlLoader.getContentType();
    contentEncoding = urlLoader.getContentEncoding();

    // Wrap the response stream to obtain the page text and its character set.
    WebStream pageStream = new WebStream(urlLoader.getStream(), "", contentType, contentEncoding);
    rawPage = pageStream.getString();
    charSet = pageStream.getCharSet();
    pageStream.clear();
}
// "links" action: print every link found in the downloaded document, one per line.
if ("links".equals(action)) {
    printVerbose(url, scriptsPath, action, verbose);

    List<String> collectedLinks;
    if (!HttpLoader.isRss(contentType, null)) {
        // Not a feed: extract absolute links, then pass them through htmlLinks.
        List<String> absoluteLinks = HttpUtils.extractAbsoluteLinks(rawPage, url, 2);
        String[] processedLinks = htmlLinks(url, rawPage, absoluteLinks.toArray(new String[]{}), scriptsPath, null);
        collectedLinks = Arrays.asList(processedLinks);
    }
    else {
        // Feed content: take the links straight from the feed.
        collectedLinks = HttpUtils.extractLinksFromFeed(rawPage);
    }

    for (String collectedLink : collectedLinks) {
        try {
            // Trim and normalize before printing; a failure on one link does not stop the loop.
            String normalizedLink = HttpUtils.urlNormalize(collectedLink.trim(), null);
            System.out.println(normalizedLink);
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
}
// "parse" action: print the title, date, extracted text and page content of the document.
if ("parse".equals(action)) {
    printVerbose(url, scriptsPath, action, verbose);

    // Ask htmlParse for "title", "date" and "page"; default to empty strings when it returns nothing.
    HashMap<String, String> parsedFields = htmlParse(url, rawPage, contentType, scriptsPath, null);
    boolean hasParsedFields = parsedFields != null && parsedFields.size() > 0;
    String parsedTitle = hasParsedFields ? parsedFields.get("title") : "";
    String parsedDate = hasParsedFields ? parsedFields.get("date") : "";
    String parsedPage = hasParsedFields ? parsedFields.get("page") : "";

    // Extract plain text: use the parsed page when there is one, otherwise the raw download.
    TikaWrapper textExtractor = new TikaWrapper(TikaWrapper.OUTPUT_FORMAT_TEXT);
    boolean hasParsedPage = parsedPage != null && !"".equals(parsedPage);
    InputStream htmlInput = hasParsedPage
            ? IOUtils.toInputStream(parsedPage)
            : IOUtils.toInputStream(rawPage);
    textExtractor.process(htmlInput, TikaWrapper.CONTENT_TYPE_HTML);
    String extractedText = textExtractor.getText();

    // Fall back to the title reported by the extractor when htmlParse supplied none.
    if (parsedTitle == null || "".equals(parsedTitle)) {
        parsedTitle = textExtractor.getMetaTitle();
    }

    System.out.println("Title = "+ parsedTitle);
    System.out.println("Date = " + parsedDate);
    System.out.println("Text = " + extractedText);
    System.out.println("Page = " + parsedPage);
}
// "meta" action: print every key/value pair returned by extractMeta.
if ("meta".equals(action)) {
    printVerbose(url, scriptsPath, action, verbose);

    HashMap<String, String> extractedMeta = extractMeta(url, rawPage, contentType, charSet, scriptsPath, null, false);
    if (extractedMeta != null) {
        // An empty map simply produces no output.
        for (Map.Entry<String, String> metaEntry : extractedMeta.entrySet()) {
            String metaName = metaEntry.getKey();
            String metaValue = metaEntry.getValue();
            System.out.println("meta_extracted_" + metaName + " = " + metaValue);
        }
    }
}
// Close the loader once the requested action has finished.
// NOTE(review): this call sits inside the try block, so it is skipped whenever an
// earlier statement throws; consider moving it to a finally clause - confirm that
// HttpLoader.close() is safe to call after a failed open().
urlLoader.close();
// Drop the local reference; no visible statement reads urlLoader after this point.
urlLoader = null;
}
catch (Exception e) {
e.printStackTrace();
}