package net.sf.jabref.external;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import net.sf.jabref.BibtexEntry;
import net.sf.jabref.Globals;
import net.sf.jabref.net.URLDownload;
/**
* Utility class for trying to resolve URLs to full-text PDF for articles.
*/
public class FindFullText {
public final static int
FOUND_PDF = 0,
WRONG_MIME_TYPE = 1,
UNKNOWN_DOMAIN = 2,
LINK_NOT_FOUND = 3,
IO_EXCEPTION = 4,
NO_URLS_DEFINED = 5;
List<FullTextFinder> finders = new ArrayList<FullTextFinder>();
public FindFullText() {
finders.add(new ScienceDirectPdfDownload());
finders.add(new SpringerLinkPdfDownload());
}
public FindResult findFullText(BibtexEntry entry) {
String urlText = entry.getField("url");
String doiText = entry.getField("doi");
// First try the DOI link, if defined:
if ((doiText != null) && (doiText.trim().length() > 0)) {
FindResult resDoi = lookForFullTextAtURL(Globals.DOI_LOOKUP_PREFIX+doiText);
if (resDoi.status == FOUND_PDF)
return resDoi;
// The DOI link failed, try falling back on the URL link, if defined:
else if ((urlText != null) && (urlText.trim().length() > 0)) {
FindResult resUrl = lookForFullTextAtURL(urlText);
if (resUrl.status == FOUND_PDF)
return resUrl;
else {
return resDoi; // If both URL and DOI fail, we assume that the error code for DOI is
// probably the most relevant.
}
}
else return resDoi;
}
// No DOI? Try URL:
else if ((urlText != null) && (urlText.trim().length() > 0)) {
return lookForFullTextAtURL(urlText);
}
// No URL either? Return error code.
else return new FindResult(NO_URLS_DEFINED, null);
}
private FindResult lookForFullTextAtURL(String urlText) {
try {
URL url = new URL(urlText);
url = resolveRedirects(url, 0);
boolean domainKnown = false;
for (FullTextFinder finder : finders) {
if (finder.supportsSite(url)) {
domainKnown = true;
URL result = finder.findFullTextURL(url);
if (result != null) {
// Check the MIME type of this URL to see if it is a PDF. If not,
// it could be because the user doesn't have access:
try {
URLDownload udl = new URLDownload(null, result, null);
udl.openConnectionOnly();
String mimeType = udl.getMimeType();
if ((mimeType != null) && (mimeType.toLowerCase().equals("application/pdf"))) {
return new FindResult(result, url);
}
else {
udl = new URLDownload(null, result, new File("page.html"));
udl.download();
return new FindResult(WRONG_MIME_TYPE, url);
}
} catch (IOException ex) {
ex.printStackTrace();
return new FindResult(IO_EXCEPTION, url);
}
}
}
}
if (!domainKnown)
return new FindResult(UNKNOWN_DOMAIN, url);
else
return new FindResult(LINK_NOT_FOUND, url);
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
return null;
}
/**
* Follow redirects until the final location is reached. This is necessary to handle DOI links, which
* redirect to publishers' web sites. We need to know the publisher's domain name in order to choose
* which FullTextFinder to use.
* @param url The url to start with.
* @param redirectCount The number of previous redirects. We will follow a maximum of 5 redirects.
* @return the final URL, or the initial one in case there is no redirect.
* @throws IOException for connection error
*/
private URL resolveRedirects(URL url, int redirectCount) throws IOException {
URLConnection uc = url.openConnection();
if (uc instanceof HttpURLConnection) {
HttpURLConnection huc = (HttpURLConnection)uc;
huc.setInstanceFollowRedirects(false);
huc.connect();
int responseCode = huc.getResponseCode();
String location = huc.getHeaderField("location");
huc.disconnect();
if ((responseCode == HttpURLConnection.HTTP_MOVED_TEMP) && (redirectCount < 5)) {
//System.out.println(responseCode);
//System.out.println(location);
try {
URL newUrl = new URL(location);
return resolveRedirects(newUrl, redirectCount+1);
} catch (MalformedURLException ex) {
return url; // take the previous one, since this one didn't make sense.
// TODO: this could be caused by location being a relative link, but this would just give
// the default page in the case of www.springerlink.com, not the article page. Don't know why.
}
}
else return url;
}
else return url;
}
public static String loadPage(URL url) throws IOException {
Reader in = null;
URLConnection uc;
HttpURLConnection huc = null;
try {
uc = url.openConnection();
if (uc instanceof HttpURLConnection) {
huc = (HttpURLConnection)uc;
huc.setInstanceFollowRedirects(false);
huc.connect();
in = new InputStreamReader(huc.getInputStream());
StringBuilder sb = new StringBuilder();
int c;
while ((c = in.read()) != -1)
sb.append((char)c);
return sb.toString();
}
else
return null; // TODO: are other types of connection (https?) relevant?
} finally {
try {
if (in != null) in.close();
if (huc != null) huc.disconnect();
} catch (IOException ex) { ex.printStackTrace(); }
}
}
public static class FindResult {
public URL url;
public String host = null;
public int status;
public FindResult(URL url, URL originalUrl) {
this.url = url;
this.status = FOUND_PDF;
if (originalUrl != null)
host = originalUrl.getHost();
}
public FindResult(int status, URL originalUrl) {
this.url = null;
this.status = status;
if (originalUrl != null)
this.host = originalUrl.getHost();
}
}
public static void dumpToFile(String text, File f) {
try {
FileWriter fw = new FileWriter(f);
fw.write(text);
fw.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}