package bgu.bio.util;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.io.Writer;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import bgu.bio.ds.rna.RNA;
/**
 * Retrieves information from NCBI for a GI (or an array of GIs) through the
 * E-utilities web service, according to <a
 * href="http://www.ncbi.nlm.nih.gov/books/NBK25499">this guide</a>.
 * <p>
 * All methods are static and perform blocking network requests against the
 * <code>nuccore</code> database.
 *
 * @author milon
 */
public class NCBIInformation {
    public static final String TAX_ID = "TaxId";
    public static final String EXTRA = "Extra";
    public static final String TITLE = "Title";
    public static final String ORGANISM = "Organism";
    public static final String GI = "Gi";

    /** Names of the summary elements copied into the map returned by {@link #fetch(String)}. */
    private static final String[] fields = new String[] { TAX_ID, EXTRA, TITLE,
            ORGANISM, GI };

    /**
     * Base address of the E-utilities. HTTPS is used because URLConnection
     * does not follow a redirect from http to https.
     */
    private static final String EUTILS_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/";

    /** Charset used to decode responses and to re-encode them for the XML parser. */
    private static final String CHARSET = "UTF-8";

    /**
     * Same as {@link #getSequence(String, int, int, boolean, int)} with 3
     * retries.
     */
    public static RNA getSequence(String queryGI, int start, int stop,
            boolean minus) {
        return getSequence(queryGI, start, stop, minus, 3);
    }

    /**
     * Downloads a sequence in FASTA format with efetch and returns the first
     * record that {@code RNA.loadFromFile} reads from it.
     *
     * @param queryGI
     *            the id placed in the <code>id</code> request parameter
     * @param start
     *            value of <code>seq_start</code>; when negative, neither
     *            <code>seq_start</code> nor <code>seq_stop</code> is sent
     * @param stop
     *            value of <code>seq_stop</code> (only sent when
     *            <code>start &gt;= 0</code>)
     * @param minus
     *            selects the value sent in the <code>strand</code> parameter
     * @param attempts
     *            number of retries after a failed download; the download is
     *            tried <code>attempts + 1</code> times in total
     * @return the first parsed record, or <code>null</code> when
     *         <code>attempts</code> is negative
     * @throws RuntimeException
     *             wrapping the last I/O failure once all tries are used up
     */
    public static RNA getSequence(String queryGI, int start, int stop,
            boolean minus, int attempts) {
        if (attempts < 0) {
            return null;
        }
        // NOTE(review): the E-utilities guide lists strand=1 (plus) and
        // strand=2 (minus); 0 is not a documented value. Kept as is because
        // callers may rely on the current behaviour - confirm and fix.
        int strand = minus ? 0 : 1;
        StringBuilder address = new StringBuilder(EUTILS_BASE);
        address.append("efetch.fcgi?db=nuccore&id=").append(queryGI);
        address.append("&strand=").append(strand);
        if (start >= 0) {
            address.append("&seq_start=").append(start);
            address.append("&seq_stop=").append(stop);
        }
        address.append("&rettype=fasta&retmode=text");

        String fasta = null;
        int remaining = attempts;
        while (fasta == null) {
            try {
                fasta = download(address.toString());
            } catch (IOException ex) {
                if (remaining <= 0) {
                    throw new RuntimeException(ex);
                }
                remaining--;
            }
        }
        return RNA.loadFromFile(new BufferedReader(new StringReader(fasta)),
                false).get(0);
    }

    /**
     * Runs an esearch query and returns the text of every <code>Id</code>
     * element found under the first <code>IdList</code> element of the
     * response.
     *
     * @param term
     *            the search term; it is trimmed and appended to the request
     *            URL as is (it is NOT URL-encoded here, so it must already be
     *            URL-safe)
     * @return the ids, in document order; empty when the list has no ids
     * @throws RuntimeException
     *             when the response contains no <code>IdList</code> element
     */
    public static List<String> search(String term) throws IOException,
            ParserConfigurationException, SAXException {
        String response = download(EUTILS_BASE
                + "esearch.fcgi?db=nuccore&version=2.0&term=" + term.trim());
        Element idList = (Element) parseXml(response).getDocumentElement()
                .getElementsByTagName("IdList").item(0);
        if (idList == null) {
            throw new RuntimeException("Didn't get information: " + response);
        }
        NodeList ids = idList.getElementsByTagName("Id");
        List<String> list = new ArrayList<String>(ids.getLength());
        for (int i = 0; i < ids.getLength(); i++) {
            list.add(ids.item(i).getTextContent());
        }
        return list;
    }

    /**
     * Requests the esummary record of one id and extracts the fields
     * {@link #TAX_ID}, {@link #EXTRA}, {@link #TITLE}, {@link #ORGANISM} and
     * {@link #GI} from it.
     * <p>
     * If the response holds several <code>DocumentSummary</code> elements,
     * the values of later ones overwrite those of earlier ones.
     *
     * @param queryGI
     *            the id to look up; surrounding whitespace is ignored
     * @return a map from field name to the trimmed text of that field
     * @throws RuntimeException
     *             when the response holds no document summary, or when one of
     *             the fields is absent or empty
     */
    public static HashMap<String, String> fetch(String queryGI)
            throws IOException, ParserConfigurationException, SAXException {
        String response = download(EUTILS_BASE
                + "esummary.fcgi?db=nuccore&version=2.0&id=" + queryGI.trim());
        Element summarySet = (Element) parseXml(response).getDocumentElement()
                .getElementsByTagName("DocumentSummarySet").item(0);
        NodeList summaries = summarySet == null ? null : summarySet
                .getElementsByTagName("DocumentSummary");
        if (summaries == null || summaries.getLength() == 0) {
            throw new RuntimeException("Didn't get information: " + response);
        }
        HashMap<String, String> ans = new HashMap<String, String>();
        for (int i = 0; i < summaries.getLength(); i++) {
            Element summary = (Element) summaries.item(i);
            for (String name : fields) {
                // Each step may yield null: no such element, an empty
                // element, or a first child that carries no text value.
                Node element = summary.getElementsByTagName(name).item(0);
                Node content = element == null ? null : element.getFirstChild();
                String value = content == null ? null : content.getNodeValue();
                if (value == null) {
                    throw new RuntimeException("Missing field " + name + ": "
                            + response);
                }
                ans.put(name, value.trim());
            }
        }
        return ans;
    }

    /**
     * Fetches the summary of every id in the array.
     * <p>
     * Entries that differ only in surrounding whitespace refer to the same id
     * and trigger a single request; each distinct entry still gets its own
     * key and its own map instance in the result.
     *
     * @param queryGI
     *            the ids to look up; duplicates are allowed
     * @return a map from each distinct entry of <code>queryGI</code> (exactly
     *         as given) to the result of {@link #fetch(String)} for it
     */
    public static HashMap<String, HashMap<String, String>> fetch(
            String[] queryGI) throws IOException, ParserConfigurationException,
            SAXException {
        HashMap<String, HashMap<String, String>> ans = new HashMap<String, HashMap<String, String>>();
        // Summaries already downloaded, keyed by the trimmed id.
        HashMap<String, HashMap<String, String>> downloaded = new HashMap<String, HashMap<String, String>>();
        for (String raw : queryGI) {
            if (ans.containsKey(raw)) {
                continue;
            }
            String id = raw.trim();
            HashMap<String, String> summary = downloaded.get(id);
            if (summary == null) {
                summary = fetch(id);
                downloaded.put(id, summary);
            }
            // Copy so that no two keys share one mutable map.
            ans.put(raw, new HashMap<String, String>(summary));
        }
        return ans;
    }

    /**
     * Finds the database record that scores highest against the query (see
     * {@link #score(HashMap, HashMap)}).
     *
     * @param query
     *            the record to compare against
     * @param database
     *            candidate records; only the values are inspected
     * @return the {@link #GI} value of the best record (the first one met
     *         when several share the best score), or the empty string when
     *         the database is empty
     */
    public static String match(HashMap<String, String> query,
            HashMap<String, HashMap<String, String>> database) {
        String ans = "";
        int bestScore = -1;
        for (HashMap<String, String> current : database.values()) {
            int currentScore = score(query, current);
            if (currentScore > bestScore) {
                bestScore = currentScore;
                ans = current.get(GI);
            }
        }
        return ans;
    }

    /**
     * Scores how well two records agree. An equal {@link #GI} is worth
     * <code>(fields.length + 1) * 100</code> points, so it outweighs all other
     * fields together; equal {@link #TAX_ID}, {@link #ORGANISM} and
     * {@link #TITLE} are worth one point each. A field missing from
     * <code>current</code> never counts as equal.
     *
     * @param query
     *            the record searched for
     * @param current
     *            the candidate record
     * @return the score, 0 when nothing agrees
     */
    public static int score(HashMap<String, String> query,
            HashMap<String, String> current) {
        int currentScore = 0;
        if (sameField(current, query, GI)) {
            currentScore += (fields.length + 1) * 100;
        }
        if (sameField(current, query, TAX_ID)) {
            currentScore++;
        }
        if (sameField(current, query, ORGANISM)) {
            currentScore++;
        }
        if (sameField(current, query, TITLE)) {
            currentScore++;
        }
        return currentScore;
    }

    /** Small manual demo of fetch, match, search and getSequence; needs network access. */
    public static void main(String[] args) throws IOException,
            ParserConfigurationException, SAXException {
        HashMap<String, HashMap<String, String>> database = NCBIInformation
                .fetch(("340616011, 340616011, 319951593, 319951593, 209401268, 423474542, 423474542, 423474542, 423474542, 423603419, 423603419, 423603419, 423603419, 423603419, 423603419, 423572779, 423572779, 423572779, 423572779, 423572779, 423572779, 423572779, 423549369, 423549369, 423549369, 423549369, 423549369, 423549369, 423549369, 423456860, 423456860, 423456860, 423456860, 423456860, 423399602, 423399602, 423399602, 423399602, 423399602, 423399602, 423399602, 386733873, 386733873, 386733873, 386733873, 386733873, 386733873, 386733873, 384177910, 384177910, 384177910, 384177910, 384177910, 384177910, 384177910, 452721103, 452721103, 452721103, 452721103, 344201337, 344201337, 238801509, 238801509, 238801509, 238801509, 238801509, 238801509, 238801509, 238801507, 238801507, 238801507, 238801507, 238801507, 238801507, 238801507, 238801506, 238801506, 238801506, 238801506, 238801506, 238801506, 238801506, 238801506, 238801504, 238801504, 238801504, 238801504, 238801504, 238801504, 238801504, 238801504, 238801504, 238801484, 238801484, 238801484, 238801484, 238801484, 238801484, 238801484, 238801479, 238801479, 238801479, 238801479, 238801479, 238801479, 238801479, 238801472, 238801472, 238801472, 238801472, 238801472")
                        .split(","));
        HashMap<String, String> q = NCBIInformation.fetch("157484856");
        System.out.println(NCBIInformation.match(q, database));
        List<String> l = NCBIInformation.search("AACY020322622.1");
        System.out.println("list:" + l);
        RNA r = NCBIInformation.getSequence("157484856", 1, 100, false);
        System.out.println(r.getHeader() + "\n" + r.getPrimary());
    }

    /**
     * List variant of {@link #fetch(String[])}.
     *
     * @param list
     *            the ids to look up
     * @return see {@link #fetch(String[])}
     */
    public static HashMap<String, HashMap<String, String>> fetch(
            List<String> list) throws IOException,
            ParserConfigurationException, SAXException {
        return fetch(list.toArray(new String[list.size()]));
    }

    /**
     * Downloads the FASTA text of a sequence (strand 1) with efetch and
     * copies it line by line to the writer, ending every line with
     * <code>'\n'</code>. Both connection timeouts are set to 0 (no timeout).
     * The writer is neither flushed nor closed.
     *
     * @param string
     *            the id placed in the <code>id</code> request parameter
     * @param writer
     *            destination of the downloaded text
     */
    public static void getSequence(String string, Writer writer)
            throws IOException, ParserConfigurationException, SAXException {
        String address = EUTILS_BASE + "efetch.fcgi?db=nuccore&id=" + string
                + "&strand=1" + "&rettype=fasta&retmode=text";
        URLConnection connection = new URL(address).openConnection();
        connection.setConnectTimeout(0);
        connection.setReadTimeout(0);
        BufferedReader in = openReader(connection);
        try {
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                writer.write(inputLine);
                writer.write("\n");
            }
        } finally {
            in.close();
        }
    }

    /** Opens a buffered reader that decodes the response of the connection with {@link #CHARSET}. */
    private static BufferedReader openReader(URLConnection connection)
            throws IOException {
        return new BufferedReader(new InputStreamReader(
                connection.getInputStream(), CHARSET));
    }

    /** Reads the whole response of the address into a string, ending every line with '\n'. */
    private static String download(String address) throws IOException {
        BufferedReader in = openReader(new URL(address).openConnection());
        try {
            StringBuilder text = new StringBuilder();
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                text.append(inputLine).append('\n');
            }
            return text.toString();
        } finally {
            in.close();
        }
    }

    /** Parses XML text into a DOM document with a default, newly created builder. */
    private static Document parseXml(String xml) throws IOException,
            ParserConfigurationException, SAXException {
        DocumentBuilder builder = DocumentBuilderFactory.newInstance()
                .newDocumentBuilder();
        return builder.parse(new ByteArrayInputStream(xml.getBytes(CHARSET)));
    }

    /** Tells whether the first record has a non-null value for the field that equals the second record's value. */
    private static boolean sameField(HashMap<String, String> first,
            HashMap<String, String> second, String field) {
        String value = first.get(field);
        return value != null && value.equals(second.get(field));
    }
}