package net.sf.jabref.imports;
import net.sf.jabref.BibtexEntry;
import net.sf.jabref.Globals;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import net.sf.jabref.BibtexFields;
/**
* Importer for records downloaded from CSA: Cambridge Scientific Abstracts
* in full text format. Although the same basic format is used by all CSA
* databases, this importer has been tailored and tested to handle
* ASFA: Aquatic Sciences and Fisheries records.
*
* @author John Relph
*/
public class CsaImporter extends ImportFormat {

    // local fields

    /**
     * Line counter maintained by {@link #readLine(BufferedReader)}. It is
     * initialised to 1 before the first read and incremented after every
     * successful read, so it is always one greater than the number of the
     * line that was read last.
     */
    private int line;

    // pre-compiled patterns

    /** Field header: two upper-case letters, ": ", then the long field name. */
    private final static Pattern FIELD_PATTERN =
        Pattern.compile("^([A-Z][A-Z]): ([A-Z].*)$");
    /** Compact "vol(no):page-page" or "vol(no), page-page" form in a Source field. */
    private final static Pattern VOLNOPP_PATTERN =
        Pattern.compile("[;,\\.]\\s+(\\d+[A-Za-z]?)\\((\\d+(?:-\\d+)?)\\)(?:,\\s+|:)(\\d+-\\d+)");
    /** Page information ("pp. 1-2", "12 pp", "[np]", ...) in a Source field. */
    private final static Pattern PAGES_PATTERN =
        Pattern.compile("[;,\\.]\\s+(?:(\\[?[vn]\\.?p\\.?\\]?)|(?:pp?\\.?\\s+)(\\d+[A-Z]?(?:-\\d+[A-Z]?)?)|(\\d+[A-Z]?(?:-\\d+[A-Z]?)?)(?:\\s+pp?))");
    /** "Vol. N" in a Source field. */
    private final static Pattern VOLUME_PATTERN =
        Pattern.compile("[;,\\.]?\\s+[vV][oO][lL]\\.?\\s+(\\d+[A-Z]?(?:-\\d+[A-Z]?)?)");
    /** "No. N" / "Part N" / "NUMB N" in a Source field. */
    private final static Pattern NUMBER_PATTERN =
        Pattern.compile("[;,\\.]\\s+(?:No|no|Part|part|NUMB)\\.?\\s+([A-Z]?\\d+(?:[/-]\\d+)?)");
    /** Date in a Source field: optional day and month around a four-digit year. */
    private final static Pattern DATE_PATTERN =
        Pattern.compile("[;,\\.]\\s+(?:(\\d+)\\s)?(?:([A-Z][a-z][a-z])[\\.,]*\\s)?\\(?(\\d\\d\\d\\d)\\)?(?:\\s([A-Z][a-z][a-z]))?(?:\\s+(\\d+))?");
    /** The "[Lt]" escape, which is replaced by "<". */
    private final static Pattern LT_PATTERN =
        Pattern.compile("\\[Lt\\]");
    /** Optional comma plus whitespace at the end of the remaining Source text. */
    private final static Pattern TRAILING_PATTERN =
        Pattern.compile(",?\\s*$");

    // other constants

    /**
     * Lower-case month abbreviations, each occupying four characters
     * (three letters plus a separating blank), so that
     * <code>indexOf(abbr) / 4</code> is the index into {@link #MONTHS}.
     */
    private static final String MONS =
        "jan feb mar apr may jun jul aug sep oct nov dec";
    private static final String[] MONTHS =
        { "January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December" };

    /**
     * CSA field abbreviations whose contents are copied unchanged into the
     * BibTeX field of the mapped name.
     */
    private static final HashMap<String, String> SIMPLE_FIELDS =
        new HashMap<String, String>();
    static {
        SIMPLE_FIELDS.put("AB", "abstract");
        SIMPLE_FIELDS.put("AF", "affiliation");
        SIMPLE_FIELDS.put("CA", "organization");
        SIMPLE_FIELDS.put("DE", "keywords");
        SIMPLE_FIELDS.put("DO", "doi");
        SIMPLE_FIELDS.put("ED", "editor");
        SIMPLE_FIELDS.put("IB", "ISBN");
        SIMPLE_FIELDS.put("IS", "ISSN");
        SIMPLE_FIELDS.put("JN", "journal");
        SIMPLE_FIELDS.put("LA", "language");
        SIMPLE_FIELDS.put("PB", "publisher");
        SIMPLE_FIELDS.put("TI", "title");
    }

    /**
     * Return the name of this import format.
     */
    public String getFormatName() {
        return "CSA";
    }

    /*
     *  (non-Javadoc)
     * @see net.sf.jabref.imports.ImportFormat#getCLIId()
     */
    public String getCLIId() {
        return "csa";
    }

    /**
     * Read one line and advance the line counter when a line was available.
     *
     * @param file reader to read from
     * @return the line read, or null at end of input
     * @throws IOException if the underlying reader fails
     */
    private String readLine(BufferedReader file) throws IOException {
        String str = file.readLine();
        if (str != null) {
            line++;
        }
        return str;
    }

    /**
     * Append text to the "note" field, separating it from any existing
     * note text with a newline.
     *
     * @param hm   field map of the record being built
     * @param note text to append
     */
    private void addNote(HashMap<String, String> hm, String note) {
        StringBuilder notebuf = new StringBuilder();
        String existing = hm.get("note");
        if (existing != null) {
            notebuf.append(existing);
            notebuf.append("\n");
        }
        notebuf.append(note);
        hm.put("note", notebuf.toString());
    }

    /**
     * Extract the publication date from a Source field.
     *
     * The LAST date matching {@link #DATE_PATTERN} is used. On success the
     * "month" field is set when a month was found, the "year" field is set
     * unless it is already present (a differing year is recorded in a
     * note), and a full date including a day is recorded in a note.
     *
     * @param hm   field map of the record being built
     * @param fstr Source field text
     * @return fstr with the matched date removed, or fstr unchanged when no
     *         usable date was found
     */
    private String parseDate(HashMap<String, String> hm, String fstr) {
        // find LAST matching date in string
        int match = -1;
        Matcher pm = DATE_PATTERN.matcher(fstr);
        while (pm.find()) {
            match = pm.start();
        }
        if (match == -1) {
            return fstr;
        }
        // re-run the match at the remembered position to recover its groups
        if (!pm.find(match)) {
            return fstr;
        }

        StringBuilder date = new StringBuilder();

        // the day may precede the month or follow the year, but not both
        String day = pm.group(1);
        if (day == null) {
            day = pm.group(5);
        } else if (pm.group(5) != null) {
            return fstr; // possible day found in two places
        }
        if (day != null && !day.equals("0")) {
            date.append(day);
            date.append(" ");
        } else {
            day = null;
        }

        // the month may precede or follow the year, but not both
        String mon = pm.group(2);
        if (mon == null) {
            mon = pm.group(4);
        } else if (pm.group(4) != null) {
            return fstr; // possible month found in two places
        }
        if (mon != null) {
            int idx = MONS.indexOf(mon.toLowerCase(java.util.Locale.ENGLISH));
            if (idx == -1) { // not legal month, error
                return fstr;
            }
            date.append(mon);
            date.append(" ");
            hm.put("month", MONTHS[idx / 4]);
        } else if (day != null) { // day found but not month, error
            return fstr;
        }

        String year = pm.group(3);
        date.append(year);

        if (day != null) {
            addNote(hm, "Source Date: " + date + ".");
        }

        // check if journal year matches PY field
        String oyear = hm.get("year");
        if (oyear != null) {
            if (!year.equals(oyear)) {
                addNote(hm, "Source Year: " + year + ".");
            }
        } else {
            hm.put("year", year);
        }

        // return the text surrounding the matched date
        StringBuilder newf = new StringBuilder();
        if (pm.start() > 0) {
            newf.append(fstr.substring(0, pm.start()));
        }
        if (pm.end() < fstr.length()) {
            newf.append(fstr.substring(pm.end()));
        }
        return newf.toString();
    }

    /**
     * Map the contents of a PT (publication type) field to a BibTeX entry
     * type. The field is split on "; " and the first recognised part wins.
     * Lower-casing uses a fixed locale so the result does not depend on the
     * platform default locale.
     *
     * @param fstr contents of the PT field
     * @return the entry type name, "misc" when no part is recognised
     */
    private static String parseType(String fstr) {
        String[] types = fstr.toLowerCase(java.util.Locale.ENGLISH).split("; ");
        for (String t : types) {
            if (t.indexOf("article") >= 0) {
                return "article";
            } else if (t.equals("dissertation")) {
                return "phdthesis";
            } else if (t.equals("conference")) {
                return "inproceedings";
            } else if (t.equals("book monograph")) {
                return "book";
            } else if (t.equals("report")) {
                return "techreport";
            }
        }
        return "misc";
    }

    /**
     * Extract the URLs from the contents of an RL field. The text is split
     * on blanks; a leading "[URL:" is dropped from a token, and a token
     * ending in "]" closes a URL. Closed URLs are separated by newlines.
     * A token carrying both the "[URL:" prefix and the closing "]" is
     * handled as a complete URL.
     *
     * @param fstr contents of the RL field
     * @return the extracted URL text
     */
    private static String parseUrls(String fstr) {
        String[] tokens = fstr.split(" ");
        StringBuilder urls = new StringBuilder();
        for (int ii = 0; ii < tokens.length; ++ii) {
            String token = tokens[ii];
            if (token.startsWith("[URL:")) {
                token = token.substring(5);
            }
            if (token.endsWith("]")) {
                urls.append(token.substring(0, token.length() - 1));
                if (ii < tokens.length - 1) {
                    urls.append("\n");
                }
            } else {
                urls.append(token);
            }
        }
        return urls.toString();
    }

    /**
     * Extract journal information (volume, number, pages, date) from the
     * contents of an SO (source) field and store it in the field map.
     *
     * @param hm   field map of the record being built
     * @param fstr contents of the SO field
     * @return what remains of fstr after all recognised parts and a
     *         trailing comma/whitespace have been removed; may be empty
     */
    private String parseSource(HashMap<String, String> hm, String fstr) {
        // compact vol(no):page-page:
        Matcher pm = VOLNOPP_PATTERN.matcher(fstr);
        if (pm.find()) {
            hm.put("volume", pm.group(1));
            hm.put("number", pm.group(2));
            hm.put("pages", pm.group(3));
            fstr = pm.replaceFirst("");
        }

        // pages: collect every occurrence, removing each from the text
        StringBuilder pages = new StringBuilder();
        pm = PAGES_PATTERN.matcher(fstr);
        while (pm.find()) {
            if (pages.length() > 0) {
                pages.append(",");
            }
            String found = pm.group(1);
            if (found == null) {
                found = pm.group(2);
            }
            if (found == null) {
                found = pm.group(3);
            }
            pages.append(found);
            fstr = pm.replaceFirst("");
            pm = PAGES_PATTERN.matcher(fstr);
        }
        if (pages.length() > 0) {
            hm.put("pages", pages.toString());
        }

        // volume:
        pm = VOLUME_PATTERN.matcher(fstr);
        if (pm.find()) {
            hm.put("volume", pm.group(1));
            fstr = pm.replaceFirst("");
        }

        // number:
        pm = NUMBER_PATTERN.matcher(fstr);
        if (pm.find()) {
            hm.put("number", pm.group(1));
            fstr = pm.replaceFirst("");
        }

        // journal date:
        fstr = parseDate(hm, fstr);

        // strip trailing comma and whitespace
        return TRAILING_PATTERN.matcher(fstr).replaceFirst("");
    }

    /**
     * Build a BibtexEntry from the fields collected for one record.
     * A record without a publication type is treated as an article and
     * annotated accordingly; for articles the "booktitle" field (filled
     * from the SO field) is moved to "journal".
     *
     * @param hm   field map of the finished record; may be modified
     * @param type entry type name, or null when the record had no PT field
     * @return the new entry
     */
    private BibtexEntry createEntry(HashMap<String, String> hm, String type) {
        if (type == null) {
            addNote(hm, "Publication Type: [NOT SPECIFIED]");
            addNote(hm, "[PERHAPS NOT FULL FORMAT]");
            type = "article";
        }

        // post-process Journal article
        if (type.equals("article") && hm.get("booktitle") != null) {
            String booktitle = hm.remove("booktitle");
            hm.put("journal", booktitle);
        }

        BibtexEntry b =
            new BibtexEntry(BibtexFields.DEFAULT_BIBTEXENTRY_ID,
                            Globals.getEntryType(type));
        b.setField(hm);
        return b;
    }

    /**
     * Check whether the source is in the correct format for this importer.
     */
    public boolean isRecognizedFormat(InputStream stream) throws IOException {
        // CSA records start with "DN: Database Name"
        BufferedReader in =
            new BufferedReader(ImportFormatReader.getReaderDefaultEncoding(stream));
        String str;
        while ((str = in.readLine()) != null) {
            if (str.equals("DN: Database Name")) {
                return true;
            }
        }
        return false;
    }

    /**
     * Parse the entries in the source, and return a List of BibtexEntry
     * objects.
     *
     * Records are separated by empty lines. Each field consists of a
     * header line ("XX: Long Name") followed by one or more content lines
     * that start with a blank; content lines are joined with single
     * blanks. Every record must start with the "DN: Database Name" field.
     * Fields without a known BibTeX counterpart are appended to the
     * "note" field; the RE (references) field is discarded.
     *
     * @throws IOException on read errors, on a field without contents, or
     *         when the DN field is missing or appears out of order
     */
    public List<BibtexEntry> importEntries(InputStream stream) throws IOException {
        ArrayList<BibtexEntry> bibitems = new ArrayList<BibtexEntry>();
        StringBuilder sb = new StringBuilder();
        HashMap<String, String> hm = new HashMap<String, String>();
        BufferedReader in =
            new BufferedReader(ImportFormatReader.getReaderDefaultEncoding(stream));

        String type = null;
        boolean first = true;
        line = 1;
        String str = readLine(in);
        while (true) {
            if (str == null || str.length() == 0) { // end of record
                if (!hm.isEmpty()) { // have a record
                    bibitems.add(createEntry(hm, type));
                }
                hm.clear(); // ready for next record
                // the type belongs to the finished record; do not let a
                // following record without a PT field inherit it
                type = null;
                first = true;
                if (str == null) {
                    break; // end of file
                }
                str = readLine(in);
                continue;
            }

            Matcher fm = FIELD_PATTERN.matcher(str);
            if (!fm.find()) {
                // not a field header: skip the line
                str = readLine(in);
                continue;
            }

            // "line" is one past the line just read, i.e. past this header
            int fline = line - 1;

            // save the field name (long and short)
            String fabbr = fm.group(1);
            String fname = fm.group(2);

            // read the contents of the field
            sb.setLength(0); // clear the buffer
            while ((str = readLine(in)) != null) {
                if (!str.startsWith(" ")) { // field contents?
                    break; // nope
                }
                if (sb.length() > 0) {
                    sb.append(" ");
                }
                // skip the four-character indent; a shorter line cannot
                // hold a full indent, so just strip its whitespace
                sb.append(str.length() >= 4 ? str.substring(4) : str.trim());
            }
            String fstr = sb.toString();
            if (fstr.length() == 0) {
                int line1 = line - 1;
                throw new IOException("illegal empty field at line " +
                                      line1);
            }

            // replace [Lt] with <
            fstr = LT_PATTERN.matcher(fstr).replaceAll("<");

            // check for start of new record
            if (fabbr.equals("DN") &&
                fname.equalsIgnoreCase("Database Name")) {
                if (!first) {
                    throw new IOException("format error at line " + fline +
                                          ": DN out of order");
                }
                first = false;
            } else if (first) {
                throw new IOException("format error at line " + fline +
                                      ": missing DN");
            }

            // the PT field sets the entry type and is also kept as a note
            if (fabbr.equals("PT")) {
                type = parseType(fstr);
            }

            String ftype = SIMPLE_FIELDS.get(fabbr);
            if (fabbr.equals("AU")) {
                ftype = "author";
                fstr = fstr.replaceAll("; ", " and ");
            } else if (fabbr.equals("PY")) {
                ftype = "year";
                // a year taken earlier from the Source field is replaced;
                // keep it in a note when it differs
                String oyear = hm.get("year");
                if (oyear != null && !fstr.equals(oyear)) {
                    addNote(hm, "Source Year: " + oyear + ".");
                }
            } else if (fabbr.equals("RL")) {
                ftype = "url";
                fstr = parseUrls(fstr);
            } else if (fabbr.equals("SO")) {
                ftype = "booktitle";
                // see if we can extract journal information
                fstr = parseSource(hm, fstr);
                if (fstr.equals("")) {
                    continue; // nothing left to store
                }
            } else if (fabbr.equals("RE")) {
                continue; // throw away References
            }

            if (ftype != null) {
                hm.put(ftype, fstr);
            } else {
                addNote(hm, fname + ": " + fstr + ".");
            }
        }

        return bibitems;
    }
}