/*
Copyright (C) 2005 Andreas Rudert
All programs in this directory and
subdirectories are published under the GNU General Public License as
described below.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
USA
Further information about the GNU GPL is available at:
http://www.gnu.org/copyleft/gpl.ja.html
*/
package net.sf.jabref.imports;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import net.sf.jabref.BibtexEntry;
import net.sf.jabref.BibtexEntryType;
import net.sf.jabref.Util;
/**
* Imports a New Economics Papers-Message from the REPEC-NEP Service.
*
* <p><a href="http://www.repec.org">RePEc</a> (Research Papers in Economics)
* is a collaborative effort of over 100 volunteers in 49 countries
* to enhance the dissemination of research in economics. The heart of
* the project is a decentralized database of working papers, journal
* articles and software components. All RePEc material is freely available.
* At the time of writing RePEc holds over 300.000 items.</p>
*
* <p><a href="http://nep.repec.org">NEP</a> (New Economics Papers) is an announcement
* service which filters information on new additions to RePEc into edited
* reports. The goal is to provide subscribers with up-to-date information
* on the research literature.</p>
*
* <p>This importer is capable of importing NEP messages into JabRef.</p>
*
* <p>There is no officially defined message format for NEP. NEP messages are assumed to have
* (and almost always have) the form given by the following semi-formal grammar:
* <pre>
* NEPMessage:
* MessageSection NEPMessage
* MessageSection
*
* MessageSection:
* OverviewMessageSection
* OtherMessageSection
*
* # we skip the overview
* OverviewMessageSection:
* 'In this issue we have: ' SectionSeparator OtherStuff
*
* OtherMessageSection:
* SectionSeparator OtherMessageSectionContent
*
* # we skip other stuff and read only full working paper references
* OtherMessageSectionContent:
* WorkingPaper EmptyLine OtherMessageSectionContent
* OtherStuff EmptyLine OtherMessageSectionContent
* ''
*
* OtherStuff:
* NonEmptyLine OtherStuff
* NonEmptyLine
*
* NonEmptyLine:
* a non-empty String that does not start with a number followed by a '.'
*
* # working papers are recognized by a number followed by a '.'
* # in a non-overview section
* WorkingPaper:
* Number'.' WhiteSpace TitleString EmptyLine Authors EmptyLine Abstract AdditionalFields
* Number'.' WhiteSpace TitleString AdditionalFields Abstract AdditionalFields
*
* TitleString:
* a String that may span several lines and should be joined
*
* # there must be at least one author
* Authors:
* Author '\n' Authors
* Author '\n'
*
* # optionally, an institution is given for an author
* Author:
* AuthorName
* AuthorName '(' Institution ')'
*
* # there are no rules about the name, it may be firstname lastname or lastname, firstname or anything else
* AuthorName:
* a non-empty String without '(' or ')' characters, not spanning more than one line
*
* Institution:
* a non-empty String that may span several lines
*
* Abstract:
* a (possibly empty) String that may span several lines
*
* AdditionalFields:
* AdditionalField '\n' AdditionalFields
* EmptyLine AdditionalFields
* ''
*
* AdditionalField:
* 'Keywords:' KeywordList
* 'URL:' non-empty String
* 'Date:' DateString
* 'JEL:' JelClassificationList
* 'By:' Authors
*
* KeywordList:
* Keyword ',' KeywordList
* Keyword ';' KeywordList
* Keyword
*
* Keyword:
* non-empty String that does not contain ',' or ';' (may contain whitespace)
*
* # if no date is given, the current year as given by the system clock is assumed
* DateString:
* 'yyyy-MM-dd'
* 'yyyy-MM'
* 'yyyy'
*
* JelClassificationList:
* JelClassification JelClassificationList
* JelClassification
*
* # the JEL Classifications are set into a new BIBTEX-field 'jel'
* # they will appear if you add it as a field to one of the BIBTex Entry sections
* JelClassification:
* one of the allowed classes, see http://ideas.repec.org/j/
*
* SectionSeparator:
* '\n-----------------------------'
* </pre>
* </p>
*
* @see <a href="http://nep.repec.org">http://nep.repec.org</a>
* @author andreas_sf at rudert-home dot de
*/
public class RepecNepImporter extends ImportFormat {

    /** Field names that may introduce an "AdditionalField" line (the text before the first ':'). */
    private final static Collection<String> recognizedFields =
        Arrays.asList(new String[]{"Keywords", "JEL", "Date", "URL", "By"});

    /** Number of lines read so far; only used for error messages. */
    private int line = 0;

    /** The line currently being examined; <code>null</code> once the end of input is reached. */
    private String lastLine = "";

    /** The line that was current before the last call to {@link #readLine()}. */
    private String preLine = "";

    /** Reader over the message currently being imported. */
    private BufferedReader in = null;

    /** <code>true</code> while inside the "In this issue we have" overview section, which is skipped. */
    private boolean inOverviewSection = false;

    /**
     * Return the name of this import format.
     */
    public String getFormatName() {
        return "REPEC New Economic Papers (NEP)";
    }

    /*
     * (non-Javadoc)
     * @see net.sf.jabref.imports.ImportFormat#getCLIId()
     */
    public String getCLIId() {
        return "repecnep";
    }

    /*
     * (non-Javadoc)
     * @see net.sf.jabref.imports.ImportFormat#getExtensions()
     */
    public String getExtensions() {
        return ".txt";
    }

    /*
     * (non-Javadoc)
     * @see net.sf.jabref.imports.ImportFormat#getDescription()
     */
    public String getDescription() {
        return
            "Imports a New Economics Papers-Message (see http://nep.repec.org)\n"
            + "from the REPEC-NEP Service (see http://www.repec.org).\n"
            + "To import papers either save a NEP message as a text file and then import or\n"
            + "copy&paste the papers you want to import and make sure, one of the first lines\n"
            + "contains the line \"nep.repec.org\".";
    }

    /**
     * Checks whether the stream looks like a NEP message.
     *
     * <p>Up to the first 25 lines are read and searched for one of the marker
     * strings 'NEP: New Economics Papers' or 'nep.repec.org'.</p>
     *
     * @param stream the input to inspect
     * @return <code>true</code> if one of the markers was found
     * @throws IOException if reading from the stream fails
     * @see net.sf.jabref.imports.ImportFormat#isRecognizedFormat(java.io.InputStream)
     */
    public boolean isRecognizedFormat(InputStream stream) throws IOException {
        BufferedReader reader = new BufferedReader(ImportFormatReader.getReaderDefaultEncoding(stream));
        StringBuilder startOfMessage = new StringBuilder();
        String currentLine = reader.readLine();
        for (int i = 0; i < 25 && currentLine != null; i++) {
            startOfMessage.append(currentLine);
            currentLine = reader.readLine();
        }
        return startOfMessage.indexOf("NEP: New Economics Papers") >= 0
            || startOfMessage.indexOf("nep.repec.org") >= 0;
    }

    /**
     * Tests whether the current line starts with one of the given keywords
     * followed by a ':'.
     *
     * @param keywords the keywords to look for
     * @return <code>true</code> if the text before the first ':' is one of the keywords;
     *         <code>false</code> otherwise, including at the end of input
     */
    private boolean startsWithKeyword(Collection<String> keywords) {
        if (this.lastLine == null) {
            return false;
        }
        int colon = this.lastLine.indexOf(':');
        return colon > 0 && keywords.contains(this.lastLine.substring(0, colon));
    }

    /**
     * Advances to the next input line, remembering the previous one in {@link #preLine}.
     *
     * @throws IOException if reading fails
     */
    private void readLine() throws IOException {
        this.line++;
        this.preLine = this.lastLine;
        this.lastLine = this.in.readLine();
    }

    /**
     * Read multiple lines.
     *
     * <p>Starting with the current line, reads lines until either
     * <ul>
     *   <li>an empty line</li>
     *   <li>the end of file</li>
     *   <li>the next working paper or</li>
     *   <li>a keyword</li>
     * </ul>
     * is found. Whitespace at start or end of lines is trimmed and the lines are
     * joined with one blank character.</p>
     *
     * @return the joined lines; an empty string if the end of input was already reached
     * @throws IOException if reading fails
     */
    private String readMultipleLines() throws IOException {
        if (this.lastLine == null) {
            return "";
        }
        StringBuilder result = new StringBuilder(this.lastLine.trim());
        readLine();
        while (this.lastLine != null
                && !this.lastLine.trim().equals("")
                && !startsWithKeyword(recognizedFields)
                && !isStartOfWorkingPaper()) {
            // no separator in front of the first non-empty piece
            if (result.length() > 0) {
                result.append(' ');
            }
            result.append(this.lastLine.trim());
            readLine();
        }
        return result.toString();
    }

    /**
     * Implements grammar rule "TitleString".
     *
     * @param be the entry that receives the "title" field
     * @throws IOException if reading fails
     */
    private void parseTitleString(BibtexEntry be) throws IOException {
        // skip article number
        this.lastLine = this.lastLine.substring(this.lastLine.indexOf('.') + 1);
        be.setField("title", readMultipleLines());
    }

    /**
     * Implements grammar rule "Authors".
     *
     * <p>Each line holds one author, optionally followed by an institution in
     * parentheses which may continue on the following lines. Authors and
     * institutions are each joined with " and ".</p>
     *
     * @param be the entry that receives the "author" and "institution" fields
     * @throws IOException if reading fails
     */
    private void parseAuthors(BibtexEntry be) throws IOException {
        StringBuilder authors = new StringBuilder();
        StringBuilder institutions = new StringBuilder();
        while (this.lastLine != null
                && !this.lastLine.trim().equals("")
                && !startsWithKeyword(recognizedFields)) {
            // read single author
            String author;
            StringBuilder institution = null;
            boolean institutionDone;
            int open = this.lastLine.indexOf('(');
            if (open >= 0) {
                author = this.lastLine.substring(0, open).trim();
                // only a ')' after the '(' closes the institution
                int close = this.lastLine.indexOf(')', open + 1);
                institutionDone = close >= 0;
                int end = institutionDone ? close : this.lastLine.length();
                institution = new StringBuilder(this.lastLine.substring(open + 1, end).trim());
            } else {
                author = this.lastLine.trim();
                institutionDone = true;
            }
            readLine();
            // institution continues on the following lines until the closing ')'
            while (!institutionDone && this.lastLine != null) {
                int close = this.lastLine.indexOf(')');
                institutionDone = close >= 0;
                String part = this.lastLine.substring(0, institutionDone ? close : this.lastLine.length()).trim();
                if (part.length() > 0) {
                    if (institution.length() > 0) {
                        institution.append(' ');
                    }
                    institution.append(part);
                }
                readLine();
            }
            if (author.length() > 0) {
                if (authors.length() > 0) {
                    authors.append(" and ");
                }
                authors.append(author);
            }
            if (institution != null && institution.length() > 0) {
                if (institutions.length() > 0) {
                    institutions.append(" and ");
                }
                institutions.append(institution);
            }
        }
        if (authors.length() > 0) {
            be.setField("author", authors.toString());
        }
        if (institutions.length() > 0) {
            be.setField("institution", institutions.toString());
        }
    }

    /**
     * Implements grammar rule "Abstract".
     *
     * @param be the entry that receives the "abstract" field (only set if non-empty)
     * @throws IOException if reading fails
     */
    private void parseAbstract(BibtexEntry be) throws IOException {
        String theabstract = readMultipleLines();
        if (!theabstract.equals("")) {
            be.setField("abstract", theabstract);
        }
    }

    /**
     * Implements grammar rule "AdditionalFields".
     *
     * @param be the entry that receives the parsed fields
     * @param multilineUrlFieldAllowed whether a URL field may continue on following lines
     * @throws IOException if reading fails
     */
    private void parseAdditionalFields(BibtexEntry be, boolean multilineUrlFieldAllowed) throws IOException {
        // one empty line is possible before fields start
        if (this.lastLine != null && this.lastLine.trim().equals("")) {
            readLine();
        }
        // read other fields
        while (this.lastLine != null && !isStartOfWorkingPaper()
                && (startsWithKeyword(recognizedFields) || this.lastLine.trim().equals(""))) {
            boolean blank = this.lastLine.trim().equals("");
            int colon = this.lastLine.indexOf(':');
            String keyword = blank ? "" : this.lastLine.substring(0, colon).trim();
            // skip keyword, so that the current line holds only the field content
            this.lastLine = blank ? "" : this.lastLine.substring(colon + 1).trim();

            if (keyword.equals("Keywords")) {
                String[] keywords = readMultipleLines().split("[,;]");
                StringBuilder keywordStr = new StringBuilder();
                for (int i = 0; i < keywords.length; i++) {
                    keywordStr.append(" '").append(keywords[i].trim()).append("'");
                }
                be.setField("keywords", keywordStr.toString().trim());
            } else if (keyword.equals("JEL")) {
                be.setField("jel", readMultipleLines());
            } else if (keyword.equals("Date")) {
                String content = readMultipleLines();
                String[] recognizedDateFormats = new String[] {"yyyy-MM-dd", "yyyy-MM", "yyyy"};
                Date date = null;
                String matchedFormat = null;
                for (int i = 0; i < recognizedDateFormats.length && date == null; i++) {
                    try {
                        date = new SimpleDateFormat(recognizedDateFormats[i]).parse(content);
                        matchedFormat = recognizedDateFormats[i];
                    } catch (ParseException ignored) {
                        // wrong format, try the next one
                    }
                }
                // an unparsable date falls back to the current year
                Calendar cal = new GregorianCalendar();
                cal.setTime(date != null ? date : new Date());
                be.setField("year", "" + cal.get(Calendar.YEAR));
                if (matchedFormat != null && matchedFormat.indexOf("MM") >= 0) {
                    // Calendar.MONTH is zero-based
                    be.setField("month", "" + (cal.get(Calendar.MONTH) + 1));
                }
            } else if (keyword.equals("URL")) {
                String content;
                if (multilineUrlFieldAllowed) {
                    content = readMultipleLines();
                } else {
                    content = this.lastLine;
                    readLine();
                }
                be.setField("url", content);
            } else if (keyword.equals("By")) {
                parseAuthors(be);
            } else {
                // empty line between fields
                readLine();
            }
        }
    }

    /**
     * If the current line starts with a string of the form 'x. ', follows an empty
     * line and we are not in the overview section, we have a working paper entry
     * we are interested in.
     *
     * @return <code>true</code> if the current line starts a working paper;
     *         <code>false</code> otherwise, including at the end of input
     */
    private boolean isStartOfWorkingPaper() {
        return this.lastLine != null
            && this.preLine != null
            && !this.inOverviewSection
            && this.preLine.trim().equals("")
            && this.lastLine.matches("\\d+\\.\\s.*");
    }

    /**
     * Parses a NEP message and returns one "techreport" entry per working paper found.
     *
     * @param stream the message to import
     * @return the imported entries, possibly empty
     * @throws IOException if reading fails or the message cannot be parsed; parse
     *         errors are wrapped and carry the line and paper number in their message
     * @see net.sf.jabref.imports.ImportFormat#importEntries(java.io.InputStream)
     */
    public List<BibtexEntry> importEntries(InputStream stream) throws IOException {
        List<BibtexEntry> bibitems = new ArrayList<BibtexEntry>();
        String paperNoStr = null;
        // reset parser state so that an importer instance can be reused
        this.line = 0;
        this.lastLine = "";
        this.preLine = "";
        this.inOverviewSection = false;
        try {
            this.in = new BufferedReader(ImportFormatReader.getReaderDefaultEncoding(stream));
            readLine(); // skip header and editor information
            while (this.lastLine != null) {
                if (this.lastLine.startsWith("-----------------------------")) {
                    this.inOverviewSection = this.preLine.startsWith("In this issue we have");
                }
                if (isStartOfWorkingPaper()) {
                    BibtexEntry be = new BibtexEntry(Util.createNeutralId());
                    be.setType(BibtexEntryType.getType("techreport"));
                    paperNoStr = this.lastLine.substring(0, this.lastLine.indexOf('.'));
                    parseTitleString(be);
                    if (startsWithKeyword(recognizedFields)) {
                        parseAdditionalFields(be, false);
                    } else {
                        readLine(); // skip empty line
                        parseAuthors(be);
                        // skip the empty line after the authors, but do not drop a
                        // field line that follows them directly
                        if (this.lastLine != null && this.lastLine.trim().equals("")) {
                            readLine();
                        }
                    }
                    if (!startsWithKeyword(recognizedFields)) {
                        parseAbstract(be);
                    }
                    parseAdditionalFields(be, true);
                    bibitems.add(be);
                    paperNoStr = null;
                } else {
                    readLine();
                }
            }
        } catch (Exception e) {
            String message = "Error in REPEC-NEP import on line " + this.line;
            if (paperNoStr != null) {
                message += ", paper no. " + paperNoStr;
            }
            message += ": " + e.getMessage();
            System.err.println(message);
            if (e instanceof IOException) {
                throw (IOException) e;
            }
            e.printStackTrace();
            IOException wrapped = new IOException(message);
            wrapped.initCause(e); // keep the original cause for diagnosis
            throw wrapped;
        } finally {
            // do not keep a reference to the caller's stream
            this.in = null;
        }
        return bibitems;
    }
}