/*
* JDBCCorpus.java
*
* Copyright (c) 2010, Austrian Research Institute for
* Artificial Intelligence (OFAI)
*
* This file is free
* software, licenced under the GNU General Public License,
*
* Johann Petrak, 30/8/2010
*
* $Id: JDBCCorpus.java 124 2014-04-24 18:23:51Z johann.petrak $
*/
package at.ofai.gate.virtualcorpus;
import java.io.FileFilter;
import java.io.IOException;
import java.net.URL;
import java.sql.SQLException;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.util.ListIterator;
import java.util.Iterator;
import java.util.Collection;
import java.util.HashMap;
import java.util.Properties;
import gate.*;
import gate.corpora.CorpusImpl;
import gate.corpora.DocumentImpl;
import gate.creole.*;
import gate.creole.metadata.*;
import gate.event.CorpusEvent;
import gate.event.CorpusListener;
import gate.event.CreoleEvent;
import gate.event.CreoleListener;
import gate.persist.PersistenceException;
import gate.util.*;
import gate.util.persistence.PersistenceManager;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.io.IOUtils;
/**
* A Corpus LR that mirrors documents stored in a JDBC database table field.
* The table must have a unique id field which will serve as the document
* name. The JDBC corpus can be used in two different modes: in-place update
* and from-to. In in-place update mode, only a single field is used for the
* database content and the field must already contain documents in GATE xml
* format. In from-to mode, the input field can contain documents in other
* formats and the processing result is stored in GATE xml format to a different
* output field in the same row. In this mode, if the input document format is
* not GATE xml, the mime type and encoding fields should be specified.
* The list of documents shown in this corpus can be arbitrarily created from
* a SQL query over any table in the database which returns the IDs of documents
* to be taken from the document table. If the SQL query field is empty, all
* the IDs from the document table are taken (or limited to whatever is
* specified for the limit field).
* <p>
* This LR does not implement the following methods:
* <ul>
* <li>toArray: none of the toArray methods is implemented.
* <li>inserting new documents: TODO
* </ul>
* This corpus LR automatically uses a "dummy datastore" internally.
* This datastore is created and removed automatically when the corpus LR is
* created and removed. This datastore cannot be used for anything useful, it
* does not allow listing of resources or storing of anything but documents
* that are already in the corpus. It is mainly here because GATE assumes that
* documents are either transient or from a datastore. To avoid documents from
* a JDBCCorpus to get treated as transient documents, their DataStore is
* set to this dummy DataStore.
* <p>
* A document will always be saved to the original table field or the field
* specified as outDocumentContentField whenever the document is synced or
* unloaded.
* <p>
* NOTE: there are some situations where the JDBC corpus will throw an
* exception:
* <ul>
* <li>if you use the select statement to only show a subset of
* documents in the corpus, adding a document that has a name that is already
* in the table, but not shown in the corpus, will result in an error.</li>
* <li>The plugin does not check which SQL field type is used for any the fields
* specified. It is expected that all the fields can be converted to and from
* a Java String. For the document name, a numeric field type can be used,
* but only if document names can be converted to a number.</li>
* <li>If a document is added, only the id and content or outcontent fields
* are actually filled with some value. This will onlys succeed if there is no
* other field or no other field that does not have a default value.
* (in other words, be sure that all other fields do have some default value
* set when the table is created).</li>
* <lr> this LR does not allow multi-threaded use! It does NOT allow being
* used for getting or saving documents and getting serialized (saved as
* part of a .gapp file) at the same time!</lr>
* </ul>
* @author Johann Petrak
*/
// TODO:
// Think about if and how to implement document removal!
// Proposal: if saving is disabled, just remove from the corpus, if saving
// is enabled, actually remove the row from the table.
// TODO: check how problems are handled if a document cannot be parsed.
@CreoleResource(
name = "JDBCCorpus",
interfaceName = "gate.Corpus",
icon = "corpus",
helpURL = "http://code.google.com/p/gateplugin-virtualcorpus/wiki/JDBCCorpusUsage",
comment = "A corpus backed by GATE documents stored in a JDBC table")
public class JDBCCorpus
extends VirtualCorpus
implements Corpus, CreoleListener
{
//*****
// Fields
//******
/**
*
*/
private static final long serialVersionUID = -8485133333415382902L;
// for accessing document name by index
protected List<String> documentNames = new ArrayList<String>();
// for checking if ith document is loaded
protected List<Boolean> isLoadeds = new ArrayList<Boolean>();
// for finding index for document name
//REMOVE protected Map<String,Integer> documentIndexes = new HashMap<String,Integer>();
protected Map<String,Document> loadedDocuments = new HashMap<String,Document>();
protected List<CorpusListener> listeners = new ArrayList<CorpusListener>();
//***************
// Parameters
//***************
// JDBC Driver class name (default: mysql)
// JDBC url - this should include the database, userid and password
// document table
// document content field name
// document id field name
// output document content field name
// input document mime type field name or a constant enclosed in quotes
// input document encoding field name or a constant enclosed in quotes
// document id SQL query (default: select $docid from $doctable
// doUpdates (boolean, default true)
@CreoleParameter(comment = "The JDBC database driver class name",
defaultValue = "com.mysql.jdbc.Driver")
public void setDriverClassName(String name) {
this.driverClassName = name;
}
/**
*/
public String getDriverClassName() {
return this.driverClassName;
}
protected String driverClassName = "com.mysql.jdbc.Driver";
/**
*/
@CreoleParameter(comment = "The JDBC connection URL",
defaultValue = "jdbc:mysql://localhost:3306/database?user=user&password=pass")
public void setConnectionURL(String connectionURL) {
this.connectionURL = connectionURL;
}
/**
*/
public String getConnectionURL() {
return this.connectionURL;
}
protected String connectionURL = "jdbc:mysql://localhost:3306/dbname?user=username&password=pwd";
@CreoleParameter(comment = "The database table name")
public void setTableName(String name) {
this.tableName = name;
}
/**
*/
public String getTableName() {
return this.tableName;
}
protected String tableName;
@CreoleParameter(comment = "The document id/name field name")
public void setDocumentNameField(String fieldname) {
this.documentNameField = fieldname;
}
/**
*/
public String getDocumentNameField() {
return this.documentNameField;
}
protected String documentNameField;
@CreoleParameter(comment = "The document content field name")
public void setDocumentContentField(String fieldname) {
this.documentContentField = fieldname;
}
/**
*/
public String getDocumentContentField() {
return this.documentContentField;
}
protected String documentContentField;
@CreoleParameter(comment = "The output document content field name")
@Optional
public void setOutDocumentContentField(String fieldname) {
this.outDocumentContentField = fieldname;
}
/**
* @return
*/
public String getOutDocumentContentField() {
return this.outDocumentContentField;
}
protected String outDocumentContentField = "";
@CreoleParameter(
comment = "Mime type of input content - field name or type in quotes",
defaultValue = "")
@Optional
public void setMimeTypeField(String fieldname) {
this.mimeTypeField = fieldname;
}
/**
* @return
*/
public String getMimeTypeField() {
return this.mimeTypeField;
}
protected String mimeTypeField = "";
@CreoleParameter(
comment = "Encoding of input content - field name or type in quotes",
defaultValue = "")
@Optional
public void setEncodingField(String fieldname) {
this.encodingField = fieldname;
}
/**
* @return
*/
public String getEncodingField() {
return this.encodingField;
}
protected String encodingField = "";
@CreoleParameter(comment = "SQL Query for selecting the set of document ids/names",
defaultValue = "SELECT ${documentNameField} from ${tableName}")
@Optional
public void setSelectSQL(String sql) {
this.selectSQL = sql;
}
/**
* @return
*/
public String getSelectSQL() {
return this.selectSQL;
}
protected String selectSQL = "SELECT ${documentNameField} from ${tableName}";
@CreoleParameter(comment = "URL of a property file for setting the other parameters")
@Optional
public void setInitParmsFileURL(URL url) {
initParmsFileURL = url;
}
public URL getInitParmsFileURL() {
return initParmsFileURL;
}
protected URL initParmsFileURL;
// fields
DummyDataStore4JDBCCorp ourDS = null;
Connection dbConnection = null;
PreparedStatement getContentStatement = null;
PreparedStatement getEncodingStatement = null;
PreparedStatement getMimeTypeStatement = null;
PreparedStatement updateContentStatement = null;
PreparedStatement insertContentStatement = null;
PreparedStatement deleteRowStatement = null;
boolean haveMimeTypeField = false;
boolean haveMimeTypeConstant = false;
boolean haveEncodingField = false;
boolean haveEncodingConstant = false;
String mimeType = "application/xml";
String encoding = "utf-8";
@Override
/**
* Initializes the JDBCCorpus LR
*/
public Resource init()
throws ResourceInstantiationException {
if(initParmsFileURL != null) {
File initparmsfile = gate.util.Files.fileFromURL(initParmsFileURL);
Properties props = new Properties();
try {
props.load(new FileInputStream(initparmsfile));
} catch (IOException ex) {
throw new ResourceInstantiationException(
"Could not read propertiles from file "+initParmsFileURL,ex);
}
for (String prop : props.stringPropertyNames()) {
String val = props.getProperty(prop);
System.out.println("Setting parameter "+prop+"="+val);
// TODO: we could make this dynamically adjust to our parameters by inspection
if(prop.equals("connectionURL")) {
setConnectionURL(val);
} else if(prop.equals("documentContentField")) {
setDocumentContentField(val);
} else if(prop.equals("documentNameField")) {
setDocumentNameField(val);
} else if(prop.equals("driverClassName")) {
setDriverClassName(val);
} else if(prop.equals("encodingField")) {
setEncodingField(val);
} else if(prop.equals("mimeTypeField")) {
setMimeTypeField(val);
} else if(prop.equals("outDocumentContentField")) {
setOutDocumentContentField(val);
} else if(prop.equals("removeDocuments")) {
setRemoveDocuments(Boolean.valueOf(val));
} else if(prop.equals("saveDocuments")) {
setSaveDocuments(Boolean.valueOf(val));
} else if(prop.equals("selectSQL")) {
setSelectSQL(val);
} else if(prop.equals("tableName")) {
setTableName(val);
} else if(prop.equals("transientCorpus")) {
setTransientCorpus(Boolean.valueOf(val));
} else if(prop.equals("useCompression")) {
setUseCompression(Boolean.valueOf(val));
} else if(prop.equals("compressOnCopy")) {
setCompressOnCopy(Boolean.valueOf(val));
} else {
System.err.println("Parameter "+prop+"="+val+" not found");
}
}
}
if(getTableName() == null || getTableName().equals("")) {
throw new ResourceInstantiationException("tableName must not be empty");
}
if(getDocumentNameField() == null || getDocumentNameField().equals("")) {
throw new ResourceInstantiationException("documentNameField must not be empty");
}
if(getDocumentContentField() == null || getDocumentContentField().equals("")) {
throw new ResourceInstantiationException("documentContentField must not be empty");
}
if(getSelectSQL() == null || getSelectSQL().equals("")) {
throw new ResourceInstantiationException("selectSQL must not be empty");
}
String query = getSelectSQL(); // this contains the ${tableName} and ${documentNameField} vars
query = query.replaceAll(Pattern.quote("${tableName}"), getTableName());
query = query.replaceAll(Pattern.quote("${documentNameField}"), getDocumentNameField());
try {
Class.forName(getDriverClassName());
dbConnection = DriverManager.getConnection(getConnectionURL());
} catch (Exception ex) {
throw new ResourceInstantiationException("Could not get driver/connection",ex);
}
Statement stmt = null;
try {
stmt = dbConnection.createStatement();
ResultSet rs = null;
rs = stmt.executeQuery(query);
int i = 0;
while(rs.next()) {
String docName = rs.getString(getDocumentNameField());
documentNames.add(docName);
isLoadeds.add(false);
documentIndexes.put(docName, i);
i++;
}
} catch(SQLException ex) {
throw new ResourceInstantiationException("Problem accessing database",ex);
}
try {
PersistenceManager.registerPersistentEquivalent(
at.ofai.gate.virtualcorpus.JDBCCorpus.class,
at.ofai.gate.virtualcorpus.JDBCCorpusPersistence.class);
} catch (PersistenceException e) {
throw new ResourceInstantiationException(
"Could not register persistence",e);
}
if (!isTransientCorpus) {
try {
// TODO: use more fields or a hash to make this unique?
ourDS =
(DummyDataStore4JDBCCorp) Factory.createDataStore("at.ofai.gate.virtualcorpus.DummyDataStore4JDBCCorp", getConnectionURL() + "//" + getTableName());
ourDS.setName("DummyDS4_" + this.getName());
ourDS.setComment("Dummy DataStore for JDBCCorpus " + this.getName());
ourDS.setCorpus(this);
//System.err.println("Created dummy corpus: "+ourDS+" with name "+ourDS.getName());
} catch (Exception ex) {
throw new ResourceInstantiationException(
"Could not create dummy data store", ex);
}
}
Gate.getCreoleRegister().addCreoleListener(this);
// check if we have field names or constants or nothing at all for
// mime type and encoding
String mt = getMimeTypeField();
if(mt.length() > 2 && mt.startsWith("\"") && mt.endsWith("\"")) {
haveMimeTypeConstant = true;
mimeType = mt.substring(1,mt.length()-1);
System.out.println("Have constant mime type: "+mimeType);
} else if(mt.length() > 0) {
haveMimeTypeField = true;
System.out.println("Have mime type field: "+getMimeTypeField());
} else {
System.out.println("No mime type field and no constant mimetype specified");
}
String enc = getEncodingField();
if(enc.length() > 2 && enc.startsWith("\"") && enc.endsWith("\"")) {
haveEncodingConstant = true;
encoding = enc.substring(1,enc.length()-1);
System.out.println("Have constant encoding: "+encoding);
} else if (enc.length() > 0) {
haveEncodingField = true;
System.out.println("Have encoding field: "+getEncodingField());
} else {
System.out.println("No encoding field and no constant encoding specified");
}
// create all the prepared statements we need for accessing stuff in the db
try {
query = "SELECT "+getDocumentContentField()+" FROM "+
getTableName()+" WHERE "+getDocumentNameField()+" = ?";
System.out.println("Preparing get document statement: "+query);
getContentStatement = dbConnection.prepareStatement(query);
if(haveEncodingField) {
query = "SELECT "+getEncodingField()+" FROM "+
getTableName()+" WHERE "+getDocumentNameField()+" = ?";
System.out.println("Preparing get encoding statement: "+query);
getEncodingStatement = dbConnection.prepareStatement(query);
}
if(haveMimeTypeField) {
query = "SELECT "+getMimeTypeField()+" FROM "+
getTableName()+" WHERE "+getDocumentNameField()+" = ?";
System.out.println("Preparing get mimetype statement: "+query);
getMimeTypeStatement = dbConnection.prepareStatement(query);
}
String outfield = getDocumentContentField();
if(getOutDocumentContentField() != null && !getOutDocumentContentField().equals("")) {
outfield = getOutDocumentContentField();
}
query = "UPDATE "+getTableName()+" SET "+outfield+" = ? "+
" WHERE "+getDocumentNameField()+" = ?";
System.out.println("Preparing update document statement: "+query);
updateContentStatement = dbConnection.prepareStatement(query);
// for the insertion we need to prepare the statement depending on
// whether the encoding and/or mimetype fields are there
if(haveEncodingField) {
if(haveMimeTypeField) {
// both encoding and mimetype fields
query = "INSERT INTO "+getTableName()+
" ( "+getDocumentNameField()+","+outfield+","+encodingField+
","+mimeTypeField+" ) VALUES ( ?,?,?,? )";
} else {
// encoding field but no mime type field
query = "INSERT INTO "+getTableName()+
" ( "+getDocumentNameField()+","+outfield+","+encodingField+" ) VALUES ( ?,?,? )";
}
} else {
if(haveMimeTypeField) {
// no encoding but mimetype
query = "INSERT INTO "+getTableName()+
" ( "+getDocumentNameField()+","+outfield+","+mimeTypeField+" ) VALUES ( ?,?,? )";
} else {
// no encoding and no mime type
query = "INSERT INTO "+getTableName()+
" ( "+getDocumentNameField()+","+outfield+") VALUES ( ?,? )";
}
}
System.out.println("Preparing insert statement: "+query);
insertContentStatement = dbConnection.prepareStatement(query);
query = "DELETE FROM "+getTableName()+
" WHERE "+getDocumentNameField()+" = ?";
System.out.println("Preparing delete statement: "+query);
deleteRowStatement = dbConnection.prepareStatement(query);
} catch (SQLException ex) {
throw new ResourceInstantiationException("Could not prepare statement",ex);
}
return this;
}
/**
* Test is the document with the given index is loaded. If an index is
* specified that is not in the corpus, a GateRuntimeException is thrown.
*
* @param index
* @return true if the document is loaded, false otherwise.
*/
public boolean isDocumentLoaded(int index) {
if(index < 0 || index >= isLoadeds.size()) {
throw new GateRuntimeException("Document number "+index+
" not in corpus "+this.getName()+" of size "+isLoadeds.size());
}
//System.out.println("isDocumentLoaded called: "+isLoadeds.get(index));
return isLoadeds.get(index);
}
public boolean isDocumentLoaded(Document doc) {
String docName = doc.getName();
//System.out.println("DirCorp: called unloadDocument: "+docName);
Integer index = documentIndexes.get(docName);
if(index == null) {
throw new RuntimeException("Document "+docName+
" is not contained in corpus "+this.getName());
}
return isDocumentLoaded(index);
}
/**
* Unload a document from the corpus. When a document is unloaded it
* is automatically stored in GATE XML format to the directory where it
* was read from or to the directory specified for the outDirectoryURL
* parameter. If saveDocuments is false, nothing is saved at all.
* if the document is not part of the corpus, a GateRuntimeException is
* thrown.
*
* @param doc
*/
public void unloadDocument(Document doc) {
String docName = doc.getName();
//System.out.println("DirCorp: called unloadDocument: "+docName);
Integer index = documentIndexes.get(docName);
if(index == null) {
throw new RuntimeException("Document "+docName+
" is not contained in corpus "+this.getName());
}
if(isDocumentLoaded(index)) {
// if saveOnUnload is set, save the document
if(saveDocuments) {
try {
saveDocument(doc);
} catch (Exception ex) {
throw new GateRuntimeException("Problem saving document "+docName,ex);
}
}
loadedDocuments.remove(docName);
isLoadeds.set(index, false);
//System.err.println("Document unloaded: "+docName);
} // else silently do nothing
}
public void removeCorpusListener(CorpusListener listener) {
listeners.remove(listener);
}
public void addCorpusListener(CorpusListener listener) {
listeners.add(listener);
}
/**
* Get the list of document names in this corpus.
*
* @return the list of document names
*/
public List<String> getDocumentNames() {
List<String> newList = new ArrayList<String>(documentNames);
return newList;
}
/**
* Return the name of the document with the given index from the corpus.
*
* @param i the index of the document to return
* @return the name of the document with the given index
*/
public String getDocumentName(int i) {
return documentNames.get(i);
}
/**
* This method is not implemented and throws a
* gate.util.MethodNotImplementedException.
*
* @param directory
* @param filter
* @param encoding
* @param recurseDirectories
*/
public void populate(
URL directory, FileFilter filter,
String encoding, boolean recurseDirectories) {
populate(directory, filter, encoding, null, recurseDirectories);
}
/**
* This method is not implemented and throws a
* gate.util.MethodNotImplementedException.
*
* @param directory
* @param filter
* @param encoding
* @param mimeType
* @param recurseDirectories
*/
public void populate (
URL directory, FileFilter filter,
String encoding, String mimeType,
boolean recurseDirectories) {
if(isTransientCorpus) {
throw new GateRuntimeException("Cannot populate a transient JDBC corpus");
} else {
try {
CorpusImpl.populate(this, directory, filter, encoding, mimeType, recurseDirectories);
} catch (IOException ex) {
throw new GateRuntimeException("IO error",ex);
}
}
}
/**
* @return
*/
public DataStore getDataStore() {
if(dataStoreIsHidden) {
return null;
} else {
return ourDS;
}
}
/**
* This always throws a PersistenceException as this kind of corpus cannot
* be saved to a datastore.
*
* @param ds
* @throws PersistenceException
*/
public void setDataStore(DataStore ds) throws PersistenceException {
throw new PersistenceException("Corpus "+this.getName()+
" cannot be saved to a datastore");
}
/**
* This follows the convention for transient corpus objects and always
* returns false.
*
* @return always false
*/
public boolean isModified() {
return false;
}
@Override
public void sync() {
// TODO: save document!?!?!?
}
@Override
public void cleanup() {
// TODO:
// deregister our listener for resources of type document
//
try {
if(dbConnection != null && !dbConnection.isClosed()) {
dbConnection.close();
}
} catch (SQLException ex) {
// TODO: log, but otherwise ignore
}
if(!isTransientCorpus) {
Gate.getDataStoreRegister().remove(ourDS);
}
}
@Override
public void setName(String name) {
super.setName(name);
if(ourDS != null) {
ourDS.setName("DummyDS4_"+this.getName());
ourDS.setComment("Dummy DataStore for JDBCCorpus "+this.getName());
}
}
// Methods to be implemented from List
/**
* Add a document to the corpus. If the document has a name that is already
* in the list of documents, return false and do not add the document.
* Note that only the name is checked!
* If the name of the document added is not ending in ".xml", a
* GateRuntimeException is thrown.
* If the document is already adopted by some data store throw an exception.
*/
public boolean add(Document doc) {
if(!saveDocuments) {
return false;
}
//System.out.println("JDBCCorp: called add(Object): "+doc.getName());
String docName = doc.getName();
Integer index = documentIndexes.get(docName);
if(index != null) {
return false; // if that name is already in the corpus, do not add
} else {
if(doc.getDataStore() != null) {
throw new GateRuntimeException("Cannot add "+doc.getName()+" which belongs to datastore "+doc.getDataStore().getName());
}
try {
insertDocument(doc);
} catch (Exception ex) {
throw new GateRuntimeException("Problem inserting document "+docName,ex);
}
int i = documentNames.size();
documentNames.add(docName);
documentIndexes.put(docName, i);
isLoadeds.add(false);
if(!isTransientCorpus) {
adoptDocument(doc);
}
fireDocumentAdded(new CorpusEvent(
this, doc, i, CorpusEvent.DOCUMENT_ADDED));
return true;
}
}
/**
* This removes all documents from the corpus. Note that this does nothing
* when the saveDocuments parameter is set to false.
* If the outDirectoryURL parameter was set, this method will throw
* a GateRuntimeException.
*/
public void clear() {
if(!saveDocuments) {
return;
}
/*
if(outDirectoryURL != null) {
throw new GateRuntimeException(
"clear method not supported when outDirectoryURL is set for "+
this.getName());
}
*
*/
for(int i=documentNames.size()-1; i>=0; i--) {
remove(i);
}
}
/**
* This checks if a document with the same name as the document
* passed is already in the corpus. The content is not considered
* for this.
*/
public boolean contains(Object docObj) {
Document doc = (Document)docObj;
String docName = doc.getName();
return (documentIndexes.get(docName) != null);
}
/**
* Return the document for the given index in the corpus.
* An IndexOutOfBoundsException is thrown when the index is not contained
* in the corpus.
* The document will be read from the file only if it is not already loaded.
* If it is already loaded a reference to that document is returned.
*
* @param index
* @return
*/
public Document get(int index) {
//System.out.println("DirCorp: called get(index): "+index);
if(index < 0 || index >= documentNames.size()) {
throw new IndexOutOfBoundsException(
"Index "+index+" not in corpus "+this.getName()+
" of size "+documentNames.size());
}
String docName = documentNames.get(index);
if(isDocumentLoaded(index)) {
Document doc = loadedDocuments.get(docName);
//System.out.println("Returning loaded document "+doc);
return doc;
}
//System.out.println("Document not loaded, reading");
Document doc;
try {
doc = readDocument(docName);
} catch (Exception ex) {
throw new GateRuntimeException("Problem retrieving document data for "+docName,ex);
}
loadedDocuments.put(docName, doc);
isLoadeds.set(index, true);
if(!isTransientCorpus) {
adoptDocument(doc);
}
return doc;
}
/**
* Returns the index of the document with the same name as the given document
* in the corpus. The content of the document is not considered for this.
*
* @param docObj
* @return
*/
public int indexOf(Object docObj) {
Document doc = (Document)docObj;
String docName = doc.getName();
Integer index = documentIndexes.get(docName);
if(index == null) {
return -1;
} else {
return index;
}
}
/**
* Check if the corpus is empty.
*
* @return true if the corpus is empty
*/
public boolean isEmpty() {
return (documentNames.size() == 0);
}
/**
* Returns an iterator to iterate through the documents of the
* corpus. The iterator does not allow modification of the corpus.
*
* @return
*/
public Iterator<Document> iterator() {
return new JDBCCorpusIterator();
}
/**
* This method is not implemented and always throws a
* MethodNotImplementedException.
*
* @param docObj
* @return
*/
public int lastIndexOf(Object docObj) {
throw new MethodNotImplementedException(
notImplementedMessage("lastIndexOf(Object)"));
}
/**
* This method is not implemented and always throws a
* MethodNotImplementedException.
*
* @return
*/
public ListIterator<Document> listIterator() {
throw new MethodNotImplementedException(
notImplementedMessage("listIterator"));
}
/**
* This method is not implemented and always throws a
* MethodNotImplementedException.
*
*
* @param i
* @return
*/
public ListIterator<Document> listIterator(int i) {
throw new MethodNotImplementedException(
notImplementedMessage("listIterator(int)"));
}
/**
*
* @param index
* @return the document that was just removed from the corpus
*/
public Document remove(int index) {
Document doc = (Document)get(index);
String docName = documentNames.get(index);
documentNames.remove(index);
if(isLoadeds.get(index)) {
loadedDocuments.remove(docName);
}
isLoadeds.remove(index);
documentIndexes.remove(docName);
removeDocument(docName);
if (!isTransientCorpus) {
try {
doc.setDataStore(null);
} catch (PersistenceException ex) {
// this should never happen
}
}
fireDocumentRemoved(new CorpusEvent(
this, doc,
index, CorpusEvent.DOCUMENT_REMOVED));
return doc;
}
/**
* Removes a document with the same name as the given document
* from the corpus. This is not
* supported and throws a GateRuntimeException if the outDirectoryURL
* was specified for this corpus. If the saveDocuments parameter is false
* for this corpus, this method does nothing and always returns false.
* If the a document with the same name as the given document is not
* found int the corpus, this does nothing and returns false.
*
* @param docObj
* @return true if a document was removed from the corpus
*/
public boolean remove(Object docObj) {
int index = indexOf(docObj);
if(index == -1) {
return false;
}
String docName = documentNames.get(index);
documentNames.remove(index);
isLoadeds.remove(index);
documentIndexes.remove(docName);
removeDocument(docName);
Document doc = isDocumentLoaded(index) ? (Document)get(index) : null;
if (!isTransientCorpus) {
try {
doc.setDataStore(null);
} catch (PersistenceException ex) {
// this should never happen
}
}
fireDocumentRemoved(new CorpusEvent(
this, doc,
index, CorpusEvent.DOCUMENT_REMOVED));
return true;
}
/**
* Remove all the documents in the collection from the corpus.
*
* @param coll
* @return true if any document was removed
*/
public boolean removeAll(Collection coll) {
boolean ret = false;
for(Object docObj : coll) {
ret = ret || remove(docObj);
}
return ret;
}
public int size() {
return documentNames.size();
}
//******
//Listener methods
//***********
protected void fireDocumentAdded(CorpusEvent e) {
for(CorpusListener listener : listeners) {
listener.documentAdded(e);
}
}
protected void fireDocumentRemoved(CorpusEvent e) {
for(CorpusListener listener : listeners) {
listener.documentRemoved(e);
}
}
public void resourceLoaded(CreoleEvent e) {
//System.out.println("DirCorp: Resource loaded");
}
public void resourceRenamed(
Resource resource,
String oldName,
String newName) {
// if one of our documents gets renamed, rename it back and
// write an error message
if(resource instanceof Document) {
Document doc = (Document)resource;
if(loadedDocuments.containsValue(doc)) {
System.err.println("ERROR: documents from a JDBC corpus cannot be renamed!");
doc.setName(oldName);
}
}
}
public void resourceUnloaded(CreoleEvent e) {
Resource res = e.getResource();
if(res instanceof Document) {
Document doc = (Document)res;
// check if this document has actually been loaded by us
if(loadedDocuments.containsValue(doc)) {
unloadDocument(doc);
} // else: its not ours, ignore
} else if(res == this) {
Gate.getCreoleRegister().removeCreoleListener(this);
}
}
public void datastoreClosed(CreoleEvent ev) {
}
public void datastoreCreated(CreoleEvent ev) {
}
public void datastoreOpened(CreoleEvent ev) {
}
//**************************
// helper methods
// ************************
protected void saveDocument(Document doc) throws ResourceInstantiationException, IOException, SQLException {
if(!getSaveDocuments()) {
return;
}
String docContent = doc.toXml();
String docName = doc.getName();
updateContentStatement.setString(2, docName);
if (getUseCompression() || getCompressOnCopy()) {
String docEncoding = (String) doc.getParameterValue("encoding");
String usedEncoding = getActiveEncoding(docEncoding);
InputStream iscomp = getGZIPCompressedInputStream(docContent, usedEncoding);
updateContentStatement.setBinaryStream(1, iscomp);
updateContentStatement.execute();
iscomp.close();
} else {
updateContentStatement.setString(1, docContent);
updateContentStatement.execute();
}
}
protected void insertDocument(Document doc) throws SQLException, ResourceInstantiationException, IOException {
if (!getSaveDocuments()) {
return;
}
String docContent = doc.toXml();
String docName = doc.getName();
String docEncoding = (String) doc.getParameterValue("encoding");
String usedEncoding = getActiveEncoding(docEncoding);
insertContentStatement.setString(1, docName);
String docMimeType = (String)doc.getParameterValue("mimeType");
// when we have encoding and/or mime type fields, set them!
if(haveEncodingField) {
if(haveMimeTypeField) {
insertContentStatement.setString(3, usedEncoding);
insertContentStatement.setString(4, docMimeType);
} else {
insertContentStatement.setString(3, usedEncoding);
}
} else {
if(haveMimeTypeField) {
insertContentStatement.setString(3, docMimeType);
} else {
// neither encoding, nor mime type, nothing needs to be done
}
}
if (getUseCompression() || getCompressOnCopy()) {
InputStream iscomp = getGZIPCompressedInputStream(docContent, usedEncoding);
insertContentStatement.setBinaryStream(2, iscomp);
insertContentStatement.execute();
iscomp.close();
} else {
insertContentStatement.setString(2, docContent);
insertContentStatement.execute();
}
}
protected InputStream getGZIPCompressedInputStream(String theString, String theEncoding)
throws IOException {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
GZIPOutputStream gos = new GZIPOutputStream(baos);
gos.write(theString.getBytes(theEncoding));
gos.close();
ByteArrayInputStream inputStream = new ByteArrayInputStream(baos.toByteArray());
return inputStream;
}
protected Document readDocument(String docName) throws SQLException, IOException {
//System.out.println("JDBCCorp: read doc "+docName);
Document doc = null;
ResultSet rs = null;
String docEncoding = encoding;
if (haveEncodingField) {
getEncodingStatement.setString(1, docName);
rs = getEncodingStatement.executeQuery();
if(!rs.first()) {
throw new GateRuntimeException("Could not retrieve encoding for "+docName);
}
if(!rs.last()) {
throw new GateRuntimeException("More than one match for document "+docName);
}
docEncoding = rs.getString(1);
}
//System.out.println("Trying to get content for "+docName);
getContentStatement.setString(1, docName);
//System.out.println("After setString: "+getContentStatement);
rs = getContentStatement.executeQuery();
if (!rs.next()) {
throw new GateRuntimeException("Document not found int the DB table: " + docName);
}
if (!rs.isLast()) {
throw new GateRuntimeException("More than one row found for document name " + docName);
}
String content = null;
if (getUseCompression()) {
InputStream is = rs.getBinaryStream(1);
InputStream isdec = null;
isdec = new GZIPInputStream(is);
String usedEncoding = getActiveEncoding(docEncoding);
content = IOUtils.toString(isdec, usedEncoding);
isdec.close();
is.close();
} else {
content = rs.getString(1);
}
String docMimeType = mimeType;
if (haveMimeTypeField) {
getMimeTypeStatement.setString(1, docName);
rs = getMimeTypeStatement.executeQuery();
rs.first();
mimeType = rs.getString(1);
}
FeatureMap params = Factory.newFeatureMap();
params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, content);
params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, docEncoding);
params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, docMimeType);
try {
doc =
(Document) Factory.createResource(DocumentImpl.class.getName(),
params, null, docName);
} catch (Exception ex) {
throw new GateRuntimeException("Exception creating the document", ex);
}
return doc;
}
protected String getActiveEncoding(String docEncoding) {
String usedEncoding = "UTF-8"; // if all else fails, use UTF8
if(docEncoding != null && !docEncoding.isEmpty()) {
usedEncoding = docEncoding; // encoding from or for document overwrites all else
} else if (encoding != null && !encoding.isEmpty()) {
usedEncoding = encoding; // use LR's encoding parameter
} else if (System.getProperty("file.encoding") != null) {
usedEncoding = System.getProperty("file.encoding");
}
return usedEncoding;
}
protected void removeDocument(String docName) {
if(getRemoveDocuments() && getSaveDocuments()) {
try {
deleteRowStatement.execute();
} catch (SQLException ex) {
throw new GateRuntimeException("Problem when trying to delete table row for document "+docName,ex);
}
}
}
protected void adoptDocument(Document doc) {
try {
doc.setDataStore(ourDS);
//System.err.println("Adopted document "+doc.getName());
} catch (PersistenceException ex) {
System.err.println("Got exception when adopting: "+ex);
}
}
protected class JDBCCorpusIterator implements Iterator<Document> {
int nextIndex = 0;
@Override
public boolean hasNext() {
return (documentNames.size() > nextIndex);
}
@Override
public Document next() {
if(hasNext()) {
return get(nextIndex++);
} else {
return null;
}
}
@Override
public void remove() {
throw new MethodNotImplementedException();
}
}
} // class JDBCCorpus