ci.getLock().readLock().lock();
try {
docId = new URIImpl(ci.getUri().getUnicodeString());
m = this.extractor.extract(ci.getStream(), docId, ci.getMimeType());
} catch (ExtractorException e) {
throw new EngineException("Error while processing ContentItem "
+ ci.getUri()+" with Metaxa",e);
} catch (IOException e) {
throw new EngineException("Error while processing ContentItem "
+ ci.getUri()+" with Metaxa",e);
} finally {
ci.getLock().readLock().unlock();
}
// Convert the RDF2Go model to a Clerezza graph and, while doing so,
// pull the extracted plain text out of the model
if (null == m) {
log.debug("Unable to preocess ContentItem {} (mime type {}) with Metaxa",
ci.getUri(),ci.getMimeType());
return;
}
ContentSink plainTextSink;
try {
plainTextSink = ciFactory.createContentSink("text/plain");
} catch (IOException e) {
m.close();
throw new EngineException("Unable to initialise Blob for storing" +
"the plain text content",e);
}
HashMap<BlankNode, BNode> blankNodeMap = new HashMap<BlankNode, BNode>();
RDF2GoUtils.urifyBlankNodes(m);
ClosableIterator<Statement> it = m.iterator();
BufferedWriter out = new BufferedWriter(new OutputStreamWriter(
plainTextSink.getOutputStream(), UTF8));
boolean textExtracted = false; //used to detect if some text was extracted
try {
MGraph g = new SimpleMGraph(); //first add to a temporary graph
while (it.hasNext()) {
Statement oneStmt = it.next();
//Triples that carry the plain text (text/plain) version of the
//document need special treatment: their object values have to
//be written to the plain text Blob.
if(oneStmt.getSubject().equals(docId) &&
oneStmt.getPredicate().equals(NIE_PLAINTEXT_PROPERTY)){
String text = oneStmt.getObject().toString();
if(text != null && !text.isEmpty()){
try {
out.write(oneStmt.getObject().toString());
} catch (IOException e) {
throw new EngineException("Unable to write extracted" +
"plain text to Blob (blob impl: "
+ plainTextSink.getBlob().getClass()+")",e);
}
textExtracted = true;
if (includeText) {