}
public void computeEnhancements(ContentItem ci) throws EngineException {
// get model from the extraction
URIImpl docId;
Model m = null;
ci.getLock().readLock().lock();
try {
docId = new URIImpl(ci.getUri().getUnicodeString());
m = this.extractor.extract(ci.getStream(), docId, ci.getMimeType());
} catch (ExtractorException e) {
throw new EngineException("Error while processing ContentItem "
+ ci.getUri()+" with Metaxa",e);
} catch (IOException e) {
throw new EngineException("Error while processing ContentItem "
+ ci.getUri()+" with Metaxa",e);
} finally {
ci.getLock().readLock().unlock();
}
// Convert the RDF2Go model to a Clerezza graph and, while doing so,
// pull the plain text out of the model
if (null == m) {
log.debug("Unable to preocess ContentItem {} (mime type {}) with Metaxa",
ci.getUri(),ci.getMimeType());
return;
}
ContentSink plainTextSink;
try {
plainTextSink = ciFactory.createContentSink("text/plain");
} catch (IOException e) {
m.close();
throw new EngineException("Unable to initialise Blob for storing" +
"the plain text content",e);
}
HashMap<BlankNode, BNode> blankNodeMap = new HashMap<BlankNode, BNode>();
RDF2GoUtils.urifyBlankNodes(m);
ClosableIterator<Statement> it = m.iterator();
BufferedWriter out = new BufferedWriter(new OutputStreamWriter(
plainTextSink.getOutputStream(), UTF8));
boolean textExtracted = false; //used to detect if some text was extracted
try {
MGraph g = new SimpleMGraph(); //first add to a temporary graph
while (it.hasNext()) {
Statement oneStmt = it.next();
//Triples that carry the plain-text version of the document are
//handled separately: their object is written to the plain text
//Blob.
if(oneStmt.getSubject().equals(docId) &&
oneStmt.getPredicate().equals(NIE_PLAINTEXT_PROPERTY)){
String text = oneStmt.getObject().toString();
if(text != null && !text.isEmpty()){
try {
out.write(oneStmt.getObject().toString());
} catch (IOException e) {
throw new EngineException("Unable to write extracted" +
"plain text to Blob (blob impl: "
+ plainTextSink.getBlob().getClass()+")",e);
}
textExtracted = true;
if (includeText) {
NonLiteral subject = (NonLiteral) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
UriRef predicate = (UriRef) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
Resource object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
g.add(new TripleImpl(subject, predicate, object));
}
}
} else { //add metadata to the metadata of the contentItem
NonLiteral subject = (NonLiteral) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
UriRef predicate = (UriRef) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
Resource object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
if (null != subject && null != predicate && null != object) {
Triple t = new TripleImpl(subject, predicate, object);
g.add(t);
log.debug("added " + t.toString());
}
}
}
//add the extracted triples to the metadata of the ContentItem
ci.getLock().writeLock().lock();
try {
ci.getMetadata().addAll(g);
g = null;
} finally {
ci.getLock().writeLock().unlock();
}
} finally {
it.close();
m.close();
IOUtils.closeQuietly(out);
}
if(textExtracted){
//add plain text to the content item
UriRef blobUri = new UriRef("urn:metaxa:plain-text:"+randomUUID());