//also explicitly set the charset as contentEncoding
String charset = mtas.mediaType.getParameters().get("charset");
if(charset != null){
metadata.set(Metadata.CONTENT_ENCODING, charset);
}
ContentSink plainTextSink;
try {
plainTextSink = ciFactory.createContentSink(TEXT_PLAIN +"; charset="+UTF8.name());
} catch (IOException e) {
IOUtils.closeQuietly(in); //close the input stream
throw new EngineException("Error while initialising Blob for" +
"writing the text/plain version of the parsed content",e);
}
final Writer plainTextWriter = new OutputStreamWriter(plainTextSink.getOutputStream(), UTF8);
final ContentHandler textHandler = new BodyContentHandler( //only the Body
new PlainTextHandler(plainTextWriter, false,skipLinebreaks)); //skip ignoreable
final ToXMLContentHandler xhtmlHandler;
final ContentHandler mainHandler;
ContentSink xhtmlSink = null;
try {
if(!plainMediaType.equals(XHTML)){ //do not parse XHTML from XHTML
//Sink receiving the XHTML version of the parsed content (UTF8 charset parameter).
//No explicit close of 'in' is needed on failure: the finally block of the
//enclosing try statement closes it.
try {
    xhtmlSink = ciFactory.createContentSink(XHTML +"; charset="+UTF8.name());
} catch (IOException e) {
    //NOTE: the trailing space after "for" keeps the two literals from being
    //joined to "forwriting" in the resulting message
    throw new EngineException("Error while initialising Blob for " +
        "writing the application/xhtml+xml version of the parsed content",e);
}
try {
xhtmlHandler = new ToXMLContentHandler(xhtmlSink.getOutputStream(),UTF8.name());
} catch (UnsupportedEncodingException e) {
throw new EngineException("This system does not support the encoding "+UTF8,e);
}
mainHandler = new MultiHandler(textHandler,xhtmlHandler);
} else {
mainHandler = textHandler;
xhtmlHandler = null;
xhtmlSink = null;
}
try {
AccessController.doPrivileged(new PrivilegedExceptionAction<Object>() {
/**
 * Runs the parser while a replacement context ClassLoader is installed on
 * the current thread and restores the previous one afterwards.
 * <p>
 * The context Classloader needs to be replaced with the Bundle ClassLoader
 * to ensure that singleton instances of XML frameworks (such as node4j)
 * do not leak into the OSGI environment.
 * <p>
 * Most Java XML libs prefer to load implementations by using the
 * {@link Thread#getContextClassLoader()}. OSGI however has no control over
 * this {@link ClassLoader}. This can lead to situations where interfaces
 * are loaded via the Bundle Classloader while the implementations are
 * taken from the context Classloader, which can cause
 * {@link ClassCastException}s, {@link ExceptionInInitializerError}s, ...
 * Setting the context Classloader to the Bundle classloader helps to avoid
 * those situations.
 *
 * @return always <code>null</code>
 * @throws IOException not handled here; passed on to the caller
 * @throws SAXException not handled here; passed on to the caller
 * @throws TikaException not handled here; passed on to the caller
 */
public Object run() throws IOException, SAXException, TikaException {
    final Thread currentThread = Thread.currentThread();
    //presumably installs the Bundle ClassLoader and returns the one that was
    //active before - verify against updateContextClassLoader()
    final ClassLoader previousLoader = updateContextClassLoader();
    try {
        parser.parse(in, mainHandler, metadata, context);
        return null;
    } finally {
        //restore the context ClassLoader regardless of the parsing outcome
        currentThread.setContextClassLoader(previousLoader);
    }
}
});
} catch (PrivilegedActionException pae) {
Exception e = pae.getException();
if(e instanceof IOException || e instanceof SAXException || e instanceof TikaException){
throw new EngineException("Unable to convert ContentItem "+
ci.getUri()+" with mimeType '"+ci.getMimeType()+"' to "+
"plain text!",e);
} else { //runtime exception
throw RuntimeException.class.cast(e);
}
}
} finally { //ensure that the writers are closed correctly
IOUtils.closeQuietly(in);
IOUtils.closeQuietly(plainTextWriter);
if(xhtmlSink != null){
IOUtils.closeQuietly(xhtmlSink.getOutputStream());
}
}
String random = randomUUID().toString();
UriRef textBlobUri = new UriRef("urn:tika:text:"+random);
ci.addPart(textBlobUri, plainTextSink.getBlob());
if(xhtmlHandler != null){
UriRef xhtmlBlobUri = new UriRef("urn:tika:xhtml:"+random);
ci.addPart(xhtmlBlobUri, xhtmlSink.getBlob());
}
//add the extracted metadata
if(log.isInfoEnabled()){
for(String name : metadata.names()){
log.info("{}: {}",name,Arrays.toString(metadata.getValues(name)));