* they're encrypted (potentially both text and metadata),
* but we can decrypt them easily.
*/
@Test
public void testProtectedPDF() throws Exception {
Parser parser = new AutoDetectParser(); // Should auto-detect!
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
ParseContext context = new ParseContext();
InputStream stream = PDFParserTest.class.getResourceAsStream(
"/test-documents/testPDF_protected.pdf");
try {
parser.parse(stream, handler, metadata, context);
} finally {
stream.close();
}
assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR));
assertEquals("The Bank of England", metadata.get(Metadata.AUTHOR));
assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT));
assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT));
assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(TikaCoreProperties.TITLE));
String content = handler.toString();
assertTrue(content.contains("RETHINKING THE FINANCIAL NETWORK"));
assertTrue(content.contains("On 16 November 2002"));
assertTrue(content.contains("In many important respects"));
// Try again with an explicit empty password
handler = new BodyContentHandler();
metadata = new Metadata();
context = new ParseContext();
context.set(PasswordProvider.class, new PasswordProvider() {
public String getPassword(Metadata metadata) {
return "";
}
});
stream = PDFParserTest.class.getResourceAsStream(
"/test-documents/testPDF_protected.pdf");
try {
parser.parse(stream, handler, metadata, context);
} finally {
stream.close();
}
assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));