/*
* PDF Scrutinizer, a library for detecting and analyzing malicious PDF documents.
* Copyright 2013 Florian Schmitt <florian@florianschmitt.de>, Fraunhofer FKIE
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package de.pdf_scrutinizer.document;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.*;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.pdmodel.*;
import org.apache.pdfbox.pdmodel.common.COSArrayList;
import org.apache.pdfbox.pdmodel.common.PDDestinationOrAction;
import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
import org.apache.pdfbox.pdmodel.encryption.PDEncryptionDictionary;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.pdmodel.interactive.action.PDPageAdditionalActions;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationText;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationUnknown;
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
import org.apache.pdfbox.pdmodel.interactive.form.PDXFA;
import org.apache.pdfbox.util.PDFTextStripper;
import org.w3c.dom.*;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import de.pdf_scrutinizer.Scrutinizer;
import de.pdf_scrutinizer.API.app.doc.Annotation;
public class DocumentAdapter {
// Logger shared by all instances of this class.
private static Log log = LogFactory.getLog(DocumentAdapter.class);
// The PDFBox document. Assigned in the constructor; stays null if loading threw.
private PDDocument document;
// Set to true in the constructor right after PDDocument.load(...) returned,
// i.e. before any decryption attempt.
private boolean isDocLoaded = false;
// Lazily created cache of extracted page text, keyed by zero-based page index
// (filled by extractText).
private HashMap<Integer, String> textPerPage = null;
// Task that receives this adapter; its result is read by getCachedAnnots.
private final DocumentAnnotationScanTask documentAnnotationScanTask = new DocumentAnnotationScanTask(this);
// Thread that runs the scan task; started as a daemon at the end of the constructor.
private final Thread documentAnnotationScanThread = new Thread(documentAnnotationScanTask);
/**
* Returns the PDFBox document held by this adapter.
*
* @return the {@code document} field; {@code null} if loading failed in the constructor
*/
public PDDocument getDocument() {
return document;
}
/**
* Tells whether the constructor managed to load the PDF file.
*
* @return {@code true} once {@code PDDocument.load} returned without throwing;
* the flag is set before decryption is attempted, so it says nothing about
* whether an encrypted document could be decrypted
*/
public boolean IsDocLoaded() {
return isDocLoaded;
}
/**
* Loads a PDF file, tries to decrypt it with an empty password if it is
* encrypted, and finally starts the background annotation scan.
* <p/>
* Every failure path (load error, unsupported encryption revision, decryption
* error) logs and returns early, so the annotation scan thread is only started
* when loading and optional decryption went through.
*
* @param scrutinizer not used inside this constructor
* @param pdffile the PDF file to load
*/
public DocumentAdapter(Scrutinizer scrutinizer, File pdffile) {
log.debug("loading file");
try {
// NOTE(review): the boolean argument is presumably PDFBox's lenient/"force" parsing flag - confirm.
document = PDDocument.load(new FileInputStream(pdffile), true);
isDocLoaded = true;
log.debug("successfully loaded");
if (document.isEncrypted()) {
log.info("document is encrypted. trying to decrypt with empty password.");
PDEncryptionDictionary dict = document.getEncryptionDictionary();
// revisions above 4 are given up on; isDocLoaded stays true in that case
if (dict.getRevision() > 4) {
log.info("too bad! currently PDFBox does not support this decryption algorithm.");
return;
}
// empty password
StandardDecryptionMaterial s = new StandardDecryptionMaterial("");
document.openProtection(s);
}
} catch (IOException e) {
log.error("Problem while loading PDF document.", e);
return;
} catch (BadSecurityHandlerException e) {
log.error("Problem while decrypting PDF document.", e);
return;
} catch (CryptographyException e) {
log.error("Problem while decrypting PDF document.", e);
return;
}
// we do not need to wait for the document annots scan thread to
// finish. if annots were not used until that time, they
// cannot affect the analysis result anymore.
documentAnnotationScanThread.setDaemon(true);
documentAnnotationScanThread.start();
}
/**
 * Extracts the text of one page with PDFBox's PDFTextStripper and caches the
 * trimmed result in {@code textPerPage}. Pages that are already cached are
 * not extracted again. If extraction fails, a warning is logged and nothing
 * is cached for that page.
 *
 * @param page the zero-based page number
 */
private void extractText(int page) {
    if (textPerPage == null) {
        textPerPage = new HashMap<Integer, String>();
    } else if (textPerPage.containsKey(page)) {
        // page text was already extracted
        return;
    }
    // Adobe-API uses zero-based page index. PDFTextStripper uses one-based.
    final int oneBasedPage = page + 1;
    try {
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setStartPage(oneBasedPage);
        stripper.setEndPage(oneBasedPage);
        String pageText = stripper.getText(document).trim();
        textPerPage.put(page, pageText);
    } catch (IOException e) {
        log.warn("Problem while extracting text from PDF.", e);
    }
}
/**
 * Counts the words on a page.
 * <p/>
 * The page text is split on an optional run of dots followed by a single
 * whitespace character. A page whose text could not be extracted, or whose
 * text is empty, has zero words (splitting an empty string would otherwise
 * yield one empty token).
 *
 * @param page the zero-based page number
 * @return the number of tokens on the page, 0 if there is no text
 */
public int getPageNumWords(int page) {
    extractText(page);
    String text = textPerPage.get(page);
    if (text == null || text.length() == 0) {
        // extraction failed (nothing cached) or the page carries no text
        return 0;
    }
    return text.split("[.]*\\s").length;
}
/**
 * Returns the word at a given index on a page.
 * <p/>
 * The page text is split on an optional run of dots followed by a single
 * whitespace character. If the text of the page could not be extracted it is
 * treated like an empty page instead of failing with a NullPointerException.
 *
 * @param page      the zero-based page number
 * @param wordIndex the zero-based index of the word
 * @return the token at {@code wordIndex}
 * @throws ArrayIndexOutOfBoundsException if {@code wordIndex} is outside the token range
 */
public String getPageNthWord(int page, int wordIndex) {
    extractText(page);
    String text = textPerPage.get(page);
    if (text == null) {
        // extraction failed and nothing was cached for this page
        text = "";
    }
    String[] words = text.split("[.]*\\s");
    return words[wordIndex];
}
/**
 * Collects embedded files from two places: the EmbeddedFiles name tree of the
 * document catalog, and a scan over all objects whose /Type is EmbeddedFile.
 * <p/>
 * Each entry is a two element array: the subtype (empty string for files
 * found by the object scan) and the decoded stream content. A file reachable
 * both ways is reported twice.
 *
 * @return the embedded files found, possibly empty, never null
 */
public List<String[]> getEmbeddedFiles() {
    List<String[]> result = new ArrayList<String[]>();
    log.info("searching for embedded files [names dictionary]");
    PDDocumentNameDictionary names_dict = document.getDocumentCatalog().getNames();
    PDEmbeddedFilesNameTreeNode ef = (names_dict == null) ? null : names_dict.getEmbeddedFiles();
    if (ef != null) {
        try {
            Map<String, Object> names_file = ef.getNames();
            if (names_file != null) {
                for (Object o : names_file.values()) {
                    if (!(o instanceof PDComplexFileSpecification)) {
                        continue;
                    }
                    PDComplexFileSpecification spec = (PDComplexFileSpecification) o;
                    PDEmbeddedFile f = spec.getEmbeddedFile();
                    if (f != null) {
                        String subtype = f.getSubtype();
                        log.info("found file of type " + subtype + ": " + spec.getFile());
                        result.add(new String[]{subtype, decodeStream(f.getStream())});
                    }
                }
            }
        } catch (IOException e) {
            log.warn("", e);
        }
    }
    log.info("searching for embedded files [brute force]");
    List<COSObject> objects = document.getDocument().getObjects();
    for (COSObject o : objects) {
        COSBase type = o.getDictionaryObject(COSName.TYPE);
        if (type == null || !type.equals(COSName.EMBEDDED_FILE)) {
            continue;
        }
        // A crafted document may tag a plain dictionary as EmbeddedFile;
        // only real streams can be decoded, so check before casting.
        COSBase target = o.getObject();
        if (target instanceof COSStream) {
            result.add(new String[]{"", decodeStream((COSStream) target)});
        }
    }
    if (result.size() > 0) {
        log.info("found " + result.size() + " embedded file/s");
    }
    return result;
}
/**
 * Blocks until the background annotation scan thread has finished.
 * <p/>
 * If the waiting thread is interrupted, the interruption is logged and the
 * interrupt flag is restored so that callers further up can still see it.
 */
public void syncAnnotScan() {
    try {
        documentAnnotationScanThread.join();
    } catch (InterruptedException e) {
        log.error(e.getMessage(), e);
        // join() cleared the flag when it threw; put it back
        Thread.currentThread().interrupt();
    }
}
/**
 * Waits for the background annotation scan to finish and returns the entry
 * it stored for the given page.
 * <p/>
 * If the waiting thread is interrupted, the interruption is logged, the
 * interrupt flag is restored and the lookup is still attempted.
 *
 * @param pageno the page number used as lookup key in the scan result
 * @return whatever {@code documentAnnotationScanTask.getResult().get(pageno)} yields
 */
public Annotation[] getCachedAnnots(int pageno) {
    try {
        documentAnnotationScanThread.join();
    } catch (InterruptedException e) {
        log.error(e.getMessage(), e);
        // join() cleared the flag when it threw; put it back
        Thread.currentThread().interrupt();
    }
    return documentAnnotationScanTask.getResult().get(pageno);
}
/**
 * Returns the Annotation-objects from a given page number.
 * <p/>
 * For text annotations the /Subj entry is used, either as a direct string or
 * as a reference to a stream. For unknown annotation types a referenced /Subj
 * stream is used, falling back to a referenced /3DD stream (3D annotations).
 * Entries of any other shape are skipped instead of causing a
 * ClassCastException.
 *
 * @param pageno the zero-based page number
 * @return an Array of Annotation-objects, or null if the page does not exist
 *         or its annotations cannot be read
 */
Annotation[] getAnnots(int pageno) {
    List<Annotation> result = new ArrayList<Annotation>();
    @SuppressWarnings("unchecked")
    List<PDPage> list = (List<PDPage>) document.getDocumentCatalog().getAllPages();
    if (pageno < 0 || pageno >= list.size()) {
        log.error("trying to access non existing page");
        return null;
    }
    PDPage page = list.get(pageno); // 0-based
    List<?> annotations;
    try {
        annotations = page.getAnnotations();
    } catch (IOException e) {
        log.warn("", e);
        return null;
    }
    for (Object annotation : annotations) {
        if (annotation instanceof PDAnnotationText) {
            COSBase subject = ((PDAnnotationText) annotation).getDictionary().getItem(COSName.SUBJ);
            if (subject instanceof COSString) {
                result.add(new Annotation(((COSString) subject).getString()));
            } else {
                COSStream stream = referencedStreamOf(subject);
                if (stream != null) {
                    result.add(new Annotation(decodeStream(stream)));
                }
            }
        } else if (annotation instanceof PDAnnotationUnknown) {
            COSDictionary dict = ((PDAnnotationUnknown) annotation).getDictionary();
            COSStream stream = referencedStreamOf(dict.getItem(COSName.SUBJ));
            if (stream == null) {
                stream = referencedStreamOf(dict.getItem(COSName._3DD)); // 3D-Annots
            }
            if (stream != null) {
                result.add(new Annotation(decodeStream(stream)));
            }
        }
    }
    return result.toArray(new Annotation[]{});
}

/**
 * Returns the stream an indirect reference points to, or null if the item is
 * not an indirect reference or does not point to a stream.
 */
private static COSStream referencedStreamOf(COSBase item) {
    if (!(item instanceof COSObject)) {
        return null;
    }
    COSBase target = ((COSObject) item).getObject();
    return (target instanceof COSStream) ? (COSStream) target : null;
}
/**
 * Looks up a text annotation by its name on a given page and returns its
 * /Subj content.
 * <p/>
 * Annotations without a name are skipped (they can never match). As in
 * getAnnots, a page number outside the document is logged and answered with
 * null.
 *
 * @param pageno the zero-based page number
 * @param name   the annotation name to look for
 * @return the matching annotation; null if the page does not exist, its
 *         annotations cannot be read, or no named annotation yields content
 * @throws RuntimeException if the matching annotation's /Subj is neither a
 *                          string nor an indirect reference
 */
public Annotation getAnnot(int pageno, String name) {
    List<?> pages = document.getDocumentCatalog().getAllPages();
    if (pageno < 0 || pageno >= pages.size()) {
        log.error("trying to access non existing page");
        return null;
    }
    PDPage page = (PDPage) pages.get(pageno); // 0-based
    List<?> annotations;
    try {
        annotations = page.getAnnotations();
    } catch (IOException e) {
        log.warn("", e);
        return null;
    }
    for (Object annotationRef : annotations) {
        if (!(annotationRef instanceof PDAnnotationText)) {
            continue;
        }
        PDAnnotationText annotation = (PDAnnotationText) annotationRef;
        // getAnnotationName() may be null for unnamed annotations, so compare from the argument side
        if (name == null || !name.equals(annotation.getAnnotationName())) {
            continue;
        }
        COSBase subject = annotation.getDictionary().getItem(COSName.SUBJ);
        if (subject instanceof COSString) {
            return new Annotation(((COSString) subject).getString());
        } else if (subject instanceof COSObject) {
            COSBase subjectRef = ((COSObject) subject).getObject();
            if (subjectRef instanceof COSStream) {
                return new Annotation(decodeStream((COSStream) subjectRef));
            }
            // reference to something that is not a stream: keep looking
        } else {
            throw new RuntimeException("getAnnot error");
        }
    }
    return null;
}
/**
 * Tries to decode a given COSStream into text.
 * <p/>
 * If the unfiltered data starts with the byte order mark FE FF it is read as
 * big-endian two-byte code units (high byte first); only code units with a
 * zero high byte and a non-zero low byte are kept, everything else is left
 * out. Otherwise every byte becomes one character. In both cases characters
 * that are neither printable nor whitespace are removed from the result.
 * <p/>
 * The stream is opened once and rewound with mark/reset when no byte order
 * mark is present, so no second stream has to be opened and none is left
 * unclosed.
 *
 * @param cosstream the stream
 * @return the decoded stream, or an empty string if reading failed
 */
private static String decodeStream(COSStream cosstream) {
    StringBuilder sb = new StringBuilder();
    try {
        // wrap instead of downcasting: guarantees mark/reset support whatever PDFBox returns
        BufferedInputStream stream = new BufferedInputStream(cosstream.getUnfilteredStream());
        try {
            stream.mark(2);
            boolean unicode = stream.read() == 0xFE && stream.read() == 0xFF;
            if (unicode) {
                for (int high; (high = stream.read()) != -1; ) {
                    int low = stream.read();
                    if (low == -1) {
                        // dangling half code unit at the end of the data
                        break;
                    }
                    //this will leave unicode-chars out
                    if (high == 0 && low != 0) {
                        sb.append((char) low);
                    }
                }
            } else {
                // no byte order mark: start over and take the bytes as they are
                stream.reset();
                for (int data; (data = stream.read()) != -1; ) {
                    sb.append((char) data);
                }
            }
        } finally {
            stream.close();
        }
    } catch (IOException e) {
        log.warn("", e);
        return "";
    }
    /*
     * removes non-printable characters
     * "[^\\p{Print}]"
     * "\\p{Cntrl}"
     * \p{Space}
     */
    return sb.toString().replaceAll("[^\\p{Print}^\\p{Space}]", "");
}
/**
 * Collects code from all possible locations.
 * <p/>
 * The sources are queried in this order: names dictionary, OpenAction, page
 * additional actions, AcroForm/XFA and finally a brute force object scan.
 * Blank entries are dropped, repeated strings are reduced to their first
 * occurrence (order is kept) and a few obfuscation patterns are rewritten.
 *
 * @return List of code-strings.
 */
public List<String> getCodeEvents() {
    log.info("searching for code execution events");
    List<String> result = new ArrayList<String>();
    addCodeEvents(result, documentCatalogNames(), "in document catalog -> names");
    addCodeEvents(result, documentOpenAction(), "in document catalog -> openaction");
    addCodeEvents(result, pageAdditionalActions(), "in additional actions of a page");
    addCodeEvents(result, acroForms(), "in an AcroForm");
    addCodeEvents(result, bruteForce(), "with brute force");
    if (result.size() == 0) {
        log.info("no code found");
    }
    /* DEBUG: decodeStream should already have removed such characters */
    Pattern nonPrintable = Pattern.compile("[^\\p{Print}^\\p{Space}]");
    // A set would lose the ordering, so keep a list and test membership.
    List<String> distinct = new ArrayList<String>();
    for (String code : result) {
        if (code.trim().length() == 0) {
            // remove empty results
            continue;
        }
        if (nonPrintable.matcher(code).find()) {
            log.error("found non-printable character in code");
        }
        if (!distinct.contains(code)) {
            distinct.add(code);
        }
    }
    String[] replace = {"app.view\\\\erVer\\\\sion", "app.viewerVersion",
            "une\\\\scape", "unescape",
            "/\\[\\^\\]", "/[/^]" // Adobe Engine seems to allow [^] regular expression:
            // "^e^v^a^^l^".replace(/[^]/g, "") -> escape this
    };
    // compile every pattern once instead of once per code string
    Pattern[] patterns = new Pattern[replace.length / 2];
    for (int j = 0; j < patterns.length; j++) {
        patterns[j] = Pattern.compile(replace[2 * j]);
    }
    for (int i = 0; i < distinct.size(); i++) {
        String code = distinct.get(i);
        for (int j = 0; j < patterns.length; j++) {
            code = patterns[j].matcher(code).replaceAll(replace[2 * j + 1]);
        }
        distinct.set(i, code);
    }
    return distinct;
}

/**
 * Appends the code strings found at one location and logs how many there were.
 */
private static void addCodeEvents(List<String> result, List<String> found, String location) {
    if (found.size() > 0) {
        log.info(String.format("found %d code event%s %s", found.size(), (found.size() == 1 ? "" : "s"), location));
        result.addAll(found);
    }
}
/**
 * Walks over every object of the document and, for each dictionary whose /JS
 * entry is an indirect reference to a stream, decodes that stream.
 *
 * @return the decoded streams, possibly empty
 */
private List<String> bruteForce() {
    List<String> scripts = new ArrayList<String>();
    List<COSObject> objects = document.getDocument().getObjects();
    for (COSObject candidate : objects) {
        COSBase resolved = candidate.getObject();
        if (!(resolved instanceof COSDictionary)) {
            continue;
        }
        COSBase jsEntry = ((COSDictionary) resolved).getItem(COSName.JS);
        if (!(jsEntry instanceof COSObject)) {
            // absent, or given directly rather than by reference
            continue;
        }
        COSBase jsTarget = ((COSObject) jsEntry).getObject();
        if (jsTarget instanceof COSStream) {
            scripts.add(decodeStream((COSStream) jsTarget));
        }
    }
    return scripts;
}
/**
 * Concatenates the decoded XML packets of an XFA entry.
 * <p/>
 * When the XFA entry is an array, every second element (index 1, 3, ...) is
 * taken as packet content; elements that do not resolve to a stream are
 * skipped rather than causing a ClassCastException. When the entry is a
 * single stream, that stream is decoded.
 *
 * @param xfa the XFA entry of an AcroForm
 * @return the decoded XML, or an empty string if nothing could be decoded
 */
public static String xmlFromXFA(PDXFA xfa) {
    StringBuilder sb = new StringBuilder();
    if (xfa == null) {
        return sb.toString();
    }
    COSBase base = xfa.getCOSObject();
    if (base instanceof COSArray) {
        COSArray packets = (COSArray) base;
        for (int i = 1; i < packets.size(); i += 2) {
            // getObject resolves indirect references
            COSBase packet = packets.getObject(i);
            if (packet instanceof COSStream) {
                sb.append(decodeStream((COSStream) packet));
            }
        }
    } else if (base instanceof COSStream) {
        sb.append(decodeStream((COSStream) base));
    }
    return sb.toString();
}
/**
 * Returns the character data held by the first child node of an element.
 *
 * @param e the element to inspect
 * @return the data of the first child if it is a CharacterData node,
 *         otherwise the placeholder "?"
 */
public static String getCharacterDataFromElement(Element e) {
    Node first = e.getFirstChild();
    return (first instanceof CharacterData) ? ((CharacterData) first).getData() : "?";
}
/**
 * Extracts JavaScript from the XFA template of the document's AcroForm.
 * <p/>
 * The XFA XML is parsed with a DOM parser that has external entity and
 * external DTD loading switched off, because the XML comes from an untrusted
 * document. For every field whose first script element has content type
 * application/x-javascript, the script is returned with a small prologue
 * that defines an object named like the field. If the XML cannot be parsed,
 * a regular expression is used instead to pull out the content of every
 * script tag.
 *
 * @return List of code-strings or empty List.
 */
private List<String> acroForms() {
    List<String> result = new ArrayList<String>();
    PDAcroForm af = document.getDocumentCatalog().getAcroForm();
    if (af == null) {
        return result;
    }
    Document doc = null;
    String xml = null;
    boolean parsingerror = false;
    try {
        PDXFA xfa = af.getXFA();
        if (xfa == null) {
            return result;
        }
        // decode first, so the regex fallback has input even if no parser can be created
        xml = xmlFromXFA(xfa);
        DocumentBuilder db = newXfaDocumentBuilder();
        doc = db.parse(new InputSource(new StringReader(xml)));
    } catch (ParserConfigurationException e) {
        log.warn("", e);
        parsingerror = true;
    } catch (SAXException e) {
        log.warn(e.getMessage());
        parsingerror = true;
    } catch (IOException e) {
        log.warn("", e);
        parsingerror = true;
    }
    if (!parsingerror) {
        NodeList fields = doc.getElementsByTagName("field");
        for (int i = 0; i < fields.getLength(); i++) {
            Element element = (Element) fields.item(i);
            String fieldname = element.getAttribute("name");
            Element script = (Element) element.getElementsByTagName("script").item(0);
            if (script != null && script.getAttribute("contentType").equals("application/x-javascript")) {
                /*
                 * most likely, the code in the XFA template will access 'fieldname'.rawValue.
                 * so we need to get an object with the fieldname in the JS-context.
                 */
                String tmp = "var %s = new Object(); " +
                        "%s.value = new Object();" +
                        "%s.value.image = new Object()\n";
                result.add(String.format(tmp, fieldname, fieldname, fieldname) + getCharacterDataFromElement(script));
            }
        }
    } else if (xml != null) {
        Pattern p = Pattern.compile("<script\\s[\\w\\s=\\\"/-]*>([^<]*)<?", Pattern.DOTALL);
        Matcher m = p.matcher(xml);
        // a template may carry several scripts; take all of them
        while (m.find()) {
            result.add(StringEscapeUtils.unescapeXml(m.group(1)));
        }
    }
    return result;
}

/**
 * Creates a DOM parser that does not fetch external entities or external
 * DTDs. Features the parser implementation does not know are skipped.
 */
private static DocumentBuilder newXfaDocumentBuilder() throws ParserConfigurationException {
    DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
    String[] externalAccessFeatures = {
            "http://xml.org/sax/features/external-general-entities",
            "http://xml.org/sax/features/external-parameter-entities",
            "http://apache.org/xml/features/nonvalidating/load-external-dtd"
    };
    for (String feature : externalAccessFeatures) {
        try {
            dbf.setFeature(feature, false);
        } catch (ParserConfigurationException e) {
            log.debug("XML parser does not support feature " + feature, e);
        }
    }
    return dbf.newDocumentBuilder();
}
/**
 * Searches for a names dictionary and tries to extract possible JavaScript code out of it.
 * <p/>
 * Only the /Names array found directly on the JavaScript name tree node is
 * looked at. It is read as alternating key/value pairs, so the entries at
 * odd indexes are the candidate action dictionaries. Actions without an /S
 * entry are skipped.
 *
 * @return List of code-strings or empty List.
 */
private ArrayList<String> documentCatalogNames() {
    ArrayList<String> result = new ArrayList<String>();
    PDDocumentNameDictionary names = document.getDocumentCatalog().getNames();
    if (names == null) {
        return result;
    }
    PDJavascriptNameTreeNode js_node = names.getJavaScript();
    if (js_node == null) {
        return result;
    }
    COSDictionary namestable = js_node.getCOSDictionary();
    if (namestable == null) {
        return result;
    }
    COSBase namesBase = namestable.getDictionaryObject(COSName.NAMES);
    if (!(namesBase instanceof COSArray)) {
        // missing, or not an array in a malformed document
        return result;
    }
    COSArray namesarray = (COSArray) namesBase;
    for (int i = 1; i < namesarray.size(); i += 2) { //this is just a guess...
        COSBase item = namesarray.getObject(i);
        if (!(item instanceof COSDictionary)) {
            continue;
        }
        COSDictionary item_ref = (COSDictionary) item;
        // compare from the constant's side: /S may be absent
        COSBase subtype = item_ref.getItem(COSName.S);
        if (COSName.JAVA_SCRIPT.equals(subtype) || COSName.JS.equals(subtype)) {
            extractJavaScript(item_ref, result);
        }
    }
    return result;
}
/**
 * This method looks into the documents catalog and searches for an OpenAction.
 * If an OpenAction was found the code gets extracted.
 * <p/>
 * Failures while reading the OpenAction are logged and answered with an
 * empty list; this stays best effort, but nothing is discarded silently
 * through a return inside a finally block anymore.
 *
 * @return the extracted code or an empty ArrayList<String>.
 */
private ArrayList<String> documentOpenAction() {
    ArrayList<String> result = new ArrayList<String>();
    PDDocumentCatalog catalog = document.getDocumentCatalog();
    PDDestinationOrAction act;
    try {
        act = catalog.getOpenAction();
    } catch (IOException e) {
        log.error("", e);
        return result;
    } catch (RuntimeException e) {
        // malformed OpenAction entries must not abort the whole analysis
        log.error("", e);
        return result;
    }
    if (act == null) {
        return result;
    }
    COSBase actionBase = act.getCOSObject();
    if (actionBase instanceof COSDictionary) {
        COSDictionary openaction_ref = (COSDictionary) actionBase;
        if (COSName.JAVA_SCRIPT.equals(openaction_ref.getItem(COSName.S))) {
            extractJavaScript(openaction_ref, result);
        }
    } else if (actionBase instanceof COSArray) {
        Iterator<COSBase> it = ((COSArray) actionBase).iterator();
        while (it.hasNext()) {
            COSBase element = it.next();
            if (!(element instanceof COSObject)) {
                continue;
            }
            COSBase target = ((COSObject) element).getObject();
            if (target instanceof COSDictionary) {
                COSDictionary openaction_ref2 = (COSDictionary) target;
                if (COSName.JAVA_SCRIPT.equals(openaction_ref2.getItem(COSName.S))) {
                    extractJavaScript(openaction_ref2, result);
                }
            }
        }
    }
    return result;
}
/**
 * Looks at the AdditionalActions of all pages and tries to extract JavaScript Code.
 * <p/>
 * Looks only at the O-action right now. (triggered when page opened).
 * More events: PDF32000_2008, page 415-416
 * <p/>
 * The O entry is accepted both as an indirect reference and as a dictionary
 * written inline in the additional-actions dictionary.
 *
 * @return List of code-strings.
 */
private ArrayList<String> pageAdditionalActions() {
    ArrayList<String> result = new ArrayList<String>();
    @SuppressWarnings("unchecked")
    List<PDPage> pages = (List<PDPage>) document.getDocumentCatalog().getAllPages();
    for (PDPage p : pages) {
        PDPageAdditionalActions aa = p.getActions();
        if (aa == null) {
            continue;
        }
        // getDictionaryObject follows an indirect reference and hands back a direct value as is
        COSBase open = aa.getCOSDictionary().getDictionaryObject(COSName.O);
        if (open instanceof COSDictionary) {
            extractJavaScript((COSDictionary) open, result);
        }
    }
    return result;
}
/**
 * Appends the JavaScript found under the /JS entry of a dictionary to the
 * given list. The entry may be a direct string, or an indirect reference to
 * either a stream or a string; anything else is ignored.
 *
 * @param dictionary the dictionary to read /JS from
 * @param result     the list the code is appended to
 */
private static void extractJavaScript(COSDictionary dictionary, ArrayList<String> result) {
    COSBase entry = dictionary.getItem(COSName.JS);
    if (entry instanceof COSString) {
        result.add(((COSString) entry).getString());
        return;
    }
    if (!(entry instanceof COSObject)) {
        return;
    }
    //object is referenced
    COSBase target = ((COSObject) entry).getObject();
    if (target instanceof COSStream) {
        result.add(decodeStream((COSStream) target));
    } else if (target instanceof COSString) {
        result.add(((COSString) target).getString());
    }
}
/**
 * Renders the document information (author, creator, creation date, subject,
 * title, page count, producer) as a multi-line text block. Text values are
 * cut to 50 characters. If the creation date cannot be read, a warning is
 * logged and that line is left out.
 *
 * @return the formatted block, or an empty string if the document has no
 *         information dictionary
 */
public String getDocInfo() {
    PDDocumentInformation info = document.getDocumentInformation();
    if (info == null) {
        return "";
    }
    StringBuilder out = new StringBuilder();
    out.append(String.format("\nAuthor:\t\t%.50s", info.getAuthor()))
            .append(String.format("\nCreator:\t%.50s", info.getCreator()));
    try {
        out.append(String.format("\nCreationDate:\t%tF", info.getCreationDate()));
    } catch (IOException e) {
        log.warn("", e);
    }
    out.append(String.format("\nSubject:\t%.50s", info.getSubject()))
            .append(String.format("\nTitle:\t\t%.50s", info.getTitle()))
            .append(String.format("\nPages:\t\t%d", document.getNumberOfPages()))
            .append(String.format("\nProducer:\t%.50s", info.getProducer()));
    return out.toString();
}
}