// Source code of com.jgaap.util.Document

/*
 * JGAAP -- a graphical program for stylometric authorship attribution
 * Copyright (C) 2009,2011 by Patrick Juola
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
/**
 **/
package com.jgaap.util;


import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;


import com.jgaap.generics.AnalysisDriver;
import com.jgaap.generics.CanonicizationException;
import com.jgaap.generics.Canonicizer;
import com.jgaap.generics.EventCuller;
import com.jgaap.generics.EventDriver;
import com.jgaap.generics.Language;
import com.jgaap.generics.LanguageParsingException;
import com.jgaap.languages.English;


/**
 * Code for storing and processing individual documents of any type.
 */
public class Document {


  private String author;
  private String filepath;
  private String title;
  private String text;
  private int size;
  private Type docType;
  private Language language;
  private List<EventCuller> eventCullers;
  private List<Canonicizer> canonicizers;
  private Map<EventDriver, EventSet> eventSets;
  private Map<AnalysisDriver, List<Pair<String, Double>>> results;
  private boolean failed = false;
  
  private static final String tab = "        "; 
  
  public Document() {
    filepath = "";
    title = "";
    size = 0;
    canonicizers = new ArrayList<Canonicizer>();
    eventSets = new HashMap<EventDriver, EventSet>();
    results = new HashMap<AnalysisDriver, List<Pair<String,Double>>>();
    eventCullers = new ArrayList<EventCuller>();
    docType = Type.GENERIC;
    this.language = new English();
  }


  public Document(String filepath, String author) throws Exception {
    this(filepath, author, getTitleFromPath(filepath));
  }


  /**
   * Copy constructor. Can be used to break object references and protect a
   * Document instance from being modified by other classes.
   * 
   * @param document
   *            The document to be copied
   */
  public Document(Document document) {
    this.author = document.author;
    this.canonicizers = new ArrayList<Canonicizer>(document.canonicizers);
    this.eventCullers = new ArrayList<EventCuller>(document.eventCullers);
    this.docType = document.docType;
    this.eventSets = new HashMap<EventDriver, EventSet>(document.eventSets);
    this.results = new HashMap<AnalysisDriver, List<Pair<String,Double>>>(document.results);
    this.filepath = document.filepath;
    this.text = document.text;
    this.size = document.size;
    this.title = document.title;
    this.language = document.getLanguage();
  }


  /**
   * Constructor that takes three arguments: file path, file author, file
   * title
   * 
   * @param filepath
   *            The path to the file
   * @param author
   *            The author of the document
   * @param title
   *            The title of the document
   * @throws Exception
   */
  public Document(String filepath, String author, String title){
    this.author = author;
    if (author != null && author.equals("")) // unknown authors are null
      this.author = null;
    this.filepath = filepath;
    this.title = title;
    if (title == null || title.equals(""))
      this.title = getTitleFromPath(filepath);
    this.docType = DocumentHelper.getDocType(filepath);
    this.language = new English();
    this.eventSets = new HashMap<EventDriver, EventSet>();
    this.canonicizers = new ArrayList<Canonicizer>();
    this.eventCullers = new ArrayList<EventCuller>();
    results = new HashMap<AnalysisDriver, List<Pair<String,Double>>>();
  }
  
  public void load() throws Exception {
    if (this.docType != Type.DATABASE) {
      this.text = DocumentHelper.loadDocument(filepath, language.getCharset());
      this.size = this.text.length();
      if (this.size == 0) {
        throw new Exception("Document: "+this.filepath+" was empty.");
      }
    }
  }


  /**
   * Takes a file path and returns only the file name.
   * 
   * @param filePath
   *            the full path to the file
   * @return A document title derived from the file path.
   */
  private static String getTitleFromPath(String filePath) {
    String[] split = filePath.split("[\\\\[\\/]]");
    return split[split.length - 1];
  }
  
  public void setText(String text){
    this.text = text;
    size = text.length();
  }


  /**
   * Prints the text of the document to std out
   * 
   */
  public void print() {
    System.out.println(stringify());
  }


  /** Retrieves the author of the current document **/
  public String getAuthor() {
    return author;
  }


  /** Returns the docType of the current document **/
  public Type getDocType() {
    return docType;
  }


  /** Returns the full filepath of the current document **/
  public String getFilePath() {
    return filepath;
  }
  
  /**
   * The text of the document as a character array
   * 
   * This is only preprocessed if processCanonicizers() has been run
   * 
   * @return 
   */
  public char[] getText() {
    return text.toCharArray();
  }


  /**
   * Returns the size of the document. Size is determined by the number of
   * characters plus whitespace
   **/
  public int getSize() {
    return size;
  }


  /** Returns the title of the current document **/
  public String getTitle() {
    return title;
  }


  /** Sets the author of the current document **/
  public void setAuthor(String author) {
    this.author = author;
  }


  /** Sets the docType of the current document **/
  public void setDocType(Type docType) {
    this.docType = docType;
  }


  /** Sets the title of the current document **/
  public void setTitle(String title) {
    this.title = title;
  }


  /**
   * Clear the list of canonicizers associated with this Document.
   */
  public void clearCanonicizers() {
    canonicizers.clear();
  }


  /**
   * Add a Canonicizer to the internal list maintained by this Document.
   * 
   * @param canonicizer
   *            A new canonicizer to add to the list
   */
  public void addCanonicizer(Canonicizer canonicizer) {
    canonicizers.add(canonicizer);
  }


  /**
   * Remove a Canonicizer from the internal list maintained by this Document.
   * 
   * @param canonicizer
   *            A canonicizer to remove from the list
   * @return Returns true if a matching Canonicizer was found and removed
   */
  public boolean removeCanonicizer(Canonicizer canonicizer) {
    return canonicizers.remove(canonicizer);
  }


  /**
   * Get all the canonicizers associated with this Document.
   * 
   * return A vector of canonicizers associated with this document
   */
  public List<Canonicizer> getCanonicizers() {
    return new ArrayList<Canonicizer>(canonicizers);
  }


  /**
   * Take the list of canonicizers associated with this document and apply
   * them to the document one by one, in the same order they were added.
   */
  public void processCanonicizers() throws LanguageParsingException, CanonicizationException {
    char[] text;
    if (language.isParseable()){
      text = language.parseLanguage(this.text);
    } else {
      text = getText();
    }
    for (Canonicizer canonicizer : canonicizers) {
      text = canonicizer.process(text);
    }
    this.text = new String(text);
  }


  /**
   * Adds a mapping of an EventDriver used on this Document to the EventSet generated by using the EventDriver on it
   * @param eventDriver
   * @param eventSet
   */
  public void addEventSet(EventDriver eventDriver, EventSet eventSet) {
    eventSets.put(eventDriver, eventSet);
  }


  /**
   * Returns a map of all EventDrivers used on this Document to the EventSets generated by them
   * @return EventDrivers to EventSets
   */
  public Map<EventDriver, EventSet> getEventSets() {
    return eventSets;
  }


  /**
   * Get the EventSet generated by the passed eventDriver
   * 
   * @param eventDriver 
   * @return the eventset generated by running the passed eventDriver on this document
   */
  public EventSet getEventSet(EventDriver eventDriver) {
    return eventSets.get(eventDriver);
  }


  /**
   * Removes all EventSets generated from this document
   */
  public void clearEventSets() {
    eventSets.clear();
  }


  /** 
   * add the result of an analysis to the document
   * 
   * @param analysisDriver
   * @param list
   */
  public void addResult(AnalysisDriver analysisDriver, List<Pair<String, Double>> list) {
    results.put(analysisDriver, list);
  }
  
  /**
   * 
   * @param analysisDriver
   * @return
   */
  public List<Pair<String, Double>> getRawResult(AnalysisDriver analysisDriver){
    return results.get(analysisDriver);
  }
  
  /**
   * Generates a formatted report for the analysisDriver and eventDriver specified 
   * @param analysisDriver
   * @return Report of analysis run on this document
   */
  public String getFormattedResult(AnalysisDriver analysisDriver) {
    StringBuilder buffer = new StringBuilder();
    buffer.append(getTitle()).append(" ").append(getFilePath()).append("\n");
    buffer.append("Canonicizers: \n");
    if (canonicizers.isEmpty()) {
      buffer.append(tab).append("none\n");
    } else {
      for (Canonicizer canonicizer : canonicizers) {
        buffer.append(tab).append(canonicizer.displayName()).append(" ").append(canonicizer.getParameters()).append("\n");
      }
    }
    buffer.append("EventDrivers: \n");
    for (EventDriver eventDriver : eventSets.keySet()) {
      buffer.append(tab).append(eventDriver.displayName()).append(" ").append(eventDriver.getParameters());
      List<Canonicizer> canonicizers = eventDriver.getCanonicizers();
      if (!canonicizers.isEmpty()) {
        buffer.append("\n").append(tab).append(tab).append("Canonicizers: ");
        for (Canonicizer canonicizer : canonicizers) {
          buffer.append("\n").append(tab).append(tab).append(tab).append(canonicizer.displayName()).append(" ").append(canonicizer.getParameters());
        }
      }
      List<EventCuller> eventCullers = eventDriver.getEventCullers();
      if (!eventCullers.isEmpty()) {
        buffer.append("\n").append(tab).append(tab).append("EventCullers: ");
        for (EventCuller eventCuller : eventDriver.getEventCullers()) {
          buffer.append("\n").append(tab).append(tab).append(tab).append(eventCuller.displayName()).append(" ").append(eventCuller.getParameters());
        }
      }
      buffer.append("\n");
    }
    buffer.append("Analysis: \n").append(tab).append(analysisDriver.displayName()).append(" ").append(analysisDriver.getParameters());
    buffer.append("\n");
    int count = 0; // Keeps a relative count (adjusted for ties)
    int fullCount = 0; // Keeps the absolute count (does not count ties)
    Double lastResult = Double.NaN;
    List<Pair<String, Double>> results = getRawResult(analysisDriver);
    if (results == null) {
      return null;
    }
    for (Pair<String, Double> result : results) {
      fullCount++;
      // Account for ties
      if (!result.getSecond().equals(lastResult)) {
        count = fullCount;
      }
      lastResult = result.getSecond();
      buffer.append(count + ". " + result.getFirst() + " " + result.getSecond() + "\n");
    }
    buffer.append("\n\n");
    return buffer.toString();
  }


  /**
   * Generates and returns a formatted report of all results 
   * @return
   */
  public String getResult() {
    StringBuilder buffer = new StringBuilder();
    Set<AnalysisDriver> analysisDrivers = results.keySet();
    for (AnalysisDriver analysisDriver : analysisDrivers) {
      buffer.append(getFormattedResult(analysisDriver));
    }
    return buffer.toString();
  }
  
  public  Map<AnalysisDriver, List<Pair<String, Double>>> getRawResults() { 
    return results;
  }


  public void clearResults() {
    results.clear();
  }


  /**
   * Indicates whether this document has a known author or not.
   * 
   * @return boolean value indicating whether the author of this document is
   *         known
   */
  public boolean isAuthorKnown() {
    return (author != null);
  }


  /**
   * Convert processed document into one really long string.
   **/
  public String stringify() {
    return text;
  }


  @Override
  public String toString() {
    String string = this.getTitle()+" (";
    if(isAuthorKnown()){
      string += this.getAuthor()+")";
    } else {
      string += "unknown)";
    }
    return string;
  }


  public Language getLanguage() {
    return language;
  }


  public void setLanguage(Language language) {
    this.language = language;
  }
  
  public void failed(){
    failed = true;
  }
  
  public boolean hasFailed(){
    return failed;
  }
  
  public enum Type {
    PDF, DOC, HTML, GENERIC, DATABASE
  }
}
// Source code of com.jgaap.util.Document

// Related classes of com.jgaap.util.Document