package org.maltparserx.parser;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.Formatter;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.log4j.FileAppender;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.log4j.PatternLayout;
import org.maltparserx.core.config.ConfigurationDir;
import org.maltparserx.core.config.ConfigurationException;
import org.maltparserx.core.config.ConfigurationRegistry;
import org.maltparserx.core.exception.MaltChainedException;
import org.maltparserx.core.helper.SystemLogger;
import org.maltparserx.core.helper.URLFinder;
import org.maltparserx.core.io.dataformat.DataFormatInstance;
import org.maltparserx.core.options.OptionManager;
import org.maltparserx.core.propagation.PropagationManager;
import org.maltparserx.core.symbol.SymbolTableHandler;
import org.maltparserx.core.syntaxgraph.DependencyStructure;
import org.maltparserx.parser.guide.ClassifierGuide;
/**
 * Parser configuration that drives a single parsing algorithm: a
 * {@link Trainer} when running in {@link #LEARN} mode or a {@link Parser}
 * when running in {@link #PARSE} mode.
 *
 * <p>An instance is set up with {@link #initialize}, fed sentences through
 * {@link #process}, {@link #parse} or {@link #oracleParse}, and shut down with
 * {@link #terminate}.
 *
 * @author Johan Hall
 */
public class SingleMalt implements DependencyParserConfig {
    /** Mode value selecting the training algorithm. */
    public static final int LEARN = 0;
    /** Mode value selecting the parsing algorithm. */
    public static final int PARSE = 1;

    /** Configuration directory supplied to {@link #initialize}. */
    protected ConfigurationDir configDir;
    /** Logger used for configuration messages; see {@link #initConfigLogger}. */
    protected Logger configLogger;
    /** Index of the option container that all option look-ups are made against. */
    protected int optionContainerIndex;
    /** The active algorithm; stays {@code null} until {@link #initParsingAlgorithm()} has run. */
    protected Algorithm parsingAlgorithm = null;
    /** Current mode, expected to be {@link #LEARN} or {@link #PARSE}. */
    protected int mode;
    protected ConfigurationRegistry registry;
    protected SymbolTableHandler symbolTableHandler;
    protected DataFormatInstance dataFormatInstance;
    /** Wall-clock time (ms) taken at the start of {@link #initialize}. */
    protected long startTime;
    /** Wall-clock time (ms) taken in {@link #terminate}. */
    protected long endTime;
    /** Not read or written anywhere in this class. */
    protected int nIterations = 0;
    /** Only created when the "singlemalt"/"propagation" option is non-empty. */
    protected PropagationManager propagationManager;
    /** Same object as {@link #parsingAlgorithm} when in {@link #PARSE} mode. */
    private Parser parser;
    /** Same object as {@link #parsingAlgorithm} when in {@link #LEARN} mode. */
    private Trainer trainer;

    /**
     * Prepares this configuration for use: stores the arguments, starts the
     * timer, sets up the configuration logger, fills the registry, loads the
     * optional propagation specification and creates the parsing algorithm.
     * In {@link #LEARN} mode the option dependencies are checked as well.
     *
     * @param containerIndex the option container index used for option look-ups
     * @param dataFormatInstance the data format instance; its symbol tables are adopted
     * @param configDir the configuration directory
     * @param mode {@link #LEARN} or {@link #PARSE}
     * @throws MaltChainedException if any of the initialization steps fails
     */
    public void initialize(int containerIndex, DataFormatInstance dataFormatInstance, ConfigurationDir configDir, int mode) throws MaltChainedException {
        this.optionContainerIndex = containerIndex;
        this.mode = mode;
        setConfigurationDir(configDir);
        startTime = System.currentTimeMillis();
        configLogger = initConfigLogger(getOptionValue("config", "logfile").toString(), getOptionValue("config", "logging").toString());
        registry = new ConfigurationRegistry();
        this.dataFormatInstance = dataFormatInstance;
        symbolTableHandler = dataFormatInstance.getSymbolTables();
        if (mode == SingleMalt.LEARN) {
            checkOptionDependency();
        }
        registry.put(org.maltparserx.core.symbol.SymbolTableHandler.class, getSymbolTables());
        registry.put(org.maltparserx.core.io.dataformat.DataFormatInstance.class, dataFormatInstance);
        initPropagation();
        initParsingAlgorithm();
        logDataFormats(configDir);
    }

    /**
     * Writes the input/output data format locations to the configuration
     * logger at info level. Nothing is logged when info is disabled or when
     * there is no input format URL. A single line is written when the output
     * format is absent or identical to the input format.
     */
    private void logDataFormats(ConfigurationDir dir) throws MaltChainedException {
        if (!configLogger.isInfoEnabled()) {
            return;
        }
        URL inputFormatURL = dir.getInputFormatURL();
        URL outputFormatURL = dir.getOutputFormatURL();
        if (inputFormatURL == null) {
            return;
        }
        String input = inputFormatURL.toString();
        if (outputFormatURL == null || outputFormatURL.toString().equals(input)) {
            configLogger.info(" Data Format : "+stripArchivePrefix(input)+"\n");
        } else {
            configLogger.info(" Input Data Format : "+stripArchivePrefix(input)+"\n");
            configLogger.info(" Output Data Format : "+stripArchivePrefix(outputFormatURL.toString())+"\n");
        }
    }

    /**
     * Returns the part of {@code location} that follows the first {@code '!'},
     * or {@code location} itself when it contains no {@code '!'}.
     */
    private static String stripArchivePrefix(String location) {
        int index = location.indexOf('!');
        return index == -1 ? location : location.substring(index + 1);
    }

    /**
     * Loads the propagation specification named by the
     * "singlemalt"/"propagation" option. Does nothing when that option is
     * missing or empty. In {@link #LEARN} mode the specification is first
     * copied into the configuration directory and the option is overloaded
     * with the value returned by that copy.
     */
    private void initPropagation() throws MaltChainedException {
        // Test the raw value before converting it; checking after toString() would be too late.
        Object propagationOption = getOptionValue("singlemalt", "propagation");
        if (propagationOption == null) {
            return;
        }
        String propagationSpecFileName = propagationOption.toString();
        if (propagationSpecFileName == null || propagationSpecFileName.length() == 0) {
            return;
        }
        propagationManager = new PropagationManager(configDir);
        if (mode == SingleMalt.LEARN) {
            propagationSpecFileName = configDir.copyToConfig(propagationSpecFileName);
            OptionManager.instance().overloadOptionValue(optionContainerIndex, "singlemalt", "propagation", propagationSpecFileName);
        }
        getConfigLogger().info(" Propagation : " + propagationSpecFileName+"\n");
        propagationManager.loadSpecification(propagationSpecFileName);
    }

    /**
     * Creates the parsing algorithm for the current mode: a
     * {@link BatchTrainer} for {@link #LEARN}, a {@link DeterministicParser}
     * for {@link #PARSE}. Any other mode leaves the algorithm unset.
     *
     * @throws MaltChainedException if the algorithm cannot be created
     */
    protected void initParsingAlgorithm() throws MaltChainedException {
        if (mode == LEARN) {
            parsingAlgorithm = trainer = new BatchTrainer(this);
        } else if (mode == PARSE) {
            parsingAlgorithm = parser = new DeterministicParser(this);
        }
    }

    /**
     * Stores {@code o} in the registry under the key {@code clazz}.
     */
    public void addRegistry(Class<?> clazz, Object o) {
        registry.put(clazz, o);
    }

    /**
     * Processes one sentence according to the current mode.
     *
     * <p>In {@link #LEARN} mode {@code arguments[0]} is the system graph and
     * {@code arguments[1]} the gold graph; the sentence is only processed when
     * the system graph has tokens and a guide is available. In
     * {@link #PARSE} mode {@code arguments[0]} is the graph to parse.
     *
     * @param arguments the dependency structures described above
     * @throws MaltChainedException if the arguments are missing or of the
     *         wrong type, or if the algorithm fails
     */
    public void process(Object[] arguments) throws MaltChainedException {
        if (mode == LEARN) {
            if (arguments == null || arguments.length < 2 || !(arguments[0] instanceof DependencyStructure) || !(arguments[1] instanceof DependencyStructure)) {
                throw new MaltChainedException("The single malt learn task must be supplied with at least two dependency structures. ");
            }
            DependencyStructure systemGraph = (DependencyStructure)arguments[0];
            DependencyStructure goldGraph = (DependencyStructure)arguments[1];
            if (systemGraph.hasTokens()) {
                ClassifierGuide guide = getGuide();
                if (guide != null) {
                    guide.finalizeSentence(((Trainer)getAlgorithm()).parse(goldGraph, systemGraph));
                }
            }
        } else if (mode == PARSE) {
            // NOTE(review): the message mentions an output structure, but only one argument is required here.
            if (arguments == null || arguments.length < 1 || !(arguments[0] instanceof DependencyStructure)) {
                throw new MaltChainedException("The single malt parse task must be supplied with at least one input terminal structure and one output dependency structure. ");
            }
            DependencyStructure processGraph = (DependencyStructure)arguments[0];
            if (processGraph.hasTokens()) {
                parser.parse(processGraph);
            }
        }
    }

    /**
     * Parses {@code graph} with the parser; graphs without tokens are skipped.
     *
     * @param graph the dependency structure to parse
     * @throws MaltChainedException if parsing fails
     */
    public void parse(DependencyStructure graph) throws MaltChainedException {
        if (graph.hasTokens()) {
            parser.parse(graph);
        }
    }

    /**
     * Runs the trainer on a gold graph, writing into {@code oracleGraph}.
     * When a guide is available the result is passed to its
     * {@code finalizeSentence}; otherwise the result is discarded. Oracle
     * graphs without tokens are skipped.
     *
     * @param goldGraph the gold-standard dependency structure
     * @param oracleGraph the dependency structure the trainer works on
     * @throws MaltChainedException if the trainer or the guide fails
     */
    public void oracleParse(DependencyStructure goldGraph, DependencyStructure oracleGraph) throws MaltChainedException {
        if (!oracleGraph.hasTokens()) {
            return;
        }
        ClassifierGuide guide = getGuide();
        if (guide != null) {
            guide.finalizeSentence(trainer.parse(goldGraph, oracleGraph));
        } else {
            trainer.parse(goldGraph, oracleGraph);
        }
    }

    /**
     * Invokes {@code train()} on the trainer, but only when no guide is
     * available.
     *
     * @throws MaltChainedException if training fails
     */
    public void train() throws MaltChainedException {
        // NOTE(review): training is skipped whenever a guide exists - confirm this is intended.
        if (getGuide() == null) {
            ((Trainer)getAlgorithm()).train();
        }
    }

    /**
     * Terminates the algorithm and the guide, logs the elapsed learning or
     * parsing time and detaches the appenders of a dedicated configuration
     * logger. Missing pieces (algorithm, guide, logger) are skipped so that
     * this method can also be used to clean up after a failed
     * {@link #initialize}.
     *
     * @param arguments not used by this implementation
     * @throws MaltChainedException if the algorithm or the guide fails to terminate
     */
    public void terminate(Object[] arguments) throws MaltChainedException {
        Algorithm algorithm = getAlgorithm();
        if (algorithm != null) {
            algorithm.terminate();
        }
        // Read the guide only after the algorithm has been terminated, as before.
        ClassifierGuide guide = getGuide();
        if (guide != null) {
            guide.terminate();
        }
        if (mode == LEARN) {
            logElapsedTime("Learning time: ");
        } else if (mode == PARSE) {
            logElapsedTime("Parsing time: ");
        }
        // The shared system logger keeps its appenders; only a dedicated logger is stripped.
        if (configLogger != null && SystemLogger.logger() != configLogger) {
            configLogger.removeAllAppenders();
        }
    }

    /**
     * Records the end time and, when info logging is enabled, logs the time
     * elapsed since {@link #startTime} as {@code hh:mm:ss} plus milliseconds,
     * prefixed with {@code label}.
     */
    private void logElapsedTime(String label) {
        endTime = System.currentTimeMillis();
        long elapsed = endTime - startTime;
        if (configLogger != null && configLogger.isInfoEnabled()) {
            String clock = String.format("%02d:%02d:%02d", elapsed/3600000, elapsed%3600000/60000, elapsed%60000/1000);
            configLogger.info(label+clock+" ("+elapsed+" ms)\n");
        }
    }

    /**
     * Initialize the configuration logger. A logger that appends to
     * {@code logfile} inside the working directory of the configuration
     * directory is used when {@code logfile} is non-empty, is not "stdout"
     * (ignoring case) and a configuration directory is set; otherwise the
     * system logger is used.
     *
     * @param logfile name of the log file, or {@code null}/empty/"stdout" for the system logger
     * @param level log level name; {@code INFO} is used when it cannot be resolved
     * @return the configuration logger
     * @throws MaltChainedException if the log file cannot be created
     */
    public Logger initConfigLogger(String logfile, String level) throws MaltChainedException {
        if (logfile != null && logfile.length() > 0 && !logfile.equalsIgnoreCase("stdout") && configDir != null) {
            FileAppender fileAppender;
            try {
                fileAppender = new FileAppender(new PatternLayout("%m"),configDir.getWorkingDirectory().getPath()+File.separator+logfile, true);
            } catch(IOException e) {
                throw new ConfigurationException("It is not possible to create a configuration log file. ", e);
            }
            Level threshold = Level.toLevel(level, Level.INFO);
            fileAppender.setThreshold(threshold);
            // Publish the logger only once its appender exists, so a failure above
            // does not leave an appender-less logger in the field.
            Logger fileLogger = Logger.getLogger(logfile);
            fileLogger.addAppender(fileAppender);
            fileLogger.setLevel(threshold);
            configLogger = fileLogger;
        } else {
            configLogger = SystemLogger.logger();
        }
        return configLogger;
    }

    /** @return the configuration logger, or {@code null} if none has been set */
    public Logger getConfigLogger() {
        return configLogger;
    }

    /** @param logger the configuration logger to use from now on */
    public void setConfigLogger(Logger logger) {
        configLogger = logger;
    }

    /** @return the configuration directory */
    public ConfigurationDir getConfigurationDir() {
        return configDir;
    }

    /** @param configDir the configuration directory to use from now on */
    public void setConfigurationDir(ConfigurationDir configDir) {
        this.configDir = configDir;
    }

    /** @return the current mode ({@link #LEARN} or {@link #PARSE}) */
    public int getMode() {
        return mode;
    }

    /** @return the configuration registry */
    public ConfigurationRegistry getRegistry() {
        return registry;
    }

    /** @param registry the configuration registry to use from now on */
    public void setRegistry(ConfigurationRegistry registry) {
        this.registry = registry;
    }

    /**
     * Looks up an option value in this configuration's option container.
     *
     * @param optiongroup the option group name
     * @param optionname the option name
     * @return the value reported by the option manager
     * @throws MaltChainedException if the option manager rejects the look-up
     */
    public Object getOptionValue(String optiongroup, String optionname) throws MaltChainedException {
        return OptionManager.instance().getOptionValue(optionContainerIndex, optiongroup, optionname);
    }

    /**
     * Looks up an option value as a string in this configuration's option container.
     *
     * @param optiongroup the option group name
     * @param optionname the option name
     * @return the string reported by the option manager
     * @throws MaltChainedException if the option manager rejects the look-up
     */
    public String getOptionValueString(String optiongroup, String optionname) throws MaltChainedException {
        return OptionManager.instance().getOptionValueString(optionContainerIndex, optiongroup, optionname);
    }

    /**
     * @return the shared option manager instance
     * @throws MaltChainedException declared for interface compatibility
     */
    public OptionManager getOptionManager() throws MaltChainedException {
        return OptionManager.instance();
    }

    /******************************** MaltParserConfiguration specific ********************************/

    /**
     * Returns the symbol tables taken from the data format instance during
     * {@link #initialize}.
     *
     * @return the symbol table handler
     */
    public SymbolTableHandler getSymbolTables() {
        return symbolTableHandler;
    }

    /** @return the propagation manager, or {@code null} when no propagation specification was configured */
    public PropagationManager getPropagationManager() {
        return propagationManager;
    }

    /** @return the active algorithm, or {@code null} when none has been created */
    public Algorithm getAlgorithm() {
        return parsingAlgorithm;
    }

    /**
     * Returns the guide of the active algorithm.
     *
     * @return the guide, or {@code null} when there is no algorithm or the
     *         algorithm has no guide
     */
    public ClassifierGuide getGuide() {
        return parsingAlgorithm == null ? null : parsingAlgorithm.getGuide();
    }

    /**
     * Normalizes options that depend on each other and records every
     * adjustment in the configuration information file, when one is open:
     * <ul>
     * <li>the feature model file is copied into the configuration directory
     *     (a default derived from the parsing algorithm and learner is used
     *     when none is given);</li>
     * <li>the data split column/structure options are cleared unless both are set;</li>
     * <li>the decision settings are defaulted or upper-cased and extended
     *     with the pseudo-projective decisions required by the "pproj" options.</li>
     * </ul>
     *
     * @throws MaltChainedException if an option look-up fails or the
     *         information file cannot be written
     */
    public void checkOptionDependency() throws MaltChainedException {
        try {
            writeInfo("\nDEPENDENCIES\n");

            // Copy the feature model file into the configuration directory
            String featureModelFileName = getOptionValue("guide", "features").toString().trim();
            if (featureModelFileName.equals("")) {
                // use default feature model depending on the selected parser algorithm
                OptionManager.instance().overloadOptionValue(optionContainerIndex, "guide", "features", getOptionValueString("singlemalt", "parsing_algorithm"));
                featureModelFileName = getOptionValue("guide", "features").toString().trim();
                /* START: Temp fix during development of new liblinear and libsvm interface */
                String learner = getOptionValueString("guide", "learner");
                if (!learner.startsWith("lib")) {
                    learner = "lib"+learner;
                }
                /* END: Temp fix during development of new liblinear and libsvm interface */
                featureModelFileName = featureModelFileName.replace("{learner}", learner);
                final URLFinder f = new URLFinder();
                featureModelFileName = configDir.copyToConfig(f.findURLinJars(featureModelFileName));
            } else {
                featureModelFileName = configDir.copyToConfig(featureModelFileName);
            }
            OptionManager.instance().overloadOptionValue(optionContainerIndex, "guide", "features", featureModelFileName);
            if (hasInfoFileWriter()) {
                writeInfo("--guide-features ( -F) "+getOptionValue("guide", "features").toString()+"\n");
            }

            // A data split needs both a column and a structure; with only one of them, clear it.
            String splitColumn = getOptionValue("guide", "data_split_column").toString();
            String splitStructure = getOptionValue("guide", "data_split_structure").toString();
            if (splitColumn.equals("") && !splitStructure.equals("")) {
                configLogger.warn("Option --guide-data_split_column = '' and --guide-data_split_structure != ''. Option --guide-data_split_structure is overloaded with '', this will cause the parser to induce a single model.\n ");
                OptionManager.instance().overloadOptionValue(optionContainerIndex, "guide", "data_split_structure", "");
                writeInfo("--guide-data_split_structure ( -s)\n");
            } else if (!splitColumn.equals("") && splitStructure.equals("")) {
                configLogger.warn("Option --guide-data_split_column != '' and --guide-data_split_structure = ''. Option --guide-data_split_column is overloaded with '', this will cause the parser to induce a single model.\n");
                OptionManager.instance().overloadOptionValue(optionContainerIndex, "guide", "data_split_column", "");
                writeInfo("--guide-data_split_column ( -d)\n");
            }

            // Test the raw value before converting it, so a missing option falls back to the default.
            Object rawDecisionSettings = getOptionValue("guide", "decision_settings");
            String storedDecisionSettings = rawDecisionSettings == null ? null : rawDecisionSettings.toString();
            String decisionSettings = storedDecisionSettings == null ? "" : storedDecisionSettings.trim();
            String markingStrategy = getOptionValue("pproj", "marking_strategy").toString().trim();
            String coveredRoot = getOptionValue("pproj", "covered_root").toString().trim();
            StringBuilder newDecisionSettings = new StringBuilder();
            if (decisionSettings.length() < 1 || decisionSettings.equals("default")) {
                decisionSettings = "T.TRANS+A.DEPREL";
            } else {
                // Locale.ROOT keeps the ASCII identifiers stable (e.g. no dotted capital I under a Turkish locale).
                decisionSettings = decisionSettings.toUpperCase(Locale.ROOT);
            }
            boolean marksPath = markingStrategy.equalsIgnoreCase("path") || markingStrategy.equalsIgnoreCase("head+path");
            boolean marksHead = markingStrategy.equalsIgnoreCase("head") || marksPath;
            if (marksHead && !decisionSettings.contains("A.PPLIFTED")) {
                newDecisionSettings.append("+A.PPLIFTED");
            }
            if (marksPath && !decisionSettings.contains("A.PPPATH")) {
                newDecisionSettings.append("+A.PPPATH");
            }
            if (!coveredRoot.equalsIgnoreCase("none") && !decisionSettings.contains("A.PPCOVERED")) {
                newDecisionSettings.append("+A.PPCOVERED");
            }
            if (!decisionSettings.equals(storedDecisionSettings) || newDecisionSettings.length() > 0) {
                OptionManager.instance().overloadOptionValue(optionContainerIndex, "guide", "decision_settings", decisionSettings+newDecisionSettings.toString());
                if (hasInfoFileWriter()) {
                    writeInfo("--guide-decision_settings ( -gds) "+getOptionValue("guide", "decision_settings").toString()+"\n");
                }
            }
            if (hasInfoFileWriter()) {
                configDir.getInfoFileWriter().flush();
            }
        } catch (IOException e) {
            throw new ConfigurationException("Could not write to the configuration information file. ", e);
        }
    }

    /** Tells whether the configuration directory currently exposes an information file writer. */
    private boolean hasInfoFileWriter() throws MaltChainedException {
        return configDir.getInfoFileWriter() != null;
    }

    /** Writes {@code text} to the configuration information file; does nothing when no writer is available. */
    private void writeInfo(String text) throws IOException, MaltChainedException {
        if (hasInfoFileWriter()) {
            configDir.getInfoFileWriter().write(text);
        }
    }
}