// Source Code of edu.ucla.sspace.tools.ChildesParser

/*
 * Copyright 2009 Keith Stevens 
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */


package edu.ucla.sspace.tools;


import edu.ucla.sspace.common.ArgOptions;


import edu.ucla.sspace.text.IteratorFactory;


import edu.ucla.sspace.util.HashMultiMap;
import edu.ucla.sspace.util.MultiMap;


import java.io.BufferedReader;
import java.io.File;
import java.io.IOError;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringReader;


import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;


import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;


import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;




/**
 * A simple xml parser for the Childes corpus.  Words in each utterance will be
 * extracted from the XML and saved into a specified file.  The resulting
 * document may consist of all utterances in an XML file or a single utterance,
 * where a single xml file generates multiple documents.
 *
 * @author Keith Stevens
 * @author David Jurgens
 */
public class ChildesParser {


    /**
     * A writer for writing utterances.
     */
    private PrintWriter writer;


    /**
     * A writer for writing part of speech tags for words in Childes.
     */
    private PrintWriter posWriter;


    /**
     * A map from strings to their parts of speech tags.
     */
    private MultiMap<String, String> posTags;


    /**
     * {@code true} if the parser should generate augmented utterances from the
     * comments when parsing
     */
    private final boolean generateAugmented;


    /**
     * {@code true} if the parser should separate sentences with periods.
     */
    private final boolean separateByPeriod;


    /**
     * {@code true} if the parser should append pos tags to each token in the
     * corpus.  This is useful when the corpus needs the tags to be aligned with
     * the text.  The format of each token will be TOKEN-POS.
     */
    private final boolean appendPosTags;


    /**
     * {@code true} if the parser shold generate one document for all the text
     * processed.
     */
    private final boolean generateOneDoc;


    /**
     * Creates the {@code ChildesParser}.   The given file name will be used to
     * write the extracted words.
     */
    public ChildesParser(String outFile,
                         String posOutFile,
                         boolean generateAugmented,
                         boolean separateByPeriod,
                         boolean appendPosTags,
                         boolean generateOneDoc) {
        this.generateAugmented = generateAugmented;
        this.separateByPeriod = separateByPeriod;
        this.appendPosTags = appendPosTags;
        this.generateOneDoc = generateOneDoc;
        posTags = new HashMultiMap<String, String>();
        try {
            writer = new PrintWriter(outFile);
            posWriter = new PrintWriter(posOutFile);
        } catch (IOException ioe) {
            throw new IOError(ioe);
        }
    }


    /**
     * Writes strings to the resulting file.
     */
    private synchronized void print(String output) {
        if (generateOneDoc) 
            writer.print(output);
        else 
            writer.println(output);
    }


    /**
     * Parses a single xml file.  If {@code utterancePerDoc} is true, each
     * utterance will be on a separate line, otherwise they will all be
     * concantanated, and separated by periods, and stored on a single line.
     */
    public void parseFile(File file, boolean utterancePerDoc) {
        try {
            // Build an xml document.
            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
            DocumentBuilder db = dbf.newDocumentBuilder();
            Document doc = db.parse(file);


            // Extract all utterances.
            NodeList utterances = doc.getElementsByTagName("u");
            StringBuilder fileBuilder = new StringBuilder();
            for (int i = 0; i < utterances.getLength(); ++i) {
                // Extract all words from the utterance
                Element item = (Element) utterances.item(i);
                NodeList words = item.getElementsByTagName("w");
                StringBuilder utteranceBuilder = new StringBuilder();


                // Iterate over the words and get just the word text.
                List<String> wordsInUtterance =
                    new ArrayList<String>(words.getLength());
                for (int j = 0; j < words.getLength(); ++j) {
                    // Get the part of speech tag.
                    Element wordNode = (Element) words.item(j);
                    NodeList posNodeList = wordNode.getElementsByTagName("pos");
                    String word = wordNode.getFirstChild().getNodeValue();
                    if (posNodeList.getLength() > 0) {
                        Node posNode = 
                            posNodeList.item(0).getFirstChild().getFirstChild();
                        String pos = posNode.getNodeValue();
                        posTags.put(word, pos);
                        if (appendPosTags)
                            word += "-" + pos;
                    }
                    wordsInUtterance.add(word);
                }


                // Each of the <a> nodes contains additional information about
                // the currnet utterances.  This may be syntactic information,
                // comments on the scene, descriptions of the action, or
                // clarification by the observer.  For all comments but the
                // syntactic, use the comment text to create new pseudo
                // utterances by combining tokens from the utterance with
                // pseudo-tokens in the comment.  The pseudo-tokens have a
                // "-GROUNDING" suffix which distiguishes them from tokens
                // actually present in the uttered speech.
                NodeList auxNodes = item.getElementsByTagName("a");
                List<String> augmentedUtterances = new LinkedList<String>();
                if (generateAugmented) {
                    for (int j = 0; j < auxNodes.getLength(); ++j) {
                        // Get any comment for the utterance
                        Node n = auxNodes.item(j);
                        String auxNodeType = n.getAttributes().
                            getNamedItem("type").getNodeValue();


                        // Get only those nodes that contain comments or
                        // descriptions on the utterance that may be used to
                        // ground the words being referred to.
                        if (auxNodeType.equals("action")
                            || auxNodeType.equals("actions")
                            || auxNodeType.equals("addressee")
                            || auxNodeType.equals("comments")
                            || auxNodeType.equals("explanation")
                            || auxNodeType.equals("gesture")
                            || auxNodeType.equals("happening")
                            || auxNodeType.equals("situation")) {
                            
                            String commentOnUtterance = 
                                n.getFirstChild().getNodeValue();
                            // Use the iterator factory to tokenize in the event
                            // that the user has specified some form of token
                            // filtering
                            Iterator<String> tokenIter = 
                                IteratorFactory.tokenize(
                                        new BufferedReader(
                                            new StringReader(
                                                commentOnUtterance)));
                            // For each of the tokens in the additional
                            // information, create a pseudo-utterance using a
                            // word from the actual utterance and an
                            // grounding-token
                            while (tokenIter.hasNext()) {
                                String token = tokenIter.next();
                                for (String word : wordsInUtterance) {
                                    augmentedUtterances.add(word + " "
                                        + token + "-GROUNDING");
                                }
                            }
                        }
                    }
                }
                    
                // Write the utterance if an utterance is a document.
                for (Iterator<String> it = wordsInUtterance.iterator(); 
                         it.hasNext(); ) {
                    utteranceBuilder.append(it.next());
                    if (it.hasNext())
                        utteranceBuilder.append(" ");
                }
                String utterance = utteranceBuilder.toString();
                if (utterancePerDoc) {
                    print(utterance);
                    // Print all the psuedo utterances constructed from the
                    // comments
                    for (String aug : augmentedUtterances)
                        print(aug);
                }
                else {  // otherwise save the utterance.
                    fileBuilder.append(utterance);
                    if (separateByPeriod)
                        fileBuilder.append(".");
                    fileBuilder.append(" ");


                    // Print all the psuedo utterances constructed from the
                    // comments.  Unlike the utterances, print these as separate
                    // documents to avoid having them register as co-occurrences
                    // with other utterances.
                    for (String aug : augmentedUtterances)
                        print(aug);
                }
            }


            // Write all the utterances if the whole xml file is a document.
            if (!utterancePerDoc)
                print(fileBuilder.toString());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }


    /**
     * Finalizes the writing of documents.
     */
    public void finish() {
        for (Map.Entry<String, String> entry : posTags.entrySet()) {
            posWriter.println(entry.getKey() + " " + entry.getValue());
        }
        posWriter.flush();
        posWriter.close();


        writer.flush();
        writer.close();
    } 


    public static void main(String[] args) {


        // Add the options.
        ArgOptions options = new ArgOptions();
        options.addOption('p', "partOfSpeechTag",
                          "If set, each token will be appended with it's " +
                          "part of speech tag, such as cat-noun",
                          false, null, "Optional");
        options.addOption('S', "separateByPeriod",
                          "If set, seperates sentences by periods",
                          false, null, "Optional");
        options.addOption('U', "utterancePerDoc",
                          "If set, one utterance is considered a document, " +
                          "otherwise all uterances in a file will be " +
                          "considered a document",
                          false, null, "Optional");
        options.addOption('g', "generateOneDoc",
                          "If set, only one document will be generated for " +
                          "all the text processed",
                          false, null, "Optional");


        options.addOption('A', "augmentedUtterances",
                          "Generates augmented utterances from comments " +
                          "about the utterances", false, null, "Augmented");
        options.addOption('F', "augmentedUtterancesFilter",
                          "Specifes a token filter for which tokens in " +
                          "comments are used to generate augmented utterances",
                          true, "SPEC", "Augmented");


        options.addOption('d', "baseChildesDirectory",
                          "The base childes directory.  XML files will be " +
                          "searched for recursively from this base.  Use of " +
                          "this overrides the fileList option.",
                          true, "DIRECTORY", "Required (At least one of)");
        options.addOption('f', "fileList",
                          "The list of files to process",
                          true, "FILE[,FILE]*", "Required (At least one of)");


        // Process the options and emit errors if any required options are
        // missing.
        options.parseOptions(args);
        if ((!options.hasOption("fileList") &&
             !options.hasOption("baseChildesDirectory")) ||
             options.numPositionalArgs() != 2) {
            System.out.println(
                    "usage: java ChildesParser [options] " +
                    "<outfile> <pos-file>\n" +
                    options.prettyPrint());
            return;
        }


        // The default is to have all utterances from a conversation be in a
        // single document
        boolean utterancePerDoc = false;
        utterancePerDoc = options.hasOption("utterancePerDoc");


        boolean genAugmented = options.hasOption("augmentedUtterances");
        if (genAugmented && options.hasOption("augmentedUtterancesFilter")) {
            String filterConf = 
                options.getStringOption("augmentedUtterancesFilter");
            Properties p = System.getProperties();
            p.setProperty(IteratorFactory.TOKEN_FILTER_PROPERTY, filterConf);
            IteratorFactory.setProperties(p);
        }


        ChildesParser parser = new ChildesParser(options.getPositionalArg(0),
                                                 options.getPositionalArg(1),
                                                 genAugmented,
                                                 options.hasOption('S'),
                                                 options.hasOption('p'),
                                                 options.hasOption('g'));


        // Process the given file list, if provided.
        if (options.hasOption("fileList")) {
            String[] files = options.getStringOption("fileList").split(",");
            for (String file : files)
                parser.parseFile(new File(file), utterancePerDoc);
        } else {
            // Otherwise search for xml files to process.
            File baseDir =
                new File(options.getStringOption("baseChildesDirectory"));
            findXmlFiles(parser, utterancePerDoc, baseDir);
        }


        parser.finish();
    }


    /**
     * Recursively finds any xml documents to parse.
     */
    public static void findXmlFiles(ChildesParser parser,
                                    boolean utterancePerDoc,
                                    File directory) {
        File[] files = directory.listFiles();
        for (File file : files) {
            if (file.isDirectory())
                findXmlFiles(parser, utterancePerDoc, file);
            else if (file.isFile() && file.getPath().endsWith(".xml"))
                parser.parseFile(file, utterancePerDoc);
        }
    }
}
// Source Code of edu.ucla.sspace.tools.ChildesParser

// Related Classes of edu.ucla.sspace.tools.ChildesParser