Source Code of opennlp.ccgbank.extract.LexExtract

///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White
// 
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// 
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
// 
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//////////////////////////////////////////////////////////////////////////////


//Program which creates a temp.xml file from the bareparse. temp.xml serves are the input for creating lexicon.xml & morph.xml


package opennlp.ccgbank.extract;


import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Arrays;


import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.sax.SAXResult;
import javax.xml.transform.sax.SAXSource;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;


import opennlp.ccgbank.extract.ExtractGrammar.ExtractionProperties;


import org.apache.xml.serializer.OutputPropertiesFactory;
import org.apache.xml.serializer.Serializer;
import org.apache.xml.serializer.SerializerFactory;
import org.jdom.JDOMException;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLFilter;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;




/**
 * Program which reads in each file of the bare parse xml rep and generates a lexicon, 
 * a freq tally of the lexical info and a list of ccgbank sentences.
 */
public class LexExtract{
  
  public static void extractLex(ExtractionProperties extractProps) throws TransformerException,TransformerConfigurationException,SAXException,IOException,JDOMException {
    
    System.out.println("Extracting lexicon info:");
    
    File lexFile = new File(new File(extractProps.destDir), "lexicon.xml");
    File tempFile = new File(new File(extractProps.tempDir), "temp.xml");
    PrintWriter tempOut = new PrintWriter(new FileOutputStream(tempFile),true);
    
    File ccgbankDir = new File(extractProps.srcDir);
    File[] ccgbankSections=ccgbankDir.listFiles();
    Arrays.sort(ccgbankSections);
    
    FreqTally.CAT_FREQ_CUTOFF = extractProps.catFreqCutoff;
    FreqTally.LEX_FREQ_CUTOFF = extractProps.lexFreqCutoff;
    FreqTally.OPEN_FREQ_CUTOFF = extractProps.openFreqCutoff;
    
    //temp.xml creation


    TransformerFactory tFactory = TransformerFactory.newInstance();
    Transformer lexExtrTransformer = tFactory.newTransformer(ExtractGrammar.getSource("opennlp.ccgbank/transform/lexExtr.xsl"));
    
    // add root 
    tempOut.println("<ccg-lexicon>");
    
    for (int i=extractProps.startSection; i<=extractProps.endSection; i++){
      
      System.out.println("Section " + ccgbankSections[i].getName());
      File[] files=ccgbankSections[i].listFiles();
      Arrays.sort(files);
      
      int fileStart = 0; int fileLimit = files.length;
      if (extractProps.fileNum >= 0) {
        fileStart = extractProps.fileNum;
        fileLimit = extractProps.fileNum + 1;
      }
      
      for (int j=fileStart; j<fileLimit; j++){
        String inputFile=files[j].getAbsolutePath();
        if (j == fileStart) System.out.print(files[j].getName() + " ");
        else if (j == (fileLimit-1)) System.out.println(" " + files[j].getName());
        else System.out.print(".");
        if (fileStart == fileLimit-1) System.out.println();
        try {
          lexExtrTransformer.transform(new StreamSource(inputFile), new StreamResult(tempOut));
        }
        catch (Exception exc) {
                    System.out.println("Skipping: " + inputFile);
                    System.out.println(exc.toString());
        }
        tempOut.flush();
      }
    }
    
    //Closing the root element
    tempOut.println("</ccg-lexicon>");
    tempOut.flush();
    tempOut.close();
    
    //Generating a freq tally from static datastructures
    FreqTally.printTally(extractProps);
    
    System.out.println("Generating lexicon.xml");
    
    if (tFactory.getFeature(SAXSource.FEATURE) && tFactory.getFeature(SAXResult.FEATURE)) {
      
      SAXTransformerFactory saxTFactory = ((SAXTransformerFactory) tFactory);
      
      // Create an XMLFilter for each stylesheet.
      
      // Extract lexicon from temp.xml
      XMLFilter xmlFilter0 = saxTFactory.newXMLFilter(ExtractGrammar.getSource("opennlp.ccgbank/transform/filterLex.xsl"));
      
      XMLFilter xmlFilter1 = saxTFactory.newXMLFilter(ExtractGrammar.getSource("opennlp.ccgbank/transform/closedCatInsert.xsl"));
      
      XMLFilter xmlFilter2 = saxTFactory.newXMLFilter(ExtractGrammar.getSource("opennlp.ccgbank/transform/insertLF.xsl"));


      XMLFilter xmlFilter3 = saxTFactory.newXMLFilter(ExtractGrammar.getSource("opennlp.ccgbank/transform/insertPunctLF.xsl"));


      XMLFilter xmlFilter4 = saxTFactory.newXMLFilter(ExtractGrammar.getSource("opennlp.ccgbank/transform/insertOrigPunctsLF.xsl"));


      XMLFilter xmlFilter5 = saxTFactory.newXMLFilter(ExtractGrammar.getSource("opennlp.ccgbank/transform/addFilterLexFeats.xsl"));
      
      XMLFilter xmlFilter6 = saxTFactory.newXMLFilter(ExtractGrammar.getSource("opennlp.ccgbank/transform/insertSemFeats.xsl"));
      
      XMLFilter xmlFilter7 = saxTFactory.newXMLFilter(ExtractGrammar.getSource("opennlp.ccgbank/transform/markUnmatched.xsl"));
      
      // Create an XMLReader.
      XMLReader reader = XMLReaderFactory.createXMLReader();
      
      // xmlFilter0 uses the XMLReader as its reader.
      xmlFilter0.setParent(reader);
      xmlFilter1.setParent(xmlFilter0);


      xmlFilter2.setParent(xmlFilter1);
      xmlFilter3.setParent(xmlFilter2);


      if (extractProps.lexF) {
        xmlFilter5.setParent(xmlFilter3);
        xmlFilter6.setParent(xmlFilter5);
      } else if (extractProps.origPuncts) {
        xmlFilter4.setParent(xmlFilter2);
        xmlFilter6.setParent(xmlFilter4);
      }


      else xmlFilter6.setParent(xmlFilter3);


      xmlFilter7.setParent(xmlFilter6);
      XMLFilter xmlFilter = xmlFilter7;
      
      java.util.Properties xmlProps = OutputPropertiesFactory.getDefaultMethodProperties("xml");
      xmlProps.setProperty("indent", "yes");
      xmlProps.setProperty("standalone", "no"); 
        xmlProps.setProperty("{http://xml.apache.org/xalan}indent-amount", "2");
      Serializer serializer = SerializerFactory.getSerializer(xmlProps);              
      serializer.setOutputStream(new FileOutputStream(lexFile));
      xmlFilter.setContentHandler(serializer.asContentHandler());
      xmlFilter.parse(new InputSource(tempFile.getPath()));
    }
  }
}
Source Code of opennlp.ccgbank.extract.LexExtract

Related Classes of opennlp.ccgbank.extract.LexExtract