Package joshua.decoder.segment_file.sax_parser

Source Code of joshua.decoder.segment_file.sax_parser.SAXSegmentParser

/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.decoder.segment_file.sax_parser;

import joshua.decoder.segment_file.TypeCheckingException;
import joshua.decoder.segment_file.SegmentFileParser;
import joshua.decoder.segment_file.Segment;
import joshua.decoder.segment_file.ConstraintSpan;
import joshua.decoder.segment_file.ConstraintRule;
import joshua.util.Regex;
import joshua.util.CoIterator;

import org.xml.sax.Attributes;
import org.xml.sax.Locator;
import org.xml.sax.SAXParseException;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import java.io.InputStream;
import java.io.FileInputStream;
import java.io.File;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.EmptyStackException;
import java.util.Stack;
import java.util.logging.Level;
import java.util.logging.Logger;


// MAJOR TODO: We need to figure out how to make a *decent* Locator
// object for our InputStream. This will allow us to (meaningfully)
// construct SAXParseExceptions and call the handler methods on our
// ErrorHandler interface (which should be updated to print location
// info). As it stands, we use the interface for logging, but we
// have no location info.
//
// TODO: we should also define helper methods akin to the ErrorHandler
// interface, but which just take a message string and do all the
// boilerplate for us. See, in particular, the bug notes about
// mutable/volatile Locators.


/**
* This is a Xerces SAX parser for parsing files in the format
* specified by SegmentFile.dtd. All extraneous tags and text are
* ignored (they raise warnings, but do not halt the parser).
*
* @author wren ng thornton <wren@users.sourceforge.net>
* @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $
*/
public class SAXSegmentParser extends DefaultHandler
implements SegmentFileParser {
 
  /**
   * Co-iterator for consuming output. It is assigned at the start
   * of each call to parseSegmentFile; every successfully completed
   * segment is passed to its coNext method, and its finish method
   * is called when that parse ends.
   */
  private CoIterator<Segment> coit;
 
  /**
   * A Locator (or equivalent information) is needed to create
   * SAXParseExceptions. If this object is mutable or volatile
   * then it should not be shared by multiple SAXParseExceptions,
   * since that would allow locations to move out from under
   * the exception. See the bug notes in parseSegmentFile.
   */
  private Locator locator;
 
  // For maintaining context
  // Set once the first start tag has been seen; that tag is taken
  // to be the document root and is accepted without a warning.
  private boolean             seenRootTag = false;
  // One text buffer per currently open element: pushed by
  // startElement, appended to by characters, popped by endElement.
  private Stack<StringBuffer> tempText;
  // Segment under construction: created at <seg>, passed to the
  // co-iterator and cleared at </seg>.
  private SAXSegment          tempSeg;
  // Span under construction: created at <span>, added to tempSeg
  // and cleared at </span>.
  private SAXConstraintSpan   tempSpan;
  // Rule under construction: created at <constraint>, added to
  // tempSpan and cleared at </constraint>.
  private SAXConstraintRule   tempRule;
 
 
  /** Logger shared by all instances, named after this class. */
  private static final Logger logger =
    Logger.getLogger(SAXSegmentParser.class.getName());
 
 
  /**
   * Creates a parser whose stack of text buffers is initially empty.
   */
  public SAXSegmentParser() {
    this.tempText = new Stack<StringBuffer>();
  }
 
 
//===============================================================
// SegmentFileParser Methods
//===============================================================

  public void parseSegmentFile(InputStream in, CoIterator<Segment> coit)
  throws IOException {
    if (null == coit) {
      // Maybe just return immediately instead?
      // Maybe use NullPointerException instead?
      throw new IllegalArgumentException("null co-iterator");
    }
    this.coit = coit;
   
    // TODO: maybe ensure that this.tempText is the empty Stack?
   
    SAXParserFactory spf = SAXParserFactory.newInstance();
    try {
      try {
        SAXParser sp = spf.newSAXParser();
       
// FIXME: this Locator is trivial, returning the "unavailable" value
// for all methods. This works because we never actually call these
// methods ourselves, but it's circumventing the very idea of Locator
// just so that we can use the ErrorHandler interface.
//
// BUG: also, if the returned values ever change, it's buggy to
// share this among different errors. We should always generate a
// new one at exception-throwing time based on the InputStream state.
        this.locator = new Locator() {
          public String getPublicId()     { return null; }
          public String getSystemId()     { return null; }
          public int    getLineNumber()   { return -1;   }
          public int    getColumnNumber() { return -1;   }
        };
        sp.parse(in, this);
       
// FIXME: see the bug notes above and below.
//        final LocatorInputStream lin = new LocatorInputStream(in);
//        this.locator = new Locator() {
//          public String getPublicId()  { return null; } // FIXME
//          public String getSystemId()  { return null; } // FIXME
//          public int getLineNumber()   { return lin.getLineNumber(); }
//          public int getColumnNumber() { return lin.getColumnNumber(); }
//        };
//        sp.parse(lin, this);
//
// BUG: the SAXParser (or FileInputStream?) reads in a huge buffer
// at a time, so the line/column numbers from LocatorInputStream
// will be very wrong, even moreso than specified by the Locator
// interface.  I'm not sure how we can actually determine where in
// the buffer the SAXParser is when it sends us the sax events that
// cause errors...
//      } catch (SAXParseException e) {
//        // TODO: something better
//        IOException ioe = new IOException(
//          "SAXParseException before line "
//          + e.getLineNumber()
//          + ", column " + e.getColumnNumber()
//          + ": " + e.getMessage());
//        ioe.initCause(e);
//        throw ioe;
       
      } catch (SAXException e) { // other than SAXParseException
        // TODO: something better
        IOException ioe = new IOException(
          "SAXException: " + e.getMessage());
        ioe.initCause(e);
        throw ioe;
       
      } catch (ParserConfigurationException e) {
        // TODO: something better
        IOException ioe = new IOException(
          "ParserConfigurationException: " +  e.getMessage());
        ioe.initCause(e);
        throw ioe;
      }
     
      // BUG: Do we need to close() the InputStream, or does parse() handle that? Or should clients handle that?
     
    } finally {
      coit.finish();
    }
  }
 
 
//===============================================================
// org.xml.sax.ContentHandler non-default event handlers
// (overriding DefaultHandler)
//===============================================================
  // BUG: we need to deal with uri+localName vs qName, in
  // case anyone messes around with XML namespaces. (Assuming
  // they're always absent, like we do, is a bug.) But that'll
  // require putting SegmentFile.dtd in a public known location
  // (with version numbers!) so people can refer to it; too
  // much bother for now.
 
  public void startElement(
    String uri, String localName, String qName, Attributes attributes)
  throws SAXException {
    this.tempText.push(new StringBuffer());
   
    if (! this.seenRootTag) {
      // Flag for so we don't warn on seeing the root tag
      // This is only because it's not specified in the DTD
      this.seenRootTag = true;
     
    } else if ("seg".equalsIgnoreCase(qName)) {
      String id = attributes.getValue("id");
      if (null == id) {
        this.error(new SAXParseException(
          "Missing 'id' attribute for tag <seg>", this.locator));
      } else {
        this.tempSeg = new SAXSegment(id);
      }
     
    } else if ("span".equalsIgnoreCase(qName)) {
      // TODO: helper method to combine these two blocks
      int start; {
        String startString = attributes.getValue("start");
        if (null == startString) {
          this.error(new SAXParseException(
            "Missing 'start' attribute for tag <span>",
            this.locator));
        }
        try {
          start = Integer.parseInt(startString);
        } catch (NumberFormatException e) {
          this.error(new SAXParseException(
            "Malformed 'start' attribute for tag <span>. Must be an integer, found: " + startString,
            this.locator, e));
         
          // Unreachable, but javac fails if this isn't here
          start = -1;
        }
      }
     
      int end; {
        String endString = attributes.getValue("end");
        if (null == endString) {
          this.error(new SAXParseException(
            "Missing 'end' attribute for tag <span>",
            this.locator));
        }
        try {
          end = Integer.parseInt(endString);
        } catch (NumberFormatException e) {
          this.error(new SAXParseException(
            "Malformed 'end' attribute for tag <span>. Must be an integer, found: " + endString,
            this.locator, e));
         
          // Unreachable, but javac fails if this isn't here
          end = -1;
        }
      }
     
      boolean hard; {
        String hardString = attributes.getValue("hard");
        // Boolean.parseBoolean is too permissive
        if (null == hardString) {
          hard = false;
        } else if ("true".equalsIgnoreCase(hardString)) {
          hard = true;
        } else if ("false".equalsIgnoreCase(hardString)) {
          hard = false;
        } else {
          this.error(new SAXParseException(
            "Malformed 'hard' attribute for tag <span>. Must be \"true\" or \"false\", found: " + hardString,
            this.locator));
         
          // Unreachable, but javac fails if this isn't here
          hard = false;
        }
      }
     
      try {
        this.tempSpan = new SAXConstraintSpan(start, end, hard);
      } catch (TypeCheckingException e) {
        this.error(new SAXParseException(null, this.locator, e));
      }
     
    } else if ("constraint".equalsIgnoreCase(qName)) {
      this.tempRule = new SAXConstraintRule();
     
    } else if ("lhs".equalsIgnoreCase(qName)) {
      // TODO: anything?
     
    } else if ("rhs".equalsIgnoreCase(qName)) {
      this.tempRule.setFeatures(
        attributes.getValue("features")); // #IMPLIED, no check for null
     
    } else {
      logger.warning("Skipping unknown tag: " + qName);
    }
  }
 
 
  public void characters(char[] ch, int start, int length)
  throws SAXException {
    try {
      this.tempText.peek().append(new String(ch, start, length));
     
    } catch (EmptyStackException e) {
      // TODO: maybe we should throw an Error instead?
      this.fatalError(new SAXParseException(
        "The impossible happened", this.locator, e));
    }
  }
 
 
  // TODO: this *is* ignorable afterall, should we just ignore it?
  public void ignorableWhitespace(char[] ch, int start, int length)
  throws SAXException {
    this.characters(ch, start, length);
  }
 
 
  public void endElement(String uri, String localName, String qName)
  throws SAXException {
    try {
      String text = this.tempText.peek().toString();
     
      // BUG: debug for pushing nulls due to malformed files
      if ("seg".equalsIgnoreCase(qName)) {
        if (null == this.tempSeg) {
          this.fatalError(new SAXParseException(
            "Found </seg> but segment was null (missing root tag?)",
            this.locator));
        } else {
          try {
            Segment seg  = this.tempSeg.typeCheck(text);
            this.tempSeg = null;
            this.coit.coNext(seg);
           
          } catch (TypeCheckingException e) {
            this.error(new SAXParseException(
              null, this.locator, e));
          }
        }
       
      } else if ("span".equalsIgnoreCase(qName)) {
        ignoringTextWarning(qName, text);
        this.tempSeg.addSpan(this.tempSpan);
        this.tempSpan = null;
       
      } else if ("constraint".equalsIgnoreCase(qName)) {
        ignoringTextWarning(qName, text);
        this.tempSpan.addRule(this.tempRule);
        this.tempRule = null;
       
      } else if ("lhs".equalsIgnoreCase(qName)) {
        this.tempRule.setLhs(text);
       
      } else if ("rhs".equalsIgnoreCase(qName)) {
        this.tempRule.setRhs(text);
       
      } else {
        ignoringTextWarning(qName, text);
      }
     
      this.tempText.pop();
     
    } catch (EmptyStackException e) {
      // TODO: maybe we should throw an Error instead?
      this.fatalError(new SAXParseException(
        "The impossible happened", this.locator, e));
    }
  }
 
  private static final Regex WHITESPACE_ONLY = new Regex("^\\s*$");
  private void ignoringTextWarning(String qName, String text) {
    if (logger.isLoggable(Level.WARNING)
    && ! WHITESPACE_ONLY.matches(text)) {
      String cleanText = Regex.spaces.replaceAll(text, " ").trim();
      logger.warning(
        "Ignoring extraneous text in <" + qName + ">: " + cleanText);
    }
  }
 
 
//===============================================================
// org.xml.sax.ErrorHandler event handlers (overriding DefaultHandler)
//===============================================================
  // TODO: print the location info in the SAXParseException as well
 
  /**
   * Respond to recoverable warnings.
   */
  public void warning(SAXParseException e) throws SAXException {
    if (logger.isLoggable(Level.WARNING))
      logger.warning(e.getMessage());
  }
 
  /**
   * Respond to recoverable errors like validity violations.
   */
  public void error(SAXParseException e) throws SAXException {
    // FIXME: is that the right logging level?
    if (logger.isLoggable(Level.WARNING))
      logger.warning(e.getMessage());
    throw e;
  }
 
  /**
   * Respond to non-recoverable errors like well-formedness
   * violations.
   */
  public void fatalError(SAXParseException e) throws SAXException {
    if (logger.isLoggable(Level.SEVERE))
      logger.severe(e.getMessage());
    throw e;
  }
 
 
//===============================================================
// Main method (for debugging, should be moved to ./test/ somewhere)
//===============================================================
  public static void main(String[] args) throws IOException {
    if (1 != args.length) {
      System.out.println("Usage: java SAXSegmentParser segmentFile.xml");
      System.exit(1);
    }
   
    final List<Segment> segments = new LinkedList<Segment>();
    new SAXSegmentParser().parseSegmentFile(
      new FileInputStream(new File(args[0])),
      new CoIterator<Segment>() {
        public void coNext(Segment segment) { segments.add(segment); }
        public void finish() {}
      });
   
    int segs = 0;
    for (Segment s : segments) {
      ++segs;
     
      System.out.println(s.sentence());
      int spans = 0;
      for (ConstraintSpan span : s.constraints()) {
        ++spans;
       
        System.out.println(
          "<span start=\"" + span.start()
          + "\" end=\"" + span.end()
          + "\" hard=\"" + span.isHard()
          + "\">");
        int rs = 0;
        for (ConstraintRule rule : span.rules()) {
          ++rs;
          System.out.println(
            "<lhs>" + rule.lhs()
            // FIXME: array printing is buggy in Java
            + "</lhs><rhs features=\"" + rule.features()
            + "\">" + rule.foreignRhs()
            + " ||| " + rule.nativeRhs() + "</rhs>");
        }
        System.out.println("# Parsed " + rs + " rules for this span");
        System.out.println("</span>");
      }
      System.out.println("# Parsed " + spans + " spans for this segment");
      System.out.println("");
    }
    System.out.println("# Parsed " + segs + " segments");
  }
}
TOP

Related Classes of joshua.decoder.segment_file.sax_parser.SAXSegmentParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.