Package edu.jhu.agiga

Source Code of edu.jhu.agiga.AgigaSentenceReader

package edu.jhu.agiga;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.logging.Level;
import java.util.logging.Logger;

import com.ximpleware.AutoPilot;
import com.ximpleware.NavException;
import com.ximpleware.ParseException;
import com.ximpleware.PilotException;
import com.ximpleware.VTDException;
import com.ximpleware.VTDGen;
import com.ximpleware.VTDNav;

import edu.jhu.agiga.AgigaConstants.DependencyForm;

/**
* Provides an iterator over AgigaSentence objects given an Annotated Gigaword
* file. This class should usually not be used directly since VTD-XML will load
* the entire XML file into memory, and requires that the file be unzipped.
* Instead, StreamingSentenceReader should be used which provides a fast,
* memory-efficient version of this iterator.
*
* This implementation using VTD-XML should handle XML files up to 2GB in size.
* For larger files, we can switch to extended VTD-XML as described here:
* <url>http://vtd-xml.sourceforge.net/codeSample/cs12.html</url>
*
* @author mgormley
*
*/
class AgigaSentenceReader implements Iterable<AgigaSentence>, Iterator<AgigaSentence> {

    private static final String NULL_NER_TAG = "0";

    private static Logger log = Logger.getLogger(AgigaSentenceReader.class.getName());

    private int numSentences;

    private VTDNav vn;
    private AutoPilot sentAp;

    private AgigaPrefs prefs;

    private int nextIdx = -1;
   
    public AgigaSentenceReader(String inputFile, AgigaPrefs prefs) {
        try {
            this.prefs = prefs;

            // Read the file into a byte array
            log.fine("Reading file into byte array");
            File f = new File(inputFile);
            InputStream fis = new FileInputStream(f);
            log.fine("File size: " + f.length());
            byte[] b = new byte[(int)f.length()];
            fis.read(b);
            fis.close();
           
            init(b);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
   
    public AgigaSentenceReader(byte[] b, AgigaPrefs prefs) {
        this.prefs = prefs;
        init(b);
    }
   
    public AgigaSentenceReader(VTDNav vn, AgigaPrefs prefs) {
        this.prefs = prefs;
        this.vn = vn;
        init();
    }

    private void init(byte[] b) {
        try {           
            // Index the xml with VTD-XML
            log.fine("Building VTD index");
            VTDGen vg = new VTDGen();
            vg.setDoc(b);
            vg.parse(false);
            vn = vg.getNav();

            numSentences = 0;
            vn.toElement(VTDNav.ROOT);

            // Initialize auto pilot
            init();
        } catch (NavException e) {
            throw new RuntimeException(e);
        } catch (ParseException e) {
            throw new RuntimeException(e);
        }
    }

    private void init() {
        try {
            sentAp = new AutoPilot(this.vn);
            //sentAp.selectXPath(String.format("//%s[@id]", AgigaConstants.SENTENCE));
            sentAp.selectXPath(String.format("//%s/%s", AgigaConstants.SENTENCES, AgigaConstants.SENTENCE));
            nextIdx  = sentAp.evalXPath();
        } catch (VTDException e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    public Iterator<AgigaSentence> iterator() {
        return this;
    }
   
    @Override
    public boolean hasNext() {
        return nextIdx != -1;
    }

    @Override
    public AgigaSentence next() {
        try {
            int sentId = vn.parseInt(vn.getAttrVal(AgigaConstants.TOKEN_ID));
            log.finer("sentence id=" + sentId);
   
            StanfordAgigaSentence agigaSent = getSentenceInstance(prefs);
            // Subtract one, since the sentences are one-indexed in the XML but
            // zero-indexed in this API
            agigaSent.setSentIdx(sentId - 1);
           
            // Below we use clone nav to avoid having to find the "sent" tag
            // again
            if (prefs.readWord || prefs.readLemma || prefs.readOffsets || prefs.readPos || prefs.readNer || prefs.readNormNer) {
                List<AgigaToken> agigaTokens = parseTokens(vn.cloneNav());
                agigaSent.setTokens(agigaTokens);
            }
            if (prefs.readParse) {
                String parseText = parseParse(vn.cloneNav());
                agigaSent.setParseText(parseText);
            }
            if (prefs.readBasicDeps) {
                List<AgigaTypedDependency> basicDeps = parseDependencies(vn.cloneNav(), DependencyForm.BASIC_DEPS);
                agigaSent.setBasicDeps(basicDeps);
            }
            if (prefs.readColDeps) {
                List<AgigaTypedDependency> colDeps = parseDependencies(vn.cloneNav(), DependencyForm.COL_DEPS);
                agigaSent.setColDeps(colDeps);
            }
            if (prefs.readColCcprocDeps) {
                List<AgigaTypedDependency> colCcprocDeps = parseDependencies(vn.cloneNav(), DependencyForm.COL_CCPROC_DEPS);
                agigaSent.setColCcprocDeps(colCcprocDeps);
            }

            // Note that if we instead wanted to find the sent element using
            // XPath evaluation, we could do the following.
            //            basicDepRelAp.selectXPath(String.format("//sent[@id=%d]", sentId));
            //            require(basicDepRelAp.evalXPath() != -1);
            //
            // We could alternatively move up through the tree to find it, after calling parseTokens().
            //            // Move back up to the <sent> tag
            //            while (!vn.matchElement(AgigaConstants.SENTENCE)) {
            //                vn.toElement(VTDNav.PARENT);
            //            }
               
   
            numSentences++;
           
            nextIdx  = sentAp.evalXPath();
           
            return agigaSent;
        } catch(VTDException e) {
            throw new RuntimeException(e);
        }
    }

    protected StanfordAgigaSentence getSentenceInstance(AgigaPrefs prefs) {
        return new StanfordAgigaSentence(prefs);
    }

    @Override
    public void remove() {
        throw new RuntimeException("not implemented");       
    }

    public int getNumSentences() {
        return numSentences;
    }
   
    /**
     * Assumes the position of vn is at a AgigaConstants.SENTENCE tag
     * @param tree
     * @return
     */
    private List<AgigaToken> parseTokens(VTDNav vn) throws PilotException, NavException {
        require (vn.matchElement(AgigaConstants.SENTENCE));

        int tokId = -1;
       
        List<AgigaToken> agigaTokens = new ArrayList<AgigaToken>();
       
        // Loop through each token
        AutoPilot tokAp = new AutoPilot(vn);
        tokAp.selectElement(AgigaConstants.TOKEN);
        while (tokAp.iterate()) {
            // Just double check that the tokens are in order
            if (tokId < 0) {
                tokId = vn.parseInt(vn.getAttrVal(AgigaConstants.TOKEN_ID));
            }
            require (vn.parseInt(vn.getAttrVal(AgigaConstants.TOKEN_ID)) == tokId);

            AgigaToken agigaToken = new AgigaToken();
            // Subtract one, since the tokens are one-indexed in the XML but
            // zero-indexed in this API
            agigaToken.setTokIdx(tokId - 1);
           
            // Read the word, lemma, token offsets, POS tag, NER tag, and
            // normalized NER
           
            // We have to move to the word (first child) so that the next
            // sibling moves succeed.
            require(vn.toElement(VTDNav.FC, AgigaConstants.WORD));
            if (prefs.readWord) {
                String word = vn.toString(vn.getText());
                agigaToken.setWord(word);
            }

            if (prefs.readLemma) {
                require(vn.toElement(VTDNav.NS, AgigaConstants.LEMMA));
                String lemma = vn.toString(vn.getText());
                agigaToken.setLemma(lemma);
            }

            if (prefs.readOffsets) {
                require(vn.toElement(VTDNav.NS, AgigaConstants.CHARACTER_OFFSET_BEGIN));
                int charOffBegin;
                if (prefs.strict) {
                    charOffBegin = Integer.parseInt(vn.toString(vn.getText()));
                } else {
                    // Remove unexpected whitespace surrounding the integer.
                    charOffBegin = Integer.parseInt(vn.toString(vn.getText()).trim());
                }
                agigaToken.setCharOffBegin(charOffBegin);
                require(vn.toElement(VTDNav.NS, AgigaConstants.CHARACTER_OFFSET_END));
                int charOffEnd;
                if (prefs.strict) {
                    charOffEnd = Integer.parseInt(vn.toString(vn.getText()));
                } else {
                    // Remove unexpected whitespace surrounding the integer.
                    charOffEnd = Integer.parseInt(vn.toString(vn.getText()).trim());
                }
                agigaToken.setCharOffEnd(charOffEnd);
            }

            if (prefs.readPos) {
                require(vn.toElement(VTDNav.NS, AgigaConstants.POS));
                String posTag = vn.toString(vn.getText());
                agigaToken.setPosTag(posTag);
            }

            if (prefs.readNer) {
                if (prefs.strict) {
                    require(vn.toElement(VTDNav.NS, AgigaConstants.NER));
                    String nerTag = vn.toString(vn.getText());
                    agigaToken.setNerTag(nerTag);
                } else {
                    String nerTag = null;
                    if (vn.toElement(VTDNav.NS, AgigaConstants.NER)) {
                        nerTag = vn.toString(vn.getText());
                    }
                    agigaToken.setNerTag(nerTag);
                }
            }
            if (prefs.readNormNer) {
                // NormNER only applies to some tokens
                String normNer = null;
                if (vn.toElement(VTDNav.NS, AgigaConstants.NORM_NER)) {
                    normNer = vn.toString(vn.getText());
                }
                agigaToken.setNormNer(normNer);
            }

            agigaTokens.add(agigaToken);
           
            tokId++;
        }
       
        return agigaTokens;
    }
   
    /**
     * Assumes the position of vn is at a AgigaConstants.SENTENCE tag
     * @return
     */
    private String parseParse(VTDNav vn) throws NavException,
            PilotException {
        require (vn.matchElement(AgigaConstants.SENTENCE));

        // Move to the <parse> tag
        require (vn.toElement(VTDNav.FC, AgigaConstants.PARSE));
        String parseText = vn.toString(vn.getText());
       
        return parseText;
    }

    /**
     * Assumes the position of vn is at a "sent" tag
     * @return
     */
    private List<AgigaTypedDependency> parseDependencies(VTDNav vn, DependencyForm form) throws NavException,
            PilotException {
        require (vn.matchElement(AgigaConstants.SENTENCE));

        // Move to the <basic-deps> tag
        require (vn.toElement(VTDNav.FC, form.getXmlTag()));

        List<AgigaTypedDependency> agigaDeps = new ArrayList<AgigaTypedDependency>();
       
        // Loop through the dep tags
        AutoPilot basicDepRelAp = new AutoPilot(vn);
        basicDepRelAp.selectElement(AgigaConstants.DEP);
        while (basicDepRelAp.iterate()) {
            // Read the type, governor, and dependent
            String type = vn.toString(vn.getAttrVal(AgigaConstants.DEP_TYPE));
            require (vn.toElement(VTDNav.FC, AgigaConstants.GOVERNOR));
            int governorId = vn.parseInt(vn.getText());
            require (vn.toElement(VTDNav.NS, AgigaConstants.DEPENDENT));
            int dependentId = vn.parseInt(vn.getText());

            log.finer(String.format("\tdep type=%s\t%d-->%d", type, governorId, dependentId));

            // Subtract one, since the tokens are one-indexed in the XML but
            // zero-indexed in this API
            AgigaTypedDependency agigaDep = new AgigaTypedDependency(type, governorId - 1, dependentId - 1);
            agigaDeps.add(agigaDep);
        }
        return agigaDeps;
    }

    /**
     * This method will print out the XML from the current position of
     * <code>vn</code>. Very useful for debugging.
     */
    public static String getElementFragmentAsString(byte[] b, VTDNav vn) throws NavException {
        long l = vn.getElementFragment();
        int offset = (int) l;
        int len = (int) (l >> 32);
        String elementFragment = new String(Arrays.copyOfRange(b, offset, offset + len));
        return elementFragment;
    }
   
    public static void main(String args[]) throws Exception {
        // Must be Level.FINER for debug logging
        Util.initializeLogging(Level.FINE);

        // Parse each file provided on the command line.
        for (int i = 0; i < args.length; i++) {
            AgigaSentenceReader reader = new AgigaSentenceReader(args[i], new AgigaPrefs());
            log.fine("Parsing XML");
            for (AgigaSentence agigaSent : reader) {
                // Do nothing
            }
            log.info("Number of sentences: " + reader.getNumSentences());
        }
    }
   
    // TODO: any call to require should also have a message explaining what condition wasn't met
    public static void require(boolean truth) {
        if (!truth) {
            throw new IllegalStateException();
        }
    }
   
    public static void require(boolean truth, String message) {
        if (!truth) {
            throw new IllegalStateException(message);
        }
    }

}
TOP

Related Classes of edu.jhu.agiga.AgigaSentenceReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.