Package edu.umd.hooka.corpora

Examples of edu.umd.hooka.corpora.ParallelChunk


  //Event Handlers
  public void startElement(String uri, String localName, String qName,
    org.xml.sax.Attributes attributes) throws SAXException {
    //reset
    if(qName.equalsIgnoreCase("pchunk")) {
      pchunk = new ParallelChunk();
      pchunk.setName(attributes.getValue("name"));
    } else if (qName.equalsIgnoreCase("s")) {
      lang = Language.languageForISO639_1(attributes.getValue("lang"));
      tempVal = new StringBuffer();
    } else if (qName.equalsIgnoreCase("wordalignment")) {
View Full Code Here


            System.err.println(afile1_2 + " has fewer lines than corpora files -- dropping alignments for remaining sentences");
        }
        Chunk ec = new Chunk(e);
        Chunk fc = new Chunk(f);
        String name = label + lc;
        ParallelChunk p = new ParallelChunk();
        p.setName(name);
        p.addChunk(de, fc);
        p.addChunk(en, ec);
        if (a != null) { 
          ReferenceAlignment ra = new ReferenceAlignment(ec.getLength(), fc.getLength());
          try {
            ra.addAlignmentPointsPharaoh(a);
            p.addReferenceAlignment(ende, ra);
          } catch (RuntimeException re) {
            System.err.println("Couldn't set alignment points for sentence # " + lc)
            System.err.println(" " + en +": len=" + ec.getLength() + " words=" + ec);
            System.err.println(" " + de +": len=" + fc.getLength() + " words=" + fc);
            System.err.println(" " + ende + ": " + a);
          }
        } 
        w.write(p.toXML());
      }
      String t = r2.readLine();
      if (t != null)
        System.err.println("WARNING: " + ifile2 + " has more lines than " + ifile1);
      w.write("</pdoc>");
View Full Code Here

        Reporter reporter) throws IOException {

      //key: a single sentence in both languages and alignment
      //ignore value. each key is parallel sentence and its alignment, in xml format

      ParallelChunk c = pcr.parseString(key.toString());
      ok.set(c.idString());
     
      //Chunk is an array of tokens in the sentence, without any special tokenization (just separated by spaces)
      Chunk fc = c.getChunk(src);
      Chunk ec = c.getChunk(tgt);
      if (fc == null || ec == null) {
        reporter.incrCounter(BitextCompilerCounters.WRONG_LANGUAGE, 1);
        return;
      }
      if (fc.getLength() > 200) {
        reporter.incrCounter(BitextCompilerCounters.SRC_TOO_LONG, 1);
        return;
      }
      if (ec.getLength() > 200) {
        reporter.incrCounter(BitextCompilerCounters.TGT_TOO_LONG, 1);
        return;
      }

      //ec,fc: English/French sentence represented as sequence of words
      //vocE,vocF: vocabularies for english and french, of type VocabularyWritable

      //ee,fe: integer representation of words in sentences ec and fc
      sLogger.debug("Target sentence:");
      int[] ee = convertStrings(tawp.preprocessWordsForAlignment(ec.getWords()), vocE);
      sLogger.debug("Source sentence:");
      int[] fe = convertStrings(sawp.preprocessWordsForAlignment(fc.getWords()), vocF);

      //e,f: phrase from whole sentence
      Phrase e = new Phrase(ee, 0);
      Phrase f = new Phrase(fe, 1);

      edu.umd.hooka.PhrasePair b = new PhrasePair(f,e);
      ReferenceAlignment ra = c.getReferenceAlignment(lp);
      if (ra != null) {
        b.setAlignment(ra);
      }
      reporter.incrCounter(BitextCompilerCounters.EN_WORDS, e.getWords().length);
      reporter.incrCounter(BitextCompilerCounters.FR_WORDS, f.getWords().length);
View Full Code Here

TOP

Related Classes of edu.umd.hooka.corpora.ParallelChunk

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.