Examples of edu.umd.hooka.corpora.ParallelChunk

edu.umd.hooka.corpora.ParallelChunk

  //Event Handlers
  public void startElement(String uri, String localName, String qName,
    org.xml.sax.Attributes attributes) throws SAXException {
    //reset
    if(qName.equalsIgnoreCase("pchunk")) {
      pchunk = new ParallelChunk();
      pchunk.setName(attributes.getValue("name"));
    } else if (qName.equalsIgnoreCase("s")) {
      lang = Language.languageForISO639_1(attributes.getValue("lang"));
      tempVal = new StringBuffer();
    } else if (qName.equalsIgnoreCase("wordalignment")) {

View Full Code Here

            System.err.println(afile1_2 + " has fewer lines than corpora files -- dropping alignments for remaining sentences");
        }
        Chunk ec = new Chunk(e);
        Chunk fc = new Chunk(f);
        String name = label + lc;
        ParallelChunk p = new ParallelChunk();
        p.setName(name);
        p.addChunk(de, fc);
        p.addChunk(en, ec);
        if (a != null) {  
          ReferenceAlignment ra = new ReferenceAlignment(ec.getLength(), fc.getLength());
          try {
            ra.addAlignmentPointsPharaoh(a); 
            p.addReferenceAlignment(ende, ra);
          } catch (RuntimeException re) {
            System.err.println("Couldn't set alignment points for sentence # " + lc);  
            System.err.println(" " + en +": len=" + ec.getLength() + " words=" + ec);
            System.err.println(" " + de +": len=" + fc.getLength() + " words=" + fc);
            System.err.println(" " + ende + ": " + a);
          }
        }  
        w.write(p.toXML());
      }
      String t = r2.readLine();
      if (t != null)
        System.err.println("WARNING: " + ifile2 + " has more lines than " + ifile1);
      w.write("</pdoc>");

View Full Code Here

        Reporter reporter) throws IOException {


      //key: a single parallel sentence (both languages) together with its alignment, in XML format
      //value: ignored


      ParallelChunk c = pcr.parseString(key.toString());
      ok.set(c.idString());
      
      //Chunk is an array of tokens in the sentence, without any special tokenization (just separated by spaces)
      Chunk fc = c.getChunk(src);
      Chunk ec = c.getChunk(tgt);
      if (fc == null || ec == null) {
        reporter.incrCounter(BitextCompilerCounters.WRONG_LANGUAGE, 1);
        return;
      }
      if (fc.getLength() > 200) {
        reporter.incrCounter(BitextCompilerCounters.SRC_TOO_LONG, 1);
        return;
      }
      if (ec.getLength() > 200) {
        reporter.incrCounter(BitextCompilerCounters.TGT_TOO_LONG, 1);
        return;
      }


      //ec,fc: English/French sentence represented as sequence of words
      //vocE,vocF: vocabularies for english and french, of type VocabularyWritable


      //ee,fe: integer representation of words in sentences ec and fc
      sLogger.debug("Target sentence:");
      int[] ee = convertStrings(tawp.preprocessWordsForAlignment(ec.getWords()), vocE);
      sLogger.debug("Source sentence:");
      int[] fe = convertStrings(sawp.preprocessWordsForAlignment(fc.getWords()), vocF);


      //e,f: phrase from whole sentence
      Phrase e = new Phrase(ee, 0);
      Phrase f = new Phrase(fe, 1);


      edu.umd.hooka.PhrasePair b = new PhrasePair(f,e);
      ReferenceAlignment ra = c.getReferenceAlignment(lp);
      if (ra != null) {
        b.setAlignment(ra);
      }
      reporter.incrCounter(BitextCompilerCounters.EN_WORDS, e.getWords().length);
      reporter.incrCounter(BitextCompilerCounters.FR_WORDS, f.getWords().length);

View Full Code Here

TOP

Related Classes of edu.umd.hooka.corpora.ParallelChunk

edu.umd.hooka.alignment.aer.ReferenceAlignment

edu.umd.hooka.corpora.ParallelCorpusReader

edu.umd.hooka.CorpusVocabNormalizerAndNumberizer$BitextCompilerMapper

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.