Package edu.umd.hooka.corpora

Examples of edu.umd.hooka.corpora.Chunk


    }else if (qName.equalsIgnoreCase("s")) {
      String s = tempVal.toString().trim();
      if (s.length() == 0) {
        System.err.println(pchunk.getName() + ": Empty segment for lang=" + lang);
      } else {
        Chunk c = new Chunk(tempVal.toString().trim());
        pchunk.addChunk(lang, c);
        chunkCount++;
        tempVal = null;
      }
    }else if (qName.equalsIgnoreCase("wordalignment")) {
      Chunk sc = pchunk.getChunk(langpair.getSource());
      if (sc == null)
        throw new RuntimeException("PChunk doesn't contain data for lang: " + langpair.getSource() + ".  Note: manual word alignment data must follow the chunk data.");
      Chunk tc = pchunk.getChunk(langpair.getTarget());
      if (tc == null)
        throw new RuntimeException("PChunk doesn't contain data for lang: " + langpair.getTarget() + ".  Note: manual word alignment data must follow the chunk data.");
      ReferenceAlignment r = new ReferenceAlignment(
          sc.getLength(),
          tc.getLength());
      r.addAlignmentPointsPharaoh(tempVal.toString().trim());
      pchunk.addReferenceAlignment(langpair, r);
      refAlignCount++;
      tempVal = null;
    }else if (qName.equalsIgnoreCase("pdoc")) {
View Full Code Here


        if (readAlignments) {
          a = r1_2.readLine();
          if (a==null)
            System.err.println(afile1_2 + " has fewer lines than corpora files -- dropping alignments for remaining sentences");
        }
        Chunk ec = new Chunk(e);
        Chunk fc = new Chunk(f);
        String name = label + lc;
        ParallelChunk p = new ParallelChunk();
        p.setName(name);
        p.addChunk(de, fc);
        p.addChunk(en, ec);
        if (a != null) { 
          ReferenceAlignment ra = new ReferenceAlignment(ec.getLength(), fc.getLength());
          try {
            ra.addAlignmentPointsPharaoh(a);
            p.addReferenceAlignment(ende, ra);
          } catch (RuntimeException re) {
            System.err.println("Couldn't set alignment points for sentence # " + lc)
            System.err.println(" " + en +": len=" + ec.getLength() + " words=" + ec);
            System.err.println(" " + de +": len=" + fc.getLength() + " words=" + fc);
            System.err.println(" " + ende + ": " + a);
          }
        } 
        w.write(p.toXML());
      }
View Full Code Here

          Random r = new Random(1);
          BufferedWriter br = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("/tmp/bar.xml"), "UTF8"));
          public void handlePChunk(ParallelChunk p) {
            Language fr = Language.languageForISO639_1("fr");
            Language en = Language.languageForISO639_1("en");
            Chunk f = p.getChunk(fr);
            if (f == null) return;
            Chunk e = p.getChunk(en);
            if (e == null) return;
            float elen = e.getLength();
            float flen = f.getLength();
            if (elen > 40) return;
            if (flen > 40) return;
            float ra = elen / flen;
            if (ra > 1.3) return;
            try {
              if (r.nextDouble() > 0.15) return;
              br.write(p.toXML()); } catch (Exception e1) { e1.printStackTrace(); }
          }
          @Override
          public void finalize() {
            try { br.close(); } catch (Exception e){}
          }
      });
      } catch (Exception e) { e.printStackTrace(); }
    if (true)
      convertToXMLDocument(
          "koen_jhu_",
        "/Users/redpony/bitexts/kkn-eng-alignments/kkn.utf8",
        "/Users/redpony/bitexts/kkn-eng-alignments/eng",
View Full Code Here

      ParallelChunk c = pcr.parseString(key.toString());
      ok.set(c.idString());
     
      //Chunk is an array of tokens in the sentence, without any special tokenization (just separated by spaces)
      Chunk fc = c.getChunk(src);
      Chunk ec = c.getChunk(tgt);
      if (fc == null || ec == null) {
        reporter.incrCounter(BitextCompilerCounters.WRONG_LANGUAGE, 1);
        return;
      }
      if (fc.getLength() > 200) {
        reporter.incrCounter(BitextCompilerCounters.SRC_TOO_LONG, 1);
        return;
      }
      if (ec.getLength() > 200) {
        reporter.incrCounter(BitextCompilerCounters.TGT_TOO_LONG, 1);
        return;
      }

      //ec,fc: English/French sentence represented as sequence of words
      //vocE,vocF: vocabularies for english and french, of type VocabularyWritable

      //ee,fe: integer representation of words in sentences ec and fc
      sLogger.debug("Target sentence:");
      int[] ee = convertStrings(tawp.preprocessWordsForAlignment(ec.getWords()), vocE);
      sLogger.debug("Source sentence:");
      int[] fe = convertStrings(sawp.preprocessWordsForAlignment(fc.getWords()), vocF);

      //e,f: phrase from whole sentence
      Phrase e = new Phrase(ee, 0);
View Full Code Here

      fw.flush();
      fw.close();
    }
    static final int MAX_LENGTH = 99;
    public void handlePChunk(ParallelChunk p) {
      Chunk a = p.getChunk(ar);
      Chunk e = p.getChunk(en);
      if (a == null) return;
      if (e == null) return;
      String[] npa = a.getWords();
      String[] npe = e.getWords();
      if (npa.length > MAX_LENGTH)
        return;
      if (npe.length > MAX_LENGTH)
        return;
      if (npa.length == 0 || npe.length == 0)
View Full Code Here

TOP

Related Classes of edu.umd.hooka.corpora.Chunk

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.