Examples of joshua.corpus.vocab.Vocabulary

joshua.corpus.vocab.Vocabulary
Vocabulary is the class that keeps track of the unique words that occur in a corpus of text for a particular language. It assigns an integer ID to each word, which is useful when creating suffix arrays or similar structures. @author Chris Callison-Burch @since 8 February 2005 @author Lane Schwartz @version $LastChangedDate:2008-07-30 17:15:52 -0400 (Wed, 30 Jul 2008) $

  PrefixTree tree;
  
  @Test(dependsOnMethods = {"prefixTreeNodes","suffixLinks"})
  public void setup() {
    
    vocab = new Vocabulary();
    it = vocab.addTerminal("it");
    persuades = vocab.addTerminal("persuades");
    him = vocab.addTerminal("him");
    and = vocab.addTerminal("and");
    disheartens = vocab.addTerminal("disheartens");

View Full Code Here

    //String alignmentsType = alignmentsType;
  
    int maxCacheSize = 100000;//12566;
    
    int numSourceWords, numSourceSentences;
    Vocabulary sourceVocab = new Vocabulary();
    int[] sourceWordsSentences = Vocabulary.initializeVocabulary(sourceFileName, sourceVocab, true);
    numSourceWords = sourceWordsSentences[0];
    numSourceSentences = sourceWordsSentences[1];
    
    Corpus sourceCorpusArray = SuffixArrayFactory.createCorpusArray(sourceFileName, sourceVocab, numSourceWords, numSourceSentences);
    Suffixes sourceSuffixArray = SuffixArrayFactory.createSuffixArray(sourceCorpusArray, maxCacheSize);
    
    int numTargetWords, numTargetSentences;
    Vocabulary targetVocab = new Vocabulary();
    int[] targetWordsSentences = Vocabulary.initializeVocabulary(targetFileName, targetVocab, true);
    numTargetWords = targetWordsSentences[0];
    numTargetSentences = targetWordsSentences[1];
    
    Corpus targetCorpusArray = SuffixArrayFactory.createCorpusArray(targetFileName, targetVocab, numTargetWords, numTargetSentences);

View Full Code Here

      for (char c2='a'; c2<='z'; c2++) {
        words.add(new String(new char[]{c1,c2}));
      }  
    }
    
    Vocabulary vocab = new Vocabulary(words);
    
    try {
      
      File tempFile = File.createTempFile(BinaryTest.class.getName(), "vocab");
      FileOutputStream outputStream = new FileOutputStream(tempFile);
      ObjectOutput out = new BinaryOut(outputStream, true);
      vocab.writeExternal(out);
      
      ObjectInput in = new BinaryIn<Vocabulary>(tempFile.getAbsolutePath(), Vocabulary.class);
      Object o = in.readObject();
      Assert.assertTrue(o instanceof Vocabulary);
      
      Vocabulary newVocab = (Vocabulary) o;
      
      Assert.assertNotNull(newVocab);
      Assert.assertEquals(newVocab.size(), vocab.size());      
      
      Assert.assertEquals(newVocab, vocab);

View Full Code Here

//    refFile.close();




    // Source language vocabulary
    println("Creating src vocabulary @ " + (new Date()));
    srcVocab = new Vocabulary();
    int[] sourceWordsSentences = Vocabulary.initializeVocabulary(trainSrc_fileName, srcVocab, true);


    int numSourceWords = sourceWordsSentences[0];
    int numSourceSentences = sourceWordsSentences[1];


    // Source language corpus array
    println("Reading src corpus @ " + (new Date()));
    srcCorpusArray = SuffixArrayFactory.createCorpusArray(trainSrc_fileName, srcVocab, numSourceWords, numSourceSentences);


    // Source language suffix array
    println("Creating src SA @ " + (new Date()));
    srcSA = SuffixArrayFactory.createSuffixArray(srcCorpusArray, maxCacheSize);




    // Target language vocabulary
    println("Creating tgt vocabulary @ " + (new Date()));
    tgtVocab = new Vocabulary();
    int[] targetWordsSentences = Vocabulary.initializeVocabulary(trainTgt_fileName, tgtVocab, true);


    int numTargetWords = targetWordsSentences[0];
    int numTargetSentences = targetWordsSentences[1];

View Full Code Here

    Set<String> sourceWords = new HashSet<String>();
    for (String word : corpusString.split("\\s+")) {
      sourceWords.add(word);
    }


    sourceVocab = new Vocabulary(sourceWords);
    


    corpusSentence = new BasicPhrase(corpusString, sourceVocab);
    
    targetCorpusString = "das macht ihn und es beschädigt ihn , es setzt ihn auf und es führt ihn aus .";
    Set<String> targetWords = new HashSet<String>();
    for (String targetWord : targetCorpusString.split("\\s+")) {
      targetWords.add(targetWord);
    }
    
    targetVocab = new Vocabulary(targetWords);
    
    ntVocab = new HashMap<Integer,String>();
    ntVocab.put(-1, "X");
    
    {

View Full Code Here

   * Constructs an empty corpus.
   * <p>
   * NOTE: Primarily needed for Externalizable interface.
   */
  public CorpusArray() {
    super(new Vocabulary());
//    this.symbolTable = new Vocabulary();
    this.sentences = new int[]{};
    this.corpus = new int[]{};
  }

View Full Code Here

    String corpusFileName = args[0];
    String binaryVocabFilename = args[1];
    String binaryCorpusFilename = args[2];
    String charset = (args.length > 3) ? args[3] : "UTF-8";
    
    Vocabulary symbolTable = new Vocabulary();
    int[] lengths = Vocabulary.initializeVocabulary(corpusFileName, symbolTable, true);
    
    CorpusArray corpusArray = SuffixArrayFactory.createCorpusArray(corpusFileName, symbolTable, lengths[0], lengths[1]);
    
    corpusArray.write(binaryCorpusFilename, binaryVocabFilename, charset);

View Full Code Here

  private int[]      words;
  
  
  public BasicPhrase(byte language, String sentence) {
    this.language   = language;
    this.vocabulary = new Vocabulary();
    this.words = splitSentence(sentence, vocabulary);
  }

View Full Code Here

    int lastSentence = Integer.parseInt(argv[3]);
    HashMap<Integer,Integer> chosenSentences = new HashMap<Integer,Integer>();
    for (int i = firstSentence; i < lastSentence; i++) {
      chosenSentences.put(i, i);
    }
    Vocabulary vocab = new Vocabulary();
    DiskHyperGraph dhg = new DiskHyperGraph(vocab, 0, true, null);
    dhg.initRead(itemsFile, rulesFile, chosenSentences);
    JungHyperGraph hg = new JungHyperGraph(dhg.readHyperGraph(), vocab);
    JFrame frame = new JFrame("Joshua Hypergraph");
    frame.getContentPane().add(new HyperGraphViewer(hg, vocab));

View Full Code Here

    int lastSentence = Integer.parseInt(argv[3]);
    HashMap<Integer,Integer> chosenSentences = new HashMap<Integer,Integer>();
    for (int i = firstSentence; i < lastSentence; i++) {
      chosenSentences.put(i, i);
    }
    Vocabulary vocab = new Vocabulary();
    DiskHyperGraph dhg = new DiskHyperGraph(vocab, 0, true, null);
    dhg.initRead(itemsFile, rulesFile, chosenSentences);
    JungHyperGraph hg = new JungHyperGraph(dhg.readHyperGraph(), vocab);
    return;
  }

View Full Code Here

0 1 2 3

TOP

Related Classes of joshua.corpus.vocab.Vocabulary

joshua.aligner.AlignCandidates

joshua.corpus.BasicPhrase

joshua.corpus.CorpusArray

joshua.corpus.CorpusArrayTest

joshua.corpus.lexprob.WriteLexProbs

joshua.corpus.suffix_array.AbstractHierarchicalPhrasesTest

joshua.corpus.suffix_array.BasicPhrase

joshua.corpus.suffix_array.Compile

joshua.corpus.suffix_array.ConvertCorpus

joshua.corpus.suffix_array.FrequentClassesTest

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.