Examples of joshua.corpus.CorpusArray

joshua.corpus.CorpusArray
A compact int[]-based representation of a corpus. The class keeps all of the words, in their int form, in a single array. It also maintains a separate int[] array that lists the start index of each sentence in the corpus. This second array makes it possible to quickly determine, using a binary search, which sentence any given position in the corpus belongs to. @author Josh Schroeder @since 29 Dec 2004 @version $LastChangedDate:2008-07-30 17:15:52 -0400 (Wed, 30 Jul 2008) $

      alignmentPrintStream.println(alignmentString);
      alignmentPrintStream.close();
      alignmentFileName = alignmentFile.getAbsolutePath();
    }
    
    CorpusArray sourceCorpusArray =
      SuffixArrayFactory.createCorpusArray(sourceFileName);
    SuffixArray sourceSuffixArray = 
      SuffixArrayFactory.createSuffixArray(sourceCorpusArray, SuffixArray.DEFAULT_CACHE_CAPACITY);
    
    CorpusArray targetCorpusArray =
      SuffixArrayFactory.createCorpusArray(targetFileName);
    SuffixArray targetSuffixArray = 
      SuffixArrayFactory.createSuffixArray(targetCorpusArray, SuffixArray.DEFAULT_CACHE_CAPACITY);


    Alignments alignmentArray = SuffixArrayFactory.createAlignments(alignmentFileName, sourceSuffixArray, targetSuffixArray);

View Full Code Here

        alignedSourceIndices[i][0] = i;
      }
    }
    Alignments alignments = new AlignmentArray(alignedTargetIndices, alignedSourceIndices, 1);
    
    CorpusArray targetCorpus = new CorpusArray(sentenceF, sentenceStartPositions, vocab);
    SuffixArray targetSuffixes = new SuffixArray(targetCorpus);


    CorpusArray sourceCorpus = new CorpusArray(sentence, sentenceStartPositions, vocab);
    SuffixArray sourceSuffixes = new SuffixArray(sourceCorpus);
    ParallelCorpusGrammarFactory parallelCorpus = new ParallelCorpusGrammarFactory(sourceSuffixes, targetSuffixes, alignments, null, Integer.MAX_VALUE, maxPhraseSpan, maxPhraseLength, maxNonterminals, 2, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost);
    
//    tree = new PrefixTree(vocab, maxPhraseSpan, maxPhraseLength, maxNonterminals);
    tree = new PrefixTree(parallelCorpus);

View Full Code Here

      int[] corpus = new int[corpusSentence.size()];
      for(int i = 0; i < corpusSentence.size(); i++) {
        corpus[i] = corpusSentence.getWordID(i);
      }
      
      CorpusArray corpusArray = new CorpusArray(corpus, sentenceStartPositions, sourceVocab);
      suffixArray = new SuffixArray(corpusArray);
      




      int[] targetSentenceStartPositions = {0};
      
      BasicPhrase targetCorpusSentence = new BasicPhrase(targetCorpusString, targetVocab);
      Assert.assertEquals(targetCorpusSentence.size(), 18);
      
      int[] targetCorpus = new int[targetCorpusSentence.size()];
      for(int i = 0; i < targetCorpusSentence.size(); i++) {
        targetCorpus[i] = targetCorpusSentence.getWordID(i);
      }
      


      


      
      CorpusArray targetCorpusArray = new CorpusArray(targetCorpus, targetSentenceStartPositions, targetVocab);
      targetSuffixArray = new SuffixArray(targetCorpusArray);


      
      int[] lowestAlignedTargetIndex = new int[corpusSentence.size()];
      int[] highestAlignedTargetIndex = new int[corpusSentence.size()];

View Full Code Here

    int[] corpus = new int[exampleSentence.size()];
    for(int i = 0; i < exampleSentence.size(); i++) {
      corpus[i] = exampleSentence.getWordID(i);
    }
    
    CorpusArray corpusArray = new CorpusArray(corpus, sentences, vocab);
    
    if (binaryFileName==null || binaryFileName.trim().length()==0)
      suffixArray = new SuffixArray(corpusArray);
    else
      suffixArray = new MemoryMappedSuffixArray(binaryFileName, corpusArray, MemoryMappedSuffixArray.DEFAULT_CACHE_CAPACITY);

View Full Code Here

    
    
    
    // Construct source language corpus
    if (logger.isLoggable(Level.INFO)) logger.info("Constructing corpus array from file " + sourceCorpusFileName);
    CorpusArray sourceCorpusArray = SuffixArrayFactory.createCorpusArray(sourceCorpusFileName, symbolTable, sourceLengths[0], sourceLengths[1]);
    
    // Write source corpus to disk
    {
      String binarySourceCorpusFilename = outputDirName + File.separator + "source.corpus";
      if (logger.isLoggable(Level.INFO)) logger.info("Writing binary source corpus to disk at " + binarySourceCorpusFilename);
      
        BinaryOut corpusOut = new BinaryOut(new FileOutputStream(binarySourceCorpusFilename), false);
        sourceCorpusArray.writeExternal(corpusOut);  
        corpusOut.flush();
        
      out.println("Source language corpus: " + binarySourceCorpusFilename);
    }
    
    // Construct target language corpus
    if (logger.isLoggable(Level.INFO)) logger.info("Constructing corpus array from file " + targetCorpusFileName);
    CorpusArray targetCorpusArray = SuffixArrayFactory.createCorpusArray(targetCorpusFileName, symbolTable, targetLengths[0], targetLengths[1]);
    
    
    // Write target language corpus to disk
    {
      String binaryTargetCorpusFilename = outputDirName + File.separator + "target.corpus";
      if (logger.isLoggable(Level.INFO)) logger.info("Writing binary target corpus to disk at " + binaryTargetCorpusFilename);
      
        BinaryOut corpusOut = new BinaryOut(new FileOutputStream(binaryTargetCorpusFilename), false);
        targetCorpusArray.writeExternal(corpusOut);  
        corpusOut.flush();
        
        out.println("Target language corpus: " + binaryTargetCorpusFilename);
    }

View Full Code Here

    
    // Read the provided corpus
    logger.info("Reading provided corpus");
    Vocabulary oldSymbolTable = new Vocabulary();
    int[] lengths = Vocabulary.initializeVocabulary(corpusFileName, oldSymbolTable, true);
    CorpusArray corpusArray = SuffixArrayFactory.createCorpusArray(corpusFileName, oldSymbolTable, lengths[0], lengths[1]);
    
    // Change the internal integer-string mappings
    // of the corpus to use those provided by the given symbol table.
    logger.info("Converting corpus to use new symbol mappings");
    corpusArray.setSymbolTable(symbolTable);
    
    // Write the corpus to disk in binary format
    logger.info("Writing corpus to disk in binary format, using new symbol mappings");
    corpusArray.write(binaryCorpusFilename, binaryVocabFilename, charset);
    
  }

View Full Code Here

//        wordCounter++;
//      }
//      if(SHOW_PROGRESS && sentenceCounter % 10000==0) logger.info(""+numWords);
    }


    return new CorpusArray(corpus, sentenceIndexes, vocab);
  }

View Full Code Here

    
    String sourceFilename = "data/tiny.fr";
    String targetFilename = "data/tiny.en";
    String alignmentsFilename = "data/tiny.fr-en.alignment";
    
    CorpusArray sourceArray =
      SuffixArrayFactory.createCorpusArray(sourceFilename);
    sourceCorpus =
      SuffixArrayFactory.createSuffixArray(sourceArray, SuffixArray.DEFAULT_CACHE_CAPACITY);//SuffixArrayFactory.loadSuffixArray(sourceLang, corpusName, directory);
    
    CorpusArray targetArray =
      SuffixArrayFactory.createCorpusArray(targetFilename);
    targetCorpus =
      SuffixArrayFactory.createSuffixArray(targetArray, SuffixArray.DEFAULT_CACHE_CAPACITY);//SuffixArrayFactory.loadSuffixArray(targetLang, corpusName, directory);
    
    alignmentArray = (AlignmentArray) SuffixArrayFactory.createAlignments(alignmentsFilename, sourceCorpus, targetCorpus); //SuffixArrayFactory.loadAlignmentArray(sourceLang, targetLang, corpusName, directory);

View Full Code Here

    try {
      
      // FIX: can't use createVocabulary(String) because we set numWords and numSentences
      Vocabulary vocab = new Vocabulary();
      Vocabulary.initializeVocabulary(filename, vocab, true);
      CorpusArray corpus = SuffixArrayFactory.createCorpusArray(filename, vocab, numWords, numSentences);
      
      corpus.write(filename+".corpus", filename+".vocab", "UTF-8");
      
      MemoryMappedCorpusArray mmCorpus = new MemoryMappedCorpusArray(filename+".corpus", filename+".vocab");
      
      Assert.assertEquals(mmCorpus.size(), corpus.size());
      Assert.assertEquals(mmCorpus.getNumSentences(), corpus.getNumSentences());
      
      // For each word in the corpus,
      for (int i=0; i<corpus.size(); i++) {
        
        // Verify that the memory-mapped corpus and the in-memory corpus have the same value
        Assert.assertEquals(mmCorpus.getWordID(i), corpus.getWordID(i));
      }
      
      
      // For each sentence in the corpus
      for (int i=0; i<corpus.sentences.length; i++) {
        
        // Verify that the sentence start position in the memory-mapped corpus and the in-memory corpus have the same value
        Assert.assertEquals(mmCorpus.getSentencePosition(i), corpus.getSentencePosition(i));
        
        // Verify that the sentence end position in the memory-mapped corpus and the in-memory corpus have the same value
        Assert.assertEquals(mmCorpus.getSentenceEndPosition(i), corpus.getSentenceEndPosition(i));
        
        // Verify that the phrase corresponding to this sentence is the same
        Phrase sentence = corpus.getSentence(i);
        Phrase mmSentence = mmCorpus.getSentence(i);
        Assert.assertNotNull(sentence);
        Assert.assertNotNull(mmSentence);
        Assert.assertEquals(mmSentence, sentence);
      }

View Full Code Here

TOP

Related Classes of joshua.corpus.CorpusArray

joshua.corpus.alignment.AlignmentArrayTest

joshua.corpus.CorpusArrayTest

joshua.corpus.lexprob.SampledLexProbs

joshua.corpus.suffix_array.Compile

joshua.corpus.suffix_array.ConvertCorpus

joshua.corpus.suffix_array.SuffixArrayFactory

joshua.corpus.suffix_array.SuffixArrayTest

joshua.corpus.vocab.SymbolTable

joshua.corpus.vocab.Vocabulary

joshua.prefix_tree.PrefixTreeAdvancedTest

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.