Package joshua.corpus

Examples of joshua.corpus.CorpusArray


      alignmentPrintStream.println(alignmentString);
      alignmentPrintStream.close();
      alignmentFileName = alignmentFile.getAbsolutePath();
    }
   
    CorpusArray sourceCorpusArray =
      SuffixArrayFactory.createCorpusArray(sourceFileName);
    SuffixArray sourceSuffixArray =
      SuffixArrayFactory.createSuffixArray(sourceCorpusArray, SuffixArray.DEFAULT_CACHE_CAPACITY);
   
    CorpusArray targetCorpusArray =
      SuffixArrayFactory.createCorpusArray(targetFileName);
    SuffixArray targetSuffixArray =
      SuffixArrayFactory.createSuffixArray(targetCorpusArray, SuffixArray.DEFAULT_CACHE_CAPACITY);

    Alignments alignmentArray = SuffixArrayFactory.createAlignments(alignmentFileName, sourceSuffixArray, targetSuffixArray);
View Full Code Here


        alignedSourceIndices[i][0] = i;
      }
    }
    Alignments alignments = new AlignmentArray(alignedTargetIndices, alignedSourceIndices, 1);
   
    CorpusArray targetCorpus = new CorpusArray(sentenceF, sentenceStartPositions, vocab);
    SuffixArray targetSuffixes = new SuffixArray(targetCorpus);

    CorpusArray sourceCorpus = new CorpusArray(sentence, sentenceStartPositions, vocab);
    SuffixArray sourceSuffixes = new SuffixArray(sourceCorpus);
    ParallelCorpusGrammarFactory parallelCorpus = new ParallelCorpusGrammarFactory(sourceSuffixes, targetSuffixes, alignments, null, Integer.MAX_VALUE, maxPhraseSpan, maxPhraseLength, maxNonterminals, 2, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost);
   
//    tree = new PrefixTree(vocab, maxPhraseSpan, maxPhraseLength, maxNonterminals);
    tree = new PrefixTree(parallelCorpus);
View Full Code Here

      int[] corpus = new int[corpusSentence.size()];
      for(int i = 0; i < corpusSentence.size(); i++) {
        corpus[i] = corpusSentence.getWordID(i);
      }
     
      CorpusArray corpusArray = new CorpusArray(corpus, sentenceStartPositions, sourceVocab);
      suffixArray = new SuffixArray(corpusArray);
     


      int[] targetSentenceStartPositions = {0};
     
      BasicPhrase targetCorpusSentence = new BasicPhrase(targetCorpusString, targetVocab);
      Assert.assertEquals(targetCorpusSentence.size(), 18);
     
      int[] targetCorpus = new int[targetCorpusSentence.size()];
      for(int i = 0; i < targetCorpusSentence.size(); i++) {
        targetCorpus[i] = targetCorpusSentence.getWordID(i);
      }
     

     

     
      CorpusArray targetCorpusArray = new CorpusArray(targetCorpus, targetSentenceStartPositions, targetVocab);
      targetSuffixArray = new SuffixArray(targetCorpusArray);

     
      int[] lowestAlignedTargetIndex = new int[corpusSentence.size()];
      int[] highestAlignedTargetIndex = new int[corpusSentence.size()];
View Full Code Here

    int[] corpus = new int[exampleSentence.size()];
    for(int i = 0; i < exampleSentence.size(); i++) {
      corpus[i] = exampleSentence.getWordID(i);
    }
   
    CorpusArray corpusArray = new CorpusArray(corpus, sentences, vocab);
   
    if (binaryFileName==null || binaryFileName.trim().length()==0)
      suffixArray = new SuffixArray(corpusArray);
    else
      suffixArray = new MemoryMappedSuffixArray(binaryFileName, corpusArray, MemoryMappedSuffixArray.DEFAULT_CACHE_CAPACITY);
View Full Code Here

   
   
   
    // Construct source language corpus
    if (logger.isLoggable(Level.INFO)) logger.info("Constructing corpus array from file " + sourceCorpusFileName);
    CorpusArray sourceCorpusArray = SuffixArrayFactory.createCorpusArray(sourceCorpusFileName, symbolTable, sourceLengths[0], sourceLengths[1]);
   
    // Write source corpus to disk
    {
      String binarySourceCorpusFilename = outputDirName + File.separator + "source.corpus";
      if (logger.isLoggable(Level.INFO)) logger.info("Writing binary source corpus to disk at " + binarySourceCorpusFilename);
     
        BinaryOut corpusOut = new BinaryOut(new FileOutputStream(binarySourceCorpusFilename), false);
        sourceCorpusArray.writeExternal(corpusOut)
        corpusOut.flush();
       
      out.println("Source language corpus: " + binarySourceCorpusFilename);
    }
   
    // Construct target language corpus
    if (logger.isLoggable(Level.INFO)) logger.info("Constructing corpus array from file " + targetCorpusFileName);
    CorpusArray targetCorpusArray = SuffixArrayFactory.createCorpusArray(targetCorpusFileName, symbolTable, targetLengths[0], targetLengths[1]);
   
   
    // Write target language corpus to disk
    {
      String binaryTargetCorpusFilename = outputDirName + File.separator + "target.corpus";
      if (logger.isLoggable(Level.INFO)) logger.info("Writing binary target corpus to disk at " + binaryTargetCorpusFilename);
     
        BinaryOut corpusOut = new BinaryOut(new FileOutputStream(binaryTargetCorpusFilename), false);
        targetCorpusArray.writeExternal(corpusOut)
        corpusOut.flush();
       
        out.println("Target language corpus: " + binaryTargetCorpusFilename);
    }
   
View Full Code Here

   
    // Read the provided corpus
    logger.info("Reading provided corpus");
    Vocabulary oldSymbolTable = new Vocabulary();
    int[] lengths = Vocabulary.initializeVocabulary(corpusFileName, oldSymbolTable, true);
    CorpusArray corpusArray = SuffixArrayFactory.createCorpusArray(corpusFileName, oldSymbolTable, lengths[0], lengths[1]);
   
    // Change the internal integer-string mappings
    // of the corpus to use those provided by the given symbol table.
    logger.info("Converting corpus to use new symbol mappings");
    corpusArray.setSymbolTable(symbolTable);
   
    // Write the corpus to disk in binary format
    logger.info("Writing corpus to disk in binary format, using new symbol mappings");
    corpusArray.write(binaryCorpusFilename, binaryVocabFilename, charset);
   
  }
View Full Code Here

//        wordCounter++;
//      }
//      if(SHOW_PROGRESS && sentenceCounter % 10000==0) logger.info(""+numWords);
    }

    return new CorpusArray(corpus, sentenceIndexes, vocab);
  }
View Full Code Here

   
    String sourceFilename = "data/tiny.fr";
    String targetFilename = "data/tiny.en";
    String alignmentsFilename = "data/tiny.fr-en.alignment";
   
    CorpusArray sourceArray =
      SuffixArrayFactory.createCorpusArray(sourceFilename);
    sourceCorpus =
      SuffixArrayFactory.createSuffixArray(sourceArray, SuffixArray.DEFAULT_CACHE_CAPACITY);//SuffixArrayFactory.loadSuffixArray(sourceLang, corpusName, directory);
   
    CorpusArray targetArray =
      SuffixArrayFactory.createCorpusArray(targetFilename);
    targetCorpus =
      SuffixArrayFactory.createSuffixArray(targetArray, SuffixArray.DEFAULT_CACHE_CAPACITY);//SuffixArrayFactory.loadSuffixArray(targetLang, corpusName, directory);
   
    alignmentArray = (AlignmentArray) SuffixArrayFactory.createAlignments(alignmentsFilename, sourceCorpus, targetCorpus); //SuffixArrayFactory.loadAlignmentArray(sourceLang, targetLang, corpusName, directory);
View Full Code Here

    try {
     
      // FIX: can't use createVocabulary(String) because we set numWords and numSentences
      Vocabulary vocab = new Vocabulary();
      Vocabulary.initializeVocabulary(filename, vocab, true);
      CorpusArray corpus = SuffixArrayFactory.createCorpusArray(filename, vocab, numWords, numSentences);
     
      corpus.write(filename+".corpus", filename+".vocab", "UTF-8");
     
      MemoryMappedCorpusArray mmCorpus = new MemoryMappedCorpusArray(filename+".corpus", filename+".vocab");
     
      Assert.assertEquals(mmCorpus.size(), corpus.size());
      Assert.assertEquals(mmCorpus.getNumSentences(), corpus.getNumSentences());
     
      // For each word in the corpus,
      for (int i=0; i<corpus.size(); i++) {
       
        // Verify that the memory-mapped corpus and the in-memory corpus have the same value
        Assert.assertEquals(mmCorpus.getWordID(i), corpus.getWordID(i));
      }
     
     
      // For each sentence in the corpus
      for (int i=0; i<corpus.sentences.length; i++) {
       
        // Verify that the sentence start position in the memory-mapped corpus and the in-memory corpus have the same value
        Assert.assertEquals(mmCorpus.getSentencePosition(i), corpus.getSentencePosition(i));
       
        // Verify that the sentence end position in the memory-mapped corpus and the in-memory corpus have the same value
        Assert.assertEquals(mmCorpus.getSentenceEndPosition(i), corpus.getSentenceEndPosition(i));
       
        // Verify that the phrase corresponding to this sentence is the same
        Phrase sentence = corpus.getSentence(i);
        Phrase mmSentence = mmCorpus.getSentence(i);
        Assert.assertNotNull(sentence);
        Assert.assertNotNull(mmSentence);
        Assert.assertEquals(mmSentence, sentence);
      }
View Full Code Here

TOP

Related Classes of joshua.corpus.CorpusArray

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.