Examples of Vocabulary


Examples of joshua.corpus.vocab.Vocabulary

  /**
   * Constructs an empty corpus.
   * <p>
   * NOTE: Primarily needed for Externalizable interface.
   */
  public CorpusArray() {
    super(new Vocabulary());
//    this.symbolTable = new Vocabulary();
    this.sentences = new int[]{};
    this.corpus = new int[]{};
  }
View Full Code Here

Examples of joshua.corpus.vocab.Vocabulary

    String corpusFileName = args[0];
    String binaryVocabFilename = args[1];
    String binaryCorpusFilename = args[2];
    String charset = (args.length > 3) ? args[3] : "UTF-8";
   
    Vocabulary symbolTable = new Vocabulary();
    int[] lengths = Vocabulary.initializeVocabulary(corpusFileName, symbolTable, true);
   
    CorpusArray corpusArray = SuffixArrayFactory.createCorpusArray(corpusFileName, symbolTable, lengths[0], lengths[1]);
   
    corpusArray.write(binaryCorpusFilename, binaryVocabFilename, charset);
View Full Code Here

Examples of joshua.corpus.vocab.Vocabulary

  private int[]      words;
 
 
  public BasicPhrase(byte language, String sentence) {
    this.language   = language;
    this.vocabulary = new Vocabulary();
    this.words = splitSentence(sentence, vocabulary);
  }
View Full Code Here

Examples of joshua.corpus.vocab.Vocabulary

    int lastSentence = Integer.parseInt(argv[3]);
    HashMap<Integer,Integer> chosenSentences = new HashMap<Integer,Integer>();
    for (int i = firstSentence; i < lastSentence; i++) {
      chosenSentences.put(i, i);
    }
    Vocabulary vocab = new Vocabulary();
    DiskHyperGraph dhg = new DiskHyperGraph(vocab, 0, true, null);
    dhg.initRead(itemsFile, rulesFile, chosenSentences);
    JungHyperGraph hg = new JungHyperGraph(dhg.readHyperGraph(), vocab);
    JFrame frame = new JFrame("Joshua Hypergraph");
    frame.getContentPane().add(new HyperGraphViewer(hg, vocab));
View Full Code Here

Examples of joshua.corpus.vocab.Vocabulary

    int lastSentence = Integer.parseInt(argv[3]);
    HashMap<Integer,Integer> chosenSentences = new HashMap<Integer,Integer>();
    for (int i = firstSentence; i < lastSentence; i++) {
      chosenSentences.put(i, i);
    }
    Vocabulary vocab = new Vocabulary();
    DiskHyperGraph dhg = new DiskHyperGraph(vocab, 0, true, null);
    dhg.initRead(itemsFile, rulesFile, chosenSentences);
    JungHyperGraph hg = new JungHyperGraph(dhg.readHyperGraph(), vocab);
    return;
  }
View Full Code Here

Examples of joshua.corpus.vocab.Vocabulary

          String binaryTargetFileName = joshDirName + File.separator + "target.corpus";
//          String binaryTargetSuffixesFileName = joshDirName + File.separator + "target.suffixes";
          String binaryAlignmentFileName = joshDirName + File.separator + "alignment.grids";

          logger.fine("Loading vocabulary...");
          Vocabulary commonVocab = new Vocabulary();
          ObjectInput in = BinaryIn.vocabulary(binaryVocabFileName);
          commonVocab.readExternal(in);

          logger.fine("Loading source corpus...");
          Corpus sourceCorpus = new MemoryMappedCorpusArray(commonVocab, binarySourceFileName);

          logger.fine("Loading source suffix array...");
View Full Code Here

Examples of joshua.corpus.vocab.Vocabulary

  //===============================================================
 
  public static void main(String[] args) throws IOException, ClassNotFoundException {


    Vocabulary symbolTable;
    Corpus corpusArray;
    Suffixes suffixArray;
    FrequentPhrases frequentPhrases;

    if (args.length == 1) {

      String corpusFileName = args[0];

      logger.info("Constructing vocabulary from file " + corpusFileName);
      symbolTable = new Vocabulary();
      int[] lengths = Vocabulary.initializeVocabulary(corpusFileName, symbolTable, true);

      logger.info("Constructing corpus array from file " + corpusFileName);
      corpusArray = SuffixArrayFactory.createCorpusArray(corpusFileName, symbolTable, lengths[0], lengths[1]);

      logger.info("Constructing suffix array from file " + corpusFileName);
      suffixArray = new SuffixArray(corpusArray, Cache.DEFAULT_CAPACITY);

    } else if (args.length == 3) {

      String binarySourceVocabFileName = args[0];
      String binaryCorpusFileName = args[1];
      String binarySuffixArrayFileName = args[2];

      if (logger.isLoggable(Level.INFO)) logger.info("Constructing source language vocabulary from binary file " + binarySourceVocabFileName);
      ObjectInput in = BinaryIn.vocabulary(binarySourceVocabFileName);
      symbolTable = new Vocabulary();
      symbolTable.readExternal(in);

      logger.info("Constructing corpus array from file " + binaryCorpusFileName);
      if (logger.isLoggable(Level.INFO)) logger.info("Constructing memory mapped source language corpus array.");
      corpusArray = new MemoryMappedCorpusArray(symbolTable, binaryCorpusFileName);
View Full Code Here

Examples of joshua.corpus.vocab.Vocabulary

      for (String sentence : to_be_or_not_to_be) {
        String[] array = sentence.split("\\s+");
        Arrays.sort(array);
        for (String s : array) { set.add(s); }
      }
      symbolTableToBe = new Vocabulary(set);
      int[] lengths = Vocabulary.initializeVocabulary(corpusFileName, new Vocabulary(), true);

      logger.fine("Constructing corpus array from file " + corpusFileName);
      corpusToBe = SuffixArrayFactory.createCorpusArray(corpusFileName, symbolTableToBe, lengths[0], lengths[1]);

      logger.fine("Constructing suffix array from file " + corpusFileName);
View Full Code Here

Examples of joshua.corpus.vocab.Vocabulary

        sourcePrintStream.println(sentence);
      }
      sourcePrintStream.close();
      corpusFileName = sourceFile.getAbsolutePath();
     
      Vocabulary symbolTable;
     
      logger.fine("Constructing vocabulary from file " + corpusFileName);
      ArrayList<String> words = new ArrayList<String>();
      for (String sentence : sentences) {
        String[] array = sentence.split("\\s+");
        for (String s : array) {
          if (! words.contains(s)) {
            words.add(s);
          }
        }
      }
      Collections.sort(words);
      LinkedHashSet<String> set = new LinkedHashSet<String>(words);
      symbolTable = new Vocabulary(set);
      int[] lengths = Vocabulary.initializeVocabulary(corpusFileName, new Vocabulary(), true);

      logger.fine("Constructing corpus array from file " + corpusFileName);
      corpusArray = SuffixArrayFactory.createCorpusArray(corpusFileName, symbolTable, lengths[0], lengths[1]);

      logger.fine("Constructing suffix array from file " + corpusFileName);
View Full Code Here

Examples of kea.vocab.Vocabulary

    m_DisallowInternalPeriods = disallow;
  }


  public void loadThesaurus(Stemmer st, Stopwords sw) {
    m_Vocabulary = new Vocabulary(m_vocabulary,m_vocabularyFormat, m_documentLanguage);

    m_Vocabulary.setStemmer(st);
    m_Vocabulary.setStopwords(sw);
    m_Vocabulary.initialize();
    try {
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.