Package ivory.core.tokenize

Examples of ivory.core.tokenize.Tokenizer
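The snippets below are drawn from Ivory's cross-language retrieval (CLIR) code. They show the two ways a Tokenizer is obtained from TokenizerFactory.createTokenizer: a six-argument overload (language, tokenizer model file, stemming flag, stopword file, stemmed stopword file, vocabulary) and a four-argument overload (Hadoop FileSystem, language, model file, boolean flag). Tokenization itself happens through Tokenizer.processContent(String), which returns a String[] of processed terms.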


    // set average doc length
    mModel.setAvgDocLength(avgLen);

    List<HMapSFW> transDocs = new ArrayList<HMapSFW>();
    Tokenizer tokenizer = TokenizerFactory.createTokenizer(eLang,
        eTokenizerModelFile, true, eStopwordsFile, eStopwordsFile + ".stemmed", null);

    // translate doc texts here
    for (HMapSIW deDoc : docs) {
      HMapIFW tfS = new HMapIFW();
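Putting the pattern together: a minimal sketch of creating a tokenizer and processing one line of text, assuming Ivory is on the classpath. The class name and file paths are hypothetical placeholders; the six-argument overload is the one used in the snippet above.

    import ivory.core.tokenize.Tokenizer;
    import ivory.core.tokenize.TokenizerFactory;

    public class TokenizeSketch {
      public static void main(String[] args) {
        // Hypothetical paths; substitute your own model and stopword files.
        String modelFile = "data/token/en-token.bin";
        String stopwords = "data/token/en.stop";

        // Six-argument overload, as in the snippet above:
        // (language, model file, stemming flag, stopwords, stemmed stopwords, vocab)
        Tokenizer tokenizer = TokenizerFactory.createTokenizer(
            "en", modelFile, true, stopwords, stopwords + ".stemmed", null);

        // processContent() returns the processed terms of the input text.
        for (String token : tokenizer.processContent("This is a sample sentence.")) {
          System.out.println(token);
        }
      }
    }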


  }

  // read from the special wiki format created by Smith et al. for their 2010 paper
  private void readWikiSentences(String eReadFile, String fReadFile, String pairsFile, String eLang, String fLang,
      Vocab eVocab, Vocab fVocab, String fToken, String eToken, String fStopwordsFile, String eStopwordsFile) {
    Tokenizer eTokenizer = TokenizerFactory.createTokenizer(eLang, eToken, true, eStopwordsFile, eStopwordsFile + ".stemmed", null);
    Tokenizer fTokenizer = TokenizerFactory.createTokenizer(fLang, fToken, true, fStopwordsFile, fStopwordsFile + ".stemmed", null);

    try {
      BufferedReader dis1 = new BufferedReader(new InputStreamReader(new FileInputStream(new File(eReadFile)), "UTF-8"));
      BufferedReader dis2 = new BufferedReader(new InputStreamReader(new FileInputStream(new File(fReadFile)), "UTF-8"));
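The two BufferedReaders read line-aligned files, one sentence pair per line. Note that sibling snippets below call dis2.readLine().trim() without a null check, which throws a NullPointerException if the second file is shorter. A minimal sketch of a defensive parallel read, using only the JDK; the class name and file names are hypothetical:

    import java.io.BufferedReader;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStreamReader;

    public class ParallelReadSketch {
      public static void main(String[] args) throws IOException {
        try (BufferedReader eReader = new BufferedReader(
                 new InputStreamReader(new FileInputStream("corpus.e"), "UTF-8"));
             BufferedReader fReader = new BufferedReader(
                 new InputStreamReader(new FileInputStream("corpus.f"), "UTF-8"))) {
          String eLine, fLine;
          // Stop as soon as either file runs out, so a length mismatch cannot NPE.
          while ((eLine = eReader.readLine()) != null
              && (fLine = fReader.readLine()) != null) {
            processPair(eLine.trim(), fLine.trim());
          }
        }
      }

      private static void processPair(String e, String f) {
        // tokenize and count the aligned sentence pair here
      }
    }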

  // regular format: one sentence per line, one sentence per doc
  private void readSentences(int sentsPerDoc, String eReadFile, String fReadFile, String eLang, String fLang,
      String fToken, String eToken, String fStopwordsFile, String eStopwordsFile) throws IOException,
      ClassNotFoundException, InstantiationException, IllegalAccessException {
    Tokenizer eTokenizer = TokenizerFactory.createTokenizer(eLang, eToken, true, eStopwordsFile, eStopwordsFile + ".stemmed", null);
    Tokenizer fTokenizer = TokenizerFactory.createTokenizer(fLang, fToken, true, fStopwordsFile, fStopwordsFile + ".stemmed", null);

    float sumFLengs = 0, sumELengs = 0;

    try {
      BufferedReader dis1 = new BufferedReader(new InputStreamReader(new FileInputStream(new File(eReadFile)), "UTF-8"));
      BufferedReader dis2 = new BufferedReader(new InputStreamReader(new FileInputStream(new File(fReadFile)), "UTF-8"));
      HMapSIW fDoc = new HMapSIW();
      HMapSIW eDoc = new HMapSIW();
      String eLine = null, fLine = null;
      int cntEDocs = 0, cntFDocs = 0, lastDocLenE = 0, lastDocLenF = 0, numSents = 0;

      while ((eLine = dis1.readLine()) != null) {
        fLine = dis2.readLine().trim(); // assumes line-aligned files; NPEs if fReadFile is shorter
        eLine = eLine.trim();

        String[] tokens = fTokenizer.processContent(fLine);
        lastDocLenF += tokens.length;

        for (String token : tokens) {
          if (!fDoc.containsKey(token)) { // first time we've seen this token in this doc
            dfD.increment(token);
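The loop above counts document frequency (dfD) by incrementing only on a token's first occurrence in the current document. The same bookkeeping as a sketch, using plain java.util maps in place of the HMapSIW used by Ivory; the class name is hypothetical:

    import java.util.HashMap;
    import java.util.Map;

    public class TfDfSketch {
      public static void main(String[] args) {
        Map<String, Integer> docTf = new HashMap<>(); // term frequency within one doc
        Map<String, Integer> df = new HashMap<>();    // document frequency across docs

        String[] tokens = {"a", "b", "a"};
        for (String token : tokens) {
          if (!docTf.containsKey(token)) {
            // first occurrence in this doc counts toward document frequency
            df.merge(token, 1, Integer::sum);
          }
          docTf.merge(token, 1, Integer::sum);
        }
        System.out.println(docTf); // {a=2, b=1}
        System.out.println(df);    // {a=1, b=1}
      }
    }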

      eVocabTrg = HadoopAlign.loadVocab(new Path(eVocabTrgFile), localFs);
      fVocabSrc = HadoopAlign.loadVocab(new Path(fVocabSrcFile), localFs);
      fVocabTrg = HadoopAlign.loadVocab(new Path(fVocabTrgFile), localFs);
      f2e_Probs = new TTable_monolithic_IFAs(localFs, new Path(probTablef2eFile), true);
      e2f_Probs = new TTable_monolithic_IFAs(localFs, new Path(probTablee2fFile), true);
      Tokenizer fTokenizer = TokenizerFactory.createTokenizer(localFs, fLang, fTokenFile, false);
      Tokenizer eTokenizer = TokenizerFactory.createTokenizer(localFs, eLang, eTokenFile, false);
      long startTime = System.currentTimeMillis();

      if (pairsFile == null) {
        readSentences(1, eFile, fFile, eLang, fLang,
            fTokenFile, eTokenFile, fStopwordsFile, eStopwordsFile);
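This variant uses the four-argument overload, which takes a Hadoop FileSystem handle so the tokenizer model can be loaded from HDFS or the local file system; no stopword files are passed, and the final boolean is false here. A minimal sketch with a hypothetical class name and model path:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import ivory.core.tokenize.Tokenizer;
    import ivory.core.tokenize.TokenizerFactory;

    public class FsTokenizerSketch {
      public static void main(String[] args) throws Exception {
        // Local file system handle; in a Hadoop job this could be HDFS instead.
        FileSystem localFs = FileSystem.getLocal(new Configuration());

        // Four-argument overload as in the snippet above.
        Tokenizer tokenizer = TokenizerFactory.createTokenizer(
            localFs, "de", "data/token/de-token.bin", false);
        for (String token : tokenizer.processContent("Ein kurzer Beispielsatz.")) {
          System.out.println(token);
        }
      }
    }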

    // translate doc texts here
    for (HMapSIW deDoc : docs) {
      HMapIFW tfS = new HMapIFW();
      int docLen = 0;
      try {
        Tokenizer tokenizer = TokenizerFactory.createTokenizer(eLang, null, true, eStopwordsFile, eStopwordsFile + ".stemmed", null);
        docLen = CLIRUtils.translateTFs(deDoc, tfS, eVocabSrc, eVocabTrg, fVocabSrc,
            fVocabTrg, e2f_Probs, f2e_Probs, tokenizer, null);   // tokenizer is used only for its stopword list
      } catch (IOException e) {
        e.printStackTrace();
      }
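Per the inline comment, translateTFs uses the tokenizer only for its stopword list, so the createTokenizer call could be hoisted out of the loop; the sibling snippets above construct the tokenizer once, before iterating over docs.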

    File fFile = new File(fReadFile);

    FileInputStream fis1 = null, fis2 = null;
    BufferedReader dis1 = null, dis2 = null;

    Tokenizer eTokenizer = TokenizerFactory.createTokenizer(eLang, eToken, true, eStopwordsFile, eStopwordsFile + ".stemmed", null);
    Tokenizer fTokenizer = TokenizerFactory.createTokenizer(fLang, fToken, true, fStopwordsFile, fStopwordsFile + ".stemmed", null);

    float sumFLengs = 0, sumELengs = 0;

    try {
      fis1 = new FileInputStream(eFile);
      fis2 = new FileInputStream(fFile);
      dis1 = new BufferedReader(new InputStreamReader(fis1, "UTF-8"));
      dis2 = new BufferedReader(new InputStreamReader(fis2, "UTF-8"));
      HMapSIW fSent = new HMapSIW();
      HMapSIW eSent = new HMapSIW();
      String eLine = null, fLine = null;
      int cntE = 0, cntF = 0, lastSentLenE = 0, lastSentLenF = 0;

      while ((eLine = dis1.readLine()) != null) {
        fLine = dis2.readLine().trim();
        eLine = eLine.trim();

        String[] tokens;
        if (fTokenizer == null) {
          tokens = fLine.split(" ");
        } else {
          tokens = fTokenizer.processContent(fLine);
        }
        lastSentLenF = tokens.length;

        for (String token : tokens) {
          if (!fSent.containsKey(token)) { // first time we've seen this token in this sentence
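The null guard above falls back to plain whitespace splitting when no tokenizer is available (note that in this excerpt fTokenizer is assigned unconditionally a few lines earlier, so the guard only matters if createTokenizer can return null). The pattern as a small sketch, with a hypothetical class name:

    import ivory.core.tokenize.Tokenizer;

    public class TokenizeOrSplit {
      // Tokenize with a language-specific tokenizer when one is configured,
      // otherwise fall back to whitespace splitting, as in the guard above.
      static String[] tokenize(Tokenizer tokenizer, String line) {
        return (tokenizer != null) ? tokenizer.processContent(line) : line.split(" ");
      }
    }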

    // set average doc length
    mModel.setAvgDocLength(avgDeDocLeng);

    List<HMapSFW> transDocs = new ArrayList<HMapSFW>();
    Tokenizer tokenizer = TokenizerFactory.createTokenizer(eLang,
        eTokenizerModelFile, true, eStopwordsFile, eStopwordsFile + ".stemmed", null);

    // translate doc texts here
    for (HMapSIW deDoc : docs) {
      HMapIFW tfS = new HMapIFW();


      eVocabTrg = HadoopAlign.loadVocab(new Path(eVocabTrgFile), localFs);
      fVocabSrc = HadoopAlign.loadVocab(new Path(fVocabSrcFile), localFs);
      fVocabTrg = HadoopAlign.loadVocab(new Path(fVocabTrgFile), localFs);
      f2e_Probs = new TTable_monolithic_IFAs(localFs, new Path(probTablef2eFile), true);
      e2f_Probs = new TTable_monolithic_IFAs(localFs, new Path(probTablee2fFile), true);
      Tokenizer fTokenizer = TokenizerFactory.createTokenizer(localFs, fLang, fTokenFile, false);
      Tokenizer eTokenizer = TokenizerFactory.createTokenizer(localFs, eLang, eTokenFile, false);
      long startTime = System.currentTimeMillis();
     
      readSentences(eName, fName, eLang, fLang, eVocabTrg, fVocabSrc,
          fTokenFile, eTokenFile, fStopwordsFile, eStopwordsFile);

