// Source code of com.sun.speech.freetts.en.TokenizerImpl

/**
 * Portions Copyright 2001 Sun Microsystems, Inc.
 * Portions Copyright 1999-2001 Language Technologies Institute, 
 * Carnegie Mellon University.
 * All Rights Reserved.  Use is subject to license terms.
 * 
 * See the file "license.terms" for information on usage and
 * redistribution of this file, and for a DISCLAIMER OF ALL 
 * WARRANTIES.
 */
package com.sun.speech.freetts.en;


import com.sun.speech.freetts.Token;
import com.sun.speech.freetts.Tokenizer;
import java.io.Reader;
import java.io.IOException;




/**
 * Implements the tokenizer interface. Breaks an input sequence of
 * characters into a set of tokens.
 */
public class TokenizerImpl implements Tokenizer {
        
    /** A constant indicating that the end of the stream has been read. */
    public static final int EOF = -1;
    
    /** A string containing the default whitespace characters. */
    public static final String DEFAULT_WHITESPACE_SYMBOLS = " \t\n\r";
    
    /** A string containing the default single characters. */
    public static final String DEFAULT_SINGLE_CHAR_SYMBOLS = "(){}[]";
    
    /** A string containing the default pre-punctuation characters. */
    public static final String DEFAULT_PREPUNCTUATION_SYMBOLS = "\"'`({[";
    
    /** A string containing the default post-punctuation characters. */
    public static final String DEFAULT_POSTPUNCTUATION_SYMBOLS 
  = "\"'`.,:;!?(){}[]";




    // the line number
    private int lineNumber = 0;
    
    // the input text (from the Utterance) to tokenize
    private String inputText = null;


    // the file to read input text from, if using file mode
    private Reader reader = null;


    // the token position - doesn't seem really necessary at this point
    // private int tokenPosition = 0;


    // the current character, whether its from the file or the input text
    private int currentChar = 0;
    
    // the current char position for the input text (not the file)
    // this is called "file_pos" in flite
    private int currentPosition = 0;
    
    
    // the delimiting symbols of this tokenizer
    private String whitespaceSymbols = DEFAULT_WHITESPACE_SYMBOLS;
    private String singleCharSymbols = DEFAULT_SINGLE_CHAR_SYMBOLS;
    private String prepunctuationSymbols = DEFAULT_PREPUNCTUATION_SYMBOLS;
    private String postpunctuationSymbols = DEFAULT_POSTPUNCTUATION_SYMBOLS;


    // The error description
    private String errorDescription = null;
    
    // a place to store the current token
    private Token token;
    private Token lastToken = null;


    // for timing
    private long duration = 0;
        


    /**
     * Constructs a Tokenizer.
     */
    public TokenizerImpl() {
    }




    /**
     * Creates a tokenizer that will return tokens from
     * the given string.
     *
     * @param string the string to tokenize
     */
    public TokenizerImpl(String string) {
  setInputText(string);
    }


    /**
     * Creates a tokenizer that will return tokens from
     * the given file.
     *
     * @param file where to read the input from
     */
    public TokenizerImpl(Reader file) {
  setInputReader(file);
    }




    /**
     * Sets the whitespace symbols of this Tokenizer to the given symbols.
     *
     * @param symbols the whitespace symbols
     */
    public void setWhitespaceSymbols(String symbols) {
  whitespaceSymbols = symbols;
    }
        


    /**
     * Sets the single character symbols of this Tokenizer to the given
     * symbols.
     *
     * @param symbols the single character symbols
     */
    public void setSingleCharSymbols(String symbols) {
  singleCharSymbols = symbols;
    }
        


    /**
     * Sets the prepunctuation symbols of this Tokenizer to the given
     * symbols.
     *
     * @param symbols the prepunctuation symbols
     */
    public void setPrepunctuationSymbols(String symbols) {
  prepunctuationSymbols = symbols;
    }
        


    /**
     * Sets the postpunctuation symbols of this Tokenizer to the given
     * symbols.
     *
     * @param symbols the postpunctuation symbols
     */
    public void setPostpunctuationSymbols(String symbols) {
  postpunctuationSymbols = symbols;
    }
    


    /**
     * Sets the text to tokenize. 
     *
     * @param  inputString  the string to tokenize
     */
    public void setInputText(String inputString) {
  inputText = inputString;
  currentPosition = 0;
  
  if (inputText != null) {
      getNextChar();
  }
    }


    /**
     * Sets the input reader
     *
     * @param  reader the input source
     */
    public void setInputReader(Reader reader) {
  this.reader = reader;
  getNextChar();
    }
    


    /**
     * Returns the next token.
     *
     * @return  the next token if it exists,
     *          <code>null</code> if no more tokens
     */
    public Token getNextToken() {
  lastToken = token;
  token = new Token();
  
  // Skip whitespace
  token.setWhitespace(getTokenOfCharClass(whitespaceSymbols));
  
  // quoted strings currently ignored
  
  // get prepunctuation
  token.setPrepunctuation(getTokenOfCharClass(prepunctuationSymbols));
  
  // get the symbol itself
  if (singleCharSymbols.indexOf(currentChar) != -1) {
      token.setWord(String.valueOf((char) currentChar));
      getNextChar();
  } else {
      token.setWord(getTokenNotOfCharClass(whitespaceSymbols));
  }


  token.setPosition(currentPosition);
  token.setLineNumber(lineNumber);
  
  // This'll have token *plus* postpunctuation
  // Get postpunctuation
  removeTokenPostpunctuation();
  
  return token;
    }
    


    /**
     * Returns <code>true</code> if there are more tokens,
     *     <code>false</code> otherwise.
     *
     * @return <code>true</code> if there are more tokens
     *         <code>false</code> otherwise
     */
    public boolean hasMoreTokens() {
  int nextChar = currentChar;
  return (nextChar != EOF);
    }
    


    /**
     * Advances the currentPosition pointer by 1 (if not exceeding
     * length of inputText, and returns the character pointed by
     * currentPosition.
     *
     * @return the next character EOF if no more characters exist
     */
    private int getNextChar() {
  if (reader != null) {
      try {
          int readVal  = reader.read();
    if (readVal == -1) {
        currentChar = EOF;
    } else {
        currentChar = (char) readVal;
    }
      } catch (IOException ioe) {
    currentChar = EOF;
    errorDescription = ioe.getMessage();
      }
  } else if (inputText != null) {
      if (currentPosition < inputText.length()) {
    currentChar = (int) inputText.charAt(currentPosition);
      } else {
    currentChar = EOF;
      }
  }
  if (currentChar != EOF) {
      currentPosition++;
  }
  if (currentChar == '\n') {
      lineNumber++;
  }
  return currentChar;
    }
    


    /**
     * Starting from the current position of the input text,
     * returns the subsequent characters of type charClass,
     * and not of type singleCharSymbols.
     *
     * @param  charClass  the type of characters to look for
     * @param  buffer  the place to append characters of type charClass
     *
     * @return  a string of characters starting from the current position
     *          of the input text, until it encounters a character not
     *          in the string charClass
     *
     */
    private String getTokenOfCharClass(String charClass) {
  return getTokenByCharClass(charClass, true);
    }


    /**
     * Starting from the current position of the input text/file,
     * returns the subsequent characters, not of type singleCharSymbols,
     * and ended at characters of type endingCharClass.  E.g., if the current
     * string is "xxxxyyy", endingCharClass is "yz", and singleCharClass
     * "abc". Then this method will return to "xxxx".
     *
     * @param  endingCharClass  the type of characters to look for
     *
     * @return  a string of characters from the current position until
     *          it encounters characters in endingCharClass
     *
     */
    private String getTokenNotOfCharClass(String endingCharClass) {
  return getTokenByCharClass(endingCharClass, false);
    }
    
    /**
     * Provides a `compressed' method from getTokenOfCharClass() and 
     * getTokenNotOfCharClass().
     * If parameter containThisCharClass is <code>true</code>, 
     * then a string from the
     * current position to the last character in charClass is returned.
     * If containThisCharClass is <code>false</code>, then a string 
     * before the first
     * occurrence of a character in containThisCharClass is returned.
     *
     * @param  charClass  the string of characters you want included or
     *                    excluded in your return
     * @param  containThisCharClass  determines if you want characters
     *                in charClass in the returned string or not
     *
     * @return  a string of characters from the current position until
     *          it encounters characters in endingCharClass
     */
    private String getTokenByCharClass(String charClass, 
                                       boolean containThisCharClass) {  
  StringBuffer buffer = new StringBuffer();
  
  // if we want the returned string to contain chars in charClass, then
  // containThisCharClass is TRUE and
  // (charClass.indexOf(currentChar) != 1) == containThisCharClass)
  // returns true; if we want it to stop at characters of charClass,
  // then containThisCharClass is FALSE, and the condition returns
  // false.
  while ((charClass.indexOf(currentChar) != -1)
         == containThisCharClass  &&
         singleCharSymbols.indexOf(currentChar) == -1 &&
         currentChar != EOF) {
      buffer.append((char) currentChar);
      getNextChar();
  }
  return buffer.toString();
    }
    
    /**
     * Removes the postpunctuation characters from the current token.
     * Copies those postpunctuation characters to the class
     * variable 'postpunctuation'.
     */
    private void removeTokenPostpunctuation() {
  if (token != null) {
      String tokenWord = token.getWord();


      int tokenLength = tokenWord.length();
      int position = tokenLength - 1;
      
      while (position > 0 &&
       postpunctuationSymbols.indexOf
       ((int)tokenWord.charAt(position)) != -1) {
    position--;
      }
      
      if (tokenLength - 1 != position) {
    // Copy postpunctuation from token
    token.setPostpunctuation( tokenWord.substring(position+1));
    
    // truncate token at postpunctuation
    token.setWord(tokenWord.substring(0, position+1));
      } else {
    token.setPostpunctuation("");
      }
  }
    }


    /**
     * Returns <code>true</code> if there were errors while reading tokens
     *
     * @return <code>true</code> if there were errors;
     *     <code>false</code> otherwise
     */
    public boolean hasErrors() {
  return errorDescription != null;
    }


    /**
     * if hasErrors returns <code>true</code>, this will return a 
     * description of the error encountered, otherwise
     * it will return <code>null</code>
     *
     * @return a description of the last error that occurred.
     */
    public String getErrorDescription() {
  return errorDescription;
    }


    /**
     * Determines if the current token should start a new sentence.
     *
     * @return <code>true</code> if a new sentence should be started
     */
    public boolean isBreak() {


  String tokenWhiteSpace = token.getWhitespace();
  String lastTokenPostpunctuation = null;
  if (lastToken != null) {
      lastTokenPostpunctuation = lastToken.getPostpunctuation();
  }
  
  if (lastToken == null || token == null) {
      return false;
  } else if (tokenWhiteSpace.indexOf('\n') !=
       tokenWhiteSpace.lastIndexOf('\n')) {
      return true;
  } else if (lastTokenPostpunctuation.indexOf(':') != -1 ||
       lastTokenPostpunctuation.indexOf('?') != -1 ||
       lastTokenPostpunctuation.indexOf('!') != -1) {
      return true;
      } else if (lastTokenPostpunctuation.indexOf('.') != -1 &&
       tokenWhiteSpace.length() > 1 &&
       Character.isUpperCase(token.getWord().charAt(0))) {
      return true;
      } else {
      String lastWord = lastToken.getWord();
      int lastWordLength = lastWord.length();


      if (lastTokenPostpunctuation.indexOf('.') != -1 &&
    /* next word starts with a capital */
    Character.isUpperCase(token.getWord().charAt(0)) &&
    /* last word isn't an abbreviation */
    !(Character.isUpperCase
      (lastWord.charAt(lastWordLength - 1)) ||
      (lastWordLength < 4 &&
       Character.isUpperCase(lastWord.charAt(0))))) {
    return true;
      }
  }
  return false;
    }
}
// Source Code of com.sun.speech.freetts.en.TokenizerImpl

// Related Classes of com.sun.speech.freetts.en.TokenizerImpl