Source Code of edu.cmu.sphinx.linguist.dictionary.FastDictionary

/*
 * Copyright 1999-2002 Carnegie Mellon University.  
 * Portions Copyright 2002 Sun Microsystems, Inc.  
 * Portions Copyright 2002 Mitsubishi Electric Research Laboratories.
 * All Rights Reserved.  Use is subject to license terms.
 * 
 * See the file "license.terms" for information on usage and
 * redistribution of this file, and for a DISCLAIMER OF ALL 
 * WARRANTIES.
 *
 */


package edu.cmu.sphinx.linguist.dictionary;


import edu.cmu.sphinx.linguist.acoustic.Context;
import edu.cmu.sphinx.linguist.acoustic.Unit;
import edu.cmu.sphinx.linguist.acoustic.UnitManager;
import edu.cmu.sphinx.linguist.g2p.G2PConverter;
import edu.cmu.sphinx.linguist.g2p.Path;
import edu.cmu.sphinx.util.Timer;
import edu.cmu.sphinx.util.TimerPool;
import edu.cmu.sphinx.util.props.ConfigurationManagerUtils;
import edu.cmu.sphinx.util.props.PropertyException;
import edu.cmu.sphinx.util.props.PropertySheet;


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
import java.util.logging.Logger;


/**
 * Creates a dictionary by quickly reading in an ASCII-based Sphinx-3 format
 * dictionary. It is called the FastDictionary because the loading is fast:
 * when the dictionary is loaded, each line of the dictionary file is simply
 * stored in a hash table, on the assumption that most words are never going
 * to be used. Only when a word is actually looked up are its pronunciations
 * parsed into an array of pronunciations.
 * <p>
 * The format of the ASCII dictionary that it reads is the same as for the
 * {@link FullDictionary FullDictionary}, i.e., the word, followed by spaces or
 * a tab, followed by the pronunciation(s). For example, a digits dictionary
 * will look like:
 * <p>
 * 
 * <pre>
 *  ONE HH W AH N
 *  ONE(2) W AH N
 *  TWO T UW
 *  THREE TH R IY
 *  FOUR F AO R
 *  FIVE F AY V
 *  SIX S IH K S
 *  SEVEN S EH V AH N
 *  EIGHT EY T
 *  NINE N AY N
 *  ZERO Z IH R OW
 *  ZERO(2) Z IY R OW
 *  OH OW
 * </pre>
 * <p>
 * In the above example, the words "one" and "zero" have two pronunciations
 * each.
 */


public class FastDictionary implements Dictionary {


    // -------------------------------
    // Configuration data
    // --------------------------------
    protected Logger logger;
    private boolean addSilEndingPronunciation;
    private boolean allowMissingWords;
    private boolean createMissingWords;
    private String wordReplacement;
    protected URL wordDictionaryFile;
    protected URL fillerDictionaryFile;
    protected URL g2pModelFile;
    protected int g2pMaxPron = 0;
    protected List<URL> addendaUrlList;


    protected UnitManager unitManager;


    // -------------------------------
    // working data
    // -------------------------------
    protected Map<String, String> dictionary;
    protected Map<String, Word> wordDictionary;
    protected G2PConverter g2pDecoder;


    protected final static String FILLER_TAG = "-F-";
    protected Set<String> fillerWords;
    protected boolean allocated;


    public FastDictionary(String wordDictionaryFile,
            String fillerDictionaryFile, List<URL> addendaUrlList,
            boolean addSilEndingPronunciation, String wordReplacement,
            boolean allowMissingWords, boolean createMissingWords,
            UnitManager unitManager) throws MalformedURLException,
            ClassNotFoundException {
        this(ConfigurationManagerUtils.resourceToURL(wordDictionaryFile),
             ConfigurationManagerUtils.resourceToURL(fillerDictionaryFile),
             addendaUrlList,
             addSilEndingPronunciation,
             wordReplacement,
             allowMissingWords,
             createMissingWords,
             unitManager);
    }


    public FastDictionary(URL wordDictionaryFile, URL fillerDictionaryFile,
            List<URL> addendaUrlList, boolean addSilEndingPronunciation,
            String wordReplacement, boolean allowMissingWords,
            boolean createMissingWords, UnitManager unitManager) {
        this.logger = Logger.getLogger(getClass().getName());


        this.wordDictionaryFile = wordDictionaryFile;
        this.fillerDictionaryFile = fillerDictionaryFile;
        this.addendaUrlList = addendaUrlList;
        this.addSilEndingPronunciation = addSilEndingPronunciation;
        this.wordReplacement = wordReplacement;
        this.allowMissingWords = allowMissingWords;
        this.createMissingWords = createMissingWords;
        this.unitManager = unitManager;
    }


    public FastDictionary(URL wordDictionaryFile, URL fillerDictionaryFile,
            List<URL> addendaUrlList, boolean addSilEndingPronunciation,
            String wordReplacement, boolean allowMissingWords,
            boolean createMissingWords, UnitManager unitManager,
            URL g2pModelFile, int g2pMaxPron) {
        this(wordDictionaryFile,
             fillerDictionaryFile,
             addendaUrlList,
             addSilEndingPronunciation,
             wordReplacement,
             allowMissingWords,
             createMissingWords,
             unitManager);
        this.g2pModelFile = g2pModelFile;
        this.g2pMaxPron = g2pMaxPron;
    }


    public FastDictionary() {


    }


    /*
     * (non-Javadoc)
     * 
     * @see
     * edu.cmu.sphinx.util.props.Configurable#newProperties(edu.cmu.sphinx.util
     * .props.PropertySheet)
     */


    public void newProperties(PropertySheet ps) throws PropertyException {
        logger = ps.getLogger();


        wordDictionaryFile = ConfigurationManagerUtils
                .getResource(PROP_DICTIONARY, ps);
        fillerDictionaryFile = ConfigurationManagerUtils
                .getResource(PROP_FILLER_DICTIONARY, ps);
        addendaUrlList = ps.getResourceList(PROP_ADDENDA);
        addSilEndingPronunciation = ps
                .getBoolean(PROP_ADD_SIL_ENDING_PRONUNCIATION);
        wordReplacement = ps.getString(Dictionary.PROP_WORD_REPLACEMENT);
        allowMissingWords = ps.getBoolean(Dictionary.PROP_ALLOW_MISSING_WORDS);
        createMissingWords = ps.getBoolean(PROP_CREATE_MISSING_WORDS);
        unitManager = (UnitManager) ps.getComponent(PROP_UNIT_MANAGER);
        g2pModelFile = ConfigurationManagerUtils
                .getResource(PROP_G2P_MODEL_PATH, ps);
        g2pMaxPron = ps.getInt(PROP_G2P_MAX_PRONUNCIATIONS);
    }


    /**
     * Get the word dictionary file
     * 
     * @return the URL of the word dictionary file
     */
    public URL getWordDictionaryFile() {
        return wordDictionaryFile;
    }


    /**
     * Get the filler dictionary file
     * 
     * @return the URL of the filler dictionary file
     */
    public URL getFillerDictionaryFile() {
        return fillerDictionaryFile;
    }


    /*
     * (non-Javadoc)
     * 
     * @see edu.cmu.sphinx.linguist.dictionary.Dictionary#allocate()
     */


    public void allocate() throws IOException {
        if (!allocated) {
            dictionary = new HashMap<String, String>();
            wordDictionary = new HashMap<String, Word>();


            Timer loadTimer = TimerPool.getTimer(this, "Load Dictionary");
            fillerWords = new HashSet<String>();


            loadTimer.start();


            logger.info("Loading dictionary from: " + wordDictionaryFile);


            loadDictionary(wordDictionaryFile.openStream(), false);


            loadCustomDictionaries(addendaUrlList);


            logger.info("Loading filler dictionary from: "
                    + fillerDictionaryFile);


            loadDictionary(fillerDictionaryFile.openStream(), true);


            if (g2pModelFile != null && !g2pModelFile.getPath().equals("")) {
                g2pDecoder = new G2PConverter(g2pModelFile);
            }
            loadTimer.stop();
        }


    }


    /*
     * (non-Javadoc)
     * 
     * @see edu.cmu.sphinx.linguist.dictionary.Dictionary#deallocate()
     */


    public void deallocate() {
        if (allocated) {
            dictionary = null;
            g2pDecoder = null;
            allocated = false;
        }
    }


    /**
     * Loads the given sphinx3 style simple dictionary from the given
     * InputStream. The InputStream is assumed to contain ASCII data.
     * 
     * @param inputStream
     *            the InputStream of the dictionary
     * @param isFillerDict
     *            true if this is a filler dictionary, false otherwise
     * @throws java.io.IOException
     *             if there is an error reading the dictionary
     */
    protected void loadDictionary(InputStream inputStream, boolean isFillerDict)
            throws IOException {
        InputStreamReader isr = new InputStreamReader(inputStream);
        BufferedReader br = new BufferedReader(isr);
        String line;


        while ((line = br.readLine()) != null) {
            if (!line.isEmpty()) {
                int spaceIndex = line.indexOf(' ');
                int spaceIndexTab = line.indexOf('\t');
                if (spaceIndex == -1) {
                    // Case where there's no blank character
                    spaceIndex = spaceIndexTab;
                } else if ((spaceIndexTab >= 0) && (spaceIndexTab < spaceIndex)) {
                    // Case where there's a blank and a tab, but the tab
                    // precedes the blank
                    spaceIndex = spaceIndexTab;
                }
                // TODO: throw an exception if spaceIndex == -1 ?
                if (spaceIndex == -1) {
                    throw new Error("Error loading word: " + line);
                }
                String word = line.substring(0, spaceIndex);
                word = word.toLowerCase();
                // Add numeric index if the word is repeating.
                if (dictionary.containsKey(word)) {
                    int index = 2;
                    String wordWithIdx;
                    do {
                        wordWithIdx = String.format("%s(%d)", word, index++);
                    } while (dictionary.containsKey(wordWithIdx));
                    word = wordWithIdx;
                }


                if (isFillerDict) {
                    dictionary.put(word, (FILLER_TAG + line));
                    fillerWords.add(word);
                } else {
                    dictionary.put(word, line);
                }
            }
        }


        br.close();
        isr.close();
        inputStream.close();
    }


    /**
     * Gets a context independent unit. There should only be one instance of any
     * CI unit
     * 
     * @param name
     *            the name of the unit
     * @param isFiller
     *            if true, the unit is a filler unit
     * @return the unit
     */
    protected Unit getCIUnit(String name, boolean isFiller) {
        return unitManager.getUnit(name, isFiller, Context.EMPTY_CONTEXT);
    }


    /**
     * Returns the sentence start word.
     * 
     * @return the sentence start word
     */
    public Word getSentenceStartWord() {
        return getWord(SENTENCE_START_SPELLING);
    }


    /**
     * Returns the sentence end word.
     * 
     * @return the sentence end word
     */
    public Word getSentenceEndWord() {
        return getWord(SENTENCE_END_SPELLING);
    }


    /**
     * Returns the silence word.
     * 
     * @return the silence word
     */
    public Word getSilenceWord() {
        return getWord(SILENCE_SPELLING);
    }


    /**
     * Returns a Word object based on the spelling and its classification. The
     * behavior of this method is also affected by the properties
     * wordReplacement, allowMissingWords, and createMissingWords.
     * 
     * @param text
     *            the spelling of the word of interest.
     * @return a Word object
     * @see edu.cmu.sphinx.linguist.dictionary.Word
     */
    public Word getWord(String text) {
        text = text.toLowerCase();
        Word wordObject = wordDictionary.get(text);


        if (wordObject != null) {
            return wordObject;
        }


        String word = dictionary.get(text);
        if (word == null) { // deal with 'not found' case
            logger.info("The dictionary is missing a phonetic transcription for the word '"
                    + text + "'");
            if (wordReplacement != null) {
                wordObject = getWord(wordReplacement);
            } else if (allowMissingWords) {
                if (createMissingWords) {
                    if (g2pModelFile != null
                            && !g2pModelFile.getPath().equals("")) {
                        logger.info("Generating phonetic transcription(s) for the word '"
                                + text + "' using g2p model");
                        ArrayList<Path> paths = g2pDecoder
                                .phoneticize(text, g2pMaxPron);
                        List<Pronunciation> pronunciations = new LinkedList<Pronunciation>();
                        for (Path p : paths) {
                            int unitCount = p.getPath().size();
                            ArrayList<Unit> units = new ArrayList<Unit>(unitCount);
                            for (String token : p.getPath()) {
                                units.add(getCIUnit(token, false));
                            }
                            if (units.size() == 0) {
                                units.add(UnitManager.SILENCE);
                            }
                            pronunciations.add(new Pronunciation(units));
                            if (addSilEndingPronunciation) {
                                units.add(UnitManager.SILENCE);
                                pronunciations.add(new Pronunciation(units));
                            }
                        }
                        Pronunciation[] pronunciationsArray = pronunciations
                                .toArray(new Pronunciation[pronunciations
                                        .size()]);
                        wordObject = createWord(text,
                                                pronunciationsArray,
                                                false);
                        for (Pronunciation pronunciation : pronunciationsArray) {
                            pronunciation.setWord(wordObject);
                        }
                        wordDictionary.put(text, wordObject);
                    } else {
                        wordObject = createWord(text, null, false);
                    }
                }
            }
        } else { // first lookup for this string
            wordObject = processEntry(text);
        }


        return wordObject;
    }


    /**
     * Create a Word object with the given spelling and pronunciations, and
     * insert it into the dictionary.
     * 
     * @param text
     *            the spelling of the word
     * @param pronunciation
     *            the pronunciation of the word
     * @param isFiller
     *            if <code>true</code> this is a filler word
     * @return the word
     */
    private Word createWord(String text,
                            Pronunciation[] pronunciation,
                            boolean isFiller) {
        Word word = new Word(text, pronunciation, isFiller);
        dictionary.put(text, word.toString());
        return word;
    }


    /**
     * Processes a dictionary entry. When loaded the dictionary just loads each
     * line of the dictionary into the hash table, assuming that most words are
     * not going to be used. Only when a word is actually used is its
     * pronunciations massaged into an array of pronunciations.
     */
    private Word processEntry(String word) {
        List<Pronunciation> pronunciations = new LinkedList<Pronunciation>();
        String line;
        int count = 0;
        boolean isFiller = false;


        do {
            count++;
            String lookupWord = word;
            if (count > 1) {
                lookupWord = lookupWord + '(' + count + ')';
            }
            line = dictionary.get(lookupWord);
            if (line != null) {
                StringTokenizer st = new StringTokenizer(line);


                String tag = st.nextToken();
                isFiller = tag.startsWith(FILLER_TAG);
                int unitCount = st.countTokens();


                ArrayList<Unit> units = new ArrayList<Unit>(unitCount);
                for (int i = 0; i < unitCount; i++) {
                    String unitName = st.nextToken();
                    units.add(getCIUnit(unitName, isFiller));
                }
                pronunciations.add(new Pronunciation(units));
                if (!isFiller && addSilEndingPronunciation) {
                    units.add(UnitManager.SILENCE);
                    pronunciations.add(new Pronunciation(units));
                }
            }
        } while (line != null);


        Pronunciation[] pronunciationsArray = pronunciations
                .toArray(new Pronunciation[pronunciations.size()]);
        Word wordObject = createWord(word, pronunciationsArray, isFiller);


        for (Pronunciation pronunciation : pronunciationsArray) {
            pronunciation.setWord(wordObject);
        }
        wordDictionary.put(word, wordObject);


        return wordObject;
    }


    /**
     * Returns the set of all possible word classifications for this dictionary.
     * 
     * @return the set of all possible word classifications
     */
    public WordClassification[] getPossibleWordClassifications() {
        return null;
    }


    /**
     * Returns a string representation of this FastDictionary in alphabetical
     * order.
     * 
     * @return a string representation of this FastDictionary
     */
    @Override
    public String toString() {


        SortedMap<String, String> sorted = new TreeMap<String, String>(dictionary);
        StringBuilder result = new StringBuilder();


        for (Map.Entry<String, String> entry : sorted.entrySet()) {
            result.append(entry.getKey());
            result.append("   ").append(entry.getValue()).append('\n');
        }


        return result.toString();
    }


    /**
     * Gets the set of all filler words in the dictionary
     * 
     * @return an array (possibly empty) of all filler words
     */
    public Word[] getFillerWords() {
        Word[] fillerWordArray = new Word[fillerWords.size()];
        int index = 0;
        for (String spelling : fillerWords) {
            fillerWordArray[index++] = getWord(spelling);
        }
        return fillerWordArray;
    }


    /**
     * Dumps this FastDictionary to System.out.
     */
    public void dump() {
        System.out.print(toString());
    }


    /**
     * Loads the dictionary with a list of URLs to custom dictionary resources
     * 
     * @param addenda
     *            the list of custom dictionary URLs to be loaded
     * @throws IOException
     *             if there is an error reading the resource URL
     */
    private void loadCustomDictionaries(List<URL> addenda) throws IOException {
        if (addenda != null) {
            for (URL addendumUrl : addenda) {
                logger.info("Loading addendum dictionary from: " + addendumUrl);
                loadDictionary(addendumUrl.openStream(), false);
            }
        }
    }


}
Source Code of edu.cmu.sphinx.linguist.dictionary.FastDictionary

Related Classes of edu.cmu.sphinx.linguist.dictionary.FastDictionary