// Fetch the glyph substitution table; its String keys drive the tokenization below.
// NOTE(review): per its name, each key is presumably a character sequence that gets
// replaced by the mapped Glyph - confirm against getGlyphSubstitutionMap().
Map<String, Glyph> glyphSubstitutionMap = ttu.getGlyphSubstitutionMap();
// Collect the substitution keys in a comparator-ordered set. They are handed to the
// tokenizer as its token set; no regex is actually built in these lines.
// For Indic languages: push back the composite characters with smaller length.
// NOTE(review): the ordering comes entirely from IndicCompositeCharacterComparator,
// presumably so that longer composites are considered first - confirm in that class.
// Keys that the comparator reports as equal collapse into a single TreeSet entry.
Set<String> compositeCharacters = new TreeSet<String>(new IndicCompositeCharacterComparator());
compositeCharacters.addAll(glyphSubstitutionMap.keySet());
// Split the text into tokens, using the ordered composite characters as the token set.
// Passing a zero-length array lets toArray allocate a String[] of the right size.
// NOTE(review): these lines only produce String tokens; the conversion of tokens to
// Glyphs (including substitution) presumably happens in the code that follows.
ArrayBasedStringTokenizer tokenizer = new ArrayBasedStringTokenizer(compositeCharacters.toArray(new String[0]));
String[] tokens = tokenizer.tokenize(text);