/**
*
* @author marc.molins.piulachs
*/
package domain;
import java.util.StringTokenizer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
/**
 * A text entered by the user together with the state of a nominalization
 * session over it: the verbs detected in the text, their word positions, the
 * verb that is currently selected and the candidate nouns for that verb.
 *
 * <p>Word positions are 1-based: the first word of the clean text has
 * position 1. Counters are mirrored into the {@code UserStatistics} and
 * {@code TextStatistics} objects obtained through their {@code getInstance()}
 * methods.
 */
public class Text {
    /** Characters dropped from the raw text by {@link #cleanString(String)}. */
    private final static String PUNCTUATION_MARKS = "?!.;,:()[]{}<>\"\'—¿¡«»*";

    /**
     * Auxiliary verb forms, compared ignoring case. Contracted forms such as
     * "don't" are not listed; note that {@link #cleanString(String)} strips
     * the apostrophe, so they would never reach the lookup in that shape.
     */
    private final static Set<String> AUX_VERBS = buildAuxVerbs();

    /** Raw text as supplied by the caller; {@code null} until a text is entered. */
    private String _text;
    /** Lower-cased text without punctuation marks; this is what gets tokenized. */
    private String _cleanText;
    /** Number of whitespace-separated words counted in the clean text. */
    private int _totalWords;
    /**
     * One entry per detected verb: ((word as written, infinitive),
     * (1-based word position, already-visited flag)).
     */
    private ArrayList< Pair<Pair<String,String>,Pair<Integer,Boolean>> > _verbsInfinitivesPositionVisited;
    /** Word position of every detected verb, parallel to the list above. */
    private ArrayList< Integer > _positions;
    /** Candidate nouns, with their probability, for the currently selected verb. */
    private ArrayList< Pair<String,Float> > _currentNounsAndProb;
    /** Index (into the verb list) of the currently selected verb. */
    private int _currentNominalization;
    /** Number of detected verbs; equals the size of the verb list. */
    private int _totalVerbs;
    /** Whether {@link #startNominalization()} has run for the current text. */
    private boolean _startedNominalization;
    private VerbDictionary _verbDict;
    private TestDictionary _nounDict;
    private int _learntRules;
    private int _learntPairs;
    /** Infinitive that occurs most often among the detected verbs, or "-" if none. */
    private String _mostCommonVerb;
    private UserStatistics _uStats;
    private TextStatistics _tStats;
    /** Not read or written anywhere in this class. */
    private String _auxText;

    /**
     * Creates an empty text bound to the given dictionaries and resets the
     * per-text statistics.
     *
     * @param verbDict dictionary used to map a word to its infinitive
     * @param nounDict dictionary used to look up candidate nouns for an infinitive
     */
    public Text(VerbDictionary verbDict, TestDictionary nounDict) {
        _verbDict = verbDict;
        _nounDict = nounDict;
        _uStats = UserStatistics.getInstance();
        _tStats = TextStatistics.getInstance();
        resetState();
    }

    /** Builds the case-insensitive set of auxiliary verb forms. */
    private static Set<String> buildAuxVerbs() {
        Set<String> verbs = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
        verbs.addAll(Arrays.asList("have",
                "has", "had", "shall", "will", "should", "would", "am", "are", "is",
                "be", "been", "was", "were", "let", "do", "does", "did", "can",
                "could", "may", "might", "ought", "must", "doing", "having"));
        return verbs;
    }

    /** Clears every per-text field and zeroes the per-text statistics. */
    private void resetState() {
        _text = null;
        _cleanText = null;
        _totalWords = 0;
        _currentNominalization = 0;
        _totalVerbs = 0;
        _startedNominalization = false;
        _positions = new ArrayList<Integer>();
        _verbsInfinitivesPositionVisited = new ArrayList< Pair<Pair<String,String>,Pair<Integer,Boolean>> >();
        _currentNounsAndProb = new ArrayList< Pair<String,Float> >();
        _learntRules = 0;
        _learntPairs = 0;
        _mostCommonVerb = "-";
        _tStats.setTotalWords(0);
        _tStats.setMarkedWords(0);
        _tStats.setLearntPairs(0);
        _tStats.setLearntRules(0);
        _tStats.setMostCommonVerb("-");
    }

    /** Returns whether {@code verb} is one of the auxiliary forms, ignoring case. */
    private boolean isAuxVerb(String verb) {
        return AUX_VERBS.contains(verb);
    }

    /**
     * Records a detected verb: appends it to the verb list (not yet visited),
     * stores its position and bumps the verb counter.
     */
    private void markVerb(String word, String infinitive, int position) {
        Pair<String,String> forms = new Pair<String,String>();
        forms.first = word;
        forms.second = infinitive;

        Pair<Integer,Boolean> state = new Pair<Integer,Boolean>();
        state.first = position;
        state.second = false;

        Pair<Pair<String,String>,Pair<Integer,Boolean>> entry = new Pair<Pair<String,String>,Pair<Integer,Boolean>>();
        entry.first = forms;
        entry.second = state;

        _verbsInfinitivesPositionVisited.add(entry);
        _positions.add(position);
        ++_totalVerbs;
    }

    /**
     * Tokenizes {@code text} on whitespace, counts the words and records every
     * word for which the verb dictionary knows an infinitive.
     *
     * <p>An auxiliary form is skipped when the word right after it is itself a
     * known verb; the last word of the text is never skipped on that ground
     * because nothing follows it.
     */
    private void recogniseVerbs(String text) {
        StringTokenizer tokens = new StringTokenizer(text);
        if (!tokens.hasMoreTokens()) {
            return;
        }
        String current = tokens.nextToken();
        ++_totalWords;
        while (tokens.hasMoreTokens()) {
            String next = tokens.nextToken();
            ++_totalWords;
            String infinitive = _verbDict.getInfinitive(current);
            // The follower is only looked up when `current` is an auxiliary.
            if (infinitive != null
                    && (!isAuxVerb(current) || _verbDict.getInfinitive(next) == null)) {
                // The counter already includes `next`, so `current` is one behind it.
                markVerb(current, infinitive, _totalWords - 1);
            }
            current = next;
        }
        // `current` is now the last word (or the only one).
        String lastInfinitive = _verbDict.getInfinitive(current);
        if (lastInfinitive != null) {
            markVerb(current, lastInfinitive, _totalWords);
        }
    }

    /**
     * Loads the candidate nouns for the currently selected verb into
     * {@code _currentNounsAndProb}.
     *
     * <p>The first time a verb with at least one candidate is selected it is
     * flagged as visited and, for each candidate, every rule after the first
     * one is counted as learnt (locally and in both statistics objects).
     */
    private void nominalization() {
        _currentNounsAndProb = new ArrayList< Pair<String,Float> >();
        Pair<Pair<String,String>,Pair<Integer,Boolean>> entry =
                _verbsInfinitivesPositionVisited.get(_currentNominalization);
        ArrayList<Three<String, Float, ArrayList<Pair<String, String>>>> candidates =
                _nounDict.getCorrectList(entry.first.second);
        if (candidates == null) {
            // Defensive: a missing list is handled like an empty one.
            return;
        }
        // Snapshot taken before the loop: the flag flips inside it, but rules
        // must still be counted for every candidate of this first visit.
        boolean firstVisit = !entry.second.second;
        for (Three<String, Float, ArrayList<Pair<String, String>>> candidate : candidates) {
            Pair<String,Float> nounAndProb = new Pair<String,Float>();
            nounAndProb.first = candidate.correctWord;
            nounAndProb.second = candidate.probability;
            _currentNounsAndProb.add(nounAndProb);
            if (firstVisit) {
                entry.second.second = true;
                Iterator<?> rules = candidate.rules.iterator();
                if (rules.hasNext()) {
                    rules.next(); // the first rule is not counted
                }
                while (rules.hasNext()) {
                    rules.next();
                    ++_learntRules;
                    _uStats.increaseLearntRules(1);
                    _tStats.increaseLearntRules(1);
                }
            }
        }
    }

    /**
     * Registers the user's preferred noun for the currently selected verb and
     * counts it as a learnt pair.
     *
     * @param pos index into the current candidate list
     * @return pair (chosen noun, infinitive of the current verb)
     * @throws IndexOutOfBoundsException if {@code pos} is not a valid index of
     *         the candidate list, or if there is no current verb
     */
    public Pair<String,String> favoriteNominalization(int pos) {
        Pair<String,String> chosen = new Pair<String,String>();
        chosen.first = _currentNounsAndProb.get(pos).first;
        chosen.second = _verbsInfinitivesPositionVisited.get(_currentNominalization).first.second;
        // Keep the local counter in step with the statistics objects.
        ++_learntPairs;
        _uStats.increaseLearntPairs(1);
        _tStats.increaseLearntPairs(1);
        return chosen;
    }

    /** Returns whether {@code c} is one of {@link #PUNCTUATION_MARKS}. */
    private boolean isPunctuationMark(char c) {
        return PUNCTUATION_MARKS.indexOf(c) != -1;
    }

    /** Returns {@code s} lower-cased character by character, with punctuation marks removed. */
    private String cleanString(String s) {
        StringBuilder cleaned = new StringBuilder(s.length());
        for (int i = 0; i < s.length(); ++i) {
            char c = s.charAt(i);
            if (!isPunctuationMark(c)) {
                cleaned.append(Character.toLowerCase(c));
            }
        }
        return cleaned.toString();
    }

    /**
     * Selects the next detected verb, wrapping from the last to the first, and
     * loads its candidates. Does nothing if nominalization has not been
     * started or no verb was detected.
     */
    public void nextNominalization() {
        if (!_startedNominalization || _totalVerbs <= 0) {
            return;
        }
        _currentNominalization =
                (_currentNominalization == _totalVerbs - 1) ? 0 : _currentNominalization + 1;
        nominalization();
    }

    /**
     * Selects the previous detected verb, wrapping from the first to the last,
     * and loads its candidates. Does nothing if nominalization has not been
     * started or no verb was detected.
     */
    public void previousNominalization() {
        if (!_startedNominalization || _totalVerbs <= 0) {
            return;
        }
        _currentNominalization =
                (_currentNominalization == 0) ? _totalVerbs - 1 : _currentNominalization - 1;
        nominalization();
    }

    /**
     * Selects the detected verb located at word position {@code posText} and
     * loads its candidates. Does nothing if nominalization has not been
     * started or no detected verb sits at that position.
     *
     * @param posText 1-based word position in the clean text
     */
    public void clickVerb(int posText) {
        if (!_startedNominalization) {
            return;
        }
        int index = _positions.indexOf(Integer.valueOf(posText));
        if (index >= 0) {
            _currentNominalization = index;
            nominalization();
        }
    }

    /**
     * Starts a nominalization session on the current text: detects the verbs,
     * publishes the word/verb counts and the most common verb to the
     * statistics objects, and loads the candidates for the first verb.
     *
     * <p>Does nothing if a session is already running. If no text has been
     * entered, only the learnt-rules and learnt-pairs text statistics are
     * reset.
     */
    public void startNominalization() {
        if (_startedNominalization) {
            return;
        }
        _tStats.setLearntRules(0);
        _tStats.setLearntPairs(0);
        if (_text == null) {
            return;
        }
        if (_cleanText == null) {
            // The text was supplied through enterTextOnly(), which does not
            // build the clean text; build it now instead of failing on null.
            _cleanText = cleanString(_text);
        }
        _startedNominalization = true;
        recogniseVerbs(_cleanText);
        _uStats.increaseTotalWords(_totalWords);
        _tStats.setTotalWords(_totalWords);
        _uStats.increaseMarkedWords(_totalVerbs);
        _tStats.setMarkedWords(_totalVerbs);
        _mostCommonVerb = findMostCommonVerb();
        _tStats.setMostCommonVerb(_mostCommonVerb);
        if (_totalWords > 0) {
            _uStats.increaseNominalizedTexts(1);
        }
        if (_totalVerbs > 0) {
            nominalization();
        }
    }

    /**
     * Returns the infinitive with the highest number of occurrences among the
     * detected verbs, or "-" when no verb was detected. On a tie the first
     * infinitive in the counting map's iteration order wins.
     */
    private String findMostCommonVerb() {
        HashMap<String,Integer> occurrences = new HashMap<String,Integer>();
        for (Pair<Pair<String,String>,Pair<Integer,Boolean>> entry : _verbsInfinitivesPositionVisited) {
            String infinitive = entry.first.second;
            Integer seen = occurrences.get(infinitive);
            int updated = (seen == null) ? 1 : seen.intValue() + 1;
            occurrences.put(infinitive, Integer.valueOf(updated));
        }
        String mostCommon = "-";
        int best = 0;
        for (String infinitive : occurrences.keySet()) {
            int count = occurrences.get(infinitive).intValue();
            if (count > best) {
                best = count;
                mostCommon = infinitive;
            }
        }
        return mostCommon;
    }

    /**
     * Replaces the current text, discarding any running nominalization session
     * and zeroing the per-text statistics.
     *
     * @param text the new raw text; {@code null} leaves the object without text
     */
    public void enterText(String text) {
        resetState();
        _text = text;
        _cleanText = (text == null) ? null : cleanString(text);
    }

    /** @return number of words counted by the last verb detection */
    public int getTotalWords() {
        return _totalWords;
    }

    /** @return number of verbs detected in the current text */
    public int getTotalVerbs() {
        return _totalVerbs;
    }

    /** @return the raw text, or {@code null} if none has been entered */
    public String getText() {
        return _text;
    }

    /** @return the lower-cased, punctuation-free text, or {@code null} if not built */
    public String getCleanText() {
        return _cleanText;
    }

    /**
     * @return the internal list of detected verbs (not a copy); each entry is
     *         ((word, infinitive), (word position, visited flag))
     */
    public ArrayList< Pair<Pair<String,String>,Pair<Integer,Boolean>> > getVerbsInfinitivesPositionVisited() {
        return _verbsInfinitivesPositionVisited;
    }

    /** @return the internal list (not a copy) of candidate nouns and probabilities for the current verb */
    public ArrayList< Pair<String,Float> > getCurrentNounsAndProbability() {
        return _currentNounsAndProb;
    }

    /**
     * @return word position of the currently selected verb
     * @throws IndexOutOfBoundsException if no verb has been detected
     */
    public int getCurrentNominalizationPosition() {
        return _positions.get(_currentNominalization);
    }

    /** @return whether a nominalization session is running for the current text */
    public Boolean getStartedNominalization() {
        return _startedNominalization;
    }

    /** @return number of rules counted as learnt for the current text */
    public int getLearntRules() {
        return _learntRules;
    }

    /** @return number of noun/verb pairs chosen through {@link #favoriteNominalization(int)} for the current text */
    public int getLearntPairs() {
        return _learntPairs;
    }

    /** @return the most frequent infinitive among the detected verbs, or "-" */
    public String getMostCommonVerb() {
        return _mostCommonVerb;
    }

    /** @return the internal list (not a copy) of word positions of the detected verbs */
    public ArrayList<Integer> getPositions() {
        return _positions;
    }

    /**
     * Replaces only the stored raw text. The clean text, the detected verbs
     * and every counter are left as they are.
     *
     * @param txt the new raw text
     */
    public void enterTextOnly(String txt) {
        _text = txt;
    }

    // TODO: should _verbDict and _nounDict get getters and setters?
}