@Override
public ArrayList<Feature> extractFeatures(CorpusHolder corpus) {
ArrayList<DataUnit> positiveExs = corpus.getPositiveExamples();
ArrayList<DataUnit> negativeExs = corpus.getNegativeExamples();
ArrayList<Feature> taggedStems = new ArrayList<Feature>();
TaggedStem temp;
StanfordParser theParser = new StanfordParser(Language.Arabic);
ParsingOptions opts = new ParsingOptions();
opts.setPosTag(true);
ArrayList<Word> textWords;
ParsedData parsedOut;
ArabicStemmerKhoja arabicStemmer = new ArabicStemmerKhoja();
for (DataUnit textUnit : positiveExs) {
textWords = StringToWordsTokenizer.tokenize(textUnit.getDataBody());
textUnit.getDataBody();
parsedOut = theParser.parse(textWords, opts);
for (int i = 0; i < parsedOut.getTaggedWords().size(); i++) {
temp = new TaggedStem();
temp.setStemTag(parsedOut.getTaggedWords().get(i).getWordTag());
temp.setTheStem(arabicStemmer.stem(textWords.get(i).word()));
if (!taggedStems.contains(temp)) {
taggedStems.add(temp);
}
}
}
for (DataUnit textUnit : negativeExs) {
textWords = StringToWordsTokenizer.tokenize(textUnit.getDataBody());
textUnit.getDataBody();
parsedOut = theParser.parse(textWords, opts);
for (int i = 0; i < parsedOut.getTaggedWords().size(); i++) {
temp = new TaggedStem();
temp.setStemTag(parsedOut.getTaggedWords().get(i).getWordTag());
temp.setTheStem(arabicStemmer.stem(textWords.get(i).word()));
if (!taggedStems.contains(temp)) {
taggedStems.add(temp);
}
}