package rouge;
import interfaces.IRouge;
import interfaces.IRougeSummaryModel;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import org.apache.commons.lang.mutable.MutableDouble;
import org.apache.commons.lang.mutable.MutableInt;
import common.ScoreType;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
/* experimental - do not use */
public class JRougeN implements IRouge
{
/*
* [1-9](?:\d{0,2}) #A sequence of 1-3 numerals not starting with 0
* (?:,\d{3})* #Any number of three-digit groups, each preceded by a comma
* (?:\.\d*[1-9])? #Optionally, a decimal point followed by any number of digits not ending in 0
* | #OR...
* 0?\.\d*[1-9] #Only the decimal portion, optionally preceded by a 0
* | #OR...
* 0 #Zero.
* source: http://stackoverflow.com/questions/5917082/regular-expression-to-match-numbers-with-or-without-commas-and-decimals-in-text
*/
public static final String numberRegex = "[1-9](?:\\d{0,2})(?:,\\d{3})*(?:\\.\\d*[1-9])?|0?\\.\\d*[1-9]|0";
/*
* \p{L} #Match any language letter in unicode
* \p{N} #Match any numeral in unicode
*/
public static final String legalTokenRegex = "[\\p{L}\\p{N}]+";
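// Illustrative matches (not exhaustive): numberRegex fully matches "7", "1,234" and "3.14"
// but not "012" or "3.10"; legalTokenRegex matches any run of Unicode letters and digits,
// so tokens such as "hello", "café" or "012" pass, while pure punctuation like "--" matches neither.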
private IRougeSummaryModel peer;
private Set<IRougeSummaryModel> models;
private int byteLimit;
private int wordLimit;
private int n;
private char scoreMode;
private double alpha;
public static boolean DEBUG = false;
private static StanfordCoreNLP stanford;
static
{
Properties props = new Properties();
props.put("annotators", "tokenize ,ssplit, pos, lemma");
stanford = new StanfordCoreNLP(props);
}
/**
* Constructor for the JRougeN metric
*
* @param peer
* The test document
* @param models
* The gold standard documents
* @param byteLimit
* The maximum number of bytes of each summary to evaluate
* @param wordLimit
* The maximum number of words of each summary to evaluate
* @param n
* The gram size (the n in n-gram)
* @param scoreMode
* The scoring mode: 'A' for average, 'B' for best
* @param alpha
* The alpha weight for the F-score (1 favors precision, 0 favors recall)
*/
public JRougeN(IRougeSummaryModel peer, Set<IRougeSummaryModel> models, int byteLimit, int wordLimit, int n, char scoreMode, double alpha)
{
this.peer = peer;
this.models = models;
this.byteLimit = byteLimit;
this.wordLimit = wordLimit;
this.scoreMode = scoreMode;
this.n = n;
this.alpha = alpha;
}
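// A minimal usage sketch (hypothetical - assumes a concrete IRougeSummaryModel
// implementation, here called PlainTextSummary, exists elsewhere in the project):
//
// IRougeSummaryModel peer = new PlainTextSummary("peer.txt");
// Set<IRougeSummaryModel> models = Collections.singleton(new PlainTextSummary("model.txt"));
// IRouge rouge2 = new JRougeN(peer, models, Integer.MAX_VALUE, Integer.MAX_VALUE, 2, 'A', 0.5);
// Map<ScoreType, Double> scores = rouge2.evaluate(); // ROUGE-2 recall under ScoreType.R, etc.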
/**
* Creates a map of n-grams and the number of times each appears in the given summary,
* honoring the configured word and byte limits.
* <p>
* The special key "_cn_" holds the total number of n-grams counted.
*
* @param summary
* The summary from which to construct n-grams
* @param n
* The gram size
* @return A map of ngram->count
*/
public Map<String, Integer> createNGram(IRougeSummaryModel summary, int n)
{
Map<String, Integer> results = new HashMap<String, Integer>();
int wordCount = 0; // total word count for all sentences
int byteCount = 0; // total byte count for all sentences
// seed the "_cn_" (total n-gram count) marker so it is present even if no n-grams are produced
results.put("_cn_", 0);
int count = 0;
Annotation annotation = new Annotation(summary.asText());
stanford.annotate(annotation);
for (CoreMap sentenceMap : annotation.get(SentencesAnnotation.class))
{
List<String> usedTokens = new LinkedList<String>();
// check which tokens to use
for (CoreLabel token : sentenceMap.get(TokensAnnotation.class))
{
// check if we didn't pass the limit
if (wordCount > wordLimit || byteCount > byteLimit) break;
// check if a valid token
if (!token.toString().matches(legalTokenRegex + "|" + numberRegex)) continue;
wordCount++;
byteCount += stringByteSize(token.toString());
usedTokens.add(token.toString()/*getString(LemmaAnnotation.class)*/);
} // end of token loop
// build n-grams for sentence
for (int i = 0; i < usedTokens.size() - n + 1; i++)
{
String gram = usedTokens.get(i);
for (int j = i + 1; j < i + n; j++)
{
gram += " " + usedTokens.get(j);
} // build the n-gram
count++; // total count
// if the n-gram already exists increment its count, otherwise add it with count 1
if (results.containsKey(gram))
{
results.put(gram, results.get(gram) + 1);
}
else
{
results.put(gram, 1);
}
} // end of n-gram creation for this sentence
// check that we didn't break from the previous loop because the limit was reached
if (wordCount > wordLimit || byteCount > byteLimit) break;
} // sentence n-gram generation
results.put("_cn_", count);
return results;
}
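// Worked example (illustrative, assuming the limits are large enough not to trigger):
// for the single sentence "the cat sat on the mat" and n = 2, the returned map is
// {"the cat"=1, "cat sat"=1, "sat on"=1, "on the"=1, "the mat"=1, "_cn_"=5}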
/**
* Calculates the size in bytes that a string occupies
*
* @param s
* The string to measure
* @return The number of bytes in the string's UTF-8 encoding
*/
private int stringByteSize(String s)
{
// the byte count is the length of the encoded byte array
return s.getBytes(StandardCharsets.UTF_8).length;
}
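// Example: stringByteSize("abc") == 3, while stringByteSize("café") == 5,
// since 'é' occupies two bytes in UTF-8.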
/**
* Calculates the score of the n-grams in the test text against the reference text.
* <p>
* The score is the ratio between the hit count - the number of n-grams common to both the test text<br>
* and the reference text - and the total number of n-grams in the reference text
*
* @param model_grams
* The n-grams of the reference text
* @param peer_grams
* The n-grams of the test text
* @param hit
* Out parameter: the number of hits between the two maps
* @param score
* Out parameter: the number of hits divided by the total number of n-grams in the model
*/
public void ngramScore(Map<String, Integer> model_grams, Map<String, Integer> peer_grams, MutableInt hit, MutableDouble score)
{
hit.setValue(0);
Set<String> tokens = model_grams.keySet();
for (String t : tokens)
{
if (!t.equals("_cn_"))
{
if (peer_grams.containsKey(t))
{
// clipped count: a match counts at most as many times as the n-gram appears in the model
int h = Math.min(peer_grams.get(t), model_grams.get(t));
hit.setValue(hit.intValue() + h);
}
}
}
if (model_grams.get("_cn_")
.intValue() != 0)
{
score.setValue((double) (hit.intValue()) / model_grams.get("_cn_")
.doubleValue());
}
else
{
score.setValue(0);
}
}
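// Worked example (illustrative): given model bigrams {"a b"=2, "b c"=1, "_cn_"=3}
// and peer bigrams {"a b"=1, "b c"=3, "_cn_"=4}, the clipped hits are
// min(1,2) + min(3,1) = 2, and the resulting score (recall) is 2 / 3.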
/**
* Computes the n-gram score of the texts with which the object was initialized
*
* @return A map containing the scores.<br>
* The map is keyed by ScoreType: TOTAL_GRAM_COUNT, TOTAL_GRAM_HIT, R (recall),
* TOTAL_GRAM_HIT_P (peer gram count), P (precision) and F (f-measure)
*/
public Map<ScoreType, Double> computeNGramScore()
{
Map<ScoreType, Double> results = new HashMap<ScoreType, Double>();
MutableInt gramHit = new MutableInt(0);
MutableDouble gramScore = new MutableDouble(0);
// read the model summaries and create the model n-gram maps
int totalGramHit = 0;
int totalGramCount = 0;
double gramScoreBest = -1;
double gramScoreP = 0; // precision
double gramScoreF = 0; // f-measure
int totalGramCountP = 0;
Map<String, Integer> peer_grams = createNGram(peer, this.n);
if (DEBUG)
{
System.out.println(peer.getSourceFile());
System.out.println(peer.asText());
int i = 0;
System.out.print("[");
for (String key : peer_grams.keySet())
{
System.out.print(key + ":" + peer_grams.get(key).intValue());
// separator between entries, skipped after the last one
if (i != peer_grams.size() - 1)
{
System.out.print("|");
}
i++;
}
System.out.println("]");
}
for (IRougeSummaryModel model : models)
{
Map<String, Integer> model_grams = createNGram(model, this.n);
if (DEBUG)
{
System.out.println(model.getSourceFile());
System.out.println(model.asText());
int i = 0;
System.out.print("[");
for (String key : model_grams.keySet())
{
System.out.print(key + ":" + model_grams.get(key).intValue());
// separator between entries, skipped after the last one
if (i != model_grams.size() - 1)
{
System.out.print("|");
}
i++;
}
System.out.println("]");
}
ngramScore(model_grams, peer_grams, gramHit, gramScore);
switch (scoreMode)
{
case 'A':
case 'a':
{
totalGramHit += gramHit.intValue();
totalGramCount += model_grams.get("_cn_");
totalGramCountP += peer_grams.get("_cn_");
break;
}
case 'B':
case 'b':
{
if (gramScore.doubleValue() > gramScoreBest)
{
// only take a better score (i.e. a better match)
gramScoreBest = gramScore.doubleValue();
totalGramHit = gramHit.intValue();
totalGramCount = model_grams.get("_cn_");
totalGramCountP = peer_grams.get("_cn_");
}
break;
}
default:
{
System.out.println("Warning: Unknown scoring mode - using average mode");
totalGramHit += gramHit.intValue();
totalGramCount += model_grams.get("_cn_");
totalGramCountP += peer_grams.get("_cn_");
}
}
}
results.put(ScoreType.TOTAL_GRAM_COUNT, (double) totalGramCount); // total number of ngrams in models
results.put(ScoreType.TOTAL_GRAM_HIT, (double) totalGramHit);
if (totalGramCount != 0)
{
gramScore.setValue((double) totalGramHit / (double) totalGramCount);
}
else
{
gramScore.setValue(0);
}
results.put(ScoreType.R, gramScore.doubleValue());
results.put(ScoreType.TOTAL_GRAM_HIT_P, (double) totalGramCountP); // total number of ngrams in peers
if (totalGramCountP != 0)
{
gramScoreP = (double) totalGramHit / (double) totalGramCountP;
}
else
{
gramScoreP = 0;
}
results.put(ScoreType.P, gramScoreP); // precision score
// F = (P * R) / ((1 - alpha) * P + alpha * R); guard against a zero denominator
if (((1 - alpha) * gramScoreP + alpha * gramScore.doubleValue()) > 0)
{
gramScoreF = (gramScoreP * gramScore.doubleValue()) / ((1 - alpha) * gramScoreP + alpha * gramScore.doubleValue());
}
else
{
gramScoreF = 0;
}
results.put(ScoreType.F, gramScoreF);
if (DEBUG)
{
System.out.println("total " + n + "-gram model count: " + totalGramCount);
System.out.println("total " + n + "-gram peer count: " + totalGramCountP);
System.out.println("total " + n + "-gram hit: " + totalGramHit);
System.out.println("total ROUGE-" + n + "-R: " + gramScore);
System.out.println("total ROUGE-" + n + "-P: " + gramScoreP);
System.out.println("total ROUGE-" + n + "-F: " + gramScoreF);
}
return results;
}
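// Note on the F-measure above: F = (P * R) / ((1 - alpha) * P + alpha * R), the weighted
// form used by the original ROUGE Perl script. With alpha = 0.5 this is the harmonic mean
// of precision and recall; e.g. P = 0.5 and R = 0.25 give F = 0.125 / 0.375 = 1/3.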
/**
* {@inheritDoc}
*/
@Override
public Map<ScoreType, Double> evaluate()
{
return computeNGramScore();
}
}