/*
Copyright 2012 Christian Prause and Fraunhofer FIT
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package net.sf.collabreview.hibernate;
import net.sf.collabreview.core.Artifact;
import net.sf.collabreview.core.ArtifactIdentifier;
import net.sf.collabreview.repository.BasicRepository;
import net.sf.collabreview.repository.TokenIndex;
import net.sf.collabreview.transform.tokened.IndexedTokenCore;
import net.sf.collabreview.transform.tokened.Token;
import net.sf.collabreview.transform.tokened.Tokenizer;
import net.sf.collabreview.transform.tokened.TrivialTokenFilter;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.hibernate.Query;
import prause.toolbox.hibernate.HibernateUtil;
import prause.toolbox.other.ReferenceMap;
import java.sql.Connection;
import java.sql.Statement;
import java.util.*;
/**
* A TokenIndex that stores its Token cores in the database.
* <p/>
* The actual data for the HibernateTokenIndex is stored in IndexEntry objects that are made persistent with Hibernate.
*
* @author Christian Prause (chris)
* @date 17.02.2009 10:53:33
* @see net.sf.collabreview.hibernate.IndexEntry
*/
public class HibernateTokenIndex implements TokenIndex {
    private static final Log logger = LogFactory.getLog(HibernateTokenIndex.class);

    /**
     * A token core is marked trivial as soon as its origins cover more than this many differently named
     * artifacts that are not obsolete.
     *
     * @see HibernateTokenIndex#testNonTriviality(IndexEntry,net.sf.collabreview.transform.tokened.Token)
     */
    private static final int MAX_ACTIVE_ARTIFACTS_FOR_NON_TRIVIAL_TOKEN = 3;

    /**
     * How many artifacts have been indexed?
     * The value is used when calculating certainties for token cores.
     * <p/>
     * It is initialized to -1 to indicate that currently it is not known how many artifacts have been indexed,
     * because the database might already contain some from a previous run of the program, but this (volatile) value
     * has not been recovered yet.
     */
    private long artifactCount = -1;

    /**
     * The BasicRepository that owns this TokenIndex.
     */
    private final BasicRepository owningRepository;

    /**
     * This filter is used in addition to the filter that this TokenIndex implements.
     * The external filter is injected by the BasicRepository upon creation.
     * Until a filter is injected, a default filter is used that considers no token trivial.
     * <p/>
     * The filter is also used by the testNonTriviality() validation method.
     *
     * @see net.sf.collabreview.hibernate.HibernateTokenIndex#setExternalFilter(net.sf.collabreview.transform.tokened.TrivialTokenFilter)
     * @see HibernateTokenIndex#testNonTriviality(IndexEntry,net.sf.collabreview.transform.tokened.Token)
     */
    private TrivialTokenFilter externalFilter = new TrivialTokenFilter() {
        @Override
        public boolean isTrivialToken(Token token) {
            return false;
        }
    };

    /**
     * Remember the triviality value of a token core to speed triviality lookups.
     *
     * @see net.sf.collabreview.hibernate.HibernateTokenIndex#isTrivialToken(net.sf.collabreview.transform.tokened.Token)
     */
    private final ReferenceMap<String, Boolean> tokenTrivialityCache = new ReferenceMap<String, Boolean>(ReferenceMap.SOFT, ReferenceMap.HARD);

    /**
     * Create a token index for the given repository.
     *
     * @param owningRepository the repository that supplies the tokenizer factory and the artifacts
     */
    public HibernateTokenIndex(BasicRepository owningRepository) {
        this.owningRepository = owningRepository;
    }

    @Override
    public void setExternalFilter(TrivialTokenFilter externalFilter) {
        this.externalFilter = externalFilter;
    }

    /**
     * Tokenize the content of the artifact and register every token in the index.
     * <p/>
     * For a non-trivial token the artifact is recorded as an origin of the token's entry and the entry is
     * re-validated for triviality. For a trivial token the entry is only flagged trivial (if it is not already).
     * In both cases the occurrence count of the entry is incremented. Finally the artifact counter is incremented,
     * but only if it has already been determined (i.e. it is not -1).
     *
     * @param artifact the artifact whose tokens are added to the index
     */
    public void addTokensFromArtifact(Artifact artifact) {
        Tokenizer tokenizer = owningRepository.getTokenizerFactory().create(artifact.getContent());
        List<Token> tokens = tokenizer.listTokens();
        for (Token token : tokens) {
            IndexEntry entry = getIndexEntryForToken(token, true);
            if (!isTrivialToken(token)) {
                entry.addOrigin(artifact.getId());
                testNonTriviality(entry, token);
            } else if (!entry.isTrivial()) {
                // the entry is identified as trivial but it has not yet been marked as such. Make sure it is marked!
                entry.setTrivial(true);
            }
            entry.incrementOccurrenceCount();
        }
        if (artifactCount != -1) {
            artifactCount++;
        }
    }

    /**
     * Try to find out if this entry is trivial. (This is not a unit test!)
     * If it is then set its trivial flag to true; otherwise do nothing.
     * <p/>
     * An entry is trivial if the external filter says so, or if its origins cover too many differently named,
     * non-obsolete artifacts. When the entry becomes trivial the cached triviality value of the token core is
     * dropped so that the next lookup is recomputed.
     * <p/>
     * As this method is a utility method of addTokensFromArtifact() it is only called when the entry is not yet
     * marked trivial.
     *
     * @param entry the non-trivial entry to validate
     * @param token the token to which the entry belongs
     * @see net.sf.collabreview.hibernate.HibernateTokenIndex#addTokensFromArtifact(net.sf.collabreview.core.Artifact)
     */
    private void testNonTriviality(IndexEntry entry, Token token) {
        assert !entry.isTrivial();
        if (externalFilter.isTrivialToken(token) || occursInTooManyActiveArtifacts(entry)) {
            entry.setTrivial(true);
            tokenTrivialityCache.remove(token.getCore());
        }
    }

    /**
     * Check whether the origins of the entry cover more than
     * MAX_ACTIVE_ARTIFACTS_FOR_NON_TRIVIAL_TOKEN differently named artifacts that have no obsolete date.
     *
     * @param entry the entry whose origins are inspected
     * @return true if the threshold is exceeded
     */
    private boolean occursInTooManyActiveArtifacts(IndexEntry entry) {
        Set<String> differentActiveArtifacts = new HashSet<String>();
        for (ArtifactIdentifier ai : entry.getOrigins()) {
            if (owningRepository.getArtifact(ai).getObsoleteDate() == null) {
                differentActiveArtifacts.add(ai.getName());
                // stop as soon as the answer is known; no need to resolve the remaining origins
                if (differentActiveArtifacts.size() > MAX_ACTIVE_ARTIFACTS_FOR_NON_TRIVIAL_TOKEN) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Is the token trivial? A token is trivial if the external filter says so or if its index entry is flagged
     * trivial. The result is cached per token core.
     *
     * @param token the token to check, must not be null
     * @return true if the token is considered trivial
     */
    public boolean isTrivialToken(Token token) {
        assert token != null;
        String core = token.getCore();
        Boolean cachedResult = tokenTrivialityCache.get(core);
        if (cachedResult != null) {
            return cachedResult;
        }
        boolean trivial = externalFilter.isTrivialToken(token) || getIndexEntryForToken(token, false).isTrivial();
        tokenTrivialityCache.put(core, trivial);
        return trivial;
    }

    /**
     * Get the certainty of the token, computed from the occurrence count of its index entry.
     *
     * @param token the token to look up
     * @return the certainty as computed by getCertaintyForCount(int)
     */
    public double getCertainty(Token token) {
        return getCertaintyForCount(getIndexEntryForToken(token, false).getOccurrenceCount());
    }

    /**
     * Compute the certainty for an occurrence count as 2^(1 - occCount / artifactCount) / 2.
     * An occurrence count of 0 yields a certainty of 0.
     *
     * @param occCount how often the token core occurred
     * @return the certainty value
     */
    private double getCertaintyForCount(int occCount) {
        if (occCount == 0) {
            return 0;
        }
        double artCount = getArtifactCount();
        double exponent = 1 - occCount / artCount; // some number between -INF and 1
        return Math.pow(2, exponent) / 2; // 2^exponent is between 0 and 2, so the result is between 0 and 1
    }

    /**
     * Get the artifacts that are recorded as origins of the token's index entry.
     *
     * @param token the token to look up
     * @return the origins of the token's index entry
     */
    public Collection<ArtifactIdentifier> getPossibleOccurrences(Token token) {
        return getIndexEntryForToken(token, false).getOrigins();
    }

    /**
     * Look up the index entry for a token.
     *
     * @param token     the token to look up
     * @param saveIfNew passed through to IndexEntry.get()
     * @return the entry as returned by IndexEntry.get()
     */
    private IndexEntry getIndexEntryForToken(Token token, boolean saveIfNew) {
        return IndexEntry.get(token, saveIfNew);
    }

    /**
     * Look up the index entries for a list of tokens.
     *
     * @param tokens the tokens to look up
     * @return the entries as returned by IndexEntry.getList()
     */
    private Collection<IndexEntry> getIndexEntriesForTokens(List<Token> tokens) {
        return IndexEntry.getList(tokens);
    }

    /**
     * Get the number of artifacts. If the value is not yet known (negative) it is determined once by counting
     * the ArtifactIdentifier objects in the database and then kept in memory.
     *
     * @return the artifact count
     */
    private long getArtifactCount() {
        if (artifactCount >= 0) {
            return artifactCount;
        }
        Query q = HibernateUtil.getCurrentSession().createQuery("SELECT count(pk) FROM ArtifactIdentifier");
        // go through Number: the concrete numeric type returned for count() differs between Hibernate versions
        artifactCount = ((Number) q.uniqueResult()).longValue();
        logger.info("Artifact count determined from database: " + artifactCount);
        return artifactCount;
    }

    /**
     * Remove all index data from the database and drop the triviality cache.
     * Failures are logged but not propagated.
     * <p/>
     * NOTE(review): artifactCount is left untouched here; presumably that is intended because the
     * ArtifactIdentifier table is not cleared - confirm.
     */
    public void clear() {
        tokenTrivialityCache.clear();
        try {
            Connection connection = HibernateUtil.getCurrentSession().connection();
            try {
                Statement statement = connection.createStatement();
                try {
                    statement.executeUpdate("DELETE FROM Line2ArtifactIdentifier");
                    statement.executeUpdate("DELETE FROM IndexEntry");
                    connection.commit();
                } finally {
                    statement.close();
                }
            } finally {
                // close the session even if creating or closing the statement failed
                HibernateUtil.closeSession();
            }
        } catch (Exception e) {
            logger.error("Failed to clear the database: ", e);
        }
    }

    /**
     * Tokenize the content and list the token cores with the highest certainty, most certain first.
     *
     * @param artifactContent the content to tokenize
     * @param maximum         the maximum number of tokens to return; for values of 0 or less the result is empty
     * @return at most maximum tokens, ordered by descending certainty
     */
    public List<Token> listMostCertainTokens(String artifactContent, int maximum) {
        List<Token> result = new ArrayList<Token>();
        if (maximum <= 0) {
            return result;
        }
        // get certainties
        List<Token> tokens = owningRepository.getTokenizerFactory().create(artifactContent).listTokens();
        List<ResultPair> ranked = new ArrayList<ResultPair>();
        for (IndexEntry ie : getIndexEntriesForTokens(tokens)) {
            ranked.add(new ResultPair(ie));
        }
        // sort
        Collections.sort(ranked);
        for (ResultPair pair : ranked) {
            if (result.size() >= maximum) {
                break;
            }
            result.add(new IndexedTokenCore(pair.core));
        }
        return result;
    }

    /**
     * Snapshot of an index entry together with its certainty.
     * The natural order sorts higher certainties first.
     */
    private class ResultPair implements Comparable<ResultPair> {
        final String core;
        final int occurrenceCount;
        final double certainty;

        private ResultPair(IndexEntry ie) {
            this.core = ie.getCore();
            this.occurrenceCount = ie.getOccurrenceCount();
            this.certainty = getCertaintyForCount(occurrenceCount);
        }

        @Override
        public int compareTo(ResultPair other) {
            // arguments are swapped on purpose: descending order of certainty
            return Double.compare(other.certainty, certainty);
        }
    }
}