/*
Copyright 2012 Christian Prause and Fraunhofer FIT
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package net.sf.collabreview.hibernate;
import net.sf.collabreview.core.ArtifactIdentifier;
import net.sf.collabreview.transform.tokened.Token;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.hibernate.Criteria;
import org.hibernate.criterion.Expression;
import prause.toolbox.hibernate.HibernateUtil;
import java.util.*;
/**
* An entry in the token index that stores the possible origins of a line (or token core).
* IndexEntries are used by the HibernateTokenIndex.
* <p/>
* When artifacts are inserted into the CollabReview database that are tokenized, the tokens are stored in this index.
* Now, if a new artifact is inserted and this artifact is transformed from another artifact by inserting tokens,
* every inserted token is checked against this index.
* The intention is to find possible origins where the token might have been copied from.
* <p/>
* Though the line hashes might not be unique, the "line" attribute makes sure that no two different lines are
* mapped to the same IndexEntry.
* In case non-unique hashes collide, the hash value is rehashed and the new value's bucket is used.
* The entry (under a "wrong" hash value) remembers its predecessor.
* A linked list is built in this way so that all entries of a bucket can be analysed until the correct one is found.
*
* @author Christian Prause (chris)
* @date 17.02.2009 10:55:25
* @see net.sf.collabreview.transform.tokened.Tokenizer
* @see net.sf.collabreview.hibernate.HibernateTokenIndex
*/
public class IndexEntry {
    private static final Log logger = LogFactory.getLog(IndexEntry.class);

    /**
     * The line (token core) that this IndexEntry stores possible origins for.
     * Different lines can have the same hash value; the stored line is compared against the requested one
     * to detect such a collision so that the line can be re-hashed and a different entry tried.
     */
    private String core;

    /**
     * The hash of the line that this IndexEntry stores possible origins for.
     * <p/>
     * The hash also serves as database primary key for this IndexEntry.
     * This is on purpose because database (Hibernate) accesses will be most efficient if the primary key is used.
     * (I tried it. When not using the primary it gets terribly slow, because caching can only be used with pk or natural-id.)
     */
    private long coreHash;

    /**
     * All occurrences of this line in various Artifacts.
     */
    private Set<ArtifactIdentifier> origins;

    /**
     * How often the line has occurred in different Artifacts.
     * This also counts multiple occurrences of the same line in the same artifact.
     */
    private int occurrenceCount = 0;

    /**
     * Marks this token as trivial.
     * Once a token is marked as trivial its origins are no longer tracked to save memory in the database.
     * Otherwise it will become very big, very quickly.
     */
    private boolean isTrivial = false;

    /**
     * When the hash value for different entries is the same then all entries in the same "bucket" are stored as a
     * linked list. This link points from an entry to its predecessor (the preceding entry) in the bucket.
     */
    private IndexEntry previousInBucket;

    /**
     * Protected simple constructor for hibernate.
     */
    protected IndexEntry() {
    }

    /**
     * Create a new IndexEntry for the specified token core.
     * The core is trimmed and the entry gets the core's initial (not rehashed) hash value.
     *
     * @param coreString the token core string to be associated with this IndexEntry
     * @throws NullPointerException if coreString is null
     */
    public IndexEntry(String coreString) {
        if (coreString == null) {
            throw new NullPointerException("Core string cannot be null");
        }
        this.core = coreString.trim();
        this.coreHash = hash(core);
        this.origins = new HashSet<ArtifactIdentifier>();
    }

    /**
     * Create a new IndexEntry for the specified token core but use a certain hash value (useful when the value
     * was rehashed because the original hash value is already in use).
     * <p/>
     * Unlike {@link #IndexEntry(String)} this constructor stores the core exactly as given: it is neither trimmed
     * nor checked for null.
     *
     * @param core the token's core string
     * @param hash the token's hash value (possibly rehashed)
     */
    public IndexEntry(String core, long hash) {
        this.core = core;
        this.coreHash = hash;
        this.origins = new HashSet<ArtifactIdentifier>();
    }

    /**
     * Get the source line that this IndexEntry represents.
     *
     * @return the line (token core) stored in this entry
     */
    public String getCore() {
        return core;
    }

    /**
     * Set the line that this IndexEntry represents.
     * Only a setter for Hibernate.
     *
     * @param core the new line string
     */
    protected void setCore(String core) {
        this.core = core;
    }

    /**
     * Get the line's hash value.
     * This is also the primary key of the line.
     *
     * @return the line's hash value
     */
    protected long getCoreHash() {
        return coreHash;
    }

    /**
     * Set the hash value of this line.
     * Only a setter for Hibernate.
     * You should not change this value as it is the primary key.
     *
     * @param coreHash the line's hash value
     */
    protected void setCoreHash(long coreHash) {
        this.coreHash = coreHash;
    }

    /**
     * Get the occurrence count of this line.
     * The occurrence count is at least the number of different artifacts that the line has appeared in, but may
     * be more if the line has more than one occurrence in the artifacts.
     *
     * @return the number of occurrences of this line in artifacts
     */
    public int getOccurrenceCount() {
        return occurrenceCount;
    }

    /**
     * Set the occurrence count of a line to a new value.
     *
     * @param occurrenceCount the new occurrence count of the line
     */
    protected void setOccurrenceCount(int occurrenceCount) {
        this.occurrenceCount = occurrenceCount;
    }

    /**
     * Increments the occurrence count of a line by one.
     */
    public void incrementOccurrenceCount() {
        setOccurrenceCount(getOccurrenceCount() + 1);
    }

    /**
     * Returns the origins where this entry's line appears.
     * The returned set is the entry's own set, not a copy.
     *
     * @return where the line is used
     */
    public Set<ArtifactIdentifier> getOrigins() {
        return origins;
    }

    /**
     * Set the origins of this line, i.e. the artifacts that this line appears in.
     *
     * @param origins a set of origins that replaces the current origins set.
     */
    protected void setOrigins(Set<ArtifactIdentifier> origins) {
        this.origins = origins;
    }

    /**
     * Add a new origin to this entry's origin list.
     * For a trivial entry the origin is not stored; a warning is logged instead.
     *
     * @param newOrigin the new origin to add to this entry
     */
    public void addOrigin(ArtifactIdentifier newOrigin) {
        if (!isTrivial()) {
            getOrigins().add(newOrigin);
        } else {
            logger.warn("Trying to add an origin to a trivial entry. LineHash=" + getCoreHash());
        }
    }

    /**
     * A trivial entry does not have its origins remembered.
     *
     * @return true iff this entry is marked as trivial
     */
    public boolean isTrivial() {
        return isTrivial;
    }

    /**
     * Set an entry to trivial (or not trivial).
     * A trivial entry does not have its origins remembered.
     * <p/>
     * Once an entry is set to trivial its origins are cleared and are hence lost even if the token is switched back
     * to non-trivial later.
     *
     * @param trivial true iff this entry should be marked as trivial
     */
    public void setTrivial(boolean trivial) {
        if (trivial && getOrigins() != null) {
            // only build the message when it is actually going to be logged
            if (logger.isDebugEnabled()) {
                logger.debug("Now trivial (" + getCoreHash() + "): " + getCore());
            }
            getOrigins().clear();
        }
        isTrivial = trivial;
    }

    /**
     * @return the entry that precedes the current IndexEntry in the bucket
     * @see net.sf.collabreview.hibernate.IndexEntry#previousInBucket
     */
    public IndexEntry getPreviousInBucket() {
        return previousInBucket;
    }

    /**
     * Set a new predecessor for the IndexEntry.
     *
     * @param previousInBucket the IndexEntry that should precede this IndexEntry
     * @see net.sf.collabreview.hibernate.IndexEntry#previousInBucket
     */
    public void setPreviousInBucket(IndexEntry previousInBucket) {
        this.previousInBucket = previousInBucket;
    }

    /**
     * Load the IndexEntry for a given token.
     * For an IndexEntry that could not be loaded from the database a new IndexEntry is created.
     * Depending on the value of saveIfNew this new IndexEntry is immediately added to the database or not.
     * <p/>
     * The lookup starts at the entry stored under the core's initial hash and then follows the bucket's linked
     * list (entries whose predecessor is the current entry) until an entry with an equal core is found or the
     * list ends. A newly created entry is linked to the last entry visited, if any.
     *
     * @param token     for which Token to load the IndexEntry for
     * @param saveIfNew save the IndexEntry if it was created because it was not in the database? (true=save)
     * @return an IndexEntry for the token
     */
    protected static IndexEntry get(Token token, boolean saveIfNew) {
        String core = token.getCore();
        long hash = hash(core);
        IndexEntry previous = null;
        IndexEntry entry = (IndexEntry) HibernateUtil.getCurrentSession().get(IndexEntry.class, hash);
        while (entry != null && !entry.getCore().equals(core)) {
            // collision: this slot belongs to a different core, so move on to its successor in the bucket
            previous = entry;
            Criteria fetch = HibernateUtil.getCurrentSession().createCriteria(IndexEntry.class)
                    .add(Expression.eq("previousInBucket.coreHash", entry.getCoreHash()));
            entry = (IndexEntry) fetch.uniqueResult();
        }
        if (entry == null) {
            entry = new IndexEntry(core, freeHashSlot(core));
            if (previous != null) {
                entry.setPreviousInBucket(previous);
            }
            if (saveIfNew) {
                HibernateUtil.save(entry);
            }
        }
        return entry;
    }

    /**
     * Load the IndexEntries for several tokens.
     * IndexEntries that are not in the database are not added automatically.
     * <p/>
     * The requested cores are grouped by their initial hash value ("bucket"). A bucket's linked list is followed
     * for as long as at least one core requested for that bucket has not been found. Consequently, several
     * requested cores that share the same hash value are all found, not only the first one encountered.
     *
     * @param tokens a list of tokens for which the IndexEntries should be loaded
     * @return the index entries that could be loaded from the database
     */
    protected static Collection<IndexEntry> getList(List<Token> tokens) {
        if (tokens.isEmpty()) {
            return new ArrayList<IndexEntry>();
        }
        Set<IndexEntry> results = new HashSet<IndexEntry>();
        // group the requested cores by their initial hash: bucket hash -> cores not yet found
        Map<Long, Set<String>> unresolved = new HashMap<Long, Set<String>>();
        for (Token token : tokens) {
            String core = token.getCore();
            Long bucket = hash(core);
            Set<String> cores = unresolved.get(bucket);
            if (cores == null) {
                cores = new HashSet<String>();
                unresolved.put(bucket, cores);
            }
            cores.add(core);
        }
        // do the initial fetch that should satisfy all requests where the entry is the first in the bucket
        // frontier: hash of an entry whose successor must still be loaded -> bucket the walk started from
        Map<Long, Long> frontier = new HashMap<Long, Long>();
        for (IndexEntry fetched : fetchByHashes("coreHash", new ArrayList<Long>(unresolved.keySet()))) {
            Long bucket = fetched.getCoreHash();
            classify(fetched, bucket, unresolved, results, frontier);
        }
        // refetch until all entries are loaded or are definitely not in the database
        while (!frontier.isEmpty()) {
            List<IndexEntry> successors =
                    fetchByHashes("previousInBucket.coreHash", new ArrayList<Long>(frontier.keySet()));
            Map<Long, Long> nextFrontier = new HashMap<Long, Long>();
            for (IndexEntry fetched : successors) {
                // a successor belongs to the same walk (bucket) as the predecessor it was reached through
                Long bucket = frontier.get(fetched.getPreviousInBucket().getCoreHash());
                classify(fetched, bucket, unresolved, results, nextFrontier);
            }
            frontier = nextFrontier;
        }
        return results;
    }

    /**
     * Load all IndexEntries for which the given property has one of the given hash values.
     *
     * @param property the (possibly nested) property name to restrict on
     * @param hashes   the accepted values for the property
     * @return the matching entries as returned by the Hibernate criteria query
     */
    @SuppressWarnings("unchecked") // Criteria.list() is untyped; the criteria is created for IndexEntry.class
    private static List<IndexEntry> fetchByHashes(String property, Collection<Long> hashes) {
        Criteria criteria = HibernateUtil.getCurrentSession().createCriteria(IndexEntry.class)
                .add(Expression.in(property, hashes));
        return criteria.list();
    }

    /**
     * Process one entry fetched while walking a bucket's linked list: collect it if its core was requested and
     * schedule the walk to continue behind it if the bucket still has requested cores that were not found.
     *
     * @param fetched      the entry that was loaded
     * @param bucket       the initial hash value at which the walk that reached this entry started
     * @param unresolved   requested cores that were not found yet, grouped by their initial hash value
     * @param results      receives the entry if its core is a requested one
     * @param nextFrontier receives the entry's hash (mapped to the bucket) if its successor has to be loaded
     */
    private static void classify(IndexEntry fetched, Long bucket, Map<Long, Set<String>> unresolved,
                                 Set<IndexEntry> results, Map<Long, Long> nextFrontier) {
        String core = fetched.getCore();
        if (core != null) {
            Set<String> requestedWithSameHash = unresolved.get(hash(core));
            if (requestedWithSameHash != null && requestedWithSameHash.remove(core)) {
                results.add(fetched);
            }
        }
        // keep walking only while this bucket still misses a requested core
        Set<String> pending = unresolved.get(bucket);
        if (pending != null && !pending.isEmpty()) {
            nextFrontier.put(fetched.getCoreHash(), bucket);
        }
    }

    /**
     * Find a hash value that is not used by any IndexEntry.
     * Starts at the string's initial hash value and increments it by one until no IndexEntry with that
     * primary key can be loaded.
     *
     * @param string string to be hashed
     * @return a hash value (possibly rehashed several times) that is not used by any IndexEntry.
     */
    private static long freeHashSlot(String string) {
        long hash = hash(string);
        while (HibernateUtil.getCurrentSession().get(IndexEntry.class, hash) != null) {
            hash++;
        }
        return hash;
    }

    /**
     * A "long" hashing function.
     * Shifts the string's hash code left by four bits, which leaves the lowest four bits 0 so that the
     * freeHashSlot() method can more easily find a free slot by incrementing the hash value by one.
     *
     * @param string the string to be hashed
     * @return the original hash value (without rehashing) for the specified string.
     */
    private static long hash(String string) {
        return ((long) string.hashCode()) << 4;
    }
}