// Source code of it.unibz.instasearch.indexing.SearchResultDoc

/*
 * Copyright (c) 2009 Andrejs Jermakovics.
 * 
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     Andrejs Jermakovics - initial implementation
 */
package it.unibz.instasearch.indexing;


import it.unibz.instasearch.InstaSearchPlugin;


import java.io.IOException;
import java.util.Collection;
import java.util.Locale;


import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import org.eclipse.core.resources.IFile;
import org.eclipse.core.resources.IWorkspaceRoot;
import org.eclipse.core.runtime.IPath;
import org.eclipse.core.runtime.Path;




public class SearchResultDoc {


  private Document doc;
  private int docId;
  private TermFreqVector termFreqVector;
  private float[] termScoreVector;
  private float score;
  private int matchCount;
  private Directory indexDir;


  public SearchResultDoc(Directory dir, Document doc, int docId, float score) {
    this.indexDir = dir;
    this.docId = docId;
    this.doc = doc;
    this.score = score;
    
    matchCount = 0;
  }


  private String getFieldValue(Field field) {
    return doc.get(field.toString());
  }
  
  public String getFilePath() {
    return getFieldValue(Field.FILE);
  }
  
  public String getFileName() {
    return getFieldValue(Field.NAME);
  }
  
  public String getFileExtension() {
    return getFieldValue(Field.EXT);
  }
  
  public boolean isInJar() {
    if( doc.getField(Field.JAR.toString())==null )
      return false;
    
    String jarField = getFieldValue(Field.JAR);
    
    if( StorageIndexer.NO_VALUE.equals(jarField) )
      return false;
    
    if( jarField.toLowerCase(Locale.ENGLISH).endsWith(".jar") )
      return true;
    
    return false;
  }
  
  public String getJarName() {
    
    if( isInJar() )
      return getFieldValue(Field.JAR);
    
    return null;
  }
  
  public IPath getProject() {
    return new Path(getFieldValue(Field.PROJ));
  }
  
  public String getProjectName() {
    return getProject().lastSegment();
  }
  
  /**
   * 
   * @return
   * @throws IOException
   */
  private float[] getTermScoreVector()  throws IOException
  {
    if( termScoreVector == null ) {
      IndexReader reader = IndexReader.open(indexDir, true);


      if( termFreqVector == null ) 
        createFreqVect(reader);


      termScoreVector = createTermScoreVector(termFreqVector, reader);
      reader.close();
    }


    return termScoreVector;
  }


  private TermFreqVector getTermFreqVector() throws IOException
  {
    if( termFreqVector == null ) {
      IndexReader reader = IndexReader.open(indexDir, true);
      createFreqVect(reader);
      reader.close();
    }


    return termFreqVector;
  }


  private void createFreqVect(IndexReader reader) throws IOException
  {
    termFreqVector = reader.getTermFreqVector(docId, Field.CONTENTS.toString()); // obtain only when requested
  }


  /**
   * Returns a vector of given term scores (tf-idf). 
   * The size of the vector is the number of terms in this document
   * The term positions in the vector are the same as in the term frequency vector
   * 
   * @param terms
   * @return TermScoreVector
   * @throws IOException
   */
  public float[] getTermScoreVector(Collection<String> terms) throws IOException
  {
    float[] allTermScoreVect = getTermScoreVector();
    float[] termScoreVect = new float[allTermScoreVect.length]; 
    TermFreqVector freqVector = getTermFreqVector();


    for(String term: terms){
      int idx = freqVector.indexOf(term); // does a binary search
      if( idx == -1 ) continue;
      termScoreVect[idx] = allTermScoreVect[idx];
    }


    return termScoreVect;
  }


  public double getTermScore(String term) throws IOException
  {
    float[] allTermScoreVect = getTermScoreVector();
    TermFreqVector freqVector = getTermFreqVector();


    if( freqVector == null ) return 0;
    
    int idx = freqVector.indexOf(term); // does a binary search
    if( idx == -1 ) return 0;
    return allTermScoreVect[idx];
  }


  private float[] createTermScoreVector(TermFreqVector vect, IndexReader reader) throws IOException
  {    
    if( vect == null )
      return new float[0];


    int[] termFrequencies = vect.getTermFrequencies();
    String[] terms = vect.getTerms();
    float[] scores = new float[terms.length];


    int numDocs = reader.maxDoc();
    Similarity sim = Searcher.SIMILARITY;
    
    for(int i = 0; i < terms.length; i++) {
      String termText = terms[i];
      Term term = new Term(Field.CONTENTS.toString(), termText);
      
      float termFreq = sim.tf( termFrequencies[i] );


      int docFreq = reader.docFreq(term);
      float idf = sim.idf(docFreq, numDocs); 


      float tfIdf = termFreq * idf;


      scores[i] = tfIdf;
    }


    return scores; 
  }


  public IFile getFile() 
  {
    if( isInJar() ) return null;
    
    Path path = new Path(getFilePath());
    IWorkspaceRoot workspaceRoot = InstaSearchPlugin.getWorkspaceRoot();
    IFile file = workspaceRoot.getFile(path);
    
    if( file == null || file.getRawLocation() == null )
      file = workspaceRoot.getFileForLocation(path);
    
    return file;
  }
  
  /**
   * @return the score
   */
  public float getScore()
  {
    return score;
  }


  /**
   * @return the doc
   */
  public Document getDoc()
  {
    return doc;
  }


  /**
   * @return the docId
   */
  public int getDocId()
  {
    return docId;
  }


  /**
   * @return the matchCount
   */
  public int getMatchCount()
  {
    return matchCount;
  }


  /**
   * Computes match count as SUM( tf ) of all query terms in the document
   * Accesses the index thus affects performance
   * 
   * @param reader
   * @param queryTerms
   * @throws IOException
   */
  public void computeMatchCount(IndexReader reader, Collection<String> queryTerms) throws IOException 
  {
    if( termFreqVector == null )
      createFreqVect(reader);


    if( termFreqVector == null )
      return;
    
    int freqs[] = termFreqVector.getTermFrequencies();
    int freqSum = 0;


    for(String term: queryTerms){
      int idx = termFreqVector.indexOf(term); // does a binary search
      if( idx == -1 ) continue;
      freqSum += freqs[idx]; 
    }


    matchCount = freqSum;
  }
  
  @Override
  public String toString() {
    return getFilePath();
  }
}
// Source code of it.unibz.instasearch.indexing.SearchResultDoc

// Related classes of it.unibz.instasearch.indexing.SearchResultDoc