Package it.unibz.instasearch.indexing

Source Code of it.unibz.instasearch.indexing.SearchResultDoc

/*
* Copyright (c) 2009 Andrejs Jermakovics.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
*     Andrejs Jermakovics - initial implementation
*/
package it.unibz.instasearch.indexing;

import it.unibz.instasearch.InstaSearchPlugin;

import java.io.IOException;
import java.util.Collection;
import java.util.Locale;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import org.eclipse.core.resources.IFile;
import org.eclipse.core.resources.IWorkspaceRoot;
import org.eclipse.core.runtime.IPath;
import org.eclipse.core.runtime.Path;


public class SearchResultDoc {

  private Document doc;
  private int docId;
  private TermFreqVector termFreqVector;
  private float[] termScoreVector;
  private float score;
  private int matchCount;
  private Directory indexDir;

  public SearchResultDoc(Directory dir, Document doc, int docId, float score) {
    this.indexDir = dir;
    this.docId = docId;
    this.doc = doc;
    this.score = score;
   
    matchCount = 0;
  }

  private String getFieldValue(Field field) {
    return doc.get(field.toString());
  }
 
  public String getFilePath() {
    return getFieldValue(Field.FILE);
  }
 
  public String getFileName() {
    return getFieldValue(Field.NAME);
  }
 
  public String getFileExtension() {
    return getFieldValue(Field.EXT);
  }
 
  public boolean isInJar() {
    if( doc.getField(Field.JAR.toString())==null )
      return false;
   
    String jarField = getFieldValue(Field.JAR);
   
    if( StorageIndexer.NO_VALUE.equals(jarField) )
      return false;
   
    if( jarField.toLowerCase(Locale.ENGLISH).endsWith(".jar") )
      return true;
   
    return false;
  }
 
  public String getJarName() {
   
    if( isInJar() )
      return getFieldValue(Field.JAR);
   
    return null;
  }
 
  public IPath getProject() {
    return new Path(getFieldValue(Field.PROJ));
  }
 
  public String getProjectName() {
    return getProject().lastSegment();
  }
 
  /**
   *
   * @return
   * @throws IOException
   */
  private float[] getTermScoreVector()  throws IOException
  {
    if( termScoreVector == null ) {
      IndexReader reader = IndexReader.open(indexDir, true);

      if( termFreqVector == null )
        createFreqVect(reader);

      termScoreVector = createTermScoreVector(termFreqVector, reader);
      reader.close();
    }

    return termScoreVector;
  }

  private TermFreqVector getTermFreqVector() throws IOException
  {
    if( termFreqVector == null ) {
      IndexReader reader = IndexReader.open(indexDir, true);
      createFreqVect(reader);
      reader.close();
    }

    return termFreqVector;
  }

  private void createFreqVect(IndexReader reader) throws IOException
  {
    termFreqVector = reader.getTermFreqVector(docId, Field.CONTENTS.toString()); // obtain only when requested
  }

  /**
   * Returns a vector of given term scores (tf-idf).
   * The size of the vector is the number of terms in this document
   * The term positions in the vector are the same as in the term frequency vector
   *
   * @param terms
   * @return TermScoreVector
   * @throws IOException
   */
  public float[] getTermScoreVector(Collection<String> terms) throws IOException
  {
    float[] allTermScoreVect = getTermScoreVector();
    float[] termScoreVect = new float[allTermScoreVect.length];
    TermFreqVector freqVector = getTermFreqVector();

    for(String term: terms){
      int idx = freqVector.indexOf(term); // does a binary search
      if( idx == -1 ) continue;
      termScoreVect[idx] = allTermScoreVect[idx];
    }

    return termScoreVect;
  }

  public double getTermScore(String term) throws IOException
  {
    float[] allTermScoreVect = getTermScoreVector();
    TermFreqVector freqVector = getTermFreqVector();

    if( freqVector == null ) return 0;
   
    int idx = freqVector.indexOf(term); // does a binary search
    if( idx == -1 ) return 0;
    return allTermScoreVect[idx];
  }

  private float[] createTermScoreVector(TermFreqVector vect, IndexReader reader) throws IOException
  {   
    if( vect == null )
      return new float[0];

    int[] termFrequencies = vect.getTermFrequencies();
    String[] terms = vect.getTerms();
    float[] scores = new float[terms.length];

    int numDocs = reader.maxDoc();
    Similarity sim = Searcher.SIMILARITY;
   
    for(int i = 0; i < terms.length; i++) {
      String termText = terms[i];
      Term term = new Term(Field.CONTENTS.toString(), termText);
     
      float termFreq = sim.tf( termFrequencies[i] );

      int docFreq = reader.docFreq(term);
      float idf = sim.idf(docFreq, numDocs);

      float tfIdf = termFreq * idf;

      scores[i] = tfIdf;
    }

    return scores;
  }

  public IFile getFile()
  {
    if( isInJar() ) return null;
   
    Path path = new Path(getFilePath());
    IWorkspaceRoot workspaceRoot = InstaSearchPlugin.getWorkspaceRoot();
    IFile file = workspaceRoot.getFile(path);
   
    if( file == null || file.getRawLocation() == null )
      file = workspaceRoot.getFileForLocation(path);
   
    return file;
  }
 
  /**
   * @return the score
   */
  public float getScore()
  {
    return score;
  }

  /**
   * @return the doc
   */
  public Document getDoc()
  {
    return doc;
  }

  /**
   * @return the docId
   */
  public int getDocId()
  {
    return docId;
  }

  /**
   * @return the matchCount
   */
  public int getMatchCount()
  {
    return matchCount;
  }

  /**
   * Computes match count as SUM( tf ) of all query terms in the document
   * Accesses the index thus affects performance
   *
   * @param reader
   * @param queryTerms
   * @throws IOException
   */
  public void computeMatchCount(IndexReader reader, Collection<String> queryTerms) throws IOException
  {
    if( termFreqVector == null )
      createFreqVect(reader);

    if( termFreqVector == null )
      return;
   
    int freqs[] = termFreqVector.getTermFrequencies();
    int freqSum = 0;

    for(String term: queryTerms){
      int idx = termFreqVector.indexOf(term); // does a binary search
      if( idx == -1 ) continue;
      freqSum += freqs[idx];
    }

    matchCount = freqSum;
  }
 
  @Override
  public String toString() {
    return getFilePath();
  }
}


TOP

Related Classes of it.unibz.instasearch.indexing.SearchResultDoc

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.