Source Code of org.deri.grefine.rdf.vocab.imp.VocabularySearcher

package org.deri.grefine.rdf.vocab.imp;


import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;


import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.deri.grefine.rdf.vocab.IVocabularySearcher;
import org.deri.grefine.rdf.vocab.PrefixExistException;
import org.deri.grefine.rdf.vocab.RDFNode;
import org.deri.grefine.rdf.vocab.RDFSClass;
import org.deri.grefine.rdf.vocab.RDFSProperty;
import org.deri.grefine.rdf.vocab.SearchResultItem;
import org.deri.grefine.rdf.vocab.Vocabulary;
import org.deri.grefine.rdf.vocab.VocabularyImportException;
import org.deri.grefine.rdf.vocab.VocabularyImporter;
import org.deri.grefine.rdf.vocab.VocabularyIndexException;
import org.openrdf.repository.Repository;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;




public class VocabularySearcher implements IVocabularySearcher {


  final static Logger logger = LoggerFactory.getLogger("vocabulary_searcher");


  private static final String CLASS_TYPE = "class";
  private static final String PROPERTY_TYPE = "property";
  // project id is always a number. it is safe to use this placeholder
  private static final String GLOBAL_VOCABULARY_PLACE_HOLDER = "g";


  private IndexWriter writer;
  private IndexSearcher searcher;
  private IndexReader r;


  private Directory _directory;
  
  public VocabularySearcher(File dir) throws IOException {
                _directory = new SimpleFSDirectory(new File(dir, "luceneIndex"));
                Analyzer a = new SimpleAnalyzer(Version.LUCENE_43);
                IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_43,a);                


                writer = new IndexWriter(_directory,conf);
                writer.commit();
                r = DirectoryReader.open(_directory);
                searcher = new IndexSearcher(r);
        }


  @Override
  public void importAndIndexVocabulary(String name, String uri, String fetchUrl,VocabularyImporter importer)throws VocabularyImportException, VocabularyIndexException,PrefixExistException, CorruptIndexException, IOException {
    importAndIndexVocabulary(name, uri, fetchUrl, GLOBAL_VOCABULARY_PLACE_HOLDER,importer);
  }


  @Override
  public void importAndIndexVocabulary(String name, String uri, String fetchUrl, String projectId,VocabularyImporter importer) throws VocabularyImportException,VocabularyIndexException, PrefixExistException,
      CorruptIndexException, IOException {
    List<RDFSClass> classes = new ArrayList<RDFSClass>();
    List<RDFSProperty> properties = new ArrayList<RDFSProperty>();
    importer.importVocabulary(name, uri, fetchUrl,classes, properties);
    indexTerms(name, uri, projectId, classes, properties);
  }


  
  @Override
  public void importAndIndexVocabulary(String name, String uri, Repository repository, String projectId,VocabularyImporter importer) throws VocabularyImportException, VocabularyIndexException,
      PrefixExistException, CorruptIndexException, IOException {
    List<RDFSClass> classes = new ArrayList<RDFSClass>();
    List<RDFSProperty> properties = new ArrayList<RDFSProperty>();
    importer.importVocabulary(name, uri, repository, classes, properties);
    indexTerms(name, uri, projectId, classes, properties);
  }


  @Override
  public List<SearchResultItem> searchClasses(String str, String projectId)
      throws IOException {
    Query query = prepareQuery(str, CLASS_TYPE, projectId);
    TopDocs docs = searcher.search(query, getMaxDoc());    
    return prepareSearchResults(docs);
  }


  @Override
  public List<SearchResultItem> searchProperties(String str, String projectId)
      throws IOException {
    Query query = prepareQuery(str, PROPERTY_TYPE, projectId);
    TopDocs docs = searcher.search(query, getMaxDoc());
    return prepareSearchResults(docs);
  }


  @Override
  public void deleteTermsOfVocabs(Set<Vocabulary> toRemove, String projectId)
      throws CorruptIndexException, IOException {
    for (Vocabulary v : toRemove) {
      deleteTerms(v.getName(), projectId);
    }
    this.update();
  }


  @Override
  public void addPredefinedVocabulariesToProject(long projectId)throws VocabularyIndexException, IOException{
    //get all documents of the global scope
    TopDocs docs = getDocumentsOfProjectId(GLOBAL_VOCABULARY_PLACE_HOLDER);
    //add all of them to project projectId
    addDocumentsToProject(docs,String.valueOf(projectId));
    
    this.update();
  }
  
  @Override
  public void update() throws CorruptIndexException, IOException {
    writer.commit();
    // TODO this shouldn't be required but it is not working without it...
    // check
    r.close();
    r = IndexReader.open(_directory);
    searcher = new IndexSearcher(r);
  }
  
  @Override
  public void synchronize(String projectId, Set<String> prefixes) throws IOException{
    Set<String> allPrefixes = getPrefixesOfProjectId(projectId);
    allPrefixes.removeAll(prefixes);
    if(!allPrefixes.isEmpty()){
      deletePrefixesOfProjectId(projectId,allPrefixes);
    }
    this.update();
  }
  
  @Override
  public void deleteTermsOfVocab(String vocabName, String projectId) throws CorruptIndexException, IOException {
    deleteTerms(vocabName, projectId);
    this.update();
  }


  /*
   * Private methods
   */
  private void deleteTerms(String prefix, String projectId)
      throws CorruptIndexException, IOException {
    if (projectId == null || projectId.isEmpty()) {
      throw new RuntimeException("projectId is null");
    }
    // "type":vocabulary AND "projectId":projectId AND "name":name
    // ("type": (class OR property) ) AND "projectId":projectId AND
    // "prefix":name


    BooleanQuery termsQuery = new BooleanQuery();
    BooleanQuery typeQuery = new BooleanQuery();
    typeQuery
        .add(new TermQuery(new Term("type", CLASS_TYPE)), Occur.SHOULD);
    typeQuery.add(new TermQuery(new Term("type", PROPERTY_TYPE)),
        Occur.SHOULD);


    termsQuery.add(typeQuery, Occur.MUST);
    termsQuery.add(new TermQuery(new Term("projectId", projectId)),
        Occur.MUST);
    termsQuery.add(new TermQuery(new Term("prefix", prefix)), Occur.MUST);


    writer.deleteDocuments(termsQuery);
  }


  private void indexTerms(String name, String uri, String projectId,
      List<RDFSClass> classes, List<RDFSProperty> properties)
      throws CorruptIndexException, IOException {
    for (RDFSClass c : classes) {
      indexRdfNode(c, CLASS_TYPE, projectId);
    }
    for (RDFSProperty p : properties) {
      indexRdfNode(p, PROPERTY_TYPE, projectId);
    }


    this.update();
  }


  private void indexRdfNode(RDFNode node, String type, String projectId)
      throws CorruptIndexException, IOException {
    Document doc = new Document();
    doc.add(new Field("type", type, Field.Store.YES,
        Field.Index.NOT_ANALYZED));
    doc.add(new Field("prefix", node.getVocabularyPrefix(),
        Field.Store.YES, Field.Index.NOT_ANALYZED));
    String l = node.getLabel() == null ? "" : node.getLabel();
    doc.add(new Field("label", l, Field.Store.YES, Field.Index.ANALYZED));
    String d = node.getDescription() == null ? "" : node.getDescription();
    doc.add(new Field("description", d, Field.Store.YES,
        Field.Index.ANALYZED));
    doc.add(new Field("uri", node.getURI(), Field.Store.YES, Field.Index.NO));
    doc.add(new Field("localPart", node.getLocalPart(), Field.Store.YES,
        Field.Index.ANALYZED));
    doc.add(new Field("namespace", node.getVocabularyUri(),
        Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("projectId", String.valueOf(projectId),
        Field.Store.YES, Field.Index.NOT_ANALYZED));


    writer.addDocument(doc);
  }


  private Query prepareQuery(String s, String type, String projectId)
      throws IOException {
    BooleanQuery q1 = new BooleanQuery();
    // q1.add(new TermQuery(new
    // Term("projectId",GLOBAL_VOCABULARY_PLACE_HOLDER)), Occur.SHOULD);
    q1.add(new TermQuery(new Term("projectId", projectId)), Occur.MUST);


    BooleanQuery q2 = new BooleanQuery();
    q2.add(new TermQuery(new Term("type", type)), Occur.MUST);


    BooleanQuery q = new BooleanQuery();
    q.add(q1, Occur.MUST);
    q.add(q2, Occur.MUST);


    if (s != null && s.trim().length() > 0) {
      SimpleAnalyzer analyzer = new SimpleAnalyzer(Version.LUCENE_36);
      if (s.indexOf(":") == -1) {
        // the query we need:
        // "projectId":projectId AND "type":type AND ("prefix":s* OR
        // "localPart":s* OR "label":s* OR "description":s*)
        BooleanQuery q3 = new BooleanQuery();
        q3.add(new WildcardQuery(new Term("prefix", s + "*")),
            Occur.SHOULD);


        TokenStream stream = analyzer.tokenStream("localPart",
            new StringReader(s));
        // get the TermAttribute from the TokenStream
        CharTermAttribute termAtt = (CharTermAttribute) stream
            .addAttribute(CharTermAttribute.class);


        stream.reset();
                
        while (stream.incrementToken()) {
          String tmp = termAtt.toString() + "*";
          q3.add(new WildcardQuery(new Term("localPart", tmp)),
              Occur.SHOULD);
        }
        stream.close();
        stream.end();


        stream = analyzer.tokenStream("description",
            new StringReader(s));
        // get the TermAttribute from the TokenStream
        termAtt = (CharTermAttribute) stream
            .addAttribute(CharTermAttribute.class);


        stream.reset();
        while (stream.incrementToken()) {
          String tmp = termAtt.toString() + "*";
          q3.add(new WildcardQuery(new Term("description", tmp)),
              Occur.SHOULD);
        }
        stream.close();
        stream.end();


        stream = analyzer.tokenStream("label", new StringReader(s));
        // get the TermAttribute from the TokenStream
        termAtt = (CharTermAttribute) stream
            .addAttribute(CharTermAttribute.class);


        stream.reset();
        while (stream.incrementToken()) {
          String tmp = termAtt.toString() + "*";
          q3.add(new WildcardQuery(new Term("label", tmp)),
              Occur.SHOULD);
        }
        stream.close();
        stream.end();


        q.add(q3, Occur.MUST);
        return q;
      } else {
        // the query we need:
        // "projectId":projectId AND "type":type AND ("prefix":p1 AND
        // "localPart":s*)
        String p1 = s.substring(0, s.indexOf(":"));
        String p2 = s.substring(s.indexOf(":") + 1);


        BooleanQuery q3 = new BooleanQuery();
        q3.add(new TermQuery(new Term("prefix", p1)), Occur.SHOULD);


        BooleanQuery q4 = new BooleanQuery();


        TokenStream stream = analyzer.tokenStream("localPart",
            new StringReader(p2));
        // get the TermAttribute from the TokenStream
        CharTermAttribute termAtt = (CharTermAttribute) stream
            .addAttribute(CharTermAttribute.class);


        stream.reset();
        if (!p2.isEmpty()) {
          while (stream.incrementToken()) {
            q4.add(new WildcardQuery(new Term("localPart", termAtt.toString()
                 + "*")), Occur.SHOULD);
          }
        }
        stream.close();
        stream.end();


        q.add(q3, Occur.MUST);
        if (!p2.isEmpty()) {
          q.add(q4, Occur.MUST);
        }
        return q;
      }
    } else {
      return q;
    }


  }


  private List<SearchResultItem> prepareSearchResults(TopDocs docs)
      throws CorruptIndexException, IOException {
    List<SearchResultItem> res = new ArrayList<SearchResultItem>();
    for (int i = 0; i < docs.totalHits; i++) {
      Document doc = searcher.doc(docs.scoreDocs[i].doc);
      String uri = doc.get("uri");
      String label = doc.get("label");
      String description = doc.get("description");
      String prefix = doc.get("prefix");
      String lPart = doc.get("localPart");


      SearchResultItem item = new SearchResultItem(uri, prefix, lPart,
          label, description);
      res.add(item);
    }


    return res;


  }
  
  private void addDocumentsToProject(TopDocs docs,String projectId) throws CorruptIndexException, IOException{
    for(int i=0;i<docs.totalHits;i++){
      Document doc = searcher.doc(docs.scoreDocs[i].doc);
      //TODO this needs to be changed into a more efficient impl
      Document newdoc = new Document();
      Iterator fieldsIter = doc.getFields().iterator();
      while(fieldsIter.hasNext()){
        newdoc.add((IndexableField)fieldsIter.next());
      }
      newdoc.removeField("projectId");
      newdoc.add(new Field("projectId",projectId,Field.Store.YES,Field.Index.NOT_ANALYZED));
      writer.addDocument(newdoc);
    }
  }
  
  private TopDocs getDocumentsOfProjectId(String projectId) throws IOException{
    //query for:
    // "projectId":projectId
    Query query = new TermQuery(new Term("projectId",projectId));
    return searcher.search(query, getMaxDoc());
  }
  
  private Set<String> getPrefixesOfProjectId(String projectId) throws IOException{
    //query for:
    // "projectId":projectId
    Set<String> prefixes = new HashSet<String>();
    Query query = new TermQuery(new Term("projectId",projectId));
    TopDocs docs =  searcher.search(query, getMaxDoc());
    for (int i = 0; i < docs.totalHits; i++) {
      Document doc = searcher.doc(docs.scoreDocs[i].doc);
      prefixes.add(doc.get("prefix"));
    }
    return prefixes;
  }
  private void deletePrefixesOfProjectId(String projectId, Set<String> toDelete) throws CorruptIndexException, IOException {
    if (projectId == null || projectId.isEmpty()) {
      throw new RuntimeException("projectId is null");
    }
    // "type":vocabulary AND "projectId":projectId AND ("prefix":prefix OR ...)
    BooleanQuery q = new BooleanQuery();
    Query query = new TermQuery(new Term("projectId",projectId));
    //TODO backward compatibility is broken here!!!!!!
//    Query typeQ = new TermQuery(new Term("type", "vocabulary"));
    
    BooleanQuery prefixQ = new BooleanQuery();
    for(String p:toDelete){
      Query pQ = new TermQuery(new Term("prefix",p));
      prefixQ.add(pQ,Occur.SHOULD);
    }
    q.add(query,Occur.MUST);
//    q.add(typeQ,Occur.MUST);
    q.add(prefixQ,Occur.MUST);
    
    writer.deleteDocuments(q);    
  }
  
  private int getMaxDoc() throws IOException {
    return r.maxDoc() > 0 ? r.maxDoc() : 100000;
  }
}
Source Code of org.deri.grefine.rdf.vocab.imp.VocabularySearcher

Related Classes of org.deri.grefine.rdf.vocab.imp.VocabularySearcher