package org.deri.grefine.rdf.vocab.imp;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.deri.grefine.rdf.vocab.IVocabularySearcher;
import org.deri.grefine.rdf.vocab.PrefixExistException;
import org.deri.grefine.rdf.vocab.RDFNode;
import org.deri.grefine.rdf.vocab.RDFSClass;
import org.deri.grefine.rdf.vocab.RDFSProperty;
import org.deri.grefine.rdf.vocab.SearchResultItem;
import org.deri.grefine.rdf.vocab.Vocabulary;
import org.deri.grefine.rdf.vocab.VocabularyImportException;
import org.deri.grefine.rdf.vocab.VocabularyImporter;
import org.deri.grefine.rdf.vocab.VocabularyIndexException;
import org.openrdf.repository.Repository;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class VocabularySearcher implements IVocabularySearcher {
final static Logger logger = LoggerFactory.getLogger("vocabulary_searcher");
private static final String CLASS_TYPE = "class";
private static final String PROPERTY_TYPE = "property";
// project id is always a number. it is safe to use this placeholder
private static final String GLOBAL_VOCABULARY_PLACE_HOLDER = "g";
private IndexWriter writer;
private IndexSearcher searcher;
private IndexReader r;
private Directory _directory;
public VocabularySearcher(File dir) throws IOException {
_directory = new SimpleFSDirectory(new File(dir, "luceneIndex"));
Analyzer a = new SimpleAnalyzer(Version.LUCENE_43);
IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_43,a);
writer = new IndexWriter(_directory,conf);
writer.commit();
r = DirectoryReader.open(_directory);
searcher = new IndexSearcher(r);
}
@Override
public void importAndIndexVocabulary(String name, String uri, String fetchUrl,VocabularyImporter importer)throws VocabularyImportException, VocabularyIndexException,PrefixExistException, CorruptIndexException, IOException {
importAndIndexVocabulary(name, uri, fetchUrl, GLOBAL_VOCABULARY_PLACE_HOLDER,importer);
}
@Override
public void importAndIndexVocabulary(String name, String uri, String fetchUrl, String projectId,VocabularyImporter importer) throws VocabularyImportException,VocabularyIndexException, PrefixExistException,
CorruptIndexException, IOException {
List<RDFSClass> classes = new ArrayList<RDFSClass>();
List<RDFSProperty> properties = new ArrayList<RDFSProperty>();
importer.importVocabulary(name, uri, fetchUrl,classes, properties);
indexTerms(name, uri, projectId, classes, properties);
}
@Override
public void importAndIndexVocabulary(String name, String uri, Repository repository, String projectId,VocabularyImporter importer) throws VocabularyImportException, VocabularyIndexException,
PrefixExistException, CorruptIndexException, IOException {
List<RDFSClass> classes = new ArrayList<RDFSClass>();
List<RDFSProperty> properties = new ArrayList<RDFSProperty>();
importer.importVocabulary(name, uri, repository, classes, properties);
indexTerms(name, uri, projectId, classes, properties);
}
@Override
public List<SearchResultItem> searchClasses(String str, String projectId)
throws IOException {
Query query = prepareQuery(str, CLASS_TYPE, projectId);
TopDocs docs = searcher.search(query, getMaxDoc());
return prepareSearchResults(docs);
}
@Override
public List<SearchResultItem> searchProperties(String str, String projectId)
throws IOException {
Query query = prepareQuery(str, PROPERTY_TYPE, projectId);
TopDocs docs = searcher.search(query, getMaxDoc());
return prepareSearchResults(docs);
}
@Override
public void deleteTermsOfVocabs(Set<Vocabulary> toRemove, String projectId)
throws CorruptIndexException, IOException {
for (Vocabulary v : toRemove) {
deleteTerms(v.getName(), projectId);
}
this.update();
}
@Override
public void addPredefinedVocabulariesToProject(long projectId)throws VocabularyIndexException, IOException{
//get all documents of the global scope
TopDocs docs = getDocumentsOfProjectId(GLOBAL_VOCABULARY_PLACE_HOLDER);
//add all of them to project projectId
addDocumentsToProject(docs,String.valueOf(projectId));
this.update();
}
@Override
public void update() throws CorruptIndexException, IOException {
writer.commit();
// TODO this shouldn't be required but it is not working without it...
// check
r.close();
r = IndexReader.open(_directory);
searcher = new IndexSearcher(r);
}
@Override
public void synchronize(String projectId, Set<String> prefixes) throws IOException{
Set<String> allPrefixes = getPrefixesOfProjectId(projectId);
allPrefixes.removeAll(prefixes);
if(!allPrefixes.isEmpty()){
deletePrefixesOfProjectId(projectId,allPrefixes);
}
this.update();
}
@Override
public void deleteTermsOfVocab(String vocabName, String projectId) throws CorruptIndexException, IOException {
deleteTerms(vocabName, projectId);
this.update();
}
/*
* Private methods
*/
private void deleteTerms(String prefix, String projectId)
throws CorruptIndexException, IOException {
if (projectId == null || projectId.isEmpty()) {
throw new RuntimeException("projectId is null");
}
// "type":vocabulary AND "projectId":projectId AND "name":name
// ("type": (class OR property) ) AND "projectId":projectId AND
// "prefix":name
BooleanQuery termsQuery = new BooleanQuery();
BooleanQuery typeQuery = new BooleanQuery();
typeQuery
.add(new TermQuery(new Term("type", CLASS_TYPE)), Occur.SHOULD);
typeQuery.add(new TermQuery(new Term("type", PROPERTY_TYPE)),
Occur.SHOULD);
termsQuery.add(typeQuery, Occur.MUST);
termsQuery.add(new TermQuery(new Term("projectId", projectId)),
Occur.MUST);
termsQuery.add(new TermQuery(new Term("prefix", prefix)), Occur.MUST);
writer.deleteDocuments(termsQuery);
}
private void indexTerms(String name, String uri, String projectId,
List<RDFSClass> classes, List<RDFSProperty> properties)
throws CorruptIndexException, IOException {
for (RDFSClass c : classes) {
indexRdfNode(c, CLASS_TYPE, projectId);
}
for (RDFSProperty p : properties) {
indexRdfNode(p, PROPERTY_TYPE, projectId);
}
this.update();
}
private void indexRdfNode(RDFNode node, String type, String projectId)
throws CorruptIndexException, IOException {
Document doc = new Document();
doc.add(new Field("type", type, Field.Store.YES,
Field.Index.NOT_ANALYZED));
doc.add(new Field("prefix", node.getVocabularyPrefix(),
Field.Store.YES, Field.Index.NOT_ANALYZED));
String l = node.getLabel() == null ? "" : node.getLabel();
doc.add(new Field("label", l, Field.Store.YES, Field.Index.ANALYZED));
String d = node.getDescription() == null ? "" : node.getDescription();
doc.add(new Field("description", d, Field.Store.YES,
Field.Index.ANALYZED));
doc.add(new Field("uri", node.getURI(), Field.Store.YES, Field.Index.NO));
doc.add(new Field("localPart", node.getLocalPart(), Field.Store.YES,
Field.Index.ANALYZED));
doc.add(new Field("namespace", node.getVocabularyUri(),
Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field("projectId", String.valueOf(projectId),
Field.Store.YES, Field.Index.NOT_ANALYZED));
writer.addDocument(doc);
}
private Query prepareQuery(String s, String type, String projectId)
throws IOException {
BooleanQuery q1 = new BooleanQuery();
// q1.add(new TermQuery(new
// Term("projectId",GLOBAL_VOCABULARY_PLACE_HOLDER)), Occur.SHOULD);
q1.add(new TermQuery(new Term("projectId", projectId)), Occur.MUST);
BooleanQuery q2 = new BooleanQuery();
q2.add(new TermQuery(new Term("type", type)), Occur.MUST);
BooleanQuery q = new BooleanQuery();
q.add(q1, Occur.MUST);
q.add(q2, Occur.MUST);
if (s != null && s.trim().length() > 0) {
SimpleAnalyzer analyzer = new SimpleAnalyzer(Version.LUCENE_36);
if (s.indexOf(":") == -1) {
// the query we need:
// "projectId":projectId AND "type":type AND ("prefix":s* OR
// "localPart":s* OR "label":s* OR "description":s*)
BooleanQuery q3 = new BooleanQuery();
q3.add(new WildcardQuery(new Term("prefix", s + "*")),
Occur.SHOULD);
TokenStream stream = analyzer.tokenStream("localPart",
new StringReader(s));
// get the TermAttribute from the TokenStream
CharTermAttribute termAtt = (CharTermAttribute) stream
.addAttribute(CharTermAttribute.class);
stream.reset();
while (stream.incrementToken()) {
String tmp = termAtt.toString() + "*";
q3.add(new WildcardQuery(new Term("localPart", tmp)),
Occur.SHOULD);
}
stream.close();
stream.end();
stream = analyzer.tokenStream("description",
new StringReader(s));
// get the TermAttribute from the TokenStream
termAtt = (CharTermAttribute) stream
.addAttribute(CharTermAttribute.class);
stream.reset();
while (stream.incrementToken()) {
String tmp = termAtt.toString() + "*";
q3.add(new WildcardQuery(new Term("description", tmp)),
Occur.SHOULD);
}
stream.close();
stream.end();
stream = analyzer.tokenStream("label", new StringReader(s));
// get the TermAttribute from the TokenStream
termAtt = (CharTermAttribute) stream
.addAttribute(CharTermAttribute.class);
stream.reset();
while (stream.incrementToken()) {
String tmp = termAtt.toString() + "*";
q3.add(new WildcardQuery(new Term("label", tmp)),
Occur.SHOULD);
}
stream.close();
stream.end();
q.add(q3, Occur.MUST);
return q;
} else {
// the query we need:
// "projectId":projectId AND "type":type AND ("prefix":p1 AND
// "localPart":s*)
String p1 = s.substring(0, s.indexOf(":"));
String p2 = s.substring(s.indexOf(":") + 1);
BooleanQuery q3 = new BooleanQuery();
q3.add(new TermQuery(new Term("prefix", p1)), Occur.SHOULD);
BooleanQuery q4 = new BooleanQuery();
TokenStream stream = analyzer.tokenStream("localPart",
new StringReader(p2));
// get the TermAttribute from the TokenStream
CharTermAttribute termAtt = (CharTermAttribute) stream
.addAttribute(CharTermAttribute.class);
stream.reset();
if (!p2.isEmpty()) {
while (stream.incrementToken()) {
q4.add(new WildcardQuery(new Term("localPart", termAtt.toString()
+ "*")), Occur.SHOULD);
}
}
stream.close();
stream.end();
q.add(q3, Occur.MUST);
if (!p2.isEmpty()) {
q.add(q4, Occur.MUST);
}
return q;
}
} else {
return q;
}
}
private List<SearchResultItem> prepareSearchResults(TopDocs docs)
throws CorruptIndexException, IOException {
List<SearchResultItem> res = new ArrayList<SearchResultItem>();
for (int i = 0; i < docs.totalHits; i++) {
Document doc = searcher.doc(docs.scoreDocs[i].doc);
String uri = doc.get("uri");
String label = doc.get("label");
String description = doc.get("description");
String prefix = doc.get("prefix");
String lPart = doc.get("localPart");
SearchResultItem item = new SearchResultItem(uri, prefix, lPart,
label, description);
res.add(item);
}
return res;
}
private void addDocumentsToProject(TopDocs docs,String projectId) throws CorruptIndexException, IOException{
for(int i=0;i<docs.totalHits;i++){
Document doc = searcher.doc(docs.scoreDocs[i].doc);
//TODO this needs to be changed into a more efficient impl
Document newdoc = new Document();
Iterator fieldsIter = doc.getFields().iterator();
while(fieldsIter.hasNext()){
newdoc.add((IndexableField)fieldsIter.next());
}
newdoc.removeField("projectId");
newdoc.add(new Field("projectId",projectId,Field.Store.YES,Field.Index.NOT_ANALYZED));
writer.addDocument(newdoc);
}
}
private TopDocs getDocumentsOfProjectId(String projectId) throws IOException{
//query for:
// "projectId":projectId
Query query = new TermQuery(new Term("projectId",projectId));
return searcher.search(query, getMaxDoc());
}
private Set<String> getPrefixesOfProjectId(String projectId) throws IOException{
//query for:
// "projectId":projectId
Set<String> prefixes = new HashSet<String>();
Query query = new TermQuery(new Term("projectId",projectId));
TopDocs docs = searcher.search(query, getMaxDoc());
for (int i = 0; i < docs.totalHits; i++) {
Document doc = searcher.doc(docs.scoreDocs[i].doc);
prefixes.add(doc.get("prefix"));
}
return prefixes;
}
private void deletePrefixesOfProjectId(String projectId, Set<String> toDelete) throws CorruptIndexException, IOException {
if (projectId == null || projectId.isEmpty()) {
throw new RuntimeException("projectId is null");
}
// "type":vocabulary AND "projectId":projectId AND ("prefix":prefix OR ...)
BooleanQuery q = new BooleanQuery();
Query query = new TermQuery(new Term("projectId",projectId));
//TODO backward compatibility is broken here!!!!!!
// Query typeQ = new TermQuery(new Term("type", "vocabulary"));
BooleanQuery prefixQ = new BooleanQuery();
for(String p:toDelete){
Query pQ = new TermQuery(new Term("prefix",p));
prefixQ.add(pQ,Occur.SHOULD);
}
q.add(query,Occur.MUST);
// q.add(typeQ,Occur.MUST);
q.add(prefixQ,Occur.MUST);
writer.deleteDocuments(q);
}
private int getMaxDoc() throws IOException {
return r.maxDoc() > 0 ? r.maxDoc() : 100000;
}
}