/**
* NanoDoA - File based document archive
*
* Copyright (C) 2011-2012 Christian Packenius, christian.packenius@googlemail.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package de.chris_soft.utilities;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * A very (!) basic implementation of full text indexing and search, based on
 * Apache Lucene.
 * <p>
 * Every operation opens its own short-lived index writer or reader on the
 * shared index directory and closes it again before returning, even when the
 * operation fails.
 * @author Christian Packenius.
 */
public class FulltextIndexAndSearchUtils {
  private static final Version VERSION = Version.LUCENE_35;

  /** Name of the field holding the document identifier. */
  private static final String FIELD_DOCUMENT_ID = "documentID";

  /** Name of the field holding the analyzed full text. */
  private static final String FIELD_FULLTEXT = "fulltext";

  /** Maximum number of hits collected by a single search. */
  private static final int MAX_HITS = 10000;

  private final Analyzer analyzer = new SimpleAnalyzer(VERSION);

  private final FSDirectory indexDirectory;

  /**
   * Constructor.
   * @param indexDirectory Directory for Lucene index files. Will be created if
   * it does not exist.
   * @throws IOException If the directory does not exist and cannot be created,
   * or if Lucene cannot open it.
   */
  public FulltextIndexAndSearchUtils(File indexDirectory) throws IOException {
    // mkdirs() also returns false when the directory already exists, so only
    // treat it as a failure if there is still no directory afterwards.
    if (!indexDirectory.mkdirs() && !indexDirectory.isDirectory()) {
      throw new IOException("Cannot create index directory: " + indexDirectory);
    }
    this.indexDirectory = FSDirectory.open(indexDirectory);
  }

  /**
   * Adds a document full text to the search index.
   * <p>
   * The full text is analyzed but not stored; the document ID is stored and
   * indexed as a single un-analyzed term so that {@link #remove(String)} can
   * find the document again.
   * @param documentID ID of the document containing the full text.
   * @param fulltext Full document text.
   * @throws CorruptIndexException
   * @throws IOException
   */
  public void add(String documentID, String fulltext) throws CorruptIndexException, IOException {
    IndexWriter iwriter = openWriter();
    try {
      Document doc = new Document();
      doc.add(new Field(FIELD_FULLTEXT, fulltext, Field.Store.NO, Field.Index.ANALYZED));
      // The ID must be indexed (not only stored), otherwise a delete-by-term
      // on this field can never match the document.
      doc.add(new Field(FIELD_DOCUMENT_ID, documentID, Field.Store.YES, Field.Index.NOT_ANALYZED));
      iwriter.addDocument(doc);
    }
    finally {
      // Always close the writer: it holds the write lock of the directory.
      iwriter.close();
    }
  }

  /**
   * Searches a search item in full index.
   * @param searchItem Item to search, in Lucene query parser syntax. The
   * default search field is the full text.
   * @return List of document IDs that match the search item (at most
   * {@value #MAX_HITS} entries).
   * @throws IOException
   * @throws ParseException If the search item is not a valid query.
   */
  public List<String> search(String searchItem) throws IOException, ParseException {
    List<String> results = new ArrayList<String>();
    IndexReader ireader = IndexReader.open(indexDirectory);
    try {
      IndexSearcher isearcher = new IndexSearcher(ireader);
      try {
        QueryParser parser = new QueryParser(VERSION, FIELD_FULLTEXT, analyzer);
        parser.setLowercaseExpandedTerms(true);
        Query query = parser.parse(searchItem);
        TopScoreDocCollector collector = TopScoreDocCollector.create(MAX_HITS, false);
        isearcher.search(query, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;
        for (ScoreDoc hit : hits) {
          Document hitDoc = isearcher.doc(hit.doc);
          results.add(hitDoc.get(FIELD_DOCUMENT_ID));
        }
      }
      finally {
        isearcher.close();
      }
    }
    finally {
      // A searcher built from an existing reader does not close that reader,
      // so it is closed here explicitly.
      ireader.close();
    }
    return results;
  }

  /**
   * Close index.
   */
  public void close() {
    indexDirectory.close();
  }

  /**
   * Remove a document from the fulltext index.
   * <p>
   * Deletion goes through an index writer and matches the exact ID term
   * written by {@link #add(String, String)}. Documents whose ID was stored
   * without being indexed cannot be matched and are left untouched.
   * @param documentID Document identifier.
   * @throws IOException
   */
  public void remove(String documentID) throws IOException {
    IndexWriter iwriter = openWriter();
    try {
      iwriter.deleteDocuments(new Term(FIELD_DOCUMENT_ID, documentID));
    }
    finally {
      iwriter.close();
    }
  }

  /**
   * Opens a writer on the index directory, creating the index if necessary.
   * The caller is responsible for closing the returned writer.
   */
  private IndexWriter openWriter() throws IOException {
    IndexWriterConfig indexWriterConfiguration = new IndexWriterConfig(VERSION, analyzer);
    indexWriterConfiguration.setOpenMode(OpenMode.CREATE_OR_APPEND);
    return new IndexWriter(indexDirectory, indexWriterConfiguration);
  }
}