/**
* Vosao CMS. Simple CMS for Google App Engine.
*
* Copyright (C) 2009-2010 Vosao development team.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* email: vosao.dev@gmail.com
*/
package org.vosao.search.impl;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.vosao.business.Business;
import org.vosao.common.VosaoContext;
import org.vosao.dao.Dao;
import org.vosao.entity.ContentEntity;
import org.vosao.entity.FileEntity;
import org.vosao.entity.PageEntity;
import org.vosao.search.Hit;
import org.vosao.search.SearchIndex;
import org.vosao.search.SearchResultFilter;
import org.vosao.utils.StrUtil;
public class SearchIndexImpl implements SearchIndex {
private static final Log logger = LogFactory.getLog(
SearchIndexImpl.class);
private static final String INDEX_MOD_DATE = "IndexModDate";
private String language;
private Map<String, Set<Long>> index;
private Date indexModDate;
public SearchIndexImpl(String aLanguage) {
language = aLanguage;
loadIndex();
}
@Override
public void updateIndex(Long pageId) {
PageEntity page = getDao().getPageDao().getById(pageId);
if (page == null) {
return;
}
refreshIndex();
List<PageEntity> versions = getDao().getPageDao().selectByUrl(
page.getFriendlyURL());
for (PageEntity version : versions) {
removeFromIndex(version.getId());
}
page = getDao().getPageDao().getByUrl(page.getFriendlyURL());
if (page == null) {
return;
}
if (!page.isSearchable()) {
return;
}
String content = getDao().getPageDao().getContent(page.getId(),
getLanguage());
if (content == null) {
return;
}
String data = StrUtil.extractSearchTextFromHTML(content.toLowerCase());
String[] words = StrUtil.splitByWord(data);
//logger.info(Arrays.asList(words));
for (String word : words) {
if (word.length() < 3) {
continue;
}
if (!getIndex().containsKey(word)) {
getIndex().put(word, new HashSet<Long>());
}
if (!getIndex().get(word).contains(page.getId())) {
getIndex().get(word).add(page.getId());
}
}
}
@Override
public void removeFromIndex(Long pageId) {
for (Set<Long> pages : getIndex().values()) {
pages.remove(pageId);
}
}
@Override
public void saveIndex() {
try {
byte[] indexContent = StrUtil.zipStringToBytes(indexToString());
FileEntity file = getBusiness().getFileBusiness()
.saveFile(getIndexFilename(), indexContent);
indexModDate = file.getLastModifiedTime();
getBusiness().getSystemService().getCache().getMemcache().put(
getIndexKey(), indexModDate);
}
catch (Exception e) {
e.printStackTrace();
}
}
private String indexToString() {
StringBuffer buf = new StringBuffer();
int i = 0;
for (String word : getIndex().keySet()) {
if (getIndex().get(word).isEmpty()) {
continue;
}
buf.append(i++ == 0 ? "" : ":").append(word).append("=");
int j = 0;
for (Long id : getIndex().get(word)) {
buf.append(j++ == 0 ? "" : ",").append(id);
}
}
return buf.toString();
}
@Override
public List<Hit> search(SearchResultFilter filter, String query,
int textSize) {
try {
refreshIndex();
List<Hit> result = new ArrayList<Hit>();
List<Long> pages = new ArrayList<Long>(getPageIds(query));
for (Long pageId : pages) {
PageEntity page = getDao().getPageDao().getById(pageId);
if (page != null) {
if (filter != null && !filter.check(page)) {
continue;
}
ContentEntity content = getBusiness().getPageBusiness()
.getPageContent(page, language);
if (content != null) {
String text = StrUtil.extractSearchTextFromHTML(
content.getContent());
if (text.length() > textSize) {
text = text.substring(0, textSize);
}
result.add(new Hit(page, text, language));
}
}
else {
logger.error("Page not found " + pageId + ". Rebuild index.");
}
}
return result;
}
catch (Exception e) {
e.printStackTrace();
return Collections.EMPTY_LIST;
}
}
private Set<Long> getPageIds(String query) {
String[] words = StrUtil.splitByWord(query);
if (words.length == 0) {
return Collections.EMPTY_SET;
}
Set<Long> keys = getPageKeys(words[0]);
int i = 0;
for (String word : words) {
if (i++ > 0) {
keys = keysLogicalAnd(keys, getPageKeys(word));
}
}
//logger.info("found keys " + keys.toString());
return keys;
}
private Set<Long> keysLogicalAnd(Set<Long> l1, Set<Long> l2) {
Set<Long> result = new HashSet<Long>();
for (Long i : l1) {
if (l2.contains(i) && !result.contains(i)) {
result.add(i);
}
}
return result;
}
private Set<Long> getPageKeys(String word) {
if (getIndex().containsKey(word)) {
return getIndex().get(word);
}
return Collections.EMPTY_SET;
}
private String getIndexKey() {
return INDEX_MOD_DATE + getLanguage();
}
private void refreshIndex() {
Date date = (Date) getBusiness().getSystemService().getCache()
.getMemcache().get(getIndexKey());
if (index == null || date == null || !date.equals(indexModDate)) {
loadIndex();
}
}
private String getIndexFilename() {
return "/tmp/index_" + getLanguage() + ".bin";
}
private void loadIndex() {
try {
index = new HashMap<String, Set<Long>>();
indexModDate = null;
FileEntity file = getBusiness().getFileBusiness()
.findFile(getIndexFilename());
if (file == null) {
logger.error("Search index not found. " + getIndexFilename());
return;
}
byte[] data = getDao().getFileDao().getFileContent(file);
if (data != null) {
String strIndex = StrUtil.unzipStringFromBytes(data);
indexFromString(strIndex);
indexModDate = file.getLastModifiedTime();
Date dt = (Date)getBusiness().getSystemService().getCache()
.getMemcache().get(getIndexKey());
if (dt == null || dt.before(indexModDate)) {
getBusiness().getSystemService().getCache().getMemcache()
.put(getIndexKey(), indexModDate);
}
}
else {
logger.error("Search index is empty. " + getIndexFilename());
}
}
catch (Exception e) {
e.printStackTrace();
}
}
private void indexFromString(String data) {
if (StringUtils.isEmpty(data)) {
return;
}
for (String wordBuf : data.split("\\:")) {
//logger.info(wordBuf);
String[] wordStruc = wordBuf.split("\\=");
if (wordStruc.length != 2 ) {
logger.error("Problem with index " + wordBuf);
continue;
}
index.put(wordStruc[0], new HashSet<Long>());
for (String key : wordStruc[1].split(",")) {
index.get(wordStruc[0]).add(Long.valueOf(key));
}
}
}
private Business getBusiness() {
return VosaoContext.getInstance().getBusiness();
}
private Dao getDao() {
return getBusiness().getDao();
}
@Override
public String getLanguage() {
return language;
}
private Map<String, Set<Long>> getIndex() {
if (index == null) {
index = new HashMap<String, Set<Long>>();
}
return index;
}
@Override
public void clear() {
getIndex().clear();
}
}