/*******************************************************************************
* Copyright 2009, 2010 Innovation Gate GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package de.innovationgate.wga.common.beans;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.dom4j.Element;
/**
* Represents a lucene rule to index content items
*
*/
public class LuceneIndexItemRule {
private String _itemExpression = "";
private String _indexType = "";
private boolean _sortable = false;
private String _contentType = "";
private float _boost = 1.0F;
/**
* Wildcard for item name patterns
*/
public static final String EXPRESSION_WILDCARD = "*";
/**
* Indexing type "keyword"
*/
public static final String INDEX_TYPE_KEYWORD = "keyword";
/**
* Indexing type "fulltext"
*/
public static final String INDEX_TYPE_FULLTEXT = "fulltext";
/**
* Indexing type "noindex" - Meaning that items applying to this rule will not be indexed
*/
public static final String INDEX_TYPE_NO_INDEX = "noindex";
/**
* Content type "plaintext" which will be indexed unparsed
*/
public static final String CONTENT_TYPE_PLAINTEXT = "plaintext";
/**
* Content type "htmlxml" which will index only real content, no HTML/XML tag code
*/
public static final String CONTENT_TYPE_HTML_XML = "htmlxml";
/**
* get rules from configfile element lucene
* @param lucene configfile element
* @return list LuceneIndexItemRules
*/
public static List getRules(Element lucene) {
ArrayList list = new ArrayList();
Element itemrules = lucene.element("itemrules");
Iterator itemrulesIt = itemrules.elementIterator("itemrule");
while (itemrulesIt.hasNext()) {
Element itemruleElement = (Element) itemrulesIt.next();
LuceneIndexItemRule rule = new LuceneIndexItemRule();
rule.setItemExpression(itemruleElement.getText());
rule.setIndexType(itemruleElement.attributeValue("indextype"));
rule.setContentType(itemruleElement.attributeValue("contenttype"));
if (itemruleElement.attributeValue("sortable").equals("true")) {
rule.setSortable(true);
} else {
rule.setSortable(false);
}
rule.setBoost(Float.parseFloat(itemruleElement.attributeValue("boost", "1.0")));
list.add(rule);
}
return list;
}
/**
* get rules from configbean list
* @return list LuceneIndexItemRules
*/
public static List<LuceneIndexItemRule> getRules(List<de.innovationgate.wga.config.LuceneIndexItemRule> rules) {
ArrayList<LuceneIndexItemRule> list = new ArrayList<LuceneIndexItemRule>();
Iterator<de.innovationgate.wga.config.LuceneIndexItemRule> itemrulesIt = rules.iterator();
while (itemrulesIt.hasNext()) {
de.innovationgate.wga.config.LuceneIndexItemRule ruleConf = itemrulesIt.next();
LuceneIndexItemRule rule = new LuceneIndexItemRule();
rule.setItemExpression(ruleConf.getItemExpression());
rule.setIndexType(ruleConf.getIndexType());
rule.setContentType(ruleConf.getContentType());
rule.setSortable(ruleConf.isSortable());
rule.setBoost(ruleConf.getBoost());
list.add(rule);
}
return list;
}
/**
* Saves rules to lucene configuration in wga.xml
* @param lucene The lucene configuration element
* @param rules List of {@link LuceneIndexItemRule} objects to be saved
*/
public static void saveRules(Element lucene, List rules) {
Element itemrules = lucene.element("itemrules");
//remove old rules
itemrules.clearContent();
Iterator itemrulesIt = rules.iterator();
while (itemrulesIt.hasNext()) {
LuceneIndexItemRule rule = (LuceneIndexItemRule) itemrulesIt.next();
Element itemrule = itemrules.addElement("itemrule");
itemrule.addAttribute("indextype", rule.getIndexType());
itemrule.addAttribute("contenttype", rule.getContentType());
if (rule.isSortable()) {
itemrule.addAttribute("sortable", "true");
} else {
itemrule.addAttribute("sortable", "false");
}
itemrule.addAttribute("boost", Float.toString(rule.getBoost()));
itemrule.addText(rule.getItemExpression());
}
}
/**
* Returns the default rule for newly created databases
*/
public static LuceneIndexItemRule getDefaultRule() {
LuceneIndexItemRule rule = new LuceneIndexItemRule();
rule.setItemExpression("*");
rule.setIndexType(LuceneIndexItemRule.INDEX_TYPE_FULLTEXT);
rule.setContentType(LuceneIndexItemRule.CONTENT_TYPE_PLAINTEXT);
rule.setSortable(false);
rule.setBoost(1.0F);
return rule;
}
/**
* adds the default rule to the lucene.itemrules element
* @param lucene luceneElement
*/
public static void addDefaultRule(Element lucene) {
Element itemrules = lucene.element("itemrules");
LuceneIndexItemRule rule = getDefaultRule();
Element itemrule = itemrules.addElement("itemrule");
itemrule.addAttribute("indextype", rule.getIndexType());
itemrule.addAttribute("contenttype", rule.getContentType());
if (rule.isSortable()) {
itemrule.addAttribute("sortable", "true");
} else {
itemrule.addAttribute("sortable", "false");
}
itemrule.addAttribute("boost", Float.toString(rule.getBoost()));
itemrule.addText(rule.getItemExpression());
}
/**
* Determines if the rule item name pattern has a wildcard
*/
public boolean hasWildcard() {
if (this._itemExpression == null) {
return false;
}
if (this._itemExpression.indexOf(EXPRESSION_WILDCARD) != -1) {
return true;
} else {
return false;
}
}
/**
* Determines if this rule is a default rule that will apply to all items not taken by other rules
*/
public boolean isDefaultRule() {
if (this._itemExpression == null) {
return false;
}
if (this._itemExpression.trim().equals(EXPRESSION_WILDCARD)) {
return true;
} else {
return false;
}
}
/**
* Returns the content type of indexing as constant CONTENT_TYPE_...
*/
public String getContentType() {
return _contentType;
}
/**
* Sets the content type for indexing. This will trigger eventual parsing to extract text from the item format.
* Use constants CONTENT_TYPE...
* @param contentType
*/
public void setContentType(String contentType) {
_contentType = contentType;
}
/**
* Returns the indexing type as constant INDEX_TYPE...
*/
public String getIndexType() {
return _indexType;
}
/**
* Sets the indexing type. Use constant INDEX_TYPE..
* @param indexType
*/
public void setIndexType(String indexType) {
_indexType = indexType;
}
/**
* Returns the item name pattern
*/
public String getItemExpression() {
return _itemExpression;
}
/**
* Returns the item name pattern name without eventually trailing text after the wildcard
*/
public String getItemExpressionClearWildcard() {
if (this.isDefaultRule()) {
return "";
} else {
return _itemExpression.substring(0, _itemExpression.length()-1);
}
}
/**
* Sets the item name pattern. The pattern may end with a wildcard character {@link #EXPRESSION_WILDCARD}.
* @param itemExpression
*/
public void setItemExpression(String itemExpression) {
_itemExpression = itemExpression;
}
/**
* Returns if searches should be sortable by items applying to this rule
*/
public boolean isSortable() {
return _sortable;
}
/**
* Sets if searches should be sortable by items applying to this rule
* @param sortable
*/
public void setSortable(boolean sortable) {
_sortable = sortable;
}
public boolean equals(Object obj) {
if (obj instanceof LuceneIndexItemRule) {
LuceneIndexItemRule rule = (LuceneIndexItemRule) obj;
if ( (_contentType.equals(rule.getContentType())) &&
(_indexType.equals(rule.getIndexType())) &&
(_itemExpression.equals(rule.getItemExpression())) &&
(_sortable == rule.isSortable()) &&
(_boost == rule.getBoost()) ) {
return true;
} else {
return false;
}
} else {
return super.equals(obj);
}
}
/**
* Parses an items value according to the content type of this rule
* @param value The item value
* @return The parsed item value
*/
public String parseItemValue(String value) {
if (value != null) {
// if content type html/xml replace tags with '$' which is removed by analyzer
// so <table id="tab1"> is replaced with "$$$$$$$$$$$$$$$$$"
// this change was necessary for feature 'F0000367A' to ensure
// tokenStream by analyzer has the same length than the original text
if (getContentType().equals(LuceneIndexItemRule.CONTENT_TYPE_HTML_XML)) {
StringBuffer output = new StringBuffer();
char[] chars = value.toCharArray();
boolean tagOpen = false;
for (int i=0; i< chars.length; i++) {
if (chars[i] =='<') {
tagOpen = true;
}
if (tagOpen) {
output.append("$");
} else {
output.append(chars[i]);
}
if (chars[i] == '>') {
tagOpen = false;
}
}
value = output.toString();
}
}
return value;
}
/**
* returns the boost value of this rule
*/
public float getBoost() {
return _boost;
}
/**
* sets the boost value for this rule
* @param boost
* @throws IllegalArgumentException if boost is <=0
*/
public void setBoost(float boost) {
if (boost <= 0) {
throw new IllegalArgumentException("Boost value must be >0.");
}
_boost = boost;
}
}