* @todo the extra fields will use the same stored|indexed setting as defined
* on HtmlParser.stored and HtmlParser.indexed. It would be nice to have the
* possibility to optionally override it for each field.
*/
public HtmlParser() {
    final Config config = Config.getConfig("indexer.properties");

    // Tag -> field pairs. The two configured lists are parallel, so they
    // must contain the same number of entries.
    final String[] sourceTags = config.getStringArray("HtmlParser.inputTagNames");
    final String[] targetFields = config.getStringArray("HtmlParser.outputFieldNames");
    final int pairCount = sourceTags.length;
    if (pairCount != targetFields.length) {
        throw new IllegalArgumentException("Length of inputTagName list does not match length of outputFieldName list.");
    }
    tags = new HashSet<Pair<String, String>>();
    for (int pos = 0; pos < pairCount; pos++) {
        tags.add(new Pair<String, String>(sourceTags[pos], targetFields[pos]));
    }

    // Settings handed over to the underlying parser.
    final String xpathToRemove = config.getString("HtmlParser.removedXPath");
    final String[] separators = config.getStringArray("HtmlParser.separatorTags");

    // Extra field mapping: kept both as a lookup map (for the underlying
    // parser) and as a list of the first element of each pair, in
    // configuration order.
    final List<Pair<String,String>> extraPairs = config.getPairList("HtmlParser.extraFieldMapping");
    final Map<String,String> extraLookup = new HashMap<String,String>();
    extraFields = new ArrayList<String>(extraPairs.size());
    for (Pair<String,String> entry : extraPairs) {
        extraLookup.put(entry.first(), entry.last());
        extraFields.add(entry.first());
    }
    parser = new com.flaptor.util.parser.HtmlParser(xpathToRemove, separators, extraLookup);

    // At least one of stored/indexed has to be enabled.
    STORED = config.getBoolean("HtmlParser.stored");
    INDEXED = config.getBoolean("HtmlParser.indexed");
    if (!STORED && !INDEXED) {
        throw new IllegalArgumentException("constructor: both indexed an stored are set to false in the configuration.");
    }
}