Map<String,Object> indexableAttributes = doc.getIndexableAttributes();
// build xml doc
org.dom4j.Document dom = DocumentHelper.createDocument();
Element root = dom.addElement("documentAdd");
Page page = doc.getPage();
String text = doc.getText();
String url = page.getUrl();
String host = getHost(url);
String title = doc.getTitle(titleLengthLimit);
String tokenizedHost = tokenizeHost(host);
String anchorText = getAnchorText(page);
float categoryBoost = calculateCategoryBoost(attributes);
float pagerankBoost = calculatePagerankBoost(page);
float spamrankBoost = calculateSpamrankBoost(page);
float logBoost = calculateLogBoost(page);
float freshnessBoost = calculateFreshnessBoost(page);
// add overall score
float f1 = factor("category",categoryBoost,categoryBoostDamp);
float f2 = factor("pagerank",pagerankBoost,pagerankBoostDamp);
float f3 = factor("spamrank",spamrankBoost,spamrankBoostDamp);
float f4 = factor("log",logBoost,logBoostDamp);
float f5 = factor("freshness",freshnessBoost,freshnessBoostDamp);
float f6 = ((Double)attributes.get("boost")).floatValue(); // as calculated by the boost module, or 1.0 if no boost module is defined.
float boost = f1 * f2 * f3 * f4 * f5 * f6;
// System.out.println("BOOST url=["+url+"] category="+f1+" ("+categoryBoost+":"+categoryBoostDamp+") pagerank="+f2+" ("+pagerankBoost+":"+pagerankBoostDamp+") log="+f3+" ("+logBoost+":"+logBoostDamp+") freshness="+f4+" ("+freshnessBoost+":"+freshnessBoostDamp+") moduleBoost="+f5+" Boost="+boost);
if (boost < 1e-6f) {
logger.warn("Boost too low! ("+boost+") category="+f1+" ("+categoryBoost+":"+categoryBoostDamp+") pagerank="+f2+" ("+pagerankBoost+":"+pagerankBoostDamp+") spamrank="+f3+" ("+spamrankBoost+":"+spamrankBoostDamp+") log="+f4+" ("+logBoost+":"+logBoostDamp+") freshness="+f5+" ("+freshnessBoost+":"+freshnessBoostDamp+") moduleBoost="+f6);
boost = 1e-6f;
}
if (null == title || "".equals(title)) {
title = "Untitled";
}
root.addElement("boost").addText(String.valueOf(boost));
root.addElement("documentId").addText(getDocumentId(page));
Map<String,Double> boostMap = (Map<String,Double>)attributes.get("field_boost");
// add the search fields
addField(root, "url", url, true, true, true, boostMap);
addField(root, "site", host, true, true, false, boostMap);
addField(root, "tokenizedHost", tokenizedHost, false, true, true, boostMap);
addField(root, "title", title, true, true, true, boostMap);
addField(root, "text", text, true, true, true, boostMap);
addField(root, "anchor", anchorText, false, true, true, boostMap);
addField(root, "crawl", crawlName, false, true, true, boostMap);
if (sendContent) {
addBody(root,doc,content);
}
// for debugging only
//addField(root, "boostinfo", "category="+f1+" ("+categoryBoost+":"+categoryBoostDamp+") pagerank="+f2+" ("+pagerankBoost+":"+pagerankBoostDamp+") log="+f3+" ("+logBoost+":"+logBoostDamp+") freshness="+f4+" ("+freshnessBoost+":"+freshnessBoostDamp+") moduleBoost="+f5+" Boost="+boost, true, false, false, null);
addAdditionalFields(root, page, boostMap);
// Adding metainfo from attributes
Set<Entry<String,Object>> attributeSet = indexableAttributes.entrySet();
for (Entry<String,Object> attribute : attributeSet) {
addField(root, attribute.getKey(), attribute.getValue() == null ? "" : attribute.getValue().toString(), true, true, true, boostMap);
}
StringBuffer assignedCategories = new StringBuffer();
if (null != categories) {
// iterate through the categories the page belongs to and append each one to the list (score-based boosting is disabled, see the commented-out loop)
for (Iterator<String> iter = categories.iterator(); iter.hasNext();) {
assignedCategories.append(iter.next());
assignedCategories.append(" ");
// repeat the field a number of times proportional to the score (this is a way to boost the document by category):
//for (int rep = 0; rep < score*10; rep++) {
// addField(root, "categoryBoost", categ, false, true, false);
//}
}
addField(root, "categories", assignedCategories.toString().trim(), true, true, true, boostMap);
}
if (logger.isDebugEnabled()) {
logger.debug("Indexing dom: " + DomUtil.domToString(dom));
}
// Send the document to the indexer. If the queue is full, wait and retry.
try {
int i = pageMapper.mapPage(page);
while (indexers[i].index(dom) == IndexerReturnCode.RETRY_QUEUE_FULL) {
try {
Thread.sleep(indexerBusyRetryTime*1000);
} catch (InterruptedException e) {
logger.debug("Sleep interrupted: " + e, e);
}
}
page.setEmitted(true);
} catch (Exception e) {
logger.error(e,e);
}
}