*/
public void convert(INewInformation info) throws RpException {
    logger.info("WebExtractor handling location: " + info.getUri() +
            " with level " + info.getLevel());
    Spider spider = new Spider(info.getUri(), getMaxLengthSummary());
    spider.start();
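    // Assumption: despite the thread-like name, Spider.start() appears to
    // fetch and parse the page synchronously, since its results (title,
    // description, text values and outbound links) are read immediately below.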
    // Process the content of the fetched document.
    // First, record every discovered link as a keyword tuple.
    NodeStruct node = new NodeStruct();
    for (int i = 0; i < spider.getLinks().size(); i++) {
        String uri = ((URL) spider.getLinks().get(i)).toString();
        node.addTuple(TupleStruct.KEYWORD_NAME, uri);
    }
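    // Assumption based on the accessor names: notIgnoreChars matches the
    // characters to keep and replaceChars matches those to strip, so the two
    // patterns drive the tokenization done by UtilExtract.getValueList().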
    Pattern notIgnorePattern = Pattern.compile(getNotIgnoreChars());
    Pattern replacePattern = Pattern.compile(getReplaceChars());
    for (int i = 0; i < spider.getValues().size(); i++) {
        String value = (String) spider.getValues().get(i);
        // Split the spidered text value into a list of words.
        LinkedList listWords = UtilExtract.getValueList(value,
                getMinLengthWord(), notIgnorePattern, replacePattern);
        for (int j = 0; j < listWords.size(); j++) {
            node.addTuple(TupleStruct.KEYWORD_GENERIC,
                    (String) listWords.get(j));
        }
    }
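    // At this point the node holds one tuple per link (KEYWORD_NAME) and one
    // per extracted word (KEYWORD_GENERIC); together they form the indexable
    // content of the document built below.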
    // Build a DocumStruct describing the fetched page.
    DocumStruct doc = new DocumStruct();
    doc.setTitle(spider.getTitle());
    doc.setPath(spider.getUri());
    doc.setDescription(spider.getDescription());
    doc.setContent(node);
    doc.setCategoryName(info.getCategoryName());
    doc.setCategoryLocation(info.getCategoryLocation());
    // Store the document and update the index.
    PluginManager.storeAndAddDocument(doc);
    logger.debug("Level of the information is " + info.getLevel());
    // Follow the links only while the crawl depth (level) is still positive.
    if (info.getLevel() > 0) {
        // Queue each discovered link for crawling at a decremented depth,
        // so the recursion terminates once the level reaches zero.
        for (int i = 0; i < spider.getLinks().size(); i++) {
            String uriLink = ((URL) spider.getLinks().get(i)).toString();
            logger.debug("Processing link: " + uriLink);
            AddInfo addInfo = new AddInfo(info.getCategoryLocation(),
                    info.getCategoryName(), uriLink, info.getLevel() - 1);