*/
public void convert(INewInformation info) throws RpException {
    logger.info("WebExtractor handling location: " + info.getUri() +
            " with level " + info.getLevel());
    Spider spider = new Spider(info.getUri(), getMaxLengthSummary());
    spider.start();
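    // Assumption: despite the thread-like name, Spider.start() appears to
    // fetch and parse the page synchronously, since its results (title,
    // description, text values and outbound links) are read immediately below.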
    // Process the content of the fetched document.
    // First, record every discovered link as a keyword tuple.
    NodeStruct node = new NodeStruct();
    for (int i = 0; i < spider.getLinks().size(); i++) {
        String uri = ((URL) spider.getLinks().get(i)).toString();
        node.addTuple(TupleStruct.KEYWORD_NAME, uri);
    }
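    // Assumption based on the accessor names: notIgnoreChars matches the
    // characters to keep and replaceChars matches those to strip, so the two
    // patterns drive the tokenization done by UtilExtract.getValueList().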
    Pattern notIgnorePattern = Pattern.compile(getNotIgnoreChars());
    Pattern replacePattern = Pattern.compile(getReplaceChars());
    for (int i = 0; i < spider.getValues().size(); i++) {
        String value = (String) spider.getValues().get(i);
        // Split the spidered text value into a list of words.
        LinkedList listWords = UtilExtract.getValueList(value,
                getMinLengthWord(), notIgnorePattern, replacePattern);
        for (int j = 0; j < listWords.size(); j++) {
            node.addTuple(TupleStruct.KEYWORD_GENERIC,
                    (String) listWords.get(j));
        }
    }
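    // At this point the node holds one tuple per link (KEYWORD_NAME) and one
    // per extracted word (KEYWORD_GENERIC); together they form the indexable
    // content of the document built below.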
    // Build a DocumStruct describing the fetched page.
    DocumStruct doc = new DocumStruct();
    doc.setTitle(spider.getTitle());
    doc.setPath(spider.getUri());
    doc.setDescription(spider.getDescription());
    doc.setContent(node);
    doc.setCategoryName(info.getCategoryName());
    doc.setCategoryLocation(info.getCategoryLocation());
    // Store the document and update the index.
    PluginManager.storeAndAddDocument(doc);
    logger.debug("Level of the information is " + info.getLevel());
    // Follow the links only while the crawl depth (level) is still positive.
    if (info.getLevel() > 0) {
        // Queue each discovered link for crawling at a decremented depth,
        // so the recursion terminates once the level reaches zero.
        for (int i = 0; i < spider.getLinks().size(); i++) {
            String uriLink = ((URL) spider.getLinks().get(i)).toString();
            logger.debug("Processing link: " + uriLink);
            AddInfo addInfo = new AddInfo(info.getCategoryLocation(),
                    info.getCategoryName(), uriLink, info.getLevel() - 1);