Package com.flaptor.hounder.crawler.modules

Source Code of com.flaptor.hounder.crawler.modules.SpamDetectorModule

/*
Copyright 2008 Flaptor (flaptor.com)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package com.flaptor.hounder.crawler.modules;

import com.flaptor.hounder.crawler.UrlPatterns;
import java.io.IOException;

import org.apache.log4j.Logger;

import com.flaptor.hounder.crawler.pagedb.Page;
import com.flaptor.util.Config;
import com.flaptor.util.Execute;

/**
* Tries to detect spam
* @author Flaptor Development Team
*/
public class SpamDetectorModule extends AProcessorModule {

    private static final Logger logger = Logger.getLogger(Execute.whoAmI());
    private int maxTitleLength = 0;
    private UrlPatterns patterns; // list of grep patterns a url must match to become a hotspot.
    private boolean titleSpamActive;
    private boolean urlMatchSpamActive;

  /**
     * Get the module configuration.
   */
    public SpamDetectorModule (String name, Config globalConfig) throws IOException {
        super(name, globalConfig);
        maxTitleLength = getModuleConfig().getInt("max.title.length");
        titleSpamActive = (maxTitleLength > 0);
        String urlPatternFile = getModuleConfig().getString("url.pattern.file");
        urlMatchSpamActive = !("".equals(urlPatternFile));
        patterns = new UrlPatterns(urlPatternFile);
    }

    //    @Override
    public void internalProcess (FetchDocument doc) {
        try {
            Page page = doc.getPage();
            float spamValue = 0;
           
            if (titleSpamActive) spamValue += titleSpamValue(doc.getTitle());
            if (urlMatchSpamActive) spamValue += urlMatchSpamValue(page);

            page.setAntiScore(spamValue);
        } catch (NullPointerException e) {
            logger.error(e,e);
        }
    }

    /**
     * Long titles are indicative of spam.
     * @param title
     * @return
     */
    private float titleSpamValue(String title) {
        if (title.length() > maxTitleLength) {
            return 0.3f;
        } else {
            return 0;
        }
    }
   
    /**
     * If the page url matches the provided spam url patterns, it is spam.
     * @param page
     * @return
     */
    private float urlMatchSpamValue(Page page) {
        if (patterns.match(page.getUrl())) {
            return 1f;
        } else {
            return 0;
        }
    }

   
    public void close() {
        patterns.close();
    }

   
   
}
TOP

Related Classes of com.flaptor.hounder.crawler.modules.SpamDetectorModule

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.