Package etc.aloe.oilspill2010

Source Code of etc.aloe.oilspill2010.FeatureGenerationImpl

/*
* This file is part of ALOE.
*
* ALOE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.

* ALOE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.

* You should have received a copy of the GNU General Public License
* along with ALOE.  If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (c) 2012 SCCL, University of Washington (http://depts.washington.edu/sccl)
*/
package etc.aloe.oilspill2010;

import etc.aloe.data.ExampleSet;
import etc.aloe.data.FeatureSpecification;
import etc.aloe.filters.SimpleStringToWordVector;
import etc.aloe.filters.SimpleStringToWordVector.NoNonsenseStemmer;
import java.util.List;
import weka.filters.supervised.attribute.AttributeSelection;
import weka.attributeSelection.Ranker;
import weka.attributeSelection.ReliefFAttributeEval;
import weka.core.Instances;
import weka.core.SelectedTag;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.PrincipalComponents;
import weka.filters.unsupervised.attribute.StringToWordVector;
import weka.filters.unsupervised.instance.SparseToNonSparse;

/**
* Generates a set of filters that extract the desired features from message
* texts.
*
* Features include words, emoticons, pronouns, punctuations, and other strings.
*
* @author Michael Brooks <mjbrooks@uw.edu>
*/
public class FeatureGenerationImpl extends etc.aloe.cscw2013.FeatureGenerationImpl {
   
    /**
     * Construct a new FeatureGeneration implementation.
     *
     * @param emoticonDictionary The list of emoticons to look for in the
     * messages.
     */
    public FeatureGenerationImpl(List<String> emoticonDictionary) {
        super(emoticonDictionary);
       
        //Change the default number of participant features
        this.participantFeatures = 20;
    }
   
    @Override
    public FeatureSpecification generateFeatures(ExampleSet basicExamples) {

        ExampleSet examples = basicExamples.copy();
        FeatureSpecification spec = new FeatureSpecification();

        System.out.print("Configuring features over " + examples.size() + " examples... ");

        try {
            spec.addFilter(getPronounsFilter(examples));
            spec.addFilter(getPunctuationFilter(examples));
            spec.addFilter(getSpecialWordsFilter(examples));
            spec.addFilter(getSpellingFilter(examples));

            spec.addFilter(getEmoticonsFilter(examples));
            spec.addFilter(getBagOfWordsFilter(examples));
           
            if (this.getParticipantFeatureCount() > 0) {
                spec.addFilter(getParticipantsFilter(examples));
            } else {
                spec.addFilter(getRemoveParticipantFilter(examples));
            }
           
            spec.addFilter(getRemoveIDFilter(examples));
            //spec.addFilter(getSparseToNonsparseFilter(examples));
            //spec.addFilter(getFeatureSelectionFilter(examples));
           
            Instances output = spec.getOutputFormat();
            int numAttrs = output.numAttributes();
            System.out.println("generated " + (numAttrs - 1) + " features.");
        } catch (Exception e) {
            System.err.println("Error generating features.");
            System.err.println("\t" + e.getMessage());
        }

        return spec;
    }
   
    protected Filter getSparseToNonsparseFilter(ExampleSet examples) throws Exception {
        SparseToNonSparse filter = new SparseToNonSparse();
       
        filter.setInputFormat(examples.getInstances());
        Instances filtered = Filter.useFilter(examples.getInstances(), filter);
        examples.setInstances(filtered);
       
        return filter;
    }
   
    protected Filter getFeatureSelectionFilter(ExampleSet examples) throws Exception {
       
        AttributeSelection filter = new AttributeSelection()// package weka.filters.supervised.attribute!
        //CfsSubsetEval eval = new CfsSubsetEval();
       
        //CorrelationAttributeEval eval = new CorrelationAttributeEval();
        //InfoGainAttributeEval eval = new InfoGainAttributeEval();
       
        ReliefFAttributeEval eval = new ReliefFAttributeEval();
       
        //GreedyStepwise search = new GreedyStepwise();
        //search.setNumToSelect(980);
        //search.setSearchBackwards(true);
       
        Ranker search = new Ranker();
        search.setNumToSelect(980);
       
        filter.setEvaluator(eval);
        filter.setSearch(search);
       
        filter.setInputFormat(examples.getInstances());
        Instances filtered = Filter.useFilter(examples.getInstances(), filter);
        examples.setInstances(filtered);
       
        return filter;
    }
   
    protected Filter getFeatureReductionFilter(ExampleSet examples) throws Exception {
        PrincipalComponents filter = new PrincipalComponents();
        filter.setMaximumAttributes(10);
       
        filter.setInputFormat(examples.getInstances());
        Instances filtered = Filter.useFilter(examples.getInstances(), filter);
        examples.setInstances(filtered);
       
        return filter;
    }
   
    /**
     * Get a bag of words filter based on the provided examples.
     *
     * @param examples
     * @return
     * @throws Exception
     */
    @Override
    protected Filter getBagOfWordsFilter(ExampleSet examples) throws Exception {
        SimpleStringToWordVector filter = new SimpleStringToWordVector();
        filter.setAttributeNamePrefix(BAG_OF_WORDS_FEATURE_PREFIX);
        filter.setStringAttributeName(ExampleSet.MESSAGE_ATTR_NAME);

        //This is stupid because it depends on how much data you use
        //bagger.setMinTermFreq(20);
       
        filter.setDoNotOperateOnPerClassBasis(true);
        filter.setWordsToKeep(3000);
        filter.setLowerCaseTokens(true);

        //use stemming and remove "nonsense"
        filter.setStemmer(new NoNonsenseStemmer(true));

        filter.setTFTransform(true);
        filter.setIDFTransform(true);
        filter.setNormalizeDocLength(new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL, StringToWordVector.TAGS_FILTER));

        filter.setOutputWordCounts(true);

        filter.setInputFormat(examples.getInstances());
        Instances filtered = Filter.useFilter(examples.getInstances(), filter);
        examples.setInstances(filtered);

        return filter;
    }
   
}
TOP

Related Classes of etc.aloe.oilspill2010.FeatureGenerationImpl

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.