// Source Code of edu.ucla.sspace.hadoop.RawTextCooccurrenceMapper

/*
 * Copyright 2010 David Jurgens
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */


package edu.ucla.sspace.hadoop;


import edu.ucla.sspace.text.IteratorFactory;


import edu.ucla.sspace.util.HadoopResourceFinder;
import edu.ucla.sspace.util.ResourceFinder;


import java.io.IOException;
import java.io.IOError;


import java.util.Properties;


import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.apache.hadoop.mapreduce.lib.output.*;
import org.apache.hadoop.util.*;


import static edu.ucla.sspace.text.IteratorFactory.ITERATOR_FACTORY_PROPERTIES;




/**
 * A {@link Mapper} implementation that maps a the text values of a document to
 * the word co-occurrences.  This class is intended to be used with the {@link
 * TextInputFormat} where the incoming text files are mapped to byte offsets and
 * the text contained there-in.  The input key values are not interpreted by
 * this mapper, only the text values.
 *
 * <p>This class defines the following configurable properties that may be set
 * using {@link Properties} constructor to {@link HadoopJob}.  Note that
 * setting these properties with the {@link System} properties will have no
 * effect on this class.
 *
 * <dl style="margin-left: 1em">
 *
 * <dt> <i>Property:</i> <code><b>{@value edu.ucla.sspace.hadoop.CooccurrenceExtractor#WINDOW_SIZE_PROPERTY}
 *      </b></code> <br>
 *      <i>Default:</i> {@value edu.ucla.sspace.hadoop.CooccurrenceExtractor#DEFAULT_WINDOW_SIZE}
 *
 * <dd style="padding-top: .5em">This property sets the number of words before
 *      and after that are counted as co-occurring.  With the default value,
 *      {@value
 *      edu.ucla.sspace.hadoop.CooccurrenceExtractor#DEFAULT_WINDOW_SIZE} words
 *      are counted before and {@value
 *      edu.ucla.sspace.hadoop.CooccurrenceExtractor#DEFAULT_WINDOW_SIZE} words
 *      are counter after.  This class always uses a symmetric window. <p>
 *
 * </dl>
 *
 * @see HadoopJob#HadoopJob(Class,Class,Class,Class,Class,Class,Properties)
 */
public class RawTextCooccurrenceMapper 
        extends Mapper<LongWritable,Text,Text,TextIntWritable> {


    /**
     * The object responsible for performing all the tokenization and
     * co-occurrence extraction from a {@link Text} object.
     */
    private CooccurrenceExtractor extractor;
    
    public RawTextCooccurrenceMapper() { }


    /**
     * Initializes all the properties for this particular mapper.  This process
     * includes setting up the window size and configuring how the input
     * documents will be tokenized.
     */
    protected void setup(Mapper.Context context) {
        Configuration conf = context.getConfiguration();
        extractor = new CooccurrenceExtractor(conf);


        // Set up the IteratorFactory properties           
        Properties props = new Properties();
        for (String property : ITERATOR_FACTORY_PROPERTIES) {
            String propVal = conf.get(property);
            if (propVal != null)
                props.setProperty(property, propVal);
        }
        
        // Create the ResourceFinder that the IteratorFactory will use to find
        // the various files on HDFS
        ResourceFinder hadoopRf = null;
        try {
            hadoopRf = new HadoopResourceFinder(conf);
        } catch (IOException ioe) {
            throw new IOError(ioe);
        }


        // Set the IteratorFactory to locate the resources and then have it
        // reconfigure itself based on the user specified properties
        IteratorFactory.setResourceFinder(hadoopRf);        
        IteratorFactory.setProperties(props);
    }




    /**
     * Processes the tokens in the {@code value} and writes a set of tuples
     * mapping a word to the other words it co-occurs with and the relative
     * position of those co-occurrences.  The key to this method is ignored.
     *
     * @param key the byte offset of the document in the input corpus
     * @param value the document that will be segmented into tokens and
     *        mapped to cooccurrences
     * @param context the context in which this mapper is executing
     */
    public void map(LongWritable key, Text value, Context context) 
            throws IOException, InterruptedException {
        extractor.processDocument(value, context);
    }


}
// Source Code of edu.ucla.sspace.hadoop.RawTextCooccurrenceMapper

// Related Classes of edu.ucla.sspace.hadoop.RawTextCooccurrenceMapper