Source Code of org.apache.mahout.pig.encoders.Schema

package org.apache.mahout.pig.encoders;


import org.antlr.runtime.ANTLRReaderStream;
import org.antlr.runtime.CommonTokenStream;
import org.antlr.runtime.RecognitionException;


import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Map;


/**
 * A schema specifies the names and types for positional parameters in a tuple.  These names
 * and types are used when encoding tuples as vectors using PigVector.
 * <p/>
 * This class contains convenience routines that simplify the use of the schema parser classes.
 * <p/>
 * A schema consists of a comma separated list of variable specifications.  A variable
 * specification contains a name plus optional absolute position and a type separated by a colon.
 * The name follows the rules of java identifiers.  The optional absolute position is indicated
 * by a dollar sign and a number.  The number indicates the 0-based position of the variable
 * in any tuple being encoded.  Variables without positions are assigned consecutive positions
 * starting at 0 in the tuple.
 * <p/>
 * The type specification can be numeric, word or text.  For text, you can supply an optional
 * parenthesis surrounded fully qualified name of a Java class that implements the Analyzer
 * interface from Lucene.  If you don't supply an analyzer, the text is tokenized on
 * whitespace boundaries.
 * <p/>
 * Here is an example of a schema:
 * <pre>
 * a:numeric, b$0:word, c$1:text(org.apache.lucene.analysis.en.EnglishAnalyzer), d:text
 * </pre>
 * Here, argument 0 in the encoded tuple is encoded as a number and as a key word while
 * argument 1 in the encoded tuple is encoded as text using two different methods for
 * analysis.
 */
public class Schema {
    public static Map<String, ArgumentEncoder> parse(Reader reader) {
        SchemaParser tokenParser;
        try {
            CommonTokenStream input = new CommonTokenStream(new SchemaLexer(new ANTLRReaderStream(reader)));
            tokenParser = new SchemaParser(input);
            // parsing causes side effect on tokenParser
            tokenParser.schema();
        } catch (IOException e) {
            throw new SchemaParseException("Cannot parse schema", e);
        } catch (RecognitionException e) {
            throw new SchemaParseException("Cannot parse schema", e);
        }
        return tokenParser.getEncoders();
    }


    public static Map<String, ArgumentEncoder> parse(String s) {
        final StringReader input = new StringReader(s);
        Map<String, ArgumentEncoder> r = parse(input);
        input.close();
        return r;
    }
}
Source Code of org.apache.mahout.pig.encoders.Schema

Related Classes of org.apache.mahout.pig.encoders.Schema