Package org.apache.mahout.pig.encoders

Source Code of org.apache.mahout.pig.encoders.Schema

package org.apache.mahout.pig.encoders;

import org.antlr.runtime.ANTLRReaderStream;
import org.antlr.runtime.CommonTokenStream;
import org.antlr.runtime.RecognitionException;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Map;

/**
* A schema specifies the names and types for positional parameters in a tuple.  These names
* and types are used when encoding tuples as vectors using PigVector.
* <p/>
* This class contains convenience routines that simplify the use of the schema parser classes.
* <p/>
* A schema consists of a comma separated list of variable specifications.  A variable
* specification contains a name plus optional absolute position and a type separated by a colon.
* The name follows the rules of java identifiers.  The optional absolute position is indicated
* by a dollar sign and a number.  The number indicates the 0-based position of the variable
* in any tuple being encoded.  Variables without positions are assigned consecutive positions
* starting at 0 in the tuple.
* <p/>
* The type specification can be numeric, word or text.  For text, you can supply an optional
* parenthesis surrounded fully qualified name of a Java class that implements the Analyzer
* interface from Lucene.  If you don't supply an analyzer, the text is tokenized on
* whitespace boundaries.
* <p/>
* Here is an example of a schema:
* <pre>
* a:numeric, b$0:word, c$1:text(org.apache.lucene.analysis.en.EnglishAnalyzer), d:text
* </pre>
* Here, argument 0 in the encoded tuple is encoded as a number and as a key word while
* argument 1 in the encoded tuple is encoded as text using two different methods for
* analysis.
*/
public class Schema {
    public static Map<String, ArgumentEncoder> parse(Reader reader) {
        SchemaParser tokenParser;
        try {
            CommonTokenStream input = new CommonTokenStream(new SchemaLexer(new ANTLRReaderStream(reader)));
            tokenParser = new SchemaParser(input);
            // parsing causes side effect on tokenParser
            tokenParser.schema();
        } catch (IOException e) {
            throw new SchemaParseException("Cannot parse schema", e);
        } catch (RecognitionException e) {
            throw new SchemaParseException("Cannot parse schema", e);
        }
        return tokenParser.getEncoders();
    }

    public static Map<String, ArgumentEncoder> parse(String s) {
        final StringReader input = new StringReader(s);
        Map<String, ArgumentEncoder> r = parse(input);
        input.close();
        return r;
    }
}
TOP

Related Classes of org.apache.mahout.pig.encoders.Schema

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.