Package cascading.operation.regex

Examples of cascading.operation.regex.RegexSplitGenerator


    Pipe tweetPipe = new Pipe( "tweet" ); // name branch
    AssertMatches assertMatches = new AssertMatches( ".{6,150}" );
    tweetPipe = new Each( tweetPipe, AssertionLevel.STRICT, assertMatches );

    // create an operation that splits the text into a token stream
    RegexSplitGenerator splitter = new RegexSplitGenerator( new Fields( "token" ), " " );
    Fields outputSelector = new Fields( "uid", "token" );
    tweetPipe = new Each( tweetPipe, new Fields( "text" ), splitter, outputSelector );

    tweetPipe = new Unique( tweetPipe, Fields.ALL );
View Full Code Here


    Tap tfidfTap = new Hfs( new TextDelimited( true, "\t" ), tfidfPath );

    // specify a regex operation to split the "document" text lines into a token stream
    Fields token = new Fields( "token" );
    Fields text = new Fields( "text" );
    RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ \\[\\]\\(\\),.]" );
    Fields fieldSelector = new Fields( "doc_id", "token" );
    Pipe docPipe = new Each( "token", text, splitter, fieldSelector );

    // define "ScrubFunction" to clean up the token stream
    Fields scrubArguments = new Fields( "doc_id", "token" );
View Full Code Here

    Tap wcTap = new Hfs( new TextDelimited( true, "\t" ), wcPath );

    // specify a regex operation to split the "document" text lines into a token stream
    Fields token = new Fields( "token" );
    Fields text = new Fields( "text" );
    RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ \\[\\]\\(\\),.]" );
    // only returns "token"
    Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS );

    // determine the word counts
    Pipe wcPipe = new Pipe( "wc", docPipe );
View Full Code Here

    Tap leftSink = getPlatform().getTextFile( getOutputPath( "left" ), SinkMode.REPLACE );
    Tap rightSink = getPlatform().getTextFile( getOutputPath( "right" ), SinkMode.REPLACE );

    Pipe uniques = new Pipe( "unique" );

    uniques = new Each( uniques, new Fields( "line" ), new RegexSplitGenerator( new Fields( "word" ), "\\s" ) );

    uniques = new GroupBy( uniques, new Fields( "word" ) );

    uniques = new Every( uniques, new Fields( "word" ), new First( Fields.ARGS ), Fields.REPLACE );
View Full Code Here

TOP

Related Classes of cascading.operation.regex.RegexSplitGenerator

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.