Tap tfidfTap = new Hfs( new TextDelimited( true, "\t" ), tfidfPath );
// specify a regex operation to split the "document" text lines into a token stream
Fields token = new Fields( "token" );
Fields text = new Fields( "text" );
RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ \\[\\]\\(\\),.]" );
Fields fieldSelector = new Fields( "doc_id", "token" );
Pipe docPipe = new Each( "token", text, splitter, fieldSelector );
// define "ScrubFunction" to clean up the token stream
Fields scrubArguments = new Fields( "doc_id", "token" );