// Text extraction, word splitting and per-url word counting.

// XPath selecting every text node whose parent node is not named 'script', i.e. text found
// inside script elements is left out of the word stream.
String elementXPath = "//text()[ name(parent::node()) != 'script']";
// Generator declaring a single result field "words"; the expression is evaluated with the
// XHTML namespace constant.
// NOTE(review): presumably emits one tuple per matched text node - confirm against the XPathGenerator docs.
XPathGenerator elementRemover = new XPathGenerator( new Fields( "words" ), XPathOperation.NAMESPACE_XHTML, elementXPath );
// feed only the "body" field to the generator and keep "url" and "words" in the outgoing stream
pipe = new Each( pipe, new Fields( "body" ), elementRemover, new Fields( "url", "words" ) );
// apply the regex to break the document into individual words and stuff each word as a new tuple into the current
// stream with field names "url" and "word"
//
// The regex matches a run of non-space characters that starts at a letter not preceded by a letter and ends on
// a letter not followed by a letter. Because the middle part is "[^ ]*", a match may contain non-letter
// characters (apostrophes, hyphens, digits, ...) as long as its first and last characters are letters.
RegexGenerator wordGenerator = new RegexGenerator( new Fields( "word" ), "(?<!\\pL)(?=\\pL)[^ ]*(?<=\\pL)(?!\\pL)" );
// feed only the "words" field to the generator and keep "url" and "word" in the outgoing stream
pipe = new Each( pipe, new Fields( "words" ), wordGenerator, new Fields( "url", "word" ) );
// group on the ("url", "word") pair, so each group holds the occurrences of one word for one url;
// the resulting pipe is named after sinkUrlName
Pipe urlCountPipe = new GroupBy( sinkUrlName, pipe, new Fields( "url", "word" ) );
// count the tuples in every ("url", "word") group and emit "url", "word" and "count"
urlCountPipe = new Every( urlCountPipe, new Fields( "url", "word" ), new Count(), new Fields( "url", "word", "count" ) );