// ---- operations, declared up front so the assembly below reads as one uninterrupted chain ----

// XPath operation that extracts the 'body' element of the XHTML document into the "body" field.
XPathGenerator bodyExtractor = new XPathGenerator( new Fields( "body" ), XPathOperation.NAMESPACE_XHTML, "//xhtml:body" );

// XPath operation that strips the markup: it selects text nodes only, skipping any text node whose
// parent element is named 'script'. Results are declared as the "words" field.
String elementXPath = "//text()[ name(parent::node()) != 'script']";
XPathGenerator elementRemover = new XPathGenerator( new Fields( "words" ), XPathOperation.NAMESPACE_XHTML, elementXPath );

// regex operation that breaks text into individual words. a match starts at a letter that is not preceded
// by a letter, runs over non-space characters, and ends at a letter that is not followed by a letter.
RegexGenerator wordGenerator = new RegexGenerator( new Fields( "word" ), "(?<!\\pL)(?=\\pL)[^ ]*(?<=\\pL)(?!\\pL)" );

// ---- assembly: page -> xml -> body -> words -> word, carrying "url" along at every stage ----

Pipe pipe = new Pipe( sourceName );

// convert the html in "page" to xhtml using the TagSoupParser. keep only "url" and "xml", discard the rest.
pipe = new Each(
  pipe,
  new Fields( "page" ),
  new TagSoupParser( new Fields( "xml" ) ),
  new Fields( "url", "xml" ) );

// apply the body XPath expression to the "xml" field. keep only "url" and "body".
pipe = new Each(
  pipe,
  new Fields( "xml" ),
  bodyExtractor,
  new Fields( "url", "body" ) );

// apply the text-node XPath expression to the "body" field. keep only "url" and "words".
pipe = new Each(
  pipe,
  new Fields( "body" ),
  elementRemover,
  new Fields( "url", "words" ) );

// apply the regex to the "words" field and stuff each word as a new tuple into the current stream
// with field names "url" and "word".
pipe = new Each(
  pipe,
  new Fields( "words" ),
  wordGenerator,
  new Fields( "url", "word" ) );