Package simhash.examples

Source Code of simhash.examples.HtmlSimhash

/*
    Copyright 2010 Nate Murray
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

package simhash.examples;

import org.apache.log4j.Logger;
import cascading.flow.Flow;
import cascading.pipe.*;
import cascading.scheme.TextDelimited;
import cascading.tap.Hfs;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascalog.StdoutTap;
import clojure.lang.AFn;
import simhash.Simhash;
import simhash.tokenizers.HtmlText;

/**
* Html Simhash - an example of how to use Simhash
*
* To run this example:
*   lein uberjar
*   lein classpath > classpath
*   java -cp `cat classpath`:build/cascading-simhash-1.0.0-SNAPSHOT-standalone.jar simhash.examples.HtmlSimhash "test-resources/test-html-documents.txt"
**/
public class HtmlSimhash {
  private static final Logger LOG = Logger.getLogger( HtmlSimhash.class );

  public static void main( String[] args ) {
    Tap inputTap = new Hfs( new TextDelimited(
                                new Fields("docid", "body"), "  " ),
                            args[0] );
    Tap outputTap = new StdoutTap();

    // create the flow
    Flow simhashFlow = Simhash.simhash(inputTap, outputTap, 1, HtmlText.tokenizer(3));
    simhashFlow.complete(); // or add to your Cascade, etc
  }
}
TOP

Related Classes of simhash.examples.HtmlSimhash

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.