Package simhash.examples

Source Code of simhash.examples.SimpleSimhash

/*
    Copyright 2010 Nate Murray
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

package simhash.examples;

import org.apache.log4j.Logger;
import cascading.flow.Flow;
import cascading.pipe.*;
import cascading.scheme.TextDelimited;
import cascading.tap.Hfs;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascalog.StdoutTap;
import clojure.lang.AFn;
import simhash.Simhash;

/**
* Simple Simhash - an example of how to use Simhash
*
* To run this example:
*   lein uberjar
*   lein classpath > classpath
*   java -cp `cat classpath`:build/cascading-simhash-1.0.0-SNAPSHOT-standalone.jar simhash.examples.SimpleSimhash "test-resources/test-documents.txt"
**/
public class SimpleSimhash {
  private static final Logger LOG = Logger.getLogger( SimpleSimhash.class );

  /**
   * Create a tokenizer that is a subclass of clojure.lang.AFn and
   * implements invoke(Object body)
   **/
  public static class Tokenizer extends AFn {

    /**
     * Your tokenization logic goes here
     *
     * @param String body
     * @return something seq-able
     */
    public Object invoke(Object body) throws Exception {
      String b = (String)body;
      return b.split(" ");
    }
  }

  public static void main( String[] args ) {
    Tap inputTap = new Hfs( new TextDelimited(
                                new Fields("docid", "body"), "\t" ),
                            args[0] );
    Tap outputTap = new StdoutTap();

    // create the flow
    Flow simhashFlow = Simhash.simhash(inputTap, outputTap,
                                       2, // combine n-th lowest minhashes (e.g. 2)
                                       SimpleSimhash.Tokenizer.class);
    simhashFlow.complete(); // or add to your Cascade, etc
  }
}
TOP

Related Classes of simhash.examples.SimpleSimhash

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.