Package cc.twittertools.corpus.demo

Source Code of cc.twittertools.corpus.demo.IndexStatuses$ConstantNormSimilarity

package cc.twittertools.corpus.demo;

import java.io.File;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import cc.twittertools.corpus.data.JsonStatusBlockReader;
import cc.twittertools.corpus.data.JsonStatusCorpusReader;
import cc.twittertools.corpus.data.Status;
import cc.twittertools.corpus.data.StatusStream;

/**
* Reference implementation for indexing statuses.
*/
public class IndexStatuses {
  private static final Logger LOG = Logger.getLogger(IndexStatuses.class);

  public static final Analyzer ANALYZER = new TweetAnalyzer(Version.LUCENE_31);

  private IndexStatuses() {}

  public static enum StatusField {
    ID("id"),
    SCREEN_NAME("screen_name"),
    CREATED_AT("create_at"),
    TEXT("text"),
    DAY("day");

    public final String name;

    StatusField(String s) {
      name = s;
    }
  };

  private static final String INPUT_OPTION = "input";
  private static final String INDEX_OPTION = "index";

  @SuppressWarnings("static-access")
  public static void main(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("input directory or file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("index location").create(INDEX_OPTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
      cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
      System.err.println("Error parsing command line: " + exp.getMessage());
      System.exit(-1);
    }

    if (!(cmdline.hasOption(INPUT_OPTION) && cmdline.hasOption(INDEX_OPTION))) {
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp(IndexStatuses.class.getName(), options);
      System.exit(-1);
    }

    File indexLocation = new File(cmdline.getOptionValue(INDEX_OPTION));

    LOG.info("Indexing " + cmdline.getOptionValue(INPUT_OPTION));
    StatusStream stream;

    File file = new File(cmdline.getOptionValue(INPUT_OPTION));
    if (!file.exists()) {
      System.err.println("Error: " + file + " does not exist!");
      System.exit(-1);
    }

    if (file.isDirectory()) {
      stream = new JsonStatusCorpusReader(file);
    } else {
      stream = new JsonStatusBlockReader(file);
    }

    Analyzer analyzer = ANALYZER;
    Similarity similarity = new ConstantNormSimilarity();
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_31, analyzer);
    config.setSimilarity(similarity);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); // Overwrite existing.

    IndexWriter writer = new IndexWriter(FSDirectory.open(indexLocation), config);

    int cnt = 0;
    Status status;
    try {
      while ((status = stream.next()) != null) {
        if (status.getText() == null) {
          continue;
        }

        cnt++;
        String createdAt = status.getCreatedAt();
        Document doc = new Document();
        doc.add(new Field(StatusField.ID.name, status.getId() + "",
            Store.YES, Index.NOT_ANALYZED_NO_NORMS));
        doc.add(new Field(StatusField.SCREEN_NAME.name, status.getScreenname(),
            Store.YES, Index.NOT_ANALYZED_NO_NORMS));
        doc.add(new Field(StatusField.CREATED_AT.name, createdAt, Store.YES, Index.NO));
        doc.add(new Field(StatusField.TEXT.name, status.getText(), Store.YES, Index.ANALYZED));

        String[] arr = createdAt.split(" ");
        String createDay = new StringBuffer().append(arr[1]).append("_").append(arr[2]).toString();
        doc.add(new Field(StatusField.DAY.name, createDay, Store.YES, Index.NOT_ANALYZED_NO_NORMS));

        writer.addDocument(doc);
        if (cnt % 10000 == 0) {
          LOG.info(cnt + " statuses indexed");
        }
      }
      LOG.info("Optimizing index...");
      writer.optimize();
      writer.close();
    } finally {
      stream.close();
    }

    LOG.info(String.format("Total of %s statuses indexed", cnt));
  }

  public static class ConstantNormSimilarity extends DefaultSimilarity {
    private static final long serialVersionUID = 2737920231537795826L;

    @Override
    public float computeNorm(String field, FieldInvertState state) {
      return 1.0f;
    }
  }
}
TOP

Related Classes of cc.twittertools.corpus.demo.IndexStatuses$ConstantNormSimilarity

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.