Source Code of com.tamingtext.qa.WikipediaWexIndexer

/*
 * Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 * -------------------
 * To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit
 * http://www.manning.com/ingersoll
 */


package com.tamingtext.qa;


import java.io.File;
import java.io.FilenameFilter;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;


import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.lucene.benchmark.byTask.feeds.DocData;
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.common.SolrInputDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


public class WikipediaWexIndexer {
  private transient static Logger log = LoggerFactory.getLogger(WikipediaWexIndexer.class);
  
  private SolrServer server;
  public static final String DEFAULT_SOLR_URL = "http://localhost:8983/solr";
  
  public WikipediaWexIndexer() throws MalformedURLException {
    this.server = new CommonsHttpSolrServer(DEFAULT_SOLR_URL);
  }
  
  public WikipediaWexIndexer(SolrServer server) throws MalformedURLException {
    this.server = server;
  }
  
  public int index(File wikipediaWEX) throws Exception {
    return index(wikipediaWEX, Integer.MAX_VALUE, 1000);
  }
  
  public int index(File wikipediaWEX, int numDocs, int batchSize)
      throws Exception {
    int result = 0;
    if (wikipediaWEX != null && wikipediaWEX.isFile()) {
      WexWikiContentSource contentSource = new WexWikiContentSource();
      Properties properties = new Properties();
      // fileName = config.get("docs.file", null);
      String filePath = wikipediaWEX.getAbsolutePath();
      properties.setProperty("docs.file", filePath);
      properties.setProperty("doc.maker.forever", "false");
      contentSource.setConfig(new Config(properties));
      contentSource.resetInputs();
      // docMaker.openFile();
      List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(1000);
      int i = 0;
      SolrInputDocument sDoc = null;
      long start = System.currentTimeMillis();
      try {
        DocData docData = new DocData();
        
        while ((docData = contentSource.getNextDocData(docData)) != null
            && i < numDocs) {
          int mod = i % batchSize;
          
          sDoc = new SolrInputDocument();
          docs.add(sDoc);
          sDoc.addField("file", filePath + "_" + i);
          
          sDoc.addField("docid", String.valueOf(docData.getID()));
          sDoc.addField("body", docData.getBody());
          sDoc.addField("doctitle", docData.getTitle());
          sDoc.addField("name_s", docData.getName());
          
          String[] categories = docData.getProps().getProperty("category")
              .split(";;");
          
          for (String c : categories) {
            sDoc.addField("category", c);
          }
          
          if (mod == batchSize - 1) {
            log.info("Sending: " + docs.size() + " docs"
                + " total sent for this file: " + i);
            server.add(docs);
            docs.clear();
          }
          i++;
        }
      } catch (NoMoreDataException e) {


      }
      long finish = System.currentTimeMillis();
      if (log.isInfoEnabled()) {
        log.info("Indexing took " + (finish - start) + " ms");
      }
      if (docs.size() > 0) {
        server.add(docs);
      }
      result = i + docs.size();
      server.commit();
      server.optimize();
    } else {
      System.out.println("Can't find file: " + wikipediaWEX);
    }
    return result;
  }
  
  public static void main(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();
    
    Option wikipediaFileOpt = obuilder
        .withLongName("wikiFile")
        .withRequired(true)
        .withArgument(
            abuilder.withName("wikiFile").withMinimum(1).withMaximum(1)
                .create())
        .withDescription(
            "The path to the wikipedia dump file. "
                + "May be a directory containing wikipedia dump files. "
                + "If a directory is specified, files starting with the prefix "
                + "freebase-segment- are used.").withShortName("w").create();
    
    Option numDocsOpt = obuilder
        .withLongName("numDocs")
        .withRequired(false)
        .withArgument(
            abuilder.withName("numDocs").withMinimum(1).withMaximum(1).create())
        .withDescription("The number of docs to index").withShortName("n")
        .create();
    
    Option solrURLOpt = obuilder
        .withLongName("solrURL")
        .withRequired(false)
        .withArgument(
            abuilder.withName("solrURL").withMinimum(1).withMaximum(1).create())
        .withDescription("The URL where Solr lives").withShortName("s")
        .create();
    
    Option solrBatchOpt = obuilder
        .withLongName("batch")
        .withRequired(false)
        .withArgument(
            abuilder.withName("batch").withMinimum(1).withMaximum(1).create())
        .withDescription("The number of docs to include in each indexing batch")
        .withShortName("b").create();
    
    Option helpOpt = obuilder.withLongName("help").withDescription(
        "Print out help").withShortName("h").create();
    
    Group group = gbuilder.withName("Options").withOption(wikipediaFileOpt)
        .withOption(numDocsOpt).withOption(solrURLOpt).withOption(solrBatchOpt)
        .withOption(helpOpt).create();
    
    Parser parser = new Parser();
    parser.setGroup(group);
    
    try {
      CommandLine cmdLine = parser.parse(args);
      
      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return;
      }
      
      File file;
      file = new File(cmdLine.getValue(wikipediaFileOpt).toString());
      File[] dumpFiles;
      if (file.isDirectory()) {
        dumpFiles = file.listFiles(new FilenameFilter() {
          public boolean accept(File file, String s) {
            return s.startsWith("freebase-segment-");
          }
        });
      } else {
        dumpFiles = new File[] {file};
      }
      
      int numDocs = Integer.MAX_VALUE;
      if (cmdLine.hasOption(numDocsOpt)) {
        numDocs = Integer.parseInt(cmdLine.getValue(numDocsOpt).toString());
      }
      String url = DEFAULT_SOLR_URL;
      if (cmdLine.hasOption(solrURLOpt)) {
        url = cmdLine.getValue(solrURLOpt).toString();
      }
      int batch = 100;
      if (cmdLine.hasOption(solrBatchOpt)) {
        batch = Integer.parseInt(cmdLine.getValue(solrBatchOpt).toString());
      }
      WikipediaWexIndexer indexer = new WikipediaWexIndexer(
          new CommonsHttpSolrServer(url));
      int total = 0;
      for (int i = 0; i < dumpFiles.length && total < numDocs; i++) {
        File dumpFile = dumpFiles[i];
        log.info("Indexing: " + file + " Num files to index: "
            + (numDocs - total));
        long start = System.currentTimeMillis();
        int totalFile = indexer.index(dumpFile, numDocs - total, batch);
        long finish = System.currentTimeMillis();
        if (log.isInfoEnabled()) {
          log
              .info("Indexing " + dumpFile + " took " + (finish - start)
                  + " ms");
        }
        total += totalFile;
        log.info("Done Indexing: " + file + ". Indexed " + totalFile
            + " docs for that file and " + total + " overall.");
        
      }
      log.info("Indexed " + total + " docs overall.");
    } catch (OptionException e) {
      log.error("Exception", e);
      CommandLineUtil.printHelp(group);
      return;
    }
  }
}
Source Code of com.tamingtext.qa.WikipediaWexIndexer

Related Classes of com.tamingtext.qa.WikipediaWexIndexer