Package org.languagetool.dev.dumpcheck

Source Code of org.languagetool.dev.dumpcheck.SentenceSourceIndexer

/* LanguageTool, a natural language style checker
* Copyright (C) 2013 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
* USA
*/
package org.languagetool.dev.dumpcheck;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.languagetool.Language;
import org.languagetool.dev.index.Indexer;
import org.xml.sax.helpers.DefaultHandler;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;

/**
* Creates a Lucene index of a {@link SentenceSource}.
* @since 2.4
*/
public class SentenceSourceIndexer extends DefaultHandler implements AutoCloseable {

  public static final String MAX_DOC_COUNT_VALUE = "maxDocCountValue";
  public static final String MAX_DOC_COUNT_FIELD = "maxDocCount";
  public static final String MAX_DOC_COUNT_FIELD_VAL = "1";

  private final Indexer indexer;
  private final int maxSentences;
 
  private int sentenceCount = 0;
 
  SentenceSourceIndexer(Directory dir, Language language, int maxSentences) {
    this.indexer = new Indexer(dir, language);
    this.maxSentences = maxSentences;
  }

  @Override
  public void close() throws Exception {
    indexer.close();
  }

  private void run(List<String> dumpFileNames, Language language) throws IOException {
    MixingSentenceSource mixingSource = MixingSentenceSource.create(dumpFileNames, language);
    while (mixingSource.hasNext()) {
      Sentence sentence = mixingSource.next();
      if (sentenceCount % 1000 == 0) {
        System.out.println("Indexing sentence #" + sentenceCount + " (" + mixingSource.getSourceDistribution() + "):");
        System.out.println("  [" +  sentence.getSource() + "] " + sentence);
      }
      indexer.indexSentence(sentence, sentenceCount);
      sentenceCount++;
      if (maxSentences > 0 && sentenceCount >= maxSentences) {
        throw new DocumentLimitReachedException(maxSentences);
      }
    }
  }

  private void writeMetaDocuments() throws IOException {
    final Document doc = new Document();
    doc.add(new StringField(MAX_DOC_COUNT_FIELD, MAX_DOC_COUNT_FIELD_VAL, Field.Store.YES));
    doc.add(new StringField(MAX_DOC_COUNT_VALUE, sentenceCount + "", Field.Store.YES));
    indexer.add(doc);
  }

  public static void main(String... args) throws Exception {
    if (args.length != 4) {
      System.out.println("Usage: " + SentenceSourceIndexer.class.getSimpleName() + " <dataFile...> <indexDir> <languageCode> <maxSentences>");
      System.out.println("\t<dataFiles> comma-separated list of a Wikipedia XML dump (*.xml) and/or Tatoeba files (tatoeba-*)");
      System.out.println("\t<indexDir> directory where Lucene index will be written to, existing index content will be removed");
      System.out.println("\t<languageCode> short code like en for English, de for German etc");
      System.out.println("\t<maxSentences> maximum number of sentences to be indexed, use 0 for no limit");
      System.exit(1);
    }
    final List<String> dumpFilesNames = Arrays.asList(args[0].split(","));
    final File indexDir = new File(args[1]);
    final String languageCode = args[2];
    final int maxSentences = Integer.parseInt(args[3]);

    final Language language = Language.getLanguageForShortName(languageCode);
    if (maxSentences == 0) {
      System.out.println("Going to index contents from " + dumpFilesNames);
    } else {
      System.out.println("Going to index up to " + maxSentences + " sentences from " + dumpFilesNames);
    }
    System.out.println("Output index dir: " + indexDir);
   
    final long start = System.currentTimeMillis();
    try (FSDirectory fsDirectory = FSDirectory.open(indexDir)) {
      final SentenceSourceIndexer indexer = new SentenceSourceIndexer(fsDirectory, language, maxSentences);
      try {
        indexer.run(dumpFilesNames, language);
      } catch (DocumentLimitReachedException e) {
        System.out.println("Sentence limit (" + e.getLimit() + ") reached, stopping indexing");
      } finally {
        indexer.writeMetaDocuments();
        indexer.close();
      }
    }
    final long end = System.currentTimeMillis();
    final float minutes = (end - start) / (float)(1000 * 60);
    System.out.printf("Indexing took %.2f minutes\n", minutes);
  }

}
TOP

Related Classes of org.languagetool.dev.dumpcheck.SentenceSourceIndexer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.