/*
* Copyright 2012 Foundation for On-Line Genealogy, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.folg.places.tools;
import org.apache.commons.lang.math.NumberUtils;
import org.folg.places.standardize.Normalizer;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.xml.sax.SAXParseException;
import java.io.*;
import java.util.List;
/**
 * Command-line tool that reads a places file (one place per line) and gathers
 * frequency counts of whole places, individual words, numeric tokens, the text
 * after the last comma of each place and, optionally, the tokens produced by
 * the {@link Normalizer}. It also emits every Nth place with its characters reversed.
 * <p>
 * Results are written to files in the directory given by {@code -o}, or to
 * standard output when no directory is given.
 *
 * User: RyanK
 * Date: 1/1/12
 */
public class AnalyzePlaces {
    @Option(name = "-i", required = true, usage = "places file in")
    private File placesIn;

    @Option(name = "-o", required = false, usage = "directory for analysis file output")
    private File analysisPlacesOut;

    // break apart words, so North Grinston is split into separate words
    private static final String SPLIT_REGEX = "[, ]+";

    // every Nth non-empty line is written out with its characters reversed
    private static final int REVERSE_EVERY_N = 10;

    private CountsCollector placesCountCC;
    private int totalPlacesCount;

    private CountsCollector wordsCountCC;
    private int totalWordsCount;

    private CountsCollector numbersCountCC;
    private int totalNumbersCount;

    private CountsCollector endingsOfPlacesCC;
    private int endingsOfPlacesTotalCount;

    /**
     * This section controls the Normalizer Tokenizer in the analysis
     */
    private boolean useTokenizer = true;
    private CountsCollector tokenizerPlacesCountCC;
    private int totalTokenizerPlacesCount;

    // Only every Nth non-empty line is run through the tokenizer. Things get
    // significantly slower when the tokenizer is turned on, so raise this
    // value to sample fewer lines.
    private static final int TOKENIZE_EVERY_N = 1;

    /**
     * Creates the analyzer with empty collectors and zeroed totals. The
     * tokenizer collector is only created when the tokenizer is enabled.
     */
    public AnalyzePlaces() {
        placesCountCC = new CountsCollector();
        totalPlacesCount = 0;

        wordsCountCC = new CountsCollector();
        totalWordsCount = 0;

        numbersCountCC = new CountsCollector();
        totalNumbersCount = 0;

        endingsOfPlacesCC = new CountsCollector();
        endingsOfPlacesTotalCount = 0;

        if (useTokenizer) {
            tokenizerPlacesCountCC = new CountsCollector();
            totalTokenizerPlacesCount = 0;
        }
    }

    /**
     * Opens the writer for one output artifact: a file named {@code fileName}
     * inside the output directory when one was given, otherwise standard output.
     */
    private PrintWriter openWriter(String fileName) throws FileNotFoundException {
        return analysisPlacesOut != null
                ? new PrintWriter(new File(analysisPlacesOut, fileName))
                : new PrintWriter(System.out);
    }

    /**
     * Releases a writer obtained from {@link #openWriter(String)}: file writers
     * are closed, while writers wrapping standard output are only flushed so
     * that System.out stays usable. Both operations are harmless on a
     * PrintWriter that has already been flushed or closed.
     */
    private void finishWriter(PrintWriter writer) {
        if (analysisPlacesOut != null) {
            writer.close();
        } else {
            writer.flush();
        }
    }

    /**
     * Writes the contents of one collector to its output and releases the writer.
     */
    private void writeCounts(CountsCollector counts, String fileName) throws IOException {
        PrintWriter writer = openWriter(fileName);
        try {
            counts.writeSorted(false, 1, writer);
        } finally {
            finishWriter(writer);
        }
    }

    /**
     * Reads the places file line by line, updates all collectors and then
     * prints the totals and writes each collector to its output.
     * Blank lines are skipped and are not counted.
     *
     * @throws IOException if the places file cannot be read or an output file cannot be created
     */
    private void doMain() throws SAXParseException, IOException {
        Normalizer normalizer = null;
        if (useTokenizer) {
            normalizer = Normalizer.getInstance();
        }

        int lineCount = 0;
        PrintWriter reversedWordsWriter = openWriter("reversedWords.txt");
        try {
            BufferedReader bufferedReader = new BufferedReader(new FileReader(placesIn));
            try {
                String nextLine;
                // readLine() returning null is the reliable end-of-file signal;
                // ready() only reports whether a read would block.
                while ((nextLine = bufferedReader.readLine()) != null) {
                    // NOTE(review): toLowerCase() uses the default locale - confirm that is intended
                    nextLine = nextLine.trim().toLowerCase();
                    if (nextLine.length() == 0) {
                        continue;
                    }

                    lineCount++;
                    if (lineCount % 5000 == 0) {
                        System.out.println("indexing line " + lineCount);
                    }

                    placesCountCC.add(nextLine);
                    totalPlacesCount++;

                    // classify each comma/space separated token as a number or a word
                    String[] placeList = nextLine.split(SPLIT_REGEX);
                    for (String place : placeList) {
                        place = place.trim();
                        if (place.length() == 0) {
                            continue;
                        }
                        if (NumberUtils.isNumber(place)) {
                            numbersCountCC.add(place);
                            totalNumbersCount++;
                        } else {
                            wordsCountCC.add(place);
                            totalWordsCount++;
                        }
                    }

                    // the "ending" is everything after the last comma
                    // (the whole line when there is no comma, since lastIndexOf returns -1)
                    int lastCommaIndx = nextLine.lastIndexOf(",");
                    String lastWord = nextLine.substring(lastCommaIndx + 1).trim();
                    if (lastWord.length() > 0) {
                        endingsOfPlacesCC.add(lastWord);
                        endingsOfPlacesTotalCount++;
                    }

                    if (lineCount % REVERSE_EVERY_N == 0) {
                        StringBuilder reversedWord = new StringBuilder(nextLine);
                        reversedWordsWriter.println(reversedWord.reverse());
                    }

                    if (useTokenizer && (lineCount % TOKENIZE_EVERY_N == 0)) {
                        List<List<String>> levels = normalizer.tokenize(nextLine);
                        for (List<String> levelWords : levels) {
                            tokenizerPlacesCountCC.addAll(levelWords);
                            totalTokenizerPlacesCount += levelWords.size();
                        }
                    }
                }
            } finally {
                bufferedReader.close();
            }
        } finally {
            // without this the buffered reversed-words output could be lost
            finishWriter(reversedWordsWriter);
        }

        System.out.println("total number of lines in files " + lineCount);

        System.out.println("Indexed a total of " + totalPlacesCount + " places.");
        System.out.println("Found a total of " + getPlacesCountCC().size() + " unique places.");
        writeCounts(getPlacesCountCC(), "placesCount.txt");

        System.out.println("Indexed a total of " + totalWordsCount + " words.");
        System.out.println("Found a total of " + getWordsCountCC().size() + " unique words.");
        writeCounts(getWordsCountCC(), "wordsCount.txt");

        System.out.println("Indexed a total of " + totalNumbersCount + " numbers.");
        System.out.println("Found a total of " + getNumbersCountCC().size() + " unique numbers.");
        writeCounts(getNumbersCountCC(), "numbersCount.txt");

        System.out.println("Indexed a total of " + endingsOfPlacesTotalCount + " endings.");
        System.out.println("Found a total of " + getEndingsOfPlacesCC().size() + " unique endings.");
        writeCounts(getEndingsOfPlacesCC(), "endingsCount.txt");

        if (useTokenizer) {
            System.out.println("Indexed a total of " + totalTokenizerPlacesCount + " normalized words.");
            System.out.println("Found a total of " + getTokenizerPlacesCountCC().size() + " normalized words.");
            writeCounts(getTokenizerPlacesCountCC(), "normalizedWordsCount.txt");
        }
    }

    /**
     * @return collector of whole (trimmed, lower-cased) place lines
     */
    public CountsCollector getPlacesCountCC() {
        return placesCountCC;
    }

    /**
     * @return collector of the non-numeric tokens found in the places
     */
    public CountsCollector getWordsCountCC() {
        return wordsCountCC;
    }

    /**
     * @return collector of the numeric tokens found in the places
     */
    public CountsCollector getNumbersCountCC() {
        return numbersCountCC;
    }

    /**
     * @return collector of the text following the last comma of each place
     */
    public CountsCollector getEndingsOfPlacesCC() {
        return endingsOfPlacesCC;
    }

    /**
     * @return collector of the tokens produced by the normalizer, or null when the tokenizer is disabled
     */
    public CountsCollector getTokenizerPlacesCountCC() {
        return tokenizerPlacesCountCC;
    }

    /**
     * Parses the command-line options and runs the analysis; prints the usage
     * to standard error when the arguments are invalid.
     */
    public static void main(String[] args) throws SAXParseException, IOException {
        AnalyzePlaces self = new AnalyzePlaces();
        CmdLineParser parser = new CmdLineParser(self);
        try {
            parser.parseArgument(args);
            self.doMain();
        } catch (CmdLineException e) {
            System.err.println(e.getMessage());
            parser.printUsage(System.err);
        }
    }
}