Package org.folg.places.tools

Source Code of org.folg.places.tools.AnalyzePlaces

/*
* Copyright 2012 Foundation for On-Line Genealogy, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.folg.places.tools;

import org.apache.commons.lang.math.NumberUtils;
import org.folg.places.standardize.Normalizer;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.xml.sax.SAXParseException;

import java.io.*;
import java.util.List;

/**
* User: RyanK
* Date: 1/1/12
*/
public class AnalyzePlaces {

   @Option(name = "-i", required = true, usage = "places file in")
   private File placesIn;

   @Option(name = "-o", required = false, usage = "directory for analysis file output")
   private File analysisPlacesOut;

   // break apart words, so North Grinston is split into separate words
   private static String SPLIT_REGEX = "[, ]+";

   private int REVERSE_EVERY_N = 10;

   private CountsCollector placesCountCC;
   private int totalPlacesCount;

   private CountsCollector wordsCountCC;
   private int totalWordsCount;

   private CountsCollector numbersCountCC;
   private int totalNumbersCount;

   private CountsCollector endingsOfPlacesCC;
   private int endingsOfPlacesTotalCount;

   /**
    * This section controls the Normalizer Tokenizer in the analysis *
    */
   private boolean useTokenizer = true;

   private CountsCollector tokenizerPlacesCountCC;
   private int totalTokenizerPlacesCount;

   //The total number of lines to test in the places file
   //when the tokenizer is turned on things get significantly slower so
   private int TOKENIZE_EVERY_N = 1;



   public AnalyzePlaces() {
      placesCountCC = new CountsCollector();
      totalPlacesCount = 0;

      wordsCountCC = new CountsCollector();
      totalWordsCount = 0;

      numbersCountCC = new CountsCollector();
      totalNumbersCount = 0;

      endingsOfPlacesCC = new CountsCollector();
      endingsOfPlacesTotalCount = 0;

      if (useTokenizer) {
         tokenizerPlacesCountCC = new CountsCollector();
         totalTokenizerPlacesCount = 0;
      }
   }

   private void doMain() throws SAXParseException, IOException {

      Normalizer normalizer = null;
      if (useTokenizer) {
         normalizer = Normalizer.getInstance();
      }


      PrintWriter reversedWordsWriter = analysisPlacesOut != null ? new PrintWriter(new File(analysisPlacesOut, "reversedWords.txt")) : new PrintWriter(System.out);

      BufferedReader bufferedReader = new BufferedReader(new FileReader(placesIn));

      int lineCount = 0;
      while (bufferedReader.ready()) {
         String nextLine = bufferedReader.readLine();
         nextLine = nextLine.trim().toLowerCase();
         if (nextLine.length() == 0)
            continue;

         lineCount++;
         if (lineCount % 5000 == 0)
            System.out.println("indexing line " + lineCount);

         placesCountCC.add(nextLine);
         totalPlacesCount++;

         String[] placeList = nextLine.split(SPLIT_REGEX);

         for (String place : placeList) {
            place = place.trim();

            if (place.length() == 0)
               continue;

            if (NumberUtils.isNumber(place)) {
               numbersCountCC.add(place);
               totalNumbersCount++;
            } else {
               wordsCountCC.add(place);
               totalWordsCount++;
            }
         }

         int lastCommaIndx = nextLine.lastIndexOf(",");
         String lastWord = nextLine.substring(lastCommaIndx + 1).trim();
         if (lastWord.length() > 0) {
            endingsOfPlacesCC.add(lastWord);
            endingsOfPlacesTotalCount++;
         }

         if (lineCount % REVERSE_EVERY_N == 0) {
            StringBuilder reversedWord = new StringBuilder(nextLine);
            reversedWordsWriter.println(reversedWord.reverse());
         }

         if ( (useTokenizer) && (lineCount % TOKENIZE_EVERY_N == 0) ){
            List<List<String>> levels = normalizer.tokenize(nextLine);
            for (List<String> levelWords : levels) {
               tokenizerPlacesCountCC.addAll(levelWords);
               totalTokenizerPlacesCount += levelWords.size();
            }
         }
      }

      System.out.println("total number of lines in files " + lineCount);

      System.out.println("Indexed a total of " + totalPlacesCount + " places.");
      System.out.println("Found a total of " + getPlacesCountCC().size() + " unique places.");
      getPlacesCountCC().writeSorted(false, 1, analysisPlacesOut != null ? new PrintWriter(new File(analysisPlacesOut, "placesCount.txt")) : new PrintWriter(System.out));

      System.out.println("Indexed a total of " + totalWordsCount + " words.");
      System.out.println("Found a total of " + getWordsCountCC().size() + " unique words.");
      getWordsCountCC().writeSorted(false, 1, analysisPlacesOut != null ? new PrintWriter(new File(analysisPlacesOut, "wordsCount.txt")) : new PrintWriter(System.out));

      System.out.println("Indexed a total of " + totalNumbersCount + " numbers.");
      System.out.println("Found a total of " + getNumbersCountCC().size() + " unique numbers.");
      getNumbersCountCC().writeSorted(false, 1, analysisPlacesOut != null ? new PrintWriter(new File(analysisPlacesOut, "numbersCount.txt")) : new PrintWriter(System.out));


      System.out.println("Indexed a total of " + endingsOfPlacesTotalCount + " endings.");
      System.out.println("Found a total of " + getEndingsOfPlacesCC().size() + " unique endings.");
      getEndingsOfPlacesCC().writeSorted(false, 1, analysisPlacesOut != null ? new PrintWriter(new File(analysisPlacesOut, "endingsCount.txt")) : new PrintWriter(System.out));

      if (useTokenizer) {
         System.out.println("Indexed a total of " + totalTokenizerPlacesCount + " normalized words.");
         System.out.println("Found a total of " + getTokenizerPlacesCountCC().size() + " normalized words.");
         getTokenizerPlacesCountCC().writeSorted(false, 1, analysisPlacesOut != null ? new PrintWriter(new File(analysisPlacesOut, "normalizedWordsCount.txt")) : new PrintWriter(System.out));
      }

   }

   public CountsCollector getPlacesCountCC() {
      return placesCountCC;
   }

   public CountsCollector getWordsCountCC() {
      return wordsCountCC;
   }

   public CountsCollector getNumbersCountCC() {
      return numbersCountCC;
   }

   public CountsCollector getEndingsOfPlacesCC() {
      return endingsOfPlacesCC;
   }

   public CountsCollector getTokenizerPlacesCountCC() {
      return tokenizerPlacesCountCC;
   }

   public static void main(String[] args) throws SAXParseException, IOException {
      AnalyzePlaces self = new AnalyzePlaces();
      CmdLineParser parser = new CmdLineParser(self);
      try {
         parser.parseArgument(args);
         self.doMain();
      } catch (CmdLineException e) {
         System.err.println(e.getMessage());
         parser.printUsage(System.err);
      }
   }
}
TOP

Related Classes of org.folg.places.tools.AnalyzePlaces

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.