Package org.folg.places.standardize

Examples of org.folg.places.standardize.Normalizer


      }
   }

   private void doMain() throws SAXParseException, IOException {

      Normalizer normalizer = null;
      if (useTokenizer) {
         normalizer = Normalizer.getInstance();
      }


      PrintWriter reversedWordsWriter = analysisPlacesOut != null ? new PrintWriter(new File(analysisPlacesOut, "reversedWords.txt")) : new PrintWriter(System.out);

      BufferedReader bufferedReader = new BufferedReader(new FileReader(placesIn));

      int lineCount = 0;
      while (bufferedReader.ready()) {
         String nextLine = bufferedReader.readLine();
         nextLine = nextLine.trim().toLowerCase();
         if (nextLine.length() == 0)
            continue;

         lineCount++;
         if (lineCount % 5000 == 0)
            System.out.println("indexing line " + lineCount);

         placesCountCC.add(nextLine);
         totalPlacesCount++;

         String[] placeList = nextLine.split(SPLIT_REGEX);

         for (String place : placeList) {
            place = place.trim();

            if (place.length() == 0)
               continue;

            if (NumberUtils.isNumber(place)) {
               numbersCountCC.add(place);
               totalNumbersCount++;
            } else {
               wordsCountCC.add(place);
               totalWordsCount++;
            }
         }

         int lastCommaIndx = nextLine.lastIndexOf(",");
         String lastWord = nextLine.substring(lastCommaIndx + 1).trim();
         if (lastWord.length() > 0) {
            endingsOfPlacesCC.add(lastWord);
            endingsOfPlacesTotalCount++;
         }

         if (lineCount % REVERSE_EVERY_N == 0) {
            StringBuilder reversedWord = new StringBuilder(nextLine);
            reversedWordsWriter.println(reversedWord.reverse());
         }

         if ( (useTokenizer) && (lineCount % TOKENIZE_EVERY_N == 0) ){
            List<List<String>> levels = normalizer.tokenize(nextLine);
            for (List<String> levelWords : levels) {
               tokenizerPlacesCountCC.addAll(levelWords);
               totalTokenizerPlacesCount += levelWords.size();
            }
         }
View Full Code Here

TOP

Related Classes of org.folg.places.standardize.Normalizer

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.