Package org.folg.places.tools

Source Code of org.folg.places.tools.CompareMatches

/*
* Copyright 2012 Foundation for On-Line Genealogy, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.folg.places.tools;

import org.folg.places.standardize.Place;
import org.folg.places.standardize.Standardizer;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.xml.sax.SAXParseException;

import java.io.*;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
import java.util.logging.Logger;
import java.util.regex.Pattern;

/**
* Compare matches with another system against ours
* The input matches file should have the form place-text|matched-place
* The output file contains differences: place-text|our-matched-place|their-matched-place
*
* User: dallan
* Date: 1/20/12
*/
public class CompareMatches {
   private static Logger logger = Logger.getLogger("org.folg.places.tools");

   @Option(name = "-i", required = true, usage = "matches file in")
   private File matchesIn;

   @Option(name = "-o", required = true, usage = "disagreements file out")
   private File disagreementsOut = null;

   private Standardizer standardizer;

   public CompareMatches() {
      standardizer = Standardizer.getInstance();
   }

   private String removeSpuriousDifferences(String place) {
      // lowercase
      place = place.toLowerCase();
      // remove beginning and ending commas
      place = place.replaceFirst("^[, ]+", "").replaceFirst("[, ]+$", "");
      // remove parenthetical type names
      place = place.replaceAll("\\s*\\([^)]*\\)", "");
      return place;
   }

   private void doMain() throws SAXParseException, IOException {
      BufferedReader reader = new BufferedReader(new FileReader(matchesIn));
      PrintWriter writer = new PrintWriter(disagreementsOut);
      int cntDiffs = 0;
      int cntSame = 0;

      // standardize all places and compare
      while (reader.ready()) {
         String nextLine = reader.readLine();
         String[] fields = nextLine.split("\\|");
         String text = fields[0];
         String otherPlace = fields[1];
         String ourPlace = "";
         Place p = standardizer.standardize(text);
         if (p != null) {
            ourPlace = p.getFullName();
         }
         if (!removeSpuriousDifferences(ourPlace).equals(removeSpuriousDifferences(otherPlace))) {
            cntDiffs++;
            writer.println(text+"|"+ourPlace+"|"+otherPlace);
         }
         else {
            cntSame++;
         }
      }

      writer.close();
      reader.close();
      System.out.println("Total places="+(cntSame+cntDiffs)+" same="+cntSame+" different="+cntDiffs);
   }

   public static void main(String[] args) throws SAXParseException, IOException {
      CompareMatches self = new CompareMatches();
      CmdLineParser parser = new CmdLineParser(self);
      try {
         parser.parseArgument(args);
         self.doMain();
      } catch (CmdLineException e) {
         System.err.println(e.getMessage());
         parser.printUsage(System.err);
      }
   }
}
TOP

Related Classes of org.folg.places.tools.CompareMatches

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.