Package org.eobjects.datacleaner.phonetic

Source Code of org.eobjects.datacleaner.phonetic.PhoneticSimilarityFinder

/**
* eobjects.org DataCleaner
* Copyright (C) 2010 eobjects.org
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA  02110-1301  USA
*/
package org.eobjects.datacleaner.phonetic;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;

import javax.inject.Inject;

import org.apache.commons.codec.language.Metaphone;
import org.apache.commons.codec.language.RefinedSoundex;
import org.apache.commons.codec.language.Soundex;
import org.eobjects.analyzer.beans.api.AnalyzerBean;
import org.eobjects.analyzer.beans.api.Configured;
import org.eobjects.analyzer.beans.api.Description;
import org.eobjects.analyzer.beans.api.Provided;
import org.eobjects.analyzer.beans.api.RowProcessingAnalyzer;
import org.eobjects.analyzer.data.InputColumn;
import org.eobjects.analyzer.data.InputRow;
import org.eobjects.analyzer.result.SimilarityGroup;
import org.eobjects.analyzer.result.SimilarityResult;
import org.eobjects.analyzer.storage.InMemoryRowAnnotationFactory;
import org.eobjects.analyzer.storage.RowAnnotation;
import org.eobjects.analyzer.storage.RowAnnotationFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@AnalyzerBean("Phonetic similarity finder")
@Description("Find similar sounding values using phonetic checking.\nThis analyzer uses the Soundex, Refined Soundex and Metaphone algorithms to determine the phonetic similarity of String values.")
public class PhoneticSimilarityFinder implements RowProcessingAnalyzer<SimilarityResult> {

  public static enum MatchMode {
    STRICT, LOOSE
  }

  private final static Logger logger = LoggerFactory.getLogger(PhoneticSimilarityFinder.class);

  private final static double STRICT_SIMILARITY_THRESHOLD = 1.0;

  // everything that is ~80% similar will be included
  private final static double LOOSE_SIMILARITY_THRESHOLD = 8d / 10;

  private List<SimilarityGroup> _similarityGroups = new ArrayList<SimilarityGroup>();

  @Inject
  @Provided
  RowAnnotationFactory _rowAnnotationFactory;

  @Configured
  MatchMode matchMode = MatchMode.STRICT;

  @Inject
  @Configured
  InputColumn<String> _column;

  public PhoneticSimilarityFinder() {
  }

  // constructor for test purposes
  public PhoneticSimilarityFinder(InputColumn<String> column) {
    _column = column;
    _rowAnnotationFactory = new InMemoryRowAnnotationFactory();
  }

  @Override
  public void run(InputRow row, int distinctCount) {
    String value = row.getValue(_column);

    if (value != null) {
      value = value.trim().toLowerCase();
      if (!"".equals(value)) {
        boolean foundMatch = false;

        for (ListIterator<SimilarityGroup> it = _similarityGroups.listIterator(); it.hasNext();) {
          SimilarityGroup similarityGroup = it.next();

          if (matches(value, similarityGroup)) {
            RowAnnotation annotation = similarityGroup.getAnnotation();
            it.set(new SimilarityGroup(annotation, _rowAnnotationFactory, _column, value, similarityGroup
                .getValues()));
            _rowAnnotationFactory.annotate(row, distinctCount, annotation);
            foundMatch = true;
          }
        }

        if (!foundMatch) {
          RowAnnotation annotation = _rowAnnotationFactory.createAnnotation();
          _rowAnnotationFactory.annotate(row, distinctCount, annotation);
          _similarityGroups.add(new SimilarityGroup(annotation, _rowAnnotationFactory, _column, value,
              new String[0]));
        }
      }
    }
  }

  public boolean matches(String value, SimilarityGroup similarityGroup) {
    // first do exact matching
    for (String similarityGroupValue : similarityGroup.getValues()) {
      if (value.equals(similarityGroupValue)) {
        return true;
      }
    }

    Soundex soundex = new Soundex();
    RefinedSoundex refinedSoundex = new RefinedSoundex();
    Metaphone metaphone = new Metaphone();

    double threshold;
    if (matchMode == MatchMode.STRICT) {
      threshold = STRICT_SIMILARITY_THRESHOLD;
    } else {
      threshold = LOOSE_SIMILARITY_THRESHOLD;
    }
    int soundexThreshold = (int) Math.round(threshold * 4);

    for (String similarityGroupValue : similarityGroup.getValues()) {
      boolean metaphoneEquals = metaphone.isMetaphoneEqual(value, similarityGroupValue);
      if (metaphoneEquals) {
        return true;
      }

      try {
        int soundexDiff = soundex.difference(value, similarityGroupValue);

        if (soundexDiff >= soundexThreshold) {
          return true;
        }
      } catch (Exception e) {
        logger.error("Could not determine soundex difference", e);
      }

      int refinedSoundexThreshold = (int) Math.round(threshold
          * Math.min(value.length(), similarityGroupValue.length()));

      try {
        int refinedSoundexDiff = refinedSoundex.difference(value, similarityGroupValue);

        if (refinedSoundexDiff >= refinedSoundexThreshold) {
          return true;
        }
      } catch (Exception e) {
        logger.error("Could not determine refined soundex difference", e);
      }
    }

    return false;
  }

  @Override
  public SimilarityResult getResult() {
    for (Iterator<SimilarityGroup> it = _similarityGroups.iterator(); it.hasNext();) {
      SimilarityGroup similarityGroup = it.next();
      if (similarityGroup.getValueCount() == 1) {
        it.remove();
      }
    }
    return new SimilarityResult(_similarityGroups);
  }
}
TOP

Related Classes of org.eobjects.datacleaner.phonetic.PhoneticSimilarityFinder

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.