/* ***** BEGIN LICENSE BLOCK *****
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* The original code is from the Apache commons-codec-1.5-src package
* and has been modified for the dcm4che project
*
* ***** END LICENSE BLOCK ***** */
package org.dcm4che3.soundex;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import junit.framework.Assert;
import org.dcm4che3.soundex.Soundex;
import org.junit.Test;
public class SoundexTest {
public String getSoundexEncoderString(String arg) {
Soundex inst = new Soundex();
return inst.toFuzzy(arg);
}
protected void checkEncodingVariations(String expected, String data[]) throws Exception {
for (int i = 0; i < data.length; i++) {
checkEncoding(expected, data[i]);
}
}
public void checkEncoding(String expected, String source) throws Exception {
assertEquals("Source: " + source, expected, getSoundexEncoderString(source));
}
@Test
public void testB650() throws Exception {
checkEncodingVariations("B650", (new String[]{
"BARHAM",
"BARONE",
"BARRON",
"BERNA",
"BIRNEY",
"BIRNIE",
"BOOROM",
"BOREN",
"BORN",
"BOURN",
"BOURNE",
"BOWRON",
"BRAIN",
"BRAME",
"BRANN",
"BRAUN",
"BREEN",
"BRIEN",
"BRIM",
"BRIMM",
"BRINN",
"BRION",
"BROOM",
"BROOME",
"BROWN",
"BROWNE",
"BRUEN",
"BRUHN",
"BRUIN",
"BRUMM",
"BRUN",
"BRUNO",
"BRYAN",
"BURIAN",
"BURN",
"BURNEY",
"BYRAM",
"BYRNE",
"BYRON",
"BYRUM"}));
}
public void testBadCharacters() {
Assert.assertEquals("H452", getSoundexEncoderString("HOL>MES"));
}
@Test
public void testEncodeBasic() {
Assert.assertEquals("T235", getSoundexEncoderString("testing"));
Assert.assertEquals("T000", getSoundexEncoderString("The"));
Assert.assertEquals("Q200", getSoundexEncoderString("quick"));
Assert.assertEquals("B650", getSoundexEncoderString("brown"));
Assert.assertEquals("F200", getSoundexEncoderString("fox"));
Assert.assertEquals("J513", getSoundexEncoderString("jumped"));
Assert.assertEquals("O160", getSoundexEncoderString("over"));
Assert.assertEquals("T000", getSoundexEncoderString("the"));
Assert.assertEquals("L200", getSoundexEncoderString("lazy"));
Assert.assertEquals("D200", getSoundexEncoderString("dogs"));
}
/**
* Examples from http://www.bradandkathy.com/genealogy/overviewofsoundex.html
*/
@Test
public void testEncodeBatch2() {
Assert.assertEquals("A462", getSoundexEncoderString("Allricht"));
Assert.assertEquals("E166", getSoundexEncoderString("Eberhard"));
Assert.assertEquals("E521", getSoundexEncoderString("Engebrethson"));
Assert.assertEquals("H512", getSoundexEncoderString("Heimbach"));
Assert.assertEquals("H524", getSoundexEncoderString("Hanselmann"));
Assert.assertEquals("H431", getSoundexEncoderString("Hildebrand"));
Assert.assertEquals("K152", getSoundexEncoderString("Kavanagh"));
Assert.assertEquals("L530", getSoundexEncoderString("Lind"));
Assert.assertEquals("L222", getSoundexEncoderString("Lukaschowsky"));
Assert.assertEquals("M235", getSoundexEncoderString("McDonnell"));
Assert.assertEquals("M200", getSoundexEncoderString("McGee"));
Assert.assertEquals("O155", getSoundexEncoderString("Opnian"));
Assert.assertEquals("O155", getSoundexEncoderString("Oppenheimer"));
Assert.assertEquals("R355", getSoundexEncoderString("Riedemanas"));
Assert.assertEquals("Z300", getSoundexEncoderString("Zita"));
Assert.assertEquals("Z325", getSoundexEncoderString("Zitzmeinn"));
}
/**
* Examples from http://www.archives.gov/research_room/genealogy/census/soundex.html
*/
@Test
public void testEncodeBatch3() {
Assert.assertEquals("W252", getSoundexEncoderString("Washington"));
Assert.assertEquals("L000", getSoundexEncoderString("Lee"));
Assert.assertEquals("G362", getSoundexEncoderString("Gutierrez"));
Assert.assertEquals("P236", getSoundexEncoderString("Pfister"));
Assert.assertEquals("J250", getSoundexEncoderString("Jackson"));
Assert.assertEquals("T522", getSoundexEncoderString("Tymczak"));
// For VanDeusen: D-250 (D, 2 for the S, 5 for the N, 0 added) is also
// possible.
Assert.assertEquals("V532", getSoundexEncoderString("VanDeusen"));
}
/**
* Examples from: http://www.myatt.demon.co.uk/sxalg.htm
*/
@Test
public void testEncodeBatch4() {
Assert.assertEquals("H452", getSoundexEncoderString("HOLMES"));
Assert.assertEquals("A355", getSoundexEncoderString("ADOMOMI"));
Assert.assertEquals("V536", getSoundexEncoderString("VONDERLEHR"));
Assert.assertEquals("B400", getSoundexEncoderString("BALL"));
Assert.assertEquals("S000", getSoundexEncoderString("SHAW"));
Assert.assertEquals("J250", getSoundexEncoderString("JACKSON"));
Assert.assertEquals("S545", getSoundexEncoderString("SCANLON"));
Assert.assertEquals("S532", getSoundexEncoderString("SAINTJOHN"));
}
@Test
public void testEncodeIgnoreApostrophes() throws Exception {
checkEncodingVariations("O165", (new String[]{
"OBrien",
"'OBrien",
"O'Brien",
"OB'rien",
"OBr'ien",
"OBri'en",
"OBrie'n",
"OBrien'"}));
}
@Test
public void testEncodeIgnoreHyphens() throws Exception {
checkEncodingVariations("K525", (new String[]{
"KINGSMITH",
"-KINGSMITH",
"K-INGSMITH",
"KI-NGSMITH",
"KIN-GSMITH",
"KINGS-MITH",
"KINGSM-ITH",
"KINGSMI-TH",
"KINGSMIT-H",
"KINGSMITH-"}));
assertFalse("Source: KING-SMITH", getSoundexEncoderString("KING-SMITH").equals("K525"));
}
@Test
public void testEncodeIgnoreTrimmable() {
Assert.assertEquals("W252", getSoundexEncoderString(" \t\n\r Washington \t\n\r "));
}
/**
* Consonants from the same code group separated by W or H are treated as one.
*/
@Test
public void testHWRuleEx1() {
// From
// http://www.archives.gov/research_room/genealogy/census/soundex.html:
// Ashcraft is coded A-261 (A, 2 for the S, C ignored, 6 for the R, 1
// for the F). It is not coded A-226.
Assert.assertEquals("A261", getSoundexEncoderString("Ashcraft"));
}
/**
* Consonants from the same code group separated by W or H are treated as one.
*
* Test data from http://www.myatt.demon.co.uk/sxalg.htm
*/
@Test
public void testHWRuleEx2() {
Assert.assertEquals("B312", getSoundexEncoderString("BOOTHDAVIS"));
Assert.assertNotSame("B312", getSoundexEncoderString("BOOTH-DAVIS"));
}
/**
* Consonants from the same code group separated by W or H are treated as one.
*
* @throws Exception
*/
@Test
public void testHWRuleEx3() throws Exception {
Assert.assertEquals("S460", getSoundexEncoderString("Sgler"));
Assert.assertEquals("S460", getSoundexEncoderString("Swhgler"));
// Also S460:
checkEncodingVariations("S460", (new String[]{
"SAILOR",
"SALYER",
"SAYLOR",
"SCHALLER",
"SCHELLER",
"SCHILLER",
"SCHOOLER",
"SCHULER",
"SCHUYLER",
"SEILER",
"SEYLER",
"SHOLAR",
"SHULER",
"SILAR",
"SILER",
"SILLER"}));
}
@Test
public void testMaxLength() throws Exception {
// Soundex soundex = new Soundex();
// soundex.setMaxLength(soundex.getMaxLength());
Assert.assertEquals("S460", getSoundexEncoderString("Sgler"));
}
@Test
public void testMaxLengthLessThan3Fix() throws Exception {
// Soundex soundex = new Soundex();
// soundex.setMaxLength(2);
Assert.assertEquals("S460", getSoundexEncoderString("SCHELLER"));
}
/**
* Examples for MS SQLServer from
* http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp
*/
@Test
public void testMsSqlServer1() {
Assert.assertEquals("S530", getSoundexEncoderString("Smith"));
Assert.assertEquals("S530", getSoundexEncoderString("Smythe"));
}
/**
* Examples for MS SQLServer from
* http://support.microsoft.com/default.aspx?scid=http://support.microsoft.com:80/support
* /kb/articles/Q100/3/65.asp&NoWebContent=1
*
* @throws Exception
*/
@Test
public void testMsSqlServer2() throws Exception {
checkEncodingVariations("E625", (new String[]{"Erickson", "Erickson", "Erikson", "Ericson", "Ericksen", "Ericsen"}));
}
/**
* Examples for MS SQLServer from http://databases.about.com/library/weekly/aa042901a.htm
*/
@Test
public void testMsSqlServer3() {
Assert.assertEquals("A500", getSoundexEncoderString("Ann"));
Assert.assertEquals("A536", getSoundexEncoderString("Andrew"));
Assert.assertEquals("J530", getSoundexEncoderString("Janet"));
Assert.assertEquals("M626", getSoundexEncoderString("Margaret"));
Assert.assertEquals("S315", getSoundexEncoderString("Steven"));
Assert.assertEquals("M240", getSoundexEncoderString("Michael"));
Assert.assertEquals("R163", getSoundexEncoderString("Robert"));
Assert.assertEquals("L600", getSoundexEncoderString("Laura"));
Assert.assertEquals("A500", getSoundexEncoderString("Anne"));
}
/**
* Fancy characters are not mapped by the default US mapping.
*
* http://issues.apache.org/bugzilla/show_bug.cgi?id=29080
*/
@Test
public void testUsMappingEWithAcute() {
Assert.assertEquals("E000", getSoundexEncoderString("e"));
if (Character.isLetter('�')) {
try {
Assert.assertEquals("�000", getSoundexEncoderString("�"));
Assert.fail("Expected IllegalArgumentException not thrown");
} catch (IllegalArgumentException e) {
// expected
}
} else {
Assert.assertEquals("", getSoundexEncoderString("�"));
}
}
/**
* Fancy characters are not mapped by the default US mapping.
*
* http://issues.apache.org/bugzilla/show_bug.cgi?id=29080
*/
@Test
public void testUsMappingOWithDiaeresis() {
Assert.assertEquals("O000", getSoundexEncoderString("o"));
if (Character.isLetter('�')) {
try {
Assert.assertEquals("�000", getSoundexEncoderString("�"));
Assert.fail("Expected IllegalArgumentException not thrown");
} catch (IllegalArgumentException e) {
// expected
}
} else {
Assert.assertEquals("", getSoundexEncoderString("�"));
}
}
}