From c25a53653dea28606e0261091a12be906b3d4886 Mon Sep 17 00:00:00 2001 From: shalu Date: Sat, 14 Feb 2026 14:28:14 +0530 Subject: [PATCH 1/4] [CODEC-317] Fix. ColognePhonetic: Duplicate code in some cases --- .../codec/language/ColognePhonetic.java | 71 ++++++++++--------- .../codec/language/ColognePhoneticTest.java | 16 +++-- 2 files changed, 47 insertions(+), 40 deletions(-) diff --git a/src/main/java/org/apache/commons/codec/language/ColognePhonetic.java b/src/main/java/org/apache/commons/codec/language/ColognePhonetic.java index aab2420f60..01efd61204 100644 --- a/src/main/java/org/apache/commons/codec/language/ColognePhonetic.java +++ b/src/main/java/org/apache/commons/codec/language/ColognePhonetic.java @@ -151,7 +151,7 @@ * * *

Example:

- * + *

* {@code "M}ü{@code ller-L}üdenscheidt" * => "MULLERLUDENSCHEIDT" => "6005507500206880022" * @@ -270,26 +270,29 @@ protected char[] copyData(final int start, final int length) { * @param code the code to store. */ public void put(final char code) { - if (code != CHAR_IGNORE && lastCode != code && (code != '0' || length == 0)) { - data[length] = code; - length++; + if (code == CHAR_IGNORE) { + return; + } + if (lastCode != code && (code != '0' || length == 0)) { + data[length++] = code; } lastCode = code; } } + // Predefined char arrays for better performance and less GC load - private static final char[] AEIJOUY = { 'A', 'E', 'I', 'J', 'O', 'U', 'Y' }; - private static final char[] CSZ = { 'C', 'S', 'Z' }; - private static final char[] FPVW = { 'F', 'P', 'V', 'W' }; - private static final char[] GKQ = { 'G', 'K', 'Q' }; - private static final char[] CKQ = { 'C', 'K', 'Q' }; - private static final char[] AHKLOQRUX = { 'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X' }; + private static final char[] AEIJOUY = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}; + private static final char[] CSZ = {'C', 'S', 'Z'}; + private static final char[] FPVW = {'F', 'P', 'V', 'W'}; + private static final char[] GKQ = {'G', 'K', 'Q'}; + private static final char[] CKQ = {'C', 'K', 'Q'}; + private static final char[] AHKLOQRUX = {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}; - private static final char[] SZ = { 'S', 'Z' }; + private static final char[] SZ = {'S', 'Z'}; - private static final char[] AHKOQUX = { 'A', 'H', 'K', 'O', 'Q', 'U', 'X' }; + private static final char[] AHKOQUX = {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}; - private static final char[] DTX = { 'D', 'T', 'X' }; + private static final char[] DTX = {'D', 'T', 'X'}; private static final char CHAR_IGNORE = '-'; // is this character to be ignored? @@ -346,7 +349,7 @@ public String colognePhonetic(final String text) { } if (chr < 'A' || chr > 'Z') { - continue; // ignore unwanted characters + continue; // ignore unwanted characters } if (arrayContains(AEIJOUY, chr)) { @@ -380,21 +383,21 @@ public String colognePhonetic(final String text) { output.put('8'); } else { switch (chr) { - case 'R': - output.put('7'); - break; - case 'L': - output.put('5'); - break; - case 'M': - case 'N': - output.put('6'); - break; - case 'H': - output.put(CHAR_IGNORE); // needed by put - break; - default: - break; + case 'R': + output.put('7'); + break; + case 'L': + output.put('5'); + break; + case 'M': + case 'N': + output.put('6'); + break; + case 'H': + output.put(CHAR_IGNORE); // needed by put + break; + default: + break; } } @@ -407,10 +410,10 @@ public String colognePhonetic(final String text) { public Object encode(final Object object) throws EncoderException { if (!(object instanceof String)) { throw new EncoderException("This method's parameter was expected to be of the type " + - String.class.getName() + - ". But actually it was of the type " + - object.getClass().getName() + - "."); + String.class.getName() + + ". But actually it was of the type " + + object.getClass().getName() + + "."); } return encode((String) object); } @@ -426,7 +429,7 @@ public String encode(final String text) { * @param text1 source text to encode before testing for equality. * @param text2 source text to encode before testing for equality. * @return {@code true} if the encoding the first string equals the encoding of the second string, {@code false} - * otherwise. + * otherwise. */ public boolean isEncodeEqual(final String text1, final String text2) { return colognePhonetic(text1).equals(colognePhonetic(text2)); diff --git a/src/test/java/org/apache/commons/codec/language/ColognePhoneticTest.java b/src/test/java/org/apache/commons/codec/language/ColognePhoneticTest.java index 2867dce36f..7873092654 100644 --- a/src/test/java/org/apache/commons/codec/language/ColognePhoneticTest.java +++ b/src/test/java/org/apache/commons/codec/language/ColognePhoneticTest.java @@ -42,7 +42,9 @@ class ColognePhoneticTest extends AbstractStringEncoderTest { private static final Set TESTSET = new HashSet<>(); - /** Character sequences to be tested by the code. */ + /** + * Character sequences to be tested by the code. + */ // @formatter:off private static final String[] MATCHES = { ".*[AEIOUJY].*", // A, E, I, J, O, U, Y @@ -133,7 +135,7 @@ void testAychlmajrCodec122() throws EncoderException { } @Test - // Ensure that override still allows tests to work + // Ensure that override still allows tests to work void testCanFail() { assertThrows(AssertionFailedError.class, () -> checkEncoding("/", "Fehler")); } @@ -182,6 +184,8 @@ void testEdgeCases() throws EncoderException { void testExamples() throws EncoderException { // @formatter:off final String[][] data = { + { "m\u00FClhler", "657" }, + { "m\u00FCleler", "6557" }, { "m\u00DCller", "657" }, // mÜller - why upper case U-umlaut? { "m\u00FCller", "657" }, // müller - add equivalent lower-case { "schmidt", "862" }, @@ -223,7 +227,7 @@ void testExamples() throws EncoderException { @Test void testHyphen() throws EncoderException { - final String[][] data = { { "bergisch-gladbach", "174845214" }, { "M\u00fcller-L\u00fcdenscheidt", "65752682" } }; // Müller-Lüdenscheidt + final String[][] data = {{"bergisch-gladbach", "174845214"}, {"M\u00fcller-L\u00fcdenscheidt", "65752682"}}; // Müller-Lüdenscheidt checkEncodings(data); } @@ -249,19 +253,19 @@ void testIsEncodeEquals() { @Test void testSpecialCharsBetweenSameLetters() throws EncoderException { - final String[] data = { "Test test", "Testtest", "Test-test", "TesT#Test", "TesT?test" }; + final String[] data = {"Test test", "Testtest", "Test-test", "TesT#Test", "TesT?test"}; checkEncodingVariations("28282", data); } @Test void testVariationsMella() throws EncoderException { - final String[] data = { "mella", "milah", "moulla", "mellah", "muehle", "mule" }; + final String[] data = {"mella", "milah", "moulla", "mellah", "muehle", "mule"}; checkEncodingVariations("65", data); } @Test void testVariationsMeyer() throws EncoderException { - final String[] data = { "Meier", "Maier", "Mair", "Meyer", "Meyr", "Mejer", "Major" }; + final String[] data = {"Meier", "Maier", "Mair", "Meyer", "Meyr", "Mejer", "Major"}; checkEncodingVariations("67", data); } } From 4c3b0c102406ff54cce89dce890f19c9f3548f78 Mon Sep 17 00:00:00 2001 From: shalu Date: Mon, 16 Feb 2026 16:32:09 +0530 Subject: [PATCH 2/4] [CODEC-317] Fix. failing tests and add more edge case --- .../commons/codec/language/ColognePhonetic.java | 10 +++++----- .../commons/codec/language/ColognePhoneticTest.java | 11 ++++++++++- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/apache/commons/codec/language/ColognePhonetic.java b/src/main/java/org/apache/commons/codec/language/ColognePhonetic.java index 01efd61204..147a863411 100644 --- a/src/main/java/org/apache/commons/codec/language/ColognePhonetic.java +++ b/src/main/java/org/apache/commons/codec/language/ColognePhonetic.java @@ -270,13 +270,13 @@ protected char[] copyData(final int start, final int length) { * @param code the code to store. */ public void put(final char code) { - if (code == CHAR_IGNORE) { - return; + if (code != CHAR_IGNORE && lastCode != code && (code != '0' || length == 0)) { + data[length] = code; + length++; } - if (lastCode != code && (code != '0' || length == 0)) { - data[length++] = code; + if(code != '-' ) { + lastCode = code; } - lastCode = code; } } diff --git a/src/test/java/org/apache/commons/codec/language/ColognePhoneticTest.java b/src/test/java/org/apache/commons/codec/language/ColognePhoneticTest.java index 7873092654..c6dabd75a7 100644 --- a/src/test/java/org/apache/commons/codec/language/ColognePhoneticTest.java +++ b/src/test/java/org/apache/commons/codec/language/ColognePhoneticTest.java @@ -156,6 +156,7 @@ void testEdgeCases() throws EncoderException { { "aa", "0" }, { "ha", "0" }, { "h", "" }, + { "hh", "" }, { "aha", "0" }, { "b", "1" }, { "p", "1" }, @@ -184,6 +185,9 @@ void testEdgeCases() throws EncoderException { void testExamples() throws EncoderException { // @formatter:off final String[][] data = { + {"aeiou", "0"}, + {"Ares", "078"}, + {"abab", "011"}, { "m\u00FClhler", "657" }, { "m\u00FCleler", "6557" }, { "m\u00DCller", "657" }, // mÜller - why upper case U-umlaut? @@ -195,6 +199,7 @@ void testExamples() throws EncoderException { { "wagner", "3467" }, { "becker", "147" }, { "hoffmann", "0366" }, + { "hoffman", "0366" }, { "sch\u00C4fer", "837" }, // schÄfer - why upper case A-umlaut ? { "sch\u00e4fer", "837" }, // schäfer - add equivalent lower-case { "Breschnew", "17863" }, @@ -219,7 +224,11 @@ void testExamples() throws EncoderException { { "Ace", "08" }, { "shch", "84" }, // CODEC-254 { "xch", "484" }, // CODEC-255 - { "heithabu", "021" } + { "heithabu", "021" }, + { "anna", "06" }, + { "bobb", "11" }, + { "muleler", "6557" }, + { "xcx", "4848" } }; // @formatter:on checkEncodings(data); From 0a8ee74f2f98f0e9cbcbf72227c01b8da4c6490f Mon Sep 17 00:00:00 2001 From: shalu Date: Mon, 16 Feb 2026 16:53:22 +0530 Subject: [PATCH 3/4] [CODEC-317] Fix. formatting --- .../codec/language/ColognePhonetic.java | 64 +++++++++---------- .../codec/language/ColognePhoneticTest.java | 20 +++--- 2 files changed, 41 insertions(+), 43 deletions(-) diff --git a/src/main/java/org/apache/commons/codec/language/ColognePhonetic.java b/src/main/java/org/apache/commons/codec/language/ColognePhonetic.java index 147a863411..54cda39ccf 100644 --- a/src/main/java/org/apache/commons/codec/language/ColognePhonetic.java +++ b/src/main/java/org/apache/commons/codec/language/ColognePhonetic.java @@ -151,7 +151,7 @@ * * *

Example:

- *

+ * * {@code "M}ü{@code ller-L}üdenscheidt" * => "MULLERLUDENSCHEIDT" => "6005507500206880022" * @@ -274,25 +274,25 @@ public void put(final char code) { data[length] = code; length++; } - if(code != '-' ) { + if (code != '-') { lastCode = code; } } } // Predefined char arrays for better performance and less GC load - private static final char[] AEIJOUY = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}; - private static final char[] CSZ = {'C', 'S', 'Z'}; - private static final char[] FPVW = {'F', 'P', 'V', 'W'}; - private static final char[] GKQ = {'G', 'K', 'Q'}; - private static final char[] CKQ = {'C', 'K', 'Q'}; - private static final char[] AHKLOQRUX = {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}; + private static final char[] AEIJOUY = { 'A', 'E', 'I', 'J', 'O', 'U', 'Y' }; + private static final char[] CSZ = { 'C', 'S', 'Z' }; + private static final char[] FPVW = { 'F', 'P', 'V', 'W' }; + private static final char[] GKQ = { 'G', 'K', 'Q' }; + private static final char[] CKQ = { 'C', 'K', 'Q' }; + private static final char[] AHKLOQRUX = { 'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X' }; - private static final char[] SZ = {'S', 'Z'}; + private static final char[] SZ = { 'S', 'Z' }; - private static final char[] AHKOQUX = {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}; + private static final char[] AHKOQUX = { 'A', 'H', 'K', 'O', 'Q', 'U', 'X' }; - private static final char[] DTX = {'D', 'T', 'X'}; + private static final char[] DTX = { 'D', 'T', 'X' }; private static final char CHAR_IGNORE = '-'; // is this character to be ignored? @@ -349,7 +349,7 @@ public String colognePhonetic(final String text) { } if (chr < 'A' || chr > 'Z') { - continue; // ignore unwanted characters + continue; // ignore unwanted characters } if (arrayContains(AEIJOUY, chr)) { @@ -383,21 +383,21 @@ public String colognePhonetic(final String text) { output.put('8'); } else { switch (chr) { - case 'R': - output.put('7'); - break; - case 'L': - output.put('5'); - break; - case 'M': - case 'N': - output.put('6'); - break; - case 'H': - output.put(CHAR_IGNORE); // needed by put - break; - default: - break; + case 'R': + output.put('7'); + break; + case 'L': + output.put('5'); + break; + case 'M': + case 'N': + output.put('6'); + break; + case 'H': + output.put(CHAR_IGNORE); // needed by put + break; + default: + break; } } @@ -410,10 +410,10 @@ public String colognePhonetic(final String text) { public Object encode(final Object object) throws EncoderException { if (!(object instanceof String)) { throw new EncoderException("This method's parameter was expected to be of the type " + - String.class.getName() + - ". But actually it was of the type " + - object.getClass().getName() + - "."); + String.class.getName() + + ". But actually it was of the type " + + object.getClass().getName() + + "."); } return encode((String) object); } @@ -429,7 +429,7 @@ public String encode(final String text) { * @param text1 source text to encode before testing for equality. * @param text2 source text to encode before testing for equality. * @return {@code true} if the encoding the first string equals the encoding of the second string, {@code false} - * otherwise. + * otherwise. */ public boolean isEncodeEqual(final String text1, final String text2) { return colognePhonetic(text1).equals(colognePhonetic(text2)); diff --git a/src/test/java/org/apache/commons/codec/language/ColognePhoneticTest.java b/src/test/java/org/apache/commons/codec/language/ColognePhoneticTest.java index c6dabd75a7..31a07001cc 100644 --- a/src/test/java/org/apache/commons/codec/language/ColognePhoneticTest.java +++ b/src/test/java/org/apache/commons/codec/language/ColognePhoneticTest.java @@ -42,9 +42,7 @@ class ColognePhoneticTest extends AbstractStringEncoderTest { private static final Set TESTSET = new HashSet<>(); - /** - * Character sequences to be tested by the code. - */ + /** Character sequences to be tested by the code. */ // @formatter:off private static final String[] MATCHES = { ".*[AEIOUJY].*", // A, E, I, J, O, U, Y @@ -135,7 +133,7 @@ void testAychlmajrCodec122() throws EncoderException { } @Test - // Ensure that override still allows tests to work + // Ensure that override still allows tests to work void testCanFail() { assertThrows(AssertionFailedError.class, () -> checkEncoding("/", "Fehler")); } @@ -185,9 +183,9 @@ void testEdgeCases() throws EncoderException { void testExamples() throws EncoderException { // @formatter:off final String[][] data = { - {"aeiou", "0"}, - {"Ares", "078"}, - {"abab", "011"}, + { "aeiou", "0" }, + { "Ares", "078" }, + { "abab", "011" }, { "m\u00FClhler", "657" }, { "m\u00FCleler", "6557" }, { "m\u00DCller", "657" }, // mÜller - why upper case U-umlaut? @@ -236,7 +234,7 @@ void testExamples() throws EncoderException { @Test void testHyphen() throws EncoderException { - final String[][] data = {{"bergisch-gladbach", "174845214"}, {"M\u00fcller-L\u00fcdenscheidt", "65752682"}}; // Müller-Lüdenscheidt + final String[][] data = { { "bergisch-gladbach", "174845214" }, { "M\u00fcller-L\u00fcdenscheidt", "65752682" } }; // Müller-Lüdenscheidt checkEncodings(data); } @@ -262,19 +260,19 @@ void testIsEncodeEquals() { @Test void testSpecialCharsBetweenSameLetters() throws EncoderException { - final String[] data = {"Test test", "Testtest", "Test-test", "TesT#Test", "TesT?test"}; + final String[] data = { "Test test", "Testtest", "Test-test", "TesT#Test", "TesT?test" }; checkEncodingVariations("28282", data); } @Test void testVariationsMella() throws EncoderException { - final String[] data = {"mella", "milah", "moulla", "mellah", "muehle", "mule"}; + final String[] data = { "mella", "milah", "moulla", "mellah", "muehle", "mule" }; checkEncodingVariations("65", data); } @Test void testVariationsMeyer() throws EncoderException { - final String[] data = {"Meier", "Maier", "Mair", "Meyer", "Meyr", "Mejer", "Major"}; + final String[] data = { "Meier", "Maier", "Mair", "Meyer", "Meyr", "Mejer", "Major" }; checkEncodingVariations("67", data); } } From 13970fcdd863f8074f89004ccfe5d15b91d1208b Mon Sep 17 00:00:00 2001 From: shalu Date: Mon, 16 Feb 2026 16:55:18 +0530 Subject: [PATCH 4/4] [CODEC-317] Remove empty lines --- .../java/org/apache/commons/codec/language/ColognePhonetic.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/java/org/apache/commons/codec/language/ColognePhonetic.java b/src/main/java/org/apache/commons/codec/language/ColognePhonetic.java index 54cda39ccf..87c075c3d3 100644 --- a/src/main/java/org/apache/commons/codec/language/ColognePhonetic.java +++ b/src/main/java/org/apache/commons/codec/language/ColognePhonetic.java @@ -279,7 +279,6 @@ public void put(final char code) { } } } - // Predefined char arrays for better performance and less GC load private static final char[] AEIJOUY = { 'A', 'E', 'I', 'J', 'O', 'U', 'Y' }; private static final char[] CSZ = { 'C', 'S', 'Z' };