EmoLanguage/normalize_dictionary.py at main · a904guy/EmoLanguage · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#!/usr/bin/env python3
"""
Script to normalize the dictionary file by applying consistent normalization rules
and providing statistics on the changes made.
"""

import re
from collections import Counter
import argparse

def normalize_word(word):
    """
    Normalize a word using consistent rules.

    Args:
        word (str): The word to normalize

    Returns:
        str: The normalized word, or None if it should be excluded
    """
    # Convert to lowercase and strip whitespace
    word = word.strip().lower()

    # Remove any non-alphabetic characters
    word = re.sub(r'[^a-z]', '', word)

    # Only exclude completely empty words
    if not word:
        return None

    return word

def normalize_dictionary(input_file, output_file=None):
    """
    Normalize all words in the dictionary file.

    Args:
        input_file (str): Path to the input dictionary file
        output_file (str): Path to the output file (if None, overwrites input)

    Returns:
        dict: Statistics about the normalization process
    """
    if output_file is None:
        output_file = input_file

    print(f"Reading dictionary from: {input_file}")

    # Read original words
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            original_words = [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        print(f"Error: File {input_file} not found!")
        return None

    print(f"Original dictionary contains: {len(original_words)} entries")

    # Track statistics
    stats = {
        'original_count': len(original_words),
        'excluded_count': 0,
        'duplicate_count': 0,
        'final_count': 0,
        'changes': []
    }

    # Normalize all words
    normalized_words = []
    excluded_words = []
    changes = []

    for original_word in original_words:
        normalized = normalize_word(original_word)

        if normalized is None:
            excluded_words.append(original_word)
            stats['excluded_count'] += 1
        elif normalized != original_word.strip().lower():
            # Word was changed during normalization
            changes.append((original_word, normalized))
            normalized_words.append(normalized)
        else:
            # Word unchanged
            normalized_words.append(normalized)

    # Remove duplicates while preserving order
    seen = set()
    unique_normalized = []
    duplicates = []

    for word in normalized_words:
        if word in seen:
            duplicates.append(word)
            stats['duplicate_count'] += 1
        else:
            seen.add(word)
            unique_normalized.append(word)

    # Sort the final list
    final_words = sorted(unique_normalized)
    stats['final_count'] = len(final_words)
    stats['changes'] = changes

    # Write normalized dictionary
    print(f"Writing normalized dictionary to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        for word in final_words:
            f.write(word + '\n')

    # Print statistics
    print("\n" + "="*60)
    print("DICTIONARY NORMALIZATION STATISTICS")
    print("="*60)
    print(f"Original entries:          {stats['original_count']:,}")
    print(f"Excluded entries:          {stats['excluded_count']:,}")
    print(f"Duplicate entries removed: {stats['duplicate_count']:,}")
    print(f"Final entries:             {stats['final_count']:,}")
    print(f"Net change:                {stats['final_count'] - stats['original_count']:+,}")
    print(f"Compression ratio:         {stats['final_count']/stats['original_count']:.1%}")

    if excluded_words:
        print(f"\nExcluded words ({len(excluded_words)}):")
        for word in excluded_words[:10]:  # Show first 10
            print(f"  '{word}'")
        if len(excluded_words) > 10:
            print(f"  ... and {len(excluded_words) - 10} more")

    if duplicates:
        duplicate_counts = Counter(duplicates)
        print(f"\nMost common duplicates:")
        for word, count in duplicate_counts.most_common(10):
            print(f"  '{word}': {count + 1} total occurrences")

    if changes:
        print(f"\nWords that were changed ({len(changes)}):")
        for original, normalized in changes[:10]:  # Show first 10
            print(f"  '{original}' -> '{normalized}'")
        if len(changes) > 10:
            print(f"  ... and {len(changes) - 10} more")

    print("="*60)
    print("Dictionary normalization completed successfully!")

    return stats

def main():
    parser = argparse.ArgumentParser(description='Normalize dictionary file')
    parser.add_argument('input_file',
                       help='Path to the input dictionary file')
    parser.add_argument('-o', '--output',
                       help='Path to the output file (default: overwrite input)')
    parser.add_argument('--dry-run', action='store_true',
                       help='Show what would be changed without writing')

    args = parser.parse_args()

    if args.dry_run:
        print("DRY RUN MODE - No files will be modified")
        output_file = None  # Don't write anything
    else:
        output_file = args.output

    stats = normalize_dictionary(args.input_file, output_file)

    if stats and args.dry_run:
        print("\nTo apply these changes, run without --dry-run flag")

if __name__ == '__main__':
    main()