feat: Implement FilenameTokenizer for text extraction (#13)

rickhohler · web-flow · commit 1da20a727827 · 2025-12-29T19:22:20.000-06:00
diff --git a/Sources/DesignAlgorithmsKit/Algorithms/Text/FilenameTokenizer.swift b/Sources/DesignAlgorithmsKit/Algorithms/Text/FilenameTokenizer.swift
@@ -0,0 +1,47 @@
+import Foundation
+
+/// Splits filenames into word-like tokens for text extraction.
+public struct FilenameTokenizer {
+
+    /// Boundary patterns, applied in order. Each captures the characters on either
+    /// side of a word boundary so that a space can be inserted between them.
+    private static let boundaryPatterns = [
+        "([a-z])([A-Z])",        // lower -> upper: "fooBar" becomes "foo Bar"
+        "([A-Z])([A-Z][a-z])",   // acronym -> word: "PDFExtractor" becomes "PDF Extractor"
+        "([a-zA-Z])([0-9])",     // letter -> digit: "file123" becomes "file 123"
+        "([0-9])([a-zA-Z])"      // digit -> letter: "123file" becomes "123 file"
+    ]
+
+    /// Compiled once and reused by every call instead of being rebuilt per invocation.
+    private static let boundaryRegexes: [NSRegularExpression] = boundaryPatterns.map { pattern in
+        do {
+            return try NSRegularExpression(pattern: pattern, options: [])
+        } catch {
+            // The patterns are compile-time constants, so a failure here is a programmer error.
+            preconditionFailure("Invalid boundary pattern \(pattern): \(error)")
+        }
+    }
+
+    /// Tokenizes a filename into constituent words.
+    ///
+    /// Splits on `_`, `-`, `.` and whitespace, then at camelCase, acronym-to-word and
+    /// letter/digit transitions. Only ASCII letters and digits trigger a transition split.
+    ///
+    /// Examples:
+    /// - "PDFExtractor.swift" -> ["PDF", "Extractor", "swift"]
+    /// - "disk_ii_controller" -> ["disk", "ii", "controller"]
+    /// - "iPhone12Pro" -> ["i", "Phone", "12", "Pro"] (every lower-to-upper change splits)
+    ///
+    /// - Parameter filename: The filename (or any identifier-like string) to split.
+    /// - Returns: The non-empty tokens in order of appearance; empty if none are found.
+    public static func tokenize(_ filename: String) -> [String] {
+        // Normalize explicit separators to spaces so one whitespace split finishes the job.
+        var text = filename.components(separatedBy: CharacterSet(charactersIn: "_-.")).joined(separator: " ")
+        for regex in boundaryRegexes {
+            // NSRange is measured in UTF-16 code units, hence `utf16.count`.
+            let range = NSRange(location: 0, length: text.utf16.count)
+            text = regex.stringByReplacingMatches(in: text, options: [], range: range, withTemplate: "$1 $2")
+        }
+        return text.components(separatedBy: .whitespacesAndNewlines).filter { !$0.isEmpty }
+    }
+}
diff --git a/Tests/DesignAlgorithmsKitTests/Algorithms/Text/FilenameTokenizerTests.swift b/Tests/DesignAlgorithmsKitTests/Algorithms/Text/FilenameTokenizerTests.swift
@@ -0,0 +1,28 @@
+import XCTest
+@testable import DesignAlgorithmsKit
+
+final class FilenameTokenizerTests: XCTestCase {
+    /// Table-driven check: each filename must tokenize to exactly the listed words.
+    func testTokenization() {
+        let cases: [(String, [String])] = [
+            ("simple_file_name", ["simple", "file", "name"]),
+            ("camelCaseFile", ["camel", "Case", "File"]),
+            ("PascalCaseFile", ["Pascal", "Case", "File"]),
+            ("kebab-case-file", ["kebab", "case", "file"]),
+            ("PDFExtractor", ["PDF", "Extractor"]),
+            ("PDFExtractor.swift", ["PDF", "Extractor", "swift"]),
+            ("file123", ["file", "123"]),
+            ("123file", ["123", "file"]),
+            ("complex_FileName-123.txt", ["complex", "File", "Name", "123", "txt"]),
+            ("URLSession", ["URL", "Session"]),
+            ("HTTPClient", ["HTTP", "Client"]),
+            ("v2", ["v", "2"]),  // a single letter still splits from a digit
+            ("iPhone12Pro", ["i", "Phone", "12", "Pro"]),  // leading lowercase letter splits off
+            ("", []),  // empty input yields no tokens
+            ("_-.", [])  // separators alone yield no tokens
+        ]
+        for (input, expected) in cases {
+            XCTAssertEqual(FilenameTokenizer.tokenize(input), expected, "Failed for input: \(input)")
+        }
+    }
+}