Skip to content

Commit 1da20a7

Browse files
authored
feat: Implement FilenameTokenizer for text extraction (#13)
1 parent d5a9e4a commit 1da20a7

2 files changed

Lines changed: 75 additions & 0 deletions

File tree

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import Foundation
2+
3+
public struct FilenameTokenizer {

    /// Characters that act as explicit word separators inside a filename.
    private static let separators = CharacterSet(charactersIn: "_-.")

    /// Word-boundary rules, compiled once and applied in this order.
    ///
    /// Each pattern captures the character(s) on either side of a boundary so the
    /// replacement template `"$1 $2"` can put a space between them. All classes are
    /// ASCII-only, so non-ASCII letters and digits never trigger a split.
    ///
    /// A pattern that fails to compile is skipped rather than crashing, which keeps
    /// the tokenizer best-effort.
    private static let boundaryRegexes: [NSRegularExpression] = [
        // Lower followed by Upper (e.g. fooBar -> foo Bar)
        "([a-z])([A-Z])",
        // Upper followed by Upper+Lower (e.g. PDFExtractor -> PDF Extractor)
        "([A-Z])([A-Z][a-z])",
        // Letter followed by Number (e.g. file123 -> file 123)
        "([a-zA-Z])([0-9])",
        // Number followed by Letter (e.g. 123file -> 123 file)
        "([0-9])([a-zA-Z])",
    ].compactMap { try? NSRegularExpression(pattern: $0, options: []) }

    /// Tokenizes a filename into constituent words.
    ///
    /// Splits on `_`, `-`, `.` and whitespace, then on camelCase humps,
    /// acronym-to-word transitions, and letter/digit transitions. Original casing
    /// is preserved and empty tokens are dropped.
    ///
    /// Examples:
    /// - "PDFExtractor.swift" -> ["PDF", "Extractor", "swift"]
    /// - "disk_ii_controller" -> ["disk", "ii", "controller"]
    /// - "iPhone12Pro" -> ["i", "Phone", "12", "Pro"]
    ///   (a single leading lowercase letter is split off by the lower→upper rule)
    ///
    /// - Parameter filename: The filename (or any identifier-like string) to split.
    /// - Returns: The tokens in their original order; empty when `filename`
    ///   contains nothing but separators and whitespace.
    public static func tokenize(_ filename: String) -> [String] {
        // 1. Replace common separators with spaces.
        var text = filename
            .components(separatedBy: separators)
            .joined(separator: " ")

        // 2. Insert a space at every camelCase / acronym / digit boundary.
        for regex in boundaryRegexes {
            // NSRange is measured in UTF-16 code units, hence `utf16.count`.
            // Recomputed per pass because each pass may lengthen the string.
            let fullRange = NSRange(location: 0, length: text.utf16.count)
            text = regex.stringByReplacingMatches(
                in: text,
                options: [],
                range: fullRange,
                withTemplate: "$1 $2"
            )
        }

        // 3. Split on whitespace and drop the empty pieces left by adjacent separators.
        return text
            .components(separatedBy: .whitespacesAndNewlines)
            .filter { !$0.isEmpty }
    }
}
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
import XCTest
2+
@testable import DesignAlgorithmsKit
3+
4+
/// Table-driven tests for `FilenameTokenizer.tokenize(_:)`.
final class FilenameTokenizerTests: XCTestCase {

    /// Runs every `(input, expected)` pair through the tokenizer and reports
    /// mismatches at the caller's source location.
    private func assertTokenizes(
        _ cases: [(String, [String])],
        file: StaticString = #filePath,
        line: UInt = #line
    ) {
        for (input, expected) in cases {
            let result = FilenameTokenizer.tokenize(input)
            XCTAssertEqual(result, expected, "Failed for input: \(input)", file: file, line: line)
        }
    }

    /// Covers separator, camelCase, acronym and letter/digit splitting.
    func testTokenization() {
        assertTokenizes([
            ("simple_file_name", ["simple", "file", "name"]),
            ("camelCaseFile", ["camel", "Case", "File"]),
            ("PascalCaseFile", ["Pascal", "Case", "File"]),
            ("kebab-case-file", ["kebab", "case", "file"]),
            ("PDFExtractor", ["PDF", "Extractor"]),
            ("PDFExtractor.swift", ["PDF", "Extractor", "swift"]),
            ("file123", ["file", "123"]),
            ("123file", ["123", "file"]),
            ("complex_FileName-123.txt", ["complex", "File", "Name", "123", "txt"]),
            ("URLSession", ["URL", "Session"]),
            ("HTTPClient", ["HTTP", "Client"]),
            // Verify number splitting
            ("v2", ["v", "2"])
        ])
    }

    /// Input with no word characters yields no tokens.
    func testEmptyAndSeparatorOnlyInput() {
        assertTokenizes([
            ("", []),
            ("_-.", []),
            ("   ", [])
        ])
    }

    /// Leading, trailing and repeated separators never produce empty tokens.
    func testRepeatedSeparatorsAndWhitespace() {
        assertTokenizes([
            ("__a--b..c", ["a", "b", "c"]),
            ("trailing_", ["trailing"]),
            ("my file.txt", ["my", "file", "txt"])
        ])
    }

    /// Pins the behavior for brand-style names and lowercase-only segments.
    func testMixedCaseAndDigits() {
        assertTokenizes([
            ("disk_ii_controller", ["disk", "ii", "controller"]),
            // The lower→upper rule splits off a single leading lowercase letter.
            ("iPhone12Pro", ["i", "Phone", "12", "Pro"])
        ])
    }
}

0 commit comments

Comments
 (0)