@@ -35,11 +35,25 @@ const JSON_YAML_CHUNKING_CONFIG = {
3535export class JsonYamlChunker {
3636 private chunkSize : number // in tokens
3737 private minCharactersPerChunk : number // in characters
38+ private readonly embeddingModel ?: string
3839
/**
 * Create a chunker from optional overrides.
 *
 * `chunkSize` and `minCharactersPerChunk` fall back to the values in
 * JSON_YAML_CHUNKING_CONFIG when they are null or undefined.
 * `embeddingModel` is stored exactly as given (it may be undefined).
 */
constructor(options: ChunkerOptions = {}) {
  const { chunkSize, minCharactersPerChunk, embeddingModel } = options
  const defaults = JSON_YAML_CHUNKING_CONFIG

  this.chunkSize = chunkSize ?? defaults.TARGET_CHUNK_SIZE
  this.minCharactersPerChunk = minCharactersPerChunk ?? defaults.MIN_CHARACTERS_PER_CHUNK
  this.embeddingModel = embeddingModel
}
46+
/**
 * Estimate how many tokens `text` occupies for the configured embedding model.
 *
 * When the model identifier starts with `ollama/`, the estimate is a
 * character-based heuristic: one token per 3 characters, rounded up.
 * For every other model (or when no model is set) the count is delegated
 * to `getTokenCount`.
 *
 * @param text - The text to measure.
 * @returns The estimated token count.
 */
private getTokenEstimate(text: string): number {
  // Optional chaining yields undefined when no model is configured; treat that as "not Ollama".
  const isOllamaModel = this.embeddingModel?.startsWith('ollama/') ?? false

  return isOllamaModel ? Math.ceil(text.length / 3) : getTokenCount(text)
}
4458
4559 /**
@@ -103,7 +117,7 @@ export class JsonYamlChunker {
103117 }
104118
105119 const content = JSON . stringify ( data , null , 2 )
106- const tokenCount = getTokenCount ( content )
120+ const tokenCount = this . getTokenEstimate ( content )
107121
108122 // Filter tiny fragments using character count
109123 if ( content . length >= this . minCharactersPerChunk ) {
@@ -133,14 +147,14 @@ export class JsonYamlChunker {
133147 for ( let i = 0 ; i < arr . length ; i ++ ) {
134148 const item = arr [ i ]
135149 const itemStr = JSON . stringify ( item , null , 2 )
136- const itemTokens = getTokenCount ( itemStr )
150+ const itemTokens = this . getTokenEstimate ( itemStr )
137151
138152 if ( itemTokens > this . chunkSize ) {
139153 if ( currentBatch . length > 0 ) {
140154 const batchContent = contextHeader + JSON . stringify ( currentBatch , null , 2 )
141155 chunks . push ( {
142156 text : batchContent ,
143- tokenCount : getTokenCount ( batchContent ) ,
157+ tokenCount : this . getTokenEstimate ( batchContent ) ,
144158 metadata : {
145159 startIndex : i - currentBatch . length ,
146160 endIndex : i - 1 ,
@@ -167,7 +181,7 @@ export class JsonYamlChunker {
167181 const batchContent = contextHeader + JSON . stringify ( currentBatch , null , 2 )
168182 chunks . push ( {
169183 text : batchContent ,
170- tokenCount : getTokenCount ( batchContent ) ,
184+ tokenCount : this . getTokenEstimate ( batchContent ) ,
171185 metadata : {
172186 startIndex : i - currentBatch . length ,
173187 endIndex : i - 1 ,
@@ -185,7 +199,7 @@ export class JsonYamlChunker {
185199 const batchContent = contextHeader + JSON . stringify ( currentBatch , null , 2 )
186200 chunks . push ( {
187201 text : batchContent ,
188- tokenCount : getTokenCount ( batchContent ) ,
202+ tokenCount : this . getTokenEstimate ( batchContent ) ,
189203 metadata : {
190204 startIndex : arr . length - currentBatch . length ,
191205 endIndex : arr . length - 1 ,
@@ -204,7 +218,7 @@ export class JsonYamlChunker {
204218 const entries = Object . entries ( obj )
205219
206220 const fullContent = JSON . stringify ( obj , null , 2 )
207- const fullTokens = getTokenCount ( fullContent )
221+ const fullTokens = this . getTokenEstimate ( fullContent )
208222
209223 if ( fullTokens <= this . chunkSize ) {
210224 chunks . push ( {
@@ -224,14 +238,14 @@ export class JsonYamlChunker {
224238
225239 for ( const [ key , value ] of entries ) {
226240 const valueStr = JSON . stringify ( { [ key ] : value } , null , 2 )
227- const valueTokens = getTokenCount ( valueStr )
241+ const valueTokens = this . getTokenEstimate ( valueStr )
228242
229243 if ( valueTokens > this . chunkSize ) {
230244 if ( Object . keys ( currentObj ) . length > 0 ) {
231245 const objContent = JSON . stringify ( currentObj , null , 2 )
232246 chunks . push ( {
233247 text : objContent ,
234- tokenCount : getTokenCount ( objContent ) ,
248+ tokenCount : this . getTokenEstimate ( objContent ) ,
235249 metadata : {
236250 startIndex : 0 ,
237251 endIndex : objContent . length ,
@@ -262,7 +276,7 @@ export class JsonYamlChunker {
262276 const objContent = JSON . stringify ( currentObj , null , 2 )
263277 chunks . push ( {
264278 text : objContent ,
265- tokenCount : getTokenCount ( objContent ) ,
279+ tokenCount : this . getTokenEstimate ( objContent ) ,
266280 metadata : {
267281 startIndex : 0 ,
268282 endIndex : objContent . length ,
@@ -282,7 +296,7 @@ export class JsonYamlChunker {
282296 const objContent = JSON . stringify ( currentObj , null , 2 )
283297 chunks . push ( {
284298 text : objContent ,
285- tokenCount : getTokenCount ( objContent ) ,
299+ tokenCount : this . getTokenEstimate ( objContent ) ,
286300 metadata : {
287301 startIndex : 0 ,
288302 endIndex : objContent . length ,
@@ -304,7 +318,7 @@ export class JsonYamlChunker {
304318 let startIndex = 0
305319
306320 for ( const line of lines ) {
307- const lineTokens = getTokenCount ( line )
321+ const lineTokens = this . getTokenEstimate ( line )
308322
309323 if ( currentTokens + lineTokens > this . chunkSize && currentChunk ) {
310324 chunks . push ( {
0 commit comments