Skip to content

Commit 045824a

Browse files
committed
fix: null guard on KB lookup and Ollama-aware token estimation in JsonYamlChunker
1 parent 08e2b24 commit 045824a

File tree

2 files changed

+34
-14
lines changed

2 files changed

+34
-14
lines changed

apps/sim/lib/chunkers/json-yaml-chunker.ts

Lines changed: 25 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,25 @@ const JSON_YAML_CHUNKING_CONFIG = {
3535
export class JsonYamlChunker {
3636
private chunkSize: number // in tokens
3737
private minCharactersPerChunk: number // in characters
38+
private readonly embeddingModel?: string
3839

3940
constructor(options: ChunkerOptions = {}) {
4041
this.chunkSize = options.chunkSize ?? JSON_YAML_CHUNKING_CONFIG.TARGET_CHUNK_SIZE
4142
this.minCharactersPerChunk =
4243
options.minCharactersPerChunk ?? JSON_YAML_CHUNKING_CONFIG.MIN_CHARACTERS_PER_CHUNK
44+
this.embeddingModel = options.embeddingModel
45+
}
46+
47+
/**
48+
* Estimate token count for a given text, adjusted for the embedding provider.
49+
* Ollama uses a conservative character-based ratio (3 chars/token).
50+
* OpenAI uses tiktoken for accurate counting.
51+
*/
52+
private getTokenEstimate(text: string): number {
53+
if (this.embeddingModel?.startsWith('ollama/')) {
54+
return Math.ceil(text.length / 3)
55+
}
56+
return getTokenCount(text)
4357
}
4458

4559
/**
@@ -103,7 +117,7 @@ export class JsonYamlChunker {
103117
}
104118

105119
const content = JSON.stringify(data, null, 2)
106-
const tokenCount = getTokenCount(content)
120+
const tokenCount = this.getTokenEstimate(content)
107121

108122
// Filter tiny fragments using character count
109123
if (content.length >= this.minCharactersPerChunk) {
@@ -133,14 +147,14 @@ export class JsonYamlChunker {
133147
for (let i = 0; i < arr.length; i++) {
134148
const item = arr[i]
135149
const itemStr = JSON.stringify(item, null, 2)
136-
const itemTokens = getTokenCount(itemStr)
150+
const itemTokens = this.getTokenEstimate(itemStr)
137151

138152
if (itemTokens > this.chunkSize) {
139153
if (currentBatch.length > 0) {
140154
const batchContent = contextHeader + JSON.stringify(currentBatch, null, 2)
141155
chunks.push({
142156
text: batchContent,
143-
tokenCount: getTokenCount(batchContent),
157+
tokenCount: this.getTokenEstimate(batchContent),
144158
metadata: {
145159
startIndex: i - currentBatch.length,
146160
endIndex: i - 1,
@@ -167,7 +181,7 @@ export class JsonYamlChunker {
167181
const batchContent = contextHeader + JSON.stringify(currentBatch, null, 2)
168182
chunks.push({
169183
text: batchContent,
170-
tokenCount: getTokenCount(batchContent),
184+
tokenCount: this.getTokenEstimate(batchContent),
171185
metadata: {
172186
startIndex: i - currentBatch.length,
173187
endIndex: i - 1,
@@ -185,7 +199,7 @@ export class JsonYamlChunker {
185199
const batchContent = contextHeader + JSON.stringify(currentBatch, null, 2)
186200
chunks.push({
187201
text: batchContent,
188-
tokenCount: getTokenCount(batchContent),
202+
tokenCount: this.getTokenEstimate(batchContent),
189203
metadata: {
190204
startIndex: arr.length - currentBatch.length,
191205
endIndex: arr.length - 1,
@@ -204,7 +218,7 @@ export class JsonYamlChunker {
204218
const entries = Object.entries(obj)
205219

206220
const fullContent = JSON.stringify(obj, null, 2)
207-
const fullTokens = getTokenCount(fullContent)
221+
const fullTokens = this.getTokenEstimate(fullContent)
208222

209223
if (fullTokens <= this.chunkSize) {
210224
chunks.push({
@@ -224,14 +238,14 @@ export class JsonYamlChunker {
224238

225239
for (const [key, value] of entries) {
226240
const valueStr = JSON.stringify({ [key]: value }, null, 2)
227-
const valueTokens = getTokenCount(valueStr)
241+
const valueTokens = this.getTokenEstimate(valueStr)
228242

229243
if (valueTokens > this.chunkSize) {
230244
if (Object.keys(currentObj).length > 0) {
231245
const objContent = JSON.stringify(currentObj, null, 2)
232246
chunks.push({
233247
text: objContent,
234-
tokenCount: getTokenCount(objContent),
248+
tokenCount: this.getTokenEstimate(objContent),
235249
metadata: {
236250
startIndex: 0,
237251
endIndex: objContent.length,
@@ -262,7 +276,7 @@ export class JsonYamlChunker {
262276
const objContent = JSON.stringify(currentObj, null, 2)
263277
chunks.push({
264278
text: objContent,
265-
tokenCount: getTokenCount(objContent),
279+
tokenCount: this.getTokenEstimate(objContent),
266280
metadata: {
267281
startIndex: 0,
268282
endIndex: objContent.length,
@@ -282,7 +296,7 @@ export class JsonYamlChunker {
282296
const objContent = JSON.stringify(currentObj, null, 2)
283297
chunks.push({
284298
text: objContent,
285-
tokenCount: getTokenCount(objContent),
299+
tokenCount: this.getTokenEstimate(objContent),
286300
metadata: {
287301
startIndex: 0,
288302
endIndex: objContent.length,
@@ -304,7 +318,7 @@ export class JsonYamlChunker {
304318
let startIndex = 0
305319

306320
for (const line of lines) {
307-
const lineTokens = getTokenCount(line)
321+
const lineTokens = this.getTokenEstimate(line)
308322

309323
if (currentTokens + lineTokens > this.chunkSize && currentChunk) {
310324
chunks.push({

apps/sim/lib/knowledge/chunks/service.ts

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,9 @@ export async function queryChunks(
5353
.where(and(eq(knowledgeBase.id, knowledgeBaseId), isNull(knowledgeBase.deletedAt)))
5454
.limit(1)
5555

56-
const { provider } = parseEmbeddingModel(kbRows[0]?.embeddingModel)
56+
if (kbRows.length === 0) throw new Error(`Knowledge base not found: ${knowledgeBaseId}`)
57+
58+
const { provider } = parseEmbeddingModel(kbRows[0].embeddingModel)
5759

5860
if (provider === 'ollama') {
5961
const { rows, total } = await queryKBChunks(knowledgeBaseId, documentId, {
@@ -359,7 +361,9 @@ export async function batchChunkOperation(
359361
.where(and(eq(knowledgeBase.id, knowledgeBaseId), isNull(knowledgeBase.deletedAt)))
360362
.limit(1)
361363

362-
const { provider } = parseEmbeddingModel(kbRows[0]?.embeddingModel)
364+
if (kbRows.length === 0) throw new Error(`Knowledge base not found: ${knowledgeBaseId}`)
365+
366+
const { provider } = parseEmbeddingModel(kbRows[0].embeddingModel)
363367
const isOllama = provider === 'ollama'
364368

365369
const errors: string[] = []
@@ -723,7 +727,9 @@ export async function deleteChunk(
723727
.where(and(eq(knowledgeBase.id, knowledgeBaseId), isNull(knowledgeBase.deletedAt)))
724728
.limit(1)
725729

726-
const { provider } = parseEmbeddingModel(kbRows[0]?.embeddingModel)
730+
if (kbRows.length === 0) throw new Error(`Knowledge base not found: ${knowledgeBaseId}`)
731+
732+
const { provider } = parseEmbeddingModel(kbRows[0].embeddingModel)
727733
const isOllama = provider === 'ollama'
728734

729735
await db.transaction(async (tx) => {

0 commit comments

Comments (0)