diff --git a/app/MindWork AI Studio/Assistants/I18N/allTexts.lua b/app/MindWork AI Studio/Assistants/I18N/allTexts.lua
index 06009d62..791f11b8 100644
--- a/app/MindWork AI Studio/Assistants/I18N/allTexts.lua
+++ b/app/MindWork AI Studio/Assistants/I18N/allTexts.lua
@@ -1915,6 +1915,9 @@ UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3403290862"] = "The selec
-- Select a provider first
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3654197869"] = "Select a provider first"
+-- Estimated amount of tokens:
+UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T377990776"] = "Estimated amount of tokens:"
+
-- Start new chat in workspace '{0}'
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3928697643"] = "Start new chat in workspace '{0}'"
@@ -3814,6 +3817,9 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T1324664716"] = "AP
-- Create account
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T1356621346"] = "Create account"
+-- Failed to validate the selected tokenizer. Please try again.
+UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T1384494471"] = "Failed to validate the selected tokenizer. Please try again."
+
-- Please enter an embedding model name.
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T1661085403"] = "Please enter an embedding model name."
@@ -3835,9 +3841,15 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T2189814010"] = "Mo
-- (Optional) API Key
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T2331453405"] = "(Optional) API Key"
+-- Invalid tokenizer:
+UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T2448302543"] = "Invalid tokenizer:"
+
-- Add
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T2646845972"] = "Add"
+-- Selected file path for the custom tokenizer
+UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T278585345"] = "Selected file path for the custom tokenizer"
+
-- No models loaded or available.
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T2810182573"] = "No models loaded or available."
@@ -3847,6 +3859,12 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T2842060373"] = "In
-- Currently, we cannot query the embedding models for the selected provider and/or host. Therefore, please enter the model name manually.
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T290547799"] = "Currently, we cannot query the embedding models for the selected provider and/or host. Therefore, please enter the model name manually."
+-- Choose a custom tokenizer here
+UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T3787466119"] = "Choose a custom tokenizer here"
+
+-- For better embeddings and less storage usage, it's recommended to use a custom tokenizer to enable a more accurate token count.
+UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T4126312157"] = "For better embeddings and less storage usage, it's recommended to use a custom tokenizer to enable a more accurate token count."
+
-- Model selection
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T416738168"] = "Model selection"
@@ -4024,6 +4042,9 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T1324664716"] = "API Key"
-- Create account
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T1356621346"] = "Create account"
+-- Failed to validate the selected tokenizer. Please try again.
+UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T1384494471"] = "Failed to validate the selected tokenizer. Please try again."
+
-- Load models
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T15352225"] = "Load models"
@@ -4051,12 +4072,18 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T2189814010"] = "Model"
-- (Optional) API Key
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T2331453405"] = "(Optional) API Key"
+-- Invalid tokenizer:
+UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T2448302543"] = "Invalid tokenizer:"
+
-- Add
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T2646845972"] = "Add"
-- Additional API parameters
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T2728244552"] = "Additional API parameters"
+-- Selected file path for the custom tokenizer
+UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T278585345"] = "Selected file path for the custom tokenizer"
+
-- No models loaded or available.
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T2810182573"] = "No models loaded or available."
@@ -4075,6 +4102,9 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T3763891899"] = "Show availa
-- This host uses the model configured at the provider level. No model selection is available.
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T3783329915"] = "This host uses the model configured at the provider level. No model selection is available."
+-- Choose a custom tokenizer here
+UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T3787466119"] = "Choose a custom tokenizer here"
+
-- Duplicate key '{0}' found.
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T3804472591"] = "Duplicate key '{0}' found."
@@ -4096,6 +4126,9 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T900237532"] = "Provider"
-- Cancel
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T900713019"] = "Cancel"
+-- For better token estimates, you can configure a custom tokenizer for this provider.
+UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T961454300"] = "For better token estimates, you can configure a custom tokenizer for this provider."
+
-- The parameter name. It must be unique within the retrieval process.
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::RETRIEVALPROCESSDIALOG::T100726215"] = "The parameter name. It must be unique within the retrieval process."
@@ -5689,6 +5722,9 @@ UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1019424746"] = "Startup log file
-- Browse AI Studio's source code on GitHub — we welcome your contributions.
UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1107156991"] = "Browse AI Studio's source code on GitHub — we welcome your contributions."
+-- The Tokenizer library serves as the base framework for integrating the DeepSeek tokenizer.
+UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1132433749"] = "The Tokenizer library serves as the base framework for integrating the DeepSeek tokenizer."
+
-- ID mismatch: the plugin ID differs from the enterprise configuration ID.
UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1137744461"] = "ID mismatch: the plugin ID differs from the enterprise configuration ID."
@@ -5929,6 +5965,9 @@ UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T566998575"] = "This is a library
-- Used .NET SDK
UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T585329785"] = "Used .NET SDK"
+-- We use the DeepSeek Tokenizer to estimate the number of tokens an input will generate.
+UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T591393704"] = "We use the DeepSeek Tokenizer to estimate the number of tokens an input will generate."
+
-- This library is used to manage sidecar processes and to ensure that stale or zombie sidecars are detected and terminated.
UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T633932150"] = "This library is used to manage sidecar processes and to ensure that stale or zombie sidecars are detected and terminated."
diff --git a/app/MindWork AI Studio/Components/AttachDocuments.razor.cs b/app/MindWork AI Studio/Components/AttachDocuments.razor.cs
index acfc0dd2..65a901ef 100644
--- a/app/MindWork AI Studio/Components/AttachDocuments.razor.cs
+++ b/app/MindWork AI Studio/Components/AttachDocuments.razor.cs
@@ -48,6 +48,9 @@ public partial class AttachDocuments : MSGComponentBase
[Parameter]
public bool UseSmallForm { get; set; }
+ [Parameter]
+ public FileType[]? AllowedFileTypes { get; set; }
+
///
/// When true, validate media file types before attaching. Default is true. That means that
/// the user cannot attach unsupported media file types when the provider or model does not
diff --git a/app/MindWork AI Studio/Components/ChatComponent.razor b/app/MindWork AI Studio/Components/ChatComponent.razor
index 6ab7d977..1998aee6 100644
--- a/app/MindWork AI Studio/Components/ChatComponent.razor
+++ b/app/MindWork AI Studio/Components/ChatComponent.razor
@@ -34,7 +34,7 @@
-
diff --git a/app/MindWork AI Studio/Components/ChatComponent.razor.cs b/app/MindWork AI Studio/Components/ChatComponent.razor.cs
index c4b30a2f..0e01d58d 100644
--- a/app/MindWork AI Studio/Components/ChatComponent.razor.cs
+++ b/app/MindWork AI Studio/Components/ChatComponent.razor.cs
@@ -3,6 +3,7 @@
using AIStudio.Provider;
using AIStudio.Settings;
using AIStudio.Settings.DataModel;
+using AIStudio.Tools.Services;
using Microsoft.AspNetCore.Components;
using Microsoft.AspNetCore.Components.Web;
@@ -44,6 +45,8 @@ public partial class ChatComponent : MSGComponentBase, IAsyncDisposable
[Inject]
private IDialogService DialogService { get; init; } = null!;
+ [Inject]
+ private RustService RustService { get; init; } = null!;
[Inject]
private IJSRuntime JsRuntime { get; init; } = null!;
@@ -69,10 +72,12 @@ public partial class ChatComponent : MSGComponentBase, IAsyncDisposable
private Guid currentChatThreadId = Guid.Empty;
private CancellationTokenSource? cancellationTokenSource;
private HashSet chatDocumentPaths = [];
+ private string tokenCount = "0";
+ private string TokenCountMessage => $"{this.T("Estimated amount of tokens:")} {this.tokenCount}";
// Unfortunately, we need the input field reference to blur the focus away. Without
// this, we cannot clear the input field.
- private MudTextField inputField = null!;
+ private UserPromptComponent inputField = null!;
#region Overrides of ComponentBase
@@ -460,6 +465,9 @@ private async Task InputKeyEvent(KeyboardEventArgs keyEvent)
// Was a modifier key pressed as well?
var isModifier = keyEvent.AltKey || keyEvent.CtrlKey || keyEvent.MetaKey || keyEvent.ShiftKey;
+ if (isEnter)
+ await this.CalculateTokenCount();
+
// Depending on the user's settings, might react to shortcuts:
switch (this.SettingsManager.ConfigurationData.Chat.ShortcutSendBehavior)
{
@@ -596,6 +604,7 @@ private async Task SendMessage(bool reuseLastUserPrompt = false)
this.chatDocumentPaths.Clear();
await this.inputField.BlurAsync();
+ this.tokenCount = "0";
// Enable the stream state for the chat component:
this.isStreaming = true;
@@ -978,6 +987,25 @@ private Task EditLastBlock(IContent block)
return Task.CompletedTask;
}
+ private async Task CalculateTokenCount()
+ {
+ if (this.inputField.Value is null)
+ {
+ this.tokenCount = "0";
+ return;
+ }
+ var response = await this.RustService.GetTokenCount(this.inputField.Value);
+ if (response is null)
+ return;
+ if (!response.Value.Success)
+ {
+ this.Logger.LogWarning("Failed to calculate token count: {Message}", response.Value.Message);
+ return;
+ }
+ this.tokenCount = response.Value.TokenCount.ToString();
+ this.StateHasChanged();
+ }
+
#region Overrides of MSGComponentBase
protected override async Task ProcessIncomingMessage(ComponentBase? sendingComponent, Event triggeredEvent, T? data) where T : default
diff --git a/app/MindWork AI Studio/Components/SelectFile.razor b/app/MindWork AI Studio/Components/SelectFile.razor
index de3971e5..b6f7d39b 100644
--- a/app/MindWork AI Studio/Components/SelectFile.razor
+++ b/app/MindWork AI Studio/Components/SelectFile.razor
@@ -5,12 +5,16 @@
T="string"
Text="@this.File"
Label="@this.Label"
- ReadOnly="@true"
+ ReadOnly="@(!this.IsClearable)"
Validation="@this.Validation"
Adornment="Adornment.Start"
AdornmentIcon="@Icons.Material.Filled.AttachFile"
UserAttributes="@SPELLCHECK_ATTRIBUTES"
Variant="Variant.Outlined"
+ Clearable="this.IsClearable"
+ Error="@this.Error"
+ ErrorText="@this.ErrorText"
+ OnClearButtonClick="@this.OnClear"
/>
diff --git a/app/MindWork AI Studio/Components/SelectFile.razor.cs b/app/MindWork AI Studio/Components/SelectFile.razor.cs
index 91c7a667..38215b0c 100644
--- a/app/MindWork AI Studio/Components/SelectFile.razor.cs
+++ b/app/MindWork AI Studio/Components/SelectFile.razor.cs
@@ -2,6 +2,7 @@
using AIStudio.Tools.Services;
using Microsoft.AspNetCore.Components;
+using Microsoft.AspNetCore.Components.Web;
namespace AIStudio.Components;
@@ -27,7 +28,19 @@ public partial class SelectFile : MSGComponentBase
[Parameter]
public Func Validation { get; set; } = _ => null;
-
+
+ [Parameter]
+ public bool IsClearable { get; set; } = false;
+
+ [Parameter]
+ public bool Error { get; set; } = false;
+
+ [Parameter]
+ public string ErrorText { get; set; } = string.Empty;
+
+ [Parameter]
+ public Func OnClear { get; set; } = _ => Task.CompletedTask;
+
[Inject]
public RustService RustService { get; set; } = null!;
@@ -52,7 +65,7 @@ private void InternalFileChanged(string file)
this.File = file;
this.FileChanged.InvokeAsync(file);
}
-
+
private async Task OpenFileDialog()
{
var response = await this.RustService.SelectFile(this.FileDialogTitle, this.Filter, string.IsNullOrWhiteSpace(this.File) ? null : this.File);
diff --git a/app/MindWork AI Studio/Components/Settings/SettingsPanelEmbeddings.razor.cs b/app/MindWork AI Studio/Components/Settings/SettingsPanelEmbeddings.razor.cs
index 775b2ad9..8f9ad19c 100644
--- a/app/MindWork AI Studio/Components/Settings/SettingsPanelEmbeddings.razor.cs
+++ b/app/MindWork AI Studio/Components/Settings/SettingsPanelEmbeddings.razor.cs
@@ -73,6 +73,7 @@ private async Task EditEmbeddingProvider(EmbeddingProvider embeddingProvider)
{ x => x.IsSelfHosted, embeddingProvider.IsSelfHosted },
{ x => x.IsEditing, true },
{ x => x.DataHost, embeddingProvider.Host },
+ { x => x.DataTokenizerPath, embeddingProvider.TokenizerPath },
};
var dialogReference = await this.DialogService.ShowAsync(T("Edit Embedding Provider"), dialogParameters, DialogOptions.FULLSCREEN);
diff --git a/app/MindWork AI Studio/Components/Settings/SettingsPanelProviders.razor.cs b/app/MindWork AI Studio/Components/Settings/SettingsPanelProviders.razor.cs
index 500a4c2d..f4f4d9bd 100644
--- a/app/MindWork AI Studio/Components/Settings/SettingsPanelProviders.razor.cs
+++ b/app/MindWork AI Studio/Components/Settings/SettingsPanelProviders.razor.cs
@@ -73,6 +73,7 @@ private async Task EditLLMProvider(AIStudio.Settings.Provider provider)
{ x => x.DataHost, provider.Host },
{ x => x.HFInferenceProviderId, provider.HFInferenceProvider },
{ x => x.AdditionalJsonApiParameters, provider.AdditionalJsonApiParameters },
+ { x => x.DataTokenizerPath, provider.TokenizerPath },
};
var dialogReference = await this.DialogService.ShowAsync(T("Edit LLM Provider"), dialogParameters, DialogOptions.FULLSCREEN);
diff --git a/app/MindWork AI Studio/Components/UserPromptComponent.cs b/app/MindWork AI Studio/Components/UserPromptComponent.cs
new file mode 100644
index 00000000..03139a52
--- /dev/null
+++ b/app/MindWork AI Studio/Components/UserPromptComponent.cs
@@ -0,0 +1,68 @@
+using Microsoft.AspNetCore.Components;
+using Timer = System.Timers.Timer;
+
+namespace AIStudio.Components;
+
+///
+/// Debounced multi-line text input built on .
+/// Keeps the base API while adding a debounce timer.
+/// Callers can override any property as usual.
+///
+public class UserPromptComponent : MudTextField
+{
+ [Parameter]
+ public TimeSpan DebounceTime { get; set; } = TimeSpan.FromMilliseconds(800);
+
+ [Parameter]
+ public Func WhenTextChangedAsync { get; set; } = _ => Task.CompletedTask;
+
+ private readonly Timer debounceTimer = new();
+ private string text = string.Empty;
+ private string lastParameterText = string.Empty;
+ private string lastNotifiedText = string.Empty;
+ private bool isInitialized;
+
+ protected override async Task OnInitializedAsync()
+ {
+ this.text = this.Text ?? string.Empty;
+ this.lastParameterText = this.text;
+ this.lastNotifiedText = this.text;
+ this.debounceTimer.AutoReset = false;
+ this.debounceTimer.Interval = this.DebounceTime.TotalMilliseconds;
+ this.debounceTimer.Elapsed += (_, _) =>
+ {
+ this.debounceTimer.Stop();
+ if (this.text == this.lastNotifiedText)
+ return;
+
+ this.lastNotifiedText = this.text;
+ this.InvokeAsync(async () => await this.TextChanged.InvokeAsync(this.text));
+ this.InvokeAsync(async () => await this.WhenTextChangedAsync(this.text));
+ };
+
+ this.isInitialized = true;
+ await base.OnInitializedAsync();
+ }
+
+ protected override async Task OnParametersSetAsync()
+ {
+ // Ensure the timer uses the latest debouncing interval:
+ if (!this.isInitialized)
+ return;
+
+ if(Math.Abs(this.debounceTimer.Interval - this.DebounceTime.TotalMilliseconds) > 1)
+ this.debounceTimer.Interval = this.DebounceTime.TotalMilliseconds;
+
+ // Only sync when the parent's parameter actually changed since the last change:
+ if (this.Text != this.lastParameterText)
+ {
+ this.text = this.Text ?? string.Empty;
+ this.lastParameterText = this.text;
+ }
+
+ this.debounceTimer.Stop();
+ this.debounceTimer.Start();
+
+ await base.OnParametersSetAsync();
+ }
+}
diff --git a/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor b/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor
index 85e6e6ef..c23a7948 100644
--- a/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor
+++ b/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor
@@ -1,5 +1,6 @@
@using AIStudio.Provider
@using AIStudio.Provider.SelfHosted
+@using AIStudio.Tools.Rust
@inherits MSGComponentBase
@@ -7,7 +8,7 @@
@* ReSharper disable once CSharpWarnings::CS8974 *@
-
+
@foreach (LLMProviders provider in Enum.GetValues(typeof(LLMProviders)))
{
if (provider.ProvideEmbeddingAPI() || provider is LLMProviders.NONE)
@@ -22,7 +23,7 @@
@T("Create account")
-
+
@if (this.DataLLMProvider.IsAPIKeyNeeded(this.DataHost))
{
@@ -71,15 +72,14 @@
AdornmentColor="Color.Info"
Validation="@this.ValidateManuallyModel"
UserAttributes="@SPELLCHECK_ATTRIBUTES"
- HelperText="@T("Currently, we cannot query the embedding models for the selected provider and/or host. Therefore, please enter the model name manually.")"
- />
+ HelperText="@T("Currently, we cannot query the embedding models for the selected provider and/or host. Therefore, please enter the model name manually.")"/>
}
else
{
@T("Load")
- @if(this.availableModels.Count is 0)
+ @if (this.availableModels.Count is 0)
{
@T("No models loaded or available.")
@@ -122,18 +122,36 @@
AdornmentIcon="@Icons.Material.Filled.Lightbulb"
AdornmentColor="Color.Info"
Validation="@this.providerValidation.ValidatingInstanceName"
- UserAttributes="@SPELLCHECK_ATTRIBUTES"
- />
-
+ UserAttributes="@SPELLCHECK_ATTRIBUTES"/>
+ @if (this.DataModel != default){
+
+ @T("For better embeddings and less storage usage, it's recommended to use a custom tokenizer to enable a more accurate token count.")
+
+
+ }
-
+ @if (this.dataStoreWasAttempted)
+ {
+
+ }
@T("Cancel")
- @if(this.IsEditing)
+ @if (this.IsEditing)
{
@T("Update")
}
@@ -143,4 +161,4 @@
}
-
\ No newline at end of file
+
diff --git a/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs b/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs
index 6520b7ee..9e4479a7 100644
--- a/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs
+++ b/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs
@@ -1,3 +1,4 @@
+using AIStudio.Chat;
using AIStudio.Components;
using AIStudio.Provider;
using AIStudio.Settings;
@@ -5,7 +6,7 @@
using AIStudio.Tools.Validation;
using Microsoft.AspNetCore.Components;
-
+using Microsoft.AspNetCore.Components.Web;
using Host = AIStudio.Provider.SelfHosted.Host;
namespace AIStudio.Dialogs;
@@ -68,6 +69,9 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId
///
[Parameter]
public bool IsEditing { get; init; }
+
+ [Parameter]
+ public string DataTokenizerPath { get; set; } = string.Empty;
[Inject]
private RustService RustService { get; init; } = null!;
@@ -89,6 +93,11 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId
private string dataAPIKeyStorageIssue = string.Empty;
private string dataEditingPreviousInstanceName = string.Empty;
private string dataLoadingModelsIssue = string.Empty;
+ private string dataFilePath = string.Empty;
+ private string dataCustomTokenizerValidationIssue = string.Empty;
+ private Task dataTokenizerValidationTask = Task.CompletedTask;
+ private bool dataStoreWasAttempted;
+ private int dataTokenizerValidationRevision;
// We get the form reference from Blazor code to validate it manually:
private MudForm form = null!;
@@ -96,7 +105,7 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId
private readonly List availableModels = new();
private readonly Encryption encryption = Program.ENCRYPTION;
private readonly ProviderValidation providerValidation;
-
+
public EmbeddingProviderDialog()
{
this.providerValidation = new()
@@ -107,6 +116,7 @@ public EmbeddingProviderDialog()
GetUsedInstanceNames = () => this.UsedInstanceNames,
GetHost = () => this.DataHost,
IsModelProvidedManually = () => this.DataLLMProvider is LLMProviders.SELF_HOSTED && this.DataHost is Host.OLLAMA,
+ GetCustomTokenizerValidationIssue = () => this.dataCustomTokenizerValidationIssue,
};
}
@@ -136,6 +146,7 @@ private EmbeddingProvider CreateEmbeddingProviderSettings()
Host = this.DataHost,
IsEnterpriseConfiguration = false,
EnterpriseConfigurationPluginId = Guid.Empty,
+ TokenizerPath = this.dataFilePath,
};
}
@@ -152,10 +163,13 @@ protected override async Task OnInitializedAsync()
// Load the used instance names:
this.UsedInstanceNames = this.SettingsManager.ConfigurationData.EmbeddingProviders.Select(x => x.Name.ToLowerInvariant()).ToList();
+ this.Logger.LogDebug("Previous editing instance name: {InstanceName}", this.dataEditingPreviousInstanceName);
// When editing, we need to load the data:
if(this.IsEditing)
{
this.dataEditingPreviousInstanceName = this.DataName.ToLowerInvariant();
+ this.dataFilePath = this.DataTokenizerPath;
+ this.Logger.LogDebug("Previous instance name is '{InstanceName}'", this.dataEditingPreviousInstanceName);
// When using self-hosted embedding, we must copy the model name:
if (this.DataLLMProvider is LLMProviders.SELF_HOSTED)
@@ -211,6 +225,8 @@ protected override async Task OnAfterRenderAsync(bool firstRender)
private async Task Store()
{
+ this.dataStoreWasAttempted = true;
+ await this.dataTokenizerValidationTask;
await this.form.Validate();
this.dataAPIKeyStorageIssue = string.Empty;
@@ -227,6 +243,16 @@ private async Task Store()
if (!this.dataIsValid)
return;
+ var response = await this.RustService.StoreTokenizer(this.DataName, this.dataEditingPreviousInstanceName, this.dataFilePath);
+ this.Logger.LogDebug("Tokenizer store response: {Message}", response.Message);
+ if (!response.Success)
+ {
+ this.dataCustomTokenizerValidationIssue = response.Message;
+ await this.form.Validate();
+ return;
+ }
+ this.dataFilePath = response.Message;
+
// Use the data model to store the provider.
// We just return this data to the parent component:
var addedProviderSettings = this.CreateEmbeddingProviderSettings();
@@ -265,6 +291,58 @@ private async Task OnAPIKeyChanged(string apiKey)
}
}
+ private Task ClearPathTokenizer(MouseEventArgs _)
+ {
+ return this.OnDataFilePathChanged(string.Empty);
+ }
+
+ private async Task OnDataFilePathChanged(string filePath)
+ {
+ this.dataFilePath = filePath;
+ var validationRevision = ++this.dataTokenizerValidationRevision;
+ this.dataTokenizerValidationTask = this.ValidateCustomTokenizer(filePath, validationRevision);
+ await this.dataTokenizerValidationTask;
+
+ if (validationRevision != this.dataTokenizerValidationRevision)
+ return;
+
+ if (this.dataStoreWasAttempted)
+ await this.form.Validate();
+ else
+ this.form.ResetValidation();
+ }
+
+ private async Task ValidateCustomTokenizer(string filePath, int validationRevision)
+ {
+ if (string.IsNullOrWhiteSpace(filePath))
+ {
+ if (validationRevision == this.dataTokenizerValidationRevision)
+ this.dataCustomTokenizerValidationIssue = string.Empty;
+
+ return;
+ }
+
+ try
+ {
+ var response = await this.RustService.ValidateTokenizer(filePath);
+ if (validationRevision != this.dataTokenizerValidationRevision)
+ return;
+
+ if (response.Success)
+ this.dataCustomTokenizerValidationIssue = string.Empty;
+ else
+ this.dataCustomTokenizerValidationIssue = T("Invalid tokenizer:") + " " + response.Message;
+ }
+ catch (Exception e)
+ {
+ if (validationRevision != this.dataTokenizerValidationRevision)
+ return;
+
+ this.Logger.LogError(e, "Failed to validate custom tokenizer.");
+ this.dataCustomTokenizerValidationIssue = T("Failed to validate the selected tokenizer. Please try again.");
+ }
+ }
+
private void OnHostChanged(Host selectedHost)
{
// When the host changes, reset the model selection state:
@@ -307,4 +385,4 @@ private async Task ReloadModels()
};
private bool IsNoneProvider => this.DataLLMProvider is LLMProviders.NONE;
-}
\ No newline at end of file
+}
diff --git a/app/MindWork AI Studio/Dialogs/ProviderDialog.razor b/app/MindWork AI Studio/Dialogs/ProviderDialog.razor
index 4c09da2f..0e61ce5b 100644
--- a/app/MindWork AI Studio/Dialogs/ProviderDialog.razor
+++ b/app/MindWork AI Studio/Dialogs/ProviderDialog.razor
@@ -1,6 +1,7 @@
@using AIStudio.Provider
@using AIStudio.Provider.HuggingFace
@using AIStudio.Provider.SelfHosted
+@using AIStudio.Tools.Rust
@inherits MSGComponentBase
@@ -150,6 +151,24 @@
Validation="@this.providerValidation.ValidatingInstanceName"
UserAttributes="@SPELLCHECK_ATTRIBUTES"
/>
+
+ @if (this.DataLLMProvider != LLMProviders.NONE)
+ {
+
+ @T("For better token estimates, you can configure a custom tokenizer for this provider.")
+
+
+ }
diff --git a/app/MindWork AI Studio/Dialogs/ProviderDialog.razor.cs b/app/MindWork AI Studio/Dialogs/ProviderDialog.razor.cs
index 9e84bea8..fbd9a9b2 100644
--- a/app/MindWork AI Studio/Dialogs/ProviderDialog.razor.cs
+++ b/app/MindWork AI Studio/Dialogs/ProviderDialog.razor.cs
@@ -8,6 +8,7 @@
using AIStudio.Tools.Validation;
using Microsoft.AspNetCore.Components;
+using Microsoft.AspNetCore.Components.Web;
using Host = AIStudio.Provider.SelfHosted.Host;
@@ -83,6 +84,9 @@ public partial class ProviderDialog : MSGComponentBase, ISecretId
[Parameter]
public string AdditionalJsonApiParameters { get; set; } = string.Empty;
+
+ [Parameter]
+ public string DataTokenizerPath { get; set; } = string.Empty;
[Inject]
private RustService RustService { get; init; } = null!;
@@ -104,6 +108,11 @@ public partial class ProviderDialog : MSGComponentBase, ISecretId
private string dataAPIKeyStorageIssue = string.Empty;
private string dataEditingPreviousInstanceName = string.Empty;
private string dataLoadingModelsIssue = string.Empty;
+ private string dataFilePath = string.Empty;
+ private string dataCustomTokenizerValidationIssue = string.Empty;
+ private Task dataTokenizerValidationTask = Task.CompletedTask;
+ private bool dataStoreWasAttempted;
+ private int dataTokenizerValidationRevision;
private bool showExpertSettings;
// We get the form reference from Blazor code to validate it manually:
@@ -123,6 +132,7 @@ public ProviderDialog()
GetUsedInstanceNames = () => this.UsedInstanceNames,
GetHost = () => this.DataHost,
IsModelProvidedManually = () => this.DataLLMProvider.IsLLMModelProvidedManually(),
+ GetCustomTokenizerValidationIssue = () => this.dataCustomTokenizerValidationIssue,
};
}
@@ -158,6 +168,7 @@ private AIStudio.Settings.Provider CreateProviderSettings()
Host = this.DataHost,
HFInferenceProvider = this.HFInferenceProviderId,
AdditionalJsonApiParameters = this.AdditionalJsonApiParameters,
+ TokenizerPath = this.dataFilePath,
};
}
@@ -182,6 +193,7 @@ protected override async Task OnInitializedAsync()
if(this.IsEditing)
{
this.dataEditingPreviousInstanceName = this.DataInstanceName.ToLowerInvariant();
+ this.dataFilePath = this.DataTokenizerPath;
// When using Fireworks or Hugging Face, we must copy the model name:
if (this.DataLLMProvider.IsLLMModelProvidedManually())
@@ -237,6 +249,8 @@ protected override async Task OnAfterRenderAsync(bool firstRender)
private async Task Store()
{
+ this.dataStoreWasAttempted = true;
+ await this.dataTokenizerValidationTask;
await this.form.Validate();
if (!string.IsNullOrWhiteSpace(this.dataAPIKeyStorageIssue))
this.dataAPIKeyStorageIssue = string.Empty;
@@ -253,6 +267,15 @@ private async Task Store()
// When the data is not valid, we don't store it:
if (!this.dataIsValid)
return;
+
+ var tokenizerResponse = await this.RustService.StoreTokenizer(this.DataInstanceName, this.dataEditingPreviousInstanceName, this.dataFilePath);
+ if (!tokenizerResponse.Success)
+ {
+ this.dataCustomTokenizerValidationIssue = tokenizerResponse.Message;
+ await this.form.Validate();
+ return;
+ }
+ this.dataFilePath = tokenizerResponse.Message;
// Use the data model to store the provider.
// We just return this data to the parent component:
@@ -292,6 +315,58 @@ private async Task OnAPIKeyChanged(string apiKey)
}
}
+ private Task ClearPathTokenizer(MouseEventArgs _)
+ {
+ return this.OnDataFilePathChanged(string.Empty);
+ }
+
+ private async Task OnDataFilePathChanged(string filePath)
+ {
+ this.dataFilePath = filePath;
+ var validationRevision = ++this.dataTokenizerValidationRevision;
+ this.dataTokenizerValidationTask = this.ValidateCustomTokenizer(filePath, validationRevision);
+ await this.dataTokenizerValidationTask;
+
+ if (validationRevision != this.dataTokenizerValidationRevision)
+ return;
+
+ if (this.dataStoreWasAttempted)
+ await this.form.Validate();
+ else
+ this.form.ResetValidation();
+ }
+
+ private async Task ValidateCustomTokenizer(string filePath, int validationRevision)
+ {
+ if (string.IsNullOrWhiteSpace(filePath))
+ {
+ if (validationRevision == this.dataTokenizerValidationRevision)
+ this.dataCustomTokenizerValidationIssue = string.Empty;
+
+ return;
+ }
+
+ try
+ {
+ var response = await this.RustService.ValidateTokenizer(filePath);
+ if (validationRevision != this.dataTokenizerValidationRevision)
+ return;
+
+ if (response.Success)
+ this.dataCustomTokenizerValidationIssue = string.Empty;
+ else
+ this.dataCustomTokenizerValidationIssue = T("Invalid tokenizer:") + " " + response.Message;
+ }
+ catch (Exception e)
+ {
+ if (validationRevision != this.dataTokenizerValidationRevision)
+ return;
+
+ this.Logger.LogError(e, "Failed to validate custom tokenizer.");
+ this.dataCustomTokenizerValidationIssue = T("Failed to validate the selected tokenizer. Please try again.");
+ }
+ }
+
private void OnHostChanged(Host selectedHost)
{
// When the host changes, reset the model selection state:
diff --git a/app/MindWork AI Studio/Pages/Information.razor b/app/MindWork AI Studio/Pages/Information.razor
index b7b9aea4..665afad6 100644
--- a/app/MindWork AI Studio/Pages/Information.razor
+++ b/app/MindWork AI Studio/Pages/Information.razor
@@ -290,6 +290,8 @@
+
+
diff --git a/app/MindWork AI Studio/Plugins/languages/de-de-43065dbc-78d0-45b7-92be-f14c2926e2dc/plugin.lua b/app/MindWork AI Studio/Plugins/languages/de-de-43065dbc-78d0-45b7-92be-f14c2926e2dc/plugin.lua
index 75c38a6d..b4c9692e 100644
--- a/app/MindWork AI Studio/Plugins/languages/de-de-43065dbc-78d0-45b7-92be-f14c2926e2dc/plugin.lua
+++ b/app/MindWork AI Studio/Plugins/languages/de-de-43065dbc-78d0-45b7-92be-f14c2926e2dc/plugin.lua
@@ -1917,6 +1917,9 @@ UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3403290862"] = "Der ausge
-- Select a provider first
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3654197869"] = "Wähle zuerst einen Anbieter aus"
+-- Estimated amount of tokens:
+UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T377990776"] = "Geschätzte Anzahl an Tokens:"
+
-- Start new chat in workspace "{0}"
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3928697643"] = "Neuen Chat im Arbeitsbereich \"{0}\" starten"
@@ -5691,6 +5694,9 @@ UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1019424746"] = "Startprotokollda
-- Browse AI Studio's source code on GitHub — we welcome your contributions.
UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1107156991"] = "Sehen Sie sich den Quellcode von AI Studio auf GitHub an – wir freuen uns über ihre Beiträge."
+-- The Tokenizer library serves as the base framework for integrating the DeepSeek tokenizer.
+UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1132433749"] = "Die Tokenizer‑Bibliothek dient als Basis‑Framework für die Integration des DeepSeek‑Tokenizers."
+
-- ID mismatch: the plugin ID differs from the enterprise configuration ID.
UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1137744461"] = "ID-Konflikt: Die Plugin-ID stimmt nicht mit der ID der Unternehmenskonfiguration überein."
@@ -5931,6 +5937,9 @@ UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T566998575"] = "Dies ist eine Bib
-- Used .NET SDK
UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T585329785"] = "Verwendetes .NET SDK"
+-- We use the DeepSeek Tokenizer to estimate the number of tokens an input will generate.
+UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T591393704"] = "Wir verwenden den DeepSeek‑Tokenizer, um die Token‑Anzahl einer Eingabe zu schätzen."
+
-- This library is used to manage sidecar processes and to ensure that stale or zombie sidecars are detected and terminated.
UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T633932150"] = "Diese Bibliothek wird verwendet, um Sidecar-Prozesse zu verwalten und sicherzustellen, dass veraltete oder Zombie-Sidecars erkannt und beendet werden."
diff --git a/app/MindWork AI Studio/Plugins/languages/en-us-97dfb1ba-50c4-4440-8dfa-6575daf543c8/plugin.lua b/app/MindWork AI Studio/Plugins/languages/en-us-97dfb1ba-50c4-4440-8dfa-6575daf543c8/plugin.lua
index 8e7c757f..544a565b 100644
--- a/app/MindWork AI Studio/Plugins/languages/en-us-97dfb1ba-50c4-4440-8dfa-6575daf543c8/plugin.lua
+++ b/app/MindWork AI Studio/Plugins/languages/en-us-97dfb1ba-50c4-4440-8dfa-6575daf543c8/plugin.lua
@@ -1917,6 +1917,9 @@ UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3403290862"] = "The selec
-- Select a provider first
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3654197869"] = "Select a provider first"
+-- Estimated amount of tokens:
+UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T377990776"] = "Estimated amount of tokens:"
+
-- Start new chat in workspace "{0}"
UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3928697643"] = "Start new chat in workspace \"{0}\""
@@ -5691,6 +5694,9 @@ UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1019424746"] = "Startup log file
-- Browse AI Studio's source code on GitHub — we welcome your contributions.
UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1107156991"] = "Browse AI Studio's source code on GitHub — we welcome your contributions."
+-- The Tokenizer library serves as the base framework for integrating the DeepSeek tokenizer.
+UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1132433749"] = "The Tokenizer library serves as the base framework for integrating the DeepSeek tokenizer."
+
-- ID mismatch: the plugin ID differs from the enterprise configuration ID.
UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1137744461"] = "ID mismatch: the plugin ID differs from the enterprise configuration ID."
@@ -5931,6 +5937,9 @@ UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T566998575"] = "This is a library
-- Used .NET SDK
UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T585329785"] = "Used .NET SDK"
+-- We use the DeepSeek Tokenizer to estimate the number of tokens an input will generate.
+UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T591393704"] = "We use the DeepSeek Tokenizer to estimate the number of tokens an input will generate."
+
-- This library is used to manage sidecar processes and to ensure that stale or zombie sidecars are detected and terminated.
UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T633932150"] = "This library is used to manage sidecar processes and to ensure that stale or zombie sidecars are detected and terminated."
diff --git a/app/MindWork AI Studio/Provider/BaseProvider.cs b/app/MindWork AI Studio/Provider/BaseProvider.cs
index 9b729824..28179223 100644
--- a/app/MindWork AI Studio/Provider/BaseProvider.cs
+++ b/app/MindWork AI Studio/Provider/BaseProvider.cs
@@ -90,6 +90,9 @@ protected BaseProvider(LLMProviders provider, string url, ILogger logger)
///
public string AdditionalJsonApiParameters { get; init; } = string.Empty;
+ ///
+ public string TokenizerPath { get; init; } = string.Empty;
+
///
public abstract IAsyncEnumerable StreamChatCompletion(Model chatModel, ChatThread chatThread, SettingsManager settingsManager, CancellationToken token = default);
diff --git a/app/MindWork AI Studio/Provider/IProvider.cs b/app/MindWork AI Studio/Provider/IProvider.cs
index ef15dd21..e0842f2e 100644
--- a/app/MindWork AI Studio/Provider/IProvider.cs
+++ b/app/MindWork AI Studio/Provider/IProvider.cs
@@ -28,6 +28,11 @@ public interface IProvider
/// The additional API parameters.
///
public string AdditionalJsonApiParameters { get; }
+
+ ///
+ /// The tokenizer path associated with this provider configuration.
+ ///
+ public string TokenizerPath { get; }
///
/// Starts a chat completion stream.
@@ -101,4 +106,4 @@ public interface IProvider
/// >The cancellation token.
/// >The list of transcription models.
public Task> GetTranscriptionModels(string? apiKeyProvisional = null, CancellationToken token = default);
-}
\ No newline at end of file
+}
diff --git a/app/MindWork AI Studio/Provider/LLMProvidersExtensions.cs b/app/MindWork AI Studio/Provider/LLMProvidersExtensions.cs
index e71cef95..f04d9af4 100644
--- a/app/MindWork AI Studio/Provider/LLMProvidersExtensions.cs
+++ b/app/MindWork AI Studio/Provider/LLMProvidersExtensions.cs
@@ -186,7 +186,7 @@ public static class LLMProvidersExtensions
/// The provider instance.
public static IProvider CreateProvider(this AIStudio.Settings.Provider providerSettings)
{
- return providerSettings.UsedLLMProvider.CreateProvider(providerSettings.InstanceName, providerSettings.Host, providerSettings.Hostname, providerSettings.Model, providerSettings.HFInferenceProvider, providerSettings.AdditionalJsonApiParameters, providerSettings.IsEnterpriseConfiguration);
+ return providerSettings.UsedLLMProvider.CreateProvider(providerSettings.InstanceName, providerSettings.Host, providerSettings.Hostname, providerSettings.Model, providerSettings.HFInferenceProvider, providerSettings.TokenizerPath, providerSettings.AdditionalJsonApiParameters, providerSettings.IsEnterpriseConfiguration);
}
///
@@ -196,7 +196,7 @@ public static IProvider CreateProvider(this AIStudio.Settings.Provider providerS
/// The provider instance.
public static IProvider CreateProvider(this EmbeddingProvider embeddingProviderSettings)
{
- return embeddingProviderSettings.UsedLLMProvider.CreateProvider(embeddingProviderSettings.Name, embeddingProviderSettings.Host, embeddingProviderSettings.Hostname, embeddingProviderSettings.Model, HFInferenceProvider.NONE, isEnterpriseConfiguration: embeddingProviderSettings.IsEnterpriseConfiguration);
+ return embeddingProviderSettings.UsedLLMProvider.CreateProvider(embeddingProviderSettings.Name, embeddingProviderSettings.Host, embeddingProviderSettings.Hostname, embeddingProviderSettings.Model, HFInferenceProvider.NONE, embeddingProviderSettings.TokenizerPath, isEnterpriseConfiguration: embeddingProviderSettings.IsEnterpriseConfiguration);
}
///
@@ -206,33 +206,33 @@ public static IProvider CreateProvider(this EmbeddingProvider embeddingProviderS
/// The provider instance.
public static IProvider CreateProvider(this TranscriptionProvider transcriptionProviderSettings)
{
- return transcriptionProviderSettings.UsedLLMProvider.CreateProvider(transcriptionProviderSettings.Name, transcriptionProviderSettings.Host, transcriptionProviderSettings.Hostname, transcriptionProviderSettings.Model, HFInferenceProvider.NONE, isEnterpriseConfiguration: transcriptionProviderSettings.IsEnterpriseConfiguration);
+ return transcriptionProviderSettings.UsedLLMProvider.CreateProvider(transcriptionProviderSettings.Name, transcriptionProviderSettings.Host, transcriptionProviderSettings.Hostname, transcriptionProviderSettings.Model, HFInferenceProvider.NONE, string.Empty, isEnterpriseConfiguration: transcriptionProviderSettings.IsEnterpriseConfiguration);
}
- private static IProvider CreateProvider(this LLMProviders provider, string instanceName, Host host, string hostname, Model model, HFInferenceProvider inferenceProvider, string expertProviderApiParameter = "", bool isEnterpriseConfiguration = false)
+ private static IProvider CreateProvider(this LLMProviders provider, string instanceName, Host host, string hostname, Model model, HFInferenceProvider inferenceProvider, string tokenizerPath = "", string expertProviderApiParameter = "", bool isEnterpriseConfiguration = false)
{
try
{
return provider switch
{
- LLMProviders.OPEN_AI => new ProviderOpenAI { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
- LLMProviders.ANTHROPIC => new ProviderAnthropic { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
- LLMProviders.MISTRAL => new ProviderMistral { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
- LLMProviders.GOOGLE => new ProviderGoogle { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
- LLMProviders.X => new ProviderX { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
- LLMProviders.DEEP_SEEK => new ProviderDeepSeek { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
- LLMProviders.ALIBABA_CLOUD => new ProviderAlibabaCloud { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
- LLMProviders.PERPLEXITY => new ProviderPerplexity { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
- LLMProviders.OPEN_ROUTER => new ProviderOpenRouter { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+ LLMProviders.OPEN_AI => new ProviderOpenAI { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+ LLMProviders.ANTHROPIC => new ProviderAnthropic { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+ LLMProviders.MISTRAL => new ProviderMistral { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+ LLMProviders.GOOGLE => new ProviderGoogle { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+ LLMProviders.X => new ProviderX { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+ LLMProviders.DEEP_SEEK => new ProviderDeepSeek { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+ LLMProviders.ALIBABA_CLOUD => new ProviderAlibabaCloud { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+ LLMProviders.PERPLEXITY => new ProviderPerplexity { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+ LLMProviders.OPEN_ROUTER => new ProviderOpenRouter { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
- LLMProviders.GROQ => new ProviderGroq { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
- LLMProviders.FIREWORKS => new ProviderFireworks { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
- LLMProviders.HUGGINGFACE => new ProviderHuggingFace(inferenceProvider, model) { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+ LLMProviders.GROQ => new ProviderGroq { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+ LLMProviders.FIREWORKS => new ProviderFireworks { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+ LLMProviders.HUGGINGFACE => new ProviderHuggingFace(inferenceProvider, model) { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
- LLMProviders.SELF_HOSTED => new ProviderSelfHosted(host, hostname) { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+ LLMProviders.SELF_HOSTED => new ProviderSelfHosted(host, hostname) { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
- LLMProviders.HELMHOLTZ => new ProviderHelmholtz { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
- LLMProviders.GWDG => new ProviderGWDG { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+ LLMProviders.HELMHOLTZ => new ProviderHelmholtz { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+ LLMProviders.GWDG => new ProviderGWDG { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
_ => new NoProvider(),
};
@@ -442,4 +442,4 @@ public static bool CanLoadModels(this LLMProviders provider, Host host, string?
LLMProviders.HUGGINGFACE => true,
_ => false,
};
-}
\ No newline at end of file
+}
diff --git a/app/MindWork AI Studio/Provider/NoProvider.cs b/app/MindWork AI Studio/Provider/NoProvider.cs
index 3fc8459c..9128ad47 100644
--- a/app/MindWork AI Studio/Provider/NoProvider.cs
+++ b/app/MindWork AI Studio/Provider/NoProvider.cs
@@ -18,6 +18,9 @@ public class NoProvider : IProvider
///
public string AdditionalJsonApiParameters { get; init; } = string.Empty;
+ ///
+ public string TokenizerPath { get; init; } = string.Empty;
+
public Task> GetTextModels(string? apiKeyProvisional = null, CancellationToken token = default) => Task.FromResult>([]);
public Task> GetImageModels(string? apiKeyProvisional = null, CancellationToken token = default) => Task.FromResult>([]);
@@ -45,4 +48,4 @@ public async IAsyncEnumerable StreamImageCompletion(Model imageModel,
public IReadOnlyCollection GetModelCapabilities(Model model) => [ Capability.NONE ];
#endregion
-}
\ No newline at end of file
+}
diff --git a/app/MindWork AI Studio/Settings/EmbeddingProvider.cs b/app/MindWork AI Studio/Settings/EmbeddingProvider.cs
index d5a6f20a..0f72c6c7 100644
--- a/app/MindWork AI Studio/Settings/EmbeddingProvider.cs
+++ b/app/MindWork AI Studio/Settings/EmbeddingProvider.cs
@@ -19,7 +19,8 @@ public sealed record EmbeddingProvider(
bool IsEnterpriseConfiguration = false,
Guid EnterpriseConfigurationPluginId = default,
string Hostname = "http://localhost:1234",
- Host Host = Host.NONE) : ConfigurationBaseObject, ISecretId
+ Host Host = Host.NONE,
+ string TokenizerPath = "") : ConfigurationBaseObject, ISecretId
{
private static readonly ILogger LOGGER = Program.LOGGER_FACTORY.CreateLogger();
@@ -96,6 +97,13 @@ public static bool TryParseEmbeddingProviderTable(int idx, LuaTable table, Guid
return false;
}
+ var tokenizerPath = string.Empty;
+ if (table.TryGetValue("TokenizerPath", out var tokenizerPathValue) && !tokenizerPathValue.TryRead(out tokenizerPath))
+ {
+ LOGGER.LogWarning($"The configured embedding provider {idx} does not contain a valid tokenizer path. (Plugin ID: {configPluginId})");
+ tokenizerPath = string.Empty;
+ }
+
provider = new EmbeddingProvider
{
Num = 0, // will be set later by the PluginConfigurationObject
@@ -108,6 +116,7 @@ public static bool TryParseEmbeddingProviderTable(int idx, LuaTable table, Guid
EnterpriseConfigurationPluginId = configPluginId,
Hostname = hostname,
Host = host,
+ TokenizerPath = tokenizerPath,
};
// Handle encrypted API key if present:
diff --git a/app/MindWork AI Studio/Settings/Provider.cs b/app/MindWork AI Studio/Settings/Provider.cs
index 0ccf272c..c8276bcd 100644
--- a/app/MindWork AI Studio/Settings/Provider.cs
+++ b/app/MindWork AI Studio/Settings/Provider.cs
@@ -32,7 +32,8 @@ public sealed record Provider(
string Hostname = "http://localhost:1234",
Host Host = Host.NONE,
HFInferenceProvider HFInferenceProvider = HFInferenceProvider.NONE,
- string AdditionalJsonApiParameters = "") : ConfigurationBaseObject, ISecretId
+ string AdditionalJsonApiParameters = "",
+ string TokenizerPath = "") : ConfigurationBaseObject, ISecretId
{
private static readonly ILogger LOGGER = Program.LOGGER_FACTORY.CreateLogger();
@@ -151,6 +152,13 @@ public static bool TryParseProviderTable(int idx, LuaTable table, Guid configPlu
additionalJsonApiParameters = string.Empty;
}
+ var tokenizerPath = string.Empty;
+ if (table.TryGetValue("TokenizerPath", out var tokenizerPathValue) && !tokenizerPathValue.TryRead(out tokenizerPath))
+ {
+ LOGGER.LogWarning($"The configured provider {idx} does not contain a valid tokenizer path. (Plugin ID: {configPluginId})");
+ tokenizerPath = string.Empty;
+ }
+
provider = new Provider
{
Num = 0, // will be set later by the PluginConfigurationObject
@@ -165,6 +173,7 @@ public static bool TryParseProviderTable(int idx, LuaTable table, Guid configPlu
Host = host,
HFInferenceProvider = hfInferenceProvider,
AdditionalJsonApiParameters = additionalJsonApiParameters,
+ TokenizerPath = tokenizerPath,
};
// Handle encrypted API key if present:
diff --git a/app/MindWork AI Studio/Tools/Rust/FileType.cs b/app/MindWork AI Studio/Tools/Rust/FileType.cs
new file mode 100644
index 00000000..c333a691
--- /dev/null
+++ b/app/MindWork AI Studio/Tools/Rust/FileType.cs
@@ -0,0 +1,41 @@
+namespace AIStudio.Tools.Rust;
+
+///
+/// Represents a file type that can optionally contain child file types.
+/// Use the static helpers , and to build readable trees.
+///
+/// Display name of the type (e.g., "Document").
+/// File extensions belonging to this type (without dot).
+/// Nested file types that are included when this type is selected.
+public sealed record FileType(string FilterName, string[] FilterExtensions, IReadOnlyList Children)
+{
+ ///
+ /// Factory for a leaf node.
+ /// Example: FileType.Leaf(".NET", "cs", "razor")
+ ///
+ public static FileType Leaf(string name, params string[] extensions) =>
+ new(name, extensions, []);
+
+ ///
+ /// Factory for a parent node that only has children.
+ /// Example: FileType.Parent("Source Code", dotnet, java)
+ ///
+ public static FileType Parent(string name, params FileType[]? children) =>
+ new(name, [], children ?? []);
+
+ ///
+ /// Factory for a composite node that has its own extensions in addition to children.
+ ///
+ public static FileType Composite(string name, string[] extensions, params FileType[] children) =>
+ new(name, extensions, children);
+
+ ///
+ /// Collects all extensions for this type, including children.
+ ///
+ public IEnumerable FlattenExtensions()
+ {
+ return this.FilterExtensions
+ .Concat(this.Children.SelectMany(child => child.FlattenExtensions()))
+ .Distinct(StringComparer.OrdinalIgnoreCase);
+ }
+}
\ No newline at end of file
diff --git a/app/MindWork AI Studio/Tools/Rust/FileTypes.cs b/app/MindWork AI Studio/Tools/Rust/FileTypes.cs
index 87a551b2..4a02608e 100644
--- a/app/MindWork AI Studio/Tools/Rust/FileTypes.cs
+++ b/app/MindWork AI Studio/Tools/Rust/FileTypes.cs
@@ -127,4 +127,4 @@ public static bool IsAllowedPath(string filePath, params FileTypeFilter[]? types
return false;
}
-}
\ No newline at end of file
+}
diff --git a/app/MindWork AI Studio/Tools/Rust/TokenizerHandlingResponse.cs b/app/MindWork AI Studio/Tools/Rust/TokenizerHandlingResponse.cs
new file mode 100644
index 00000000..4323f76f
--- /dev/null
+++ b/app/MindWork AI Studio/Tools/Rust/TokenizerHandlingResponse.cs
@@ -0,0 +1,3 @@
+namespace AIStudio.Tools.Rust;
+
+public readonly record struct TokenizerHandlingResponse(int Success, string Response);
\ No newline at end of file
diff --git a/app/MindWork AI Studio/Tools/Rust/TokenizerResponse.cs b/app/MindWork AI Studio/Tools/Rust/TokenizerResponse.cs
new file mode 100644
index 00000000..54f0b61c
--- /dev/null
+++ b/app/MindWork AI Studio/Tools/Rust/TokenizerResponse.cs
@@ -0,0 +1,3 @@
+namespace AIStudio.Tools.Rust;
+
+public readonly record struct TokenizerResponse(bool Success, int TokenCount, string Message);
diff --git a/app/MindWork AI Studio/Tools/Services/RustService.Tokenizer.cs b/app/MindWork AI Studio/Tools/Services/RustService.Tokenizer.cs
new file mode 100644
index 00000000..d7976198
--- /dev/null
+++ b/app/MindWork AI Studio/Tools/Services/RustService.Tokenizer.cs
@@ -0,0 +1,69 @@
+using AIStudio.Tools.Rust;
+
+namespace AIStudio.Tools.Services;
+
+public sealed partial class RustService
+{
+ public async Task ValidateTokenizer(string filePath)
+ {
+ var result = await this.http.PostAsJsonAsync("/tokenizer/validate", new {
+ file_path = filePath,
+ }, this.jsonRustSerializerOptions);
+
+ if (!result.IsSuccessStatusCode)
+ {
+ this.logger!.LogError($"Failed to validate the tokenizer '{result.StatusCode}'");
+ return new TokenizerResponse
+ {
+ Success = false,
+ Message = "An error occurred while sending the path to the Rust framework for validation: "+result.StatusCode,
+ TokenCount = 0
+ };
+ }
+
+ return await result.Content.ReadFromJsonAsync(this.jsonRustSerializerOptions);
+ }
+
+ public async Task StoreTokenizer(string modelId, string previousmodelId, string filePath)
+ {
+ Console.WriteLine($"Storing tokenizer for model '{modelId}' with previous model '{previousmodelId}' from file '{filePath}'");
+ var result = await this.http.PostAsJsonAsync("/tokenizer/store", new {
+ model_id = modelId,
+ previous_model_id = previousmodelId,
+ file_path = filePath,
+ }, this.jsonRustSerializerOptions);
+
+ if (!result.IsSuccessStatusCode)
+ {
+ this.logger!.LogError($"Failed to store the tokenizer '{result.StatusCode}'");
+ return new TokenizerResponse{
+ Success = false,
+ Message = "An error occurred while sending the path to the Rust framework for storing: "+result.StatusCode,
+ TokenCount = 0
+ };
+ }
+
+ return await result.Content.ReadFromJsonAsync(this.jsonRustSerializerOptions);
+ }
+
+ public async Task GetTokenCount(string text)
+ {
+ try
+ {
+ var cts = new CancellationTokenSource(TimeSpan.FromSeconds(5));
+ var payload = new { text };
+ var response = await this.http.PostAsJsonAsync("/tokenizer/count", payload, this.jsonRustSerializerOptions, cts.Token);
+ response.EnsureSuccessStatusCode();
+ return await response.Content.ReadFromJsonAsync(this.jsonRustSerializerOptions, cancellationToken: cts.Token);
+ }
+ catch (Exception e)
+ {
+ if(this.logger is not null)
+ this.logger.LogError(e, "Error while getting token count from Rust service.");
+ else
+ Console.WriteLine($"Error while getting token count from Rust service: '{e}'.");
+
+ return null;
+ }
+ }
+}
\ No newline at end of file
diff --git a/app/MindWork AI Studio/Tools/Validation/ProviderValidation.cs b/app/MindWork AI Studio/Tools/Validation/ProviderValidation.cs
index bb72feb4..595eb23e 100644
--- a/app/MindWork AI Studio/Tools/Validation/ProviderValidation.cs
+++ b/app/MindWork AI Studio/Tools/Validation/ProviderValidation.cs
@@ -22,6 +22,8 @@ public sealed class ProviderValidation
public Func IsModelProvidedManually { get; init; } = () => false;
+ public Func GetCustomTokenizerValidationIssue { get; init; } = () => string.Empty;
+
public string? ValidatingHostname(string hostname)
{
if(this.GetProvider() != LLMProviders.SELF_HOSTED)
@@ -120,4 +122,13 @@ public sealed class ProviderValidation
return null;
}
-}
\ No newline at end of file
+
+ public string? ValidatingCustomTokenizer(string _)
+ {
+ var issue = this.GetCustomTokenizerValidationIssue();
+ if (string.IsNullOrWhiteSpace(issue))
+ return null;
+
+ return issue;
+ }
+}
diff --git a/runtime/Cargo.toml b/runtime/Cargo.toml
index 0fb62f1a..4b41800c 100644
--- a/runtime/Cargo.toml
+++ b/runtime/Cargo.toml
@@ -41,6 +41,7 @@ pptx-to-md = "0.4.0"
tempfile = "3.27.0"
strum_macros = "0.28.0"
sysinfo = "0.38.4"
+tokenizers = "0.22.2"
# Fixes security vulnerability downstream, where the upstream is not fixed yet:
time = "0.3.47" # -> Rocket
diff --git a/runtime/src/app_window.rs b/runtime/src/app_window.rs
index 0066cfae..223047e9 100644
--- a/runtime/src/app_window.rs
+++ b/runtime/src/app_window.rs
@@ -11,7 +11,6 @@ use serde::Deserialize;
use strum_macros::Display;
use tauri::updater::UpdateResponse;
use tauri::{FileDropEvent, GlobalShortcutManager, UpdaterEvent, RunEvent, Manager, PathResolver, Window, WindowEvent, generate_context};
-use tauri::api::dialog::blocking::FileDialogBuilder;
use tokio::sync::broadcast;
use tokio::time;
use crate::api_token::APIToken;
@@ -474,241 +473,6 @@ pub async fn install_update(_token: APIToken) {
}
}
-/// Let the user select a directory.
-#[post("/select/directory?", data = "")]
-pub fn select_directory(_token: APIToken, title: &str, previous_directory: Option>) -> Json {
- let folder_path = match previous_directory {
- Some(previous) => {
- let previous_path = previous.path.as_str();
- FileDialogBuilder::new()
- .set_title(title)
- .set_directory(previous_path)
- .pick_folder()
- },
-
- None => {
- FileDialogBuilder::new()
- .set_title(title)
- .pick_folder()
- },
- };
-
- match folder_path {
- Some(path) => {
- info!("User selected directory: {path:?}");
- Json(DirectorySelectionResponse {
- user_cancelled: false,
- selected_directory: path.to_str().unwrap().to_string(),
- })
- },
-
- None => {
- info!("User cancelled directory selection.");
- Json(DirectorySelectionResponse {
- user_cancelled: true,
- selected_directory: String::from(""),
- })
- },
- }
-}
-
-#[derive(Clone, Deserialize)]
-pub struct PreviousDirectory {
- path: String,
-}
-
-#[derive(Clone, Deserialize)]
-pub struct FileTypeFilter {
- filter_name: String,
- filter_extensions: Vec,
-}
-
-#[derive(Clone, Deserialize)]
-pub struct SelectFileOptions {
- title: String,
- previous_file: Option,
- filter: Option,
-}
-
-#[derive(Clone, Deserialize)]
-pub struct SaveFileOptions {
- title: String,
- name_file: Option,
- filter: Option,
-}
-
-#[derive(Serialize)]
-pub struct DirectorySelectionResponse {
- user_cancelled: bool,
- selected_directory: String,
-}
-
-/// Let the user select a file.
-#[post("/select/file", data = "")]
-pub fn select_file(_token: APIToken, payload: Json) -> Json {
-
- // Create a new file dialog builder:
- let file_dialog = FileDialogBuilder::new();
-
- // Set the title of the file dialog:
- let file_dialog = file_dialog.set_title(&payload.title);
-
- // Set the file type filter if provided:
- let file_dialog = apply_filter(file_dialog, &payload.filter);
-
- // Set the previous file path if provided:
- let file_dialog = match &payload.previous_file {
- Some(previous) => {
- let previous_path = previous.file_path.as_str();
- file_dialog.set_directory(previous_path)
- },
-
- None => file_dialog,
- };
-
- // Show the file dialog and get the selected file path:
- let file_path = file_dialog.pick_file();
- match file_path {
- Some(path) => {
- info!("User selected file: {path:?}");
- Json(FileSelectionResponse {
- user_cancelled: false,
- selected_file_path: path.to_str().unwrap().to_string(),
- })
- },
-
- None => {
- info!("User cancelled file selection.");
- Json(FileSelectionResponse {
- user_cancelled: true,
- selected_file_path: String::from(""),
- })
- },
- }
-}
-
-/// Let the user select some files.
-#[post("/select/files", data = "")]
-pub fn select_files(_token: APIToken, payload: Json) -> Json {
-
- // Create a new file dialog builder:
- let file_dialog = FileDialogBuilder::new();
-
- // Set the title of the file dialog:
- let file_dialog = file_dialog.set_title(&payload.title);
-
- // Set the file type filter if provided:
- let file_dialog = apply_filter(file_dialog, &payload.filter);
-
- // Set the previous file path if provided:
- let file_dialog = match &payload.previous_file {
- Some(previous) => {
- let previous_path = previous.file_path.as_str();
- file_dialog.set_directory(previous_path)
- },
-
- None => file_dialog,
- };
-
- // Show the file dialog and get the selected file path:
- let file_paths = file_dialog.pick_files();
- match file_paths {
- Some(paths) => {
- info!("User selected {} files.", paths.len());
- Json(FilesSelectionResponse {
- user_cancelled: false,
- selected_file_paths: paths.iter().map(|p| p.to_str().unwrap().to_string()).collect(),
- })
- }
-
- None => {
- info!("User cancelled file selection.");
- Json(FilesSelectionResponse {
- user_cancelled: true,
- selected_file_paths: Vec::new(),
- })
- },
- }
-}
-
-#[post("/save/file", data = "")]
-pub fn save_file(_token: APIToken, payload: Json) -> Json {
-
- // Create a new file dialog builder:
- let file_dialog = FileDialogBuilder::new();
-
- // Set the title of the file dialog:
- let file_dialog = file_dialog.set_title(&payload.title);
-
- // Set the file type filter if provided:
- let file_dialog = apply_filter(file_dialog, &payload.filter);
-
- // Set the previous file path if provided:
- let file_dialog = match &payload.name_file {
- Some(previous) => {
- let previous_path = previous.file_path.as_str();
- file_dialog.set_directory(previous_path)
- },
-
- None => file_dialog,
- };
-
- // Displays the file dialogue box and select the file:
- let file_path = file_dialog.save_file();
- match file_path {
- Some(path) => {
- info!("User selected file for writing operation: {path:?}");
- Json(FileSaveResponse {
- user_cancelled: false,
- save_file_path: path.to_str().unwrap().to_string(),
- })
- },
-
- None => {
- info!("User cancelled file selection.");
- Json(FileSaveResponse {
- user_cancelled: true,
- save_file_path: String::from(""),
- })
- },
- }
-}
-
-#[derive(Clone, Deserialize)]
-pub struct PreviousFile {
- file_path: String,
-}
-
-/// Applies an optional file type filter to a FileDialogBuilder.
-fn apply_filter(file_dialog: FileDialogBuilder, filter: &Option) -> FileDialogBuilder {
- match filter {
- Some(f) => file_dialog.add_filter(
- &f.filter_name,
- &f.filter_extensions.iter().map(|s| s.as_str()).collect::>(),
- ),
-
- None => file_dialog,
- }
-}
-
-#[derive(Serialize)]
-pub struct FileSelectionResponse {
- user_cancelled: bool,
- selected_file_path: String,
-}
-
-#[derive(Serialize)]
-pub struct FilesSelectionResponse {
- user_cancelled: bool,
- selected_file_paths: Vec,
-}
-
-#[derive(Serialize)]
-pub struct FileSaveResponse {
- user_cancelled: bool,
- save_file_path: String,
-}
-
/// Request payload for registering a global shortcut.
#[derive(Clone, Deserialize)]
pub struct RegisterShortcutRequest {
diff --git a/runtime/src/file_actions.rs b/runtime/src/file_actions.rs
new file mode 100644
index 00000000..333190c2
--- /dev/null
+++ b/runtime/src/file_actions.rs
@@ -0,0 +1,241 @@
+use log::info;
+use rocket::post;
+use rocket::serde::{Deserialize, Serialize};
+use rocket::serde::json::Json;
+use tauri::api::dialog::blocking::FileDialogBuilder;
+use crate::api_token::APIToken;
+
+#[derive(Clone, Deserialize)]
+pub struct PreviousDirectory {
+ path: String,
+}
+
+#[derive(Clone, Deserialize)]
+pub struct FileTypeFilter {
+ filter_name: String,
+ filter_extensions: Vec<String>,
+}
+
+#[derive(Clone, Deserialize)]
+pub struct SelectFileOptions {
+ title: String,
+ previous_file: Option<PreviousFile>,
+ filter: Option<FileTypeFilter>,
+}
+
+#[derive(Clone, Deserialize)]
+pub struct SaveFileOptions {
+ title: String,
+ name_file: Option<PreviousFile>,
+ filter: Option<FileTypeFilter>,
+}
+
+#[derive(Serialize)]
+pub struct DirectorySelectionResponse {
+ user_cancelled: bool,
+ selected_directory: String,
+}
+
+#[derive(Serialize)]
+pub struct FileSelectionResponse {
+ user_cancelled: bool,
+ selected_file_path: String,
+}
+
+#[derive(Serialize)]
+pub struct FilesSelectionResponse {
+ user_cancelled: bool,
+ selected_file_paths: Vec<String>,
+}
+
+#[derive(Serialize)]
+pub struct FileSaveResponse {
+ user_cancelled: bool,
+ save_file_path: String,
+}
+
+#[derive(Clone, Deserialize)]
+pub struct PreviousFile {
+ file_path: String,
+}
+
+/// Let the user select a directory.
+#[post("/select/directory?<title>", data = "<previous_directory>")]
+pub fn select_directory(_token: APIToken, title: &str, previous_directory: Option<Json<PreviousDirectory>>) -> Json<DirectorySelectionResponse> {
+ let folder_path = match previous_directory {
+ Some(previous) => {
+ let previous_path = previous.path.as_str();
+ FileDialogBuilder::new()
+ .set_title(title)
+ .set_directory(previous_path)
+ .pick_folder()
+ },
+
+ None => {
+ FileDialogBuilder::new()
+ .set_title(title)
+ .pick_folder()
+ },
+ };
+
+ match folder_path {
+ Some(path) => {
+ info!("User selected directory: {path:?}");
+ Json(DirectorySelectionResponse {
+ user_cancelled: false,
+ selected_directory: path.to_str().unwrap().to_string(),
+ })
+ },
+
+ None => {
+ info!("User cancelled directory selection.");
+ Json(DirectorySelectionResponse {
+ user_cancelled: true,
+ selected_directory: String::from(""),
+ })
+ },
+ }
+}
+
+/// Let the user select a file.
+#[post("/select/file", data = "<payload>")]
+pub fn select_file(_token: APIToken, payload: Json<SelectFileOptions>) -> Json<FileSelectionResponse> {
+
+ // Create a new file dialog builder:
+ let file_dialog = FileDialogBuilder::new();
+
+ // Set the title of the file dialog:
+ let file_dialog = file_dialog.set_title(&payload.title);
+
+ // Set the file type filter if provided:
+ let file_dialog = apply_filter(file_dialog, &payload.filter);
+
+ // Set the previous file path if provided:
+ let file_dialog = match &payload.previous_file {
+ Some(previous) => {
+ let previous_path = previous.file_path.as_str();
+ file_dialog.set_directory(previous_path)
+ },
+
+ None => file_dialog,
+ };
+
+ // Show the file dialog and get the selected file path:
+ let file_path = file_dialog.pick_file();
+ match file_path {
+ Some(path) => {
+ info!("User selected file: {path:?}");
+ Json(FileSelectionResponse {
+ user_cancelled: false,
+ selected_file_path: path.to_str().unwrap().to_string(),
+ })
+ },
+
+ None => {
+ info!("User cancelled file selection.");
+ Json(FileSelectionResponse {
+ user_cancelled: true,
+ selected_file_path: String::from(""),
+ })
+ },
+ }
+}
+
+/// Let the user select some files.
+#[post("/select/files", data = "<payload>")]
+pub fn select_files(_token: APIToken, payload: Json<SelectFileOptions>) -> Json<FilesSelectionResponse> {
+
+ // Create a new file dialog builder:
+ let file_dialog = FileDialogBuilder::new();
+
+ // Set the title of the file dialog:
+ let file_dialog = file_dialog.set_title(&payload.title);
+
+ // Set the file type filter if provided:
+ let file_dialog = apply_filter(file_dialog, &payload.filter);
+
+ // Set the previous file path if provided:
+ let file_dialog = match &payload.previous_file {
+ Some(previous) => {
+ let previous_path = previous.file_path.as_str();
+ file_dialog.set_directory(previous_path)
+ },
+
+ None => file_dialog,
+ };
+
+ // Show the file dialog and get the selected file path:
+ let file_paths = file_dialog.pick_files();
+ match file_paths {
+ Some(paths) => {
+ info!("User selected {} files.", paths.len());
+ Json(FilesSelectionResponse {
+ user_cancelled: false,
+ selected_file_paths: paths.iter().map(|p| p.to_str().unwrap().to_string()).collect(),
+ })
+ }
+
+ None => {
+ info!("User cancelled file selection.");
+ Json(FilesSelectionResponse {
+ user_cancelled: true,
+ selected_file_paths: Vec::new(),
+ })
+ },
+ }
+}
+
+#[post("/save/file", data = "<payload>")]
+pub fn save_file(_token: APIToken, payload: Json<SaveFileOptions>) -> Json<FileSaveResponse> {
+
+ // Create a new file dialog builder:
+ let file_dialog = FileDialogBuilder::new();
+
+ // Set the title of the file dialog:
+ let file_dialog = file_dialog.set_title(&payload.title);
+
+ // Set the file type filter if provided:
+ let file_dialog = apply_filter(file_dialog, &payload.filter);
+
+ // Set the previous file path if provided:
+ let file_dialog = match &payload.name_file {
+ Some(previous) => {
+ let previous_path = previous.file_path.as_str();
+ file_dialog.set_directory(previous_path)
+ },
+
+ None => file_dialog,
+ };
+
+ // Displays the file dialogue box and select the file:
+ let file_path = file_dialog.save_file();
+ match file_path {
+ Some(path) => {
+ info!("User selected file for writing operation: {path:?}");
+ Json(FileSaveResponse {
+ user_cancelled: false,
+ save_file_path: path.to_str().unwrap().to_string(),
+ })
+ },
+
+ None => {
+ info!("User cancelled file selection.");
+ Json(FileSaveResponse {
+ user_cancelled: true,
+ save_file_path: String::from(""),
+ })
+ },
+ }
+}
+
+/// Applies an optional file type filter to a FileDialogBuilder.
+fn apply_filter(file_dialog: FileDialogBuilder, filter: &Option<FileTypeFilter>) -> FileDialogBuilder {
+ match filter {
+ Some(f) => file_dialog.add_filter(
+ &f.filter_name,
+ &f.filter_extensions.iter().map(|s| s.as_str()).collect::<Vec<&str>>(),
+ ),
+
+ None => file_dialog,
+ }
+}
\ No newline at end of file
diff --git a/runtime/src/lib.rs b/runtime/src/lib.rs
index 1b13e099..d4366e3e 100644
--- a/runtime/src/lib.rs
+++ b/runtime/src/lib.rs
@@ -17,4 +17,6 @@ pub mod qdrant;
pub mod certificate_factory;
pub mod runtime_api_token;
pub mod stale_process_cleanup;
-mod sidecar_types;
\ No newline at end of file
+mod sidecar_types;
+pub mod tokenizer;
+pub mod file_actions;
\ No newline at end of file
diff --git a/runtime/src/main.rs b/runtime/src/main.rs
index 00a7ba90..a210de54 100644
--- a/runtime/src/main.rs
+++ b/runtime/src/main.rs
@@ -11,7 +11,7 @@ use mindwork_ai_studio::environment::is_dev;
use mindwork_ai_studio::log::init_logging;
use mindwork_ai_studio::metadata::MetaData;
use mindwork_ai_studio::runtime_api::start_runtime_api;
-
+use mindwork_ai_studio::tokenizer::{init_tokenizer};
#[tokio::main]
async fn main() {
@@ -43,8 +43,12 @@ async fn main() {
info!("Running in production mode.");
}
+ if let Err(e) = init_tokenizer() {
+ warn!(Source = "Tokenizer"; "Error during the initialisation of the tokenizer: {}", e);
+ }
+
generate_runtime_certificate();
start_runtime_api();
start_tauri();
-}
\ No newline at end of file
+}
diff --git a/runtime/src/runtime_api.rs b/runtime/src/runtime_api.rs
index 64bc8174..b3401db9 100644
--- a/runtime/src/runtime_api.rs
+++ b/runtime/src/runtime_api.rs
@@ -72,10 +72,10 @@ pub fn start_runtime_api() {
crate::app_window::get_event_stream,
crate::app_window::check_for_update,
crate::app_window::install_update,
- crate::app_window::select_directory,
- crate::app_window::select_file,
- crate::app_window::select_files,
- crate::app_window::save_file,
+ crate::file_actions::select_directory,
+ crate::file_actions::select_file,
+ crate::file_actions::select_files,
+ crate::file_actions::save_file,
crate::secret::get_secret,
crate::secret::store_secret,
crate::secret::delete_secret,
@@ -89,6 +89,9 @@ pub fn start_runtime_api() {
crate::file_data::extract_data,
crate::log::get_log_paths,
crate::log::log_event,
+ crate::tokenizer::token_count,
+ crate::tokenizer::validate_tokenizer,
+ crate::tokenizer::store_tokenizer,
crate::app_window::register_shortcut,
crate::app_window::validate_shortcut,
crate::app_window::suspend_shortcuts,
diff --git a/runtime/src/tokenizer.rs b/runtime/src/tokenizer.rs
new file mode 100644
index 00000000..f45416af
--- /dev/null
+++ b/runtime/src/tokenizer.rs
@@ -0,0 +1,192 @@
+use std::fs;
+use std::path::{PathBuf};
+use std::sync::OnceLock;
+use rocket::{post};
+use rocket::serde::json::Json;
+use rocket::serde::Serialize;
+use serde::Deserialize;
+use tokenizers::Error;
+use tokenizers::tokenizer::{Tokenizer, Error as TokenizerError};
+use crate::api_token::APIToken;
+use crate::environment::{DATA_DIRECTORY};
+
+static TOKENIZER: OnceLock<Tokenizer> = OnceLock::new();
+
+#[derive(Deserialize)]
+pub struct SetTokenText {
+ pub text: String,
+}
+
+#[derive(Clone, Deserialize)]
+pub struct TokenizerStorage {
+ model_id: String,
+ previous_model_id: String,
+ file_path: String,
+}
+
+#[derive(Clone, Deserialize)]
+pub struct TokenizerValidation {
+ file_path: String,
+}
+
+#[derive(Serialize)]
+pub struct TokenizerResponse {
+ success: bool,
+ token_count: usize,
+ message: String,
+}
+
+impl From<Result<usize, TokenizerError>> for TokenizerResponse {
+ fn from(result: Result<usize, TokenizerError>) -> Self {
+ match result {
+ Ok(count) => TokenizerResponse {
+ success: true,
+ token_count: count,
+ message: "Success".to_string(),
+ },
+ Err(e) => TokenizerResponse {
+ success: false,
+ token_count: 0,
+ message: e.to_string(),
+ },
+ }
+ }
+}
+
+pub fn init_tokenizer() -> Result<(), Error>{
+ let mut target_dir = PathBuf::from("target");
+ target_dir.push("tokenizers");
+ fs::create_dir_all(&target_dir)?;
+
+ let mut local_tokenizer_path = target_dir.clone();
+ local_tokenizer_path.push("tokenizer.json");
+
+ TOKENIZER.set(Tokenizer::from_file(local_tokenizer_path)?).expect("Could not set the tokenizer.");
+ Ok(())
+}
+
+fn validate_tokenizer_at_path(path: &PathBuf) -> Result<usize, TokenizerError> {
+ if !path.is_file() {
+ return Err(TokenizerError::from(format!(
+ "Tokenizer file was not found: {}",
+ path.display()
+ )));
+ }
+
+ let tokenizer = Tokenizer::from_file(path).map_err(|e| {
+ TokenizerError::from(format!(
+ "Failed to load tokenizer from '{}': {}",
+ path.display(),
+ e
+ ))
+ })?;
+
+ let test_string = "Hello, world! This is a test string for tokenizer validation.";
+
+ let encoding = tokenizer.encode(test_string, true).map_err(|e| {
+ TokenizerError::from(format!(
+ "Tokenizer failed to encode validation string: {}",
+ e
+ ))
+ })?;
+ let token_count = encoding.len();
+
+ if token_count == 0 {
+ return Err(TokenizerError::from(
+ "Tokenizer produced 0 tokens for test string. The tokenizer is likely invalid or misconfigured."
+ ));
+ }
+
+ if encoding.get_tokens().iter().any(|t| t.is_empty()) {
+ return Err(TokenizerError::from(
+ "Tokenizer produced empty tokens. The tokenizer is invalid."
+ ));
+ }
+
+ Ok(token_count)
+}
+
+fn handle_tokenizer_store(payload: &TokenizerStorage) -> Result<String, std::io::Error> {
+ let data_dir = DATA_DIRECTORY
+ .get()
+ .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::Other, "DATA_DIRECTORY not initialized"))?;
+
+ let base_path = PathBuf::from(data_dir).join("tokenizers");
+
+ // Delete previous model if file_path is empty
+ if payload.file_path.trim().is_empty() {
+ if payload.previous_model_id.trim().is_empty() {
+ return Ok(String::from("")); // Nothing to delete
+ }
+ let previous_path = base_path.join(&payload.previous_model_id);
+ fs::remove_dir_all(previous_path)?;
+ return Ok(String::from(""));
+ }
+
+ // Copy file
+ let source_path = PathBuf::from(&payload.file_path);
+ let source_name = source_path.file_name()
+ .and_then(|n| n.to_str())
+ .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::InvalidInput, "Invalid tokenizer file path"))?;
+ let model_path = &base_path.join(&payload.model_id);
+ let destination_path = &model_path.join(source_name);
+ println!("source_path: {}, destination_path: {}", source_path.display(), destination_path.display());
+ println!("equals {}", source_path.eq(destination_path));
+
+ if !source_path.eq(destination_path) && model_path.exists() {
+ fs::remove_dir_all(model_path)?;
+ }
+ fs::create_dir_all(model_path)?;
+ println!("Moving tokenizer file from {} to {}", source_path.display(), destination_path.display());
+ let previous_path = base_path.join(&payload.previous_model_id);
+
+ // Delete previous tokenizer folder if specified
+ if !payload.previous_model_id.trim().is_empty() && source_path.starts_with(&previous_path){
+ fs::rename(&source_path, &destination_path)?;
+ if previous_path.exists() && !previous_path.eq(model_path) {
+ fs::remove_dir_all(previous_path)?;
+ }
+ }else{
+ fs::copy( & source_path, & destination_path)?;
+ }
+ Ok(destination_path.to_str().unwrap().to_string())
+}
+
+pub fn get_token_count(text: &str) -> Result<usize, TokenizerError> {
+ if text.trim().is_empty() {
+ return Err(TokenizerError::from("Input text is empty"));
+ }
+
+ let tokenizer = TOKENIZER.get().cloned().ok_or_else(|| TokenizerError::from("Tokenizer not initialized"))?;
+ let enc = tokenizer.encode(text, true)?;
+ Ok(enc.len())
+}
+
+#[post("/tokenizer/count", data = "<req>")]
+pub fn token_count(_token: APIToken, req: Json<SetTokenText>) -> Json<TokenizerResponse> {
+ Json(get_token_count(&req.text).into())
+}
+
+#[post("/tokenizer/validate", data = "<payload>")]
+pub fn validate_tokenizer(_token: APIToken, payload: Json<TokenizerValidation>) -> Json<TokenizerResponse>{
+ println!("Received tokenizer validation request: {}", payload.file_path);
+ Json(validate_tokenizer_at_path(&PathBuf::from(payload.file_path.clone())).into())
+}
+
+#[post("/tokenizer/store", data = "<payload>")]
+pub fn store_tokenizer(_token: APIToken, payload: Json<TokenizerStorage>) -> Json<TokenizerResponse>{
+ println!("Received tokenizer store request: {}, {}, {}", payload.model_id, payload.previous_model_id, payload.file_path);
+ match handle_tokenizer_store(&payload) {
+ Ok(dest_path) => Json(TokenizerResponse {
+ success: true,
+ token_count: 0,
+ message: dest_path,
+ }),
+ Err(e) => Json(TokenizerResponse {
+ success: false,
+ token_count: 0,
+ message: e.to_string(),
+ }),
+ }
+
+}
\ No newline at end of file