From 6eb076fdcd64ae3c14a8b42e8e943ae885346557 Mon Sep 17 00:00:00 2001
From: MuleiSY
Date: Wed, 25 Mar 2026 07:17:20 +0000
Subject: [PATCH] fix: rebuild database when custom file filters are provided

When users specify excluded/included files or directories via the
"Refresh Wiki" advanced options, the cached .pkl database was returned
immediately without applying the filter parameters. This caused file
exclusion rules to be silently ignored.

Now, when any custom file filter parameter is provided, the existing
database cache is removed and rebuilt with the new filters applied.

Fixes #494

Co-Authored-By: Claude Opus 4.6
---
 api/data_pipeline.py | 57 +++++++++++++++++++++++++-------------------
 1 file changed, 32 insertions(+), 25 deletions(-)

diff --git a/api/data_pipeline.py b/api/data_pipeline.py
index 5e1f5fa47..9e29aea81 100644
--- a/api/data_pipeline.py
+++ b/api/data_pipeline.py
@@ -865,34 +865,41 @@ def _embedding_vector_length(doc: Document) -> int:
         # Handle backward compatibility
         if embedder_type is None and is_ollama_embedder is not None:
             embedder_type = 'ollama' if is_ollama_embedder else None
+        # Check if custom file filters are provided
+        has_custom_filters = any([excluded_dirs, excluded_files, included_dirs, included_files])
+
         # check the database
         if self.repo_paths and os.path.exists(self.repo_paths["save_db_file"]):
-            logger.info("Loading existing database...")
-            try:
-                self.db = LocalDB.load_state(self.repo_paths["save_db_file"])
-                documents = self.db.get_transformed_data(key="split_and_embed")
-                if documents:
-                    lengths = [_embedding_vector_length(doc) for doc in documents]
-                    non_empty = sum(1 for n in lengths if n > 0)
-                    empty = len(lengths) - non_empty
-                    sample_sizes = sorted({n for n in lengths if n > 0})[:3]
-                    logger.info(
-                        "Loaded %s documents from existing database (embeddings: %s non-empty, %s empty; sample_dims=%s)",
-                        len(documents),
-                        non_empty,
-                        empty,
-                        sample_sizes,
-                    )
-
-                    if non_empty == 0:
-                        logger.warning(
-                            "Existing database contains no usable embeddings. Rebuilding embeddings..."
+            if has_custom_filters:
+                logger.info("Custom file filters provided. Rebuilding database to apply filters...")
+                os.remove(self.repo_paths["save_db_file"])
+            else:
+                logger.info("Loading existing database...")
+                try:
+                    self.db = LocalDB.load_state(self.repo_paths["save_db_file"])
+                    documents = self.db.get_transformed_data(key="split_and_embed")
+                    if documents:
+                        lengths = [_embedding_vector_length(doc) for doc in documents]
+                        non_empty = sum(1 for n in lengths if n > 0)
+                        empty = len(lengths) - non_empty
+                        sample_sizes = sorted({n for n in lengths if n > 0})[:3]
+                        logger.info(
+                            "Loaded %s documents from existing database (embeddings: %s non-empty, %s empty; sample_dims=%s)",
+                            len(documents),
+                            non_empty,
+                            empty,
+                            sample_sizes,
                         )
-                    else:
-                        return documents
-            except Exception as e:
-                logger.error(f"Error loading existing database: {e}")
-                # Continue to create a new database
+
+                        if non_empty == 0:
+                            logger.warning(
+                                "Existing database contains no usable embeddings. Rebuilding embeddings..."
+                            )
+                        else:
+                            return documents
+                except Exception as e:
+                    logger.error(f"Error loading existing database: {e}")
+                    # Continue to create a new database
 
         # prepare the database
         logger.info("Creating new database...")