Skip to content

Commit b023510

Browse files
committed
Add toggle for Create New vs Use Existing vector store modes
Improvements to the Split/Embed tool UI:

- Add a toggle control to switch between "Create New Vector Store" (default) and "Use Existing Vector Store" modes.
- When creating a new vector store: show a simple text input for the vector store name and display all configuration options (chunk size, overlap, distance metric, index type).
- When using an existing vector store: hide the configuration options (they are already defined by the vector store) and filter the dropdown to show only vector stores created with the same embedding model, to prevent mixing embeddings.
- Show the full vector store table name in both modes.
- Improve validation messages and help text.
- Prevent potential issues caused by mixing embeddings from different models in the same vector store.

This simplifies the UI and makes the distinction between creating a new vector store and using an existing one much clearer.
1 parent cdd9c4a commit b023510

File tree

1 file changed

+155
-90
lines changed

1 file changed

+155
-90
lines changed

src/client/content/tools/tabs/split_embed.py

Lines changed: 155 additions & 90 deletions
Original file line number | Diff line number | Diff line change
@@ -142,8 +142,19 @@ def update_chunk_size_input() -> None:
142142
#############################################################################
143143
# Helper Functions
144144
#############################################################################
145-
def _render_embedding_configuration(embed_models_enabled: dict, embed_request: DatabaseVectorStorage) -> None:
146-
"""Render the embedding configuration section"""
145+
def _render_embedding_configuration(
146+
embed_models_enabled: dict,
147+
embed_request: DatabaseVectorStorage,
148+
show_vs_config: bool = True
149+
) -> None:
150+
"""Render the embedding configuration section
151+
152+
Args:
153+
embed_models_enabled: Dictionary of enabled embedding models
154+
embed_request: The database vector storage request object
155+
show_vs_config: If True, show chunk size, overlap, distance metric, and index type options.
156+
If False, these are determined by the selected existing vector store.
157+
"""
147158
st.header("Embedding Configuration", divider="red")
148159
embed_request.model = st.selectbox(
149160
"Embedding models available: ",
@@ -160,62 +171,75 @@ def _render_embedding_configuration(embed_models_enabled: dict, embed_request: D
160171
st.rerun()
161172
st.stop()
162173

163-
chunk_size_max = embed_models_enabled[embed_request.model]["max_chunk_size"]
164-
col1_1, col1_2 = st.columns([0.8, 0.2])
165-
with col1_1:
166-
st.slider(
167-
"Chunk Size (tokens):",
168-
min_value=0,
169-
max_value=chunk_size_max,
170-
value=chunk_size_max,
171-
key="selected_chunk_size_slider",
172-
on_change=update_chunk_size_input,
173-
help=help_text.help_dict["chunk_size"],
174-
)
175-
st.slider(
176-
"Chunk Overlap (% of Chunk Size)",
177-
min_value=0,
178-
max_value=100,
179-
value=20,
180-
step=5,
181-
key="selected_chunk_overlap_slider",
182-
on_change=update_chunk_overlap_input,
183-
format="%d%%",
184-
help=help_text.help_dict["chunk_overlap"],
185-
)
174+
# Only show vector store configuration if creating new
175+
if show_vs_config:
176+
chunk_size_max = embed_models_enabled[embed_request.model]["max_chunk_size"]
177+
col1_1, col1_2 = st.columns([0.8, 0.2])
178+
with col1_1:
179+
st.slider(
180+
"Chunk Size (tokens):",
181+
min_value=0,
182+
max_value=chunk_size_max,
183+
value=chunk_size_max,
184+
key="selected_chunk_size_slider",
185+
on_change=update_chunk_size_input,
186+
help=help_text.help_dict["chunk_size"],
187+
)
188+
st.slider(
189+
"Chunk Overlap (% of Chunk Size)",
190+
min_value=0,
191+
max_value=100,
192+
value=20,
193+
step=5,
194+
key="selected_chunk_overlap_slider",
195+
on_change=update_chunk_overlap_input,
196+
format="%d%%",
197+
help=help_text.help_dict["chunk_overlap"],
198+
)
199+
200+
with col1_2:
201+
embed_request.chunk_size = st.number_input(
202+
"Chunk Size (tokens):",
203+
label_visibility="hidden",
204+
min_value=0,
205+
max_value=chunk_size_max,
206+
value=chunk_size_max,
207+
key="selected_chunk_size_input",
208+
on_change=update_chunk_size_slider,
209+
)
210+
chunk_overlap_pct = st.number_input(
211+
"Chunk Overlap (% of Chunk Size):",
212+
label_visibility="hidden",
213+
min_value=0,
214+
max_value=100,
215+
value=20,
216+
step=5,
217+
key="selected_chunk_overlap_input",
218+
on_change=update_chunk_overlap_slider,
219+
)
220+
embed_request.chunk_overlap = math.ceil((chunk_overlap_pct / 100) * embed_request.chunk_size)
186221

187-
with col1_2:
188-
embed_request.chunk_size = st.number_input(
189-
"Chunk Size (tokens):",
190-
label_visibility="hidden",
191-
min_value=0,
192-
max_value=chunk_size_max,
193-
value=chunk_size_max,
194-
key="selected_chunk_size_input",
195-
on_change=update_chunk_size_slider,
222+
col2_1, col2_2 = st.columns([0.5, 0.5])
223+
embed_request.distance_metric = col2_1.selectbox(
224+
"Distance Metric:",
225+
list(DistanceMetrics.__args__),
226+
key="selected_distance_metric",
227+
help=help_text.help_dict["distance_metric"],
196228
)
197-
chunk_overlap_pct = st.number_input(
198-
"Chunk Overlap (% of Chunk Size):",
199-
label_visibility="hidden",
200-
min_value=0,
201-
max_value=100,
202-
value=20,
203-
step=5,
204-
key="selected_chunk_overlap_input",
205-
on_change=update_chunk_overlap_slider,
229+
embed_request.index_type = col2_2.selectbox(
230+
"Index Type:", list(IndexTypes.__args__), key="selected_index_type", help=help_text.help_dict["index_type"]
206231
)
207-
embed_request.chunk_overlap = math.ceil((chunk_overlap_pct / 100) * embed_request.chunk_size)
208-
209-
col2_1, col2_2 = st.columns([0.5, 0.5])
210-
embed_request.distance_metric = col2_1.selectbox(
211-
"Distance Metric:",
212-
list(DistanceMetrics.__args__),
213-
key="selected_distance_metric",
214-
help=help_text.help_dict["distance_metric"],
215-
)
216-
embed_request.index_type = col2_2.selectbox(
217-
"Index Type:", list(IndexTypes.__args__), key="selected_index_type", help=help_text.help_dict["index_type"]
218-
)
232+
else:
233+
# These will be set from the selected existing vector store
234+
# Set defaults to avoid errors, will be overwritten in _render_vector_store_section
235+
if not hasattr(embed_request, 'chunk_size') or embed_request.chunk_size is None:
236+
embed_request.chunk_size = embed_models_enabled[embed_request.model]["max_chunk_size"]
237+
if not hasattr(embed_request, 'chunk_overlap') or embed_request.chunk_overlap is None:
238+
embed_request.chunk_overlap = 0
239+
if not hasattr(embed_request, 'distance_metric') or embed_request.distance_metric is None:
240+
embed_request.distance_metric = list(DistanceMetrics.__args__)[0]
241+
if not hasattr(embed_request, 'index_type') or embed_request.index_type is None:
242+
embed_request.index_type = list(IndexTypes.__args__)[0]
219243

220244
def _render_file_source_section(file_sources: list, oci_setup: dict) -> FileSourceData:
221245
"""Render file source selection and return processing data"""
@@ -341,8 +365,16 @@ def _display_file_list_expander(file_list_response: dict) -> None:
341365
st.info("No files found in this vector store.")
342366

343367

344-
def _render_vector_store_section(embed_request: DatabaseVectorStorage) -> tuple:
345-
"""Render vector store configuration section and return validation status and rate limit"""
368+
def _render_vector_store_section(embed_request: DatabaseVectorStorage, create_new_vs: bool) -> tuple:
369+
"""Render vector store configuration section and return validation status and rate limit
370+
371+
Args:
372+
embed_request: The database vector storage request object
373+
create_new_vs: If True, allow creating new vector store. If False, select from existing only.
374+
375+
Returns:
376+
Tuple of (embed_alias_invalid, rate_limit, existing_vs)
377+
"""
346378
st.header("Populate Vector Store", divider="red")
347379
database_lookup = st_common.state_configs_lookup("database_configs", "name")
348380
existing_vs = database_lookup.get(state.client_settings.get("database", {}).get("alias"), {}).get(
@@ -353,50 +385,65 @@ def _render_vector_store_section(embed_request: DatabaseVectorStorage) -> tuple:
353385
embed_alias_invalid = False
354386
embed_request.vector_store = None
355387

356-
# Filter vector stores by matching chunk size and overlap
357-
matching_vs = [
358-
vs for vs in existing_vs
359-
if vs.get("chunk_size") == embed_request.chunk_size
360-
and vs.get("chunk_overlap") == embed_request.chunk_overlap
361-
and vs.get("alias")
362-
]
363-
matching_vs_names = [vs.get("alias", "") for vs in matching_vs]
364-
vs_options = ["Create new..."] + matching_vs_names
365-
366388
with embed_alias_size:
367-
# Dropdown for existing vector stores
368-
selected_vs = st.selectbox(
369-
"Select or Create Vector Store:",
370-
options=vs_options,
371-
index=0,
372-
help="Only showing vector stores with matching chunk size and overlap configuration",
373-
key="selected_vs_dropdown"
374-
)
375-
376-
# Show text input if "Create new..." is selected or for editing
377-
if selected_vs == "Create new...":
389+
if create_new_vs:
390+
# Creating new vector store: just show text input for new VS name
378391
embed_request.alias = st.text_input(
379392
"New Vector Store Alias:",
380393
max_chars=20,
381394
help=help_text.help_dict["embed_alias"],
382395
key="selected_embed_alias",
383-
placeholder="Press Enter to set.",
396+
placeholder="Enter a name for the new vector store",
384397
)
385398
else:
386-
# Use the selected existing vector store name
399+
# Using existing mode: show only VS created with the same embedding model
400+
# Filter by model to prevent mixing embeddings from different models
401+
vs_lookup = {
402+
vs.get("alias"): vs
403+
for vs in existing_vs
404+
if vs.get("alias") and vs.get("model") == embed_request.model
405+
}
406+
vs_options = list(vs_lookup.keys())
407+
408+
if not vs_options:
409+
st.warning(
410+
f"No existing vector stores found for embedding model '{embed_request.model}'. "
411+
f"Toggle 'Create New Vector Store' to create one.",
412+
icon="⚠️"
413+
)
414+
415+
selected_vs = st.selectbox(
416+
"Select Existing Vector Store:",
417+
options=vs_options if vs_options else [""],
418+
index=0 if vs_options else None,
419+
help="Only showing vector stores created with the same embedding model to prevent mixing embeddings",
420+
key="selected_vs_dropdown",
421+
disabled=not vs_options
422+
)
387423
embed_request.alias = selected_vs
424+
425+
# Get VS properties from selected existing VS and update embed_request
426+
if selected_vs and selected_vs in vs_lookup:
427+
selected_vs_props = vs_lookup[selected_vs]
428+
embed_request.chunk_size = selected_vs_props.get("chunk_size", embed_request.chunk_size)
429+
embed_request.chunk_overlap = selected_vs_props.get("chunk_overlap", embed_request.chunk_overlap)
430+
embed_request.distance_metric = selected_vs_props.get("distance_metric", embed_request.distance_metric)
431+
embed_request.index_type = selected_vs_props.get("index_type", embed_request.index_type)
432+
433+
# Show disabled text input with alias
388434
st.text_input(
389435
"Vector Store Alias:",
390-
value=selected_vs,
436+
value=selected_vs if selected_vs else "",
391437
max_chars=20,
392438
help=help_text.help_dict["embed_alias"],
393-
key="selected_embed_alias",
439+
key="selected_embed_alias_readonly",
394440
disabled=True,
395441
)
442+
396443
pattern = r"^[A-Za-z][A-Za-z0-9_]*$"
397444

398445
# Check if alias is empty when creating new vector store
399-
if selected_vs == "Create new..." and not embed_request.alias:
446+
if create_new_vs and not embed_request.alias:
400447
st.warning("Please enter a Vector Store Alias to continue.")
401448
embed_alias_invalid = True
402449
elif embed_request.alias and not re.match(pattern, embed_request.alias):
@@ -405,16 +452,23 @@ def _render_vector_store_section(embed_request: DatabaseVectorStorage) -> tuple:
405452
)
406453
embed_alias_invalid = True
407454

408-
if not embed_alias_invalid:
455+
if not embed_alias_invalid and embed_request.alias:
409456
embed_request.vector_store, _ = functions.get_vs_table(
410457
**embed_request.model_dump(exclude={"database", "vector_store"})
411458
)
412-
vs_msg = f"{embed_request.vector_store}, will be created."
413459
vs_exists = any(d.get("vector_store") == embed_request.vector_store for d in existing_vs)
414-
if vs_exists:
415-
vs_msg = f"{embed_request.vector_store} exists, new chunks will be added."
460+
461+
# Show full vector store table name
416462
st.markdown(f"##### **Vector Store:** `{embed_request.vector_store}`")
417-
st.caption(f"{vs_msg}")
463+
464+
# Different messages based on mode
465+
if create_new_vs:
466+
if vs_exists:
467+
st.caption("Vector store already exists. New chunks will be added.")
468+
else:
469+
st.caption("New vector store will be created.")
470+
else:
471+
st.caption("Adding files to existing vector store.")
418472

419473
# Display files in existing vector store
420474
if vs_exists and embed_request.vector_store:
@@ -611,11 +665,22 @@ def display_split_embed() -> None:
611665

612666
embed_request = DatabaseVectorStorage()
613667

614-
_render_embedding_configuration(embed_models_enabled, embed_request)
668+
# Toggle between creating new vector store or using existing
669+
create_new_vs = st.toggle(
670+
"Create New Vector Store",
671+
key="selected_create_new_vs",
672+
value=True,
673+
help="Toggle between creating a new vector store or adding to an existing one. "
674+
"When using an existing vector store, chunk size, overlap, distance metric, "
675+
"and index type are already defined and cannot be changed.",
676+
)
677+
678+
# Render embedding configuration - only show VS config options when creating new
679+
_render_embedding_configuration(embed_models_enabled, embed_request, show_vs_config=create_new_vs)
615680

616681
source_data = _render_file_source_section(file_sources, oci_setup)
617682

618-
embed_alias_invalid, rate_limit, existing_vs = _render_vector_store_section(embed_request)
683+
embed_alias_invalid, rate_limit, existing_vs = _render_vector_store_section(embed_request, create_new_vs)
619684

620685
if not embed_alias_invalid:
621686
_handle_vector_store_population(

0 commit comments

Comments
 (0)