Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ tree-sitter-lua = "0.5.0"
tree-sitter-clojure-orchard = "0.2.5"
glob = "0.3.3"
async-trait = "0.1.89"
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
sysinfo = "0.38.4"
indexmap = { version = "2.13.0", features = ["serde"] }

Expand Down
114 changes: 89 additions & 25 deletions src/cli/commands/index_parallel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@ use std::path::{Path, PathBuf};
use std::sync::{Arc, Mutex};

use crate::config::Settings;
use crate::indexing::facade::{build_embedding_backend, resolve_remote_model_name};
use crate::indexing::pipeline::{IncrementalStats, Phase2Stats, Pipeline, PipelineConfig};
use crate::io::status_line::{ProgressBar, ProgressBarOptions, ProgressBarStyle};
use crate::semantic::SimpleSemanticSearch;
use crate::semantic::{EmbeddingBackend, SemanticSearchError, SimpleSemanticSearch};
use crate::storage::DocumentIndex;

/// Arguments for the index-parallel command.
Expand Down Expand Up @@ -71,8 +72,10 @@ pub fn run(args: IndexParallelArgs, settings: &Settings) {
}
};

// Create semantic search (for embeddings)
let semantic = create_semantic_search(settings, &semantic_path);
// Create semantic search (for storing/loading/searching embeddings)
// and a separate embedding backend for generating new embeddings.
let (semantic, embedding_backend) =
create_semantic_search(settings, &semantic_path);

// Create pipeline
let settings_arc = Arc::new(settings.clone());
Expand Down Expand Up @@ -102,7 +105,7 @@ pub fn run(args: IndexParallelArgs, settings: &Settings) {

tracing::info!(target: "pipeline", "Indexing directory ({mode}): {}", path.display());

match pipeline.index_incremental(path, Arc::clone(&index), semantic.clone(), None, force) {
match pipeline.index_incremental(path, Arc::clone(&index), semantic.clone(), embedding_backend.clone(), force) {
Ok(stats) => {
display_incremental_stats(&stats, progress);
}
Expand All @@ -119,42 +122,103 @@ pub fn run(args: IndexParallelArgs, settings: &Settings) {
}
}

/// Create semantic search instance if enabled in settings.
/// Create semantic search instance and embedding backend if enabled in settings.
///
/// Returns `(semantic, backend)` where:
/// - `semantic` stores/loads/searches the embedding vectors
/// - `backend` generates new embeddings (local fastembed pool or remote HTTP)
fn create_semantic_search(
settings: &Settings,
semantic_path: &Path,
) -> Option<Arc<Mutex<SimpleSemanticSearch>>> {
) -> (Option<Arc<Mutex<SimpleSemanticSearch>>>, Option<Arc<EmbeddingBackend>>) {
if !settings.semantic_search.enabled {
tracing::debug!(target: "pipeline", "Semantic search disabled");
return None;
return (None, None);
}

let is_remote = std::env::var("CODANNA_EMBED_URL").is_ok()
|| settings.semantic_search.remote_url.is_some();

// Build embedding backend (local pool or remote HTTP)
let backend = match build_embedding_backend(&settings.semantic_search) {
Ok(b) => Arc::new(b),
Err(e) => {
tracing::warn!(target: "pipeline", "Failed to initialize embedding backend: {e}");
return (None, None);
}
};

let model = &settings.semantic_search.model;

// Try to load existing embeddings first
if semantic_path.exists() {
match SimpleSemanticSearch::load(semantic_path) {
Ok(semantic) => {
// Load existing embeddings or create fresh instance.
// After loading, verify dimensions match the backend so we don't silently
// drop all new embeddings during an incremental run after a backend switch.
let semantic = if semantic_path.exists() {
// In remote mode load without initialising a local fastembed model
let load_result = if is_remote {
SimpleSemanticSearch::load_remote(semantic_path)
} else {
SimpleSemanticSearch::load(semantic_path)
};
match load_result {
Ok(s) => {
let index_dim = s.dimensions();
let backend_dim = backend.dimensions();
if index_dim != backend_dim {
tracing::error!(
target: "pipeline",
"Semantic index dimension mismatch: index has {index_dim}d but backend produces {backend_dim}d. \
Re-index with: codanna index-parallel <path> --force"
);
std::process::exit(1);
}
let index_is_remote = s.is_remote_index();
if index_is_remote != is_remote {
tracing::warn!(
target: "pipeline",
"Backend kind changed (index={}, current={}). \
Embedding spaces may differ — similarity scores could be inaccurate. \
Re-index with --force to fix.",
if index_is_remote { "remote" } else { "local" },
if is_remote { "remote" } else { "local" },
);
}
tracing::debug!(target: "pipeline", "Loaded existing embeddings from {}", semantic_path.display());
return Some(Arc::new(Mutex::new(semantic)));
Some(Arc::new(Mutex::new(s)))
}
Err(SemanticSearchError::DimensionMismatch { suggestion, .. }) => {
// Incompatible existing index — cannot continue silently as stored
// vectors are structurally wrong for this backend.
tracing::error!(target: "pipeline", "Semantic index incompatible: {suggestion}");
std::process::exit(1);
}
Err(e) => {
tracing::warn!(target: "pipeline", "Failed to load embeddings: {e}");
tracing::warn!(target: "pipeline", "Failed to load embeddings, continuing without semantic search: {e}");
None
}
}
}

// Create new semantic search instance
match SimpleSemanticSearch::from_model_name(model) {
Ok(semantic) => {
tracing::debug!(target: "pipeline", "Created new semantic search with model: {model}");
Some(Arc::new(Mutex::new(semantic)))
}
Err(e) => {
tracing::warn!(target: "pipeline", "Failed to initialize semantic search: {e}");
None
} else {
let new_result = if is_remote {
Ok(SimpleSemanticSearch::new_empty(
backend.dimensions(),
&resolve_remote_model_name(&settings.semantic_search),
))
} else {
SimpleSemanticSearch::from_model_name(model)
};
match new_result {
Ok(s) => {
tracing::debug!(target: "pipeline", "Created new semantic search with model: {model}");
Some(Arc::new(Mutex::new(s)))
}
Err(e) => {
tracing::warn!(target: "pipeline", "Failed to initialize semantic search: {e}");
None
}
}
}
};

(semantic, Some(backend))
}

fn display_incremental_stats(stats: &IncrementalStats, with_progress: bool) {
Expand Down
19 changes: 19 additions & 0 deletions src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,22 @@ pub struct SemanticSearchConfig {
/// Number of parallel embedding model instances
#[serde(default = "default_embedding_threads")]
pub embedding_threads: usize,

/// Remote embedding server URL (OpenAI-compatible, e.g. http://host:8100).
/// When set, local fastembed is bypassed and this endpoint is used instead.
/// Overridable via the CODANNA_EMBED_URL env var.
#[serde(default)]
pub remote_url: Option<String>,

/// Model name to send to the remote embedding server.
/// Overridable via the CODANNA_EMBED_MODEL env var.
#[serde(default)]
pub remote_model: Option<String>,

/// Output dimension of the remote embedding model.
/// Required when remote_url is set. Overridable via the CODANNA_EMBED_DIM env var.
#[serde(default)]
pub remote_dim: Option<usize>,
}

#[derive(Debug, Deserialize, Serialize, Clone)]
Expand Down Expand Up @@ -424,6 +440,9 @@ impl Default for SemanticSearchConfig {
model: default_embedding_model(),
threshold: default_similarity_threshold(),
embedding_threads: default_embedding_threads(),
remote_url: None,
remote_model: None,
remote_dim: None,
}
}
}
Expand Down
11 changes: 7 additions & 4 deletions src/documents/store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -226,11 +226,14 @@ fn highlight_keywords(text: &str, query: &str) -> String {
let mut merged: Vec<(usize, usize)> = Vec::new();
for (start, end) in matches {
if let Some(last) = merged.last_mut() {
// Check if adjacent: only spaces/tabs between (no newlines)
let between = &text[last.1..start];
let is_adjacent = start <= last.1 || between.chars().all(|c| c == ' ' || c == '\t');
// Check overlap first — slicing text[last.1..start] panics when start < last.1
let is_adjacent = if start <= last.1 {
true // overlapping, merge unconditionally
} else {
// Adjacent: only spaces/tabs between ranges (no newlines)
text[last.1..start].chars().all(|c| c == ' ' || c == '\t')
};
if is_adjacent {
// Merge: extend the previous range
last.1 = last.1.max(end);
continue;
}
Expand Down
Loading
Loading