Skip to content

Commit a8baedb

Browse files
authored
feat(embed_text): add expected_output_dimension parameter (#1373)
* feat(embed_text): add `expected_output_dimension` parameter

  Separate API parameter (output_dimension) from validation dimension (expected_output_dimension) for clearer semantics and better flexibility. Maintains backward compatibility with fallback to output_dimension when expected_output_dimension is not specified.

* feat(embed_text): warn on dimension parameter mismatch

  Add warning when output_dimension and expected_output_dimension are both specified with different values, explaining their respective uses in schema definition vs API request.
1 parent da5d382 commit a8baedb

File tree

3 files changed

+25
-11
lines changed

3 files changed

+25
-11
lines changed

docs/docs/ops/functions.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -189,10 +189,11 @@ The spec takes the following fields:
189189
* `api_type` ([`cocoindex.LlmApiType`](/docs/ai/llm#llm-api-types)): The type of LLM API to use for embedding.
190190
* `model` (`str`): The name of the embedding model to use.
191191
* `address` (`str`, optional): The address of the LLM API. If not specified, uses the default address for the API type.
192-
* `output_dimension` (`int`, optional): The expected dimension of the output embedding vector. If not specified, use the default dimension of the model.
192+
* `output_dimension` (`int`, optional): The dimension to request from the embedding API. Some APIs support specifying the output dimension (e.g., OpenAI's models support dimension reduction). If not specified, the API will use its default dimension.
193+
* `expected_output_dimension` (`int`, optional): The expected dimension of the output embedding vector, used for validation and for the output type schema. If not specified, it falls back to `output_dimension`, and then to the default dimension of the model.
193194

194-
For most API types, the function internally keeps a registry for the default output dimension of known model.
195-
You need to explicitly specify the `output_dimension` if you want to use a new model that is not in the registry yet.
195+
For most API types, the function internally keeps a registry for the default output dimension of known models.
196+
You need to explicitly specify `expected_output_dimension` (or `output_dimension`) if you want to use a new model that is not in the registry yet.
196197

197198
* `task_type` (`str`, optional): The task type for embedding, used by some embedding models to optimize the embedding for specific use cases.
198199

python/cocoindex/functions/_engine_builtin_specs.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ class EmbedText(op.FunctionSpec):
5555
model: str
5656
address: str | None = None
5757
output_dimension: int | None = None
58+
expected_output_dimension: int | None = None
5859
task_type: str | None = None
5960
api_config: llm.VertexAiConfig | None = None
6061
api_key: TransientAuthEntryReference[str] | None = None

rust/cocoindex/src/ops/functions/embed_text.rs

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ struct Spec {
1212
address: Option<String>,
1313
api_config: Option<LlmApiConfig>,
1414
output_dimension: Option<u32>,
15+
expected_output_dimension: Option<u32>,
1516
task_type: Option<String>,
1617
api_key: Option<AuthEntryReference<String>>,
1718
}
@@ -129,23 +130,33 @@ impl SimpleFunctionFactoryBase for Factory {
129130
spec.api_config.clone(),
130131
)
131132
.await?;
132-
let output_dimension = match spec.output_dimension {
133-
Some(output_dimension) => output_dimension,
134-
None => {
135-
client.get_default_embedding_dimension(spec.model.as_str())
136-
.ok_or_else(|| api_error!("model \"{}\" is unknown for {:?}, needs to specify `output_dimension` explicitly", spec.model, spec.api_type))?
133+
134+
// Warn if both parameters are specified but have different values
135+
if let (Some(expected), Some(output)) =
136+
(spec.expected_output_dimension, spec.output_dimension)
137+
{
138+
if expected != output {
139+
warn!(
140+
"Both `expected_output_dimension` ({expected}) and `output_dimension` ({output}) are specified but have different values. \
141+
`expected_output_dimension` will be used for output schema and validation, while `output_dimension` will be sent to the embedding API."
142+
);
137143
}
138-
};
144+
}
145+
146+
let expected_output_dimension = spec.expected_output_dimension
147+
.or(spec.output_dimension)
148+
.or_else(|| client.get_default_embedding_dimension(spec.model.as_str()))
149+
.ok_or_else(|| api_error!("model \"{}\" is unknown for {:?}, needs to specify `expected_output_dimension` (or `output_dimension`) explicitly", spec.model, spec.api_type))? as usize;
139150
let output_schema = make_output_type(BasicValueType::Vector(VectorTypeSchema {
140-
dimension: Some(output_dimension as usize),
151+
dimension: Some(expected_output_dimension),
141152
element_type: Box::new(BasicValueType::Float32),
142153
}));
143154
Ok(SimpleFunctionAnalysisOutput {
144155
behavior_version: client.behavior_version(),
145156
resolved_args: Args {
146157
client,
147158
text,
148-
expected_output_dimension: output_dimension as usize,
159+
expected_output_dimension,
149160
},
150161
output_schema,
151162
})
@@ -179,6 +190,7 @@ mod tests {
179190
address: None,
180191
api_config: None,
181192
output_dimension: None,
193+
expected_output_dimension: None,
182194
task_type: None,
183195
api_key: None,
184196
};

0 commit comments

Comments
 (0)