Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci-jina.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,6 @@ jobs:
env:
JINA_API_KEY: ${{ secrets.JINA_API_KEY }}
RUN_JINA_TESTS: "1"
RUN_JINA_CLIP_TESTS: "1"
RUN_JINA_MULTIMODAL_TESTS: "1"
run: |
uv run --package embcli-jina pytest packages/embcli-jina/tests
11 changes: 7 additions & 4 deletions packages/embcli-jina/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,15 @@ JinaEmbeddingModel
* dimensions (int) - The number of dimensions the resulting output embeddings should have. Only supported in jina-embeddings-v3 and jina-colbert-v2.
* input_type (str) - The type of input to the model. Supported types: 'query', 'document'. Only supported in jina-colbert-v2.
* embedding_type (str) - The type of embeddings to return. Options include 'float', 'binary', 'ubinary'. Default is 'float'.
JinaClipModel
JinaMultiModalModel
Vendor: jina
Models:
* jina-embeddings-v4 (aliases: jina-v4)
* jina-clip-v2 (aliases: )
Model Options:
* task (str) - Downstream task for which the embeddings are used. Supported tasks: 'retrieval.query', 'retrieval.passage'.
* task (str) - Downstream task for which the embeddings are used. Supported tasks: 'retrieval.query', 'retrieval.passage', 'text-matching', 'code.query', 'code.passage'.
* late_chunking (bool) - Whether late chunking is applied. Only supported in jina-embeddings-v4.
* truncate (bool) - When enabled, the model will automatically drop the tail that extends beyond the maximum context length allowed by the model instead of throwing an error. Only supported in jina-embeddings-v4.
* dimensions (int) - The number of dimensions the resulting output embeddings should have.
* embedding_type (str) - The type of embeddings to return. Options include 'float', 'binary', 'ubinary'. Default is 'float'.

Expand All @@ -64,9 +67,9 @@ emb embed -m jina-v3 "Embeddings are essential for semantic search and RAG apps.
# get an embedding for an input text by the jina-embeddings-v3 model with embedding_type=binary.
emb embed -m jina-v3 "Embeddings are essential for semantic search and RAG apps." -o embedding_type binary

# get an embedding for an image input by jina-clip-v2 model.
# get an embedding for an image input by jina-embeddings-v4 model.
# assume you have an image file named `gingercat.jpeg` in the current directory.
emb embed -m jina-clip-v2 --image gingercat.jpeg
emb embed -m jina-v4 --image gingercat.jpeg

# calculate similarity score between two texts by the jina-embeddings-v3 model. the default metric is cosine similarity.
emb simscore -m jina-v3 "The cat drifts toward sleep." "Sleep dances in the cat's eyes."
Expand Down
2 changes: 1 addition & 1 deletion packages/embcli-jina/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,4 @@ build-backend = "hatchling.build"

[project.entry-points."embcli"]
jina = "embcli_jina.jina"
jina-clip = "embcli_jina.jina_clip"
jina-clip = "embcli_jina.jina_multimodal"
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,25 @@ def image_to_base64(image_path: str) -> str:
return data.decode("utf-8")


class JinaClipModel(MultimodalEmbeddingModel):
class JinaMultiModalModel(MultimodalEmbeddingModel):
vendor = "jina"
default_batch_size = 100
model_aliases = [("jina-clip-v2", [])]
model_aliases = [("jina-embeddings-v4", ["jina-v4"]), ("jina-clip-v2", [])]
valid_options = [
ModelOption(
"task",
ModelOptionType.STR,
"Downstream task for which the embeddings are used. Supported tasks: 'retrieval.query', 'retrieval.passage'.", # noqa: E501
"Downstream task for which the embeddings are used. Supported tasks: 'retrieval.query', 'retrieval.passage', 'text-matching', 'code.query', 'code.passage'.", # noqa: E501
),
ModelOption(
"late_chunking",
ModelOptionType.BOOL,
"Whether if the late chunking is applied. Only supported in jina-embeddings-v4.",
),
ModelOption(
"truncate",
ModelOptionType.BOOL,
"When enabled, the model will automatically drop the tail that extends beyond the maximum context length allowed by the model instead of throwing an error. Only supported in jina-embeddings-v4.", # noqa: E501
),
ModelOption(
"dimensions",
Expand Down Expand Up @@ -87,9 +97,9 @@ def embed_for_search(self, input, **kwargs):
@embcli_core.hookimpl
def embedding_model():
def create(model_id: str, **kwargs):
model_ids = [alias[0] for alias in JinaClipModel.model_aliases]
model_ids = [alias[0] for alias in JinaMultiModalModel.model_aliases]
if model_id not in model_ids:
raise ValueError(f"Model ID {model_id} is not supported.")
return JinaClipModel(model_id, **kwargs)
return JinaMultiModalModel(model_id, **kwargs)

return JinaClipModel, create
return JinaMultiModalModel, create
12 changes: 6 additions & 6 deletions packages/embcli-jina/tests/embcli_jina/conftest.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pytest
from embcli_jina import jina, jina_clip
from embcli_jina import jina, jina_multimodal
from embcli_jina.jina import JinaEmbeddingModel
from embcli_jina.jina_clip import JinaClipModel
from embcli_jina.jina_multimodal import JinaMultiModalModel


@pytest.fixture
Expand All @@ -11,9 +11,9 @@ def jina_models():


@pytest.fixture
def jina_clip_models():
model_ids = [alias[0] for alias in JinaClipModel.model_aliases]
return [JinaClipModel(model_id) for model_id in model_ids]
def jina_multimodal_models():
model_ids = [alias[0] for alias in JinaMultiModalModel.model_aliases]
return [JinaMultiModalModel(model_id) for model_id in model_ids]


@pytest.fixture
Expand All @@ -25,5 +25,5 @@ def plugin_manager():
pm = pluggy.PluginManager("embcli")
pm.add_hookspecs(hookspecs)
pm.register(jina)
pm.register(jina_clip)
pm.register(jina_multimodal)
return pm
12 changes: 6 additions & 6 deletions packages/embcli-jina/tests/embcli_jina/test_cli_embed.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,21 @@
from embcli_core.cli import embed

skip_if_no_api_key = pytest.mark.skipif(
not os.environ.get("JINA_API_KEY") or not os.environ.get("RUN_JINA_CLIP_TESTS") == "1",
reason="JINA_API_KEY and RUN_JINA_CLIP_TESTS environment variables not set",
not os.environ.get("JINA_API_KEY") or not os.environ.get("RUN_JINA_MULTIMODAL_TESTS") == "1",
reason="JINA_API_KEY and RUN_JINA_MULTIMODAL_TESTS environment variables not set",
)


@skip_if_no_api_key
def test_embed_command_text(plugin_manager, mocker):
mocker.patch("embcli_core.cli._pm", plugin_manager)
runner = CliRunner()
result = runner.invoke(embed, ["--model", "jina-clip-v2", "flying cat"])
result = runner.invoke(embed, ["--model", "jina-v4", "flying cat"])
assert result.exit_code == 0

embeddings = json.loads(result.output)
assert isinstance(embeddings, list)
assert len(embeddings) == 1024
assert len(embeddings) == 2048
assert all(isinstance(val, float) for val in embeddings)


Expand All @@ -30,10 +30,10 @@ def test_embed_command_image(plugin_manager, mocker):
mocker.patch("embcli_core.cli._pm", plugin_manager)
runner = CliRunner()
image_path = files("tests.embcli_jina").joinpath("flying_cat.jpeg")
result = runner.invoke(embed, ["--model", "jina-clip-v2", "--image", str(image_path)])
result = runner.invoke(embed, ["--model", "jina-v4", "--image", str(image_path)])
assert result.exit_code == 0

embeddings = json.loads(result.output)
assert isinstance(embeddings, list)
assert len(embeddings) == 1024
assert len(embeddings) == 2048
assert all(isinstance(val, float) for val in embeddings)
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,19 @@

import pytest
from embcli_core.models import Modality
from embcli_jina.jina_clip import JinaClipModel, embedding_model
from embcli_jina.jina_multimodal import JinaMultiModalModel, embedding_model

skip_if_no_api_key = pytest.mark.skipif(
not os.environ.get("JINA_API_KEY") or not os.environ.get("RUN_JINA_CLIP_TESTS") == "1",
reason="JINA_API_KEY and RUN_JINA_CLIP_TESTS environment variables not set",
not os.environ.get("JINA_API_KEY") or not os.environ.get("RUN_JINA_MULTIMODAL_TESTS") == "1",
reason="JINA_API_KEY and RUN_JINA_MULTIMODAL_TESTS environment variables not set",
)


@skip_if_no_api_key
def test_factory_create_valid_model():
_, create = embedding_model()
model = create("jina-clip-v2")
assert isinstance(model, JinaClipModel)
assert isinstance(model, JinaMultiModalModel)
assert model.model_id == "jina-clip-v2"
assert model.endpoint == "https://api.jina.ai/v1/embeddings"

Expand All @@ -28,8 +28,8 @@ def test_factory_create_invalid_model():


@skip_if_no_api_key
def test_embed_one_batch_multimodal(jina_clip_models):
for model in jina_clip_models:
def test_embed_one_batch_multimodal(jina_multimodal_models):
for model in jina_multimodal_models:
print(f"Testing model: {model.model_id}")
input_data = ["hello", "world"]

Expand All @@ -42,8 +42,8 @@ def test_embed_one_batch_multimodal(jina_clip_models):


@skip_if_no_api_key
def test_embed_one_batch_multimodal_image(jina_clip_models):
for model in jina_clip_models:
def test_embed_one_batch_multimodal_image(jina_multimodal_models):
for model in jina_multimodal_models:
image_paths = [
files("tests.embcli_jina").joinpath("flying_cat.jpeg"),
files("tests.embcli_jina").joinpath("sleepy_sheep.jpeg"),
Expand All @@ -54,13 +54,16 @@ def test_embed_one_batch_multimodal_image(jina_clip_models):
for emb in embeddings:
assert isinstance(emb, list)
assert all(isinstance(x, float) for x in emb)
assert len(emb) == 1024
if model.model_id == "jina-clip-v2":
assert len(emb) == 1024
elif model.model_id == "jina-embeddings-v4":
assert len(emb) == 2048


@skip_if_no_api_key
def test_embed_batch_with_options(jina_clip_models):
def test_embed_batch_with_options(jina_multimodal_models):
input_data = ["hello", "world"]
for model in jina_clip_models:
for model in jina_multimodal_models:
options = {"task": "retrieval.query", "dimensions": 512}
embeddings = list(model.embed_batch(input_data, None, **options))
assert len(embeddings) == len(input_data)
Expand All @@ -71,16 +74,15 @@ def test_embed_batch_with_options(jina_clip_models):


@skip_if_no_api_key
def test_embed_batch_embedding_types(jina_clip_models):
def test_embed_batch_embedding_types(jina_multimodal_models):
input_data = ["hello", "world"]
for model in jina_clip_models:
for model in jina_multimodal_models:
# Test binary embedding type
options = {"embedding_type": "binary"}
embeddings = list(model.embed_batch(input_data, None, **options))
assert len(embeddings) == len(input_data)
for emb in embeddings:
assert isinstance(emb, list)
assert all(isinstance(x, int) for x in emb)
assert all(-128 <= x <= 127 for x in emb)

# Test ubinary embedding type
Expand All @@ -89,5 +91,4 @@ def test_embed_batch_embedding_types(jina_clip_models):
assert len(embeddings) == len(input_data)
for emb in embeddings:
assert isinstance(emb, list)
assert all(isinstance(x, int) for x in emb)
assert all(0 <= x <= 255 for x in emb)
Loading