diff --git a/.github/workflows/ci-jina.yml b/.github/workflows/ci-jina.yml index 92475c2..d363f9a 100644 --- a/.github/workflows/ci-jina.yml +++ b/.github/workflows/ci-jina.yml @@ -47,6 +47,6 @@ jobs: env: JINA_API_KEY: ${{ secrets.JINA_API_KEY }} RUN_JINA_TESTS: "1" - RUN_JINA_CLIP_TESTS: "1" + RUN_JINA_MULTIMODAL_TESTS: "1" run: | uv run --package embcli-jina pytest packages/embcli-jina/tests diff --git a/packages/embcli-jina/README.md b/packages/embcli-jina/README.md index 9167f76..08b7670 100644 --- a/packages/embcli-jina/README.md +++ b/packages/embcli-jina/README.md @@ -46,12 +46,15 @@ JinaEmbeddingModel * dimensions (int) - The number of dimensions the resulting output embeddings should have. Only supported in jina-embeddings-v3 and jina-colbert-v2. * input_type (str) - The type of input to the model. Supported types: 'query', 'document' Only supported in jina-corebert-v2. * embedding_type (str) - The type of embeddings to return. Options include 'float', 'binary', 'ubinary'. Default is 'float'. -JinaClipModel +JinaMultiModalModel Vendor: jina Models: + * jina-embeddings-v4 (aliases: jina-v4) * jina-clip-v2 (aliases: ) Model Options: - * task (str) - Downstream task for which the embeddings are used. Supported tasks: 'retrieval.query', 'retrieval.passage'. + * task (str) - Downstream task for which the embeddings are used. Supported tasks: 'retrieval.query', 'retrieval.passage', 'text-matching', 'code.query', 'code.passage'. + * late_chunking (bool) - Whether if the late chunking is applied. Only supported in jina-embeddings-v4. + * truncate (bool) - When enabled, the model will automatically drop the tail that extends beyond the maximum context length allowed by the model instead of throwing an error. Only supported in jina-embeddings-v4. * dimensions (int) - The number of dimensions the resulting output embeddings should have. * embedding_type (str) - The type of embeddings to return. Options include 'float', 'binary', 'ubinary'. Default is 'float'. 
@@ -64,9 +67,9 @@ emb embed -m jina-v3 "Embeddings are essential for semantic search and RAG apps. # get an embedding for an input text by jina-embeddings-v3 model model with embedding_type=binary. emb embed -m jina-v3 "Embeddings are essential for semantic search and RAG apps." -o embedding_type binary -# get an embedding for an image input by jina-clip-v2 model. +# get an embedding for an image input by jina-embeddings-v4 model. # assume you have an image file named `gingercat.jpg` in the current directory. -emb embed -m jina-clip-v2 --image gingercat.jpeg +emb embed -m jina-v4 --image gingercat.jpeg # calculate similarity score between two texts by jina-embeddings-v3 model model. the default metric is cosine similarity. emb simscore -m jina-v3 "The cat drifts toward sleep." "Sleep dances in the cat's eyes." diff --git a/packages/embcli-jina/pyproject.toml b/packages/embcli-jina/pyproject.toml index 2b825ea..07d454e 100644 --- a/packages/embcli-jina/pyproject.toml +++ b/packages/embcli-jina/pyproject.toml @@ -34,4 +34,4 @@ build-backend = "hatchling.build" [project.entry-points."embcli"] jina = "embcli_jina.jina" -jina-clip = "embcli_jina.jina_clip" +jina-clip = "embcli_jina.jina_multimodal" diff --git a/packages/embcli-jina/src/embcli_jina/jina_clip.py b/packages/embcli-jina/src/embcli_jina/jina_multimodal.py similarity index 77% rename from packages/embcli-jina/src/embcli_jina/jina_clip.py rename to packages/embcli-jina/src/embcli_jina/jina_multimodal.py index c911d8c..bd9d4da 100644 --- a/packages/embcli-jina/src/embcli_jina/jina_clip.py +++ b/packages/embcli-jina/src/embcli_jina/jina_multimodal.py @@ -14,15 +14,25 @@ def image_to_base64(image_path: str) -> str: return data.decode("utf-8") -class JinaClipModel(MultimodalEmbeddingModel): +class JinaMultiModalModel(MultimodalEmbeddingModel): vendor = "jina" default_batch_size = 100 - model_aliases = [("jina-clip-v2", [])] + model_aliases = [("jina-embeddings-v4", ["jina-v4"]), ("jina-clip-v2", [])] valid_options 
= [ ModelOption( "task", ModelOptionType.STR, - "Downstream task for which the embeddings are used. Supported tasks: 'retrieval.query', 'retrieval.passage'.", # noqa: E501 + "Downstream task for which the embeddings are used. Supported tasks: 'retrieval.query', 'retrieval.passage', 'text-matching', 'code.query', 'code.passage'.", # noqa: E501 + ), + ModelOption( + "late_chunking", + ModelOptionType.BOOL, + "Whether if the late chunking is applied. Only supported in jina-embeddings-v4.", + ), + ModelOption( + "truncate", + ModelOptionType.BOOL, + "When enabled, the model will automatically drop the tail that extends beyond the maximum context length allowed by the model instead of throwing an error. Only supported in jina-embeddings-v4.", # noqa: E501 ), ModelOption( "dimensions", @@ -87,9 +97,9 @@ def embed_for_search(self, input, **kwargs): @embcli_core.hookimpl def embedding_model(): def create(model_id: str, **kwargs): - model_ids = [alias[0] for alias in JinaClipModel.model_aliases] + model_ids = [alias[0] for alias in JinaMultiModalModel.model_aliases] if model_id not in model_ids: raise ValueError(f"Model ID {model_id} is not supported.") - return JinaClipModel(model_id, **kwargs) + return JinaMultiModalModel(model_id, **kwargs) - return JinaClipModel, create + return JinaMultiModalModel, create diff --git a/packages/embcli-jina/tests/embcli_jina/conftest.py b/packages/embcli-jina/tests/embcli_jina/conftest.py index bd729cf..2b4a504 100644 --- a/packages/embcli-jina/tests/embcli_jina/conftest.py +++ b/packages/embcli-jina/tests/embcli_jina/conftest.py @@ -1,7 +1,7 @@ import pytest -from embcli_jina import jina, jina_clip +from embcli_jina import jina, jina_multimodal from embcli_jina.jina import JinaEmbeddingModel -from embcli_jina.jina_clip import JinaClipModel +from embcli_jina.jina_multimodal import JinaMultiModalModel @pytest.fixture @@ -11,9 +11,9 @@ def jina_models(): @pytest.fixture -def jina_clip_models(): - model_ids = [alias[0] for alias in 
JinaClipModel.model_aliases] - return [JinaClipModel(model_id) for model_id in model_ids] +def jina_multimodal_models(): + model_ids = [alias[0] for alias in JinaMultiModalModel.model_aliases] + return [JinaMultiModalModel(model_id) for model_id in model_ids] @pytest.fixture @@ -25,5 +25,5 @@ def plugin_manager(): pm = pluggy.PluginManager("embcli") pm.add_hookspecs(hookspecs) pm.register(jina) - pm.register(jina_clip) + pm.register(jina_multimodal) return pm diff --git a/packages/embcli-jina/tests/embcli_jina/test_cli_embed.py b/packages/embcli-jina/tests/embcli_jina/test_cli_embed.py index f87c595..3277c86 100644 --- a/packages/embcli-jina/tests/embcli_jina/test_cli_embed.py +++ b/packages/embcli-jina/tests/embcli_jina/test_cli_embed.py @@ -7,8 +7,8 @@ from embcli_core.cli import embed skip_if_no_api_key = pytest.mark.skipif( - not os.environ.get("JINA_API_KEY") or not os.environ.get("RUN_JINA_CLIP_TESTS") == "1", - reason="JINA_API_KEY and RUN_JINA_CLIP_TESTS environment variables not set", + not os.environ.get("JINA_API_KEY") or not os.environ.get("RUN_JINA_MULTIMODAL_TESTS") == "1", + reason="JINA_API_KEY and RUN_JINA_MULTIMODAL_TESTS environment variables not set", ) @@ -16,12 +16,12 @@ def test_embed_command_text(plugin_manager, mocker): mocker.patch("embcli_core.cli._pm", plugin_manager) runner = CliRunner() - result = runner.invoke(embed, ["--model", "jina-clip-v2", "flying cat"]) + result = runner.invoke(embed, ["--model", "jina-v4", "flying cat"]) assert result.exit_code == 0 embeddings = json.loads(result.output) assert isinstance(embeddings, list) - assert len(embeddings) == 1024 + assert len(embeddings) == 2048 assert all(isinstance(val, float) for val in embeddings) @@ -30,10 +30,10 @@ def test_embed_command_image(plugin_manager, mocker): mocker.patch("embcli_core.cli._pm", plugin_manager) runner = CliRunner() image_path = files("tests.embcli_jina").joinpath("flying_cat.jpeg") - result = runner.invoke(embed, ["--model", "jina-clip-v2", "--image", 
str(image_path)]) + result = runner.invoke(embed, ["--model", "jina-v4", "--image", str(image_path)]) assert result.exit_code == 0 embeddings = json.loads(result.output) assert isinstance(embeddings, list) - assert len(embeddings) == 1024 + assert len(embeddings) == 2048 assert all(isinstance(val, float) for val in embeddings) diff --git a/packages/embcli-jina/tests/embcli_jina/test_jina_clip.py b/packages/embcli-jina/tests/embcli_jina/test_jina_multimodal.py similarity index 76% rename from packages/embcli-jina/tests/embcli_jina/test_jina_clip.py rename to packages/embcli-jina/tests/embcli_jina/test_jina_multimodal.py index db6c97e..4452ac7 100644 --- a/packages/embcli-jina/tests/embcli_jina/test_jina_clip.py +++ b/packages/embcli-jina/tests/embcli_jina/test_jina_multimodal.py @@ -3,11 +3,11 @@ import pytest from embcli_core.models import Modality -from embcli_jina.jina_clip import JinaClipModel, embedding_model +from embcli_jina.jina_multimodal import JinaMultiModalModel, embedding_model skip_if_no_api_key = pytest.mark.skipif( - not os.environ.get("JINA_API_KEY") or not os.environ.get("RUN_JINA_CLIP_TESTS") == "1", - reason="JINA_API_KEY and RUN_JINA_CLIP_TESTS environment variables not set", + not os.environ.get("JINA_API_KEY") or not os.environ.get("RUN_JINA_MULTIMODAL_TESTS") == "1", + reason="JINA_API_KEY and RUN_JINA_MULTIMODAL_TESTS environment variables not set", ) @@ -15,7 +15,7 @@ def test_factory_create_valid_model(): _, create = embedding_model() model = create("jina-clip-v2") - assert isinstance(model, JinaClipModel) + assert isinstance(model, JinaMultiModalModel) assert model.model_id == "jina-clip-v2" assert model.endpoint == "https://api.jina.ai/v1/embeddings" @@ -28,8 +28,8 @@ def test_factory_create_invalid_model(): @skip_if_no_api_key -def test_embed_one_batch_multimodal(jina_clip_models): - for model in jina_clip_models: +def test_embed_one_batch_multimodal(jina_multimodal_models): + for model in jina_multimodal_models: print(f"Testing model: 
{model.model_id}") input_data = ["hello", "world"] @@ -42,8 +42,8 @@ def test_embed_one_batch_multimodal(jina_clip_models): @skip_if_no_api_key -def test_embed_one_batch_multimodal_image(jina_clip_models): - for model in jina_clip_models: +def test_embed_one_batch_multimodal_image(jina_multimodal_models): + for model in jina_multimodal_models: image_paths = [ files("tests.embcli_jina").joinpath("flying_cat.jpeg"), files("tests.embcli_jina").joinpath("sleepy_sheep.jpeg"), @@ -54,13 +54,16 @@ def test_embed_one_batch_multimodal_image(jina_clip_models): for emb in embeddings: assert isinstance(emb, list) assert all(isinstance(x, float) for x in emb) - assert len(emb) == 1024 + if model.model_id == "jina-clip-v2": + assert len(emb) == 1024 + elif model.model_id == "jina-embeddings-v4": + assert len(emb) == 2048 @skip_if_no_api_key -def test_embed_batch_with_options(jina_clip_models): +def test_embed_batch_with_options(jina_multimodal_models): input_data = ["hello", "world"] - for model in jina_clip_models: + for model in jina_multimodal_models: options = {"task": "retrieval.query", "dimensions": 512} embeddings = list(model.embed_batch(input_data, None, **options)) assert len(embeddings) == len(input_data) @@ -71,16 +74,15 @@ def test_embed_batch_with_options(jina_clip_models): @skip_if_no_api_key -def test_embed_batch_embedding_types(jina_clip_models): +def test_embed_batch_embedding_types(jina_multimodal_models): input_data = ["hello", "world"] - for model in jina_clip_models: + for model in jina_multimodal_models: # Test binary embedding type options = {"embedding_type": "binary"} embeddings = list(model.embed_batch(input_data, None, **options)) assert len(embeddings) == len(input_data) for emb in embeddings: assert isinstance(emb, list) - assert all(isinstance(x, int) for x in emb) assert all(-128 <= x <= 127 for x in emb) # Test ubinary embedding type @@ -89,5 +91,4 @@ def test_embed_batch_embedding_types(jina_clip_models): assert len(embeddings) == len(input_data) for
emb in embeddings: assert isinstance(emb, list) - assert all(isinstance(x, int) for x in emb) assert all(0 <= x <= 255 for x in emb)