diff --git a/python/flink_agents/integrations/chat_models/anthropic/tests/test_anthropic_chat_model.py b/python/flink_agents/integrations/chat_models/anthropic/tests/test_anthropic_chat_model.py
index 9741e054a..1aeb0ce54 100644
--- a/python/flink_agents/integrations/chat_models/anthropic/tests/test_anthropic_chat_model.py
+++ b/python/flink_agents/integrations/chat_models/anthropic/tests/test_anthropic_chat_model.py
@@ -30,6 +30,8 @@
     AnthropicChatModelSetup,
 )
 
+pytestmark = pytest.mark.integration
+
 test_model = os.environ.get("TEST_MODEL")
 api_key = os.environ.get("TEST_API_KEY")
 
diff --git a/python/flink_agents/integrations/chat_models/azure/tests/test_azure_openai_chat_model.py b/python/flink_agents/integrations/chat_models/azure/tests/test_azure_openai_chat_model.py
index ce69d42ec..983bbb47f 100644
--- a/python/flink_agents/integrations/chat_models/azure/tests/test_azure_openai_chat_model.py
+++ b/python/flink_agents/integrations/chat_models/azure/tests/test_azure_openai_chat_model.py
@@ -30,6 +30,8 @@
 from flink_agents.plan.function import PythonFunction
 from flink_agents.plan.tools.function_tool import FunctionTool
 
+pytestmark = pytest.mark.integration
+
 test_deployment = os.environ.get("TEST_AZURE_DEPLOYMENT")
 api_key = os.environ.get("AZURE_OPENAI_API_KEY")
 azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
diff --git a/python/flink_agents/integrations/chat_models/openai/tests/test_openai_chat_model.py b/python/flink_agents/integrations/chat_models/openai/tests/test_openai_chat_model.py
index 7ccb6c225..ff4d2bb39 100644
--- a/python/flink_agents/integrations/chat_models/openai/tests/test_openai_chat_model.py
+++ b/python/flink_agents/integrations/chat_models/openai/tests/test_openai_chat_model.py
@@ -31,6 +31,8 @@
 from flink_agents.plan.function import PythonFunction
 from flink_agents.plan.tools.function_tool import FunctionTool
 
+pytestmark = pytest.mark.integration
+
 test_model = os.environ.get("TEST_MODEL")
 api_key = os.environ.get("TEST_API_KEY")
 api_base_url = os.environ.get("TEST_API_BASE_URL")
diff --git a/python/flink_agents/integrations/chat_models/tests/test_ollama_chat_model.py b/python/flink_agents/integrations/chat_models/tests/test_ollama_chat_model.py
index 6a2a47117..27f07f340 100644
--- a/python/flink_agents/integrations/chat_models/tests/test_ollama_chat_model.py
+++ b/python/flink_agents/integrations/chat_models/tests/test_ollama_chat_model.py
@@ -34,6 +34,8 @@
 from flink_agents.plan.function import PythonFunction
 from flink_agents.plan.tools.function_tool import FunctionTool
 
+pytestmark = pytest.mark.integration
+
 test_model = os.environ.get("OLLAMA_CHAT_MODEL", "qwen3:1.7b")
 current_dir = Path(__file__).parent
 
diff --git a/python/flink_agents/integrations/chat_models/tests/test_tongyi_chat_model.py b/python/flink_agents/integrations/chat_models/tests/test_tongyi_chat_model.py
index c33a792ce..997422513 100644
--- a/python/flink_agents/integrations/chat_models/tests/test_tongyi_chat_model.py
+++ b/python/flink_agents/integrations/chat_models/tests/test_tongyi_chat_model.py
@@ -32,6 +32,8 @@
 from flink_agents.plan.function import PythonFunction
 from flink_agents.plan.tools.function_tool import FunctionTool
 
+pytestmark = pytest.mark.integration
+
 test_model = os.environ.get("TONGYI_CHAT_MODEL", "qwen-plus")
 api_key_available = "DASHSCOPE_API_KEY" in os.environ
 
diff --git a/python/flink_agents/integrations/embedding_models/local/tests/test_ollama_embedding_model.py b/python/flink_agents/integrations/embedding_models/local/tests/test_ollama_embedding_model.py
index b770eea3f..dd71c14ca 100644
--- a/python/flink_agents/integrations/embedding_models/local/tests/test_ollama_embedding_model.py
+++ b/python/flink_agents/integrations/embedding_models/local/tests/test_ollama_embedding_model.py
@@ -31,6 +31,8 @@
     OllamaEmbeddingModelSetup,
 )
 
+pytestmark = pytest.mark.integration
+
 test_model = os.environ.get("OLLAMA_EMBEDDING_MODEL", "all-minilm:22m")
 current_dir = Path(__file__).parent
 
diff --git a/python/flink_agents/integrations/embedding_models/tests/test_openai_embedding_model.py b/python/flink_agents/integrations/embedding_models/tests/test_openai_embedding_model.py
index e76cc3faa..49907340f 100644
--- a/python/flink_agents/integrations/embedding_models/tests/test_openai_embedding_model.py
+++ b/python/flink_agents/integrations/embedding_models/tests/test_openai_embedding_model.py
@@ -27,6 +27,8 @@
     OpenAIEmbeddingModelSetup,
 )
 
+pytestmark = pytest.mark.integration
+
 test_model = os.environ.get("TEST_EMBEDDING_MODEL", "text-embedding-3-small")
 api_key = os.environ.get("TEST_API_KEY")
 
diff --git a/python/flink_agents/integrations/embedding_models/tests/test_tongyi_embedding_model.py b/python/flink_agents/integrations/embedding_models/tests/test_tongyi_embedding_model.py
index 75e477067..b60c75596 100644
--- a/python/flink_agents/integrations/embedding_models/tests/test_tongyi_embedding_model.py
+++ b/python/flink_agents/integrations/embedding_models/tests/test_tongyi_embedding_model.py
@@ -29,6 +29,8 @@
     TongyiEmbeddingModelSetup,
 )
 
+pytestmark = pytest.mark.integration
+
 test_model = os.environ.get("TONGYI_EMBEDDING_MODEL", "text-embedding-v4")
 api_key_available = "DASHSCOPE_API_KEY" in os.environ
 
diff --git a/python/flink_agents/integrations/vector_stores/chroma/tests/test_chroma_vector_store.py b/python/flink_agents/integrations/vector_stores/chroma/tests/test_chroma_vector_store.py
index 754798cc8..135ded8b5 100644
--- a/python/flink_agents/integrations/vector_stores/chroma/tests/test_chroma_vector_store.py
+++ b/python/flink_agents/integrations/vector_stores/chroma/tests/test_chroma_vector_store.py
@@ -42,6 +42,8 @@
     _translate_filters_to_chroma_where,
 )
 
+pytestmark = pytest.mark.integration
+
 api_key = os.environ.get("TEST_API_KEY")
 tenant = os.environ.get("TEST_TENANT")
 database = os.environ.get("TEST_DATABASE")
diff --git a/python/flink_agents/integrations/vector_stores/mem0/tests/test_mem0_vector_store.py b/python/flink_agents/integrations/vector_stores/mem0/tests/test_mem0_vector_store.py
index 357d0352b..27253b964 100644
--- a/python/flink_agents/integrations/vector_stores/mem0/tests/test_mem0_vector_store.py
+++ b/python/flink_agents/integrations/vector_stores/mem0/tests/test_mem0_vector_store.py
@@ -62,9 +62,12 @@ def _doc(
         Mem0VectorStore,
     )
 
-pytestmark = pytest.mark.skipif(
-    not _backend_available, reason="mem0 / chromadb is not available"
-)
+pytestmark = [
+    pytest.mark.skipif(
+        not _backend_available, reason="mem0 / chromadb is not available"
+    ),
+    pytest.mark.integration,
+]
 
 
 # ---------------------------------------------------------------------------
diff --git a/python/pyproject.toml b/python/pyproject.toml
index cbd0d5b69..2d2d8d41c 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -202,3 +202,9 @@ strict = true
 
 [tool.ruff.format]
 docstring-code-format = true
+
+[tool.pytest.ini_options]
+markers = [
+    "integration: tests that require live external services (Ollama, DashScope, OpenAI, Azure, Anthropic, Chroma, mem0). Deselect with -m 'not integration'.",
+]
+addopts = ["--strict-markers"]  # flag form is enforced by every pytest release; the strict_markers ini key is only recognised by pytest >= 9
diff --git a/tools/ut.sh b/tools/ut.sh
index 8b9de711a..a40303e4b 100755
--- a/tools/ut.sh
+++ b/tools/ut.sh
@@ -229,6 +229,7 @@ python_tests() {
             if $run_e2e; then
                 # There will be an individual build step before run e2e test for including java dist
                 uv pip install apache-flink~=${version}.0
+                # Arm 1: existing e2e tests (directory-based selector).
                 uv run --no-sync pytest flink_agents \
                 -s \
                 -k "e2e_tests_integration" \
@@ -236,15 +237,32 @@ python_tests() {
                 --reruns-delay 5 \
                 -o log_cli=true \
                 -o log_cli_level=${LOG_LEVEL:-CRITICAL}
+                rc1=$?
+                # Arm 2: integration-marked tests (registered in pyproject.toml).
+                # Trap exit code 5 (no tests collected) as failure to defend
+                # against -m selector typos that --strict-markers does not catch.
+                uv run --no-sync pytest flink_agents \
+                -s \
+                -m "integration" \
+                -o log_cli=true \
+                -o log_cli_level=${LOG_LEVEL:-CRITICAL}
+                rc2=$?
+                if [ $rc2 -eq 5 ]; then rc2=1; fi
+                # Logical-OR aggregation: any nonzero exit on either arm yields testcode=1.
+                # Side effect: pytest exit 5 (no tests collected) becomes failure on BOTH
+                # arms, not just arm 2 — which is the correct semantics (zero collection
+                # on either arm indicates a selector regression).
+                testcode=$((rc1 || rc2))
             else
                 uv sync --extra test
                 uv pip install apache-flink~=${version}.0
                 uv run --no-sync pytest flink_agents \
                 -k "not e2e_tests" \
+                -m "not integration" \
                 -o log_cli=true \
-                -o log_cli_level=${LOG_LEVEL:-CRITICAL}            
+                -o log_cli_level=${LOG_LEVEL:-CRITICAL}
+                testcode=$?
             fi
-            testcode=$?
         else
             if $verbose; then
                 echo "uv not found, falling back to pip"
@@ -262,10 +280,20 @@ python_tests() {
             fi
             if $run_e2e; then
                 pytest flink_agents -k "e2e_tests_integration" --reruns 2 --reruns-delay 5 -o log_cli=true -o log_cli_level=${LOG_LEVEL:-OFF}
+                rc1=$?
+                # Arm 2: integration-marked tests; trap exit code 5 as failure.
+                pytest flink_agents -m "integration" -o log_cli=true -o log_cli_level=${LOG_LEVEL:-OFF}
+                rc2=$?
+                if [ $rc2 -eq 5 ]; then rc2=1; fi
+                # Logical-OR aggregation: any nonzero exit on either arm yields testcode=1.
+                # Side effect: pytest exit 5 (no tests collected) becomes failure on BOTH
+                # arms, not just arm 2 — which is the correct semantics (zero collection
+                # on either arm indicates a selector regression).
+                testcode=$((rc1 || rc2))
             else
-                pytest flink_agents -k "not e2e_tests" -o log_cli=true -o log_cli_level=${LOG_LEVEL:-OFF}
+                pytest flink_agents -k "not e2e_tests" -m "not integration" -o log_cli=true -o log_cli_level=${LOG_LEVEL:-OFF}
+                testcode=$?
             fi
-            testcode=$?
         fi
 
         # Handle pytest exit codes