From c031d4bbf974c4c2ad028fcdec9646beaafdb5f7 Mon Sep 17 00:00:00 2001 From: HuyNVQ Date: Thu, 12 Feb 2026 17:31:19 +0700 Subject: [PATCH] feat(extension): add Blaze STT and TTS extensions --- .../extension/blaze_stt_python/README.md | 121 ++++++ .../extension/blaze_stt_python/__init__.py | 12 + .../extension/blaze_stt_python/blaze_stt.py | 348 +++++++++++++++ .../extension/blaze_stt_python/manifest.json | 83 ++++ .../extension/blaze_stt_python/property.json | 11 + .../blaze_stt_python/requirements.txt | 3 + .../blaze_stt_python/tests/__init__.py | 4 + .../blaze_stt_python/tests/conftest.py | 111 +++++ .../blaze_stt_python/tests/pytest.ini | 7 + .../blaze_stt_python/tests/requirements.txt | 4 + .../blaze_stt_python/tests/test_blaze_stt.py | 350 ++++++++++++++++ .../extension/blaze_tts_python/README.md | 125 ++++++ .../extension/blaze_tts_python/__init__.py | 12 + .../extension/blaze_tts_python/blaze_tts.py | 396 ++++++++++++++++++ .../extension/blaze_tts_python/manifest.json | 86 ++++ .../extension/blaze_tts_python/property.json | 12 + .../blaze_tts_python/requirements.txt | 3 + .../blaze_tts_python/tests/__init__.py | 4 + .../blaze_tts_python/tests/conftest.py | 60 +++ .../blaze_tts_python/tests/pytest.ini | 7 + .../blaze_tts_python/tests/requirements.txt | 4 + .../blaze_tts_python/tests/test_blaze_tts.py | 375 +++++++++++++++++ 22 files changed, 2138 insertions(+) create mode 100644 ai_agents/agents/ten_packages/extension/blaze_stt_python/README.md create mode 100644 ai_agents/agents/ten_packages/extension/blaze_stt_python/__init__.py create mode 100644 ai_agents/agents/ten_packages/extension/blaze_stt_python/blaze_stt.py create mode 100644 ai_agents/agents/ten_packages/extension/blaze_stt_python/manifest.json create mode 100644 ai_agents/agents/ten_packages/extension/blaze_stt_python/property.json create mode 100644 ai_agents/agents/ten_packages/extension/blaze_stt_python/requirements.txt create mode 100644 
ai_agents/agents/ten_packages/extension/blaze_stt_python/tests/__init__.py create mode 100644 ai_agents/agents/ten_packages/extension/blaze_stt_python/tests/conftest.py create mode 100644 ai_agents/agents/ten_packages/extension/blaze_stt_python/tests/pytest.ini create mode 100644 ai_agents/agents/ten_packages/extension/blaze_stt_python/tests/requirements.txt create mode 100644 ai_agents/agents/ten_packages/extension/blaze_stt_python/tests/test_blaze_stt.py create mode 100644 ai_agents/agents/ten_packages/extension/blaze_tts_python/README.md create mode 100644 ai_agents/agents/ten_packages/extension/blaze_tts_python/__init__.py create mode 100644 ai_agents/agents/ten_packages/extension/blaze_tts_python/blaze_tts.py create mode 100644 ai_agents/agents/ten_packages/extension/blaze_tts_python/manifest.json create mode 100644 ai_agents/agents/ten_packages/extension/blaze_tts_python/property.json create mode 100644 ai_agents/agents/ten_packages/extension/blaze_tts_python/requirements.txt create mode 100644 ai_agents/agents/ten_packages/extension/blaze_tts_python/tests/__init__.py create mode 100644 ai_agents/agents/ten_packages/extension/blaze_tts_python/tests/conftest.py create mode 100644 ai_agents/agents/ten_packages/extension/blaze_tts_python/tests/pytest.ini create mode 100644 ai_agents/agents/ten_packages/extension/blaze_tts_python/tests/requirements.txt create mode 100644 ai_agents/agents/ten_packages/extension/blaze_tts_python/tests/test_blaze_tts.py diff --git a/ai_agents/agents/ten_packages/extension/blaze_stt_python/README.md b/ai_agents/agents/ten_packages/extension/blaze_stt_python/README.md new file mode 100644 index 0000000000..54687ab6c5 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/blaze_stt_python/README.md @@ -0,0 +1,121 @@ +# Blaze STT Extension for TEN Framework + +Blaze Speech-to-Text (STT) extension for [TEN Framework](https://github.com/TEN-framework/ten-framework). 
+ +## Installation + +```bash +pip install -r requirements.txt +``` + +Or install dependencies directly: + +```bash +pip install httpx pydantic +``` + +## Configuration + +### Environment Variables + +Set the following environment variables: + +```bash +export BLAZE_STT_API_URL="http://localhost:8000" +export BLAZE_STT_API_KEY="your-api-key-here" # Optional +``` + +### Property.json (TEN Framework) + +The extension includes a `property.json` file with default configuration that TEN framework can use: + +```json +{ + "params": { + "api_url": "${env:BLAZE_STT_API_URL}", + "api_key": "${env:BLAZE_STT_API_KEY}", + "language": "vi", + "enable_segments": false, + "enable_refinement": false, + "timeout": 3600 + } +} +``` + +TEN framework will automatically read this file and use environment variables for configuration. + +## Usage + +### As TEN Framework Extension + +```python +from blaze_stt_python import BlazeSTTExtension + +# Initialize extension (can accept dict config from TEN framework) +stt = BlazeSTTExtension(config={ + "api_url": "http://localhost:8000", + "api_key": "your-api-key", + "language": "vi", +}) + +# Process audio using TEN framework interface +result = stt.process({ + "audio_data": audio_bytes, + "audio_content_type": "audio/wav", + "language": "vi", +}) + +print(result["transcription"]) + +# Get extension metadata +metadata = stt.get_metadata() +print(metadata) +``` + +### As Direct Extension + +```python +from blaze_stt_python import BlazeSTTExtension, BlazeSTTConfig + +# Initialize extension +config = BlazeSTTConfig( + api_url="http://localhost:8000", + api_key="your-api-key", + default_language="vi", +) +stt = BlazeSTTExtension(config=config) + +# Transcribe audio +result = stt.transcribe( + audio_data=audio_bytes, + audio_content_type="audio/wav", + language="vi", +) + +print(result["transcription"]) +``` + +## API Reference + +### BlazeSTTExtension + +**TEN Framework Interface Methods:** +- `process(input_data)` - Process audio and return 
transcription (TEN framework interface) +- `get_metadata()` - Get extension metadata (TEN framework interface) + +**Direct Methods:** + +- `transcribe(audio_data, audio_file, audio_content_type, language, enable_segments, enable_refinement, lazy_process)` - Transcribe audio data (bytes) or file (UploadFile) +- `get_job_status(job_id)` - Get transcription job status + +## Supported Formats + +- `audio/wav` - WAV format +- `audio/mpeg` - MP3 format +- `audio/webm` - WebM format +- `audio/ogg` - OGG format + +## License + +This extension is provided as-is for use with the TEN Framework and Blaze services. + diff --git a/ai_agents/agents/ten_packages/extension/blaze_stt_python/__init__.py b/ai_agents/agents/ten_packages/extension/blaze_stt_python/__init__.py new file mode 100644 index 0000000000..f76f61db82 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/blaze_stt_python/__init__.py @@ -0,0 +1,12 @@ +""" +Blaze STT Extension for TEN Framework + +This extension provides Speech-to-Text (STT) functionality using Blaze API. +Implements TEN framework extension interface. +""" + +from .blaze_stt import BlazeSTTExtension, BlazeSTTConfig + +__all__ = ["BlazeSTTExtension", "BlazeSTTConfig"] +__version__ = "1.0.0" + diff --git a/ai_agents/agents/ten_packages/extension/blaze_stt_python/blaze_stt.py b/ai_agents/agents/ten_packages/extension/blaze_stt_python/blaze_stt.py new file mode 100644 index 0000000000..e5800f2575 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/blaze_stt_python/blaze_stt.py @@ -0,0 +1,348 @@ +""" +Blaze STT Extension Implementation + +This extension wraps the Blaze STT API endpoint for use in TEN framework. 
import logging
import os
from io import BytesIO
from typing import Any, BinaryIO, Dict, Optional, Tuple, Union
from urllib.parse import quote

import httpx
from pydantic import BaseModel, Field

# fastapi is optional: it is only needed for the ``audio_file`` (multipart)
# code path of ``BlazeSTTExtension.transcribe``.
try:
    from fastapi import UploadFile
except ImportError:
    # Fallback if fastapi is not available
    UploadFile = None

logger = logging.getLogger(__name__)

_DEFAULT_API_URL = "http://localhost:8000"
_DEFAULT_UPLOAD_FILENAME = "audio.mp3"
_FALLBACK_CONTENT_TYPE = "audio/mpeg"
_GENERIC_CONTENT_TYPE = "application/octet-stream"

# File extension -> MIME type, used when an upload carries no usable type.
_CONTENT_TYPE_BY_EXTENSION = {
    ".wav": "audio/wav",
    ".mp3": "audio/mpeg",
    ".mpeg": "audio/mpeg",
    ".webm": "audio/webm",
    ".ogg": "audio/ogg",
}

# Key in the dict-style config -> field name on BlazeSTTConfig.
_DICT_KEY_TO_FIELD = {
    "api_url": "api_url",
    "api_key": "api_key",
    "language": "default_language",
    "enable_segments": "enable_segments",
    "enable_refinement": "enable_refinement",
    "timeout": "timeout",
}


class BlazeSTTConfig(BaseModel):
    """Configuration for Blaze STT Extension.

    ``api_url`` and ``api_key`` default to the ``BLAZE_STT_API_URL`` and
    ``BLAZE_STT_API_KEY`` environment variables. The variables are read each
    time a config object is created (``default_factory``), not once at import
    time, so values exported after this module was imported are honoured.
    """

    api_url: str = Field(
        default_factory=lambda: os.getenv("BLAZE_STT_API_URL", _DEFAULT_API_URL),
        description="Blaze STT API base URL",
    )
    api_key: Optional[str] = Field(
        default_factory=lambda: os.getenv("BLAZE_STT_API_KEY", None),
        description="API key for authentication (Bearer token)",
    )
    timeout: int = Field(
        default=3600,
        description="Request timeout in seconds",
    )
    enable_segments: bool = Field(
        default=False,
        description="Split audio into segments with timestamps",
    )
    enable_refinement: bool = Field(
        default=False,
        description="Apply post-processing refinement to improve accuracy",
    )
    default_language: str = Field(
        default="vi",
        description="Default language code (e.g., 'vi' for Vietnamese)",
    )


class BlazeSTTExtension:
    """
    Blaze STT Extension for TEN Framework

    Wraps the Blaze STT HTTP API: ``POST /v1/stt/execute`` to transcribe and
    ``GET /v1/stt/{job_id}`` to poll a background job. Exposes ``process()``
    and ``get_metadata()`` in addition to the direct methods.
    """

    def __init__(self, config: Optional[Union[BlazeSTTConfig, Dict[str, Any]]] = None):
        """
        Initialize Blaze STT Extension

        Args:
            config: ``BlazeSTTConfig`` instance, a dict with the keys
                ``api_url``, ``api_key``, ``language``, ``enable_segments``,
                ``enable_refinement`` and ``timeout``, or ``None``.
                Keys that are missing, ``None`` or ``""`` in the dict fall
                back to the ``BlazeSTTConfig`` defaults (which consult the
                environment for ``api_url`` / ``api_key``). Any other object
                is stored unchanged.
        """
        if config is None:
            self.config = BlazeSTTConfig()
        elif isinstance(config, dict):
            self.config = BlazeSTTConfig(**self._config_kwargs_from_dict(config))
        else:
            self.config = config

        self.base_url = self.config.api_url.rstrip("/")
        self.endpoint = f"{self.base_url}/v1/stt/execute"

        logger.info("Blaze STT Extension initialized with API URL: %s", self.base_url)

    @staticmethod
    def _config_kwargs_from_dict(config: Dict[str, Any]) -> Dict[str, Any]:
        """Translate a dict-style config into ``BlazeSTTConfig`` keyword arguments."""
        # Unset/blank entries are skipped so the model defaults apply instead
        # of a blank URL or key reaching the HTTP layer.
        return {
            field: config[key]
            for key, field in _DICT_KEY_TO_FIELD.items()
            if config.get(key) not in (None, "")
        }

    def _auth_headers(self) -> Dict[str, str]:
        """Return a fresh header dict, with a Bearer token when an API key is set."""
        if self.config.api_key:
            return {"Authorization": f"Bearer {self.config.api_key}"}
        return {}

    @staticmethod
    def _extract_transcription(result: Any) -> str:
        """Return ``result["result"]["data"]["transcription"]`` or ``""`` if absent."""
        if not isinstance(result, dict):
            return ""
        inner = result.get("result")
        if not isinstance(inner, dict):
            return ""
        data = inner.get("data", {})
        if not isinstance(data, dict):
            return ""
        return data.get("transcription", "")

    @staticmethod
    def _resolve_content_type(filename: str, declared: Optional[str]) -> str:
        """Pick the MIME type for an upload.

        A specific declared type wins. A missing or generic
        (``application/octet-stream``) type is inferred from the file
        extension; if that fails, the declared generic type is kept, and a
        missing type becomes ``audio/mpeg``.
        """
        if declared and declared != _GENERIC_CONTENT_TYPE:
            return declared
        extension = os.path.splitext(filename)[1].lower()
        inferred = _CONTENT_TYPE_BY_EXTENSION.get(extension)
        if inferred:
            return inferred
        return declared or _FALLBACK_CONTENT_TYPE

    @classmethod
    def _build_upload_part(
        cls, audio_file: Any, audio_content_type: Optional[str]
    ) -> Tuple[str, Any, str]:
        """Build the ``(filename, stream, content_type)`` multipart tuple."""
        stream = audio_file.file
        # Rewind so a stream that was already read is sent from the start.
        if hasattr(stream, "seek"):
            stream.seek(0)
        filename = getattr(audio_file, "filename", None) or _DEFAULT_UPLOAD_FILENAME
        declared = audio_content_type or getattr(audio_file, "content_type", None)
        return filename, stream, cls._resolve_content_type(filename, declared)

    def transcribe(
        self,
        audio_data: Optional[bytes] = None,
        audio_file: Optional[UploadFile] = None,
        audio_content_type: Optional[str] = None,
        language: Optional[str] = None,
        enable_segments: Optional[bool] = None,
        enable_refinement: Optional[bool] = None,
        lazy_process: bool = False,
    ) -> Dict[str, Any]:
        """
        Transcribe audio data to text

        Sends the audio to ``/v1/stt/execute`` either as multipart/form-data
        (field ``audio_file``) or as a raw request body with a
        ``Content-Type`` header.

        Args:
            audio_data: Binary audio data. Required if audio_file is None.
            audio_file: Upload-like object exposing ``file``, ``filename`` and
                ``content_type``. If provided, audio_data is ignored.
            audio_content_type: MIME type. For raw bytes it defaults to
                "audio/wav"; for uploads it is taken from the upload or
                inferred from the filename extension.
            language: Language code. Defaults to the configured language.
            enable_segments: Split audio into segments with timestamps.
            enable_refinement: Apply post-processing refinement.
            lazy_process: Sent to the service as the ``lazy_process`` query
                parameter (background processing returning a job id).

        Returns:
            Dict with ``transcription``, ``job_id``, ``job_status`` and
            ``raw_result`` (the decoded service response).

        Raises:
            ValueError: If neither input is given, or audio_data is empty.
            ImportError: If audio_file is used but fastapi is not installed.
            httpx.HTTPError: If the API request fails.
        """
        # Validate inputs before any network resources are created.
        if audio_file is None and audio_data is None:
            raise ValueError("Either audio_data or audio_file must be provided")

        if audio_file is not None:
            if audio_data is not None:
                logger.warning("Both audio_file and audio_data provided. audio_file will be used.")
            if UploadFile is None:
                raise ImportError(
                    "fastapi is required to use audio_file parameter. "
                    "Install with: pip install fastapi"
                )
        elif not audio_data:
            raise ValueError("audio_data cannot be empty")

        # Use provided values or fall back to config defaults
        language = language or self.config.default_language
        if enable_segments is None:
            enable_segments = self.config.enable_segments
        if enable_refinement is None:
            enable_refinement = self.config.enable_refinement

        headers = self._auth_headers()
        # Booleans are sent as lowercase strings ("true"/"false").
        params = {
            "language": language,
            "enable_segments": str(enable_segments).lower(),
            "enable_refinement": str(enable_refinement).lower(),
            "lazy_process": str(lazy_process).lower(),
        }

        request_kwargs: Dict[str, Any] = {"headers": headers, "params": params}
        if audio_file is not None:
            # No explicit Content-Type header here: httpx generates the
            # multipart one (including the boundary) itself.
            request_kwargs["files"] = {
                "audio_file": self._build_upload_part(audio_file, audio_content_type)
            }
        else:
            headers["Content-Type"] = audio_content_type or "audio/wav"
            request_kwargs["content"] = audio_data

        try:
            with httpx.Client(timeout=self.config.timeout) as client:
                response = client.post(self.endpoint, **request_kwargs)
                response.raise_for_status()
                result = response.json()

                # Expected response shapes:
                # - lazy_process=False: {"job_status": "completed",
                #       "result": {"data": {"transcription": "..."}}}
                # - lazy_process=True: {"job_id": "...", "job_status": "processing"}
                return {
                    "transcription": self._extract_transcription(result),
                    "job_id": result.get("job_id"),
                    "job_status": result.get("job_status", "processing"),
                    "raw_result": result,  # Include full result for advanced use cases
                }

        except httpx.HTTPStatusError as e:
            logger.error("Blaze STT API error: %s - %s", e.response.status_code, e.response.text)
            raise
        except httpx.RequestError as e:
            logger.error("Blaze STT request error: %s", e)
            raise
        except Exception as e:
            logger.error("Unexpected error in Blaze STT: %s", e)
            raise

    def get_job_status(self, job_id: str) -> Dict[str, Any]:
        """
        Get status of a transcription job

        Args:
            job_id: Job ID returned from transcribe with lazy_process=True

        Returns:
            Dict with ``job_id``, ``job_status``, ``transcription`` (taken from
            result.data.transcription, "" if absent), ``result`` and
            ``raw_result`` (the decoded service response).

        Raises:
            ValueError: If job_id is empty.
            httpx.HTTPError: If the API request fails.
        """
        if not job_id:
            raise ValueError("job_id must be a non-empty string")

        # Percent-encode the id so characters such as "/", "?" or "#" cannot
        # alter the request path or query.
        encoded_job_id = quote(str(job_id), safe="")
        endpoint = f"{self.base_url}/v1/stt/{encoded_job_id}"

        try:
            with httpx.Client(timeout=30) as client:
                response = client.get(endpoint, headers=self._auth_headers())
                response.raise_for_status()
                result = response.json()

                return {
                    "job_id": result.get("job_id", job_id),
                    "job_status": result.get("job_status", "processing"),
                    "transcription": self._extract_transcription(result),
                    "result": result.get("result"),
                    "raw_result": result,  # Include full result for advanced use cases
                }

        except httpx.HTTPStatusError as e:
            logger.error(
                "Blaze STT job status error: %s - %s", e.response.status_code, e.response.text
            )
            raise
        except httpx.RequestError as e:
            logger.error("Blaze STT request error: %s", e)
            raise

    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Transcribe audio described by a dict and return a flat result dict.

        Args:
            input_data: Input dict with:
                - audio_data (bytes): Required. Audio data to transcribe
                - audio_content_type (str): Optional. MIME type (default: "audio/wav")
                - language (str): Optional. Language code (default: from config)
                - enable_segments (bool): Optional. Enable segments
                - enable_refinement (bool): Optional. Enable refinement
                - lazy_process (bool): Optional. Process in background (default: False)

        Returns:
            Output dict with:
                - transcription (str): Transcribed text
                - job_id (str): Job ID, if the service returned one
                - status (str): Job status
                - raw_result (dict): The dict returned by transcribe()

        Raises:
            ValueError: If audio_data is missing or empty.
        """
        audio_data = input_data.get("audio_data")
        if not audio_data:
            raise ValueError("audio_data is required in input_data")

        result = self.transcribe(
            audio_data=audio_data,
            audio_content_type=input_data.get("audio_content_type", "audio/wav"),
            language=input_data.get("language"),
            enable_segments=input_data.get("enable_segments"),
            enable_refinement=input_data.get("enable_refinement"),
            lazy_process=input_data.get("lazy_process", False),
        )

        return {
            "transcription": result.get("transcription", ""),
            "job_id": result.get("job_id"),
            "status": result.get("job_status", "completed"),
            # NOTE: this is transcribe()'s normalized dict; the service
            # payload itself is nested one level down under "raw_result".
            "raw_result": result,
        }

    def get_metadata(self) -> Dict[str, Any]:
        """
        Return static extension metadata.

        Returns:
            Dict with name, version, description, capabilities, supported
            formats/languages and a schema of the dict-style config keys.
        """
        return {
            "name": "blaze_stt_python",
            "version": "1.0.0",
            "description": "Blaze Speech-to-Text extension for TEN framework",
            "capabilities": ["stt", "transcription", "speech_to_text"],
            "supported_formats": ["audio/wav", "audio/mpeg", "audio/webm", "audio/ogg"],
            "supported_languages": ["vi", "en"],
            "config_schema": {
                "api_url": {"type": "string", "required": False, "default": "http://localhost:8000"},
                "api_key": {"type": "string", "required": False},
                "language": {"type": "string", "required": False, "default": "vi"},
                "enable_segments": {"type": "boolean", "required": False, "default": False},
                "enable_refinement": {"type": "boolean", "required": False, "default": False},
                "timeout": {"type": "number", "required": False, "default": 3600},
            },
        }
+ "params": { + "type": "object", + "properties": { + "api_url": { + "type": "string" + }, + "api_key": { + "type": "string" + }, + "language": { + "type": "string" + }, + "enable_segments": { + "type": "boolean" + }, + "enable_refinement": { + "type": "boolean" + }, + "timeout": { + "type": "number" + } + } + } + } + } + } +} + diff --git a/ai_agents/agents/ten_packages/extension/blaze_stt_python/property.json b/ai_agents/agents/ten_packages/extension/blaze_stt_python/property.json new file mode 100644 index 0000000000..200877eaf5 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/blaze_stt_python/property.json @@ -0,0 +1,11 @@ +{ + "params": { + "api_url": "${env:BLAZE_STT_API_URL}", + "api_key": "${env:BLAZE_STT_API_KEY}", + "language": "vi", + "enable_segments": false, + "enable_refinement": false, + "timeout": 3600 + } +} + diff --git a/ai_agents/agents/ten_packages/extension/blaze_stt_python/requirements.txt b/ai_agents/agents/ten_packages/extension/blaze_stt_python/requirements.txt new file mode 100644 index 0000000000..e0a3d7ee1a --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/blaze_stt_python/requirements.txt @@ -0,0 +1,3 @@ +httpx>=0.24.0 +pydantic>=2.0.0 + diff --git a/ai_agents/agents/ten_packages/extension/blaze_stt_python/tests/__init__.py b/ai_agents/agents/ten_packages/extension/blaze_stt_python/tests/__init__.py new file mode 100644 index 0000000000..4b0485c84e --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/blaze_stt_python/tests/__init__.py @@ -0,0 +1,4 @@ +""" +Tests for Blaze STT Extension +""" + diff --git a/ai_agents/agents/ten_packages/extension/blaze_stt_python/tests/conftest.py b/ai_agents/agents/ten_packages/extension/blaze_stt_python/tests/conftest.py new file mode 100644 index 0000000000..a8ba55fb8a --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/blaze_stt_python/tests/conftest.py @@ -0,0 +1,111 @@ +""" +Pytest fixtures for Blaze STT Extension tests +""" +import io +import pytest +from typing 
try:
    from fastapi import UploadFile
except ImportError:
    UploadFile = None


@pytest.fixture
def sample_audio_bytes():
    """Return a 44-byte PCM WAV header with an empty data chunk."""
    header_fields = (
        b'RIFF',              # ChunkID
        b'\x24\x00\x00\x00',  # ChunkSize (36)
        b'WAVE',              # Format
        b'fmt ',              # Subchunk1ID
        b'\x10\x00\x00\x00',  # Subchunk1Size (16)
        b'\x01\x00',          # AudioFormat (1 = PCM)
        b'\x01\x00',          # NumChannels (1 = mono)
        b'\x44\xac\x00\x00',  # SampleRate (44100)
        b'\x88\x58\x01\x00',  # ByteRate
        b'\x02\x00',          # BlockAlign
        b'\x10\x00',          # BitsPerSample (16)
        b'data',              # Subchunk2ID
        b'\x00\x00\x00\x00',  # Subchunk2Size (0 for empty)
    )
    return b''.join(header_fields)


@pytest.fixture
def mock_upload_file(sample_audio_bytes):
    """Return an UploadFile look-alike wrapping the sample WAV bytes."""
    if UploadFile is None:
        pytest.skip("fastapi not installed")

    class MockUploadFile:
        """Carries only the attributes the extension reads from an upload."""

        def __init__(self, filename: str, file_obj, content_type: str):
            self.filename, self.file, self.content_type = filename, file_obj, content_type

    stream = io.BytesIO(sample_audio_bytes)
    stream.seek(0)
    return MockUploadFile(filename="test_audio.wav", file_obj=stream, content_type="audio/wav")


@pytest.fixture
def mock_config():
    """Return a dict-style config covering every supported key."""
    return dict(
        api_url="http://localhost:8000",
        api_key="test-api-key",
        language="vi",
        enable_segments=False,
        enable_refinement=False,
        timeout=3600,
    )


@pytest.fixture
def mock_api_response_completed():
    """Return the service payload for a synchronously completed transcription."""
    payload_data = {
        "transcription": "Xin chào, đây là test transcription",
        "is_successful": True,
    }
    return {
        "job_status": "completed",
        "result": {"status_code": 200, "error": "", "data": payload_data},
    }


@pytest.fixture
def mock_api_response_processing():
    """Return the service payload for a job that is still processing."""
    return {"job_id": "test-job-id-123", "job_status": "processing"}
import pytest
from unittest.mock import Mock, patch, MagicMock
import httpx

from blaze_stt_python import BlazeSTTExtension, BlazeSTTConfig

API_URL = "http://localhost:8000"
EXECUTE_ENDPOINT = "http://localhost:8000/v1/stt/execute"
EXPECTED_TRANSCRIPTION = "Xin chào, đây là test transcription"


@pytest.fixture
def mock_api_response_job_status():
    """Return the service payload for a completed job-status lookup."""
    return {
        "job_id": "test-job-id-123",
        "job_status": "completed",
        "result": {
            "status_code": 200,
            "error": "",
            "data": {
                "transcription": "Xin chào, đây là test transcription",
                "is_successful": True,
            },
        },
    }


def _json_response(payload):
    """Build a response mock whose json() yields *payload* and that never raises."""
    response = Mock()
    response.json.return_value = payload
    response.raise_for_status = Mock()
    return response


def _install_client(mock_client_class, response, verb="post"):
    """Make the patched httpx.Client a context manager returning *response* for *verb*."""
    client = Mock()
    client.__enter__ = Mock(return_value=client)
    client.__exit__ = Mock(return_value=False)
    getattr(client, verb).return_value = response
    mock_client_class.return_value = client
    return client


class TestBlazeSTTExtension:
    """Unit tests for BlazeSTTExtension with httpx.Client mocked out."""

    def test_init_with_config_dict(self, mock_config):
        """A dict config is converted into BlazeSTTConfig."""
        stt = BlazeSTTExtension(config=mock_config)
        assert stt.config.api_url == "http://localhost:8000"
        assert stt.config.api_key == "test-api-key"
        assert stt.config.default_language == "vi"
        assert stt.endpoint == "http://localhost:8000/v1/stt/execute"

    def test_init_with_config_object(self):
        """A BlazeSTTConfig instance is used as given."""
        stt = BlazeSTTExtension(
            config=BlazeSTTConfig(
                api_url="http://test.com",
                api_key="test-key",
                default_language="en",
            )
        )
        assert stt.config.api_url == "http://test.com"
        assert stt.config.api_key == "test-key"
        assert stt.config.default_language == "en"

    def test_init_with_env_vars(self, monkeypatch):
        """With no config, URL and key come from the environment."""
        monkeypatch.setenv("BLAZE_STT_API_URL", "http://env-test.com")
        monkeypatch.setenv("BLAZE_STT_API_KEY", "env-key")

        stt = BlazeSTTExtension(config=None)
        assert stt.config.api_url == "http://env-test.com"
        assert stt.config.api_key == "env-key"

    @patch('httpx.Client')
    def test_transcribe_with_bytes(self, mock_client_class, sample_audio_bytes, mock_api_response_completed):
        """Raw bytes are posted as the request body with a Content-Type header."""
        client = _install_client(mock_client_class, _json_response(mock_api_response_completed))
        stt = BlazeSTTExtension(config={"api_url": API_URL, "api_key": "test-key"})

        result = stt.transcribe(
            audio_data=sample_audio_bytes,
            audio_content_type="audio/wav",
            language="vi",
        )

        client.post.assert_called_once()
        args, kwargs = client.post.call_args
        assert args[0] == EXECUTE_ENDPOINT
        assert kwargs["content"] == sample_audio_bytes

        sent_headers = kwargs["headers"]
        assert sent_headers["Content-Type"] == "audio/wav"
        assert sent_headers["Authorization"] == "Bearer test-key"

        sent_params = kwargs["params"]
        assert sent_params["language"] == "vi"
        assert sent_params["enable_segments"] == "false"
        assert sent_params["enable_refinement"] == "false"
        assert sent_params["lazy_process"] == "false"

        assert result["transcription"] == EXPECTED_TRANSCRIPTION
        assert result["job_status"] == "completed"

    @patch('httpx.Client')
    def test_transcribe_with_upload_file(self, mock_client_class, mock_upload_file, mock_api_response_completed):
        """An upload object is posted as multipart under the audio_file field."""
        client = _install_client(mock_client_class, _json_response(mock_api_response_completed))
        stt = BlazeSTTExtension(config={"api_url": API_URL, "api_key": "test-key"})

        result = stt.transcribe(audio_file=mock_upload_file, language="vi")

        client.post.assert_called_once()
        args, kwargs = client.post.call_args
        assert args[0] == EXECUTE_ENDPOINT

        assert "files" in kwargs
        sent_files = kwargs["files"]
        assert "audio_file" in sent_files
        assert sent_files["audio_file"][0] == "test_audio.wav"

        # Multipart requests must not carry an explicit Content-Type header.
        sent_headers = kwargs["headers"]
        assert "Content-Type" not in sent_headers
        assert sent_headers["Authorization"] == "Bearer test-key"

        assert result["transcription"] == EXPECTED_TRANSCRIPTION
        assert result["job_status"] == "completed"

    @patch('httpx.Client')
    def test_transcribe_lazy_process(self, mock_client_class, sample_audio_bytes, mock_api_response_processing):
        """lazy_process=True is forwarded and the job id is surfaced."""
        client = _install_client(mock_client_class, _json_response(mock_api_response_processing))
        stt = BlazeSTTExtension(config={"api_url": API_URL})

        result = stt.transcribe(audio_data=sample_audio_bytes, lazy_process=True)

        assert client.post.call_args[1]["params"]["lazy_process"] == "true"
        assert result["job_id"] == "test-job-id-123"
        assert result["job_status"] == "processing"

    def test_transcribe_no_input(self):
        """Calling transcribe() with no audio raises ValueError."""
        stt = BlazeSTTExtension(config={"api_url": API_URL})

        with pytest.raises(ValueError, match="Either audio_data or audio_file must be provided"):
            stt.transcribe()

    def test_transcribe_empty_bytes(self):
        """Empty audio bytes raise ValueError."""
        stt = BlazeSTTExtension(config={"api_url": API_URL})

        with pytest.raises(ValueError, match="audio_data cannot be empty"):
            stt.transcribe(audio_data=b"")

    @patch('httpx.Client')
    def test_get_job_status(self, mock_client_class, mock_api_response_job_status):
        """get_job_status() GETs the job URL and normalizes the payload."""
        client = _install_client(
            mock_client_class, _json_response(mock_api_response_job_status), verb="get"
        )
        stt = BlazeSTTExtension(config={"api_url": API_URL, "api_key": "test-key"})

        result = stt.get_job_status("test-job-id-123")

        client.get.assert_called_once()
        args, kwargs = client.get.call_args
        assert args[0] == "http://localhost:8000/v1/stt/test-job-id-123"
        assert kwargs["headers"]["Authorization"] == "Bearer test-key"

        assert result["job_id"] == "test-job-id-123"
        assert result["job_status"] == "completed"
        assert result["transcription"] == EXPECTED_TRANSCRIPTION

    @patch('httpx.Client')
    def test_process_method(self, mock_client_class, sample_audio_bytes, mock_api_response_completed):
        """process() returns transcription, status and job_id keys."""
        _install_client(mock_client_class, _json_response(mock_api_response_completed))
        stt = BlazeSTTExtension(config={"api_url": API_URL})

        result = stt.process({
            "audio_data": sample_audio_bytes,
            "audio_content_type": "audio/wav",
            "language": "vi",
        })

        assert result["transcription"] == EXPECTED_TRANSCRIPTION
        assert result["status"] == "completed"
        assert "job_id" in result

    def test_process_method_missing_audio_data(self):
        """process() without audio_data raises ValueError."""
        stt = BlazeSTTExtension(config={"api_url": API_URL})

        with pytest.raises(ValueError, match="audio_data is required in input_data"):
            stt.process({})

    def test_get_metadata(self):
        """get_metadata() reports name, version, capabilities and formats."""
        metadata = BlazeSTTExtension(config={"api_url": API_URL}).get_metadata()

        assert metadata["name"] == "blaze_stt_python"
        assert metadata["version"] == "1.0.0"
        for capability in ("stt", "transcription", "speech_to_text"):
            assert capability in metadata["capabilities"]
        assert "audio/wav" in metadata["supported_formats"]
        assert "vi" in metadata["supported_languages"]

    @patch('httpx.Client')
    def test_transcribe_with_enable_segments(self, mock_client_class, sample_audio_bytes, mock_api_response_completed):
        """enable_segments=True is sent as the string "true"."""
        client = _install_client(mock_client_class, _json_response(mock_api_response_completed))
        stt = BlazeSTTExtension(config={"api_url": API_URL})

        stt.transcribe(audio_data=sample_audio_bytes, enable_segments=True)

        assert client.post.call_args[1]["params"]["enable_segments"] == "true"

    @patch('httpx.Client')
    def test_transcribe_with_enable_refinement(self, mock_client_class, sample_audio_bytes, mock_api_response_completed):
        """enable_refinement=True is sent as the string "true"."""
        client = _install_client(mock_client_class, _json_response(mock_api_response_completed))
        stt = BlazeSTTExtension(config={"api_url": API_URL})

        stt.transcribe(audio_data=sample_audio_bytes, enable_refinement=True)

        assert client.post.call_args[1]["params"]["enable_refinement"] == "true"

    @patch('httpx.Client')
    def test_transcribe_http_error(self, mock_client_class, sample_audio_bytes):
        """An HTTP error status propagates as httpx.HTTPStatusError."""
        failing_response = Mock()
        failing_response.status_code = 400
        failing_response.text = "Bad Request"
        failing_response.raise_for_status.side_effect = httpx.HTTPStatusError(
            "Bad Request",
            request=Mock(),
            response=failing_response,
        )
        _install_client(mock_client_class, failing_response)
        stt = BlazeSTTExtension(config={"api_url": API_URL})

        with pytest.raises(httpx.HTTPStatusError):
            stt.transcribe(audio_data=sample_audio_bytes)
+ +## Installation + +```bash +pip install -r requirements.txt +``` + +Or install dependencies directly: + +```bash +pip install httpx pydantic +``` + +## Configuration + +### Environment Variables + +Set the following environment variables: + +```bash +export BLAZE_TTS_API_URL="http://localhost:8000" +export BLAZE_TTS_API_KEY="your-api-key-here" # Optional +``` + +### Property.json (TEN Framework) + +The extension includes a `property.json` file with default configuration that TEN framework can use: + +```json +{ + "params": { + "api_url": "${env:BLAZE_TTS_API_URL}", + "api_key": "${env:BLAZE_TTS_API_KEY}", + "language": "vi", + "speaker_id": null, + "audio_speed": 1.0, + "audio_quality": 64, + "timeout": 3600 + } +} +``` + +TEN framework will automatically read this file and use environment variables for configuration. + +## Usage + +### As TEN Framework Extension + +```python +from blaze_tts_python import BlazeTTSExtension + +# Initialize extension (can accept dict config from TEN framework) +tts = BlazeTTSExtension(config={ + "api_url": "http://localhost:8000", + "api_key": "your-api-key", + "speaker_id": "speaker-123", +}) + +# Synthesize text using TEN framework interface +result = tts.process({ + "text": "Xin chào", + "speaker_id": "speaker-123", + "language": "vi", +}) + +audio_bytes = result.get("audio_data") + +# Get extension metadata +metadata = tts.get_metadata() +print(metadata) +``` + +### As Direct Extension + +```python +from blaze_tts_python import BlazeTTSExtension, BlazeTTSConfig + +# Initialize extension +config = BlazeTTSConfig( + api_url="http://localhost:8000", + api_key="your-api-key", + default_language="vi", +) +tts = BlazeTTSExtension(config=config) + +# Synthesize text +result = tts.synthesize( + text="Xin chào", + speaker_id="speaker-123", + language="vi", +) + +# Download audio +job_id = result.get("id") or result.get("job_id") +audio_bytes = tts.download_audio(job_id) +``` + +## API Reference + +### BlazeTTSExtension + +**TEN Framework Interface Methods:** +- 
`process(input_data)` - Process text and return audio (TEN framework interface) +- `get_metadata()` - Get extension metadata (TEN framework interface) + +**Direct Methods:** + +- `synthesize(text, speaker_id, language, audio_speed, audio_quality, ...)` - Synthesize text to speech +- `get_speakers()` - Get list of available speakers +- `download_audio(job_id, output_path)` - Download generated audio +- `get_job_info(job_id)` - Get TTS job information + +## Supported Formats + +- `wav` - WAV format +- `mp3` - MP3 format +- `ogg` - OGG format + +## License + +This extension is provided as-is for use with the TEN Framework and Blaze services. + diff --git a/ai_agents/agents/ten_packages/extension/blaze_tts_python/__init__.py b/ai_agents/agents/ten_packages/extension/blaze_tts_python/__init__.py new file mode 100644 index 0000000000..a454f4eff6 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/blaze_tts_python/__init__.py @@ -0,0 +1,12 @@ +""" +Blaze TTS Extension for TEN Framework + +This extension provides Text-to-Speech (TTS) functionality using Blaze API. +Implements TEN framework extension interface. +""" + +from .blaze_tts import BlazeTTSExtension, BlazeTTSConfig + +__all__ = ["BlazeTTSExtension", "BlazeTTSConfig"] +__version__ = "1.0.0" + diff --git a/ai_agents/agents/ten_packages/extension/blaze_tts_python/blaze_tts.py b/ai_agents/agents/ten_packages/extension/blaze_tts_python/blaze_tts.py new file mode 100644 index 0000000000..0a72553715 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/blaze_tts_python/blaze_tts.py @@ -0,0 +1,396 @@ +""" +Blaze TTS Extension Implementation + +This extension wraps the Blaze TTS API endpoint for use in TEN framework. 
+""" + +import os +import logging +from typing import Optional, Dict, Any, Union +from enum import Enum + +import httpx +from pydantic import BaseModel, Field + +logger = logging.getLogger(__name__) + + +class AudioFormat(str, Enum): + """Audio format options""" + WAV = "wav" + MP3 = "mp3" + OGG = "ogg" + + +class MediaType(str, Enum): + """Media type options""" + AUDIO_OGG_CODECS_OPUS = "audio/ogg; codecs=opus" + AUDIO_MP3 = "audio/mp3" + AUDIO_WAV = "audio/wav" + + +class Normalization(str, Enum): + """Normalization options""" + NO = "no" + YES = "yes" + + +class Model(str, Enum): + """Model options""" + V1_5_FLASH = "v1.5_flash" + V1_5_PRO = "v1.5_pro" + + +class BlazeTTSConfig(BaseModel): + """Configuration for Blaze TTS Extension""" + + api_url: str = Field( + default_factory=lambda: os.getenv("BLAZE_TTS_API_URL", "http://localhost:8000"), + description="Blaze TTS API base URL" + ) + api_key: Optional[str] = Field( + default_factory=lambda: os.getenv("BLAZE_TTS_API_KEY"), + description="API key for authentication (Bearer token)" + ) + timeout: int = Field( + default=3600, + description="Request timeout in seconds" + ) + default_language: str = Field( + default="vi", + description="Default language code (e.g., 'vi' for Vietnamese)" + ) + default_speaker_id: Optional[str] = Field( + default=None, + description="Default speaker ID" + ) + default_audio_speed: float = Field( + default=1.0, + description="Default audio speed multiplier" + ) + default_audio_quality: int = Field( + default=64, + description="Default audio quality (kbps)" + ) + + +class BlazeTTSExtension: + """ + Blaze TTS Extension for TEN Framework + + This extension provides Text-to-Speech functionality by wrapping + the Blaze TTS API endpoint: /v1/tts + + Implements TEN framework extension interface with process() and get_metadata() methods. 
+ """ + + def __init__(self, config: Optional[Union[BlazeTTSConfig, Dict[str, Any]]] = None): + """ + Initialize Blaze TTS Extension + + Args: + config: Configuration object (BlazeTTSConfig) or dict from TEN framework. + If None, uses environment variables. + If dict, converts to BlazeTTSConfig. + """ + if config is None: + self.config = BlazeTTSConfig() + elif isinstance(config, dict): + # Convert dict from TEN framework to BlazeTTSConfig + self.config = BlazeTTSConfig( + api_url=config.get("api_url", "http://localhost:8000"), + api_key=config.get("api_key"), + default_language=config.get("language", "vi"), + default_speaker_id=config.get("speaker_id"), + default_audio_speed=config.get("audio_speed", 1.0), + default_audio_quality=config.get("audio_quality", 64), + timeout=config.get("timeout", 3600), + ) + else: + self.config = config + + self.base_url = self.config.api_url.rstrip("/") + self.endpoint = f"{self.base_url}/v1/tts" + + logger.info(f"Blaze TTS Extension initialized with API URL: {self.base_url}") + + def synthesize( + self, + text: str, + speaker_id: Optional[str] = None, + language: Optional[str] = None, + audio_speed: Optional[float] = None, + audio_quality: Optional[int] = None, + audio_format: Union[AudioFormat, str] = AudioFormat.WAV, + media_type: Union[MediaType, str] = MediaType.AUDIO_OGG_CODECS_OPUS, + normalization: Union[Normalization, str] = Normalization.NO, + model: Union[Model, str] = Model.V1_5_PRO, + ) -> Dict[str, Any]: + """ + Synthesize text to speech + + Args: + text: Text to synthesize + speaker_id: Speaker/voice ID. Required if not set in config. + language: Language code (e.g., 'vi' for Vietnamese). Defaults to config default. 
+ audio_speed: Audio speed multiplier (default: 1.0) + audio_quality: Audio quality in kbps (default: 64) + audio_format: Audio format (wav, mp3, ogg) + media_type: Media type + normalization: Normalization option (no, yes) + model: Model version to use + + Returns: + Dict containing TTS result with job_id or audio URL + + Raises: + httpx.HTTPError: If the API request fails + ValueError: If text is empty or speaker_id is missing + """ + if not text: + raise ValueError("text cannot be empty") + + speaker_id = speaker_id or self.config.default_speaker_id + if not speaker_id: + raise ValueError("speaker_id is required (either as parameter or in config)") + + # Use provided values or fall back to config defaults + language = language or self.config.default_language + audio_speed = audio_speed if audio_speed is not None else self.config.default_audio_speed + audio_quality = audio_quality if audio_quality is not None else self.config.default_audio_quality + + # Convert enum to string if needed + if isinstance(audio_format, AudioFormat): + audio_format = audio_format.value + if isinstance(media_type, MediaType): + media_type = media_type.value + if isinstance(normalization, Normalization): + normalization = normalization.value + if isinstance(model, Model): + model = model.value + + # Prepare request payload + payload = { + "query": text, + "language": language, + "audio_speed": audio_speed, + "audio_quality": audio_quality, + "audio_format": audio_format, + "speaker_id": speaker_id, + "media_type": media_type, + "normalization": normalization, + "model": model, + } + + # Prepare headers + headers = { + "Content-Type": "application/json", + } + + if self.config.api_key: + headers["Authorization"] = f"Bearer {self.config.api_key}" + + try: + with httpx.Client(timeout=self.config.timeout) as client: + response = client.post( + self.endpoint, + json=payload, + headers=headers, + ) + response.raise_for_status() + return response.json() + + except httpx.HTTPStatusError as e: + 
logger.error(f"Blaze TTS API error: {e.response.status_code} - {e.response.text}") + raise + except httpx.RequestError as e: + logger.error(f"Blaze TTS request error: {str(e)}") + raise + except Exception as e: + logger.error(f"Unexpected error in Blaze TTS: {str(e)}") + raise + + def get_speakers(self) -> Dict[str, Any]: + """ + Get list of available speakers/voices + + Returns: + Dict containing list of speakers + """ + headers = {} + if self.config.api_key: + headers["Authorization"] = f"Bearer {self.config.api_key}" + + endpoint = f"{self.base_url}/v1/tts/list-speaker-ids" + + try: + with httpx.Client(timeout=30) as client: + response = client.get(endpoint, headers=headers) + response.raise_for_status() + return response.json() + + except httpx.HTTPStatusError as e: + logger.error(f"Blaze TTS speakers error: {e.response.status_code} - {e.response.text}") + raise + except httpx.RequestError as e: + logger.error(f"Blaze TTS request error: {str(e)}") + raise + + def download_audio(self, job_id: str, output_path: Optional[str] = None) -> bytes: + """ + Download generated audio file + + Args: + job_id: Job ID returned from synthesize + output_path: Optional path to save the audio file. If None, returns bytes. 
+ + Returns: + Audio file bytes + """ + headers = {} + if self.config.api_key: + headers["Authorization"] = f"Bearer {self.config.api_key}" + + endpoint = f"{self.base_url}/v1/tts/{job_id}/download" + + try: + with httpx.Client(timeout=self.config.timeout) as client: + response = client.get(endpoint, headers=headers) + response.raise_for_status() + + audio_bytes = response.content + + if output_path: + with open(output_path, "wb") as f: + f.write(audio_bytes) + logger.info(f"Audio saved to {output_path}") + + return audio_bytes + + except httpx.HTTPStatusError as e: + logger.error(f"Blaze TTS download error: {e.response.status_code} - {e.response.text}") + raise + except httpx.RequestError as e: + logger.error(f"Blaze TTS request error: {str(e)}") + raise + + def get_job_info(self, job_id: str) -> Dict[str, Any]: + """ + Get information about a TTS job + + Args: + job_id: Job ID returned from synthesize + + Returns: + Dict containing job information + """ + headers = {} + if self.config.api_key: + headers["Authorization"] = f"Bearer {self.config.api_key}" + + endpoint = f"{self.base_url}/v1/tts/{job_id}/info" + + try: + with httpx.Client(timeout=30) as client: + response = client.get(endpoint, headers=headers) + response.raise_for_status() + return response.json() + + except httpx.HTTPStatusError as e: + logger.error(f"Blaze TTS job info error: {e.response.status_code} - {e.response.text}") + raise + except httpx.RequestError as e: + logger.error(f"Blaze TTS request error: {str(e)}") + raise + + def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]: + """ + Process input according to TEN framework interface + + This method implements the TEN framework extension interface. + + Args: + input_data: Input dict with: + - text (str): Required. Text to synthesize + - speaker_id (str): Optional. Speaker ID (default: from config) + - language (str): Optional. Language code (default: from config) + - audio_speed (float): Optional. 
Audio speed (default: from config) + - audio_quality (int): Optional. Audio quality in kbps (default: from config) + - audio_format (str): Optional. Audio format (default: "wav") + - download_audio (bool): Optional. Download audio immediately (default: True) + + Returns: + Output dict with: + - audio_data (bytes): Audio bytes if download_audio=True + - job_id (str): Job ID + - format (str): Audio format + - status (str): Job status + """ + text = input_data.get("text") + if not text: + raise ValueError("text is required in input_data") + + result = self.synthesize( + text=text, + speaker_id=input_data.get("speaker_id"), + language=input_data.get("language"), + audio_speed=input_data.get("audio_speed"), + audio_quality=input_data.get("audio_quality"), + audio_format=input_data.get("audio_format", "wav"), + media_type=input_data.get("media_type", MediaType.AUDIO_OGG_CODECS_OPUS), + normalization=input_data.get("normalization", "no"), + model=input_data.get("model", Model.V1_5_PRO), + ) + + job_id = result.get("id") or result.get("job_id") + + # If immediate result requested, download audio + if job_id and input_data.get("download_audio", True): + try: + audio_bytes = self.download_audio(job_id) + return { + "audio_data": audio_bytes, + "job_id": job_id, + "format": input_data.get("audio_format", "wav"), + "status": "completed", + "size_bytes": len(audio_bytes), + } + except Exception as e: + # If download fails, return job_id for later retrieval + return { + "job_id": job_id, + "status": "processing", + "error": str(e), + } + + return { + "job_id": job_id, + "status": "processing", + } + + def get_metadata(self) -> Dict[str, Any]: + """ + Return extension metadata for TEN framework + + This method implements the TEN framework extension interface. 
+ + Returns: + Dict with extension information + """ + return { + "name": "blaze_tts_python", + "version": "1.0.0", + "description": "Blaze Text-to-Speech extension for TEN framework", + "capabilities": ["tts", "synthesis", "text_to_speech"], + "supported_formats": ["mp3", "wav", "ogg"], + "supported_languages": ["vi", "en"], + "config_schema": { + "api_url": {"type": "string", "required": False, "default": "http://localhost:8000"}, + "api_key": {"type": "string", "required": False}, + "language": {"type": "string", "required": False, "default": "vi"}, + "speaker_id": {"type": "string", "required": False}, + "audio_speed": {"type": "float", "required": False, "default": 1.0}, + "audio_quality": {"type": "integer", "required": False, "default": 64}, + }, + } + diff --git a/ai_agents/agents/ten_packages/extension/blaze_tts_python/manifest.json b/ai_agents/agents/ten_packages/extension/blaze_tts_python/manifest.json new file mode 100644 index 0000000000..a25b313a23 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/blaze_tts_python/manifest.json @@ -0,0 +1,86 @@ +{ + "type": "extension", + "name": "blaze_tts_python", + "version": "1.0.0", + "display_name": { + "locales": { + "en-US": { + "content": "Blaze TTS Extension" + }, + "vi-VN": { + "content": "Blaze TTS Extension" + } + } + }, + "description": { + "locales": { + "en-US": { + "content": "Blaze Text-to-Speech extension for TEN Framework" + }, + "vi-VN": { + "content": "Extension chuyển đổi văn bản thành giọng nói cho TEN Framework" + } + } + }, + "dependencies": [ + { + "type": "system", + "name": "ten_runtime_python", + "version": "0.11" + }, + { + "type": "system", + "name": "ten_ai_base", + "version": "0.7" + } + ], + "package": { + "include": [ + "manifest.json", + "property.json", + "BUILD.gn", + "**.tent", + "**.py", + "README.md", + "requirements.txt" + ] + }, + "api": { + "interface": [ + { + "import_uri": "../../system/ten_ai_base/api/tts-interface.json" + } + ], + "property": { + "properties": 
{ + "params": { + "type": "object", + "properties": { + "api_url": { + "type": "string" + }, + "api_key": { + "type": "string" + }, + "language": { + "type": "string" + }, + "speaker_id": { + "type": "string" + }, + "audio_speed": { + "type": "number" + }, + "audio_quality": { + "type": "number" + }, + "timeout": { + "type": "number" + } + } + } + } + } + } +} + diff --git a/ai_agents/agents/ten_packages/extension/blaze_tts_python/property.json b/ai_agents/agents/ten_packages/extension/blaze_tts_python/property.json new file mode 100644 index 0000000000..25ca345113 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/blaze_tts_python/property.json @@ -0,0 +1,12 @@ +{ + "params": { + "api_url": "${env:BLAZE_TTS_API_URL}", + "api_key": "${env:BLAZE_TTS_API_KEY}", + "language": "vi", + "speaker_id": null, + "audio_speed": 1.0, + "audio_quality": 64, + "timeout": 3600 + } +} + diff --git a/ai_agents/agents/ten_packages/extension/blaze_tts_python/requirements.txt b/ai_agents/agents/ten_packages/extension/blaze_tts_python/requirements.txt new file mode 100644 index 0000000000..e0a3d7ee1a --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/blaze_tts_python/requirements.txt @@ -0,0 +1,3 @@ +httpx>=0.24.0 +pydantic>=2.0.0 + diff --git a/ai_agents/agents/ten_packages/extension/blaze_tts_python/tests/__init__.py b/ai_agents/agents/ten_packages/extension/blaze_tts_python/tests/__init__.py new file mode 100644 index 0000000000..48a47bcf6a --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/blaze_tts_python/tests/__init__.py @@ -0,0 +1,4 @@ +""" +Tests for Blaze TTS Extension +""" + diff --git a/ai_agents/agents/ten_packages/extension/blaze_tts_python/tests/conftest.py b/ai_agents/agents/ten_packages/extension/blaze_tts_python/tests/conftest.py new file mode 100644 index 0000000000..644eedb0bb --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/blaze_tts_python/tests/conftest.py @@ -0,0 +1,60 @@ +""" +Pytest fixtures for Blaze TTS Extension tests 
+""" +import pytest + + +@pytest.fixture +def mock_config(): + """Mock configuration for BlazeTTSExtension""" + return { + "api_url": "http://localhost:8000", + "api_key": "test-api-key", + "language": "vi", + "speaker_id": "test-speaker-123", + "audio_speed": 1.0, + "audio_quality": 64, + "timeout": 3600, + } + + +@pytest.fixture +def mock_api_response_synthesize(): + """Mock API response for synthesize""" + return { + "job_id": "test-tts-job-123", + "job_status": "completed", + "audio_url": "https://example.com/audio/test-tts-job-123.mp3", + } + + +@pytest.fixture +def mock_api_response_speakers(): + """Mock API response for list_speakers""" + return { + "list_speakers": [ + { + "id": "speaker-1", + "name": "Vietnamese Female", + "language": "vi", + "gender": "female", + }, + { + "id": "speaker-2", + "name": "Vietnamese Male", + "language": "vi", + "gender": "male", + }, + ] + } + + +@pytest.fixture +def mock_api_response_job_info(): + """Mock API response for get_job_info""" + return { + "job_id": "test-tts-job-123", + "job_status": "completed", + "audio_url": "https://example.com/audio/test-tts-job-123.mp3", + } + diff --git a/ai_agents/agents/ten_packages/extension/blaze_tts_python/tests/pytest.ini b/ai_agents/agents/ten_packages/extension/blaze_tts_python/tests/pytest.ini new file mode 100644 index 0000000000..0770e6b0fe --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/blaze_tts_python/tests/pytest.ini @@ -0,0 +1,7 @@ +[pytest] +testpaths = tests +python_files = test_*.py +python_classes = Test* +python_functions = test_* +addopts = -v --tb=short + diff --git a/ai_agents/agents/ten_packages/extension/blaze_tts_python/tests/requirements.txt b/ai_agents/agents/ten_packages/extension/blaze_tts_python/tests/requirements.txt new file mode 100644 index 0000000000..ae8bbe77c0 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/blaze_tts_python/tests/requirements.txt @@ -0,0 +1,4 @@ +pytest>=7.0.0 +pytest-mock>=3.10.0 +httpx>=0.24.0 + diff --git 
a/ai_agents/agents/ten_packages/extension/blaze_tts_python/tests/test_blaze_tts.py b/ai_agents/agents/ten_packages/extension/blaze_tts_python/tests/test_blaze_tts.py new file mode 100644 index 0000000000..913c58312c --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/blaze_tts_python/tests/test_blaze_tts.py @@ -0,0 +1,375 @@ +""" +Unit tests for BlazeTTSExtension +""" +import pytest +from unittest.mock import Mock, patch +import httpx + +from blaze_tts_python import BlazeTTSExtension, BlazeTTSConfig + + +class TestBlazeTTSExtension: + """Test suite for BlazeTTSExtension""" + + def test_init_with_config_dict(self, mock_config): + """Test initialization with dict config""" + tts = BlazeTTSExtension(config=mock_config) + assert tts.config.api_url == "http://localhost:8000" + assert tts.config.api_key == "test-api-key" + assert tts.config.default_language == "vi" + assert tts.config.default_speaker_id == "test-speaker-123" + assert tts.endpoint == "http://localhost:8000/v1/tts" + + def test_init_with_config_object(self): + """Test initialization with BlazeTTSConfig object""" + config = BlazeTTSConfig( + api_url="http://test.com", + api_key="test-key", + default_language="en", + default_speaker_id="speaker-456", + ) + tts = BlazeTTSExtension(config=config) + assert tts.config.api_url == "http://test.com" + assert tts.config.api_key == "test-key" + assert tts.config.default_language == "en" + assert tts.config.default_speaker_id == "speaker-456" + + def test_init_with_env_vars(self, monkeypatch): + """Test initialization with environment variables""" + monkeypatch.setenv("BLAZE_TTS_API_URL", "http://env-test.com") + monkeypatch.setenv("BLAZE_TTS_API_KEY", "env-key") + + tts = BlazeTTSExtension(config=None) + assert tts.config.api_url == "http://env-test.com" + assert tts.config.api_key == "env-key" + + @patch('httpx.Client') + def test_synthesize(self, mock_client_class, mock_api_response_synthesize): + """Test synthesize() method""" + # Setup mock response + 
mock_response = Mock() + mock_response.json.return_value = mock_api_response_synthesize + mock_response.raise_for_status = Mock() + + mock_client = Mock() + mock_client.__enter__ = Mock(return_value=mock_client) + mock_client.__exit__ = Mock(return_value=False) + mock_client.post.return_value = mock_response + mock_client_class.return_value = mock_client + + # Initialize extension + tts = BlazeTTSExtension(config={ + "api_url": "http://localhost:8000", + "api_key": "test-key", + "speaker_id": "test-speaker-123", + }) + + # Call synthesize + result = tts.synthesize( + text="Xin chào", + speaker_id="test-speaker-123", + language="vi", + ) + + # Verify request was made correctly + mock_client.post.assert_called_once() + call_args = mock_client.post.call_args + + # Check endpoint + assert call_args[0][0] == "http://localhost:8000/v1/tts" + + # Check JSON body + json_data = call_args[1]["json"] + assert json_data["query"] == "Xin chào" + assert json_data["speaker_id"] == "test-speaker-123" + assert json_data["language"] == "vi" + + # Check headers + headers = call_args[1]["headers"] + assert headers["Authorization"] == "Bearer test-key" + + # Verify result + assert result["job_id"] == "test-tts-job-123" + assert result["job_status"] == "completed" + assert result["audio_url"] == "https://example.com/audio/test-tts-job-123.mp3" + + def test_synthesize_empty_text(self): + """Test synthesize() with empty text raises ValueError""" + tts = BlazeTTSExtension(config={"api_url": "http://localhost:8000"}) + + with pytest.raises(ValueError, match="text cannot be empty"): + tts.synthesize(text="", speaker_id="test-speaker") + + def test_synthesize_missing_speaker_id(self): + """Test synthesize() without speaker_id raises ValueError""" + tts = BlazeTTSExtension(config={"api_url": "http://localhost:8000"}) + + with pytest.raises(ValueError, match="speaker_id is required"): + tts.synthesize(text="Hello") + + @patch('httpx.Client') + def test_synthesize_with_default_speaker_id(self, 
mock_client_class, mock_api_response_synthesize): + """Test synthesize() uses default speaker_id from config""" + mock_response = Mock() + mock_response.json.return_value = mock_api_response_synthesize + mock_response.raise_for_status = Mock() + + mock_client = Mock() + mock_client.__enter__ = Mock(return_value=mock_client) + mock_client.__exit__ = Mock(return_value=False) + mock_client.post.return_value = mock_response + mock_client_class.return_value = mock_client + + tts = BlazeTTSExtension(config={ + "api_url": "http://localhost:8000", + "speaker_id": "default-speaker", + }) + + result = tts.synthesize(text="Hello") + + call_args = mock_client.post.call_args + json_data = call_args[1]["json"] + assert json_data["speaker_id"] == "default-speaker" + + @patch('httpx.Client') + def test_get_speakers(self, mock_client_class, mock_api_response_speakers): + """Test get_speakers() method""" + # Setup mock response + mock_response = Mock() + mock_response.json.return_value = mock_api_response_speakers + mock_response.raise_for_status = Mock() + + mock_client = Mock() + mock_client.__enter__ = Mock(return_value=mock_client) + mock_client.__exit__ = Mock(return_value=False) + mock_client.get.return_value = mock_response + mock_client_class.return_value = mock_client + + # Initialize extension + tts = BlazeTTSExtension(config={ + "api_url": "http://localhost:8000", + "api_key": "test-key", + }) + + # Call get_speakers + result = tts.get_speakers() + + # Verify request was made correctly + mock_client.get.assert_called_once() + call_args = mock_client.get.call_args + + # Check endpoint + assert call_args[0][0] == "http://localhost:8000/v1/tts/list-speaker-ids" + + # Check headers + headers = call_args[1]["headers"] + assert headers["Authorization"] == "Bearer test-key" + + # Verify result + assert len(result["list_speakers"]) == 2 + assert result["list_speakers"][0]["id"] == "speaker-1" + assert result["list_speakers"][1]["id"] == "speaker-2" + + @patch('httpx.Client') + 
def test_get_job_info(self, mock_client_class, mock_api_response_job_info): + """Test get_job_info() method""" + # Setup mock response + mock_response = Mock() + mock_response.json.return_value = mock_api_response_job_info + mock_response.raise_for_status = Mock() + + mock_client = Mock() + mock_client.__enter__ = Mock(return_value=mock_client) + mock_client.__exit__ = Mock(return_value=False) + mock_client.get.return_value = mock_response + mock_client_class.return_value = mock_client + + # Initialize extension + tts = BlazeTTSExtension(config={ + "api_url": "http://localhost:8000", + "api_key": "test-key", + }) + + # Call get_job_info + result = tts.get_job_info("test-tts-job-123") + + # Verify request was made correctly + mock_client.get.assert_called_once() + call_args = mock_client.get.call_args + + # Check endpoint + assert call_args[0][0] == "http://localhost:8000/v1/tts/test-tts-job-123/info" + + # Check headers + headers = call_args[1]["headers"] + assert headers["Authorization"] == "Bearer test-key" + + # Verify result + assert result["job_id"] == "test-tts-job-123" + assert result["job_status"] == "completed" + assert result["audio_url"] == "https://example.com/audio/test-tts-job-123.mp3" + + @patch('httpx.Client') + def test_download_audio(self, mock_client_class): + """Test download_audio() method""" + # Setup mock response with audio bytes + mock_audio_bytes = b"fake audio data" + mock_response = Mock() + mock_response.content = mock_audio_bytes + mock_response.raise_for_status = Mock() + + mock_client = Mock() + mock_client.__enter__ = Mock(return_value=mock_client) + mock_client.__exit__ = Mock(return_value=False) + mock_client.get.return_value = mock_response + mock_client_class.return_value = mock_client + + # Initialize extension + tts = BlazeTTSExtension(config={ + "api_url": "http://localhost:8000", + "api_key": "test-key", + }) + + # Call download_audio + audio_data = tts.download_audio("test-tts-job-123") + + # Verify request was made 
correctly + mock_client.get.assert_called_once() + call_args = mock_client.get.call_args + + # Check endpoint + assert call_args[0][0] == "http://localhost:8000/v1/tts/test-tts-job-123/download" + + # Check headers + headers = call_args[1]["headers"] + assert headers["Authorization"] == "Bearer test-key" + + # Verify result + assert audio_data == mock_audio_bytes + + def test_process_method(self, mock_api_response_synthesize): + """Test process() method (TEN framework interface)""" + with patch('httpx.Client') as mock_client_class: + # Setup mock response + mock_response = Mock() + mock_response.json.return_value = mock_api_response_synthesize + mock_response.raise_for_status = Mock() + + mock_client = Mock() + mock_client.__enter__ = Mock(return_value=mock_client) + mock_client.__exit__ = Mock(return_value=False) + mock_client.post.return_value = mock_response + mock_client_class.return_value = mock_client + + # Initialize extension + tts = BlazeTTSExtension(config={ + "api_url": "http://localhost:8000", + "speaker_id": "test-speaker", + }) + + # Call process + result = tts.process({ + "text": "Xin chào", + "speaker_id": "test-speaker", + "language": "vi", + }) + + # Verify result format + assert result["job_id"] == "test-tts-job-123" + assert result["status"] == "completed" + assert "audio_url" in result + + def test_process_method_missing_text(self): + """Test process() method raises error when text is missing""" + tts = BlazeTTSExtension(config={"api_url": "http://localhost:8000"}) + + with pytest.raises(ValueError, match="text is required in input_data"): + tts.process({}) + + def test_get_metadata(self): + """Test get_metadata() method""" + tts = BlazeTTSExtension(config={"api_url": "http://localhost:8000"}) + + metadata = tts.get_metadata() + + assert metadata["name"] == "blaze_tts_python" + assert metadata["version"] == "1.0.0" + assert "tts" in metadata["capabilities"] + assert "text_to_speech" in metadata["capabilities"] + assert "audio/wav" in 
metadata["supported_formats"] + assert "vi" in metadata["supported_languages"] + + @patch('httpx.Client') + def test_synthesize_with_audio_speed(self, mock_client_class, mock_api_response_synthesize): + """Test synthesize() with custom audio_speed""" + mock_response = Mock() + mock_response.json.return_value = mock_api_response_synthesize + mock_response.raise_for_status = Mock() + + mock_client = Mock() + mock_client.__enter__ = Mock(return_value=mock_client) + mock_client.__exit__ = Mock(return_value=False) + mock_client.post.return_value = mock_response + mock_client_class.return_value = mock_client + + tts = BlazeTTSExtension(config={"api_url": "http://localhost:8000"}) + + result = tts.synthesize( + text="Hello", + speaker_id="test-speaker", + audio_speed=1.5, + ) + + call_args = mock_client.post.call_args + json_data = call_args[1]["json"] + assert json_data["audio_speed"] == 1.5 + + @patch('httpx.Client') + def test_synthesize_with_audio_quality(self, mock_client_class, mock_api_response_synthesize): + """Test synthesize() with custom audio_quality""" + mock_response = Mock() + mock_response.json.return_value = mock_api_response_synthesize + mock_response.raise_for_status = Mock() + + mock_client = Mock() + mock_client.__enter__ = Mock(return_value=mock_client) + mock_client.__exit__ = Mock(return_value=False) + mock_client.post.return_value = mock_response + mock_client_class.return_value = mock_client + + tts = BlazeTTSExtension(config={"api_url": "http://localhost:8000"}) + + result = tts.synthesize( + text="Hello", + speaker_id="test-speaker", + audio_quality=128, + ) + + call_args = mock_client.post.call_args + json_data = call_args[1]["json"] + assert json_data["audio_quality"] == 128 + + @patch('httpx.Client') + def test_synthesize_http_error(self, mock_client_class): + """Test synthesize() handles HTTP errors""" + # Setup mock response with error + mock_response = Mock() + mock_response.status_code = 400 + mock_response.text = "Bad Request" + 
mock_response.raise_for_status.side_effect = httpx.HTTPStatusError( + "Bad Request", + request=Mock(), + response=mock_response + ) + + mock_client = Mock() + mock_client.__enter__ = Mock(return_value=mock_client) + mock_client.__exit__ = Mock(return_value=False) + mock_client.post.return_value = mock_response + mock_client_class.return_value = mock_client + + tts = BlazeTTSExtension(config={"api_url": "http://localhost:8000"}) + + with pytest.raises(httpx.HTTPStatusError): + tts.synthesize(text="Hello", speaker_id="test-speaker") +