Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions livekit-plugins/livekit-plugins-funasr/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# LiveKit Plugins FunASR

Agent Framework plugin for local speech-to-text with [FunASR](https://github.com/modelscope/FunASR) models such as [SenseVoice](https://github.com/FunAudioLLM/SenseVoice).

SenseVoice is an open-source, fully-local, non-autoregressive multilingual ASR model (Chinese, Cantonese, English, Japanese, Korean and more) with leading Chinese accuracy and fast inference. The model runs locally, so no API key is required.

## Installation

```bash
pip install livekit-plugins-funasr
```

## Usage

```python
from livekit.plugins import funasr

stt = funasr.STT(model="iic/SenseVoiceSmall", device="cuda")
```

The first run downloads the model from ModelScope/Hugging Face. Leave `language` unset (the default) for automatic language detection, or set it explicitly, e.g. `language="zh"`. Language codes the model does not support fall back to automatic detection.
Comment thread
devin-ai-integration[bot] marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright 2024 LiveKit, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""FunASR plugin for LiveKit Agents.

Local, fully-offline multilingual speech-to-text using FunASR models such as
SenseVoice (Chinese, Cantonese, English, Japanese, Korean and more).
See https://github.com/modelscope/FunASR for more information.
"""

from .stt import FunASRSTT, FunASRSTT as STT
from .version import __version__

__all__ = ["FunASRSTT", "STT", "__version__"]

from livekit.agents import Plugin

from .log import logger


class FunASRPlugin(Plugin):
    """Plugin descriptor for the FunASR speech-to-text integration."""

    def __init__(self) -> None:
        # Hand this module's name, version, package name and logger to the
        # base Plugin constructor.
        super().__init__(__name__, __version__, __package__, logger)


Plugin.register_plugin(FunASRPlugin())
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import logging

logger = logging.getLogger("livekit.plugins.funasr")
Empty file.
161 changes: 161 additions & 0 deletions livekit-plugins/livekit-plugins-funasr/livekit/plugins/funasr/stt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
# Copyright 2024 LiveKit, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import asyncio
import re
from dataclasses import dataclass

import numpy as np

from livekit import rtc
from livekit.agents import APIConnectionError, APIConnectOptions, LanguageCode, stt
from livekit.agents.stt import SpeechEventType, STTCapabilities
from livekit.agents.types import NOT_GIVEN, NotGivenOr
from livekit.agents.utils import AudioBuffer, is_given

from .log import logger

try:
from funasr import AutoModel # type: ignore
from funasr.utils.postprocess_utils import rich_transcription_postprocess # type: ignore
except ImportError as e:
raise ImportError(
"funasr is required for the FunASR plugin. Install it with: pip install funasr"
) from e

# Languages natively supported by SenseVoice; anything else falls back to auto-detect.
_FUNASR_LANGUAGES = {"zh", "en", "ja", "ko", "yue", "nospeech"}
# Spoken-language codes reported as the detected language (excludes "nospeech",
# which is a SenseVoice classification label, not a language).
_DETECTED_LANGUAGES = {"zh", "en", "ja", "ko", "yue"}
_SAMPLE_RATE = 16000
_LANG_TAG_RE = re.compile(r"<\|([a-z]+)\|>")


def _normalize_language(language: NotGivenOr[str]) -> str:
    """Map a caller-supplied language tag to a code the model accepts.

    Only the primary subtag is considered, so ``"zh-CN"``, ``"zh_CN"`` and
    ``"ZH"`` all resolve to ``"zh"``.

    Args:
        language: Language tag, or an unset/empty value.

    Returns:
        A member of ``_FUNASR_LANGUAGES``, or ``"auto"`` when the value is not
        given, empty, or not one of the supported codes.
    """
    if not is_given(language) or not language:
        return "auto"
    # Accept both "-" (BCP-47 style) and "_" (POSIX locale style) separators and
    # ignore surrounding whitespace, so a tag such as "zh_CN" is not silently
    # downgraded to auto-detection.
    primary = str(language).strip().replace("_", "-").split("-", 1)[0].lower()
    return primary if primary in _FUNASR_LANGUAGES else "auto"


@dataclass
class _STTOptions:
    """Mutable recognition settings held by an STT instance."""

    # Language code passed to the model; "auto" requests language detection.
    language: str = "auto"
    # Whether inverse text normalization is requested from the model.
    use_itn: bool = True


class FunASRSTT(stt.STT):
    """Local speech-to-text using a FunASR model such as SenseVoice.

    SenseVoice is an open-source, fully-local, non-autoregressive multilingual ASR
    model (Chinese, Cantonese, English, Japanese, Korean and more) with strong
    Chinese accuracy and fast inference. The model runs locally; no API key needed.

    The instance advertises non-streaming recognition without interim results:
    each recognition call transcribes one complete audio buffer and returns a
    single final transcript.
    """

    def __init__(
        self,
        *,
        model: str = "iic/SenseVoiceSmall",
        device: str = "cpu",
        language: NotGivenOr[str] = NOT_GIVEN,
        use_itn: bool = True,
    ) -> None:
        """Create a FunASR STT instance.

        The model is loaded synchronously inside the constructor.

        Args:
            model: FunASR model id on ModelScope/Hugging Face (default
                ``"iic/SenseVoiceSmall"``).
            device: Inference device, ``"cpu"`` or ``"cuda"``.
            language: Default language. When not given, the language is
                auto-detected per utterance.
            use_itn: Apply inverse text normalization (e.g. "nine" -> "9").
        """
        super().__init__(capabilities=STTCapabilities(streaming=False, interim_results=False))
        self._model_name = model
        self._opts = _STTOptions(language=_normalize_language(language), use_itn=use_itn)
        # FunASR's model.generate is not guaranteed thread-safe; serialize access
        # across concurrent _recognize_impl calls that share this instance.
        self._lock = asyncio.Lock()
        logger.info(f"loading FunASR model {model} on {device}...")
        self._model = AutoModel(model=model, device=device, disable_update=True)
        logger.info("FunASR model loaded")

    @property
    def model(self) -> str:
        """Model id this instance was created with."""
        return self._model_name

    @property
    def provider(self) -> str:
        """Provider name reported for this STT implementation."""
        return "FunASR"

    def update_options(
        self,
        *,
        language: NotGivenOr[str] = NOT_GIVEN,
        use_itn: NotGivenOr[bool] = NOT_GIVEN,
    ) -> None:
        """Change the default recognition options for subsequent requests.

        Arguments that are not given leave the current setting untouched.

        Args:
            language: New default language; normalized the same way as in the
                constructor, so unsupported codes become ``"auto"``.
            use_itn: Whether to apply inverse text normalization.
        """
        if is_given(language):
            self._opts.language = _normalize_language(language)
        if is_given(use_itn):
            self._opts.use_itn = use_itn

    async def _recognize_impl(
        self,
        buffer: AudioBuffer,
        *,
        language: NotGivenOr[str] = NOT_GIVEN,
        conn_options: APIConnectOptions,
    ) -> stt.SpeechEvent:
        """Transcribe one complete audio buffer with the local model.

        Args:
            buffer: Audio to transcribe. It is combined into a single frame,
                resampled to 16 kHz when needed and down-mixed to mono.
            language: Per-request language override; falls back to the
                instance default when not given.
            conn_options: Accepted for interface compatibility; not used by
                this implementation.

        Returns:
            A ``FINAL_TRANSCRIPT`` event with one alternative. Its language is
            the code parsed from the model's leading language tag, or an empty
            string when no known spoken-language tag is present.

        Raises:
            APIConnectionError: If inference fails. The error is marked
                non-retryable because the model runs in-process, so a failure
                is not a transient network condition.
        """
        # Capture both options up front so that a concurrent update_options()
        # cannot make this request run with a mix of old and new settings while
        # it waits for the inference lock.
        lang = _normalize_language(language) if is_given(language) else self._opts.language
        use_itn = self._opts.use_itn

        combined = rtc.combine_audio_frames(buffer)
        channels = combined.num_channels
        if combined.sample_rate != _SAMPLE_RATE:
            # The channel count is forwarded because the resampled bytes are
            # de-interleaved per channel further down.
            resampler = rtc.AudioResampler(
                combined.sample_rate, _SAMPLE_RATE, num_channels=channels
            )
            frames = [*resampler.push(combined), *resampler.flush()]
            data = b"".join(bytes(f.data) for f in frames)
        else:
            data = bytes(combined.data)

        # Interpret the bytes as 16-bit PCM and scale to float32 in [-1.0, 1.0).
        samples = np.frombuffer(data, dtype=np.int16).astype(np.float32) / 32768.0
        if channels > 1:
            # Down-mix interleaved channels to mono by averaging each sample group.
            samples = samples.reshape(-1, channels).mean(axis=1)

        if samples.size == 0:
            # Nothing to transcribe: return an empty transcript instead of
            # handing a zero-length array to the model.
            return stt.SpeechEvent(
                type=SpeechEventType.FINAL_TRANSCRIPT,
                alternatives=[stt.SpeechData(text="", language=LanguageCode(""))],
            )

        def _run() -> str:
            # Blocking inference call; executed in a worker thread.
            result = self._model.generate(
                input=samples,
                cache={},
                language=lang,
                use_itn=use_itn,
            )
            return result[0]["text"] if result else ""

        try:
            async with self._lock:
                raw = await asyncio.to_thread(_run)
        except Exception as e:
            # Local inference failures are not expected to be transient, so do
            # not let the caller's retry logic repeat them.
            raise APIConnectionError(
                "failed to run FunASR inference", retryable=False
            ) from e

        text = rich_transcription_postprocess(raw).strip()
        # The language is read from a leading "<|xx|>" tag in the raw output.
        m = _LANG_TAG_RE.match(raw)
        detected = m.group(1) if m and m.group(1) in _DETECTED_LANGUAGES else ""

        return stt.SpeechEvent(
            type=SpeechEventType.FINAL_TRANSCRIPT,
            alternatives=[stt.SpeechData(text=text, language=LanguageCode(detected))],
        )
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright 2024 LiveKit, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.1.0"
42 changes: 42 additions & 0 deletions livekit-plugins/livekit-plugins-funasr/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "livekit-plugins-funasr"
dynamic = ["version"]
description = "FunASR (SenseVoice) local STT plugin for LiveKit Agents"
readme = "README.md"
license = "Apache-2.0"
requires-python = ">=3.10.0"
authors = [{ name = "LiveKit" }]
keywords = ["voice", "ai", "realtime", "audio", "video", "livekit", "webrtc"]
classifiers = [
"Intended Audience :: Developers",
"License :: OSI Approved :: Apache Software License",
"Topic :: Multimedia :: Sound/Audio",
"Topic :: Multimedia :: Video",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3 :: Only",
]
dependencies = ["livekit-agents>=1.2.0", "funasr>=1.1.0", "numpy"]

[project.urls]
Documentation = "https://docs.livekit.io"
Website = "https://livekit.io/"
Source = "https://github.com/livekit/agents"

[tool.hatch.version]
path = "livekit/plugins/funasr/version.py"

[tool.hatch.build.targets.wheel]
packages = ["livekit"]

[tool.hatch.build.targets.sdist]
include = ["/livekit"]

[tool.uv]
exclude-newer = "7 days"
exclude-newer-package = { livekit-agents = "0 days" }
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ livekit-plugins-elevenlabs = { workspace = true }
livekit-plugins-fal = { workspace = true }
livekit-plugins-fireworksai = { workspace = true }
livekit-plugins-fishaudio = { workspace = true }
livekit-plugins-funasr = { workspace = true }
livekit-plugins-gladia = { workspace = true }
livekit-plugins-gnani = { workspace = true }
livekit-plugins-google = { workspace = true }
Expand Down