diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml index 593e44cde714..8d7eb727745d 100644 --- a/.github/backend-matrix.yml +++ b/.github/backend-matrix.yml @@ -3723,6 +3723,302 @@ include: dockerfile: "./backend/Dockerfile.golang" context: "./" ubuntu-version: '2404' + # voice-detect + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "8" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-12-voice-detect' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "voice-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-13-voice-detect' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "voice-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/arm64' + skip-drivers: 'false' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-cuda-13-arm64-voice-detect' + base-image: "ubuntu:24.04" + ubuntu-version: '2404' + runs-on: 'ubuntu-24.04-arm' + backend: "voice-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + platform-tag: 'amd64' + tag-latest: 'auto' + tag-suffix: '-cpu-voice-detect' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "voice-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/arm64' + platform-tag: 'arm64' + tag-latest: 'auto' + tag-suffix: '-cpu-voice-detect' + runs-on: 'ubuntu-24.04-arm' + base-image: 
"ubuntu:24.04" + skip-drivers: 'false' + backend: "voice-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: 'sycl_f32' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-intel-sycl-f32-voice-detect' + runs-on: 'ubuntu-latest' + base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04" + skip-drivers: 'false' + backend: "voice-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: 'sycl_f16' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-intel-sycl-f16-voice-detect' + runs-on: 'ubuntu-latest' + base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04" + skip-drivers: 'false' + backend: "voice-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: 'vulkan' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + platform-tag: 'amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-vulkan-voice-detect' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "voice-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: 'vulkan' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/arm64' + platform-tag: 'arm64' + tag-latest: 'auto' + tag-suffix: '-gpu-vulkan-voice-detect' + runs-on: 'ubuntu-24.04-arm' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "voice-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "0" + platforms: 'linux/arm64' + skip-drivers: 'false' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-arm64-voice-detect' + base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0" + runs-on: 'ubuntu-24.04-arm' + 
backend: "voice-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2204' + - build-type: 'hipblas' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-rocm-hipblas-voice-detect' + base-image: "rocm/dev-ubuntu-24.04:7.2.1" + runs-on: 'ubuntu-latest' + skip-drivers: 'false' + backend: "voice-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + # face-detect + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "8" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-12-face-detect' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "face-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-13-face-detect' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "face-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/arm64' + skip-drivers: 'false' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-cuda-13-arm64-face-detect' + base-image: "ubuntu:24.04" + ubuntu-version: '2404' + runs-on: 'ubuntu-24.04-arm' + backend: "face-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + platform-tag: 'amd64' + tag-latest: 'auto' + tag-suffix: '-cpu-face-detect' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "face-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: '' + 
cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/arm64' + platform-tag: 'arm64' + tag-latest: 'auto' + tag-suffix: '-cpu-face-detect' + runs-on: 'ubuntu-24.04-arm' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "face-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: 'sycl_f32' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-intel-sycl-f32-face-detect' + runs-on: 'ubuntu-latest' + base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04" + skip-drivers: 'false' + backend: "face-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: 'sycl_f16' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-intel-sycl-f16-face-detect' + runs-on: 'ubuntu-latest' + base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04" + skip-drivers: 'false' + backend: "face-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: 'vulkan' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + platform-tag: 'amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-vulkan-face-detect' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "face-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: 'vulkan' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/arm64' + platform-tag: 'arm64' + tag-latest: 'auto' + tag-suffix: '-gpu-vulkan-face-detect' + runs-on: 'ubuntu-24.04-arm' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "face-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "0" + platforms: 
'linux/arm64' + skip-drivers: 'false' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-arm64-face-detect' + base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0" + runs-on: 'ubuntu-24.04-arm' + backend: "face-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2204' + - build-type: 'hipblas' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-rocm-hipblas-face-detect' + base-image: "rocm/dev-ubuntu-24.04:7.2.1" + runs-on: 'ubuntu-latest' + skip-drivers: 'false' + backend: "face-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' # acestep-cpp - build-type: '' cuda-major-version: "" @@ -4906,6 +5202,14 @@ includeDarwin: tag-suffix: "-metal-darwin-arm64-ced" build-type: "metal" lang: "go" + - backend: "voice-detect" + tag-suffix: "-metal-darwin-arm64-voice-detect" + build-type: "metal" + lang: "go" + - backend: "face-detect" + tag-suffix: "-metal-darwin-arm64-face-detect" + build-type: "metal" + lang: "go" - backend: "acestep-cpp" tag-suffix: "-metal-darwin-arm64-acestep-cpp" build-type: "metal" diff --git a/.github/workflows/bump_deps.yaml b/.github/workflows/bump_deps.yaml index 481c9a6092dc..3a7a59fe2a55 100644 --- a/.github/workflows/bump_deps.yaml +++ b/.github/workflows/bump_deps.yaml @@ -46,6 +46,14 @@ jobs: variable: "CED_VERSION" branch: "master" file: "backend/go/ced/Makefile" + - repository: "mudler/voice-detect.cpp" + variable: "VOICEDETECT_VERSION" + branch: "master" + file: "backend/go/voice-detect/Makefile" + - repository: "mudler/face-detect.cpp" + variable: "FACEDETECT_VERSION" + branch: "master" + file: "backend/go/face-detect/Makefile" - repository: "mudler/depth-anything.cpp" variable: "DEPTHANYTHING_VERSION" branch: "master" diff --git a/backend/go/face-detect/.gitignore b/backend/go/face-detect/.gitignore new file mode 100644 index 000000000000..7c80b29aba8d --- /dev/null +++ b/backend/go/face-detect/.gitignore 
@@ -0,0 +1,18 @@ +# Fetched upstream sources +sources/ + +# CMake build directories +build*/ + +# build artifacts staged in-tree by the Makefile (cp from sources/) or +# symlinked for local dev; the real sources live in face-detect.cpp upstream. +*.so +*.so.* +facedetect_capi.h +compile_commands.json + +# Compiled backend binary +face-detect-grpc + +# Packaging output +package/ diff --git a/backend/go/face-detect/Makefile b/backend/go/face-detect/Makefile new file mode 100644 index 000000000000..fa0d3b41b9ce --- /dev/null +++ b/backend/go/face-detect/Makefile @@ -0,0 +1,97 @@ +# face-detect backend Makefile. +# +# Upstream pin lives below as FACEDETECT_VERSION?=c1db23d... (.github/bump_deps.sh +# can find and update it - matches the voice-detect / parakeet.cpp / whisper.cpp +# convention). +# +# Local dev shortcut: if you already have an out-of-tree face-detect.cpp build, +# symlink the .so + header into this directory and skip the clone/cmake steps: +# +# ln -sf /path/to/face-detect.cpp/build-shared/libfacedetect.so . +# ln -sf /path/to/face-detect.cpp/include/facedetect_capi.h . +# go build -o face-detect-grpc . +# +# The default target below does the proper clone-at-pin + cmake build so CI does +# not need a side-checkout. + +FACEDETECT_VERSION?=c1db23d2138907edcf0d6858a89dc884633e920a +FACEDETECT_REPO?=https://github.com/mudler/face-detect.cpp + +GOCMD?=go +GO_TAGS?= +JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4) + +BUILD_TYPE?= +NATIVE?=false + +# Build ggml + the vendored libjpeg-turbo statically into libfacedetect.so (PIC) +# so the shared lib is self-contained: dlopen needs no libggml*.so alongside it, +# only system libs (libstdc++/libgomp/libc) the runtime image already provides. +# The vendored jpeg symbols are hidden via -Wl,--exclude-libs,ALL on the C++ +# side, so only the facedetect_capi_* surface is exported. 
+CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DFACEDETECT_SHARED=ON -DFACEDETECT_BUILD_CLI=OFF -DFACEDETECT_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+
+# Unless the caller overrides NATIVE, pass -DGGML_NATIVE=OFF to the ggml build.
+ifeq ($(NATIVE),false)
+	CMAKE_ARGS+=-DGGML_NATIVE=OFF
+endif
+
+# face-detect.cpp gates its GGML backends behind FACEDETECT_GGML_* options and
+# does set(GGML_CUDA ${FACEDETECT_GGML_CUDA} CACHE BOOL "" FORCE), so a bare
+# -DGGML_CUDA=ON is overwritten back to OFF. Forward the FACEDETECT_GGML_*
+# options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
+#
+# NOTE(review): the CI matrix also builds this backend with
+# BUILD_TYPE=sycl_f32 / sycl_f16, but no branch below forwards a SYCL option, so
+# those images are configured exactly like the CPU build. Confirm whether
+# face-detect.cpp exposes a SYCL switch and add the branch, or drop the matrix
+# entries.
+ifeq ($(BUILD_TYPE),cublas)
+	CMAKE_ARGS+=-DFACEDETECT_GGML_CUDA=ON
+else ifeq ($(BUILD_TYPE),openblas)
+	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+else ifeq ($(BUILD_TYPE),hipblas)
+	CMAKE_ARGS+=-DFACEDETECT_GGML_HIP=ON
+else ifeq ($(BUILD_TYPE),vulkan)
+	CMAKE_ARGS+=-DFACEDETECT_GGML_VULKAN=ON
+else ifeq ($(BUILD_TYPE),metal)
+	CMAKE_ARGS+=-DFACEDETECT_GGML_METAL=ON
+endif
+
+.PHONY: face-detect-grpc package build clean purge test all
+
+all: face-detect-grpc
+
+# Clone the upstream face-detect.cpp source at the pinned commit. Directory acts
+# as the target so make only re-clones when missing. After a FACEDETECT_VERSION
+# bump, run 'make purge && make' to refetch.
+#
+# The checkout is assembled in a scratch directory ($@.tmp) and only renamed to
+# the real target once every git step has succeeded. Creating the target
+# directory up front would make a failed or interrupted fetch look like a
+# finished clone, and make would then never retry it. Any scratch directory left
+# behind by an earlier failure is removed before starting.
+sources/face-detect.cpp:
+	rm -rf $@.tmp
+	mkdir -p $@.tmp
+	cd $@.tmp && \
+	git init -q && \
+	git remote add origin $(FACEDETECT_REPO) && \
+	git fetch --depth 1 origin $(FACEDETECT_VERSION) && \
+	git checkout FETCH_HEAD && \
+	git submodule update --init --recursive --depth 1 --single-branch
+	mv $@.tmp $@
+
+# Build the shared lib + header out-of-tree, then stage them next to the Go
+# sources so purego.Dlopen("libfacedetect.so") and the cgo-less build both pick
+# them up.
+libfacedetect.so: sources/face-detect.cpp
+	cmake -B sources/face-detect.cpp/build-shared -S sources/face-detect.cpp $(CMAKE_ARGS)
+	cmake --build sources/face-detect.cpp/build-shared --config Release -j$(JOBS) --target facedetect
+	cp -fv sources/face-detect.cpp/build-shared/libfacedetect.so* ./ 2>/dev/null || true
+	cp -fv sources/face-detect.cpp/include/facedetect_capi.h ./
+
+# NOTE(review): the '|| true' on the .so copy above means this rule succeeds
+# even when no libfacedetect.so* was produced (presumably to tolerate platforms
+# where the artifact has another suffix - confirm). Because the Go build is
+# cgo-less it will still succeed, so a missing library only surfaces at runtime.
+
+face-detect-grpc: libfacedetect.so main.go gofacedetect.go options.go
+	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o face-detect-grpc .
+
+package: face-detect-grpc
+	bash package.sh
+
+build: package
+
+# Test target. The embed/detect/verify/analyze smoke specs are gated on
+# FACEDETECT_BACKEND_TEST_MODEL + FACEDETECT_BACKEND_TEST_IMAGE; without them the
+# heavy specs auto-skip and only the pure-Go parsing specs run.
+test:
+	LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
+
+clean: purge
+	rm -rf libfacedetect.so* facedetect_capi.h package face-detect-grpc
+
+purge:
+	rm -rf sources/face-detect.cpp
diff --git a/backend/go/face-detect/gofacedetect.go b/backend/go/face-detect/gofacedetect.go
new file mode 100644
index 000000000000..4ad6c067c69b
--- /dev/null
+++ b/backend/go/face-detect/gofacedetect.go
@@ -0,0 +1,431 @@
+package main
+
+import (
+	"encoding/base64"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"math"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"time"
+	"unsafe"
+
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/xlog"
+)
+
+// purego-bound entry points from libfacedetect.so. Names match
+// facedetect_capi.h exactly so a `nm libfacedetect.so | grep facedetect_capi`
+// is enough to spot drift.
+//
+// The opaque ctx and the malloc'd char*/float* return values are declared as
+// uintptr so we get the raw pointer back and can release it via the matching
+// capi free function. purego's native string/[]float32 returns would copy and
+// forget the original pointer, leaking the C-owned buffer on every call.
+var (
+	CppAbiVersion  func() int32
+	CppLoad        func(ggufPath string) uintptr
+	CppFree        func(ctx uintptr)
+	CppLastError   func(ctx uintptr) string
+	CppFreeString  func(s uintptr)
+	CppFreeVec     func(v uintptr)
+	CppEmbedPath   func(ctx uintptr, imagePath string, outVec, outDim unsafe.Pointer) int32
+	CppEmbedRGB    func(ctx uintptr, rgb []byte, width, height int32, outVec, outDim unsafe.Pointer) int32
+	CppDetectJSON  func(ctx uintptr, imagePath string) uintptr
+	CppVerifyPaths func(ctx uintptr, a, b string, threshold float32, antiSpoof int32, outDistance, outVerified unsafe.Pointer) int32
+	CppAnalyzeJSON func(ctx uintptr, imagePath string) uintptr
+)
+
+// FaceDetect implements the face-recognition (biometric) subset of the Backend
+// gRPC service over libfacedetect.so. The C side keeps a single loaded model
+// pack plus a per-ctx last-error buffer and is not reentrant, so
+// base.SingleThread serializes every call.
+type FaceDetect struct {
+	base.SingleThread
+	opts   loadOptions
+	ctxPtr uintptr
+}
+
+// Load resolves the model path from the request, parses the backend options,
+// forwards the thread budget to the engine and loads the model through the
+// C-API. On success the returned context is stored in f.ctxPtr; a zero context
+// is reported as an error.
+func (f *FaceDetect) Load(opts *pb.ModelOptions) error {
+	// Resolve the model path. ModelPath is used verbatim only when ModelFile is
+	// empty; it is a join base only for a relative ModelFile. Handling the two
+	// cases separately avoids joining a relative ModelPath onto itself when
+	// ModelFile is unset.
+	model := opts.ModelFile
+	switch {
+	case model == "":
+		model = opts.ModelPath
+	case !filepath.IsAbs(model) && opts.ModelPath != "":
+		model = filepath.Join(opts.ModelPath, model)
+	}
+	if model == "" {
+		return errors.New("face-detect: ModelFile is required")
+	}
+
+	f.opts = parseOptions(opts.Options)
+	if f.opts.modelName == "" {
+		f.opts.modelName = filepath.Base(model)
+	}
+
+	// Propagate LocalAI's per-model thread budget to the engine. LocalAI spawns
+	// one backend process per model and serves requests concurrently, so the
+	// engine's own min(hardware_concurrency, 8) default can oversubscribe cores.
+	// FACEDETECT_THREADS is read by the engine at backend construction, so it
+	// must be set before the capi load. A non-positive Threads means "unset":
+	// leave the env alone so the engine keeps its sane default.
+	threads := opts.Threads
+	if threads > 0 {
+		if err := os.Setenv("FACEDETECT_THREADS", strconv.Itoa(int(threads))); err != nil {
+			return fmt.Errorf("face-detect: set FACEDETECT_THREADS: %w", err)
+		}
+		xlog.Info("face-detect: applying LocalAI thread budget", "threads", threads)
+	}
+
+	xlog.Info("face-detect: loading model", "model", model,
+		"verify_threshold", f.opts.verifyThreshold, "abi", CppAbiVersion())
+
+	ctx := CppLoad(model)
+	if ctx == 0 {
+		// The last-error buffer lives on the ctx that was never returned, so
+		// surface the path the operator tried to load instead.
+		return fmt.Errorf("face-detect: facedetect_capi_load failed for %q", model)
+	}
+	f.ctxPtr = ctx
+	return nil
+}
+
+// Embeddings returns the L2-normalized ArcFace embedding of the primary face in
+// the supplied image. Mirroring the Python face backend, the image is read from
+// Images[0] as a base64 payload; materializeImage decodes it to a temp file so
+// the path-based C-API can run its own decode (cv2.imread parity). The gRPC
+// server wraps the returned slice in an EmbeddingResult.
+func (f *FaceDetect) Embeddings(req *pb.PredictOptions) ([]float32, error) {
+	if f.ctxPtr == 0 {
+		return nil, errors.New("face-detect: model not loaded")
+	}
+	if len(req.Images) == 0 || req.Images[0] == "" {
+		return nil, errors.New("face-detect: Embedding requires Images[0] to be a base64 image")
+	}
+
+	path, cleanup, err := materializeImage(req.Images[0])
+	if err != nil {
+		return nil, err
+	}
+	defer cleanup()
+
+	return f.embedPath(path)
+}
+
+// embedPath calls the path-based embed entry point and copies the resulting
+// vector into a Go slice. The C buffer is released on every return path.
+func (f *FaceDetect) embedPath(path string) ([]float32, error) {
+	var vec uintptr
+	var dim int32
+	rc := CppEmbedPath(f.ctxPtr, path, unsafe.Pointer(&vec), unsafe.Pointer(&dim))
+	if vec != 0 {
+		// Register the free before validating rc/dim: a call that hands back a
+		// buffer together with a non-positive dim would otherwise leak it on
+		// the error return below. vec starts at zero, so a non-zero value here
+		// was written by the C side.
+		defer CppFreeVec(vec)
+	}
+	if rc != 0 || vec == 0 || dim <= 0 {
+		return nil, f.lastErr("embed", path)
+	}
+	// Copy out of the C-owned malloc'd buffer before freeing it. The
+	// uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
+	// a C heap pointer from Go-managed memory; safe here, the GC neither tracks
+	// nor moves this buffer and we copy immediately.
+	src := unsafe.Slice((*float32)(unsafe.Pointer(vec)), int(dim)) //nolint:govet // C-owned malloc'd vector, copied out before free
+	out := make([]float32, int(dim))
+	copy(out, src)
+	return out, nil
+}
+
+// Detect runs SCRFD over the image and returns one Detection per face. The
+// C-API emits a box as [x1,y1,x2,y2] in pixels; the proto carries x/y plus
+// width/height, so the corners are converted. The 5 facial landmarks the engine
+// also returns are dropped: the Detection message has no field for them.
+func (f *FaceDetect) Detect(req *pb.DetectOptions) (pb.DetectResponse, error) {
+	if f.ctxPtr == 0 {
+		return pb.DetectResponse{}, errors.New("face-detect: model not loaded")
+	}
+	if req.Src == "" {
+		return pb.DetectResponse{}, errors.New("face-detect: src image is required")
+	}
+
+	path, cleanup, err := materializeImage(req.Src)
+	if err != nil {
+		return pb.DetectResponse{}, err
+	}
+	defer cleanup()
+
+	faces, err := f.detectFaces(path)
+	if err != nil {
+		return pb.DetectResponse{}, err
+	}
+
+	dets := make([]*pb.Detection, 0, len(faces))
+	for _, fc := range faces {
+		// A non-positive request threshold disables score filtering.
+		if req.Threshold > 0 && fc.Score < req.Threshold {
+			continue
+		}
+		x, y, w, h := fc.xywh()
+		dets = append(dets, &pb.Detection{
+			X:          x,
+			Y:          y,
+			Width:      w,
+			Height:     h,
+			Confidence: fc.Score,
+			ClassName:  "face",
+		})
+	}
+	return pb.DetectResponse{Detections: dets}, nil
+}
+
+// FaceVerify embeds the primary face in each image and reports whether they are
+// the same identity by cosine distance against a threshold. A request threshold
+// <= 0 falls back to the model-configured default (verify_threshold option,
+// 0.35 if unset). When anti_spoofing is set, the C-API applies a MiniFASNet
+// veto internally (verified forced false on a spoof); the per-image liveness
+// scores are not exposed by the verify entry point, so img*_is_real /
+// img*_antispoof_score stay at their zero values.
+func (f *FaceDetect) FaceVerify(req *pb.FaceVerifyRequest) (pb.FaceVerifyResponse, error) {
+	if f.ctxPtr == 0 {
+		return pb.FaceVerifyResponse{}, errors.New("face-detect: model not loaded")
+	}
+	if req.Img1 == "" || req.Img2 == "" {
+		return pb.FaceVerifyResponse{}, errors.New("face-detect: img1 and img2 are required")
+	}
+
+	path1, cleanup1, err := materializeImage(req.Img1)
+	if err != nil {
+		return pb.FaceVerifyResponse{}, err
+	}
+	defer cleanup1()
+	path2, cleanup2, err := materializeImage(req.Img2)
+	if err != nil {
+		return pb.FaceVerifyResponse{}, err
+	}
+	defer cleanup2()
+
+	threshold := req.Threshold
+	if threshold <= 0 {
+		threshold = f.opts.verifyThreshold
+	}
+
+	antiSpoof := int32(0)
+	if req.AntiSpoofing {
+		antiSpoof = 1
+	}
+
+	// ProcessingTimeMs covers only the verify call; the bestArea detection
+	// passes that fill Img1Area/Img2Area below run after the clock is read.
+	started := time.Now()
+	var distance float32
+	var verified int32
+	rc := CppVerifyPaths(f.ctxPtr, path1, path2, threshold, antiSpoof,
+		unsafe.Pointer(&distance), unsafe.Pointer(&verified))
+	if rc != 0 {
+		// Only a short prefix of img1 is quoted so a full base64 payload never
+		// ends up in the error text.
+		return pb.FaceVerifyResponse{}, f.lastErr("verify", req.Img1[:min(8, len(req.Img1))]+"...")
+	}
+	elapsedMs := float32(time.Since(started).Seconds() * 1000.0)
+
+	// Confidence decays linearly from 100 at distance 0 to 0 at the threshold,
+	// matching the Python face backend's reporting.
+	confidence := float32(0)
+	if threshold > 0 {
+		confidence = float32(math.Max(0, math.Min(100, (1.0-float64(distance)/float64(threshold))*100.0)))
+	}
+
+	return pb.FaceVerifyResponse{
+		Verified:         verified != 0,
+		Distance:         distance,
+		Threshold:        threshold,
+		Confidence:       confidence,
+		Model:            f.opts.modelName,
+		Img1Area:         f.bestArea(path1),
+		Img2Area:         f.bestArea(path2),
+		ProcessingTimeMs: elapsedMs,
+	}, nil
+}
+
+// FaceAnalyze runs the genderage head on every detected face. The C-API returns
+// "M"/"F" gender labels and a rounded age; the labels are normalized to the
+// "Man"/"Woman" values the proto documents.
+func (f *FaceDetect) FaceAnalyze(req *pb.FaceAnalyzeRequest) (pb.FaceAnalyzeResponse, error) {
+	if f.ctxPtr == 0 {
+		return pb.FaceAnalyzeResponse{}, errors.New("face-detect: model not loaded")
+	}
+	if req.Img == "" {
+		return pb.FaceAnalyzeResponse{}, errors.New("face-detect: img is required")
+	}
+
+	path, cleanup, err := materializeImage(req.Img)
+	if err != nil {
+		return pb.FaceAnalyzeResponse{}, err
+	}
+	defer cleanup()
+
+	ptr := CppAnalyzeJSON(f.ctxPtr, path)
+	if ptr == 0 {
+		return pb.FaceAnalyzeResponse{}, f.lastErr("analyze", path)
+	}
+	defer CppFreeString(ptr)
+
+	faces, err := parseAnalyzeJSON(goStringFromCPtr(ptr))
+	if err != nil {
+		return pb.FaceAnalyzeResponse{}, fmt.Errorf("face-detect: analyze JSON: %w", err)
+	}
+	return pb.FaceAnalyzeResponse{Faces: faces}, nil
+}
+
+// faceBox is one entry of the detect/analyze JSON documents the engine emits.
+type faceBox struct {
+	Score  float32   `json:"score"`
+	Box    []float32 `json:"box"`
+	Age    float32   `json:"age"`
+	Gender string    `json:"gender"`
+}
+
+// xywh converts the engine's [x1,y1,x2,y2] box into the x/y/width/height the
+// proto carries. A short or missing box yields zeros.
+func (b faceBox) xywh() (x, y, w, h float32) {
+	// The named results start at zero, so a box with fewer than four
+	// coordinates falls straight through to the all-zero return.
+	if len(b.Box) >= 4 {
+		x, y = b.Box[0], b.Box[1]
+		w, h = b.Box[2]-x, b.Box[3]-y
+	}
+	return x, y, w, h
+}
+
+// facesJSON is the top-level document shape: a "faces" array of faceBox.
+type facesJSON struct {
+	Faces []faceBox `json:"faces"`
+}
+
+// detectFaces asks the C-API for the detect JSON of the image at path and
+// decodes it into faceBox entries. The C string is released on return.
+func (f *FaceDetect) detectFaces(path string) ([]faceBox, error) {
+	raw := CppDetectJSON(f.ctxPtr, path)
+	if raw == 0 {
+		return nil, f.lastErr("detect", path)
+	}
+	defer CppFreeString(raw)
+
+	payload := []byte(goStringFromCPtr(raw))
+	var parsed facesJSON
+	if err := json.Unmarshal(payload, &parsed); err != nil {
+		return nil, fmt.Errorf("face-detect: detect JSON: %w", err)
+	}
+	return parsed.Faces, nil
+}
+
+// bestArea reports the FacialArea of the top-scoring face found in the image.
+// It is deliberately best-effort: a detection error or an empty result gives an
+// empty area rather than an error, because the caller has already completed a
+// successful verify and must not fail on a missing region.
+func (f *FaceDetect) bestArea(path string) *pb.FacialArea {
+	faces, err := f.detectFaces(path)
+	if err != nil || len(faces) == 0 {
+		return &pb.FacialArea{}
+	}
+	// Strict comparison keeps the earliest face when scores tie.
+	top := 0
+	for i := 1; i < len(faces); i++ {
+		if faces[i].Score > faces[top].Score {
+			top = i
+		}
+	}
+	x, y, w, h := faces[top].xywh()
+	return &pb.FacialArea{X: x, Y: y, W: w, H: h}
+}
+
+// parseAnalyzeJSON maps the engine's analyze document onto FaceAnalysis entries.
+// The engine reports gender as "M"/"F"; both the dominant label and the score
+// map are filled with the "Man"/"Woman" form the proto documents.
+func parseAnalyzeJSON(doc string) ([]*pb.FaceAnalysis, error) {
+	var decoded facesJSON
+	if err := json.Unmarshal([]byte(doc), &decoded); err != nil {
+		return nil, err
+	}
+
+	// One FaceAnalysis per decoded face, in document order.
+	results := make([]*pb.FaceAnalysis, len(decoded.Faces))
+	for i, face := range decoded.Faces {
+		x, y, w, h := face.xywh()
+		entry := &pb.FaceAnalysis{
+			Region:         &pb.FacialArea{X: x, Y: y, W: w, H: h},
+			FaceConfidence: face.Score,
+			Age:            face.Age,
+		}
+		// Gender fields stay unset when the engine reported no gender code.
+		label := normalizeGender(face.Gender)
+		if label != "" {
+			entry.DominantGender = label
+			entry.Gender = map[string]float32{label: 1.0}
+		}
+		results[i] = entry
+	}
+	return results, nil
+}
+
+// normalizeGender translates the engine's "M"/"F" code into the "Man"/"Woman"
+// labels the proto documents. Matching ignores case and surrounding
+// whitespace; a blank code yields "", and any other value is returned exactly
+// as it was passed in.
+func normalizeGender(g string) string {
+	code := strings.ToUpper(strings.TrimSpace(g))
+	if code == "" {
+		return ""
+	}
+	if code == "M" {
+		return "Man"
+	}
+	if code == "F" {
+		return "Woman"
+	}
+	return g
+}
+
+// materializeImage decodes a base64 image payload into a temp file and returns
+// its path plus a cleanup func. As a convenience for callers that already pass a
+// filesystem path (e.g. a test fixture), an existing path is used as-is with a
+// no-op cleanup. data: URI prefixes are stripped before decoding.
+func materializeImage(src string) (path string, cleanup func(), err error) {
+	noop := func() {}
+	if src == "" {
+		return "", noop, errors.New("face-detect: empty image input")
+	}
+	// Path shortcut: only an existing *regular file* is handed through. A
+	// directory, FIFO or device node (e.g. /dev/zero) also stats successfully
+	// but can never be an image, and letting the decoder open one risks a hang
+	// or an unbounded read; such inputs fall through to the base64 branch.
+	//
+	// NOTE(review): src comes straight from the request, so this shortcut lets
+	// a caller point the decoder at any regular file readable by the backend
+	// process. Confirm that is acceptable for deployments that expose the API
+	// to untrusted clients.
+	if info, statErr := os.Stat(src); statErr == nil && info.Mode().IsRegular() {
+		return src, noop, nil
+	}
+
+	// Drop a "data:<mime>;base64," style prefix: everything up to and including
+	// the first comma is metadata.
+	payload := src
+	if i := strings.Index(payload, ","); strings.HasPrefix(payload, "data:") && i >= 0 {
+		payload = payload[i+1:]
+	}
+	data, decErr := base64.StdEncoding.DecodeString(strings.TrimSpace(payload))
+	if decErr != nil || len(data) == 0 {
+		return "", noop, errors.New("face-detect: image is neither an existing path nor valid base64")
+	}
+
+	tmp, createErr := os.CreateTemp("", "face-detect-*.img")
+	if createErr != nil {
+		return "", noop, fmt.Errorf("face-detect: create temp image: %w", createErr)
+	}
+	// From here on the temp file exists; every failure path removes it and
+	// still hands the caller a callable no-op so `defer cleanup()` is safe.
+	cleanup = func() { _ = os.Remove(tmp.Name()) }
+	if _, wErr := tmp.Write(data); wErr != nil {
+		_ = tmp.Close()
+		cleanup()
+		return "", noop, fmt.Errorf("face-detect: write temp image: %w", wErr)
+	}
+	if cErr := tmp.Close(); cErr != nil {
+		cleanup()
+		return "", noop, fmt.Errorf("face-detect: close temp image: %w", cErr)
+	}
+	return tmp.Name(), cleanup, nil
+}
+
+// lastErr wraps the C-API's per-ctx last-error buffer into a Go error. op names
+// the failed operation and subject the input it was applied to; when the buffer
+// is empty the message falls back to "no error detail".
+func (f *FaceDetect) lastErr(op, subject string) error {
+	msg := strings.TrimSpace(CppLastError(f.ctxPtr))
+	if msg == "" {
+		msg = "no error detail"
+	}
+	return fmt.Errorf("face-detect: %s failed for %q: %s", op, subject, msg)
+}
+
+// goStringFromCPtr copies a NUL-terminated C string into Go memory. cptr is a
+// malloc'd buffer the caller owns; release it via CppFreeString after the copy.
+//
+// The uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
+// a C heap pointer from Go-managed memory. Safe here: the GC neither tracks nor
+// moves the buffer and we dereference it immediately to copy the bytes out.
+func goStringFromCPtr(cptr uintptr) string {
+	// A zero pointer maps to the empty string instead of being dereferenced.
+	if cptr == 0 {
+		return ""
+	}
+	p := unsafe.Pointer(cptr) //nolint:govet // C-owned malloc'd buffer, not Go-GC memory (see doc above)
+	// Walk forward one byte at a time until the NUL terminator to find the
+	// string length.
+	n := 0
+	for *(*byte)(unsafe.Add(p, n)) != 0 {
+		n++
+	}
+	// The []byte -> string conversion copies the n bytes into Go memory.
+	return string(unsafe.Slice((*byte)(p), n))
+}
diff --git a/backend/go/face-detect/gofacedetect_test.go b/backend/go/face-detect/gofacedetect_test.go
new file mode 100644
index 000000000000..54a942fba41b
--- /dev/null
+++ b/backend/go/face-detect/gofacedetect_test.go
@@ -0,0 +1,230 @@
+package main
+
+import (
+	"encoding/base64"
+	"os"
+	"sync"
+	"testing"
+
+	"github.com/ebitengine/purego"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// TestFaceDetect is the `go test` entry point: it registers Ginkgo's Fail as
+// the Gomega fail handler and runs the spec suite.
+func TestFaceDetect(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "face-detect Backend Suite")
+}
+
+// libLoadOnce makes the library bootstrap in ensureLibLoaded run at most once;
+// libLoadErr keeps the Dlopen error from that single attempt.
+var (
+	libLoadOnce sync.Once
+	libLoadErr  error
+)
+
+// ensureLibLoaded mirrors main.go's bootstrap so a Go test can drive the C-API
+// bridge without spinning up the gRPC server. Records the error (the smoke
+// specs skip themselves) when libfacedetect.so is not loadable from cwd
+// (LD_LIBRARY_PATH or a symlink in ./).
+func ensureLibLoaded() error {
+	libLoadOnce.Do(func() {
+		// FACEDETECT_LIBRARY overrides the library name passed to Dlopen.
+		libName := os.Getenv("FACEDETECT_LIBRARY")
+		if libName == "" {
+			libName = "libfacedetect.so"
+		}
+		lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
+		if err != nil {
+			libLoadErr = err
+			return
+		}
+		// Bind every package-level Cpp* function variable to its C symbol.
+		purego.RegisterLibFunc(&CppAbiVersion, lib, "facedetect_capi_abi_version")
+		purego.RegisterLibFunc(&CppLoad, lib, "facedetect_capi_load")
+		purego.RegisterLibFunc(&CppFree, lib, "facedetect_capi_free")
+		purego.RegisterLibFunc(&CppLastError, lib, "facedetect_capi_last_error")
+		purego.RegisterLibFunc(&CppFreeString, lib, "facedetect_capi_free_string")
+		purego.RegisterLibFunc(&CppFreeVec, lib, "facedetect_capi_free_vec")
+		purego.RegisterLibFunc(&CppEmbedPath, lib, "facedetect_capi_embed_path")
+		purego.RegisterLibFunc(&CppEmbedRGB, lib, "facedetect_capi_embed_rgb")
+		purego.RegisterLibFunc(&CppDetectJSON, lib, "facedetect_capi_detect_path_json")
+		purego.RegisterLibFunc(&CppVerifyPaths, lib, "facedetect_capi_verify_paths")
+		purego.RegisterLibFunc(&CppAnalyzeJSON, lib, "facedetect_capi_analyze_path_json")
+	})
+	return libLoadErr
+}
+
+// parseOptions specs: default threshold, the accepted option keys, and
+// rejection of non-positive thresholds.
+var _ = Describe("parseOptions", func() {
+	It("defaults verify_threshold to 0.35", func() {
+		o := parseOptions(nil)
+		Expect(o.verifyThreshold).To(Equal(float32(0.35)))
+		Expect(o.modelName).To(Equal(""))
+	})
+
+	It("parses verify_threshold, threshold alias and model_name", func() {
+		o := parseOptions([]string{"verify_threshold:0.4", "model_name:buffalo_l", "unknown:x"})
+		Expect(o.verifyThreshold).To(Equal(float32(0.4)))
+		Expect(o.modelName).To(Equal("buffalo_l"))
+
+		o2 := parseOptions([]string{"threshold:0.3"})
+		Expect(o2.verifyThreshold).To(Equal(float32(0.3)))
+	})
+
+	It("ignores non-positive thresholds and keeps the default", func() {
+		o := parseOptions([]string{"verify_threshold:0", "threshold:-1"})
+		Expect(o.verifyThreshold).To(Equal(float32(0.35)))
+	})
+})
+
+// normalizeGender specs: case/whitespace-insensitive M/F mapping and
+// pass-through of empty and unknown codes.
+var _ = Describe("normalizeGender", func() {
+	It("maps M/F codes to Man/Woman", func() {
+		Expect(normalizeGender("M")).To(Equal("Man"))
+		Expect(normalizeGender("f")).To(Equal("Woman"))
+		Expect(normalizeGender(" m ")).To(Equal("Man"))
+	})
+
+	It("passes empty and unknown codes through", func() {
+		Expect(normalizeGender("")).To(Equal(""))
+		Expect(normalizeGender("nonbinary")).To(Equal("nonbinary"))
+	})
+})
+
+// faceBox.xywh specs: corner-to-size conversion and the short-box fallback.
+var _ = Describe("faceBox.xywh", func() {
+	It("converts an [x1,y1,x2,y2] box to x/y/width/height", func() {
+		b := faceBox{Box: []float32{10, 20, 50, 80}}
+		x, y, w, h := b.xywh()
+		Expect(x).To(Equal(float32(10)))
+		Expect(y).To(Equal(float32(20)))
+		Expect(w).To(Equal(float32(40)))
+		Expect(h).To(Equal(float32(60)))
+	})
+
+	It("returns zeros for a short box", func() {
+		x, y, w, h := faceBox{Box: []float32{1, 2}}.xywh()
+		Expect([]float32{x, y, w, h}).To(Equal([]float32{0, 0, 0, 0}))
+	})
+})
+
+// parseAnalyzeJSON specs: field mapping for well-formed documents, a missing
+// gender field, an empty faces array, and malformed JSON.
+var _ = Describe("parseAnalyzeJSON", func() {
+	It("maps region, age and gender for each face", func() {
+		doc := `{"faces":[
+			{"score":0.997,"box":[10,20,50,80],"age":31,"gender":"M"},
+			{"score":0.81,"box":[0,0,40,40],"age":24,"gender":"F"}]}`
+		faces, err := parseAnalyzeJSON(doc)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(faces).To(HaveLen(2))
+
+		Expect(faces[0].FaceConfidence).To(BeNumerically("~", 0.997, 1e-4))
+		Expect(faces[0].Age).To(BeNumerically("~", 31, 1e-4))
+		Expect(faces[0].DominantGender).To(Equal("Man"))
+		Expect(faces[0].Gender).To(HaveKeyWithValue("Man", float32(1.0)))
+		Expect(faces[0].Region.W).To(Equal(float32(40)))
+		Expect(faces[0].Region.H).To(Equal(float32(60)))
+
+		Expect(faces[1].DominantGender).To(Equal("Woman"))
+	})
+
+	It("tolerates a missing gender field", func() {
+		faces, err := parseAnalyzeJSON(`{"faces":[{"score":0.5,"box":[0,0,10,10],"age":40}]}`)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(faces).To(HaveLen(1))
+		Expect(faces[0].DominantGender).To(Equal(""))
+		Expect(faces[0].Gender).To(BeEmpty())
+	})
+
+	It("returns no faces for an empty document", func() {
+		faces, err := parseAnalyzeJSON(`{"faces":[]}`)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(faces).To(BeEmpty())
+	})
+
+	It("returns an error on malformed JSON", func() {
+		_, err := parseAnalyzeJSON(`{not-json`)
+		Expect(err).To(HaveOccurred())
+	})
+})
+
+// materializeImage specs: base64 decoding, data: URI stripping, path
+// passthrough for an existing file, and rejection of invalid input.
+var _ = Describe("materializeImage", func() {
+	It("decodes a base64 payload to a temp file", func() {
+		payload := base64.StdEncoding.EncodeToString([]byte("\xff\xd8\xff\xe0fake-jpeg"))
+		path, cleanup, err := materializeImage(payload)
+		Expect(err).ToNot(HaveOccurred())
+		defer cleanup()
+		data, rerr := os.ReadFile(path)
+		Expect(rerr).ToNot(HaveOccurred())
+		Expect(data).To(Equal([]byte("\xff\xd8\xff\xe0fake-jpeg")))
+	})
+
+	It("strips a data: URI prefix before decoding", func() {
+		payload := "data:image/png;base64," + base64.StdEncoding.EncodeToString([]byte("hello"))
+		path, cleanup, err := materializeImage(payload)
+		Expect(err).ToNot(HaveOccurred())
+		defer cleanup()
+		data, rerr := os.ReadFile(path)
+		Expect(rerr).ToNot(HaveOccurred())
+		Expect(data).To(Equal([]byte("hello")))
+	})
+
+	It("uses an existing path as-is", func() {
+		tmp, err := os.CreateTemp("", "face-detect-fixture-*.bin")
+		Expect(err).ToNot(HaveOccurred())
+		defer func() { _ = os.Remove(tmp.Name()) }()
+		Expect(tmp.Close()).To(Succeed())
+
+		path, cleanup, err := materializeImage(tmp.Name())
+		Expect(err).ToNot(HaveOccurred())
+		defer cleanup()
+		Expect(path).To(Equal(tmp.Name()))
+	})
+
+	It("errors on input that is neither a path nor base64", func() {
+		_, _, err := materializeImage("not base64!!!")
+		Expect(err).To(HaveOccurred())
+	})
+})
+
+// The specs below exercise the real C-API end to end. They run only when both a
+// model GGUF and a test image are provided, and skip cleanly otherwise so the
+// suite stays green without large assets.
+var _ = Describe("FaceDetect end-to-end", Ordered, func() { + var ( + f *FaceDetect + modelPath = os.Getenv("FACEDETECT_BACKEND_TEST_MODEL") + imagePath = os.Getenv("FACEDETECT_BACKEND_TEST_IMAGE") + ) + + BeforeAll(func() { + if modelPath == "" || imagePath == "" { + Skip("set FACEDETECT_BACKEND_TEST_MODEL and FACEDETECT_BACKEND_TEST_IMAGE to run the e2e specs") + } + if err := ensureLibLoaded(); err != nil { + Skip("libfacedetect.so not loadable: " + err.Error()) + } + f = &FaceDetect{} + Expect(f.Load(&pb.ModelOptions{ModelFile: modelPath})).To(Succeed()) + }) + + It("embeds the primary face in an image", func() { + emb, err := f.Embeddings(&pb.PredictOptions{Images: []string{imagePath}}) + Expect(err).ToNot(HaveOccurred()) + Expect(emb).ToNot(BeEmpty()) + }) + + It("detects at least one face", func() { + resp, err := f.Detect(&pb.DetectOptions{Src: imagePath}) + Expect(err).ToNot(HaveOccurred()) + Expect(resp.Detections).ToNot(BeEmpty()) + Expect(resp.Detections[0].ClassName).To(Equal("face")) + }) + + It("verifies an image against itself as the same identity", func() { + resp, err := f.FaceVerify(&pb.FaceVerifyRequest{Img1: imagePath, Img2: imagePath}) + Expect(err).ToNot(HaveOccurred()) + Expect(resp.Verified).To(BeTrue()) + Expect(resp.Distance).To(BeNumerically("<=", resp.Threshold)) + }) + + It("analyzes age/gender for each face", func() { + resp, err := f.FaceAnalyze(&pb.FaceAnalyzeRequest{Img: imagePath}) + Expect(err).ToNot(HaveOccurred()) + Expect(resp.Faces).ToNot(BeEmpty()) + }) +}) diff --git a/backend/go/face-detect/main.go b/backend/go/face-detect/main.go new file mode 100644 index 000000000000..dc52f1e606ca --- /dev/null +++ b/backend/go/face-detect/main.go @@ -0,0 +1,65 @@ +package main + +// Started internally by LocalAI - one gRPC server per loaded model. +// +// Loads libfacedetect.so via purego and registers the flat C-API entry points +// declared in facedetect_capi.h. 
The library name can be overridden with +// FACEDETECT_LIBRARY (mirrors the VOICEDETECT_LIBRARY / PARAKEET_LIBRARY +// convention in the sibling backends); the default looks for the .so next to +// this binary (resolved via LD_LIBRARY_PATH by run.sh). +import ( + "flag" + "fmt" + "os" + + "github.com/ebitengine/purego" + grpc "github.com/mudler/LocalAI/pkg/grpc" +) + +var ( + addr = flag.String("addr", "localhost:50051", "the address to connect to") +) + +type LibFuncs struct { + FuncPtr any + Name string +} + +func main() { + libName := os.Getenv("FACEDETECT_LIBRARY") + if libName == "" { + libName = "libfacedetect.so" + } + + lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL) + if err != nil { + panic(fmt.Errorf("face-detect: dlopen %q: %w", libName, err)) + } + + // Bound 1:1 to facedetect_capi.h. char*/float* returns are registered as + // uintptr so the raw pointer can be freed via the matching capi free fn. + libFuncs := []LibFuncs{ + {&CppAbiVersion, "facedetect_capi_abi_version"}, + {&CppLoad, "facedetect_capi_load"}, + {&CppFree, "facedetect_capi_free"}, + {&CppLastError, "facedetect_capi_last_error"}, + {&CppFreeString, "facedetect_capi_free_string"}, + {&CppFreeVec, "facedetect_capi_free_vec"}, + {&CppEmbedPath, "facedetect_capi_embed_path"}, + {&CppEmbedRGB, "facedetect_capi_embed_rgb"}, + {&CppDetectJSON, "facedetect_capi_detect_path_json"}, + {&CppVerifyPaths, "facedetect_capi_verify_paths"}, + {&CppAnalyzeJSON, "facedetect_capi_analyze_path_json"}, + } + for _, lf := range libFuncs { + purego.RegisterLibFunc(lf.FuncPtr, lib, lf.Name) + } + + fmt.Fprintf(os.Stderr, "[face-detect] ABI=%d\n", CppAbiVersion()) + + flag.Parse() + + if err := grpc.StartServer(*addr, &FaceDetect{}); err != nil { + panic(err) + } +} diff --git a/backend/go/face-detect/options.go b/backend/go/face-detect/options.go new file mode 100644 index 000000000000..51951bfd714f --- /dev/null +++ b/backend/go/face-detect/options.go @@ -0,0 +1,47 @@ +package main + 
// defaultVerifyThreshold is the verification cutoff applied when neither the
// model options nor the request supply a positive one.
const defaultVerifyThreshold float32 = 0.35

// loadOptions is the parsed form of the model-level "key:value" options.
type loadOptions struct {
	verifyThreshold float32
	modelName       string
}

// splitOption cuts o at its first ':' and trims whitespace from both halves.
// ok is false when o contains no ':' at all.
func splitOption(o string) (key, value string, ok bool) {
	rawKey, rawValue, found := strings.Cut(o, ":")
	if !found {
		return "", "", false
	}
	return strings.TrimSpace(rawKey), strings.TrimSpace(rawValue), true
}

// parseOptions folds the backend option slice into a loadOptions value.
// Entries without a ':' and entries with an unknown key are skipped. The
// threshold starts at defaultVerifyThreshold and is only replaced by a value
// that parses as a float and is strictly positive.
func parseOptions(opts []string) loadOptions {
	parsed := loadOptions{verifyThreshold: defaultVerifyThreshold}
	for _, entry := range opts {
		key, value, ok := splitOption(entry)
		if !ok {
			continue
		}
		if key == "model_name" {
			parsed.modelName = value
			continue
		}
		if key != "verify_threshold" && key != "threshold" {
			continue
		}
		threshold, err := strconv.ParseFloat(value, 32)
		// Written as !(x > 0) rather than x <= 0 so that NaN is rejected too.
		if err != nil || !(threshold > 0) {
			continue
		}
		parsed.verifyThreshold = float32(threshold)
	}
	return parsed
}
+ +mkdir -p "$CURDIR/package/lib" + +cp -avf "$CURDIR/face-detect-grpc" "$CURDIR/package/" +cp -avf "$CURDIR/run.sh" "$CURDIR/package/" + +# libfacedetect.so + any soname symlinks. purego.Dlopen resolves it via +# LD_LIBRARY_PATH, which run.sh points at lib/. +cp -avf "$CURDIR"/libfacedetect.so* "$CURDIR/package/lib/" 2>/dev/null || { + echo "ERROR: libfacedetect.so not found in $CURDIR, run 'make' first" >&2 + exit 1 +} + +# Detect architecture and copy the core runtime libs libfacedetect.so links +# against, plus the matching dynamic loader as lib/ld.so. +if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then + echo "Detected x86_64 architecture, copying x86_64 libraries..." + cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so" + cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6" + cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1" + cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6" + cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6" + cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1" + cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2" + cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1" + cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0" +elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then + echo "Detected ARM64 architecture, copying ARM64 libraries..." 
#!/bin/bash
# Launcher for the face-detect gRPC backend. Points the dynamic loader at the
# libraries shipped next to this script and then replaces itself with the
# backend binary, forwarding every argument unchanged.
set -e

CURDIR=$(dirname "$(realpath "$0")")

# Search lib/ first, then the directory holding this script. The caller's
# LD_LIBRARY_PATH is appended only when it is non-empty: an unconditional
# trailing ":" would leave an empty entry, which the dynamic loader resolves to
# the current working directory.
export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

# If a self-contained ld.so was packaged, route through it so the packaged
# libc / libstdc++ are used instead of the host's (matches the voice-detect /
# whisper / parakeet backends' runtime layout).
if [ -f "$CURDIR/lib/ld.so" ]; then
    echo "Using lib/ld.so"
    exec "$CURDIR/lib/ld.so" "$CURDIR/face-detect-grpc" "$@"
fi

# No bundled loader: run the binary with the host's dynamic loader.
exec "$CURDIR/face-detect-grpc" "$@"
+# +# Local dev shortcut: if you already have an out-of-tree voice-detect.cpp build, +# symlink the .so + header into this directory and skip the clone/cmake steps: +# +# ln -sf /path/to/voice-detect.cpp/build-shared/libvoicedetect.so . +# ln -sf /path/to/voice-detect.cpp/include/voicedetect_capi.h . +# go build -o voice-detect-grpc . +# +# The default target below does the proper clone-at-pin + cmake build so CI does +# not need a side-checkout. + +VOICEDETECT_VERSION?=d2839ca0f24118d8f30ab9fbf53bd9d9fcfca978 +VOICEDETECT_REPO?=https://github.com/mudler/voice-detect.cpp + +GOCMD?=go +GO_TAGS?= +JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4) + +BUILD_TYPE?= +NATIVE?=false + +# Build ggml statically into libvoicedetect.so (PIC) so the shared lib is +# self-contained: dlopen needs no libggml*.so alongside it, only system libs +# (libstdc++/libgomp/libc) that the runtime image already provides. +CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DVOICEDETECT_SHARED=ON -DVOICEDETECT_BUILD_CLI=OFF -DVOICEDETECT_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON + +ifeq ($(NATIVE),false) + CMAKE_ARGS+=-DGGML_NATIVE=OFF +endif + +# voice-detect.cpp gates its GGML backends behind VOICEDETECT_GGML_* options and +# does set(GGML_CUDA ${VOICEDETECT_GGML_CUDA} CACHE BOOL "" FORCE), so a bare +# -DGGML_CUDA=ON is overwritten back to OFF. Forward the VOICEDETECT_GGML_* +# options instead. (openblas is not gated, so -DGGML_BLAS passes through.) 
ifeq ($(BUILD_TYPE),cublas)
	CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDA=ON
else ifeq ($(BUILD_TYPE),openblas)
	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
else ifeq ($(BUILD_TYPE),hipblas)
	CMAKE_ARGS+=-DVOICEDETECT_GGML_HIP=ON
else ifeq ($(BUILD_TYPE),vulkan)
	CMAKE_ARGS+=-DVOICEDETECT_GGML_VULKAN=ON
else ifeq ($(BUILD_TYPE),metal)
	CMAKE_ARGS+=-DVOICEDETECT_GGML_METAL=ON
endif

.PHONY: voice-detect-grpc package build clean purge test all

all: voice-detect-grpc

# Clone the upstream voice-detect.cpp source at the pinned commit. Directory acts
# as the target so make only re-clones when missing. After a VOICEDETECT_VERSION
# bump, run 'make purge && make' to refetch.
#
# The directory is created before anything is fetched, so a failed fetch,
# checkout or submodule update would otherwise leave it behind and make would
# treat the target as up to date on the next run. The trailing '|| { ... }'
# removes the partial checkout and fails the recipe so the next run starts clean.
sources/voice-detect.cpp:
	mkdir -p sources/voice-detect.cpp
	cd sources/voice-detect.cpp && \
	git init -q && \
	git remote add origin $(VOICEDETECT_REPO) && \
	git fetch --depth 1 origin $(VOICEDETECT_VERSION) && \
	git checkout FETCH_HEAD && \
	git submodule update --init --recursive --depth 1 --single-branch || \
	{ cd "$(CURDIR)" && rm -rf sources/voice-detect.cpp; exit 1; }

# Build the shared lib + header out-of-tree, then stage them next to the Go
# sources so purego.Dlopen("libvoicedetect.so") and the cgo-less build both pick
# them up.
# NOTE(review): the first cp ignores failure ('|| true'), so this rule can
# succeed without producing libvoicedetect.so; presumably deliberate for
# platforms whose shared-library suffix is not .so - confirm.
libvoicedetect.so: sources/voice-detect.cpp
	cmake -B sources/voice-detect.cpp/build-shared -S sources/voice-detect.cpp $(CMAKE_ARGS)
	cmake --build sources/voice-detect.cpp/build-shared --config Release -j$(JOBS) --target voicedetect
	cp -fv sources/voice-detect.cpp/build-shared/libvoicedetect.so* ./ 2>/dev/null || true
	cp -fv sources/voice-detect.cpp/include/voicedetect_capi.h ./

# CGO is disabled: the library is opened at run time, not linked.
voice-detect-grpc: libvoicedetect.so main.go govoicedetect.go options.go
	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o voice-detect-grpc .

package: voice-detect-grpc
	bash package.sh

build: package
The embed/verify/analyze smoke specs are gated on +# VOICEDETECT_BACKEND_TEST_MODEL + VOICEDETECT_BACKEND_TEST_WAV; without them the +# heavy specs auto-skip and only the pure-Go parsing specs run. +test: + LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1 + +clean: purge + rm -rf libvoicedetect.so* voicedetect_capi.h package voice-detect-grpc + +purge: + rm -rf sources/voice-detect.cpp diff --git a/backend/go/voice-detect/govoicedetect.go b/backend/go/voice-detect/govoicedetect.go new file mode 100644 index 000000000000..2bbe74bd0dec --- /dev/null +++ b/backend/go/voice-detect/govoicedetect.go @@ -0,0 +1,273 @@ +package main + +import ( + "encoding/json" + "errors" + "fmt" + "math" + "os" + "path/filepath" + "strconv" + "strings" + "time" + "unsafe" + + "github.com/mudler/LocalAI/pkg/grpc/base" + pb "github.com/mudler/LocalAI/pkg/grpc/proto" + "github.com/mudler/xlog" +) + +// purego-bound entry points from libvoicedetect.so. Names match +// voicedetect_capi.h exactly so a `nm libvoicedetect.so | grep voicedetect_capi` +// is enough to spot drift. +// +// The opaque ctx and the malloc'd char*/float* return values are declared as +// uintptr so we get the raw pointer back and can release it via the matching +// capi free function. purego's native string/[]float32 returns would copy and +// forget the original pointer, leaking the C-owned buffer on every call. 
+var ( + CppAbiVersion func() int32 + CppLoad func(ggufPath string) uintptr + CppFree func(ctx uintptr) + CppLastError func(ctx uintptr) string + CppFreeString func(s uintptr) + CppFreeVec func(v uintptr) + CppEmbedPath func(ctx uintptr, wavPath string, outVec, outDim unsafe.Pointer) int32 + CppEmbedPCM func(ctx uintptr, pcm []float32, nSamples, sampleRate int32, outVec, outDim unsafe.Pointer) int32 + CppVerifyPaths func(ctx uintptr, a, b string, threshold float32, outDistance, outVerified unsafe.Pointer) int32 + CppAnalyzeJSON func(ctx uintptr, wavPath string) uintptr +) + +// VoiceDetect implements the speaker-recognition voice subset of the Backend +// gRPC service over libvoicedetect.so. The C side keeps a single loaded model +// plus a per-ctx last-error buffer and is not reentrant, so base.SingleThread +// serializes every call. +type VoiceDetect struct { + base.SingleThread + opts loadOptions + ctxPtr uintptr +} + +func (v *VoiceDetect) Load(opts *pb.ModelOptions) error { + model := opts.ModelFile + if model == "" { + model = opts.ModelPath + } + if !filepath.IsAbs(model) && opts.ModelPath != "" { + model = filepath.Join(opts.ModelPath, model) + } + if model == "" { + return errors.New("voice-detect: ModelFile is required") + } + + v.opts = parseOptions(opts.Options) + if v.opts.modelName == "" { + v.opts.modelName = filepath.Base(model) + } + + // Propagate LocalAI's per-model thread budget to the engine. LocalAI spawns + // one backend process per model and serves requests concurrently, so the + // engine's own min(hardware_concurrency, 8) default can oversubscribe cores. + // VOICEDETECT_THREADS is read by the engine at backend construction, so it + // must be set before the capi load. A non-positive Threads means "unset": + // leave the env alone so the engine keeps its sane default. 
+ threads := opts.Threads + if threads > 0 { + if err := os.Setenv("VOICEDETECT_THREADS", strconv.Itoa(int(threads))); err != nil { + return fmt.Errorf("voice-detect: set VOICEDETECT_THREADS: %w", err) + } + xlog.Info("voice-detect: applying LocalAI thread budget", "threads", threads) + } + + xlog.Info("voice-detect: loading model", "model", model, + "verify_threshold", v.opts.verifyThreshold, "abi", CppAbiVersion()) + + ctx := CppLoad(model) + if ctx == 0 { + // The last-error buffer lives on the ctx that was never returned, so + // surface the path the operator tried to load instead. + return fmt.Errorf("voice-detect: voicedetect_capi_load failed for %q", model) + } + v.ctxPtr = ctx + return nil +} + +// VoiceEmbed returns the L2-normalized speaker embedding for an audio clip. +// The request carries a filesystem PATH; the HTTP layer materializes +// base64/URL/data-URI inputs to a temp file before the gRPC call. +func (v *VoiceDetect) VoiceEmbed(req *pb.VoiceEmbedRequest) (pb.VoiceEmbedResponse, error) { + if v.ctxPtr == 0 { + return pb.VoiceEmbedResponse{}, errors.New("voice-detect: model not loaded") + } + if req.Audio == "" { + return pb.VoiceEmbedResponse{}, errors.New("voice-detect: audio path is required") + } + emb, err := v.embedPath(req.Audio) + if err != nil { + return pb.VoiceEmbedResponse{}, err + } + return pb.VoiceEmbedResponse{Embedding: emb, Model: v.opts.modelName}, nil +} + +func (v *VoiceDetect) embedPath(path string) ([]float32, error) { + var vec uintptr + var dim int32 + rc := CppEmbedPath(v.ctxPtr, path, unsafe.Pointer(&vec), unsafe.Pointer(&dim)) + if rc != 0 || vec == 0 || dim <= 0 { + return nil, v.lastErr("embed", path) + } + defer CppFreeVec(vec) + // Copy out of the C-owned malloc'd buffer before freeing it. The + // uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell + // a C heap pointer from Go-managed memory; safe here, the GC neither tracks + // nor moves this buffer and we copy immediately. 
+ src := unsafe.Slice((*float32)(unsafe.Pointer(vec)), int(dim)) //nolint:govet // C-owned malloc'd vector, copied out before free + out := make([]float32, int(dim)) + copy(out, src) + return out, nil +} + +// VoiceVerify embeds two clips and reports whether they are the same speaker by +// cosine distance against a threshold. A request threshold <= 0 falls back to +// the model-configured default (verify_threshold option, 0.25 if unset). +func (v *VoiceDetect) VoiceVerify(req *pb.VoiceVerifyRequest) (pb.VoiceVerifyResponse, error) { + if v.ctxPtr == 0 { + return pb.VoiceVerifyResponse{}, errors.New("voice-detect: model not loaded") + } + if req.Audio1 == "" || req.Audio2 == "" { + return pb.VoiceVerifyResponse{}, errors.New("voice-detect: audio1 and audio2 are required") + } + + threshold := req.Threshold + if threshold <= 0 { + threshold = v.opts.verifyThreshold + } + + started := time.Now() + var distance float32 + var verified int32 + rc := CppVerifyPaths(v.ctxPtr, req.Audio1, req.Audio2, threshold, + unsafe.Pointer(&distance), unsafe.Pointer(&verified)) + if rc != 0 { + return pb.VoiceVerifyResponse{}, v.lastErr("verify", req.Audio1+","+req.Audio2) + } + elapsedMs := float32(time.Since(started).Seconds() * 1000.0) + + // Confidence decays linearly from 100 at distance 0 to 0 at the threshold, + // matching the Python speaker-recognition backend's reporting. + confidence := float32(0) + if threshold > 0 { + confidence = float32(math.Max(0, math.Min(100, (1.0-float64(distance)/float64(threshold))*100.0))) + } + + return pb.VoiceVerifyResponse{ + Verified: verified != 0, + Distance: distance, + Threshold: threshold, + Confidence: confidence, + Model: v.opts.modelName, + ProcessingTimeMs: elapsedMs, + }, nil +} + +// VoiceAnalyze runs the age/gender/emotion heads on a single clip. 
The C-API +// always evaluates every supported head, so the request's actions filter is +// advisory and the full analysis is returned as a single segment (the engine +// does not produce time-bounded segments). +func (v *VoiceDetect) VoiceAnalyze(req *pb.VoiceAnalyzeRequest) (pb.VoiceAnalyzeResponse, error) { + if v.ctxPtr == 0 { + return pb.VoiceAnalyzeResponse{}, errors.New("voice-detect: model not loaded") + } + if req.Audio == "" { + return pb.VoiceAnalyzeResponse{}, errors.New("voice-detect: audio path is required") + } + + ptr := CppAnalyzeJSON(v.ctxPtr, req.Audio) + if ptr == 0 { + return pb.VoiceAnalyzeResponse{}, v.lastErr("analyze", req.Audio) + } + defer CppFreeString(ptr) + + seg, err := parseAnalyzeJSON(goStringFromCPtr(ptr)) + if err != nil { + return pb.VoiceAnalyzeResponse{}, fmt.Errorf("voice-detect: analyze JSON for %q: %w", req.Audio, err) + } + return pb.VoiceAnalyzeResponse{Segments: []*pb.VoiceAnalysis{seg}}, nil +} + +// analyzeJSON mirrors the document returned by voicedetect_capi_analyze_path_json: +// +// {"age":42.0, +// "gender":{"label":"female","female":0.88,"male":0.12}, +// "emotion":{"label":"neutral","scores":{"neutral":0.7, ...}}} +// +// gender is a mixed object (a "label" string plus per-class float scores), so +// it is decoded into raw messages and split in parseAnalyzeJSON. +type analyzeJSON struct { + Age float32 `json:"age"` + Gender map[string]json.RawMessage `json:"gender"` + Emotion struct { + Label string `json:"label"` + Scores map[string]float32 `json:"scores"` + } `json:"emotion"` +} + +// parseAnalyzeJSON maps the engine's analyze document onto a VoiceAnalysis. +// start/end stay 0: the model emits a single whole-utterance result, not +// time-bounded segments. 
+func parseAnalyzeJSON(doc string) (*pb.VoiceAnalysis, error) { + var a analyzeJSON + if err := json.Unmarshal([]byte(doc), &a); err != nil { + return nil, err + } + + seg := &pb.VoiceAnalysis{ + Age: a.Age, + DominantEmotion: a.Emotion.Label, + Emotion: a.Emotion.Scores, + } + + if len(a.Gender) > 0 { + gender := make(map[string]float32, len(a.Gender)) + for k, raw := range a.Gender { + if k == "label" { + _ = json.Unmarshal(raw, &seg.DominantGender) + continue + } + var score float32 + if err := json.Unmarshal(raw, &score); err == nil { + gender[k] = score + } + } + seg.Gender = gender + } + + return seg, nil +} + +// lastErr wraps the C-API's per-ctx last-error buffer into a Go error. +func (v *VoiceDetect) lastErr(op, subject string) error { + msg := strings.TrimSpace(CppLastError(v.ctxPtr)) + if msg == "" { + msg = "no error detail" + } + return fmt.Errorf("voice-detect: %s failed for %q: %s", op, subject, msg) +} + +// goStringFromCPtr copies a NUL-terminated C string into Go memory. cptr is a +// malloc'd buffer the caller owns; release it via CppFreeString after the copy. +// +// The uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell +// a C heap pointer from Go-managed memory. Safe here: the GC neither tracks nor +// moves the buffer and we dereference it immediately to copy the bytes out. 
+func goStringFromCPtr(cptr uintptr) string { + if cptr == 0 { + return "" + } + p := unsafe.Pointer(cptr) //nolint:govet // C-owned malloc'd buffer, not Go-GC memory (see doc above) + n := 0 + for *(*byte)(unsafe.Add(p, n)) != 0 { + n++ + } + return string(unsafe.Slice((*byte)(p), n)) +} diff --git a/backend/go/voice-detect/govoicedetect_test.go b/backend/go/voice-detect/govoicedetect_test.go new file mode 100644 index 000000000000..2de7fcc8ac35 --- /dev/null +++ b/backend/go/voice-detect/govoicedetect_test.go @@ -0,0 +1,144 @@ +package main + +import ( + "os" + "sync" + "testing" + + "github.com/ebitengine/purego" + pb "github.com/mudler/LocalAI/pkg/grpc/proto" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestVoiceDetect(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "voice-detect Backend Suite") +} + +var ( + libLoadOnce sync.Once + libLoadErr error +) + +// ensureLibLoaded mirrors main.go's bootstrap so a Go test can drive the C-API +// bridge without spinning up the gRPC server. Records the error (the smoke +// specs skip themselves) when libvoicedetect.so is not loadable from cwd +// (LD_LIBRARY_PATH or a symlink in ./). 
+func ensureLibLoaded() error { + libLoadOnce.Do(func() { + libName := os.Getenv("VOICEDETECT_LIBRARY") + if libName == "" { + libName = "libvoicedetect.so" + } + lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL) + if err != nil { + libLoadErr = err + return + } + purego.RegisterLibFunc(&CppAbiVersion, lib, "voicedetect_capi_abi_version") + purego.RegisterLibFunc(&CppLoad, lib, "voicedetect_capi_load") + purego.RegisterLibFunc(&CppFree, lib, "voicedetect_capi_free") + purego.RegisterLibFunc(&CppLastError, lib, "voicedetect_capi_last_error") + purego.RegisterLibFunc(&CppFreeString, lib, "voicedetect_capi_free_string") + purego.RegisterLibFunc(&CppFreeVec, lib, "voicedetect_capi_free_vec") + purego.RegisterLibFunc(&CppEmbedPath, lib, "voicedetect_capi_embed_path") + purego.RegisterLibFunc(&CppEmbedPCM, lib, "voicedetect_capi_embed_pcm") + purego.RegisterLibFunc(&CppVerifyPaths, lib, "voicedetect_capi_verify_paths") + purego.RegisterLibFunc(&CppAnalyzeJSON, lib, "voicedetect_capi_analyze_path_json") + }) + return libLoadErr +} + +var _ = Describe("parseOptions", func() { + It("defaults verify_threshold to 0.25", func() { + o := parseOptions(nil) + Expect(o.verifyThreshold).To(Equal(float32(0.25))) + Expect(o.modelName).To(Equal("")) + }) + + It("parses verify_threshold, threshold alias and model_name", func() { + o := parseOptions([]string{"verify_threshold:0.4", "model_name:ecapa", "unknown:x"}) + Expect(o.verifyThreshold).To(Equal(float32(0.4))) + Expect(o.modelName).To(Equal("ecapa")) + + o2 := parseOptions([]string{"threshold:0.3"}) + Expect(o2.verifyThreshold).To(Equal(float32(0.3))) + }) + + It("ignores non-positive thresholds and keeps the default", func() { + o := parseOptions([]string{"verify_threshold:0", "threshold:-1"}) + Expect(o.verifyThreshold).To(Equal(float32(0.25))) + }) +}) + +var _ = Describe("parseAnalyzeJSON", func() { + It("maps age, gender label+scores and emotion label+scores", func() { + doc := `{"age":42.0, + 
"gender":{"label":"female","female":0.88,"male":0.12}, + "emotion":{"label":"neutral","scores":{"neutral":0.7,"happy":0.2,"sad":0.1}}}` + seg, err := parseAnalyzeJSON(doc) + Expect(err).ToNot(HaveOccurred()) + Expect(seg.Age).To(BeNumerically("~", 42.0, 1e-4)) + Expect(seg.Start).To(Equal(float32(0))) + Expect(seg.End).To(Equal(float32(0))) + + Expect(seg.DominantGender).To(Equal("female")) + Expect(seg.Gender).To(HaveKeyWithValue("female", BeNumerically("~", 0.88, 1e-4))) + Expect(seg.Gender).To(HaveKeyWithValue("male", BeNumerically("~", 0.12, 1e-4))) + // The "label" entry is consumed into DominantGender, not the score map. + Expect(seg.Gender).ToNot(HaveKey("label")) + + Expect(seg.DominantEmotion).To(Equal("neutral")) + Expect(seg.Emotion).To(HaveKeyWithValue("neutral", BeNumerically("~", 0.7, 1e-4))) + Expect(seg.Emotion).To(HaveKeyWithValue("happy", BeNumerically("~", 0.2, 1e-4))) + }) + + It("tolerates a missing gender block", func() { + seg, err := parseAnalyzeJSON(`{"age":30.0,"emotion":{"label":"happy","scores":{"happy":1.0}}}`) + Expect(err).ToNot(HaveOccurred()) + Expect(seg.DominantGender).To(Equal("")) + Expect(seg.DominantEmotion).To(Equal("happy")) + }) + + It("returns an error on malformed JSON", func() { + _, err := parseAnalyzeJSON(`{not-json`) + Expect(err).To(HaveOccurred()) + }) +}) + +// The specs below exercise the real C-API end to end. They run only when both a +// model GGUF and a test WAV are provided, and skip cleanly otherwise so the +// suite stays green without large assets. 
+var _ = Describe("VoiceDetect end-to-end", Ordered, func() { + var ( + v *VoiceDetect + modelPath = os.Getenv("VOICEDETECT_BACKEND_TEST_MODEL") + wavPath = os.Getenv("VOICEDETECT_BACKEND_TEST_WAV") + ) + + BeforeAll(func() { + if modelPath == "" || wavPath == "" { + Skip("set VOICEDETECT_BACKEND_TEST_MODEL and VOICEDETECT_BACKEND_TEST_WAV to run the e2e specs") + } + if err := ensureLibLoaded(); err != nil { + Skip("libvoicedetect.so not loadable: " + err.Error()) + } + v = &VoiceDetect{} + Expect(v.Load(&pb.ModelOptions{ModelFile: modelPath})).To(Succeed()) + }) + + It("embeds an audio clip", func() { + resp, err := v.VoiceEmbed(&pb.VoiceEmbedRequest{Audio: wavPath}) + Expect(err).ToNot(HaveOccurred()) + Expect(resp.Embedding).ToNot(BeEmpty()) + Expect(resp.Model).ToNot(BeEmpty()) + }) + + It("verifies a clip against itself as the same speaker", func() { + resp, err := v.VoiceVerify(&pb.VoiceVerifyRequest{Audio1: wavPath, Audio2: wavPath}) + Expect(err).ToNot(HaveOccurred()) + Expect(resp.Verified).To(BeTrue()) + Expect(resp.Distance).To(BeNumerically("<=", resp.Threshold)) + }) +}) diff --git a/backend/go/voice-detect/main.go b/backend/go/voice-detect/main.go new file mode 100644 index 000000000000..35421b5c36ee --- /dev/null +++ b/backend/go/voice-detect/main.go @@ -0,0 +1,64 @@ +package main + +// Started internally by LocalAI - one gRPC server per loaded model. +// +// Loads libvoicedetect.so via purego and registers the flat C-API entry points +// declared in voicedetect_capi.h. The library name can be overridden with +// VOICEDETECT_LIBRARY (mirrors the PARAKEET_LIBRARY / OMNIVOICE_LIBRARY +// convention in the sibling backends); the default looks for the .so next to +// this binary (resolved via LD_LIBRARY_PATH by run.sh). 
+import ( + "flag" + "fmt" + "os" + + "github.com/ebitengine/purego" + grpc "github.com/mudler/LocalAI/pkg/grpc" +) + +var ( + addr = flag.String("addr", "localhost:50051", "the address to connect to") +) + +type LibFuncs struct { + FuncPtr any + Name string +} + +func main() { + libName := os.Getenv("VOICEDETECT_LIBRARY") + if libName == "" { + libName = "libvoicedetect.so" + } + + lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL) + if err != nil { + panic(fmt.Errorf("voice-detect: dlopen %q: %w", libName, err)) + } + + // Bound 1:1 to voicedetect_capi.h. char*/float* returns are registered as + // uintptr so the raw pointer can be freed via the matching capi free fn. + libFuncs := []LibFuncs{ + {&CppAbiVersion, "voicedetect_capi_abi_version"}, + {&CppLoad, "voicedetect_capi_load"}, + {&CppFree, "voicedetect_capi_free"}, + {&CppLastError, "voicedetect_capi_last_error"}, + {&CppFreeString, "voicedetect_capi_free_string"}, + {&CppFreeVec, "voicedetect_capi_free_vec"}, + {&CppEmbedPath, "voicedetect_capi_embed_path"}, + {&CppEmbedPCM, "voicedetect_capi_embed_pcm"}, + {&CppVerifyPaths, "voicedetect_capi_verify_paths"}, + {&CppAnalyzeJSON, "voicedetect_capi_analyze_path_json"}, + } + for _, lf := range libFuncs { + purego.RegisterLibFunc(lf.FuncPtr, lib, lf.Name) + } + + fmt.Fprintf(os.Stderr, "[voice-detect] ABI=%d\n", CppAbiVersion()) + + flag.Parse() + + if err := grpc.StartServer(*addr, &VoiceDetect{}); err != nil { + panic(err) + } +} diff --git a/backend/go/voice-detect/options.go b/backend/go/voice-detect/options.go new file mode 100644 index 000000000000..c5a6e2595cb2 --- /dev/null +++ b/backend/go/voice-detect/options.go @@ -0,0 +1,46 @@ +package main + +import ( + "strconv" + "strings" +) + +// defaultVerifyThreshold is the cosine-distance cutoff used when a request does +// not set one. Matches the Python speaker-recognition backend's default so the +// two implementations agree on verdicts out of the box. 
+const defaultVerifyThreshold float32 = 0.25 + +// loadOptions holds the parsed model-level options for voice-detect. +type loadOptions struct { + verifyThreshold float32 + modelName string +} + +func splitOption(o string) (key, value string, ok bool) { + i := strings.Index(o, ":") + if i < 0 { + return "", "", false + } + return strings.TrimSpace(o[:i]), strings.TrimSpace(o[i+1:]), true +} + +// parseOptions reads the backend "key:value" option slice. Unknown keys and +// unparsable or non-positive thresholds are ignored. Defaults: verify_threshold 0.25, model_name "" (presumably derived from the file by the caller - TODO confirm). +func parseOptions(opts []string) loadOptions { + o := loadOptions{verifyThreshold: defaultVerifyThreshold} + for _, oo := range opts { + key, value, ok := splitOption(oo) + if !ok { + continue + } + switch key { + case "verify_threshold", "threshold": + if f, err := strconv.ParseFloat(value, 32); err == nil && f > 0 { + o.verifyThreshold = float32(f) + } + case "model_name": + o.modelName = value + } + } + return o +} diff --git a/backend/go/voice-detect/package.sh b/backend/go/voice-detect/package.sh new file mode 100755 index 000000000000..de95c8ce2d16 --- /dev/null +++ b/backend/go/voice-detect/package.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# +# Bundle the voice-detect-grpc binary, libvoicedetect.so, the core runtime libs +# (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active BUILD_TYPE +# so the package is self-contained. Mirrors backend/go/parakeet-cpp/package.sh; +# run.sh routes the (CGO_ENABLED=0) binary through lib/ld.so so the packaged libc +# is used instead of the host's. + +set -e + +CURDIR=$(dirname "$(realpath "$0")") +REPO_ROOT="${CURDIR}/../../.." + +mkdir -p "$CURDIR/package/lib" + +cp -avf "$CURDIR/voice-detect-grpc" "$CURDIR/package/" +cp -avf "$CURDIR/run.sh" "$CURDIR/package/" + +# libvoicedetect.so + any soname symlinks. purego.Dlopen resolves it via +# LD_LIBRARY_PATH, which run.sh points at lib/. 
+cp -avf "$CURDIR"/libvoicedetect.so* "$CURDIR/package/lib/" 2>/dev/null || { + echo "ERROR: libvoicedetect.so not found in $CURDIR, run 'make' first" >&2 + exit 1 +} + +# Detect architecture and copy the core runtime libs libvoicedetect.so links +# against, plus the matching dynamic loader as lib/ld.so. +if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then + echo "Detected x86_64 architecture, copying x86_64 libraries..." + cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so" + cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6" + cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1" + cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6" + cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6" + cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1" + cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2" + cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1" + cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0" +elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then + echo "Detected ARM64 architecture, copying ARM64 libraries..." 
+ cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so" + cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6" + cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1" + cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6" + cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6" + cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1" + cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2" + cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1" + cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0" +elif [ "$(uname -s)" = "Darwin" ]; then + echo "Detected Darwin" +else + echo "Error: Could not detect architecture" + exit 1 +fi + +# Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers) based on +# BUILD_TYPE so the backend can reach the GPU without the runtime base image +# shipping those drivers. +GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh" +if [ -f "$GPU_LIB_SCRIPT" ]; then + echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..." + source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib" + package_gpu_libs +fi + +echo "Packaging completed successfully" +ls -liah "$CURDIR/package/" "$CURDIR/package/lib/" diff --git a/backend/go/voice-detect/run.sh b/backend/go/voice-detect/run.sh new file mode 100755 index 000000000000..ea5fef5083b7 --- /dev/null +++ b/backend/go/voice-detect/run.sh @@ -0,0 +1,16 @@ +#!/bin/bash +set -e + +CURDIR=$(dirname "$(realpath "$0")") + +export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}" + +# If a self-contained ld.so was packaged, route through it so the packaged +# libc / libstdc++ are used instead of the host's (matches the whisper / +# parakeet backends' runtime layout). 
+if [ -f "$CURDIR/lib/ld.so" ]; then + echo "Using lib/ld.so" + exec "$CURDIR/lib/ld.so" "$CURDIR/voice-detect-grpc" "$@" +fi + +exec "$CURDIR/voice-detect-grpc" "$@" diff --git a/backend/go/voice-detect/test.sh b/backend/go/voice-detect/test.sh new file mode 100755 index 000000000000..17addfebf59c --- /dev/null +++ b/backend/go/voice-detect/test.sh @@ -0,0 +1,14 @@ +#!/bin/bash +set -e + +CURDIR=$(dirname "$(realpath "$0")") +cd "$CURDIR" + +echo "Running voice-detect backend tests..." + +# The pure-Go parsing specs always run. The embed/verify smoke specs run +# only when a model + WAV are provided via VOICEDETECT_BACKEND_TEST_MODEL and +# VOICEDETECT_BACKEND_TEST_WAV; otherwise they auto-skip. +LD_LIBRARY_PATH="$CURDIR:${LD_LIBRARY_PATH:-}" go test -v -timeout 1200s . + +echo "voice-detect tests completed." diff --git a/backend/index.yaml b/backend/index.yaml index 3f61f7b4ee1b..57f5784e0ef9 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -209,6 +209,78 @@ nvidia-cuda-12: "cuda12-ced" nvidia-l4t-cuda-12: "nvidia-l4t-arm64-ced" nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ced" +- &voicedetect + name: "voice-detect" + alias: "voice-detect" + license: mit + icon: https://avatars.githubusercontent.com/u/95302084 + description: | + voice-detect speaker recognition and voice analysis. + voice-detect.cpp is a C++/ggml engine that produces L2-normalised + speaker embeddings (ECAPA-TDNN, WeSpeaker ResNet34, 3D-Speaker + ERes2Net, CAM++) for voice verification and 1:N identification, plus + a wav2vec2 age / gender / emotion analysis head. It replaces the + Python speaker-recognition backend and is exposed through the Voice* + gRPC rpcs and the /v1/voice/* REST endpoints. It runs on CPU, NVIDIA + CUDA, AMD ROCm/HIP, Intel SYCL, Vulkan and NVIDIA Jetson (L4T) targets. 
+ urls: + - https://github.com/mudler/voice-detect.cpp + tags: + - voice-recognition + - speaker-verification + - speaker-embedding + - CPU + - GPU + - CUDA + - HIP + capabilities: + default: "cpu-voice-detect" + nvidia: "cuda12-voice-detect" + intel: "intel-sycl-f16-voice-detect" + metal: "metal-voice-detect" + amd: "rocm-voice-detect" + vulkan: "vulkan-voice-detect" + nvidia-l4t: "nvidia-l4t-arm64-voice-detect" + nvidia-cuda-13: "cuda13-voice-detect" + nvidia-cuda-12: "cuda12-voice-detect" + nvidia-l4t-cuda-12: "nvidia-l4t-arm64-voice-detect" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-voice-detect" +- &facedetect + name: "face-detect" + alias: "face-detect" + license: mit + icon: https://avatars.githubusercontent.com/u/95302084 + description: | + face-detect face detection, embedding, verification and analysis. + face-detect.cpp is a C++/ggml engine that runs SCRFD / YuNet face + detection and ArcFace / SFace 512-d (or 128-d) L2-normalised face + embeddings for verification and 1:N identification, plus a landmark / + age / gender analysis head. It replaces the Python insightface backend + and is exposed through the Embedding, Detect and Face* gRPC rpcs and + the /v1/face/* REST endpoints. It runs on CPU, NVIDIA CUDA, AMD + ROCm/HIP, Intel SYCL, Vulkan and NVIDIA Jetson (L4T) targets. 
+ urls: + - https://github.com/mudler/face-detect.cpp + tags: + - face-recognition + - face-verification + - face-embedding + - CPU + - GPU + - CUDA + - HIP + capabilities: + default: "cpu-face-detect" + nvidia: "cuda12-face-detect" + intel: "intel-sycl-f16-face-detect" + metal: "metal-face-detect" + amd: "rocm-face-detect" + vulkan: "vulkan-face-detect" + nvidia-l4t: "nvidia-l4t-arm64-face-detect" + nvidia-cuda-13: "cuda13-face-detect" + nvidia-cuda-12: "cuda12-face-detect" + nvidia-l4t-cuda-12: "nvidia-l4t-arm64-face-detect" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-face-detect" - &voxtral name: "voxtral" alias: "voxtral" @@ -2796,6 +2868,236 @@ uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-ced" mirrors: - localai/localai-backends:master-gpu-nvidia-cuda-13-ced +## voice-detect +- !!merge <<: *voicedetect + name: "voice-detect-development" + capabilities: + default: "cpu-voice-detect-development" + nvidia: "cuda12-voice-detect-development" + intel: "intel-sycl-f16-voice-detect-development" + metal: "metal-voice-detect-development" + amd: "rocm-voice-detect-development" + vulkan: "vulkan-voice-detect-development" + nvidia-l4t: "nvidia-l4t-arm64-voice-detect-development" + nvidia-cuda-13: "cuda13-voice-detect-development" + nvidia-cuda-12: "cuda12-voice-detect-development" + nvidia-l4t-cuda-12: "nvidia-l4t-arm64-voice-detect-development" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-voice-detect-development" +- !!merge <<: *voicedetect + name: "nvidia-l4t-arm64-voice-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-voice-detect" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-arm64-voice-detect +- !!merge <<: *voicedetect + name: "nvidia-l4t-arm64-voice-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-voice-detect" + mirrors: + - localai/localai-backends:master-nvidia-l4t-arm64-voice-detect +- !!merge <<: *voicedetect + name: "cuda13-nvidia-l4t-arm64-voice-detect" 
+ uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-voice-detect" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-voice-detect +- !!merge <<: *voicedetect + name: "cuda13-nvidia-l4t-arm64-voice-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-voice-detect" + mirrors: + - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-voice-detect +- !!merge <<: *voicedetect + name: "cpu-voice-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-voice-detect" + mirrors: + - localai/localai-backends:latest-cpu-voice-detect +- !!merge <<: *voicedetect + name: "cpu-voice-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-cpu-voice-detect" + mirrors: + - localai/localai-backends:master-cpu-voice-detect +- !!merge <<: *voicedetect + name: "metal-voice-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-voice-detect" + mirrors: + - localai/localai-backends:latest-metal-darwin-arm64-voice-detect +- !!merge <<: *voicedetect + name: "metal-voice-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-voice-detect" + mirrors: + - localai/localai-backends:master-metal-darwin-arm64-voice-detect +- !!merge <<: *voicedetect + name: "cuda12-voice-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-voice-detect" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-12-voice-detect +- !!merge <<: *voicedetect + name: "cuda12-voice-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-voice-detect" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-12-voice-detect +- !!merge <<: *voicedetect + name: "rocm-voice-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-voice-detect" + mirrors: + - localai/localai-backends:latest-gpu-rocm-hipblas-voice-detect +- !!merge <<: *voicedetect + name: 
"rocm-voice-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-voice-detect" + mirrors: + - localai/localai-backends:master-gpu-rocm-hipblas-voice-detect +- !!merge <<: *voicedetect + name: "intel-sycl-f32-voice-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-voice-detect" + mirrors: + - localai/localai-backends:latest-gpu-intel-sycl-f32-voice-detect +- !!merge <<: *voicedetect + name: "intel-sycl-f32-voice-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-voice-detect" + mirrors: + - localai/localai-backends:master-gpu-intel-sycl-f32-voice-detect +- !!merge <<: *voicedetect + name: "intel-sycl-f16-voice-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-voice-detect" + mirrors: + - localai/localai-backends:latest-gpu-intel-sycl-f16-voice-detect +- !!merge <<: *voicedetect + name: "intel-sycl-f16-voice-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-voice-detect" + mirrors: + - localai/localai-backends:master-gpu-intel-sycl-f16-voice-detect +- !!merge <<: *voicedetect + name: "vulkan-voice-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-voice-detect" + mirrors: + - localai/localai-backends:latest-gpu-vulkan-voice-detect +- !!merge <<: *voicedetect + name: "vulkan-voice-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-voice-detect" + mirrors: + - localai/localai-backends:master-gpu-vulkan-voice-detect +- !!merge <<: *voicedetect + name: "cuda13-voice-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-voice-detect" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-13-voice-detect +- !!merge <<: *voicedetect + name: "cuda13-voice-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-voice-detect" + mirrors: + - 
localai/localai-backends:master-gpu-nvidia-cuda-13-voice-detect +## face-detect +- !!merge <<: *facedetect + name: "face-detect-development" + capabilities: + default: "cpu-face-detect-development" + nvidia: "cuda12-face-detect-development" + intel: "intel-sycl-f16-face-detect-development" + metal: "metal-face-detect-development" + amd: "rocm-face-detect-development" + vulkan: "vulkan-face-detect-development" + nvidia-l4t: "nvidia-l4t-arm64-face-detect-development" + nvidia-cuda-13: "cuda13-face-detect-development" + nvidia-cuda-12: "cuda12-face-detect-development" + nvidia-l4t-cuda-12: "nvidia-l4t-arm64-face-detect-development" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-face-detect-development" +- !!merge <<: *facedetect + name: "nvidia-l4t-arm64-face-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-face-detect" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-arm64-face-detect +- !!merge <<: *facedetect + name: "nvidia-l4t-arm64-face-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-face-detect" + mirrors: + - localai/localai-backends:master-nvidia-l4t-arm64-face-detect +- !!merge <<: *facedetect + name: "cuda13-nvidia-l4t-arm64-face-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-face-detect" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-face-detect +- !!merge <<: *facedetect + name: "cuda13-nvidia-l4t-arm64-face-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-face-detect" + mirrors: + - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-face-detect +- !!merge <<: *facedetect + name: "cpu-face-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-face-detect" + mirrors: + - localai/localai-backends:latest-cpu-face-detect +- !!merge <<: *facedetect + name: "cpu-face-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-cpu-face-detect" + mirrors: 
+ - localai/localai-backends:master-cpu-face-detect +- !!merge <<: *facedetect + name: "metal-face-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-face-detect" + mirrors: + - localai/localai-backends:latest-metal-darwin-arm64-face-detect +- !!merge <<: *facedetect + name: "metal-face-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-face-detect" + mirrors: + - localai/localai-backends:master-metal-darwin-arm64-face-detect +- !!merge <<: *facedetect + name: "cuda12-face-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-face-detect" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-12-face-detect +- !!merge <<: *facedetect + name: "cuda12-face-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-face-detect" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-12-face-detect +- !!merge <<: *facedetect + name: "rocm-face-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-face-detect" + mirrors: + - localai/localai-backends:latest-gpu-rocm-hipblas-face-detect +- !!merge <<: *facedetect + name: "rocm-face-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-face-detect" + mirrors: + - localai/localai-backends:master-gpu-rocm-hipblas-face-detect +- !!merge <<: *facedetect + name: "intel-sycl-f32-face-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-face-detect" + mirrors: + - localai/localai-backends:latest-gpu-intel-sycl-f32-face-detect +- !!merge <<: *facedetect + name: "intel-sycl-f32-face-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-face-detect" + mirrors: + - localai/localai-backends:master-gpu-intel-sycl-f32-face-detect +- !!merge <<: *facedetect + name: "intel-sycl-f16-face-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-face-detect" + mirrors: + - 
localai/localai-backends:latest-gpu-intel-sycl-f16-face-detect +- !!merge <<: *facedetect + name: "intel-sycl-f16-face-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-face-detect" + mirrors: + - localai/localai-backends:master-gpu-intel-sycl-f16-face-detect +- !!merge <<: *facedetect + name: "vulkan-face-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-face-detect" + mirrors: + - localai/localai-backends:latest-gpu-vulkan-face-detect +- !!merge <<: *facedetect + name: "vulkan-face-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-face-detect" + mirrors: + - localai/localai-backends:master-gpu-vulkan-face-detect +- !!merge <<: *facedetect + name: "cuda13-face-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-face-detect" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-13-face-detect +- !!merge <<: *facedetect + name: "cuda13-face-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-face-detect" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-13-face-detect ## stablediffusion-ggml - !!merge <<: *stablediffusionggml name: "cpu-stablediffusion-ggml" diff --git a/core/config/backend_capabilities.go b/core/config/backend_capabilities.go index cc9567887e30..d54463a8e4f9 100644 --- a/core/config/backend_capabilities.go +++ b/core/config/backend_capabilities.go @@ -542,6 +542,19 @@ var BackendCapabilities = map[string]BackendCapability{ DefaultUsecases: []string{UsecaseSpeakerRecognition}, Description: "Speaker recognition — voice identity verification and analysis", }, + "voice-detect": { + GRPCMethods: []GRPCMethod{MethodVoiceVerify, MethodVoiceEmbed, MethodVoiceAnalyze}, + PossibleUsecases: []string{UsecaseSpeakerRecognition}, + DefaultUsecases: []string{UsecaseSpeakerRecognition}, + Description: "voice-detect.cpp: C++/ggml speaker embedding, verification and voice analysis 
(age/gender/emotion)", + }, + "face-detect": { + GRPCMethods: []GRPCMethod{MethodEmbedding, MethodDetect, MethodFaceVerify, MethodFaceAnalyze}, + PossibleUsecases: []string{UsecaseEmbeddings, UsecaseDetection, UsecaseFaceRecognition}, + DefaultUsecases: []string{UsecaseFaceRecognition}, + AcceptsImages: true, + Description: "face-detect.cpp: C++/ggml face detection, embedding, verification and attribute analysis", + }, "silero-vad": { GRPCMethods: []GRPCMethod{MethodVAD}, PossibleUsecases: []string{UsecaseVAD}, diff --git a/docs/content/features/face-recognition.md b/docs/content/features/face-recognition.md index ecc3e721382d..7bddc702ff52 100644 --- a/docs/content/features/face-recognition.md +++ b/docs/content/features/face-recognition.md @@ -7,16 +7,93 @@ url = "/features/face-recognition/" ![Face recognition: 1:N match against a vector store, with an anti-spoofing liveness gate that can veto a verification](/images/diagrams/face-recognition-flow.png) -LocalAI supports face recognition through the `insightface` backend: -face verification (1:1), face identification (1:N) against a built-in -vector store, face embedding, face detection, demographic analysis -(age / gender), and antispoofing / liveness detection. +LocalAI supports face recognition: face verification (1:1), face +identification (1:N) against a built-in vector store, face embedding, +face detection, demographic analysis (age / gender), and antispoofing / +liveness detection. -The backend ships **two interchangeable engines** under one image, each -paired with a distinct gallery entry so users can pick by license and -accuracy needs. +The same `/v1/face/*` HTTP API is served by two backends: -## Licensing — read this first +- **`face-detect` (recommended, default).** A standalone C++/ggml + engine ([face-detect.cpp](https://github.com/mudler/face-detect.cpp)): + no Python, no onnxruntime, no torch runtime. Each gallery entry is a + single self-describing GGUF. 
This is the recommended option for new + deployments. +- **`insightface` (Python).** The original ONNX Runtime backend. Still + supported; see [the Python backend](#insightface-python-backend) below. + +Both backends expose the identical wire format, so the API examples on +this page work with either - only the gallery entry name (the `model` +field) changes. + +## face-detect (ggml) backend + +The `face-detect` backend reads the detector and recognizer architecture +(`facedetect.arch`) directly from the GGUF metadata, so installing a +gallery entry is all that is needed to select an engine. It drives the +Embedding / Detect / FaceVerify / FaceAnalyze gRPC rpcs behind the +`/v1/face/{embed,verify,analyze,detect,register,identify,forget}` +endpoints. + +### Licensing - read this first + +| Gallery entry | Detector + recognizer | Embedding dim | License | +|---|---|---|---| +| `face-detect-buffalo-l` | SCRFD-10GF + ArcFace R50 + GenderAge | 512 | **Non-commercial research only** (upstream insightface weights) | +| `face-detect-buffalo-m` | SCRFD-2.5GF + ArcFace R50 + GenderAge | 512 | **Non-commercial research only** | +| `face-detect-buffalo-s` | SCRFD-500MF + MBF + GenderAge | 512 | **Non-commercial research only** | +| `face-detect-yunet-sface` | YuNet + SFace (OpenCV Zoo) | 128 | **Apache 2.0 - commercial-safe** | + +The insightface buffalo packs (buffalo_l / buffalo_m / buffalo_s) are +released by the upstream maintainers for **non-commercial research use +only**. Pick the `face-detect-yunet-sface` entry for production / +commercial deployments. 
+ +### Quickstart + +Install the commercial-safe entry (recommended for copy-paste): + +```bash +local-ai models install face-detect-yunet-sface +``` + +Verify that two images depict the same person: + +```bash +curl -sX POST http://localhost:8080/v1/face/verify \ + -H "Content-Type: application/json" \ + -d '{ + "model": "face-detect-yunet-sface", + "img1": "https://example.com/alice_1.jpg", + "img2": "https://example.com/alice_2.jpg" + }' +``` + +Detect faces and analyze demographics (buffalo entries populate +age / gender; YuNet + SFace returns regions only): + +```bash +curl -sX POST http://localhost:8080/v1/face/detect \ + -H "Content-Type: application/json" \ + -d '{"model": "face-detect-buffalo-l", "img": "https://example.com/group.jpg"}' + +curl -sX POST http://localhost:8080/v1/face/analyze \ + -H "Content-Type: application/json" \ + -d '{"model": "face-detect-buffalo-l", "img": "https://example.com/alice.jpg"}' +``` + +The 1:N register / identify / forget workflow and the rest of the API +are identical to the [API reference](#api-reference) below - just pass a +`face-detect-*` model name. The per-engine verify thresholds are ~0.35 +for the buffalo ArcFace/MBF recognizers and ~0.363 for SFace. + +## insightface (Python) backend + +The `insightface` backend ships **two interchangeable engines** under +one image, each paired with a distinct gallery entry so users can pick +by license and accuracy needs. 
+ +### Licensing - read this first | Gallery entry | Detector + recognizer | Size | License | |---|---|---|---| diff --git a/docs/content/features/voice-recognition.md b/docs/content/features/voice-recognition.md index 20728a28f724..aed5d5bf68be 100644 --- a/docs/content/features/voice-recognition.md +++ b/docs/content/features/voice-recognition.md @@ -7,16 +7,92 @@ url = "/features/voice-recognition/" ![Voice recognition: register, identify, and forget voiceprints in a vector store, for 1:1 verify or 1:N identify](/images/diagrams/voice-recognition-flow.png) -LocalAI supports voice (speaker) recognition through the -`speaker-recognition` backend: speaker verification (1:1), speaker -identification (1:N) against a built-in vector store, speaker -embedding, and demographic analysis (age / gender / emotion from -voice). +LocalAI supports voice (speaker) recognition: speaker verification +(1:1), speaker identification (1:N) against a built-in vector store, +speaker embedding, and demographic analysis (age / gender / emotion +from voice). The audio analog to [Face Recognition](/features/face-recognition/), -following the same two-engine pattern under one image. +served over the same `/v1/voice/*` HTTP API by two backends: -## Engines +- **`voice-detect` (recommended, default).** A standalone C++/ggml + engine ([voice-detect.cpp](https://github.com/mudler/voice-detect.cpp)): + no Python, no onnxruntime, no torch runtime. Each gallery entry is a + single self-describing GGUF. This is the recommended option for new + deployments. +- **`speaker-recognition` (Python).** The original SpeechBrain / ONNX + backend. Still supported; see [the Python backend](#speaker-recognition-python-backend) + below. + +Both backends expose the identical wire format, so the API examples on +this page work with either - only the gallery entry name (the `model` +field) changes. 
+ +## voice-detect (ggml) backend + +The `voice-detect` backend reads the embedding (or analysis) +architecture (`voicedetect.arch`) directly from the GGUF metadata, so +installing a gallery entry is all that is needed to select an engine. It +drives the VoiceEmbed / VoiceVerify / VoiceAnalyze gRPC rpcs behind the +`/v1/voice/{embed,verify,analyze,register,identify,forget}` endpoints. + +### Gallery entries + +| Gallery entry | Model | Embedding dim | License | +|---|---|---|---| +| `voice-detect-ecapa-tdnn` | SpeechBrain ECAPA-TDNN (VoxCeleb) | 192 | **Apache 2.0 - commercial-safe** | +| `voice-detect-wespeaker-resnet34` | WeSpeaker ResNet34 (VoxCeleb) | 256 | CC-BY-4.0 | +| `voice-detect-eres2net` | 3D-Speaker ERes2Net (VoxCeleb) | 192 | **Apache 2.0 - commercial-safe** | +| `voice-detect-campplus` | 3D-Speaker CAM++ (VoxCeleb) | 192 | **Apache 2.0 - commercial-safe** | +| `voice-detect-emotion-wav2vec2` | audEERING wav2vec2 (age / gender / emotion) | analyze head | **CC-BY-NC-SA-4.0 - non-commercial** | + +The four speaker-recognition entries drive verify / embed / identify. +`voice-detect-emotion-wav2vec2` is the analysis head behind +`/v1/voice/analyze` (continuous age estimate plus gender and emotion +class scores) and is **non-commercial / research use only**. 
+ +### Quickstart + +Install the default entry (recommended for copy-paste): + +```bash +local-ai models install voice-detect-ecapa-tdnn +``` + +Verify that two audio clips were spoken by the same person: + +```bash +curl -sX POST http://localhost:8080/v1/voice/verify \ + -H "Content-Type: application/json" \ + -d '{ + "model": "voice-detect-ecapa-tdnn", + "audio1": "https://example.com/alice_1.wav", + "audio2": "https://example.com/alice_2.wav" + }' +``` + +Analyze age / gender / emotion (install the analyze entry first): + +```bash +local-ai models install voice-detect-emotion-wav2vec2 + +curl -sX POST http://localhost:8080/v1/voice/analyze \ + -H "Content-Type: application/json" \ + -d '{"model": "voice-detect-emotion-wav2vec2", "audio": "https://example.com/alice.wav"}' +``` + +The 1:N register / identify / forget workflow and the rest of the API +are identical to the [API reference](#api-reference) below - just pass a +`voice-detect-*` model name. The default verify threshold is ~0.25 for +the ECAPA-TDNN / ERes2Net / CAM++ recognizers and ~0.30 for WeSpeaker +ResNet34. + +## speaker-recognition (Python) backend + +The `speaker-recognition` backend follows the same two-engine pattern +under one image. 
+ +### Engines | Gallery entry | Model | Size | License | |---|---|---|---| diff --git a/docs/content/reference/compatibility-table.md b/docs/content/reference/compatibility-table.md index 21971ff45cf0..0e9551b3bc3c 100644 --- a/docs/content/reference/compatibility-table.md +++ b/docs/content/reference/compatibility-table.md @@ -97,6 +97,8 @@ All backends listed here can be installed on demand from the [Backend Gallery]({ | [locate-anything.cpp](https://github.com/mudler/locate-anything.cpp) | Open-vocabulary object detection and visual grounding (LocateAnything-3B) in C/C++ using GGML | CPU, CUDA 12/13, Intel SYCL, Vulkan, Jetson L4T | | [depth-anything.cpp](https://github.com/mudler/depth-anything.cpp) | Depth Anything 3 monocular metric depth + camera pose in C/C++ using GGML | CPU, CUDA 12/13, Intel SYCL, Vulkan, Jetson L4T | | [sam3.cpp](https://github.com/PABannier/sam3.cpp) | Segment Anything (SAM 3/2/EdgeTAM) with text/point/box prompts in C/C++ using GGML | CPU, CUDA 12/13, Intel SYCL, Vulkan, Jetson L4T | +| [face-detect.cpp](https://github.com/mudler/face-detect.cpp) | Native face detection, recognition, embedding, demographics and anti-spoofing (SCRFD/ArcFace, YuNet/SFace) in C/C++ using GGML | CPU, CUDA 12/13, ROCm, Intel SYCL, Vulkan, Metal, Jetson L4T | +| [voice-detect.cpp](https://github.com/mudler/voice-detect.cpp) | Native speaker (voice) recognition and voice analysis (ECAPA-TDNN, WeSpeaker, ERes2Net, CAM++, wav2vec2) in C/C++ using GGML | CPU, CUDA 12/13, ROCm, Intel SYCL, Vulkan, Metal, Jetson L4T | | [insightface](https://github.com/deepinsight/insightface) | Face verification, embedding, and anti-spoofing liveness (ONNX Runtime) | CPU, CUDA 12 | | [speaker-recognition](https://speechbrain.github.io/) | Speaker (voice) recognition via SpeechBrain ECAPA-TDNN | CPU, CUDA 12, Metal | diff --git a/gallery/index.yaml b/gallery/index.yaml index cde505d721ea..5db26bf0dd51 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -8449,6 +8449,248 
@@ - filename: MiniFASNetV1SE.onnx sha256: ebab7f90c7833fbccd46d3a555410e78d969db5438e169b6524be444862b3676 uri: https://github.com/yakhyo/face-anti-spoofing/releases/download/weights/MiniFASNetV1SE.onnx +- name: face-detect-buffalo-l + url: github:mudler/LocalAI/gallery/virtual.yaml@master + urls: + - https://github.com/mudler/face-detect.cpp + - https://github.com/deepinsight/insightface + description: | + Face recognition with insightface's `buffalo_l` pack (SCRFD-10GF + detector + ResNet50 ArcFace 512-d embedder), ported to C++/ggml and + shipped as a single GGUF for the `face-detect` backend. Highest + accuracy of the buffalo line. + + No Python / onnxruntime / torch runtime: face-detect.cpp reads the + detector and embedder architecture (`facedetect.arch`) directly from + the GGUF metadata, so installing this entry is all that is needed to + select buffalo_l. Drives the Embedding / Detect / FaceVerify / + FaceAnalyze gRPC rpcs and the /v1/face/{verify,analyze,embed,detect} + REST endpoints. This GGUF also embeds the MiniFASNet anti-spoof + ensemble, available via the FaceVerify `anti_spoof` request flag. + NON-COMMERCIAL RESEARCH USE ONLY: for commercial use see + `face-detect-yunet-sface`. 
+ license: insightface-non-commercial + icon: https://avatars.githubusercontent.com/u/53104118?s=200&v=4 + tags: + - face-recognition + - face-verification + - face-embedding + - research-only + - gpu + - cpu + last_checked: "2026-06-22" + overrides: + backend: face-detect + known_usecases: + - face_recognition + - detection + - embeddings + options: + - verify_threshold:0.35 + parameters: + model: face-detect-buffalo-l.gguf + files: + - filename: face-detect-buffalo-l.gguf + sha256: 6ed070f6e569beeed542ddd5603bcbc9eb8ea57f728f7d8013d6a90b2b952116 + uri: https://huggingface.co/mudler/face-detect-gguf/resolve/main/buffalo_l.gguf +- name: face-detect-buffalo-m + url: github:mudler/LocalAI/gallery/virtual.yaml@master + urls: + - https://github.com/mudler/face-detect.cpp + - https://github.com/deepinsight/insightface + description: | + Face recognition with insightface's `buffalo_m` pack (SCRFD-2.5GF + detector + ResNet50 ArcFace embedder), converted to a C++/ggml GGUF + for the `face-detect` backend. Same recognition accuracy as + `buffalo_l` with a cheaper detector: a good balance on mid-range + hardware. + + The architecture (`facedetect.arch`) is read from the GGUF metadata, + so this entry alone selects the buffalo_m engine. This GGUF also + embeds the MiniFASNet anti-spoof ensemble, available via the + FaceVerify `anti_spoof` request flag. NON-COMMERCIAL RESEARCH USE + ONLY. 
+ license: insightface-non-commercial + icon: https://avatars.githubusercontent.com/u/53104118?s=200&v=4 + tags: + - face-recognition + - face-verification + - face-embedding + - research-only + - gpu + - cpu + last_checked: "2026-06-22" + overrides: + backend: face-detect + known_usecases: + - face_recognition + - detection + - embeddings + options: + - verify_threshold:0.35 + parameters: + model: face-detect-buffalo-m.gguf + files: + - filename: face-detect-buffalo-m.gguf + sha256: 0f7527eeb97b88719bf7e11e43ab8af6f05999357d767f8dde53db3c586c1c3f + uri: https://huggingface.co/mudler/face-detect-gguf/resolve/main/buffalo_m.gguf +- name: face-detect-buffalo-s + url: github:mudler/LocalAI/gallery/virtual.yaml@master + urls: + - https://github.com/mudler/face-detect.cpp + - https://github.com/deepinsight/insightface + description: | + Face recognition with insightface's `buffalo_s` pack (SCRFD-500MF + detector + MBF 512-d embedder), converted to a C++/ggml GGUF for the + `face-detect` backend. Small and CPU-friendly: a good fit for + mid-range and edge deployments. + + The architecture (`facedetect.arch`) is read from the GGUF metadata, + so this entry alone selects the buffalo_s engine. This GGUF also + embeds the MiniFASNet anti-spoof ensemble, available via the + FaceVerify `anti_spoof` request flag. NON-COMMERCIAL RESEARCH USE + ONLY. 
+ license: insightface-non-commercial + icon: https://avatars.githubusercontent.com/u/53104118?s=200&v=4 + tags: + - face-recognition + - face-verification + - face-embedding + - research-only + - edge + - cpu + last_checked: "2026-06-22" + overrides: + backend: face-detect + known_usecases: + - face_recognition + - detection + - embeddings + options: + - verify_threshold:0.35 + parameters: + model: face-detect-buffalo-s.gguf + files: + - filename: face-detect-buffalo-s.gguf + sha256: 7490b1efbc8746b188a5aef0adf5e3d1a2dc9607abd474018893f95571999969 + uri: https://huggingface.co/mudler/face-detect-gguf/resolve/main/buffalo_s.gguf +- name: face-detect-buffalo-sc + url: github:mudler/LocalAI/gallery/virtual.yaml@master + urls: + - https://github.com/mudler/face-detect.cpp + - https://github.com/deepinsight/insightface + description: | + Face recognition with insightface's `buffalo_sc` pack (SCRFD-500MF + detector + a small ArcFace embedder), converted to a C++/ggml GGUF + for the `face-detect` backend. This is the smallest insightface + pack: the lightest option for low-resource and edge deployments. + + The architecture (`facedetect.arch`) is read from the GGUF metadata, + so this entry alone selects the buffalo_sc engine. If this GGUF + embeds the MiniFASNet anti-spoof ensemble, it is available via the + FaceVerify `anti_spoof` request flag. NON-COMMERCIAL RESEARCH USE + ONLY. 
+ license: insightface-non-commercial + icon: https://avatars.githubusercontent.com/u/53104118?s=200&v=4 + tags: + - face-recognition + - face-verification + - face-embedding + - research-only + - edge + - cpu + last_checked: "2026-06-22" + overrides: + backend: face-detect + known_usecases: + - face_recognition + - detection + - embeddings + options: + - verify_threshold:0.35 + parameters: + model: face-detect-buffalo-sc.gguf + files: + - filename: face-detect-buffalo-sc.gguf + sha256: f754c0e32d5efbbc53d7efca13be2807676bf5db20a8594ef96b32afa2c482b1 + uri: https://huggingface.co/mudler/face-detect-gguf/resolve/main/buffalo_sc.gguf +- name: face-detect-antelopev2 + url: github:mudler/LocalAI/gallery/virtual.yaml@master + urls: + - https://github.com/mudler/face-detect.cpp + - https://github.com/deepinsight/insightface + description: | + Face recognition with insightface's `antelopev2` pack (SCRFD-10GF + detector + ArcFace glint360k R100, 512-d embedder), converted to a + C++/ggml GGUF for the `face-detect` backend. The higher-accuracy + insightface pack: heavier, but the best fit when recognition + quality matters more than speed. + + The architecture (`facedetect.arch`) is read from the GGUF metadata, + so this entry alone selects the antelopev2 engine. If this GGUF + embeds the MiniFASNet anti-spoof ensemble, it is available via the + FaceVerify `anti_spoof` request flag. NON-COMMERCIAL RESEARCH USE + ONLY. 
+ license: insightface-non-commercial + icon: https://avatars.githubusercontent.com/u/53104118?s=200&v=4 + tags: + - face-recognition + - face-verification + - face-embedding + - research-only + last_checked: "2026-06-22" + overrides: + backend: face-detect + known_usecases: + - face_recognition + - detection + - embeddings + options: + - verify_threshold:0.35 + parameters: + model: face-detect-antelopev2.gguf + files: + - filename: face-detect-antelopev2.gguf + sha256: 245e657e51754fbf075dd43d80a80a2d14a60c2fc42a3220f63eef17a315e96c + uri: https://huggingface.co/mudler/face-detect-gguf/resolve/main/antelopev2.gguf +- name: face-detect-yunet-sface + url: github:mudler/LocalAI/gallery/virtual.yaml@master + urls: + - https://github.com/mudler/face-detect.cpp + - https://github.com/opencv/opencv_zoo + description: | + Face recognition with OpenCV Zoo weights: YuNet detector + SFace + 128-d recognizer, converted to a C++/ggml GGUF for the `face-detect` + backend. APACHE 2.0: safe for commercial use. Lower accuracy than the + buffalo packs and no demographic head, but the commercial-friendly + alternative to the insightface buffalo line. + + The architecture (`facedetect.arch`) is read from the GGUF metadata, + so this entry alone selects the YuNet + SFace engine. 
+ license: apache-2.0 + icon: https://avatars.githubusercontent.com/u/95302084 + tags: + - face-recognition + - face-verification + - face-embedding + - commercial-ok + - gpu + - cpu + last_checked: "2026-06-22" + overrides: + backend: face-detect + known_usecases: + - face_recognition + - detection + - embeddings + options: + - verify_threshold:0.363 + parameters: + model: face-detect-yunet-sface.gguf + files: + - filename: face-detect-yunet-sface.gguf + sha256: 9ce78d4ba0ae9d5e8c91a0e145d511558d1d90f5d9c1f4131cca9bb4bce60902 + uri: https://huggingface.co/mudler/face-detect-gguf/resolve/main/yunet-sface.gguf - name: speechbrain-ecapa-tdnn url: github:mudler/LocalAI/gallery/virtual.yaml@master urls: @@ -8518,6 +8760,217 @@ - filename: wespeaker_voxceleb_resnet34.onnx sha256: 7bb2f06e9df17cdf1ef14ee8a15ab08ed28e8d0ef5054ee135741560df2ec068 uri: https://huggingface.co/Wespeaker/wespeaker-voxceleb-resnet34-LM/resolve/main/voxceleb_resnet34_LM.onnx +- name: voice-detect-ecapa-tdnn + url: github:mudler/LocalAI/gallery/virtual.yaml@master + urls: + - https://github.com/mudler/voice-detect.cpp + - https://huggingface.co/speechbrain/spkrec-ecapa-voxceleb + description: | + Speaker (voice) recognition with SpeechBrain's ECAPA-TDNN trained + on VoxCeleb, ported to C++/ggml and shipped as a single GGUF for the + `voice-detect` backend. 192-d L2-normalised embeddings, ~1.9% Equal + Error Rate on VoxCeleb1-O. APACHE 2.0 - commercial-safe. + + No Python / torch runtime: voice-detect.cpp reads the embedding + architecture (`voicedetect.arch`) directly from the GGUF metadata, + so installing this entry is all that is needed to select ECAPA-TDNN. + Drives the VoiceVerify / VoiceEmbed gRPC rpcs and the + /v1/voice/{verify,embed,register,identify,forget} REST endpoints. 
+ license: apache-2.0 + icon: https://avatars.githubusercontent.com/u/95302084 + tags: + - voice-recognition + - speaker-verification + - speaker-embedding + - commercial-ok + - cpu + - gpu + last_checked: "2026-06-22" + overrides: + backend: voice-detect + known_usecases: + - speaker_recognition + options: + - verify_threshold:0.25 + parameters: + model: voice-detect-ecapa-tdnn-voxceleb.gguf + files: + - filename: voice-detect-ecapa-tdnn-voxceleb.gguf + sha256: 68046a1fdfb7843f460962db4739fbd381cc5c3ab93d1505e75e2f4c0dc19b8f + uri: https://huggingface.co/mudler/voice-detect-gguf/resolve/main/ecapa-tdnn-voxceleb.gguf +- name: voice-detect-wespeaker-resnet34 + url: github:mudler/LocalAI/gallery/virtual.yaml@master + urls: + - https://github.com/mudler/voice-detect.cpp + - https://github.com/wenet-e2e/wespeaker + description: | + Speaker recognition with WeSpeaker's ResNet34 trained on VoxCeleb, + converted to a C++/ggml GGUF for the `voice-detect` backend. 256-d + embeddings, CPU-friendly and runtime-free (no onnxruntime or torch). + CC-BY-4.0. + + Use when you want WeSpeaker's ResNet34 topology instead of + ECAPA-TDNN. The embedding architecture (`voicedetect.arch`) is read + from the GGUF metadata, so this entry alone selects the engine. 
+ license: cc-by-4.0 + icon: https://avatars.githubusercontent.com/u/95302084 + tags: + - voice-recognition + - speaker-verification + - speaker-embedding + - commercial-ok + - edge + - cpu + last_checked: "2026-06-22" + overrides: + backend: voice-detect + known_usecases: + - speaker_recognition + options: + - verify_threshold:0.25 + parameters: + model: voice-detect-wespeaker-resnet34.gguf + files: + - filename: voice-detect-wespeaker-resnet34.gguf + sha256: 72040372494eafec299836bc1977cfc13c603cb486674ed59b0f4c03758d29da + uri: https://huggingface.co/mudler/voice-detect-gguf/resolve/main/wespeaker-resnet34-voxceleb.gguf +- name: voice-detect-eres2net + url: github:mudler/LocalAI/gallery/virtual.yaml@master + urls: + - https://github.com/mudler/voice-detect.cpp + - https://huggingface.co/iic/speech_eres2net_sv_en_voxceleb_16k + description: | + Speaker recognition with 3D-Speaker's ERes2Net trained on VoxCeleb, + converted to a C++/ggml GGUF for the `voice-detect` backend. + 192-d embeddings with strong verification accuracy. APACHE 2.0. + + The embedding architecture (`voicedetect.arch`) is read from the + GGUF metadata, so this entry alone selects the ERes2Net engine. 
+ license: apache-2.0 + icon: https://avatars.githubusercontent.com/u/95302084 + tags: + - voice-recognition + - speaker-verification + - speaker-embedding + - commercial-ok + - cpu + - gpu + last_checked: "2026-06-22" + overrides: + backend: voice-detect + known_usecases: + - speaker_recognition + options: + - verify_threshold:0.25 + parameters: + model: voice-detect-eres2net.gguf + files: + - filename: voice-detect-eres2net.gguf + sha256: d39f53c7a4d39734740a86a07521b9a819ee8ea56c1a9436eba611ab733a3d06 + uri: https://huggingface.co/mudler/voice-detect-gguf/resolve/main/eres2net-base-zh-cn.gguf +- name: voice-detect-campplus + url: github:mudler/LocalAI/gallery/virtual.yaml@master + urls: + - https://github.com/mudler/voice-detect.cpp + - https://huggingface.co/iic/speech_campplus_sv_en_voxceleb_16k + description: | + Speaker recognition with 3D-Speaker's CAM++ trained on VoxCeleb, + converted to a C++/ggml GGUF for the `voice-detect` backend. 192-d + embeddings, a fast context-aware masking topology well-suited to + CPU and edge deployments. APACHE 2.0. + + The embedding architecture (`voicedetect.arch`) is read from the + GGUF metadata, so this entry alone selects the CAM++ engine. 
+ license: apache-2.0 + icon: https://avatars.githubusercontent.com/u/95302084 + tags: + - voice-recognition + - speaker-verification + - speaker-embedding + - commercial-ok + - edge + - cpu + last_checked: "2026-06-22" + overrides: + backend: voice-detect + known_usecases: + - speaker_recognition + options: + - verify_threshold:0.25 + parameters: + model: voice-detect-campplus.gguf + files: + - filename: voice-detect-campplus.gguf + sha256: a6e34c6d230cff26e37b71a2df0907fde1de425654e28d9d5cacca32e02a13d3 + uri: https://huggingface.co/mudler/voice-detect-gguf/resolve/main/campplus-zh-cn.gguf +- name: voice-detect-emotion-wav2vec2 + url: github:mudler/LocalAI/gallery/virtual.yaml@master + urls: + - https://github.com/mudler/voice-detect.cpp + - https://huggingface.co/audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim + description: | + Voice analysis (age / gender / emotion) with audEERING's wav2vec2 + model, converted to a C++/ggml GGUF for the `voice-detect` backend. + Drives the VoiceAnalyze gRPC rpc and the /v1/voice/analyze REST + endpoint, returning a continuous age estimate plus gender and + emotion class scores for a single utterance. CC-BY-NC-SA-4.0 - + research / non-commercial use only. + + The analysis architecture (`voicedetect.arch`) is read from the + GGUF metadata, so this entry alone selects the wav2vec2 analyze head. 
+ license: cc-by-nc-sa-4.0 + icon: https://avatars.githubusercontent.com/u/95302084 + tags: + - voice-recognition + - voice-analysis + - emotion-recognition + - cpu + - gpu + last_checked: "2026-06-22" + overrides: + backend: voice-detect + known_usecases: + - speaker_recognition + parameters: + model: voice-detect-emotion-wav2vec2.gguf + files: + - filename: voice-detect-emotion-wav2vec2.gguf + sha256: 9e9793e4f77a27f4ae068bcb29c2b6fe2f74881799e2cfea0f8e436ad3765e50 + uri: https://huggingface.co/mudler/voice-detect-gguf/resolve/main/emotion-wav2vec2-superb-er.gguf +- name: voice-detect-age-gender-wav2vec2 + url: github:mudler/LocalAI/gallery/virtual.yaml@master + urls: + - https://huggingface.co/audeering/wav2vec2-large-robust-24-ft-age-gender + - https://github.com/mudler/voice-detect.cpp + description: | + wav2vec2-large-robust age + gender analysis head + (audeering/wav2vec2-large-robust-24-ft-age-gender), converted to a + C++/ggml GGUF for the `voice-detect` backend. Drives the VoiceAnalyze + gRPC rpc and the /v1/voice/analyze REST endpoint, returning a + continuous age estimate plus gender class scores for a single + utterance. CC-BY-NC-SA-4.0 - research / non-commercial use only. + + The analysis architecture (`voicedetect.arch`) is read from the + GGUF metadata, so this entry alone selects the wav2vec2 analyze head. 
+ license: cc-by-nc-sa-4.0 + icon: https://avatars.githubusercontent.com/u/95302084 + tags: + - voice-recognition + - voice-analysis + - research-only + - cpu + - gpu + last_checked: "2026-06-22" + overrides: + backend: voice-detect + known_usecases: + - speaker_recognition + parameters: + model: voice-detect-age-gender-wav2vec2.gguf + files: + - filename: voice-detect-age-gender-wav2vec2.gguf + sha256: d92486b3f1ea7baf6a90f1026b7b8e9848b3a8332bccfb01cc8889eed7069064 + uri: https://huggingface.co/mudler/voice-detect-gguf/resolve/main/age-gender-wav2vec2-audeering.gguf - name: rfdetr-base url: github:mudler/LocalAI/gallery/virtual.yaml@master urls: