diff --git a/README.md b/README.md index 6911dbef..87e36f71 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ This is a thin frontend UI for [Kaapi backend](https://github.com/ProjectTech4De - [Software Dependencies](#software-dependencies) - [npm (via asdf)](#npm-via-asdf) - [Kaapi Backend](#kaapi-backend) + - [Kaapi Guardrails Service](#kaapi-guardrails-service) - [Clone Frontend Repo](#clone-frontend-repo) - [Git commands to clone](#git-commands-to-clone) - [Installation](#installation) @@ -52,11 +53,11 @@ You need to set up the [Kaapi backend](https://github.com/ProjectTech4DevAI/kaap > ๐Ÿ’ก Note: Ensure the backend is running and accessible before starting the frontend. -### Kaapi Guardrails Service _(coming soon)_ +### Kaapi Guardrails Service -The Guardrails UI (currently in development) will require the Kaapi Guardrails service to be running alongside the backend. Setup instructions will be added here once the service is available. +You need to set up the [Kaapi Guardrails](https://github.com/ProjectTech4DevAI/kaapi-guardrails) service and follow the instructions there. -> ๐Ÿšง No action needed for now โ€” this is a placeholder for when the Guardrails feature lands in `main`. +> ๐Ÿ’ก Note: The Guardrails service must be running and accessible whenever you use the Guardrails module in the frontend. --- @@ -149,14 +150,14 @@ Deployments are automated via a GitHub Actions CD pipeline that SSHes into the E 1. SSHes into the EC2 instance 2. Runs `git pull` to fetch the latest code 3. Runs `npm run build` to create an optimized production build -4. Restarts the server to apply the new build +4. Restarts the pm2 server to apply the new build **Production** โ€” on every version tag (e.g. `v1.0.0`, `v2.1.0`), the pipeline automatically: 1. SSHes into the EC2 instance 2. Runs `git fetch --tags` and checks out the tag 3. Runs `npm run build` to create an optimized production build -4. Restarts the server to apply the new build +4. Restarts the pm2 server to apply the new build --- @@ -169,4 +170,4 @@ Deployments are automated via a GitHub Actions CD pipeline that SSHes into the E ### Chat With Us -- ๐Ÿ’ฌ [Discord](https://discord.gg/BRYzSYha) +- ๐Ÿ’ฌ [Discord](https://discord.gg/s7e2UBFku) diff --git a/app/(main)/evaluations/[id]/page.tsx b/app/(main)/evaluations/[id]/page.tsx index 517c9275..ddd05717 100644 --- a/app/(main)/evaluations/[id]/page.tsx +++ b/app/(main)/evaluations/[id]/page.tsx @@ -5,15 +5,15 @@ "use client"; -import { useState, useEffect, useCallback } from "react"; +import { useState, useEffect, useCallback, useRef } from "react"; import { useRouter, useParams } from "next/navigation"; import { apiFetch } from "@/app/lib/apiClient"; import { useAuth } from "@/app/lib/context/AuthContext"; import { useApp } from "@/app/lib/context/AppContext"; import type { EvalJob, + EvalJobApiResponse, AssistantConfig, - GroupedTraceItem, } from "@/app/lib/types/evaluation"; import { hasSummaryScores, @@ -22,21 +22,23 @@ import { normalizeToIndividualScores, isGroupedFormat, } from "@/app/lib/utils/evaluation"; +import { + exportGroupedCSV, + exportRowCSV, +} from "@/app/lib/utils/evaluationExport"; import ConfigModal from "@/app/components/ConfigModal"; import Sidebar from "@/app/components/Sidebar"; import DetailedResultsTable from "@/app/components/evaluations/DetailedResultsTable"; -import { colors } from "@/app/lib/colors"; +import MetricsOverview from "@/app/components/evaluations/MetricsOverview"; +import { Button, Modal, ResultsTableSkeleton } from "@/app/components"; import { useToast } from "@/app/components/Toast"; import Loader from "@/app/components/Loader"; import { - WarningTriangleIcon, MenuIcon, ChevronLeftIcon, DatabaseIcon, GroupIcon, - RefreshIcon, } from "@/app/components/icons"; -import { sanitizeCSVCell } from "@/app/lib/utils"; export default function EvaluationReport() { const router = useRouter(); @@ -49,7 +51,9 @@ export default function EvaluationReport() { AssistantConfig | undefined >(undefined); const [isLoading, setIsLoading] = useState(true); + const [isFormatSwitching, setIsFormatSwitching] = useState(false); const [error, setError] = useState(null); + const hasLoadedRef = useRef(false); const { apiKeys, isAuthenticated } = useAuth(); const apiKey = apiKeys[0]?.key ?? ""; const { sidebarCollapsed, setSidebarCollapsed } = useApp(); @@ -61,12 +65,16 @@ export default function EvaluationReport() { const fetchJobDetails = useCallback(async () => { if (!isAuthenticated || !jobId) return; - setIsLoading(true); - setError(null); + const isFirstLoad = !hasLoadedRef.current; + if (isFirstLoad) { + setIsLoading(true); + setError(null); + } else { + setIsFormatSwitching(true); + } try { - // eslint-disable-next-line @typescript-eslint/no-explicit-any - const data = await apiFetch( + const data = await apiFetch( `/api/evaluations/${jobId}?export_format=${exportFormat}`, apiKey, ); @@ -77,10 +85,12 @@ export default function EvaluationReport() { return; } - const foundJob = data.data || data; + const foundJob: EvalJob | undefined = + data.data ?? (data as unknown as EvalJob); if (!foundJob) throw new Error("Evaluation job not found"); setJob(foundJob); + hasLoadedRef.current = true; if (foundJob.assistant_id) { fetchAssistantConfig(foundJob.assistant_id); @@ -89,11 +99,16 @@ export default function EvaluationReport() { fetchConfigInfo(foundJob.config_id, foundJob.config_version); } } catch (err: unknown) { - setError( - err instanceof Error ? err.message : "Failed to fetch evaluation job", - ); + const message = + err instanceof Error ? err.message : "Failed to fetch evaluation job"; + if (isFirstLoad) { + setError(message); + } else { + toast.error(message); + } } finally { setIsLoading(false); + setIsFormatSwitching(false); } }, [apiKey, isAuthenticated, jobId, exportFormat]); @@ -128,118 +143,6 @@ export default function EvaluationReport() { if (isAuthenticated && jobId) fetchJobDetails(); }, [isAuthenticated, jobId, fetchJobDetails]); - const exportGroupedCSV = (traces: GroupedTraceItem[]) => { - if (!job) return; - try { - const maxAnswers = Math.max(...traces.map((g) => g.llm_answers.length)); - const scoreNames = traces[0]?.scores[0]?.map((s) => s.name) || []; - let csvContent = "Question ID,Question,Ground Truth"; - for (let i = 1; i <= maxAnswers; i++) { - csvContent += `,LLM Answer ${i},Trace ID ${i}`; - scoreNames.forEach((name) => { - csvContent += `,${name} (${i}),${sanitizeCSVCell(`${name} (${i}) Comment`)}`; - }); - } - csvContent += "\n"; - traces.forEach((group) => { - const row: string[] = [ - String(group.question_id), - sanitizeCSVCell(group.question || ""), - sanitizeCSVCell(group.ground_truth_answer || ""), - ]; - for (let i = 0; i < maxAnswers; i++) { - row.push( - `"${(group.llm_answers[i] || "").replace(/"/g, '""').replace(/\n/g, " ")}"`, - ); - row.push(group.trace_ids[i] || ""); - scoreNames.forEach((name) => { - const score = group.scores[i]?.find((s) => s.name === name); - row.push(score ? String(score.value) : ""); - row.push( - score?.comment ? sanitizeCSVCell(score.comment, true) : "", - ); - }); - } - csvContent += row.join(",") + "\n"; - }); - const blob = new Blob([csvContent], { type: "text/csv;charset=utf-8;" }); - const url = URL.createObjectURL(blob); - const link = document.createElement("a"); - link.setAttribute("href", url); - link.setAttribute( - "download", - `evaluation_${job.id}_${job.run_name.replace(/[^a-z0-9]/gi, "_")}_grouped.csv`, - ); - document.body.appendChild(link); - link.click(); - document.body.removeChild(link); - URL.revokeObjectURL(url); - toast.success(`Grouped CSV exported with ${traces.length} questions`); - } catch (_error) { - toast.error("Failed to export grouped CSV"); - } - }; - - // Export row format CSV - const exportRowCSV = () => { - if (!job || !scoreObject) return; - try { - const individual_scores = normalizeToIndividualScores(scoreObject); - if (!individual_scores || individual_scores.length === 0) { - toast.error("No valid data available to export"); - return; - } - let csvContent = ""; - const firstItem = individual_scores[0]; - const scoreNames = firstItem?.trace_scores?.map((s) => s.name) || []; - csvContent += - "Counter,Trace ID,Job ID,Run Name,Dataset,Model,Status,Total Items,"; - csvContent += "Question,Answer,Ground Truth,"; - csvContent += - scoreNames.map((name) => `${name},${name} (comment)`).join(",") + "\n"; - let rowCount = 0; - individual_scores.forEach((item, index) => { - const row = [ - index + 1, - item.trace_id || "N/A", - job.id, - `"${job.run_name.replace(/"/g, '""')}"`, - `"${job.dataset_name.replace(/"/g, '""')}"`, - assistantConfig?.model || job.config?.model || "N/A", - job.status, - job.total_items, - `"${(item.input?.question || "").replace(/"/g, '""').replace(/\n/g, " ")}"`, - `"${(item.output?.answer || "").replace(/"/g, '""').replace(/\n/g, " ")}"`, - `"${(item.metadata?.ground_truth || "").replace(/"/g, '""').replace(/\n/g, " ")}"`, - ...scoreNames.flatMap((name) => { - const score = item.trace_scores?.find((s) => s.name === name); - return [ - score ? score.value : "N/A", - score?.comment ? sanitizeCSVCell(score.comment, true) : "", - ]; - }), - ].join(","); - csvContent += row + "\n"; - rowCount++; - }); - const blob = new Blob([csvContent], { type: "text/csv;charset=utf-8;" }); - const url = URL.createObjectURL(blob); - const link = document.createElement("a"); - link.setAttribute("href", url); - link.setAttribute( - "download", - `evaluation_${job.id}_${job.run_name.replace(/[^a-z0-9]/gi, "_")}.csv`, - ); - document.body.appendChild(link); - link.click(); - document.body.removeChild(link); - URL.revokeObjectURL(url); - toast.success(`CSV exported successfully with ${rowCount} rows`); - } catch (_error) { - toast.error("Failed to export CSV"); - } - }; - const handleExportCSV = () => { if (!job || !scoreObject) { toast.error("No valid data available to export"); @@ -256,14 +159,14 @@ export default function EvaluationReport() { return; } if (isGroupedFormat(traces)) { - exportGroupedCSV(traces); + const count = exportGroupedCSV(job, traces); + toast.success(`Grouped CSV exported with ${count} questions`); } else { - exportRowCSV(); + const count = exportRowCSV(job, scoreObject, assistantConfig); + toast.success(`CSV exported successfully with ${count} rows`); } - } catch (_error) { - toast.error( - "Failed to export CSV. Please check the console for details.", - ); + } catch (err: unknown) { + toast.error(err instanceof Error ? err.message : "Failed to export CSV"); } }; @@ -272,12 +175,12 @@ export default function EvaluationReport() { setIsResyncing(true); try { - // eslint-disable-next-line @typescript-eslint/no-explicit-any - const data = await apiFetch( + const data = await apiFetch( `/api/evaluations/${jobId}?get_trace_info=true&resync_score=true&export_format=${exportFormat}`, apiKey, ); - const foundJob = data.data || data; + const foundJob: EvalJob | undefined = + data.data ?? (data as unknown as EvalJob); if (!foundJob) throw new Error("Evaluation job not found"); const newScoreObject = getScoreObject(foundJob); @@ -301,12 +204,9 @@ export default function EvaluationReport() { } }; - if (isLoading) { + if (isLoading && !job) { return ( -
+
@@ -317,32 +217,23 @@ export default function EvaluationReport() { ); } - if (error || !job) { + if ((error && !job) || !job) { return ( -
+
-

+

{error || "Evaluation job not found"}

- +
@@ -360,68 +251,52 @@ export default function EvaluationReport() { job.status.toLowerCase() !== "completed" && job.status.toLowerCase() !== "failed"; + const segmentedClass = + "inline-flex items-center gap-1.5 px-3 py-1.5 rounded-md text-xs font-medium transition-all cursor-pointer border border-transparent text-text-primary hover:bg-black/4 hover:shadow-[0_0_0_1px_rgba(0,0,0,0.06)] data-[selected=true]:bg-bg-primary data-[selected=true]:border-border data-[selected=true]:shadow-[0_1px_2px_rgba(0,0,0,0.08)] data-[selected=true]:hover:bg-bg-primary data-[selected=true]:hover:shadow-[0_1px_2px_rgba(0,0,0,0.08)]"; + return ( -
+
- {/* Header */} -
+
- + {sidebarCollapsed && ( + + )}
-

+

{job.run_name}

- - + + {job.dataset_name}
-
-
+
+
- - +
-
+
{hasScore && isNewFormat ? ( -
- {summaryScores.some( - (s) => job.total_items && s.total_pairs < job.total_items, - ) && - isJobInProgress && ( -
- - Some traces are still being scored. Scores shown are - partial and may change - click{" "} - Resync to get - the latest. -
- )} -
-

- Metrics Overview -

- -
- {summaryScores.length > 0 ? ( -
- {summaryScores - .filter((s) => s.data_type === "NUMERIC") - .map((summary) => ( -
-
- {summary.name} -
-
- {summary.avg !== undefined - ? summary.avg.toFixed(3) - : "N/A"} -
-
- {summary.std !== undefined && - `ยฑ${summary.std.toFixed(3)} ยท `} - - {summary.total_pairs} - {job.total_items && - summary.total_pairs < job.total_items && - `/${job.total_items}`}{" "} - pairs - -
-
- ))} - {summaryScores - .filter((s) => s.data_type === "CATEGORICAL") - .map((summary) => ( -
-
- {summary.name} -
-
- {summary.distribution && - Object.entries(summary.distribution).map( - ([key, value]) => ( -
- - {key} - - - {value} - -
- ), - )} -
-
- - {summary.total_pairs} - {job.total_items && - summary.total_pairs < job.total_items && - `/${job.total_items}`}{" "} - pairs - -
-
- ))} -
- ) : ( -
-

- No summary scores available -

-
- )} -
+ ) : ( -
+

{job.error_message || "No results available yet"}

)} - {/* Detailed Results */} {hasScore && (
-

+

Detailed Results

- {isNewFormat && ( - + {isNewFormat && !isFormatSwitching && ( + ({normalizeToIndividualScores(scoreObject).length}{" "} items) )}
- + {isFormatSwitching ? ( + + ) : ( + + )}
)}
@@ -651,7 +375,6 @@ export default function EvaluationReport() {
- {/* Config Modal */} setIsConfigModalOpen(false)} @@ -659,44 +382,28 @@ export default function EvaluationReport() { assistantConfig={assistantConfig} /> - {/* No Traces Modal */} - {showNoTracesModal && ( -
setShowNoTracesModal(false)} - > -
e.stopPropagation()} + setShowNoTracesModal(false)} + title="No Langfuse Traces Available" + maxWidth="max-w-md" + maxHeight="max-h-fit" + > +
+

+ This evaluation does not have Langfuse traces. +

+
+
+ -
-
+ OK +
- )} +
); } diff --git a/app/(main)/evaluations/page.tsx b/app/(main)/evaluations/page.tsx index 13ca97c2..450eeff1 100644 --- a/app/(main)/evaluations/page.tsx +++ b/app/(main)/evaluations/page.tsx @@ -22,8 +22,7 @@ import { FeatureGateModal, LoginModal } from "@/app/components/auth"; import Loader from "@/app/components/Loader"; import DatasetsTab from "@/app/components/evaluations/DatasetsTab"; import EvaluationsTab from "@/app/components/evaluations/EvaluationsTab"; - -type Tab = "datasets" | "evaluations"; +import { Tab } from "@/app/lib/types/evaluation"; const leftPanelWidth = 450; @@ -43,14 +42,13 @@ function SimplifiedEvalContent() { const apiKey = activeKey?.key ?? ""; const [showLoginModal, setShowLoginModal] = useState(false); const [mounted, setMounted] = useState(false); - // Dataset creation state const [datasetName, setDatasetName] = useState(""); const [datasetDescription, setDatasetDescription] = useState(""); const [duplicationFactor, setDuplicationFactor] = useState("1"); const [uploadedFile, setUploadedFile] = useState(null); const [isUploading, setIsUploading] = useState(false); const [storedDatasets, setStoredDatasets] = useState([]); - const [isDatasetsLoading, setIsDatasetsLoading] = useState(false); + const [isDatasetsLoading, setIsDatasetsLoading] = useState(true); const [selectedDatasetId, setSelectedDatasetId] = useState(() => { return searchParams.get("dataset") || ""; }); @@ -90,6 +88,7 @@ function SimplifiedEvalContent() { useEffect(() => { if (isAuthenticated) loadStoredDatasets(); + else setIsDatasetsLoading(false); }, [isAuthenticated, loadStoredDatasets]); const handleFileSelect = (event: React.ChangeEvent) => { @@ -260,7 +259,6 @@ function SimplifiedEvalContent() { subtitle="Compare model response quality on your datasets across different configs" /> - {/* Tab Navigation */} setActiveTab(tabId as Tab)} /> - {/* Tab Content */} {!mounted || !isAuthenticated ? ( <> (null); const [isCreating, setIsCreating] = useState(false); const [datasets, setDatasets] = useState([]); - const [isLoadingDatasets, setIsLoadingDatasets] = useState(false); + const [isLoadingDatasets, setIsLoadingDatasets] = useState(true); const [evaluationName, setEvaluationName] = useState(""); const [selectedDatasetId, setSelectedDatasetId] = useState( null, @@ -60,7 +60,7 @@ export default function SpeechToTextPage() { const [selectedModel, setSelectedModel] = useState("gemini-2.5-pro"); const [isRunning, setIsRunning] = useState(false); const [runs, setRuns] = useState([]); - const [isLoadingRuns, setIsLoadingRuns] = useState(false); + const [isLoadingRuns, setIsLoadingRuns] = useState(true); const [selectedRunId, setSelectedRunId] = useState(null); const [results, setResults] = useState([]); const [isLoadingResults, setIsLoadingResults] = useState(false); diff --git a/app/(main)/text-to-speech/page.tsx b/app/(main)/text-to-speech/page.tsx index bfdb9f68..d3645b8b 100644 --- a/app/(main)/text-to-speech/page.tsx +++ b/app/(main)/text-to-speech/page.tsx @@ -53,7 +53,7 @@ export default function TextToSpeechPage() { const [textSamples, setTextSamples] = useState([]); const [isCreating, setIsCreating] = useState(false); const [datasets, setDatasets] = useState([]); - const [isLoadingDatasets, setIsLoadingDatasets] = useState(false); + const [isLoadingDatasets, setIsLoadingDatasets] = useState(true); const [evaluationName, setEvaluationName] = useState(""); const [selectedDatasetId, setSelectedDatasetId] = useState( null, @@ -63,7 +63,7 @@ export default function TextToSpeechPage() { ); const [isRunning, setIsRunning] = useState(false); const [runs, setRuns] = useState([]); - const [isLoadingRuns, setIsLoadingRuns] = useState(false); + const [isLoadingRuns, setIsLoadingRuns] = useState(true); const [selectedRunId, setSelectedRunId] = useState(null); const [results, setResults] = useState([]); const [isLoadingResults, setIsLoadingResults] = useState(false); diff --git a/app/components/Button.tsx b/app/components/Button.tsx index dc8238e4..0d978781 100644 --- a/app/components/Button.tsx +++ b/app/components/Button.tsx @@ -56,7 +56,7 @@ export default function Button({ return ( +
+ {summaryScores.length > 0 ? ( +
+ {summaryScores + .filter((s) => s.data_type === "NUMERIC") + .map((summary) => ( +
+
+ {summary.name} +
+
+ {summary.avg !== undefined ? summary.avg.toFixed(3) : "N/A"} +
+
+ {summary.std !== undefined && `ยฑ${summary.std.toFixed(3)} ยท `} + + {summary.total_pairs} + {job.total_items && + summary.total_pairs < job.total_items && + `/${job.total_items}`}{" "} + pairs + +
+
+ ))} + {summaryScores + .filter((s) => s.data_type === "CATEGORICAL") + .map((summary) => ( +
+
+ {summary.name} +
+
+ {summary.distribution && + Object.entries(summary.distribution).map(([key, value]) => ( +
+ + {key} + + + {value} + +
+ ))} +
+
+ + {summary.total_pairs} + {job.total_items && + summary.total_pairs < job.total_items && + `/${job.total_items}`}{" "} + pairs + +
+
+ ))} +
+ ) : ( +
+

+ No summary scores available +

+
+ )} +
+ ); +} diff --git a/app/components/evaluations/RunEvaluationForm.tsx b/app/components/evaluations/RunEvaluationForm.tsx index 535494c3..7b6465b6 100644 --- a/app/components/evaluations/RunEvaluationForm.tsx +++ b/app/components/evaluations/RunEvaluationForm.tsx @@ -6,8 +6,7 @@ import Select from "@/app/components/Select"; import { CheckCircleIcon, PlayIcon } from "@/app/components/icons"; import ConfigSelector from "@/app/components/ConfigSelector"; import EvalDatasetDescription from "./EvalDatasetDescription"; - -type Tab = "datasets" | "evaluations"; +import { Tab } from "@/app/lib/types/evaluation"; interface RunEvaluationFormProps { storedDatasets: Dataset[]; diff --git a/app/components/speech-to-text/AudioFileItem.tsx b/app/components/speech-to-text/AudioFileItem.tsx index dca739bd..2219c999 100644 --- a/app/components/speech-to-text/AudioFileItem.tsx +++ b/app/components/speech-to-text/AudioFileItem.tsx @@ -3,7 +3,7 @@ import { AudioFile, Language } from "@/app/lib/types/speechToText"; import { CheckLineIcon, CloseIcon } from "@/app/components/icons"; import Select from "@/app/components/Select"; -import AudioPlayer from "./AudioPlayer"; +import AudioPlayer from "@/app/components/speech-to-text/AudioPlayer"; interface AudioFileItemProps { audioFile: AudioFile; diff --git a/app/components/speech-to-text/CreateSTTDatasetForm.tsx b/app/components/speech-to-text/CreateSTTDatasetForm.tsx index dd3be51d..a5e13afd 100644 --- a/app/components/speech-to-text/CreateSTTDatasetForm.tsx +++ b/app/components/speech-to-text/CreateSTTDatasetForm.tsx @@ -1,9 +1,8 @@ "use client"; -import { useEffect, useState } from "react"; import { AudioFile, Language } from "@/app/lib/types/speechToText"; import { useAuth } from "@/app/lib/context/AuthContext"; -import { Button, Field } from "@/app/components"; +import { Button, Field, InfoTooltip } from "@/app/components"; import Select from "@/app/components/Select"; import { MusicNoteIcon, PlusIcon } from "@/app/components/icons"; import AudioFileItem from "./AudioFileItem"; @@ -52,20 +51,6 @@ export default function CreateSTTDatasetForm({ languages, }: CreateSTTDatasetFormProps) { const { isAuthenticated } = useAuth(); - const [showLanguageInfo, setShowLanguageInfo] = useState(false); - const [languageInfoPos, setLanguageInfoPos] = useState({ top: 0, left: 0 }); - - useEffect(() => { - if (!showLanguageInfo) return; - const handleClick = () => setShowLanguageInfo(false); - const handleScroll = () => setShowLanguageInfo(false); - document.addEventListener("click", handleClick); - window.addEventListener("scroll", handleScroll, true); - return () => { - document.removeEventListener("click", handleClick); - window.removeEventListener("scroll", handleScroll, true); - }; - }, [showLanguageInfo]); const isCreateDisabled = isCreating || !datasetName.trim() || audioFiles.length === 0; @@ -100,42 +85,18 @@ export default function CreateSTTDatasetForm({ setEvaluationName(e.target.value)} - placeholder="e.g., English Podcast Evaluation v1" - className="w-full px-3 py-2 border rounded-md text-sm" - style={{ - backgroundColor: colors.bg.primary, - borderColor: colors.border, - color: colors.text.primary, - }} - /> -
- - {/* Model Selection */} -
- - -
- - {/* Dataset Selection */} -
- - {isLoadingDatasets ? ( - - ) : datasets.length === 0 ? ( -
-

- No datasets available -

-

- Create a dataset first in the Datasets tab -

-
- ) : ( - - )} -
- - {/* Selected Dataset Info */} - {selectedDataset && ( -
-
- - - -
-
- {selectedDataset.name} -
-
-
- {selectedDataset.dataset_metadata?.sample_count || 0}{" "} - samples -
-
-
-
-
- )} -
- - {/* Run Evaluation Button */} -
- -
+
)} - - {/* Right Panel - Evaluation Runs List or Results */} -
-
-
-
- {selectedRunId !== null ? ( -
- -

- {runs.find((r) => r.id === selectedRunId)?.run_name} -

-
- ) : ( -

- Evaluation Runs -

- )} -
- {selectedRunId === null && ( -
- - -
- )} -
- -
- {selectedRunId !== null ? ( - // Results View - isLoadingResults ? ( -
- -
- ) : results.length === 0 ? ( -
-

- No results found -

-

- This evaluation has no results yet -

-
- ) : ( - - - - - - - - - - - - {results.map((result) => ( - - - - - -
- Sample - -
-
Ground Truth vs Transcription
-
- - - - Deletion - - - - - - Insertion - - - - - - Substitution - - -
-
-
- - Score - { - e.stopPropagation(); - const rect = - e.currentTarget.getBoundingClientRect(); - setScoreInfoPos({ - top: rect.bottom + 4, - left: rect.left, - }); - setOpenScoreInfo( - openScoreInfo ? null : "accuracy", - ); - }} - > - i - - {openScoreInfo && - (() => { - const metrics = [ - { - key: "accuracy", - title: - "Accuracy (Word Information Preserved)", - desc: "Measures how much of the original information was correctly captured.", - formula: "WIP = (C / N) ร— (C / H)", - formulaDesc: - "C = correct words\nN = total words in reference\nH = total words in hypothesis", - example: `Reference: "the cat sat on the mat" (N=6)\nHypothesis: "a cat sit on mat" (H=5)\nC = 3 (cat, on, mat)\n\nWIP = (3/6) ร— (3/5)\n = 0.5 ร— 0.6 = 0.30 = 30%`, - direction: "Higher is better.", - directionColor: colors.status.success, - }, - { - key: "wer", - title: "WER (Word Error Rate)", - desc: "The most widely used metric in STT evaluation.", - formula: "WER = (S + D + I) / N", - formulaDesc: - "S = substitutions, D = deletions\nI = insertions, N = total words in reference", - example: `Reference: "the cat sat on the mat" (N=6)\nHypothesis: "a cat sit on mat"\n\nthe โ†’ a (Substitution)\ncat โ†’ cat (Correct)\nsat โ†’ sit (Substitution)\non โ†’ on (Correct)\nthe โ†’ โˆ… (Deletion)\nmat โ†’ mat (Correct)\n\nS=2, D=1, I=0\nWER = (2+1+0) / 6 = 0.50 = 50%`, - direction: "Lower is better.", - directionColor: colors.status.error, - }, - { - key: "cer", - title: "CER (Character Error Rate)", - desc: "Same concept as WER but at the character level โ€” more granular, catches partial word errors.", - formula: "CER = (S + D + I) / N", - formulaDesc: - "S, D, I = character-level errors\nN = total characters in reference", - example: `Reference: "the cat sat" (N=11 chars)\nHypothesis: "the bat set"\n\nt โ†’ t (Correct)\nh โ†’ h (Correct)\ne โ†’ e (Correct)\nยท โ†’ ยท (Correct)\nc โ†’ b (Substitution)\na โ†’ a (Correct)\nt โ†’ t (Correct)\nยท โ†’ ยท (Correct)\ns โ†’ s (Correct)\na โ†’ e (Substitution)\nt โ†’ t (Correct)\n\nS=2, D=0, I=0\nCER = 2/11 = 0.18 = 18%`, - direction: "Lower is better.", - directionColor: colors.status.error, - }, - { - key: "lenient_wer", - title: "Lenient WER", - desc: "Same as WER but ignores differences in casing and punctuation โ€” useful when exact formatting doesn't matter.", - formula: "Same as WER after normalizing text", - formulaDesc: - "Normalization: lowercase + remove punctuation", - example: `Reference: "Hello, World!"\nHypothesis: "hello world"\n\nAfter normalization:\n"hello world" vs "hello world"\nโ†’ exact match\n\nLenient WER = 0%\n(strict WER would be higher)`, - direction: "Lower is better.", - directionColor: colors.status.error, - }, - ]; - const currentIdx = metrics.findIndex( - (m) => m.key === openScoreInfo, - ); - const current = - metrics[currentIdx >= 0 ? currentIdx : 0]; - return ( -
e.stopPropagation()} - > - {/* Tab navigation */} -
- {metrics.map((m, _idx) => ( - - ))} -
- {/* Content */} -
-
- {current.title} -
-

- {current.desc} -

-
- Formula -
-
- {current.formula} - {"\n"} - - {current.formulaDesc} - -
-
- Example -
-
- {current.example} -
-
- {current.direction} -
-
-
- ); - })()} -
-
- Is Correct - - Comment -
- {result.signedUrl ? ( - - setPlayingResultId( - playingResultId === result.id - ? null - : result.id, - ) - } - /> - ) : ( -
- {result.sampleName || "-"} -
- )} -
- {(() => { - const hasBoth = - result.groundTruth && result.transcription; - const segments = hasBoth - ? computeWordDiff( - result.groundTruth, - result.transcription, - ) - : []; - const isExpanded = expandedTranscriptions.has( - result.id, - ); - return ( -
-
- {/* Left Panel - Ground Truth */} -
-
- Ground Truth -
-
- {hasBoth ? ( - segments.map((seg, idx) => { - if (seg.type === "insertion") - return null; - const word = seg.reference || ""; - return ( - - - {seg.type === "deletion" && - "- "} - {word} - {" "} - - ); - }) - ) : ( - - {result.groundTruth || "-"} - - )} -
-
- {/* Right Panel - Transcription */} -
-
- Transcription -
-
- {hasBoth ? ( - segments.map((seg, idx) => { - if (seg.type === "deletion") { - return ( - - - ___ - {" "} - - ); - } - const word = - seg.hypothesis || - seg.reference || - ""; - return ( - - - {seg.type === "insertion" && - "+ "} - {word} - {" "} - - ); - }) - ) : ( - - {result.transcription || "-"} - - )} -
-
-
- {hasBoth && - (result.groundTruth!.length > 100 || - result.transcription!.length > 100) && ( - - )} -
- ); - })()} -
- {result.score ? ( -
-
- - Accuracy - - = 0.9 - ? colors.status.success - : result.score.wip >= 0.7 - ? "#ca8a04" - : colors.status.error, - }} - > - {(result.score.wip * 100).toFixed(1)}% - -
-
-
- Errors -
-
- {[ - { label: "WER", value: result.score.wer }, - { label: "CER", value: result.score.cer }, - { - label: "Lenient WER", - value: result.score.lenient_wer, - }, - ].map(({ label, value }) => ( -
- - {label} - - = 0.8 - ? colors.status.error - : value >= 0.4 - ? "#ca8a04" - : colors.status.success, - }} - > - {(value * 100).toFixed(1)}% - -
- ))} -
-
-
- ) : ( - - - - - )} -
- - -
-