Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions packages/components/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@
"@aws-sdk/client-bedrock-runtime": "3.966.0",
"@aws-sdk/client-dynamodb": "^3.360.0",
"@aws-sdk/client-kendra": "^3.750.0",
"@aws-sdk/client-polly": "^3.699.0",
"@aws-sdk/client-s3": "^3.844.0",
"@aws-sdk/client-transcribe": "^3.699.0",
"@aws-sdk/client-secrets-manager": "^3.699.0",
"@aws-sdk/client-sns": "^3.699.0",
"@aws-sdk/client-sts": "^3.699.0",
Expand Down
162 changes: 161 additions & 1 deletion packages/components/src/speechToText.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,23 @@ import { AssemblyAI } from 'assemblyai'
import { getFileFromStorage } from './storageUtils'
import axios from 'axios'
import Groq from 'groq-sdk'
import { S3Client, PutObjectCommand, DeleteObjectCommand } from '@aws-sdk/client-s3'
import {
TranscribeClient,
StartTranscriptionJobCommand,
GetTranscriptionJobCommand,
TranscriptionJobStatus,
MediaFormat,
DeleteTranscriptionJobCommand
} from '@aws-sdk/client-transcribe'
Comment thread
SyncWithRaj marked this conversation as resolved.

/**
 * Identifiers of the supported speech-to-text providers.
 * Each value is compared against the provider name in the switch inside `convertSpeechToText`.
 */
const SpeechToTextType = {
    OPENAI_WHISPER: 'openAIWhisper',
    ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
    LOCALAI_STT: 'localAISTT',
    AZURE_COGNITIVE: 'azureCognitive',
    GROQ_WHISPER: 'groqWhisper',
    AWS_TRANSCRIBE: 'awsTranscribe'
}

export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfig: ICommonObject, options: ICommonObject) => {
Expand Down Expand Up @@ -125,6 +135,156 @@ export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfi
}
break
}
case SpeechToTextType.AWS_TRANSCRIBE: {
    // AWS Transcribe flow: upload the audio to a temporary S3 object, start a
    // transcription job that points at it, poll until the job completes or fails,
    // download the transcript JSON, and always remove the temporary S3 object and
    // the job afterwards.
    const region = speechToTextConfig.region || 'us-east-1'
    const s3BucketName = speechToTextConfig.s3BucketName as string
    const languageCode = speechToTextConfig.languageCode || 'en-US'

    if (!s3BucketName) {
        throw new Error('S3 Bucket Name is required for AWS Transcribe')
    }

    // Static credentials are only attached when both key and secret are present;
    // otherwise the clients are constructed with the region alone.
    const awsClientConfig: Record<string, any> = { region }
    if (credentialData.awsKey && credentialData.awsSecret) {
        awsClientConfig.credentials = {
            accessKeyId: credentialData.awsKey,
            secretAccessKey: credentialData.awsSecret,
            ...(credentialData.awsSession && { sessionToken: credentialData.awsSession })
        }
    }

    const s3Client = new S3Client(awsClientConfig)
    const transcribeClient = new TranscribeClient(awsClientConfig)

    // File extensions that map directly onto a Transcribe media format.
    const mediaFormatMap: Record<string, MediaFormat> = {
        webm: MediaFormat.WEBM,
        mp3: MediaFormat.MP3,
        mp4: MediaFormat.MP4,
        m4a: MediaFormat.M4A,
        wav: MediaFormat.WAV,
        flac: MediaFormat.FLAC,
        ogg: MediaFormat.OGG,
        amr: MediaFormat.AMR
    }

    // Derive the extension from the upload name. A name without any dot has no
    // extension, so fall back to 'webm' instead of treating the whole name as one.
    const uploadName = upload.name || ''
    const lastDot = uploadName.lastIndexOf('.')
    const fileExtension = (lastDot >= 0 ? uploadName.slice(lastDot + 1).toLowerCase() : '') || 'webm'
    const mediaFormat = mediaFormatMap[fileExtension] ?? MediaFormat.WEBM

    // Unique names for the temporary S3 object and the transcription job
    const s3Key = 'flowise-stt-temp/' + Date.now() + '-' + Math.random().toString(36).substring(2) + '.' + fileExtension
    const jobName = 'flowise-' + Date.now() + '-' + Math.random().toString(36).substring(2)

    // Runs a cleanup step without letting its failure mask the real outcome;
    // failures are reported so leftover temporary resources can be noticed.
    const cleanupQuietly = async (label: string, task: () => Promise<unknown>) => {
        try {
            await task()
        } catch (cleanupError) {
            console.warn(`AWS Transcribe cleanup failed (${label}):`, cleanupError)
        }
    }

    try {
        await s3Client.send(
            new PutObjectCommand({
                Bucket: s3BucketName,
                Key: s3Key,
                Body: Buffer.from(audio_file),
                ContentType: upload.mime || 'audio/webm'
            })
        )

        // Start transcription job
        await transcribeClient.send(
            new StartTranscriptionJobCommand({
                TranscriptionJobName: jobName,
                LanguageCode: languageCode,
                Media: {
                    MediaFileUri: `s3://${s3BucketName}/${s3Key}`
                },
                MediaFormat: mediaFormat
            })
        )

        // Poll for completion with 60 second timeout
        const POLL_INTERVAL_MS = 3000
        const TIMEOUT_MS = 60000
        const startTime = Date.now()

        let transcriptText = ''

        for (;;) {
            if (Date.now() - startTime > TIMEOUT_MS) {
                throw new Error('AWS Transcribe job timed out after 60 seconds')
            }

            await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS))

            const jobResult = await transcribeClient.send(
                new GetTranscriptionJobCommand({
                    TranscriptionJobName: jobName
                })
            )

            const job = jobResult.TranscriptionJob
            const status = job?.TranscriptionJobStatus

            if (status === TranscriptionJobStatus.COMPLETED) {
                // The finished job exposes a URI to a JSON document holding the transcript
                const transcriptUri = job?.Transcript?.TranscriptFileUri
                if (transcriptUri) {
                    const transcriptResponse = await axios.get(transcriptUri)
                    transcriptText = transcriptResponse.data?.results?.transcripts?.[0]?.transcript || ''
                }
                break
            }

            if (status === TranscriptionJobStatus.FAILED) {
                const failureReason = job?.FailureReason || 'Unknown error'
                throw new Error(`AWS Transcribe job failed: ${failureReason}`)
            }
            // Any other status (e.g. IN_PROGRESS or QUEUED) — continue polling
        }

        if (transcriptText) {
            return transcriptText
        }
    } finally {
        // Single cleanup path for both success and failure: delete the temporary
        // S3 object and the Transcribe job. An error thrown above still propagates.
        await cleanupQuietly(`S3 object ${s3Key}`, () =>
            s3Client.send(
                new DeleteObjectCommand({
                    Bucket: s3BucketName,
                    Key: s3Key
                })
            )
        )
        await cleanupQuietly(`transcription job ${jobName}`, () =>
            transcribeClient.send(
                new DeleteTranscriptionJobCommand({
                    TranscriptionJobName: jobName
                })
            )
        )
    }
    break
}
}
} else {
throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
Expand Down
74 changes: 73 additions & 1 deletion packages/components/src/textToSpeech.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@ import { ICommonObject } from './Interface'
import { getCredentialData } from './utils'
import OpenAI from 'openai'
import { ElevenLabsClient } from '@elevenlabs/elevenlabs-js'
import { PollyClient, SynthesizeSpeechCommand, Engine, VoiceId } from '@aws-sdk/client-polly'
import { Readable } from 'node:stream'
import type { ReadableStream } from 'node:stream/web'

/**
 * Identifiers of the supported text-to-speech providers.
 * Each value is matched against the configured provider name in the TTS switch statements.
 */
const TextToSpeechType = {
    OPENAI_TTS: 'openai',
    ELEVEN_LABS_TTS: 'elevenlabs',
    AMAZON_POLLY_TTS: 'amazonPolly'
}

export const convertTextToSpeechStream = async (
Expand Down Expand Up @@ -100,6 +102,51 @@ export const convertTextToSpeechStream = async (
})
break
}

case TextToSpeechType.AMAZON_POLLY_TTS: {
    // Report the output format to the caller before the Polly request is issued.
    onStart('mp3')

    // Client options: the region is always set; static credentials are attached
    // only when both the access key and the secret are available.
    const clientOptions: Record<string, any> = { region: textToSpeechConfig.region || 'us-east-1' }
    if (credentialData.awsKey && credentialData.awsSecret) {
        const staticCredentials: Record<string, any> = {
            accessKeyId: credentialData.awsKey,
            secretAccessKey: credentialData.awsSecret
        }
        if (credentialData.awsSession) {
            staticCredentials.sessionToken = credentialData.awsSession
        }
        clientOptions.credentials = staticCredentials
    }
    const polly = new PollyClient(clientOptions)

    // Synthesize MP3 audio; voice and engine fall back to 'Joanna' / 'neural'.
    // The abort signal is passed along with the request.
    const synthesis = await polly.send(
        new SynthesizeSpeechCommand({
            Text: text,
            OutputFormat: 'mp3',
            VoiceId: (textToSpeechConfig.voice || 'Joanna') as VoiceId,
            Engine: (textToSpeechConfig.engine || 'neural') as Engine
        }),
        { abortSignal: abortController.signal }
    )

    const rawAudio = synthesis.AudioStream
    if (!rawAudio) {
        throw new Error('Amazon Polly returned no audio stream')
    }

    // Use the audio as-is when it already is a Node.js Readable; otherwise
    // convert it from a web stream.
    const candidate = rawAudio as unknown as Readable
    let stream: Readable
    if (candidate instanceof Readable) {
        stream = candidate
    } else {
        stream = Readable.fromWeb(candidate as unknown as ReadableStream)
    }

    // Forward the audio through the shared rate-limited chunk processor; the
    // final callback records that the stream has been destroyed.
    await processStreamWithRateLimit(stream, onChunk, onEnd, resolve, reject, 640, 20, abortController, () => {
        streamDestroyed = true
    })
    break
}
}
} else {
reject(new Error('Text to speech is not selected. Please configure TTS in the chatflow.'))
Expand Down Expand Up @@ -234,6 +281,31 @@ export const getVoices = async (provider: string, credentialId: string, options:
}))
}

case TextToSpeechType.AMAZON_POLLY_TTS: {
    // Static catalogue of Polly voices as [voice id, description] pairs; the
    // display name is rendered as "<id> (<description>)".
    const pollyVoices: Array<[string, string]> = [
        ['Joanna', 'Female, US English'],
        ['Matthew', 'Male, US English'],
        ['Ruth', 'Female, US English'],
        ['Stephen', 'Male, US English'],
        ['Ivy', 'Female Child, US English'],
        ['Kevin', 'Male Child, US English'],
        ['Kendra', 'Female, US English'],
        ['Kimberly', 'Female, US English'],
        ['Salli', 'Female, US English'],
        ['Joey', 'Male, US English'],
        ['Justin', 'Male Child, US English'],
        ['Gregory', 'Male, US English'],
        ['Danielle', 'Female, US English'],
        ['Amy', 'Female, British English'],
        ['Brian', 'Male, British English'],
        ['Emma', 'Female, British English'],
        ['Lupe', 'Female, US Spanish'],
        ['Pedro', 'Male, US Spanish'],
        ['Léa', 'Female, French'],
        ['Vicki', 'Female, German'],
        ['Daniel', 'Male, German']
    ]
    return pollyVoices.map(([id, description]) => ({ id, name: `${id} (${description})` }))
}

default:
throw new Error(`Unsupported TTS provider: ${provider}`)
}
Expand Down
14 changes: 11 additions & 3 deletions packages/server/src/controllers/text-to-speech/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ const generateTextToSpeech = async (req: Request, res: Response) => {
provider: bodyProvider,
credentialId: bodyCredentialId,
voice: bodyVoice,
model: bodyModel
model: bodyModel,
engine: bodyEngine,
region: bodyRegion
} = req.body

if (!text) {
Expand All @@ -27,7 +29,7 @@ const generateTextToSpeech = async (req: Request, res: Response) => {
)
}

let provider: string, credentialId: string, voice: string, model: string
let provider: string, credentialId: string, voice: string, model: string, engine: string, region: string

if (chatflowId) {
let workspaceId = req.user?.activeWorkspaceId
Expand Down Expand Up @@ -64,12 +66,16 @@ const generateTextToSpeech = async (req: Request, res: Response) => {
credentialId = providerConfig.credentialId
voice = providerConfig.voice
model = providerConfig.model
engine = providerConfig.engine
region = providerConfig.region
} else {
// Use TTS config from request body
provider = bodyProvider
credentialId = bodyCredentialId
voice = bodyVoice
model = bodyModel
engine = bodyEngine
region = bodyRegion
}

if (!provider) {
Expand Down Expand Up @@ -103,7 +109,9 @@ const generateTextToSpeech = async (req: Request, res: Response) => {
name: provider,
credentialId: credentialId,
voice: voice,
model: model
model: model,
engine: engine,
region: region
}

// Create and store AbortController
Expand Down
3 changes: 2 additions & 1 deletion packages/server/src/services/text-to-speech/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ import { databaseEntities } from '../../utils'

/**
 * Text-to-speech providers known to the server-side TTS service.
 */
export enum TextToSpeechProvider {
    OPENAI = 'openai',
    ELEVEN_LABS = 'elevenlabs',
    AMAZON_POLLY = 'amazonPolly'
}

export interface TTSRequest {
Expand Down
1 change: 1 addition & 0 deletions packages/server/src/utils/buildChatflow.ts
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ const generateTTSForResponseStream = async (
const provider = config[providerKey]
if (provider && provider.status === true) {
activeProviderConfig = {
...provider,
name: providerKey,
credentialId: provider.credentialId,
voice: provider.voice,
Expand Down
1 change: 1 addition & 0 deletions packages/ui/src/assets/images/aws.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Loading