diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2e9eca2..6fe16c7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -12,7 +12,7 @@ jobs: strategy: matrix: - node-version: [14.x, 16.x, 18.x] + node-version: [22.x, 24.x] steps: - uses: actions/checkout@v3 @@ -26,7 +26,7 @@ jobs: run: npm install - name: Check formatting - run: node_modules/.bin/prettier --check $(find src -type f) + run: npm run format:check - name: Check lint run: npm run lint diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 0000000..b5555cd --- /dev/null +++ b/.prettierignore @@ -0,0 +1,7 @@ +node_modules +coverage +dist +dist.browser +tmp +npm-debug.log* + diff --git a/.travis.yml b/.travis.yml index b962165..70b329a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,4 @@ language: node_js node_js: - - "stable" + - 'stable' diff --git a/.vscode/launch.json b/.vscode/launch.json index 19fe8b0..95b961d 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -11,7 +11,7 @@ "internalConsoleOptions": "neverOpen", "disableOptimisticBPs": true, "windows": { - "program": "${workspaceFolder}/node_modules/jest/bin/jest", + "program": "${workspaceFolder}/node_modules/jest/bin/jest" } }, { @@ -19,16 +19,12 @@ "request": "launch", "name": "Jest Current File", "program": "${workspaceFolder}/node_modules/.bin/jest", - "args": [ - "${fileBasenameNoExtension}", - "--config", - "jest.config.js" - ], + "args": ["${fileBasenameNoExtension}", "--config", "jest.config.js"], "console": "integratedTerminal", "internalConsoleOptions": "neverOpen", "disableOptimisticBPs": true, "windows": { - "program": "${workspaceFolder}/node_modules/jest/bin/jest", + "program": "${workspaceFolder}/node_modules/jest/bin/jest" } } ] diff --git a/.vscode/settings.json b/.vscode/settings.json index 15a5edb..f2770c3 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,21 +1,21 @@ { - "cSpell.words": [ - "Aditi", - "Bixby", - "Celine", - 
"Conchita", - "Giorgio", - "Mathieu", - "Mizuki", - "Raveena", - "Salli", - "Takumi", - "dedent", - "implicity", - "speechmarkdown", - "ssml", - "transpiled", - "tsify", - "uglifyjs" - ] -} \ No newline at end of file + "cSpell.words": [ + "Aditi", + "Bixby", + "Celine", + "Conchita", + "Giorgio", + "Mathieu", + "Mizuki", + "Raveena", + "Salli", + "Takumi", + "dedent", + "implicity", + "speechmarkdown", + "ssml", + "transpiled", + "tsify", + "uglifyjs" + ] +} diff --git a/CHANGELOG.md b/CHANGELOG.md index 79679c0..e557cf1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,67 +1,88 @@ # Change Log + All notable changes to the speechmarkdown-js project will be documented in this file. ## 2.1.0 - (December 22, 2022) + ### Added + - Support for audio captions ## 2.0.0 - (October 28, 2021) + ### Added + - Support for `voice` and `language` for `google-assistant` - Formatters for `amazon-polly`, `amazon-polly-neural`, and `microsoft-azure` ## 0.8.0-beta.0 - (July 7, 2019) + ### Added + - Support for sections with the `voice` and `lang` tags ## 0.7.0-alpha.0 - (July 6, 2019) + ### Added + - Support for `audio` tag ## 0.6.0-alpha.0 - (July 6, 2019) + ### Added + - Support for `voice` and `lang` tags ## 0.5.0-alpha.0 - (July 5, 2019) + ### Fixed + - Issue #7 - Grammar - multiple modifiers for the same text + ### Added + - Grammar and formatters for standard: - - volume / vol - - rate - - pitch - - sub - - ipa + - volume / vol + - rate + - pitch + - sub + - ipa ## 0.4.0-alpha.0 - (June 30, 2019) + ### Added + - Update grammar and formatters for standard: - - emphasis - - address - - characters / chars - - date (skipped tests) - - expletive / bleep - - fraction (skipped tests) - - interjection - - number - - ordinal - - phone / telephone (skipped tests) - - time - - unit - - whisper + - emphasis + - address + - characters / chars + - date (skipped tests) + - expletive / bleep + - fraction (skipped tests) + - interjection + - number + - ordinal + - phone / telephone (skipped 
tests) + - time + - unit + - whisper - Add tests to increase coverage ## 0.3.0-alpha.0 - (June 30, 2019) + ### Added + - Update grammar and formatters for emphasis short format - Change speechmarkdown.toString(markdown) to speechmarkdown.toText(markdown) - ## 0.2.0-alpha.0 - (June 29, 2019) + ### Added + - CHANGELOG.md ### Update -- Links in package.json \ No newline at end of file + +- Links in package.json diff --git a/CODE-OF-CONDUCT.md b/CODE-OF-CONDUCT.md index c7d10b6..f691a60 100644 --- a/CODE-OF-CONDUCT.md +++ b/CODE-OF-CONDUCT.md @@ -20,4 +20,4 @@ This code of conduct provides guidance on participation in Speech Markdown-manag - Other conduct which could reasonably be considered inappropriate in a professional setting; - Advocating for or encouraging any of the above behaviors. -**Enforcement and Reporting Code of Conduct Issues.** Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting opensource-codeofconduct@speechmarkdown.org. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. \ No newline at end of file +**Enforcement and Reporting Code of Conduct Issues.** Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting opensource-codeofconduct@speechmarkdown.org. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 28d33ef..fc6ded0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -10,11 +10,11 @@ When you submit a pull request, our team is notified and will respond as quickly We look forward to receiving your pull requests for: -* New content you'd like to contribute (such as new code samples or tutorials) -* Inaccuracies in the content -* Information gaps in the content that need more detail to be complete -* Typos or grammatical errors -* Suggested rewrites that improve clarity and reduce confusion +- New content you'd like to contribute (such as new code samples or tutorials) +- Inaccuracies in the content +- Information gaps in the content that need more detail to be complete +- Typos or grammatical errors +- Suggested rewrites that improve clarity and reduce confusion **Note:** We all write differently, and you might not like how we've written or organized something currently. We want that feedback. But please be sure that your request for a rewrite is supported by the previous criteria. If it isn't, we might decline to merge it. @@ -45,7 +45,6 @@ In addition to written content, we really appreciate new examples and code sampl This project has adopted the [Speech Markdown Open Source Code of Conduct](https://github.com/speechmarkdown/speechmarkdown-js/blob/master/CODE-OF-CONDUCT). Contact [opensource-codeofconduct@speechmarkdown.org](mailto:opensource-codeofconduct@speechmarkdown.org) with any additional questions or comments. - ## Licensing See the [LICENSE](https://github.com/speechmarkdown/speechmarkdown-js/blob/master/LICENSE) file for this project's licensing. We will ask you to confirm the licensing of your contribution. We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 
diff --git a/README.md b/README.md index 8f78c16..c57f924 100644 --- a/README.md +++ b/README.md @@ -11,12 +11,19 @@ Supported platforms: - amazon-alexa - amazon-polly - amazon-polly-neural +- apple-avspeechsynthesizer - google-assistant +- ibm-watson - microsoft-azure +- microsoft-sapi +- w3c - samsung-bixby +- elevenlabs Find the architecture [here](./docs/architecture.md) +Platform-specific SSML notes are tracked in [`docs/platforms`](./docs/platforms/README.md). Use `npm run docs:update-voices` to refresh the auto-generated voice maps in `src/formatters/data` when vendor credentials are available. + ## Quick start ### SSML - Amazon Alexa @@ -126,9 +133,14 @@ Available options are: - "amazon-alexa" - "amazon-polly" - "amazon-polly-neural" + - "apple-avspeechsynthesizer" - "google-assistant" + - "ibm-watson" - "microsoft-azure" + - "microsoft-sapi" + - "w3c" - "samsung-bixby" + - "elevenlabs" - `includeFormatterComment` (boolean) - Adds an XML comment to the SSML output indicating the formatter used. Default is `false`. @@ -179,8 +191,14 @@ The biggest place we need help right now is with the completion of the grammar a - [x] emphasis - moderate - [x] emphasis - none - [x] emphasis - reduced -- [ ] ipa -- [ ] sub +- [x] ipa +- [x] sub + +Short-form examples: + +- `(pecan)/'pi.kæn/` → `pecan` +- `(Al){aluminum}` → `Al` +- `/ˈdeɪtə/` → `ipa` #### Standard Format diff --git a/docs/architecture.md b/docs/architecture.md index ba6875d..cad7aa0 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -1,13 +1,15 @@ # Architecture ## Simple Parser + Instead of a simple parser architecture as shown here: ![](./assets/simple-parser-diagram.png) ## Parser-Formatter Architecture + Speech Markdown is first translated into an Abstract Syntax Tree (AST) and a formatter transforms that into the correct format: ![](./assets/parser-formatter-diagram.png) -This is more powerful as formatters have the ability to customize the output based on the differences of each platform. 
\ No newline at end of file +This is more powerful as formatters have the ability to customize the output based on the differences of each platform. diff --git a/docs/platforms/README.md b/docs/platforms/README.md new file mode 100644 index 0000000..a6786bc --- /dev/null +++ b/docs/platforms/README.md @@ -0,0 +1,22 @@ +# Speech platform reference + +This directory contains reference notes about the SSML dialects that Speech Markdown targets. Each page describes: + +- Links to the vendor documentation for the dialect. +- Highlights of the current Speech Markdown formatter behaviour. +- Known gaps that are not currently translated by the formatter. +- A generated voice catalogue summarising the voices that expose the dialect when the provider shares the data programmatically. + +## Available platform notes + +- [Amazon Polly](./amazon-polly.md) +- [Amazon Alexa](./amazon-alexa.md) +- [Apple AVSpeechSynthesizer](./apple-avspeechsynthesizer.md) +- [Google Cloud Text-to-Speech](./google-cloud-tts.md) +- [IBM Watson Text to Speech](./ibm-watson-tts.md) +- [ElevenLabs prompt controls](./elevenlabs.md) +- [Microsoft Azure Speech Service](./azure.md) +- [W3C SSML](./w3c.md) +- [Microsoft Speech API (SAPI)](./microsoft-sapi.md) + +Voice catalogues are produced by the helper script `npm run docs:update-voices` which gathers voice metadata from the vendor APIs when credentials are available. The generated Markdown files live alongside the service documentation so that the catalogues can be versioned with the code base. 
diff --git a/docs/platforms/amazon-alexa.md b/docs/platforms/amazon-alexa.md new file mode 100644 index 0000000..ac0f4da --- /dev/null +++ b/docs/platforms/amazon-alexa.md @@ -0,0 +1,20 @@ +# Amazon Alexa SSML + +## Official resources + +- [Alexa Skills SSML reference](https://developer.amazon.com/en-US/docs/alexa/custom-skills/speech-synthesis-markup-language-ssml-reference.html) +- [Alexa voice catalogue](https://developer.amazon.com/en-US/docs/alexa/custom-skills/choose-the-voice-for-your-skill.html) +- [Designing with domains and emotions](https://developer.amazon.com/en-US/docs/alexa/custom-skills/speechcons-reference-interjections-for-alexa.html#expressive-ssml) + +## Speech Markdown formatter coverage + +- **Say-as rendering.** Inline modifiers such as `address`, `characters`, `date`, `interjection`, `number`, `ordinal`, `telephone`, `time`, and `unit` are mapped to `` with sensible defaults for date and time formats so Alexa pronunciation fixes can stay in Speech Markdown.【F:src/formatters/AmazonAlexaSsmlFormatter.ts†L76-L106】 +- **Amazon-specific prosody.** Speech Markdown exposes `whisper`, `amazon:domain` (`dj` and `newscaster` modifiers), and `amazon:emotion` for `excited` and `disappointed`, emitting the appropriate tags and intensity attributes that Alexa recognises.【F:src/formatters/AmazonAlexaSsmlFormatter.ts†L107-L145】【F:src/formatters/AmazonAlexaSsmlFormatter.ts†L183-L201】 +- **Voice fallback.** When a voice name is not present in the built-in whitelist, the formatter now falls back to emitting `` so newly launched Alexa voices (for example Lupe or Aria) still render without code changes.【F:src/formatters/AmazonAlexaSsmlFormatter.ts†L49-L51】【F:src/formatters/SsmlFormatterBase.ts†L44-L57】 +- **Section-level wrappers.** `lang` and `voice` section modifiers wrap larger blocks, and Speech Markdown keeps Amazon-specific `music` and `news` domains available for long-form sections.【F:src/formatters/AmazonAlexaSsmlFormatter.ts†L174-L205】 + +## Known gaps 
+ +- **Expressive extensions.** The formatter currently emits only `amazon:effect`, `amazon:domain`, and `amazon:emotion`, so features like ``, ``, ``, and the long-form `` still require manual SSML until new modifiers are defined.【F:src/formatters/AmazonAlexaSsmlFormatter.ts†L40-L46】【F:src/formatters/AmazonAlexaSsmlFormatter.ts†L183-L205】 +- **Voice metadata.** The built-in whitelist predates the expanded Alexa voice line-up and lacks locale metadata for the neural voices, so Speech Markdown relies on the new fallback behaviour instead of providing locale validation for every published voice.【F:src/formatters/AmazonAlexaSsmlFormatter.ts†L5-L33】 +- **No automated catalogue.** Unlike Azure, Google, Polly, and Watson, Alexa does not expose a public API for voice discovery, so the documentation cannot yet include a generated voice table and must be refreshed manually from the developer portal. diff --git a/docs/platforms/amazon-polly.md b/docs/platforms/amazon-polly.md new file mode 100644 index 0000000..3f72e40 --- /dev/null +++ b/docs/platforms/amazon-polly.md @@ -0,0 +1,28 @@ +# Amazon Polly SSML + +## Official resources + +- [Supported SSML tags](https://docs.aws.amazon.com/polly/latest/dg/supportedtags.html) +- [Voice catalogue](https://docs.aws.amazon.com/polly/latest/dg/voicelist.html) + +## Speech Markdown formatter coverage + +Speech Markdown ships two formatters for Amazon Polly. 
+ +### `amazon-polly` (standard engine) + +- **Say-as pronunciations.** Modifiers such as `address`, `cardinal`, `characters`, `digits`, `fraction`, `number`, `ordinal`, `telephone`, and `unit` render as `` with sensible defaults for dates and times, mirroring Polly's SSML support.【F:src/formatters/AmazonPollySsmlFormatter.ts†L47-L72】 +- **Pronunciation controls.** The formatter exposes ``, ``, and `` so aliasing, IPA phonemes, and rate, pitch, or volume adjustments can be driven from Speech Markdown.【F:src/formatters/AmazonPollySsmlFormatter.ts†L78-L93】 +- **Amazon-specific effects.** Polly-only modifiers such as `whisper`, `timbre`, and `drc` produce `amazon:effect` tags, while inline `lang` modifiers wrap content in `` for mixed-language prompts.【F:src/formatters/AmazonPollySsmlFormatter.ts†L74-L105】 +- **Known gaps.** Inline `voice`, `excited`, and `disappointed` modifiers are defined but intentionally left without SSML output, and section-level variants such as `newscaster` are also ignored, so these behaviours still require manual SSML.【F:src/formatters/AmazonPollySsmlFormatter.ts†L107-L151】 + +### `amazon-polly-neural` + +- **Shared say-as handling.** The neural formatter mirrors the standard engine for `address`, `characters`, `digits`, `fraction`, `number`, `ordinal`, `telephone`, `unit`, `date`, and `time` modifiers so pronunciation fixes work across both engines.【F:src/formatters/AmazonPollyNeuralSsmlFormatter.ts†L41-L67】 +- **Pronunciation helpers.** `sub`, `ipa`, and the rate or volume prosody controls are preserved, and `lang` plus `drc` continue to emit `` and `amazon:effect` tags respectively.【F:src/formatters/AmazonPollyNeuralSsmlFormatter.ts†L69-L91】 +- **Neural-only domains.** Section-level `newscaster` modifiers wrap content in `` to reach Polly's neural news style.【F:src/formatters/AmazonPollyNeuralSsmlFormatter.ts†L115-L134】 +- **Known gaps.** Neural voices do not currently expose `emphasis`, `whisper`, `voice`, `excited`, or `disappointed` 
output because the formatter drops those modifiers, matching the limitations of Polly's neural styles.【F:src/formatters/AmazonPollyNeuralSsmlFormatter.ts†L93-L145】 + +## Voice catalogue + +Run `npm run docs:update-voices` with either `AWS_ACCESS_KEY_ID`/`AWS_SECRET_ACCESS_KEY` plus `AWS_REGION` (or `AWS_DEFAULT_REGION`) or the `POLLY_AWS_KEY_ID`/`POLLY_AWS_ACCESS_KEY`/`POLLY_REGION` equivalents to regenerate `data/amazon-polly-voices.md`. The helper script calls Polly's `ListVoices` API (with additional language codes enabled) and writes a Markdown table of each voice's identifier, language, gender, and supported engines so formatter validations stay aligned with Amazon's inventory. diff --git a/docs/platforms/apple-avspeechsynthesizer.md b/docs/platforms/apple-avspeechsynthesizer.md new file mode 100644 index 0000000..dfb06a1 --- /dev/null +++ b/docs/platforms/apple-avspeechsynthesizer.md @@ -0,0 +1,14 @@ +# Apple AVSpeechSynthesizer voices + +## Official resources + +- [AVSpeechSynthesizer documentation](https://developer.apple.com/documentation/avfaudio/avspeechsynthesizer) +- [SSML support overview](https://developer.apple.com/documentation/avfoundation/speech_synthesis) + +## Speech Markdown status + +Speech Markdown now exposes an AVSpeechSynthesizer formatter that focuses on the subset of SSML Apple accepts. The formatter enables `say-as` for characters and numbers, preserves substitution and IPA hints, and keeps voice selections in the output while intentionally ignoring unsupported prosody keys such as `rate`, `pitch`, `volume`, and `whisper` so AVSpeechSynthesizer falls back to native utterance configuration.【F:src/formatters/FormatterFactory.ts†L1-L39】【F:src/formatters/AppleAvSpeechSynthesizerSsmlFormatter.ts†L6-L136】 + +## Voice catalogue + +macOS, iOS, and iPadOS ship a large number of built-in system voices that vary by OS version and user downloads. 
The helper script can create or refresh `data/apple-avspeechsynthesizer-voices.md` as a staging area for curated lists gathered from `AVSpeechSynthesisVoice.speechVoices()` output when maintainers have access to Apple hardware. Export the array to JSON (for example using a small Swift snippet) and set `APPLE_VOICE_EXPORT` before running `npm run docs:update-voices` to update the table. diff --git a/docs/platforms/azure.md b/docs/platforms/azure.md new file mode 100644 index 0000000..cdebde1 --- /dev/null +++ b/docs/platforms/azure.md @@ -0,0 +1,25 @@ +# Microsoft Azure Speech Service SSML + +## Official resources + +- [SSML structure reference](https://learn.microsoft.com/azure/ai-services/speech-service/speech-synthesis-markup-structure) +- [Voice gallery](https://learn.microsoft.com/azure/ai-services/speech-service/language-support?tabs=tts) + +## Speech Markdown formatter coverage + +Speech Markdown's `microsoft-azure` formatter layers Azure-specific behaviour on top of the shared SSML mapping: + +- **Say-as conversions.** Speech Markdown forwards modifiers such as `address`, `fraction`, `ordinal`, `telephone`, `number`, and `characters` to `` while automatically choosing `cardinal` or `digits` for numeric text.【F:src/formatters/MicrosoftAzureSsmlFormatter.ts†L9-L48】 +- **Dates and times.** The formatter emits `` and `` with Azure's default `ymd` and `hms12` formats when no explicit format is supplied.【F:src/formatters/MicrosoftAzureSsmlFormatter.ts†L49-L58】 +- **Pronunciation helpers.** `sub` and `ipa` modifiers become `` and ``, letting authors control pronunciation directly from Speech Markdown.【F:src/formatters/MicrosoftAzureSsmlFormatter.ts†L59-L66】 +- **Prosody and whispering.** Rate, pitch, and volume modifiers augment `` tags, and the `whisper` modifier approximates whispered delivery with `volume="x-soft"` and `rate="slow"` settings as recommended by 
Microsoft.【F:src/formatters/MicrosoftAzureSsmlFormatter.ts†L22-L27】【F:src/formatters/MicrosoftAzureSsmlFormatter.ts†L67-L75】 +- **Voice and style selection.** Inline `voice` modifiers add `` tags, and the section-level `newscaster` modifier wraps content in `` so maintainers can target Azure's neural styles.【F:src/formatters/MicrosoftAzureSsmlFormatter.ts†L23-L27】【F:src/formatters/MicrosoftAzureSsmlFormatter.ts†L76-L103】 + +### Unsupported or manual features + +- The formatter explicitly disables Azure-only constructs such as `emphasis`, `expletive`, `interjection`, and `unit`, so those modifiers currently do not produce SSML output.【F:src/formatters/MicrosoftAzureSsmlFormatter.ts†L8-L17】 +- Additional expressive behaviours—including `excited`, `disappointed`, and other MSTTS styles—remain unmapped because the shared SSML base leaves those modifiers set to `null` pending future design work.【F:src/formatters/SsmlFormatterBase.ts†L63-L86】 + +## Voice catalogue + +The generated catalogue `data/azure-voices.md` is produced by `npm run docs:update-voices` when either `AZURE_SPEECH_KEY`/`AZURE_SPEECH_REGION` or `MICROSOFT_TOKEN`/`MICROSOFT_REGION` environment variables are supplied. The file lists every voice name, locale, gender, type, style, and sample rate returned by the Speech Service REST API so that formatter validations can remain current. diff --git a/docs/platforms/elevenlabs.md b/docs/platforms/elevenlabs.md new file mode 100644 index 0000000..53a0ef9 --- /dev/null +++ b/docs/platforms/elevenlabs.md @@ -0,0 +1,19 @@ +# ElevenLabs prompt controls + +- **Vendor documentation:** [Prompt controls reference](https://elevenlabs.io/docs/best-practices/prompting/controls) +- **Formatter entry point:** `platform: 'elevenlabs'` + +## Implemented Speech Markdown features + +The ElevenLabs API does not accept full SSML documents. 
The formatter therefore emits plain text with the inline tags that the service recognises: + +- `[break:"<time>"]` → `<break time="<time>" />` +- `[break:"<strength>"]` → `<break time="<seconds>" />` with strength values mapped to approximate second values the engine accepts (`none` → `0s`, `x-weak` → `0.2s`, `weak` → `0.35s`, `medium` → `0.5s`, `strong` → `0.8s`, `x-strong` → `1.2s`). +- `(word)[ipa:"<phonemes>"]` → `<phoneme alphabet="ipa" ph="<phonemes>">word</phoneme>` + +All other Speech Markdown modifiers fall back to plain text because ElevenLabs relies on narrative prompting rather than SSML equivalents for emphasis, say-as, or voice selection. + +## Notes + +- ElevenLabs does not publish a programmatic voice catalogue. The formatter expects the caller to choose a voice through the API request payload instead of inside the generated prompt. +- The formatter honours `includeFormatterComment` by inserting an HTML comment at the start of the prompt, but `includeSpeakTag` and `includeParagraphTag` are ignored because the target does not support container tags. diff --git a/docs/platforms/google-cloud-tts.md b/docs/platforms/google-cloud-tts.md new file mode 100644 index 0000000..81261ec --- /dev/null +++ b/docs/platforms/google-cloud-tts.md @@ -0,0 +1,25 @@ +# Google Cloud Text-to-Speech SSML + +## Official resources + +- [SSML reference](https://cloud.google.com/text-to-speech/docs/ssml) +- [Voice list](https://cloud.google.com/text-to-speech/docs/voices) + +## Speech Markdown formatter coverage + +Speech Markdown's `google-assistant` formatter targets the SSML dialect that Google Cloud Text-to-Speech accepts: + +- **Emphasis and say-as mappings.** Modifiers like `emphasis`, `address`, `fraction`, `number`, `ordinal`, `telephone`, and `unit` map to `<emphasis>` or `<say-as>` tags so common pronunciations flow through automatically.【F:src/formatters/GoogleAssistantSsmlFormatter.ts†L25-L49】 +- **Date and time formatting.** Speech Markdown emits `<say-as interpret-as="date">` and `<say-as interpret-as="time">` with Google's default `ymd` and `hms12` formats unless authors override them.【F:src/formatters/GoogleAssistantSsmlFormatter.ts†L50-L57】 +- 
**Pronunciation control.** `sub` and `ipa` generate `` and `` tags respectively, while rate, pitch, and volume modifiers augment `` and the `whisper` modifier applies Google's recommended soft volume and slow rate.【F:src/formatters/GoogleAssistantSsmlFormatter.ts†L58-L77】 +- **Language and voice selection.** Inline and section-level `lang` modifiers wrap content in ``, and `voice` modifiers delegate to the formatter's built-in voice catalogue so that canonical Google Assistant names resolve without manual SSML.【F:src/formatters/GoogleAssistantSsmlFormatter.ts†L20-L35】【F:src/formatters/GoogleAssistantSsmlFormatter.ts†L78-L113】 + +### Unsupported or manual features + +- The formatter disables the `interjection` modifier because Google Cloud's SSML does not provide a direct equivalent today.【F:src/formatters/GoogleAssistantSsmlFormatter.ts†L24-L27】 +- Speech Markdown still ships an older, hard-coded `validVoices` map that only covers the legacy Assistant inventory, so Cloud TTS voices such as WaveNet, Neural2, Studio, and Polyglot variants require manual SSML until the formatter is updated to query the modern voice list.【F:src/formatters/GoogleAssistantSsmlFormatter.ts†L6-L34】 +- Expressive domains like `newscaster`, `excited`, or `disappointed` fall back to raw text because those modifiers are left unset in the shared SSML base pending a Google-specific design.【F:src/formatters/SsmlFormatterBase.ts†L63-L86】 + +## Voice catalogue + +Run `npm run docs:update-voices` with a `GOOGLE_TTS_API_KEY` to regenerate `data/google-cloud-voices.md`. The generated table captures the voice name, languages, gender, and natural sample rate returned by the Text-to-Speech REST API so that formatter validation can keep pace with Google's evolving inventory. 
diff --git a/docs/platforms/ibm-watson-tts.md b/docs/platforms/ibm-watson-tts.md new file mode 100644 index 0000000..3b70a5f --- /dev/null +++ b/docs/platforms/ibm-watson-tts.md @@ -0,0 +1,14 @@ +# IBM Watson Text to Speech SSML + +## Official resources + +- [SSML documentation](https://cloud.ibm.com/apidocs/text-to-speech#synthesize) +- [Voice list](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-voices) + +## Speech Markdown status + +Speech Markdown now ships an IBM Watson formatter that translates the core modifiers into Watson-compatible SSML. The implementation covers `say-as`, `sub`, `phoneme`, `emphasis`, and prosody attributes, while leaving Watson-exclusive constructs such as `<express-as>` or voice transformations untouched so the generated markup stays within the documented subset of the service.【F:src/formatters/FormatterFactory.ts†L1-L39】【F:src/formatters/IbmWatsonSsmlFormatter.ts†L6-L213】 + +## Voice catalogue + +Provide `WATSON_TTS_URL` (e.g. `https://api.us-south.text-to-speech.watson.cloud.ibm.com`) and `WATSON_TTS_API_KEY` to `npm run docs:update-voices` to refresh `data/ibm-watson-voices.md`. The script queries `/v1/voices` and writes a Markdown table that includes each voice's name, language, gender, and available expressive features so formatter validations can be cross-checked. diff --git a/docs/platforms/microsoft-sapi.md b/docs/platforms/microsoft-sapi.md new file mode 100644 index 0000000..f61e39a --- /dev/null +++ b/docs/platforms/microsoft-sapi.md @@ -0,0 +1,13 @@ +# Microsoft Speech API (SAPI) voices + +## Official resources + +- [SAPI 5 XML reference](https://learn.microsoft.com/en-us/previous-versions/windows/desktop/ee431815%28v=vs.85%29) + +## Speech Markdown status + +Speech Markdown routes Microsoft SAPI output through the shared W3C formatter so the generated SSML stays aligned with the [W3C Speech Synthesis Recommendation](https://www.w3.org/TR/speech-synthesis/). 
The shared formatter covers emphasis, `say-as`, prosody, phonemes, `lang`, and voice tags and falls back to `` when a voice is not present in the configured catalogue.【F:src/formatters/W3cSsmlFormatter.ts†L1-L210】 The SAPI adapter simply reuses that implementation while swapping the formatter comment so maintainers can tell the target engine in generated markup.【F:src/formatters/MicrosoftSapiSsmlFormatter.ts†L1-L11】【F:src/formatters/FormatterFactory.ts†L1-L40】 Unsupported Speech Markdown modifiers such as whisper effects or vendor-specific styles are ignored because the W3C dialect does not define them. + +## Voice catalogue + +SAPI voice availability is determined by the voices installed on the host machine. Developers can enumerate the voices locally via PowerShell (`Get-SPVoice`) or .NET (`SpeechSynthesizer.GetInstalledVoices()`). Documenting the complete list in-repo is impractical because it varies by Windows SKU and third-party voice packs. When maintainers export the installed voices to JSON (for example `Get-InstalledVoices | ConvertTo-Json > voices.json`) and set `SAPI_VOICE_EXPORT` to that file, `npm run docs:update-voices` will regenerate `data/microsoft-sapi-voices.md` with the captured baseline. diff --git a/docs/platforms/w3c.md b/docs/platforms/w3c.md new file mode 100644 index 0000000..63ff99a --- /dev/null +++ b/docs/platforms/w3c.md @@ -0,0 +1,20 @@ +# W3C SSML + +## Official resources + +- [Speech Synthesis Markup Language (SSML) Version 1.1 Recommendation](https://www.w3.org/TR/speech-synthesis/) + +## Speech Markdown status + +Speech Markdown exposes a `w3c` formatter that emits spec-compliant SSML and serves as the foundation for engines that consume the standard, including Microsoft SAPI.【F:src/formatters/W3cSsmlFormatter.ts†L1-L210】【F:src/formatters/FormatterFactory.ts†L1-L40】 The formatter supports: + +- `` with optional level fallbacks when not supplied in the markup. 
+- `<say-as>` for numbers, digits, addresses, dates, times, ordinals, interjections, expletives, telephone numbers, and generic units. +- `<prosody>` attributes for rate, pitch, and volume adjustments on the same element. +- `<phoneme>` for IPA transcriptions. +- `<sub>` substitutions. +- `<lang>` wrappers at inline and section scope. +- `<voice>` tags driven by configured catalogues, with a fallback to `name="…"` for unlisted voices. +- `` and `