diff --git a/data/onPostBuild/__snapshots__/transpileMdxToMarkdown.test.ts.snap b/data/onPostBuild/__snapshots__/transpileMdxToMarkdown.test.ts.snap index e1fa016b40..392b7cf05d 100644 --- a/data/onPostBuild/__snapshots__/transpileMdxToMarkdown.test.ts.snap +++ b/data/onPostBuild/__snapshots__/transpileMdxToMarkdown.test.ts.snap @@ -34,7 +34,10 @@ Use your-api-key and your-channel-name in your code. ## Code blocks -\`\`\`javascript + +### Javascript + +\`\`\` const channel = realtime.channels.get('your-channel-name'); \`\`\` diff --git a/data/onPostBuild/transpileMdxToMarkdown.test.ts b/data/onPostBuild/transpileMdxToMarkdown.test.ts index 86fcdd759b..4904852fad 100644 --- a/data/onPostBuild/transpileMdxToMarkdown.test.ts +++ b/data/onPostBuild/transpileMdxToMarkdown.test.ts @@ -11,6 +11,10 @@ import { convertRelativeUrls, replaceTemplateVariables, calculateOutputPath, + getLanguageDisplayName, + findPrecedingHeadingLevel, + transformCodeBlocksWithSubheadings, + addLanguageSubheadingsToCodeBlocks, } from './transpileMdxToMarkdown'; import * as fs from 'fs'; import * as path from 'path'; @@ -622,4 +626,326 @@ Real prop: link: '/docs/presence'`; expect(output).toMatch(/public\/docs\/chat\/moderation\/direct\/bodyguard\.md$/); }); }); + + describe('getLanguageDisplayName', () => { + it('should capitalize simple language names', () => { + expect(getLanguageDisplayName('javascript')).toBe('Javascript'); + expect(getLanguageDisplayName('kotlin')).toBe('Kotlin'); + expect(getLanguageDisplayName('swift')).toBe('Swift'); + }); + + it('should handle underscore-separated variants', () => { + expect(getLanguageDisplayName('realtime_javascript')).toBe('Realtime Javascript'); + expect(getLanguageDisplayName('rest_python')).toBe('Rest Python'); + }); + + it('should handle empty string', () => { + expect(getLanguageDisplayName('')).toBe(''); + }); + + it('should handle single character', () => { + expect(getLanguageDisplayName('a')).toBe('A'); + }); + }); + + 
describe('findPrecedingHeadingLevel', () => { + it('should return 3 when no heading is found (so +1 gives h4 default)', () => { + const content = 'Some text without headings'; + expect(findPrecedingHeadingLevel(content, content.length)).toBe(3); + }); + + it('should find h1 heading level', () => { + const content = '# Main Title\n\nSome content'; + expect(findPrecedingHeadingLevel(content, content.length)).toBe(1); + }); + + it('should find h2 heading level', () => { + const content = '## Section\n\nSome content'; + expect(findPrecedingHeadingLevel(content, content.length)).toBe(2); + }); + + it('should find the nearest preceding heading', () => { + const content = '# Title\n\n## Section\n\n### Subsection\n\nContent here'; + expect(findPrecedingHeadingLevel(content, content.length)).toBe(3); + }); + + it('should only consider headings before the given position', () => { + const content = '## First\n\nContent\n\n### Second'; + const positionBeforeSecond = content.indexOf('### Second'); + expect(findPrecedingHeadingLevel(content, positionBeforeSecond)).toBe(2); + }); + + it('should handle h6 heading level', () => { + const content = '###### Deep heading\n\nContent'; + expect(findPrecedingHeadingLevel(content, content.length)).toBe(6); + }); + }); + + describe('transformCodeBlocksWithSubheadings', () => { + it('should transform code blocks with subheadings and remove language from fence', () => { + const input = ` +\`\`\`javascript +const x = 1; +\`\`\` +`; + const output = transformCodeBlocksWithSubheadings(input, '###'); + expect(output).toContain('### Javascript'); + expect(output).toContain('```\nconst x = 1;'); + expect(output).not.toContain('```javascript'); + }); + + it('should return null when no code blocks with language identifiers', () => { + const input = ` +\`\`\` +const x = 1; +\`\`\` +`; + const output = transformCodeBlocksWithSubheadings(input, '###'); + expect(output).toBeNull(); + }); + + it('should handle multiple code blocks', () => { + const input 
= ` +\`\`\`javascript +const x = 1; +\`\`\` + +\`\`\`python +x = 1 +\`\`\` +`; + const output = transformCodeBlocksWithSubheadings(input, '####'); + expect(output).toContain('#### Javascript'); + expect(output).toContain('#### Python'); + expect(output).not.toContain('```javascript'); + expect(output).not.toContain('```python'); + }); + }); + + describe('addLanguageSubheadingsToCodeBlocks', () => { + it('should add subheadings to multiple code blocks within tags and remove language from fence', () => { + const input = ` +\`\`\`javascript +const x = 1; +\`\`\` + +\`\`\`kotlin +val x = 1 +\`\`\` +`; + const output = addLanguageSubheadingsToCodeBlocks(input); + expect(output).toContain('#### Javascript'); + expect(output).toContain('#### Kotlin'); + // Language should be removed from fenced code blocks + expect(output).not.toContain('```javascript'); + expect(output).not.toContain('```kotlin'); + expect(output).toContain('```\nconst x = 1;'); + expect(output).toContain('```\nval x = 1'); + }); + + it('should handle realtime/rest SDK variants', () => { + const input = ` +\`\`\`realtime_javascript +const channel = realtime.channels.get('test'); +\`\`\` + +\`\`\`rest_javascript +const channel = rest.channels.get('test'); +\`\`\` +`; + const output = addLanguageSubheadingsToCodeBlocks(input); + expect(output).toContain('#### Realtime Javascript'); + expect(output).toContain('#### Rest Javascript'); + // Language should be removed from fenced code blocks + expect(output).not.toContain('```realtime_javascript'); + expect(output).not.toContain('```rest_javascript'); + }); + + it('should handle tags with attributes like fixed="true"', () => { + const input = ` +\`\`\`javascript +const x = 1; +\`\`\` +`; + const output = addLanguageSubheadingsToCodeBlocks(input); + expect(output).toContain('#### Javascript'); + // Language should be removed from fenced code blocks + expect(output).not.toContain('```javascript'); + expect(output).toContain('```\nconst x = 1;'); + }); + + 
it('should handle code blocks without a language identifier', () => { + const input = ` +\`\`\` +const x = 1; +\`\`\` +`; + const output = addLanguageSubheadingsToCodeBlocks(input); + // Code blocks without language should be returned as-is (no subheading added) + expect(output).not.toContain('####'); + expect(output).toContain('```\nconst x = 1;'); + }); + + it('should not modify code blocks outside tags', () => { + const input = `\`\`\`javascript +const x = 1; +\`\`\` + +\`\`\`kotlin +val x = 1 +\`\`\``; + const output = addLanguageSubheadingsToCodeBlocks(input); + expect(output).not.toContain('####'); + expect(output).toBe(input); + }); + + it('should preserve code block content', () => { + const input = ` +\`\`\`javascript +const channel = realtime.channels.get('{{RANDOM_CHANNEL_NAME}}'); +channel.subscribe((message) => { + console.log(message); +}); +\`\`\` + +\`\`\`python +channel = realtime.channels.get('channel-name') +def on_message(message): + print(message) +channel.subscribe(on_message) +\`\`\` +`; + const output = addLanguageSubheadingsToCodeBlocks(input); + expect(output).toContain("const channel = realtime.channels.get('{{RANDOM_CHANNEL_NAME}}');"); + expect(output).toContain("channel = realtime.channels.get('channel-name')"); + expect(output).toContain('console.log(message);'); + expect(output).toContain('print(message)'); + }); + + it('should handle empty tags', () => { + const input = ``; + const output = addLanguageSubheadingsToCodeBlocks(input); + expect(output).toBe(''); + }); + + it('should handle tags with only whitespace', () => { + const input = ` + +`; + const output = addLanguageSubheadingsToCodeBlocks(input); + expect(output).toBe(input); + }); + + it('should handle code blocks with Windows-style line endings', () => { + const input = `\r\n\`\`\`javascript\r\nconst x = 1;\r\n\`\`\`\r\n`; + const output = addLanguageSubheadingsToCodeBlocks(input); + expect(output).toContain('#### Javascript'); + // Language should be removed from fenced 
code blocks + expect(output).not.toContain('```javascript'); + }); + + it('should handle language identifiers with hyphens', () => { + const input = ` +\`\`\`objective-c +NSLog(@"Hello"); +\`\`\` +`; + const output = addLanguageSubheadingsToCodeBlocks(input); + expect(output).toContain('#### Objective-c'); + // Language should be removed from fenced code blocks + expect(output).not.toContain('```objective-c'); + }); + + it('should handle language identifiers with special characters', () => { + const input = ` +\`\`\`shell-session +$ npm install +\`\`\` +`; + const output = addLanguageSubheadingsToCodeBlocks(input); + expect(output).toContain('#### Shell-session'); + // Language should be removed from fenced code blocks + expect(output).not.toContain('```shell-session'); + }); + + it('should use h3 subheading when preceded by h2 heading', () => { + const input = `## Section Title + + +\`\`\`javascript +const x = 1; +\`\`\` +`; + const output = addLanguageSubheadingsToCodeBlocks(input); + expect(output).toContain('### Javascript'); + expect(output).not.toContain('#### Javascript'); + }); + + it('should use h4 subheading when preceded by h3 heading', () => { + const input = `### Subsection Title + + +\`\`\`javascript +const x = 1; +\`\`\` +`; + const output = addLanguageSubheadingsToCodeBlocks(input); + expect(output).toContain('#### Javascript'); + }); + + it('should use h5 subheading when preceded by h4 heading', () => { + const input = `#### Deep Section + + +\`\`\`javascript +const x = 1; +\`\`\` +`; + const output = addLanguageSubheadingsToCodeBlocks(input); + expect(output).toContain('##### Javascript'); + }); + + it('should use h7 when preceded by h6 heading (no cap for LLM consumption)', () => { + const input = `###### Deepest Section + + +\`\`\`javascript +const x = 1; +\`\`\` +`; + const output = addLanguageSubheadingsToCodeBlocks(input); + expect(output).toContain('####### Javascript'); + }); + + it('should use h4 as default when no preceding heading', () 
/**
 * Get the display name for a language identifier.
 *
 * Capitalizes the first letter of each underscore-separated word and joins the
 * words with a single space (e.g. `javascript` -> `Javascript`,
 * `realtime_javascript` -> `Realtime Javascript`). Hyphens are left untouched
 * (`objective-c` -> `Objective-c`).
 *
 * @param lang - Language identifier taken from a fenced code block.
 * @returns The human-readable label, or an empty string for empty input.
 */
function getLanguageDisplayName(lang: string): string {
  if (!lang) {
    return '';
  }

  return (
    lang
      .split('_')
      // Doubled, leading or trailing underscores yield empty parts; dropping them
      // avoids stray spaces in the generated heading.
      .filter((part) => part.length > 0)
      .map((part) => part.charAt(0).toUpperCase() + part.slice(1))
      .join(' ')
  );
}

/**
 * Find the heading level of the nearest heading that precedes a given position.
 * Used to determine the appropriate subheading level for code block language labels.
 *
 * Lines inside fenced code blocks are ignored, so a shell or Python comment such
 * as `# install` is not mistaken for an h1 heading.
 *
 * @param content - Full markdown document.
 * @param position - Offset into `content`; only text before it is inspected.
 * @returns The number of `#` characters of the last heading found, or 3 when
 *   there is none (so that callers adding 1 default to h4).
 */
function findPrecedingHeadingLevel(content: string, position: number): number {
  const fencePattern = /^\s*(`{3,}|~{3,})(.*)$/;
  const headingPattern = /^(#+)\s/;

  let lastHeadingLevel = 3; // Defaults to 3 when no heading is found
  let openFence: string | null = null; // Marker of the fence we are currently inside, if any

  for (const line of content.substring(0, position).split(/\r?\n/)) {
    const fence = fencePattern.exec(line);

    if (openFence === null) {
      // A backtick fence cannot carry backticks in its info string; such a line is inline code.
      if (fence && !(fence[1].startsWith('`') && fence[2].includes('`'))) {
        openFence = fence[1];
        continue;
      }
      const heading = headingPattern.exec(line);
      if (heading) {
        lastHeadingLevel = heading[1].length;
      }
      continue;
    }

    // Inside a fence: only a bare marker of the same character, at least as long
    // as the opening one, closes it. Everything else is code.
    if (fence && fence[1][0] === openFence[0] && fence[1].length >= openFence.length && fence[2].trim() === '') {
      openFence = null;
    }
  }

  return lastHeadingLevel;
}

/**
 * Transform the code blocks found inside a `<Code>` element by adding a language
 * subheading before each one and removing the language identifier from the fence.
 *
 * @param innerContent - Text between the opening and closing `<Code>` tags.
 * @param headingPrefix - The `#` characters to put in front of the language name.
 * @returns The transformed text, or `null` when no fenced block carries a
 *   language identifier (the caller then leaves the element untouched).
 */
function transformCodeBlocksWithSubheadings(innerContent: string, headingPrefix: string): string | null {
  // Match ```language, the code, and the closing ```.
  // [^\r\n`]+ accepts identifiers with hyphens, plus signs and dots (objective-c, c++, shell-session)
  // while keeping the carriage return of a Windows line ending out of the captured name.
  const codeBlockRegex = /```([^\r\n`]+)\r?\n([\s\S]*?)```/g;
  let labelledBlocks = 0;

  const transformed = innerContent.replace(
    codeBlockRegex,
    (codeBlock: string, infoString: string, codeContent: string) => {
      const lang = infoString.trim();
      if (lang === '') {
        return codeBlock; // Whitespace-only info string: nothing to label
      }
      labelledBlocks += 1;
      return `${headingPrefix} ${getLanguageDisplayName(lang)}\n\n\`\`\`\n${codeContent}\`\`\``;
    },
  );

  return labelledBlocks > 0 ? transformed : null;
}

/**
 * Add language subheadings before each code block within `<Code>` elements.
 * This makes it easier for LLMs to identify which language each code snippet belongs to.
 * - Removes the language identifier from fenced code blocks (the subheading carries it instead)
 * - Chooses the heading level dynamically: one deeper than the nearest preceding heading
 * - Leaves code blocks outside `<Code>` elements, and elements without labelled blocks, unchanged
 *
 * NOTE(review): the tag literals were not legible in the reviewed copy of this
 * function. This version matches `<Code ...>` case-insensitively and re-emits the
 * matched opening and closing tags verbatim - confirm against the snapshot tests.
 *
 * @param content - Markdown document to process.
 * @returns The document with labelled code blocks.
 */
function addLanguageSubheadingsToCodeBlocks(content: string): string {
  // Opening tag with optional attributes, lazily matched body, closing tag.
  // Requiring whitespace before any attributes keeps names like <CodeSnippet> from matching.
  const codeTagRegex = /(<Code(?:\s[^>]*)?>)([\s\S]*?)(<\/Code>)/gi;

  return content.replace(
    codeTagRegex,
    (fullMatch: string, openTag: string, innerContent: string, closeTag: string, offset: number) => {
      // Headings are looked up in the untouched input, so subheadings inserted for
      // earlier elements never influence the level chosen here.
      const precedingLevel = findPrecedingHeadingLevel(content, offset);
      const headingPrefix = '#'.repeat(precedingLevel + 1);

      const transformedContent = transformCodeBlocksWithSubheadings(innerContent, headingPrefix);
      if (transformedContent === null) {
        return fullMatch; // No code blocks with language - return unchanged
      }

      // A blank line after the opening tag lets markdown parse the subheading
      return `${openTag}\n\n${transformedContent.trimStart()}${closeTag}`;
    },
  );
}
@@ function transformMdxToMarkdown( // Stage 11: Replace template variables content = replaceTemplateVariables(content); - // Stage 12: Prepend title as markdown heading + // Stage 12: Add language subheadings to code blocks within tags + content = addLanguageSubheadingsToCodeBlocks(content); + + // Stage 13: Prepend title as markdown heading const finalContent = `# ${title}\n\n${intro ? `${intro}\n\n` : ''}${content}`; return { content: finalContent, title, intro }; @@ -601,4 +682,8 @@ export { replaceTemplateVariables, calculateOutputPath, transformMdxToMarkdown, + getLanguageDisplayName, + findPrecedingHeadingLevel, + transformCodeBlocksWithSubheadings, + addLanguageSubheadingsToCodeBlocks, };