diff --git a/.agents/plugins/marketplace.json b/.agents/plugins/marketplace.json index 83918a63..039d24c8 100644 --- a/.agents/plugins/marketplace.json +++ b/.agents/plugins/marketplace.json @@ -1406,6 +1406,18 @@ }, "category": "Design" }, + { + "name": "heygen", + "source": { + "source": "local", + "path": "./plugins/heygen" + }, + "policy": { + "installation": "AVAILABLE", + "authentication": "ON_INSTALL" + }, + "category": "Design" + }, { "name": "supabase", "source": { diff --git a/plugins/heygen/.app.json b/plugins/heygen/.app.json new file mode 100644 index 00000000..8ac6cc73 --- /dev/null +++ b/plugins/heygen/.app.json @@ -0,0 +1,7 @@ +{ + "apps": { + "heygen": { + "id": "asdk_app_69418aad55e08191aa5e437b649ca2e4" + } + } +} diff --git a/plugins/heygen/.codex-plugin/plugin.json b/plugins/heygen/.codex-plugin/plugin.json new file mode 100644 index 00000000..5400e5e9 --- /dev/null +++ b/plugins/heygen/.codex-plugin/plugin.json @@ -0,0 +1,44 @@ +{ + "name": "heygen", + "version": "2.2.0", + "description": "Create HeyGen avatar videos and personalized video messages. Build a persistent digital identity from a photo, then generate presenter-led videos with your digital twin.", + "author": { + "name": "HeyGen", + "email": "developers@heygen.com", + "url": "https://heygen.com" + }, + "homepage": "https://heygen.com", + "repository": "https://github.com/heygen-com/skills", + "license": "MIT", + "keywords": [ + "heygen", + "avatar", + "identity", + "video", + "digital-twin", + "video-message", + "presenter", + "talking-head", + "ai-avatar", + "avatar-video" + ], + "skills": "./skills/", + "apps": "./.app.json", + "interface": { + "displayName": "HeyGen", + "shortDescription": "Avatar videos and personalized video messages", + "longDescription": "HeyGen Skills give your agent a face, a voice, and the ability to send video like a message. Use heygen-avatar to build a persistent digital identity from a photo and pick a voice, then heygen-video to generate identity-first presenter videos via the HeyGen v3 Video Agent pipeline (avatar resolution, aspect ratio correction, prompt engineering, and voice selection are handled automatically).", + "developerName": "HeyGen", + "category": "Design", + "capabilities": ["Read", "Write"], + "websiteURL": "https://heygen.com", + "defaultPrompt": [ + "Create my HeyGen avatar from this photo", + "Make a 30-second intro video of myself", + "Send a video update to my team about this week's progress" + ], + "brandColor": "#0a0a0a", + "composerIcon": "./assets/icon.png", + "logo": "./assets/logo.png" + } +} diff --git a/plugins/heygen/README.md b/plugins/heygen/README.md new file mode 100644 index 00000000..3d6d58d9 --- /dev/null +++ b/plugins/heygen/README.md @@ -0,0 +1,25 @@ +# heygen + +OpenAI Codex plugin for [HeyGen](https://heygen.com) — create AI avatar videos and personalized video messages. + +## What's included + +Two skills that chain together: + +- **heygen-avatar** — turn a photo into a persistent digital twin. Handles avatar lookup, instant-avatar creation, voice selection (or voice cloning), and writes an `AVATAR` file the video skill reads back. +- **heygen-video** — generate identity-first presenter videos via the HeyGen v3 Video Agent pipeline. Encodes the prompting, asset routing, aspect-ratio correction, and avatar/voice resolution that good HeyGen videos need. +- **HeyGen app reference** — `.app.json` points at the curated [HeyGen ChatGPT app](https://chatgpt.com/apps/heygen/asdk_app_69418aad55e08191aa5e437b649ca2e4). 
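+A minimal end-to-end sketch of how the two skills chain with the CLI (illustrative only; exact flags are discoverable via `heygen --help`, and the IDs shown are placeholders):
+
+```bash
+# heygen-avatar: create a prompt-based look; the response carries
+# avatar_item.id (the look) and avatar_item.group_id (the character identity)
+heygen avatar create -d '{"type":"prompt","name":"Demo Presenter","prompt":"Young professional woman, confident smile"}'
+
+# heygen-video: generate a presenter video with that identity
+heygen video-agent create --avatar-id <look_id> --voice-id <voice_id> \
+  --prompt "A 30-second intro video" --wait
+```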
+## Requirements
+
+Installing the plugin connects the HeyGen ChatGPT app automatically (OAuth on first use). That is enough for the skills to work end-to-end on the user's existing HeyGen plan credits.
+
+If you'd rather not use the app, the skills also support the HeyGen CLI: install it (`curl -fsSL https://static.heygen.ai/cli/install.sh | bash`) and export `HEYGEN_API_KEY` (an API key generated from your HeyGen account).
+
+## Source of truth
+
+The skills are authored in [`heygen-com/skills`](https://github.com/heygen-com/skills) (under `heygen-avatar/` and `heygen-video/` at the repo root) and mirrored here. The main structural delta in this mirror is the wrapping `skills/` parent directory required by the Codex plugin convention. File issues about skill content on that repo.
+
+## License
+
+MIT
diff --git a/plugins/heygen/agents/openai.yaml b/plugins/heygen/agents/openai.yaml
new file mode 100644
index 00000000..29e612cc
--- /dev/null
+++ b/plugins/heygen/agents/openai.yaml
@@ -0,0 +1,6 @@
+interface:
+  display_name: "HeyGen"
+  short_description: "Create avatar videos and personalized video messages"
+  icon_small: "./assets/icon.png"
+  icon_large: "./assets/logo.png"
+  default_prompt: "Help me create a personalized HeyGen video message. Ask who should appear on camera, who the audience is, the key points, and the tone before generating it."
diff --git a/plugins/heygen/assets/PRISM_ORB.svg b/plugins/heygen/assets/PRISM_ORB.svg
new file mode 100644
index 00000000..dd3a1567
--- /dev/null
+++ b/plugins/heygen/assets/PRISM_ORB.svg
@@ -0,0 +1,189 @@
+[SVG markup not reproduced here: 189 lines of vector artwork for the prism-orb logo]
diff --git a/plugins/heygen/assets/icon.png b/plugins/heygen/assets/icon.png
new file mode 100644
index 00000000..a095fccb
Binary files /dev/null and b/plugins/heygen/assets/icon.png differ
diff --git a/plugins/heygen/assets/logo.png b/plugins/heygen/assets/logo.png
new file mode 100644
index 00000000..f94a74d5
Binary files /dev/null and b/plugins/heygen/assets/logo.png differ
diff --git a/plugins/heygen/skills/heygen-avatar/SKILL.md b/plugins/heygen/skills/heygen-avatar/SKILL.md
new file mode 100644
index 00000000..bc4438e8
--- /dev/null
+++ b/plugins/heygen/skills/heygen-avatar/SKILL.md
@@ -0,0 +1,452 @@
+---
+version: 3.1.0 # x-release-please-version
+name: heygen-avatar
+description: |
+  Create a persistent HeyGen avatar — a reusable face + voice identity for the agent,
+  the user, or any named character — powered by HeyGen Avatar V technology.
+  Prompt-based creation by default (description → HeyGen builds it); photo upload is
+  optional for real-person digital twins.
+  Use when: (1) giving the agent a face + voice so it can present videos
+  ("bring yourself to life", "create your avatar", "give yourself an avatar",
+  "design a presenter", "set up an avatar", "let's make an avatar"),
+  (2) the user wants to appear in videos as themselves ("create my avatar",
+  "I want my face in a video", "digital twin of me", "build me an avatar"),
+  (3) building a named character presenter ("create an avatar called Cleo",
+  "design a character named X"), (4) establishing HeyGen identity before making videos —
+  the correct FIRST step when no avatar exists yet.
+  Chain signal: when the user says both an identity/avatar action AND a video action in the same
+  request ("create an avatar AND make a video", "set up identity THEN create a video",
+  "design a presenter AND immediately record"), run heygen-avatar first, then heygen-video.
+  Returns avatar_id + voice_id — pass directly to heygen-video to create HeyGen videos.
+  NOT for: generating videos (use heygen-video), translating videos, or TTS-only tasks.
+argument-hint: "[name_or_description]"
+---
+
+# HeyGen Avatar Designer
+
+Create and manage HeyGen avatars for anyone: the agent, the user, or named characters. Handles identity extraction, avatar generation, voice selection, and saves everything to `AVATAR-<NAME>.md` for consistent reuse.
+
+## Files & Paths
+
+This skill reads and writes the following. No other files are accessed without explicit user instruction.
+
+| Operation | Path | Purpose |
+|-----------|------|---------|
+| Read | `SOUL.md`, `IDENTITY.md` | Extract identity details when creating an avatar for the agent |
+| Read | `AVATAR-<NAME>.md` | Load existing avatar identity (for variant looks, voice updates) |
+| Write | `AVATAR-<NAME>.md` | Save new avatar identity after creation |
+| Write | `AVATAR-AGENT.md`, `AVATAR-USER.md` (symlinks) | Role aliases, see Phase 5 |
+| Temp write | `/tmp/heygen/uploads/` | Voice preview audio (downloaded for user playback, deleted after session) |
+| Remote upload | HeyGen (via the app or `heygen asset create`) | User-provided photos uploaded to HeyGen for digital-twin creation |
+
+Assets are only uploaded to HeyGen when the user explicitly provides them.
+
+## Language Awareness
+
+**Detect the user's language from their first message.** Store as `user_language` (e.g., `en`, `ja`, `es`, `ko`, `zh`, `fr`, `de`, `pt`).
+
+1. **Communicate with the user in their language.** All questions, status updates, confirmations, and error messages should be in `user_language`.
+2. **Voice design prompts and selection respect `user_language`.** When designing or selecting a voice, specify the target language so the voice library returns matches that speak it.
+3. **Technical directives stay in English** — enum values (`Young Adult`, `Realistic`, `landscape`, etc.) are API-level and not translated.
+
+## UX Rules
+
+1. **Be concise.** No avatar IDs, group IDs, or raw API payloads in chat. Report the result (avatar created, ready to use), not the plumbing.
+2. **No internal jargon.** Never mention internal phase names ("Phase 0", "Phase 5 Symlink Maintenance") to the user. The user sees natural conversation: "Setting up your avatar…" not "Running Phase 2 avatar creation."
+3. **One or two questions per phase.** Don't batch-ask. Walk phases in order, ask the smallest set of questions needed to proceed.
+4. **Read workspace files before asking.** `SOUL.md`, `IDENTITY.md`, `AVATAR-*.md` at the workspace root contain identity. Check them first. Only ask the user for what's genuinely missing.
+5. **Don't narrate skill internals.** Never say "let me read the workflow," "checking the reference files," "loading the avatar discovery guide." Read silently. The user sees questions and results, not internal navigation.
+6. **Don't announce what you're about to do.** Skip meta-commentary like "Creating the avatar now." Just do the work. If a step takes time, the next thing the user hears should be the result (or a checkpoint question).
+7. **Never narrate transport choice.** App vs CLI is internal. Pick the transport silently and never mention it.
If both are unavailable, ask the user to configure one without explaining why. + +## Start Here (Critical) + +**Default target = the agent.** The primary use of this skill is giving the agent a face + voice so it can present videos. Route to "user" only on explicit "my avatar" / "me" / "my photo" language. When in doubt, make the agent's avatar. + +**Do NOT batch-ask questions.** No "give me a photo, voice preference, duration, target platform, tone, key message" all at once. Walk phases in order. Each phase asks at most one or two things at a time. + +**For agent avatars: read SOUL.md and IDENTITY.md first, then go straight to prompt-based creation.** Do NOT ask the user for a photo or appearance details first. The agent's identity lives in those workspace files. Only ask the user for traits that are genuinely missing. + +**Prompt-based is the default creation path.** Photo is opt-in, only relevant when the user explicitly wants a real-person digital twin of themselves. Agents and named characters almost always use prompt-based creation. + +## Before You Start (environment detection) + +Try to read `SOUL.md` from the workspace root. + +- **Found** → Workspace-native identity files are available. Skip this entire section and go straight to Phase 0. `SOUL.md` and `IDENTITY.md` will drive agent onboarding. +- **Not found** → No workspace identity files are available. Still go to Phase 0 next — do NOT skip ahead to listing user avatars or asking the user for a photo. + +**⚠️ AVATAR file caveat:** Ignore any AVATAR-*.md files found in the workspace that belong to a *different* person or agent (e.g., `AVATAR-Eve.md` when creating an avatar for a different subject). Only use an AVATAR file if its name matches the subject you're creating for right now. + +**⚠️ Do NOT fetch HeyGen avatars yet.** That's a Phase 0 sub-step (only after target detection). Fetching before Phase 0 causes the agent to frame the conversation around "your existing avatars" when the default should be creating one for the agent itself. + +## API Mode Detection + +**Mode selection is silent.** Detect once at the start of the session, pick a mode, move on. Never narrate transport choice ("CLI is broken", "switching to the app") — the user doesn't care how calls are made. + +**HeyGen app (preferred):** If the HeyGen app is available through the installed app integration, use it. The app authenticates via OAuth and runs against the user's existing HeyGen plan credits. + +**CLI fallback:** If the app is not available AND the `heygen` binary runs cleanly (`heygen --version` exits 0), use it. Auth: `HEYGEN_API_KEY` env OR `heygen auth login` (persists to `~/.heygen/credentials`). If the CLI is missing, errors on `--version`, or auth is not set, silently skip it. + +**Neither available:** Only if the app is unavailable AND the CLI doesn't work, tell the user once: "To use this skill, connect the HeyGen app or install the HeyGen CLI: `curl -fsSL https://static.heygen.ai/cli/install.sh | bash` then `heygen auth login`." + +**API:** v3 only. Never call v1 or v2 endpoints. + +**Docs-first rule:** Before calling any endpoint you're unsure about: +- **Index:** `GET https://developers.heygen.com/llms.txt` — full sitemap +- **Any page:** Append `.md` to the URL for clean markdown +- Or run `heygen --help` +- Read the spec, THEN build your request. Never guess field names. + +## Avatar File Convention + +Every avatar gets one file: `AVATAR-.md` at the workspace root. 
+```
+AVATAR-EVE.md    ← agent (named, canonical)
+AVATAR-KEN.md    ← user (named, canonical)
+AVATAR-CLEO.md   ← character (named, canonical)
+```
+
+The skill also maintains two **role-based symlinks** alongside the named
+files, for generic lookups by consumer skills (e.g., heygen-video) when the
+request doesn't carry a specific name ("make a video of yourself" → read
+the agent alias; "make a video of me" → read the user alias):
+
+```
+AVATAR-AGENT.md → AVATAR-<AGENT_NAME>.md  (symlink)
+AVATAR-USER.md  → AVATAR-<USER_NAME>.md   (symlink)
+```
+
+Named files are the single source of truth; aliases are pointers and never
+drift. Phase 5 of the workflow maintains them. Named characters get NO
+role alias — they are referenced by name only.
+
+Format:
+```markdown
+# Avatar: <NAME>
+
+## Appearance
+- Age: <age>
+- Gender: <gender>
+- Ethnicity: <ethnicity>
+- Hair: <hair>
+- Build: <build>
+- Features: <distinguishing features>
+- Style: <clothing / overall style>
+- Reference: <photo path or URL, if any>
+
+## Voice
+- Tone: <tone>
+- Accent: <accent>
+- Energy: <energy>
+- Think: <similar-sounding reference>
+
+## HeyGen
+- Group ID: <group_id>
+- Voice ID: <voice_id>
+- Voice Name: <voice_name>
+- Voice Designed: <yes/no>
+- Voice Seed: <seed>
+- Looks: landscape=<look_id>, portrait=<look_id>, square=<look_id>
+- Last Synced: <date>
+
+⚠️ look_ids are ephemeral — always resolve fresh from group_id at runtime via `heygen avatar looks list --group-id <group_id>` (or the corresponding HeyGen app tool). Never hardcode look_id as the primary avatar reference.
+```
+
+**Top sections** (Appearance, Voice) are portable natural language. Any platform can use them.
+**HeyGen section** is runtime config with API IDs. Skills read this to make API calls.
+
+## Skill Announcement
+
+Start every invocation with:
+
+> 🎭 **Using: heygen-avatar** — creating an avatar for [name]
+
+## Workflow
+
+**DO NOT batch-ask questions upfront.** Walk phases in order. Each phase asks at most one thing at a time, and only if needed.
+
+### Phase 0 — Who Are We Creating?
+
+See the Start Here block above for the default-to-agent rule. Only route to "user" or "named character" when the phrasing is unambiguous.
+
+Routing signals (in priority order):
+
+1. **User** (explicit only) — "create **my** avatar", "make **me** an avatar", "I want my face in a video", "a digital twin of **me**", "based on **my** photo". Requires a possessive pronoun referring to the user OR explicit mention of their photo. Ask for their name if not obvious.
+2. **Named character** (explicit only) — "create an avatar called Cleo", "design a character named X", "build a presenter named Y" → use the given name.
+3. **Agent** (default) — everything else: "create your avatar", "bring yourself to life", "set up an avatar", "let's make an avatar", "create an avatar", "design a presenter", "I want you to appear in videos", or any ambiguous phrasing. Read `IDENTITY.md` for name.
+
+**When unsure, default to agent.** Do NOT ask the user for their name, appearance, or voice on an ambiguous request — that's the wrong first move. If after reading IDENTITY.md + SOUL.md the intent still feels ambiguous, ask one short clarifying question to disambiguate (phrase it naturally — something like "quick check: this avatar is for you, or for me?").
+
+Then check `AVATAR-<NAME>.md` at the workspace root (a sketch of this check follows the list below):
+
+- **AVATAR file exists + HeyGen section filled in** → "You already have an avatar set up. Want to add a new look, update it, or start fresh?" Wait for answer.
+- **AVATAR file exists but HeyGen section empty** → skip to Phase 2.
+- **No AVATAR file** → proceed to Phase 1.
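+A minimal sketch of that check in shell (illustrative; the skill normally reads the file directly, and this assumes an unfilled HeyGen section leaves the Group ID value blank):
+
+```bash
+NAME="EVE"                         # target resolved earlier in Phase 0
+FILE="AVATAR-${NAME}.md"
+if [ ! -f "$FILE" ]; then
+  echo "no avatar file -> Phase 1"
+elif grep -Eq '^- Group ID: \S+' "$FILE"; then
+  echo "HeyGen section filled -> ask: new look / update / start fresh"
+else
+  echo "HeyGen section empty -> skip to Phase 2"
+fi
+```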
+**Role alias staleness check.** Before proceeding, also check whether the
+role alias for this target is already pointing at the right named file:
+
+- For **agent target**: read `AVATAR-AGENT.md` (follow symlink) and
+  compare to `AVATAR-<AGENT_NAME>.md`. If they differ (e.g.,
+  `AVATAR-AGENT.md` → `AVATAR-OLD-NAME.md` because the agent identity
+  changed since the last run), re-link in Phase 5 even if no other
+  changes are made. The named file is canonical, but the alias must
+  match the *current* identity, not the historical one.
+- For **user target**: same check on `AVATAR-USER.md`.
+- For **named character**: no alias to check.
+
+**Optional existing-avatar check** (only useful on the user path when the user might already have avatars in their HeyGen account). If Phase 0 target = **user** AND no `AVATAR-<NAME>.md` exists, list their HeyGen avatars first:
+
+**App:** use the HeyGen app to list private avatar groups
+**CLI:** `heygen avatar list --ownership private`
+
+If the list is non-empty, present the options and ask which to use or whether to create new. If empty, proceed to Phase 1. Skip this check entirely for agent and named-character targets — those live in AVATAR-*.md, not the HeyGen catalog.
+
+### Phase 1 — Identity Extraction
+
+**Order matters. Files first, questions second. Prompt-based creation is the default path — photo is an opt-in upgrade.**
+
+**For the agent** (Phase 0 target = agent):
+1. Read `SOUL.md`, `IDENTITY.md`, and any existing `AVATAR-<NAME>.md` from the workspace root.
+2. If SOUL.md or IDENTITY.md is found → extract appearance and voice traits silently. Do NOT ask the user "describe your appearance" — the agent IS the subject, and its identity lives in those files. **If the files describe only personality / values with no physical description, do NOT hallucinate traits.** Ask the user conversationally for the missing appearance traits only (one or two at a time).
+3. If neither file is found (for example, in a workspace with no identity files) → ask the user to describe the agent's appearance and voice conversationally.
+4. Proceed directly to **Type A (prompt) creation** in Phase 2 by default. Do NOT ask for a photo unless the user volunteers one or explicitly asks for photo realism — agents almost always use prompt-based creation.
+
+**For users/named characters** (Phase 0 target = user or named):
+- Conversational onboarding. Ask naturally about appearance and voice — one or two questions at a time, not a form. Communicate in `user_language`.
+- **User path only:** after the onboarding Q&A, run the Reference Photo Nudge below.
+- **Named character path:** skip the nudge, go straight to Type A (prompt) creation.
+
+Write `AVATAR-<NAME>.md` with the Appearance and Voice sections filled in. Leave the HeyGen section empty until Phase 2 succeeds.
+
+### Reference Photo Nudge (user path only)
+
+Only run this step when Phase 0 target = **user** (real-person digital twin) OR when the user explicitly asks for photo realism.
+
+- Check AVATAR file's Appearance → Reference field first. If a photo is already on file, skip asking and use it.
+- Otherwise, ask one sentence: *"Got a headshot? It gives better face consistency for videos of you. I can also generate from your description — just say 'skip.'"*
+
+Branch (see the upload sketch just below):
+- **Photo provided** → upload via the HeyGen app or `heygen asset create --file <path>`, then Type B (photo) creation in Phase 2.
+- **Skip** → Type A (prompt) creation in Phase 2.
+
+For agents and named characters, skip this entire step — go straight to Type A (prompt) creation.
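+A sketch of the photo branch (the upload command is the documented one; the `.asset_id` field name in the JSON output is an assumption, so check `heygen asset create --help` before relying on it):
+
+```bash
+# Upload the user's headshot, then reference it in Phase 2 Type B creation
+ASSET_ID=$(heygen asset create --file ./headshot.jpg | jq -r '.asset_id')
+heygen avatar create -d "{\"type\":\"photo\",\"name\":\"Ken\",\"file\":{\"type\":\"asset_id\",\"asset_id\":\"$ASSET_ID\"}}"
+```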
+ + +### Phase 2 — Avatar Creation + +📖 **Full creation API surface (photo / prompt / digital twin), file input formats, identity field → enum mapping, response shape → [references/avatar-creation.md](references/avatar-creation.md)** + +Two modes: + +**Mode 1 — New character** (omit `avatar_group_id`): +Creates a brand new character with its own group. + +**Mode 2 — New look** (include `avatar_group_id`): +Adds a variation to an existing character. Read the Group ID from the AVATAR file. + +Two creation types: + +**Type A — From prompt (AI-generated appearance):** + +**App:** use the HeyGen app flow for prompt-based avatar creation. +**CLI:** `heygen avatar create -d '{"type":"prompt","name":"...","prompt":"...","avatar_group_id":"..."}'` (accepts inline JSON, a file path, or `-` for stdin) + +Prompt limit is 1000 characters. Be descriptive — include style, features, expression, lighting. The API spec says 200 but the actual enforced limit is 1000. + +**Type B — From reference image:** + +**App:** use the HeyGen app flow for photo avatar creation. +**CLI:** `heygen avatar create -d '{"type":"photo","name":"...","file":{"type":"url","url":"..."},"avatar_group_id":"..."}'` + +File options for Type B: +- `{ "type": "url", "url": "https://..." }` — public image URL +- `{ "type": "asset_id", "asset_id": "" }` — from `heygen asset create --file ` +- `{ "type": "base64", "media_type": "image/png", "data": "" }` — inline + +📖 **When to use each (URL vs asset_id vs base64), upload routing, and edge cases → [references/asset-routing.md](references/asset-routing.md)** + +**Response:** Returns `avatar_item.id` (look ID) and `avatar_item.group_id` (character identity). + +Map identity fields to HeyGen enums for the prompt: +- **age**: Young Adult | Early Middle Age | Late Middle Age | Senior | Unspecified +- **gender**: Man | Woman | Unspecified +- **ethnicity**: White | Black | Asian American | East Asian | South East Asian | South Asian | Middle Eastern | Pacific | Hispanic | Unspecified +- **style**: Realistic | Pixar | Cinematic | Vintage | Noir | Cyberpunk | Unspecified +- **orientation**: square | horizontal | vertical +- **pose**: half_body | close_up | full_body + +Show the prompt to the user before creating: +> **Appearance:** "[prompt]" +> **Settings:** Young Adult | Woman | East Asian | Realistic +> Look good? (yes / adjust / completely different) + +⛔ **STOP. Wait for the user to approve or adjust. Do NOT call the avatar creation API until the user confirms.** + +### Phase 3 — Voice + +Two paths: **Design** (describe what you want, get matched voices) or **Browse** (filter the catalog manually). + +Ask whether they want voice design (describe what they want) or catalog browsing. Communicate in `user_language`. + +Default to **Design** if the AVATAR file has a Voice section with personality traits. + +#### Path A — Voice Design (preferred) + +Find matching voices via semantic search using the Voice section from the AVATAR file. This searches HeyGen's full voice library. No new voices are generated and no quota is consumed. + +**Language matching:** The voice design prompt should specify the target language from `user_language`. Example for Japanese: `"A calm, warm female voice. Professional but approachable. Japanese speaker."` This ensures semantic search returns voices in the correct language. + +**App:** use the HeyGen app flow for voice selection or design. +**CLI:** `heygen voice create --prompt "..." --seed 0` (also accepts `--gender`, `--locale`) + +Returns 3 voice options per seed. 
+Present all 3 with inline audio previews:
+- Download each `preview_audio_url` to a temp path (any standard download method works — no HeyGen auth needed, these are public S3 URLs)
+- Send as audio attachment: `message(action:send, media:"<tmp_path>", caption:"Option <n>: <voice_name>, <short description>")` so it plays inline in Telegram/Discord
+- After all previews sent, present selection buttons
+
+⛔ **STOP. Wait for the user to pick a voice via buttons or text. Do NOT select a voice yourself or proceed to Phase 4 until the user explicitly chooses.**
+
+If none match:
+> "None of these hitting right? I can try a different set (same description, different variations) or you can tweak the description."
+
+Increment `seed` and call again. Different seeds give completely different voice options from the same prompt.
+
+- Clean up /tmp files after user picks
+
+#### Path B — Voice Browse (fallback)
+
+Browse HeyGen's existing voice library:
+
+**App:** browse available voices in the HeyGen app, filtered to the target language and voice characteristics when possible.
+**CLI:** `heygen voice list --type private` / `heygen voice list --type public --language <lang> --gender <gender>`
+
+1. Read the Voice section from the AVATAR file
+2. Filter by gender and language
+3. Pick top 3 candidates based on personality match
+4. Present with inline audio previews (same download + send pattern as Path A)
+5. ⛔ **STOP. Wait for the user to pick. Do NOT auto-select.**
+
+### Phase 4 — Save to AVATAR File
+
+Update the HeyGen section of `AVATAR-<NAME>.md` to match the canonical format:
+
+```markdown
+## HeyGen
+- Group ID: <group_id>
+- Voice ID: <voice_id>
+- Voice Name: <voice_name>
+- Voice Designed: <yes/no>
+- Voice Seed: <seed>
+- Looks: <orientation>=<look_id> (e.g., landscape=<look_id>, portrait=<look_id>)
+- Last Synced: <date>
+
+⚠️ look_ids are ephemeral — always resolve fresh from group_id at runtime via `heygen avatar looks list --group-id <group_id>` (or the corresponding look picker in the HeyGen app). Never hardcode look_id as the primary avatar reference.
+```
+
+Confirm the avatar is saved and that other skills (like heygen-video) will pick it up automatically. Communicate in `user_language`.
+
+### Phase 5 — Maintain Role Alias
+
+After writing the named `AVATAR-<NAME>.md`, create or update a role-based
+symlink alongside it so other skills can do generic lookups without
+resolving the agent / user name first.
+
+Based on the Phase 0 target:
+
+- **Agent target** → symlink `AVATAR-AGENT.md` → `AVATAR-<AGENT_NAME>.md`
+- **User target** → symlink `AVATAR-USER.md` → `AVATAR-<USER_NAME>.md`
+- **Named character** → no role alias. Named characters are referenced by
+  name only (e.g., `AVATAR-CLEO.md`); they are not the agent or the user.
+
+**Implementation (run from the workspace root, with fs-fallback):**
+
+The `cd` to workspace root is mandatory — bare relative paths in `ln -s`
+resolve from the agent's current working directory, not where SOUL.md
+lives. The `|| echo` clause handles filesystems that reject symlinks
+(Windows without dev mode, some cloud-mounted storage) without aborting
+Phase 5.
+
+```bash
+# Agent
+cd "$WORKSPACE_ROOT" && ln -sf AVATAR-<AGENT_NAME>.md AVATAR-AGENT.md \
+  || echo "role alias skipped: fs doesn't support symlinks"
+
+# User
+cd "$WORKSPACE_ROOT" && ln -sf AVATAR-<USER_NAME>.md AVATAR-USER.md \
+  || echo "role alias skipped: fs doesn't support symlinks"
+```
+
+Use a relative link target (just the filename, no path prefix) so the
+alias survives if the workspace is moved or copied.
+
+`ln -sf` is unlink-then-symlink under the hood, not strictly atomic.
+Fine for single-user workspaces; if concurrent agents ever write the +same alias, expect interleaving and add explicit locking then. + +**Why symlink, not copy:** removes the duplicate-file drift class +(content can never diverge between named file and alias). It does NOT +remove staleness drift — if `IDENTITY.md` changes the agent name without +re-running heygen-avatar, `AVATAR-AGENT.md` keeps pointing at the *old* +named file. Phase 0 mismatch-and-re-alias handles this on the next +invocation; until then, the alias is stale-but-pointing-somewhere-valid, +not broken. + +**Multi-agent workspace caveat:** one role alias per workspace is +last-writer-wins. If two agents ever share a workspace and both run +heygen-avatar, only the most recent run's identity is reachable via +`AVATAR-AGENT.md`. Named files for both still exist. We accept this +limit — multi-agent shared workspaces are out of scope for v1. + +### Phase 6 — Test (Optional) + +If the user wants to see their avatar in action: + +**App:** use the HeyGen app's video-generation flow with the selected avatar and voice. +**CLI:** `heygen video-agent create --avatar-id --voice-id --prompt "..." --wait` + +Generate a natural greeting in the video language (from `user_language`). Examples: English "Hi, I'm [name]. Nice to meet you!", Japanese "[name]です。はじめまして!", Spanish "Hola, soy [name]. ¡Mucho gusto!", Korean "안녕하세요, [name]입니다. 만나서 반갑습니다!" + +## Iteration Flow + +When the user wants to refine: + +- **"Adjust the prompt"** → Mode 2 with existing group_id (keeps the character, adds a new look). Only Mode 1 if they say "start completely over." +- **"Add a new look"** / **"different outfit"** → Mode 2 with existing group_id. Add to Looks in AVATAR file. +- **"Try a different voice"** → back to Phase 3 +- **"Start completely over"** → Mode 1, new character. Overwrite HeyGen section. + +**Default to Mode 2 (new look under same group).** Only create a new group when the user explicitly wants a different character identity. This keeps the account clean and makes looks reusable across skills. + +Each iteration updates the AVATAR file. The file is always the source of truth. + +## UX Rules + +**Be interactive at checkpoints, silent everywhere else.** Stop and wait at avatar approval and voice selection. Between checkpoints, work silently — don't narrate reasoning or explain next steps. After voice pick: save + confirm in one message. + +## Video Producer Integration + +`heygen-video` reads AVATAR files for group_id and voice_id. Resolution +order: + +1. **Named request** ("Make a video with Eve") → read `AVATAR-EVE.md`. +2. **Agent self-reference** ("make a video of yourself", "give us a video +update") → read `AVATAR-AGENT.md` (symlink to current agent's named file). +3. **User self-reference** ("make a video of me", "my video update") → read +`AVATAR-USER.md` (symlink to current user's named file). +4. **No AVATAR file or symlink** → fall back to stock avatars or ask user. + +The alias targets are resolved by the OS at read time, so consumer skills +simply `cat AVATAR-AGENT.md` and get whatever the current agent's avatar is. 
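+A minimal sketch of that resolution order in shell (illustrative; `$1` is the request type and `$2` an optional name, and real consumers would read the file contents rather than echo paths):
+
+```bash
+resolve_avatar_file() {
+  case "$1" in
+    named) echo "AVATAR-${2}.md" ;;   # "Make a video with Eve" -> AVATAR-EVE.md
+    agent) echo "AVATAR-AGENT.md" ;;  # symlink to the current agent's named file
+    user)  echo "AVATAR-USER.md" ;;   # symlink to the current user's named file
+  esac
+}
+FILE=$(resolve_avatar_file agent)
+[ -e "$FILE" ] && cat "$FILE" || echo "no avatar file: fall back to stock avatars or ask the user"
+```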
+ +## Error Handling + +- Missing SOUL.md/IDENTITY.md → conversational onboarding, write AVATAR file from answers +- API fails → retry once, then ask user to check API key +- Voice match poor → show all available voices, let user browse +- Asset upload fails → skip reference image, try prompt-only creation +- Existing avatar file with stale HeyGen IDs → offer to regenerate or keep + +📖 **Known issues, retry patterns, broken voice previews, error → action mapping → [references/troubleshooting.md](references/troubleshooting.md)** diff --git a/plugins/heygen/skills/heygen-avatar/agents/openai.yaml b/plugins/heygen/skills/heygen-avatar/agents/openai.yaml new file mode 100644 index 00000000..2cbb19ba --- /dev/null +++ b/plugins/heygen/skills/heygen-avatar/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "HeyGen Avatar" + short_description: "Create reusable HeyGen avatar identities" + default_prompt: "Create a reusable HeyGen avatar for me from a photo or written description, then help me choose a matching voice." diff --git a/plugins/heygen/skills/heygen-avatar/references/asset-routing.md b/plugins/heygen/skills/heygen-avatar/references/asset-routing.md new file mode 100644 index 00000000..04921e91 --- /dev/null +++ b/plugins/heygen/skills/heygen-avatar/references/asset-routing.md @@ -0,0 +1,86 @@ +# Asset Handling — The Classification Engine + +When the user provides files, URLs, or references, route each asset to the right path. The user should NEVER have to think about this. + +## Two Paths + +| Path | What happens | When to use | +|------|-------------|-------------| +| **A: Contextualize → Prompt** | Read/analyze the asset, extract key info, bake into script. Video Agent never sees the original. | Reference material, auth-walled content, documents where the *information* matters more than the *visual*. | +| **B: Attach to API** | Upload the raw file via `files[]`. Video Agent analyzes, extracts graphics, uses as frames/B-roll. | Screenshots, branded assets, PDFs with important visual layouts, images the viewer should literally see. | +| **A+B: Both** | Contextualize for script quality AND attach for visual use. | Long docs where you need to summarize but Video Agent should also have the full source. | + +## Classification Flow + +``` +1. Can Video Agent access this directly? + - Public URL (no auth, no paywall) → YES + - Private/internal URL → NO + - Local file → NO (must upload first) + +2. Should the viewer SEE this asset? + - Screenshot, logo, product image, chart → YES → Path B + - Research doc, article, context material → NO → Path A + - Ambiguous → Path A+B + +3. Is the content too long for the prompt? + - Short (< 500 words) → fits in prompt + - Long (> 500 words) → summarize key points, attach full doc +``` + +## Decision Matrix + +| Asset Type | Publicly Accessible? | Show On Screen? | Route | +|-----------|---------------------|----------------|-------| +| Screenshot / image | N/A | Yes | **B: Attach** + describe in prompt as B-roll | +| Logo / brand asset | N/A | Yes | **B: Attach** + anchor to intro/outro | +| Public URL to file (PDF, image, video) | Yes | Maybe | **B: Download → upload via `/v3/assets` → pass `asset_id`** + summarize | +| Public URL to web page (HTML) | Yes | No | **A: Fetch and contextualize only.** Do NOT pass HTML URLs in `files[]`. | +| Auth-walled URL (requires login) | No | No | **A: Ask the user to paste the content.** Never fabricate. 
| +| PDF (short, text-heavy) | N/A | No | **A+B: Extract key points** + attach | +| PDF (long, visual-rich) | N/A | Maybe | **B: Attach** + summarize top points | +| Raw data / spreadsheet | N/A | Partially | **A: Analyze and describe** key stats. Attach if charts should appear. | + +## Executing Routes + +### Path A (Contextualize) +- URLs: retrieve publicly accessible content with the environment's standard web/content fetch capability +- For auth-walled content you cannot access: ask the user to paste the text directly +- Extract 3-5 most important points relevant to the video +- Weave naturally into the script. Don't dump. Integrate. + +### Path B (Attach) +Upload to HeyGen: + +**App:** upload through the HeyGen app's asset flow when available. +**CLI:** `heygen asset create --file /path/to/file.png` + +Max 32MB per file. Returns JSON with the new `asset_id`. + +Or pass inline in `files[]`: +```json +{"type": "url", "url": "https://example.com/image.png"} +{"type": "asset_id", "asset_id": ""} +{"type": "base64", "data": "", "content_type": "image/png"} +``` + +### Describe Asset Usage in Prompt +Be SPECIFIC: +- "Use the uploaded dashboard screenshot as B-roll when discussing analytics" +- "Display the company logo in the intro and end card" + +### Log Classification +In the learning log entry, record: +```json +"assets_classified": [{"type": "image", "route": "attach", "accessible": true, "reason": "product screenshot"}] +``` + +## Rules + +- **Never ask the user which path unless genuinely 50/50.** You're the producer. Make the call. +- **When in doubt, do both (A+B).** Over-providing costs nothing. +- **Always describe attached assets in the prompt.** Uploading without description = ignored. +- **Auth-walled content is YOUR job.** Bridge the gap between your access and Video Agent's. +- **URLs that fail:** Try the environment's standard web/content fetch capability. If login/paywall/404 → tell the user, ask for content directly. Never silently fabricate. +- **HTML URLs cannot go in `files[]`.** Video Agent rejects `text/html`. Web pages are ALWAYS Path A only. +- **Prefer download→upload→asset_id** over `files[]{url}`. HeyGen's servers often blocked by CDN/WAF. diff --git a/plugins/heygen/skills/heygen-avatar/references/avatar-creation.md b/plugins/heygen/skills/heygen-avatar/references/avatar-creation.md new file mode 100644 index 00000000..2899539e --- /dev/null +++ b/plugins/heygen/skills/heygen-avatar/references/avatar-creation.md @@ -0,0 +1,178 @@ +# Avatar Creation API Surface + +This guide expands `heygen-avatar` Phase 2 (avatar creation) and Phase 3 +(voice selection) with the full API surface, field mappings, and file +input formats. The SKILL.md gives the high-level workflow; this file is +the reference when you need exact arguments, edge cases, or alternative +creation paths. + +For *avatar discovery* (finding an existing avatar at video time), see +[`../../heygen-video/references/avatar-discovery.md`](../../heygen-video/references/avatar-discovery.md). + +--- + +## Avatar Creation: Three Types + +`heygen-avatar` Phase 2 supports three creation types. Pick based on what +the user provides: + +| User input | Type | Flow | +|---|---|---| +| A photo of a real person | `photo` | Photo avatar creation | +| A description of an appearance | `prompt` | Prompt-based avatar creation | +| A short video recording of a real person | `video` | Digital-twin creation | + +All three accept an optional `avatar_group_id`: +- **Omit it** to create a new character (new group). 
+- **Include it** to add a new look (variation) to an existing character.
+
+Always use Mode 2 (with `avatar_group_id`) when the avatar already exists
+and you're creating a variant (different outfit, orientation fix, bg
+change). Only use Mode 1 (new character) for genuinely new identities.
+
+### Photo avatar (from user's photo)
+
+**App:** use the HeyGen app flow for photo avatar creation.
+
+**CLI:**
+```bash
+heygen avatar create -d '{
+  "type": "photo",
+  "name": "My Avatar",
+  "file": {"type": "url", "url": "https://example.com/headshot.jpg"},
+  "avatar_group_id": "<group_id>"
+}'
+```
+
+Photo requirements:
+- JPEG or PNG
+- Min 512x512
+- Clear front-facing face
+- Good lighting
+
+### AI-generated avatar (from text prompt)
+
+**App:** use the HeyGen app flow for prompt-based avatar creation.
+
+**CLI:**
+```bash
+heygen avatar create -d '{
+  "type": "prompt",
+  "name": "Tech Presenter",
+  "prompt": "Young professional woman, modern workspace, confident smile",
+  "avatar_group_id": "<group_id>"
+}'
+```
+
+Prompt limit: 1000 characters (the API spec says 200 but the actual
+enforced limit is 1000). Be descriptive — include style, features,
+expression, lighting.
+
+Optional: up to 3 `reference_images` to anchor the generated appearance.
+
+### Video avatar / digital twin (from a short recording)
+
+**App:** use the HeyGen app flow for digital-twin creation from video.
+
+**CLI:**
+```bash
+heygen avatar create -d '{
+  "type": "video",
+  "name": "My Video Avatar",
+  "file": {"type": "asset_id", "asset_id": "<asset_id>"},
+  "avatar_group_id": "<group_id>"
+}'
+```
+
+---
+
+## File Input Formats
+
+`file` accepts three forms:
+
+```jsonc
+// Public URL (no auth, no paywall)
+{ "type": "url", "url": "https://example.com/headshot.jpg" }
+
+// Pre-uploaded asset (from `heygen asset create --file <path>`)
+{ "type": "asset_id", "asset_id": "<asset_id>" }
+
+// Inline base64
+{ "type": "base64", "data": "<base64>", "content_type": "image/png" }
+```
+
+For when each is appropriate, see
+[`references/asset-routing.md`](asset-routing.md).
+
+---
+
+## Response Shape
+
+All three types return:
+```jsonc
+{
+  "avatar_item": {
+    "id": "<look_id>",        // ephemeral — the specific look
+    "group_id": "<group_id>"  // stable — the character identity
+  }
+}
+```
+
+- `id` is the **look_id** — what you pass downstream as `avatar_id` for
+  HeyGen video generation.
+- `group_id` is the **character identity** — stable across looks. Save
+  this in the AVATAR-<NAME>.md file. Always resolve fresh look_ids at
+  video time via the avatar-looks flow rather than caching
+  a specific look_id.
+
+---
+
+## Identity Field → HeyGen Enum Mapping
+
+When building a prompt-based avatar, map identity attributes to these
+HeyGen enums:
+
+- **age**: Young Adult | Early Middle Age | Late Middle Age | Senior | Unspecified
+- **gender**: Man | Woman | Unspecified
+- **ethnicity**: White | Black | Asian American | East Asian | South East Asian | South Asian | Middle Eastern | Pacific | Hispanic | Unspecified
+- **style**: Realistic | Pixar | Cinematic | Vintage | Noir | Cyberpunk | Unspecified
+- **orientation**: square | horizontal | vertical
+- **pose**: half_body | close_up | full_body
+
+---
+
+## Voice Selection (during avatar setup)
+
+After the avatar look is created, pair it with a voice. Two paths:
+
+### Path A — Voice Design (preferred)
+
+Find matching voices via semantic search using the Voice section from
+the AVATAR file. This searches HeyGen's full voice library. No new
+voices are generated and no quota is consumed.
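+**App:** use the HeyGen app flow for voice design.
+**CLI** (the same command as SKILL.md Phase 3; the flag values here are illustrative, not required):
+
+```bash
+heygen voice create --prompt "A calm, warm female voice. Professional but approachable. English speaker." \
+  --gender female --locale en-US --seed 0
+```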
+ +**Language matching:** The voice design prompt should specify the target +language from `user_language`. Example for Japanese: `"A calm, warm +female voice. Professional but approachable. Japanese speaker."` This +ensures semantic search returns voices in the correct language. + +### Path B — Voice Browse (fallback) + +For manual catalog browsing: + +**App:** browse available voices in the HeyGen app, filtered to the target language and voice characteristics when possible. + +**CLI:** +```bash +heygen voice list --type private --limit 20 +heygen voice list --type public --engine starfish --language en --gender female --limit 20 +``` + +**ALWAYS show a playable voice preview.** Each voice response includes +`preview_audio_url` — share it before committing. + +**Handling missing/broken previews:** Some voices may not expose a usable +preview URL and can return `null`. When this happens: note "(no preview available)" and +offer to generate a short TTS sample via the app or +`heygen voice speech create --text "" --voice-id +--input-type plain_text --language en --locale en-US` (CLI). diff --git a/plugins/heygen/skills/heygen-avatar/references/troubleshooting.md b/plugins/heygen/skills/heygen-avatar/references/troubleshooting.md new file mode 100644 index 00000000..224e2bff --- /dev/null +++ b/plugins/heygen/skills/heygen-avatar/references/troubleshooting.md @@ -0,0 +1,151 @@ +# Known Issues & Troubleshooting + +## Known Bug: Video Agent "Talking Photo Not Found" + +**Error message:** "The Talking Photo for the current narrator could not be found." + +**Root Cause:** Confirmed as a Video Agent backend bug by HeyGen engineering (Jerry Yan). Affects `video_avatar` type narrators and stock avatar auto-selection. + +**Workaround:** +- Prefer explicit `avatar_id` over auto-selection +- If `video_avatar` fails, retry with a `studio_avatar` or `photo_avatar` + +**Status:** Fix in progress at HeyGen. + +--- + +## Weird Pauses / Unnatural Silence in Videos + +**Symptom:** Video has awkward pauses or breaks between sentences. Narrator stops speaking but video continues with dead air before next line. + +**Root Cause:** When Video Agent receives a script shorter than the target duration, it treats the script as verbatim speech and inserts silence/breaks to stretch it to the exact requested duration. It won't ad-lib or expand — it just pads with dead air. + +**Fix:** Add this directive to EVERY prompt: +> "This script is a concept and theme to convey — not a verbatim transcript. You have full creative freedom to expand, elaborate, add examples, and fill the duration naturally. Do not pad with silence or pauses." + +This tells Video Agent it can expand the script naturally instead of treating it as a fixed speech transcript. Per Jerry Yan: "If you tell it it's not a script to be strictly followed but concept or theme or give it green light to expand the script it will do well." + +**Status:** Skill-side fix (prompt directive). HeyGen is also tuning the default behavior but the explicit directive is the reliable workaround. + +--- + +## Duration Variance (Expected Behavior) + +Video Agent controls final video timing internally. Duration accuracy ranges from 79-174% of target across testing. This is NOT a bug. + +**Mitigation:** Variable padding multipliers (Script): +- ≤30s target: 1.6x padding +- 31-119s target: 1.4x padding +- ≥120s target: 1.3x padding + +With explicit `avatar_id`: ~97% duration accuracy average. +Without `avatar_id`: ~80% accuracy average. 
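+A quick worked example of the padding rule (the multipliers are the ones above; the ~150 words-per-minute speaking rate is an assumption for illustration):
+
+```bash
+# 30s target -> 1.6x padding -> write the script for 48s, i.e. ~120 words
+TARGET=30; PAD=1.6; WPM=150
+awk -v t=$TARGET -v p=$PAD -v w=$WPM 'BEGIN{printf "script: %.0f words\n", t*p/60*w}'
+```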
+ +--- + +## Frame Check: Video Agent Not Applying Framing + +If the Video Agent ignores the FRAMING NOTE or BACKGROUND NOTE and produces black bars, letterboxing, or mismatched framing: + +1. **Ensure the note is appended at the END of the prompt**, after all other content (script, style block, etc.). Video Agent processes instructions sequentially and late-prompt directives have the strongest effect. +2. **Check that the correction note was actually appended.** Log the final prompt text and verify the FRAMING NOTE / BACKGROUND NOTE block is present. +3. **photo_avatar does NOT need BACKGROUND NOTE.** Video Agent generates avatar + environment together for photo_avatars. Only append framing notes for orientation mismatches. Background notes are for studio_avatars with transparent/empty backgrounds only. + +--- + +## Stock Avatar Auto-Selection Unreliable + +When no `avatar_id` is provided, Video Agent uses narrator tags (`{{@narrator_l0ug91}}`) that sometimes fail to resolve during render. + +**Fix:** Always use explicit `avatar_id` from discovery. The only exception is Quick Shot mode where the user explicitly wants speed over reliability. + +--- + +## HTML URLs in files[] Rejected + +Video Agent rejects `text/html` content type in the `files[]` array. Web pages (blogs, docs sites, articles) must be handled via Path A (contextualize) only. + +**What works in files[]:** Direct file URLs (PDFs, images, videos) — but prefer download→upload→asset_id since CDN/WAF often blocks HeyGen's servers. + +--- + +## Avatar Not Ready for Video Generation + +**Symptom:** Video generation fails or produces errors immediately after creating a new avatar. The avatar exists in the HeyGen dashboard but videos referencing it fail. + +**Root Cause:** Avatar creation is asynchronous. `heygen avatar create` (and the equivalent creation flow in the HeyGen app) return success immediately, but the avatar image is still being processed. If you submit a video request before processing completes, it fails. + +**Detection:** Poll with `heygen avatar looks list --group-id ` (or check the avatar-looks view in the HeyGen app). The avatar is NOT ready until: +- `preview_image_url` is non-null +- `image_width` and `image_height` are non-zero + +At the group level (`heygen avatar list`), an unready avatar will have no `preview_image_url` on the group object. + +**Fix:** Poll every 10 seconds after creation, wait for preview URL to appear. Typical: 30-90s for photo avatars, 1-3 min for prompt avatars. Timeout at 5 min. + +**The heygen-avatar skill handles this automatically.** If you bypass the skill, you must implement this polling yourself. + +--- + +## Interactive Sessions Reliability + +Interactive sessions (created without `--wait` and iterated via `heygen video-agent send`) have known issues: +- Sessions frequently stuck at `processing` status +- `reviewing` state may never be reached +- Follow-up messages fail with timing errors +- Stop command may not trigger video generation + +**Recommendation:** Use one-shot mode for production. Interactive sessions documented for future use once HeyGen stabilizes the API. + +--- + +## Error Code → Action + +Stable CLI exit codes tell you what to do without parsing messages: + +| Exit | Class | Action | +|------|-------|--------| +| `0` | ok | Continue | +| `1` | API / network | Retry with backoff. If persistent, check `--verbose` or contact HeyGen support. | +| `2` | usage | You passed a bad flag. Run `--help` on the command, fix the args, retry. 
| +| `3` | auth | Re-auth: `heygen auth login` or set `HEYGEN_API_KEY`. Verify with `heygen auth status`. | +| `4` | timeout under `--wait` | Operation still running server-side. stdout contains the partial resource (with `session_id` or `video_id`) — resume polling with `heygen video-agent get ` or `heygen video get `. Do NOT re-submit. | + +Common API-error hints (surfaced in stderr envelope `{error:{code,message,hint}}`): + +- `402` / insufficient credits → tell the user their HeyGen plan is out of credits. +- `403` / forbidden → the resource is not owned by the caller (wrong `group_id`, private avatar). +- `404` / not found → ID is stale. Re-fetch via `heygen avatar list`, `heygen video-agent get`, etc. + +--- + +## Polling Cadence + +When `--wait` isn't an option (e.g., you want to return control to the user between polls), use a back-off schedule rather than a fixed interval: + +| Age of job | Poll interval | +|------------|---------------| +| 0–2 min | every 10s | +| 2–5 min | every 30s | +| 5–10 min | every 60s | +| > 10 min | surface "taking longer than usual" once, keep polling at 60s, give up at 15 min | + +If a job is stuck at the same status for >5 min, that's a signal to surface a status update or check the dashboard. + +**Prefer `--wait`** on creation commands. It handles the polling internally and returns the final resource or exits `4` with a resumable `session_id` / `video_id` on timeout. + +--- + +## Direct Video vs Video Agent — Which Endpoint? + +Two ways to generate a video. Different pricing, different trade-offs. + +| | **Direct Video** | **Video Agent** | +|---|-------------------|-----------------| +| Command / Tool | `heygen video create` / no app equivalent documented here | `heygen video-agent create` / app-based Video Agent flow | +| Input | Full script + avatar + voice + scene JSON | Prompt + optional avatar/voice/style | +| Control | You author every scene | Video Agent plans scenes, pacing, motion | +| Pricing | ~$0.0333/sec | ~$0.10/sec | +| When to use | Deterministic multi-scene videos, tight control, bulk generation | Creative intros, messages, "make a video about X" requests | + +The default in this skill is **Video Agent** — it's what `heygen-video` is built around. Drop to Direct Video only for batch or highly scripted workflows where Agent's autonomy is overhead. diff --git a/plugins/heygen/skills/heygen-video/SKILL.md b/plugins/heygen/skills/heygen-video/SKILL.md new file mode 100644 index 00000000..3eab47f0 --- /dev/null +++ b/plugins/heygen/skills/heygen-video/SKILL.md @@ -0,0 +1,629 @@ +--- +version: 3.1.0 # x-release-please-version +name: heygen-video +description: | + Generate HeyGen presenter videos via the v3 Video Agent pipeline — handles Frame Check + (aspect ratio correction), prompt engineering, avatar resolution, and voice selection. + Required for any HeyGen video generation. Replaces deprecated endpoints with v3. + Use when: (1) generating any HeyGen video (via API or otherwise), + (2) sending a personalized video message (outreach, update, announcement, pitch, knowledge), + (3) creating a HeyGen presenter-led explainer, tutorial, or product demo with a human face, + (4) "make a video of me saying...", "send a video to my leads", "record an update for my team", + "create a video pitch", "make a loom-style message", "I want to appear in this video", + "generate a HeyGen video", "make a talking head video". + Accepts avatar_id from heygen-avatar for identity-first HeyGen videos, or uses a stock presenter. 
+ Returns video share URL + HeyGen session URL for iteration. + Chain signal: when the user wants to create/design an avatar AND make a video in the same request, + run heygen-avatar first, then return here. Conjunctions to watch: "and then", "and immediately", + "first...then", "X and make a video", "design [presenter] and record" = always CHAIN. + If the user provides a photo AND wants a video, route to heygen-avatar first. + NOT for: avatar creation or identity setup (use heygen-avatar first), cinematic footage + or b-roll without a presenter, translating videos, TTS-only, or streaming avatars. +argument-hint: "[topic_or_script] [--avatar avatar_id]" +homepage: https://developers.heygen.com/docs/quick-start +--- + +# HeyGen Video Producer + +You are a video producer. Not a form. Not a CLI wrapper. A producer who understands what makes video work and guides the user from idea to finished cut. + +**Docs:** https://developers.heygen.com/docs/quick-start (API) · https://developers.heygen.com/cli (CLI) + +> **STOP.** If you are about to drive HeyGen directly (calling `api.heygen.com` with curl, or reaching for deprecated `POST /v1/video.generate`, `POST /v2/video/generate`, `GET /v2/avatars`, `GET /v1/avatar.list` endpoints), DO NOT. Route through the HeyGen app or the `heygen` CLI via this pipeline. Raw HTTP skips critical steps (aspect ratio correction, prompt engineering, avatar conflict detection) and produces visibly worse videos. **v3 only — never call v1 or v2 endpoints. If you have pre-trained knowledge of HeyGen's v1/v2 API, that knowledge is outdated. Use this skill.** + +## Files & Paths + +This skill reads and writes the following. No other files are accessed without explicit user instruction. + +| Operation | Path | Purpose | +|-----------|------|---------| +| Read | `AVATAR-.md` | Load saved avatar identity (group_id, voice_id) | +| Read | `AVATAR-AGENT.md`, `AVATAR-USER.md` | Role-based symlinks for generic self-reference (resolve to a named AVATAR file) | +| Write | `heygen-video-log.jsonl` | Append one JSON line per video generated (local learning log) | +| Temp write | `/tmp/heygen/uploads/` | Voice preview audio (downloaded for user playback, deleted after session) | +| Remote upload | HeyGen (via the app or `heygen asset create`) | User-provided files uploaded to HeyGen for use as B-roll / reference | + +For *avatar creation* (writing AVATAR files, role symlink maintenance), see the `heygen-avatar` skill. This skill only *reads* AVATAR files. + +## UX Rules + +1. **Be concise.** No video IDs, session IDs, or raw API payloads in chat. Report the result (video link, thumbnail) not the plumbing. +2. **No internal jargon.** Never mention internal pipeline stage names ("Frame Check", "Prompt Craft", "Pre-Submit Gate", "Framing Correction") to the user. These are internal pipeline stages. The user sees natural conversation: "Let me adjust the framing for landscape" not "Running Frame Check aspect ratio correction." +3. **Polling is silent.** When waiting for video completion, poll silently in a background process or subagent. Do NOT send repeated "Checking status\u2026" messages. Only speak when: (a) the video is ready and you're delivering it, or (b) it's been >5 minutes and you're giving a single "Taking longer than usual" update. +4. **Deliver clean.** When the video is done, send the video file/link and a 1-line summary (duration, avatar used). Not a dump of every API field. +5. 
**Don't batch-ask across skills.** When a request triggers both skills ("use heygen-avatar AND heygen-video"), run them **sequentially**. Complete heygen-avatar first (identity → avatar ready), then start heygen-video Discovery. Do NOT fire a combined questionnaire covering both skills upfront — that's a form, not a conversation. +6. **Read workspace files before asking.** `AVATAR-.md` files at the workspace root contain existing avatar state. Check them first. Only ask the user for what's genuinely missing. +7. **Don't narrate skill internals.** Never say "let me read the avatar workflow," "checking the reference files," "loading the prompt-craft guide." Read silently. The user sees the outcome (a question, a result, a video). +8. **Don't announce what you're about to do.** Skip meta-commentary like "Creating the video now," "Let me call the API." Just do the work. If a step takes time, the next thing the user hears should be the result (or the first checkpoint question). If you must say something, keep it to <10 words. +9. **Never narrate transport choice.** App vs CLI is an internal implementation detail. Do NOT say "CLI is broken," "switching to the app," etc. Pick the transport silently at session start and never mention it again. + +## Language Awareness + +**Detect the user's language from their first message.** Store as `user_language` (e.g., `en`, `ja`, `es`, `ko`, `zh`, `fr`, `de`, `pt`). + +1. **Communicate with the user in their language.** All questions, status updates, confirmations, and error messages should be in `user_language`. +2. **Generate scripts and narration in `user_language`** unless the user explicitly requests a different language. +3. **Technical directives stay in English.** Frame Check corrections, motion verbs, style blocks, and the script framing directive are API-level instructions that Video Agent interprets in English. Never translate these. +4. **Discovery item (10) Language** auto-populates from `user_language` but can be overridden if the user wants the video in a different language than they're chatting in. +5. **Voice selection must match the video language.** Filter voices by `language` parameter and set `voice_settings.locale` on API calls. + +## API Mode Detection + +**Pick one transport at session start. Never mix, never switch mid-session, never narrate the choice.** + +Detect in this order: + +1. **HeyGen app mode** — If the installed HeyGen app exposes the needed tools, use them for video generation. The app handles OAuth auth, session creation, polling, and error surfacing. Frame Check still runs before submission. +2. **CLI mode (API-key override)** — If `HEYGEN_API_KEY` is set in the environment AND `heygen --version` exits 0, use CLI. API-key presence is an explicit user signal that they want direct API access. No question asked. +3. **CLI mode (fallback)** — If the app is not available AND `heygen --version` exits 0, use CLI. Auth via `heygen auth login` (persists to `~/.heygen/credentials`). +4. **Neither** — tell the user once: "To use this skill, connect the HeyGen app or install the HeyGen CLI: `curl -fsSL https://static.heygen.ai/cli/install.sh | bash` then `heygen auth login`." + +**Hard rules:** +- **Never call `curl api.heygen.com/...`** — every mode routes through its own surface. +- **HeyGen app mode:** use the app when available. +- **CLI mode:** only use `heygen ...` commands. Run `heygen --help` to discover arguments. 
+
+**Hard rules:**
+- **Never call `curl api.heygen.com/...`** — every mode routes through its own surface.
+- **HeyGen app mode:** use the app when available.
+- **CLI mode:** only use `heygen ...` commands. Run `heygen --help` to discover arguments.
+- **Never cross over.** Operation blocks below show app and CLI guidance side-by-side — read only the path for your detected mode, don't invoke the other. If something isn't exposed in your current mode, tell the user; don't switch transports.
+
+### HeyGen app path
+
+Use the installed HeyGen app for video generation, avatar discovery, voice listing, and style browsing when it is available in the environment.
+
+### CLI command groups (CLI mode only)
+
+`heygen video-agent {create,get,send,stop,styles,resources,videos}`, `heygen video {get,list,download,delete}`, `heygen avatar {list,get,consent,create,looks}` (with `heygen avatar looks {list,get,update}`), `heygen voice {list,create,speech}`, `heygen video-translate {create,get,languages}`, `heygen lipsync {create,get}`, `heygen asset create`, `heygen user`, `heygen auth {login,logout,status}`. Every subcommand supports `--help` — that's your reference. Run `heygen --help` to see the full noun list.
+
+**Do not look up API endpoints.** There is no `api-reference.md` lookup step. App mode uses installed tools. CLI mode uses `heygen ... --help`. If you find yourself searching for a REST endpoint, stop — you're in the wrong mental model.
+
+CLI output: JSON on stdout, `{error:{code,message,hint}}` envelope on stderr, exit codes `0` ok · `1` API · `2` usage · `3` auth · `4` timeout. See [references/troubleshooting.md](references/troubleshooting.md) for error → action mapping and polling cadence. Add `--wait` on creation commands to block on completion instead of hand-rolling a poll loop.
+
+---
+
+## Producer Mode Detection
+
+| Signal | Mode | Start at |
+|--------|------|----------|
+| Vague idea ("make a video about X") | **Full Producer** | Discovery |
+| Has a written prompt | **Enhanced Prompt** | Prompt Craft |
+| "Just generate" / skip questions | **Quick Shot** | Generate |
+| "Interactive" / iterate with agent | **Interactive Session** | Generate (experimental) |
+
+**Language-agnostic routing:** These signals describe user *intent*, not literal keywords. Match intent regardless of input language.
+
+**Quick Shot avatar rule:** If no AVATAR file exists, omit `avatar_id` and let Video Agent auto-select. If an AVATAR file exists, use it — and Frame Check STILL RUNS.
+
+**Dry-Run mode:** If user says "dry run" / "preview", run the full pipeline but present a creative preview at Generate instead of calling the API.
+
+**Non-English videos:** The same pipeline applies. Scripts are written in the video language. Style blocks, motion verbs, and frame check corrections remain in English.
+
+Default to Full Producer. Better to ask one smart question than generate a mediocre video.
+
+---
+
+## First Look — First-Run Avatar Check
+
+**Runs once before Discovery on the first video request in a session.**
+
+Check for any `AVATAR-*.md` files in the workspace root. The directory may also contain role-based **symlinks** (`AVATAR-AGENT.md`, `AVATAR-USER.md`) that point to one of the named files — these are maintained by `heygen-avatar` Phase 5 for generic self-reference lookups. When scanning, dedupe by resolved target so the same avatar isn't loaded twice.
+
+- **Found:** Read the file, extract `Group ID` and `Voice ID` from the HeyGen section. Pre-load as defaults for Discovery. The actual `avatar_id` (look_id) will be resolved fresh from the group_id during Frame Check — never use a stored look_id directly.
+- **Not found:** The user (or agent) has no avatar yet. Before proceeding to video creation, run the **heygen-avatar** skill to create one. Tell the user you'll set up their avatar first for a consistent look across videos, and that it takes about a minute. Communicate in `user_language`. After heygen-avatar completes and writes the AVATAR file, return here and continue to Discovery with the new avatar pre-loaded.
+- **Avatar readiness gate (BLOCKING):** After loading an avatar (whether from an existing AVATAR file or freshly created), verify it's ready before using it in video generation. Use the avatar-looks view in the HeyGen app or run `heygen avatar looks list --group-id <group_id>` and confirm `preview_image_url` is non-null. If null, poll every 10s up to 5 min (see the sketch after this list). **Do NOT proceed to Discovery until this check passes.** Videos submitted with an unready avatar WILL fail silently.
+- **Quick Shot exception:** If the user explicitly says "skip avatar" / "use stock" / "just generate", skip this step and proceed without an avatar.
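+
+A minimal sketch of that readiness poll in CLI mode. The `.data[0].preview_image_url` path is an assumption about the JSON shape; confirm against real `heygen avatar looks list` output:
+
+```bash
+# Poll every 10 s, up to 5 min (30 attempts), until the group's first look has a preview.
+for i in $(seq 1 30); do
+  url=$(heygen avatar looks list --group-id "$GROUP_ID" --limit 1 | jq -r '.data[0].preview_image_url')
+  if [ -n "$url" ] && [ "$url" != "null" ]; then break; fi
+  sleep 10
+done
+```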
+
+---
+
+## Discovery
+
+Interview the user. Be conversational, skip anything already answered.
+
+**DO NOT batch-ask all of these at once.** Ask one or two items at a time. Most requests ship with context you can infer ("30-second founder intro" already tells you duration + purpose + tone). Only ask what's genuinely missing. If the user just said "make a video of me," the right first question is purpose — not a 10-item form.
+
+**Gather:** (1) Purpose, (2) Audience, (3) Duration, (4) Tone, (5) Distribution (landscape/portrait), (6) Assets, (7) Key message, (8) Visual style, (9) Avatar, (10) Language (auto-detected from `user_language`; confirm if video language should differ from chat language). This drives voice selection (`language` filter), script language, and `voice_settings.locale`.
+
+### Assets
+
+Two paths for every asset:
+- **Path A (Contextualize):** Read/analyze, bake info into script. For reference material, auth-walled content.
+- **Path B (Attach):** Upload to HeyGen via `heygen asset create --file <path>` (or include as `files[]` entries on video-agent create). For visuals the viewer should see.
+- **A+B (Both):** Summarize for script AND attach original.
+
+📖 **Full routing matrix and upload examples → [references/asset-routing.md](references/asset-routing.md)**
+
+**Key rules:**
+- HTML URLs cannot go in `files[]` (Video Agent rejects `text/html`). Web pages are always Path A.
+- Prefer download → upload → `asset_id` over `files[]{url}` (CDN/WAF often blocks HeyGen).
+- If a URL is inaccessible, tell the user. Never fabricate content from an inaccessible source.
+- **Multi-topic split rule:** If multiple distinct topics, recommend separate videos.
+
+### Style Selection
+
+Two approaches — use one or combine both:
+
+**1. API Styles (`style_id`)** — Curated visual templates. One parameter replaces all visual direction.
+
+**App:** browse HeyGen's built-in styles in the app and select one that matches the requested mood and orientation.
+**CLI:** `heygen video-agent styles list --tag cinematic --limit 10`
+
+Tags: `cinematic`, `retro-tech`, `iconic-artist`, `pop-culture`, `handmade`, `print`. Pass `style_id` / `--style-id` to the video-agent create call.
+
+**Show users thumbnails + preview videos before choosing.** Browse by tag, show 3-5 options with previews, let user pick. If a style has a fixed `aspect_ratio`, match orientation to it.
+
+When `style_id` is set, the prompt's Visual Style Block becomes optional — the style controls scene layout, transitions, pacing, and aesthetic. You can still add specific media type guidance or color overrides.
+
+**2. 
Prompt Styles** — Full manual control via prompt text. Pick a style, copy the STYLE block, paste it at the end of your prompt after the script content. + +**How to pick:** Match mood first, content second. Ask: *"What should the viewer FEEL?"* + +> Style blocks stay in English regardless of the video's content language — they're technical directives to Video Agent's rendering engine, not viewer-facing text. + +**Mood-to-Style Guide:** + +| Content feels... | Use... | +|---|---| +| Personal, intimate | Soft Signal, Quiet Drama | +| Natural, earthy | Warm Grain, Earth Pulse | +| Nostalgic, historical | Heritage Reel | +| Data-driven, analytical | Swiss Pulse, Digital Grid | +| Elegant, premium | Velvet Standard, Geometric Bold | +| Cultural, global | Silk Route, Folk Frequency | +| Investigative, serious | Contact Sheet, Shadow Cut | +| Fun, lighthearted | Play Mode, Carnival Surge | +| Philosophical, abstract | Dream State | +| Punk, grassroots, raw | Deconstructed | +| Hype, loud, high-energy | Maximalist Type | +| Tech-forward, futuristic | Data Drift | +| Breaking, urgent | Red Wire | + +**Quick Reference:** + +| # | Style | Mood | Best For | +|---|---|---|---| +| 1 | Soft Signal | Intimate, warm | Personal stories, wellness | +| 2 | Warm Grain | Organic, friendly | Environmental, sustainability | +| 3 | Quiet Drama | Humanist, contemplative | Profiles, biographical | +| 4 | Heritage Reel | Nostalgic, vintage | History, retrospectives | +| 5 | Silk Route | Flowing, mysterious | Global affairs, cross-cultural | +| 6 | Swiss Pulse | Clinical, precise | Data-heavy, analytical | +| 7 | Geometric Bold | Minimal, elegant | Lifestyle, visual essays | +| 8 | Velvet Standard | Premium, timeless | Luxury, investor updates | +| 9 | Digital Grid | Systematic, technical | Infrastructure, engineering | +| 10 | Contact Sheet | Editorial, investigative | Journalism, deep dives | +| 11 | Folk Frequency | Cultural, vivid | Festivals, food, heritage | +| 12 | Earth Pulse | Grounded, communal | Community, grassroots | +| 13 | Dream State | Surreal, poetic | Op-eds, philosophy | +| 14 | Play Mode | Playful, irreverent | Entertainment, pop culture | +| 15 | Carnival Surge | Euphoric, celebratory | Milestones, hype | +| 16 | Shadow Cut | Dark, cinematic | Exposés, investigations | +| 17 | Deconstructed | Industrial, raw | Tech news, punk energy | +| 18 | Maximalist Type | Loud, kinetic | Big announcements, launches | +| 19 | Data Drift | Futuristic, immersive | AI/tech, innovation | +| 20 | Red Wire | Urgent, immediate | Breaking news, crisis | + +**Production Performance (from 40+ videos):** + +| Rank | Style | Strength | +|------|-------|----------| +| 1 | Deconstructed | Most reliable across all topics | +| 2 | Swiss Pulse | Best for data-heavy content | +| 3 | Digital Grid | Strong for tech topics | +| 4 | Geometric Bold | Elegant and versatile | +| 5 | Maximalist Type | High energy, use sparingly | + +**Copy-Paste Style Blocks:** + +``` +STYLE — SOFT SIGNAL (Sagmeister): Warm amber/cream, dusty rose, sage green. +Handwritten-style text. Close-up framing. Slow drifts and floats. +Soft dissolves with warm light leaks. +``` +``` +STYLE — WARM GRAIN (Eksell): Earth tones — ochre, forest green, terracotta, cream. +Organic rounded compositions. 16mm film grain. Rounded sans-serif. +Gentle wipes and soft cuts. +``` +``` +STYLE — QUIET DRAMA (Ray): Muted warm — sepia, deep brown, soft gold. +Portrait framing. Clean serif. Strong single-source contrast. +Slow fades to black. 
+``` +``` +STYLE — HERITAGE REEL (Cassandre): Faded gold, burgundy, navy, sepia wash. +Elegant centered serif. Vignetting and aged film grain. +Iris wipe transitions. +``` +``` +STYLE — SILK ROUTE (Abedini): Jewel tones — deep teal, burgundy, gold, lapis blue. +Layered compositions, all depths active. Elegant spaced type. +Flowing dissolves and smooth morphs. +``` +``` +STYLE — SWISS PULSE (Müller-Brockmann): Black/white + electric blue #0066FF. +Grid-locked. Helvetica Bold. Animated counters. Diagonal accents. +Grid wipe transitions. +``` +``` +STYLE — GEOMETRIC BOLD (Tanaka): Max 3 flat colors per frame. +60% negative space. Bold type as primary element. +Single focal point. Clean cuts on beat. +``` +``` +STYLE — VELVET STANDARD (Vignelli): Black, white, one accent: gold #c9a84c. +Thin ALL CAPS, wide spacing. Generous negative space. +Slow elegant cross-dissolves. +``` +``` +STYLE — DIGITAL GRID (Crouwel): Monospaced type. Dark #0a0a0a with cyan #00E5FF, amber #FFB300. +Pixel grid overlays. Terminal aesthetic. Clean wipe transitions. +``` +``` +STYLE — CONTACT SHEET (Brodovitch): High contrast B&W, desaturated accents. +Photo-editorial framing. Bold sans-serif annotations. Raw grain. +Hard cuts on beat. Snap-zooms. +``` +``` +STYLE — FOLK FREQUENCY (Terrazas): Vivid folk — hot pink, cobalt blue, sun yellow, emerald. +Bold rounded type. Folk art rhythms. Rich handmade textures. +Colorful wipes on festive rhythm. +``` +``` +STYLE — EARTH PULSE (Ghariokwu): Warm saturated — burnt orange, deep green, rich yellow. +Bold expressive type. Wide community framing. +Rhythmic cuts on beat. Freeze-frames. +``` +``` +STYLE — DREAM STATE (Tomaszewski): Muted palette + one surreal accent. +Thin elegant floating type. Soft edges, atmospheric haze. +Slow morph dissolves — NEVER hard cuts. +``` +``` +STYLE — PLAY MODE (Ahn Sang-soo): Electric blue, hot pink, lime green. +Bouncy spring physics. Oversized tilted text. Score cards, XP bars. +Pop cuts, bounce effects. +``` +``` +STYLE — CARNIVAL SURGE (Lins): Max color — hot pink #FF1493, yellow #FFE000, teal #00CED1. +Collage layering. Text MASSIVE at ANGLES. Confetti bursts. +Smash cuts, flash frames. +``` +``` +STYLE — SHADOW CUT (Hillmann): Deep blacks, cold greys + blood red accent. +Sharp angular text. Heavy shadow. Slow creeping push-ins. +Hard cuts to black. Film noir tension. +``` +``` +STYLE — DECONSTRUCTED (Brody): Dark grey #1a1a1a, rust orange #D4501E. +Type at angles, overlapping. Gritty textures, scan-line glitch. +Smash cuts with flash frames. +``` +``` +STYLE — MAXIMALIST TYPE (Scher): Red, yellow, black, white — max contrast. +Text IS the visual. Overlapping at different scales, 50-80% of frame. +Kinetic everything. Smash cuts, flash frames. +``` +``` +STYLE — DATA DRIFT (Anadol): Iridescent — purple #7c3aed, cyan #06b6d4, deep black. +Fluid morphing compositions. Thin futuristic type. +Liquid dissolves. Particles coalesce into numbers. +``` +``` +STYLE — RED WIRE (Tartakover): Red, black, white, emergency yellow. +Bold condensed all-caps. Split screens, tickers, timestamps. +Snap cuts, flash frames. Zero breathing room. 
+```
+
+**When to use which:**
+- User has no strong visual preference → browse API styles, pick one
+- User wants specific brand colors/fonts/motion → prompt style
+- User wants a curated look + specific media types → `style_id` + selective prompt additions
+
+### Avatar
+
+📖 **Full avatar discovery flow, creation APIs, voice selection → [references/avatar-discovery.md](references/avatar-discovery.md)**
+
+**AVATAR file resolution (run before any external avatar lookup):**
+
+If the request implies a specific subject, try the matching AVATAR file at
+the workspace root before browsing HeyGen catalogs.
+
+| Request signal | File to read |
+|---|---|
+| Named subject ("video with Eve", "Cleo's update") | `AVATAR-<NAME>.md` |
+| Agent self-reference ("video of yourself", "give us your update") | `AVATAR-AGENT.md` |
+| User self-reference ("video of me", "my video update") | `AVATAR-USER.md` |
+| No subject in request | (skip; ask in step 1 below) |
+
+`AVATAR-AGENT.md` and `AVATAR-USER.md` are role-based **symlinks**
+maintained by `heygen-avatar` Phase 5; they resolve to the current
+agent's / user's named AVATAR file at read time. Treat them like any
+other AVATAR file once read.
+
+If the AVATAR file (named or alias) exists and has a populated HeyGen
+section, extract `group_id` + `voice_id` and proceed to Frame Check. Skip
+the rest of the discovery flow.
+
+**Discovery flow (when no AVATAR file applies):**
+1. Ask: "Visible presenter or voice-over only?"
+2. If voice-over → no `avatar_id`, state in prompt.
+3. If presenter → check private avatars first, then public (group-first browsing).
+4. **Always show preview images.** Never just list names.
+5. Confirm voice preferences after avatar is settled.
+
+**Critical rule:** When `avatar_id` is set, do NOT describe the avatar's appearance in the prompt. Say "the selected presenter." This is the #1 cause of avatar mismatch.
+
+---
+
+## Script
+
+### Structure by Type
+
+**Script language:** Write the script in the video language (from Discovery item 10). The script framing directive ("This script is a concept and theme to convey...") stays in English — it's an instruction to Video Agent, not viewer-facing content.
+
+Content structure only. Do NOT assign per-scene durations — let Video Agent pace naturally.
+
+- **Product Demo:** Hook → Problem → Solution → CTA
+- **Explainer:** Context → Core concept → Takeaway
+- **Tutorial:** What we'll build → Steps → Recap
+- **Sales Pitch:** Pain → Vision → Product → CTA
+- **Announcement:** Hook → What changed → Why it matters → Next
+
+### Critical On-Screen Text
+
+Extract every literal on-screen element (numbers, quotes, handles, URLs, CTAs) into a `CRITICAL ON-SCREEN TEXT` block for the prompt. Without this, Video Agent will summarize/rephrase.
+
+### Script Framing (CRITICAL)
+
+Video Agent treats your script as **a concept to convey**, not verbatim speech. Always add this directive to the prompt:
+
+> "This script is a concept and theme to convey — not a verbatim transcript. You have full creative freedom to expand, elaborate, add examples, and fill the duration naturally. Do not pad with silence or pauses."
+
+Without it, Video Agent pads with dead air to hit the duration target.
+
+### Voice Rules
+
+Write for the ear. Short sentences. Active voice. Contractions are good.
+
+### Present the Script
+
+Show the user the full script with word count + estimated duration. Get approval before Prompt Craft.
+
+---
+
+## Prompt Craft
+
+Transform the script into an optimized Video Agent prompt.
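+
+The construction rules below stack into prompts shaped roughly like this sketch (wording illustrative, bracketed parts are placeholders, not a fixed template):
+
+```
+The selected presenter explains [topic] in a confident, conversational tone.
+Target duration: 45 seconds. This video covers one topic only: [topic].
+This script is a concept and theme to convey — not a verbatim transcript. [...]
+[script content]
+CRITICAL ON-SCREEN TEXT: [numbers, handles, CTAs to render verbatim]
+Use the attached screenshot as B-roll when discussing features.
+[style block: colors, media types, motion preferences]
+```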
+ +### Construction Rules + +1. **Narrator framing.** With `avatar_id`: "The selected presenter [explains]..." Without: describe desired presenter or "Voice-over narration only." +2. **Duration signal.** State the target duration in the prompt. +3. **Script freedom directive.** ALWAYS include the script framing directive from Script. +4. **Asset anchoring.** Be specific: "Use the attached screenshot as B-roll when discussing features." +5. **Tone calibration.** Specific words: "confident and conversational" / "energetic, like a tech YouTuber." +6. **One topic.** State explicitly. +7. **Style block at the end.** Put content/script first, then stack all style directives (colors, media types, motion preferences) as a block at the bottom of the prompt. +8. **Language separation.** Script content and narration in the video language. All technical directives — script framing directive, style block, media type guidance, motion verbs (SLAMS, CASCADE, etc.), and frame check corrections — stay in English. Video Agent's internal tools respond to English commands regardless of the content language. + +### Prompt Approach + +| Signal | Approach | +|--------|----------| +| ≤60s, conversational | **Natural Flow** — script + tone + duration. No scene labels. | +| >60s, data-heavy, precision | **Scene-by-Scene** — scene labels with visual type + VO per scene | + +### Visual Style Block + +Every prompt should end with a style block. Without one, visuals look inconsistent scene-to-scene. + +**Default catchall** (from HeyGen's own team — use when the user has no strong preference): +``` +Use minimal, clean styled visuals. Blue, black, and white as main colors. +Leverage motion graphics as B-rolls and A-roll overlays. Use AI videos when necessary. +When real-world footage is needed, use Stock Media. +Include an intro sequence, outro sequence, and chapter breaks using Motion Graphics. +``` + +**Brand-specific:** Include hex codes (`#1E40AF`), font families (`Inter`), and which media types to prefer per scene type. + +📖 **Style presets (Minimalistic, Cinematic, Bold, etc.) → [references/official-prompt-guide.md](references/official-prompt-guide.md)** + +### Media Type Selection + +Video Agent supports three media types. Guide it explicitly or it guesses (often wrong). + +| Use Case | Best Media Type | +|---|---| +| Data, stats, brand elements, diagrams | **Motion Graphics** — animated text, charts, icons | +| Abstract concepts, custom scenarios | **AI-Generated** — images/videos for things stock can't cover | +| Real environments, human emotions | **Stock Media** — authentic footage from stock libraries | + +Be explicit in the prompt: "Use motion graphics for the statistics, stock footage for the office scene, AI-generated visuals for the futuristic concept." + +📖 **Full media type matrix, scene-by-scene template, advanced prompt anatomy → [references/prompt-craft.md](references/prompt-craft.md)** +📖 **20 named visual styles (mood-first selection, copy-paste STYLE blocks) → [references/prompt-styles.md](references/prompt-styles.md)** +📖 **Motion vocabulary and B-roll → [references/motion-vocabulary.md](references/motion-vocabulary.md)** + +### Orientation + +YouTube/web/LinkedIn → `"landscape"` | TikTok/Reels/Shorts → `"portrait"` | Default → `"landscape"` + +--- + +## Frame Check + +**Runs automatically when `avatar_id` is set, before Generate. Appends correction notes to the Video Agent prompt. 
Does NOT generate images or create new looks.**
+
+> ⛔ **SUBAGENT RULE:** Frame Check MUST run in the **main session**. Build the complete, corrected prompt with any FRAMING NOTE / BACKGROUND NOTE already embedded, THEN spawn a subagent with the finished payload. Subagents only submit, poll, and deliver.
+
+### Avatar ID Resolution (ALWAYS run first)
+
+**Never trust a stored `look_id` — looks are ephemeral and get deleted.** Always resolve fresh from the `group_id`:
+
+**App:** use the HeyGen app to inspect the available looks for the selected avatar group.
+**CLI:** `heygen avatar looks list --group-id <group_id> --limit 20`
+
+From the response, pick the look matching the target orientation. Use the first match. If no looks exist in the group, tell the user.
+
+**Rule:** Store only `group_id` in AVATAR files. Resolve `look_id` at runtime.
+
+### Steps
+
+1. **Fetch avatar look metadata:** inspect the selected look in the HeyGen app (CLI: `heygen avatar looks get --look-id <look_id>`) → extract `avatar_type`, `preview_image_url`, `image_width`, `image_height`
+2. **Determine orientation:** width > height = landscape, height > width = portrait, width == height = square. If the fetch fails, assume portrait. (See the sketch at the end of this section.)
+3. **Determine background:** `photo_avatar` → Video Agent handles environment. `studio_avatar` → check if transparent/solid/empty. `video_avatar` → always has background.
+4. **Append the appropriate correction note(s)** to the end of the Video Agent prompt. That's it. No image generation, no new looks.
+
+### Correction Matrix
+
+| avatar_type | Orientation Match? | Has Background? | Corrections |
+|---|---|---|---|
+| `photo_avatar` | ✅ matched | (n/a) | None |
+| `photo_avatar` | ❌ mismatched or ◻ square | (n/a) | Framing note |
+| `studio_avatar` | ✅ matched | ✅ Yes | None |
+| `studio_avatar` | ✅ matched | ❌ No | Background note |
+| `studio_avatar` | ❌ mismatched or ◻ square | ✅ Yes | Framing note |
+| `studio_avatar` | ❌ mismatched or ◻ square | ❌ No | Framing note + Background note |
+| `video_avatar` | ✅ matched | ✅ Yes | None |
+| `video_avatar` | ❌ mismatched or ◻ square | ✅ Yes | Framing note |
+
+### Framing Note (append to prompt)
+
+For portrait/square avatar → landscape video:
+```
+FRAMING NOTE: The selected avatar image is in {source} orientation but this video is landscape (16:9). Frame the presenter from the chest up, centered in the landscape canvas. Use the AI Image tool's generative fill to extend the scene horizontally with a complementary background environment that matches the video's tone (studio, office, or contextually appropriate setting). Do NOT add black bars or pillarboxing. The avatar should feel natural in the 16:9 frame.
+```
+
+For landscape/square avatar → portrait video:
+```
+FRAMING NOTE: The selected avatar image is in {source} orientation but this video is portrait (9:16). Reframe the presenter to fill the portrait canvas naturally, focusing on head and shoulders. Use the AI Image tool's generative fill to extend vertically if needed. Do NOT add letterboxing. The avatar should fill the portrait frame comfortably.
+```
+
+### Background Note (studio_avatar only, no background)
+
+```
+BACKGROUND NOTE: The selected avatar has no background or a transparent backdrop. Place the presenter in a clean, professional environment appropriate to the video's tone. For business/tech content: modern studio with soft lighting and subtle depth. For casual content: bright, minimal space with natural light. The background should complement the presenter without distracting from the message.
+```
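+
+A minimal sketch of the resolution and orientation steps in CLI mode. The jq field paths are assumptions about the response shape; verify against real `heygen avatar looks` output:
+
+```bash
+# Resolve a fresh look from the stored group_id, then derive its orientation.
+# .data[0] takes the first look for brevity; the skill picks the first match
+# for the target orientation.
+look=$(heygen avatar looks list --group-id "$GROUP_ID" --limit 20 | jq '.data[0]')
+w=$(jq -r '.image_width' <<<"$look")
+h=$(jq -r '.image_height' <<<"$look")
+if [ "$w" -gt "$h" ]; then orientation=landscape
+elif [ "$h" -gt "$w" ]; then orientation=portrait
+else orientation=square
+fi
+```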
+
+📖 **Full correction templates and stacking matrix → [references/frame-check.md](references/frame-check.md)**
+
+---
+
+## Generate
+
+### Pre-Submit Gate
+
+**Frame Check:** If `avatar_id` is set, ensure Frame Check ran and any correction notes are appended to the prompt.
+
+**Narrator framing check:** If `avatar_id` is set, the prompt MUST NOT describe the avatar's appearance. Say "the selected presenter" instead.
+
+- **Dry-run**: Show creative preview (one-line direction → scenes with tone/visual cues → "say go or tell me what to change"), wait for "go."
+- **Full Producer**: User approved script. Proceed.
+- **Quick Shot**: Generate immediately.
+
+### Submit
+
+**Step 1: Run Frame Check (if `avatar_id` set) — MAIN SESSION ONLY**
+Before submitting, run the Frame Check steps above. Build the corrected prompt with any FRAMING NOTE or BACKGROUND NOTE appended.
+
+**Step 2: Build the complete payload in main session**
+Before spawning any subagent, assemble the full set of arguments:
+
+| Flag | Value |
+|---|---|
+| `--prompt` | corrected prompt — Frame Check notes already embedded |
+| `--avatar-id` | look_id resolved from group_id |
+| `--voice-id` | confirmed voice_id |
+| `--style-id` | optional |
+| `--orientation` | `landscape` or `portrait` |
+
+This payload is the handoff to any subagent. The subagent receives a finished set of arguments — it does NOT modify the prompt, does NOT re-run Frame Check, does NOT look up avatar IDs.
+
+**Step 3: Subagent spawn pattern (for batch or non-blocking generation)**
+
+When generating multiple videos, or when you want non-blocking polling, spawn one subagent per video with the finished args.
+Subagents are for **submit + poll + deliver only**. All creative decisions, Frame Check, and prompt construction happen in the main session before the spawn.
+
+> ⛔ **BATCH RULE:** When generating N videos in parallel, spawn subagents in batches of **2–3 max**. Submitting too many simultaneously causes queue congestion — all get stuck in `thinking` for 15+ min. Submit batch 1, wait for completions, then submit batch 2.
+
+**Step 4: Submit**
+
+**App:** use the HeyGen app's video-generation flow with the prompt, avatar, voice, style, and orientation inputs.
+
+**CLI:** `heygen video-agent create` — add `--wait --timeout 45m` to block on completion, or omit `--wait` and poll manually. **Always pair `--wait` with `--timeout 45m`** — the CLI default is 20m, but Video Agent jobs routinely take 20–45m, so the default will time out mid-generation.
+
+```bash
+heygen video-agent create \
+  --prompt "..." \
+  --avatar-id "..." \
+  --voice-id "..." \
+  --orientation landscape \
+  --wait --timeout 45m
+```
+
+The CLI returns JSON on stdout: `{"data": {"video_id": "...", "session_id": "..."}}` after submission. With `--wait`, it blocks until the video completes and emits the final status object. Without `--wait`, submit returns immediately — poll with `heygen video-agent get --session-id <session_id>`.
+
+**⚠️ Always capture `session_id` immediately.** Session URL: `https://app.heygen.com/video-agent/{session_id}`. Cannot be recovered later.
+
+### Polling
+
+**App:** use the HeyGen app's job/status view to monitor progress and collect the resulting video once generation completes.
+**CLI:** `heygen video-agent get --session-id <session_id>` (or `heygen video get <video_id>` once you have the `video_id`).
+
+Total wall time per video: **20–45 minutes**. If you passed `--wait`, the CLI handles polling with exponential backoff. If polling manually: first check at **5 min**, then every **60s** up to 45 min.
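+
+A minimal sketch of that manual loop. The `.data.status` path is an assumption; verify against real `heygen video-agent get` output:
+
+```bash
+# First check at 5 min, then every 60 s, up to ~45 min total.
+sleep 300
+for i in $(seq 1 40); do
+  status=$(heygen video-agent get --session-id "$SESSION_ID" | jq -r '.data.status')
+  case "$status" in completed|failed) break ;; esac
+  sleep 60
+done
+```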
+
+Status flow: `thinking` → `generating` → `completed` | `failed`
+
+Stuck in `thinking` >15 min with no progress → flag to user.
+
+### Delivery
+
+1. Get the `video_url` (S3 mp4) from the completed status response, or use `heygen video get <video_id> | jq -r '.data.video_page_url'` for the shareable link.
+2. Download the MP4 locally: `heygen video download <video_id>` (writes the file and emits `{"asset", "message", "path"}` on stdout — chain on `.path`).
+3. Send inline via message tool: `message(action:send, media:"<local_path>", caption:"Your video is ready! 🎬\n📊 Duration: [actual]s vs [target]s ([percentage]%)")`. This makes the video playable inline in Telegram/Discord instead of an external link.
+4. Also share the HeyGen dashboard link for editing: `https://app.heygen.com/videos/<video_id>`
+
+Always report duration accuracy. Clean up downloaded files after sending.
+
+---
+
+## Deliver
+
+**Status:** DONE | DONE_WITH_CONCERNS | BLOCKED | NEEDS_CONTEXT
+
+### Self-Evaluation Log
+
+After EVERY generation, append to `heygen-video-log.jsonl`:
+
+```json
+{"timestamp":"ISO-8601","video_id":"...","session_id":"...","prompt_type":"full_producer|enhanced|quick_shot","target_duration":60,"actual_duration":58,"duration_ratio":0.97,"avatar_id":"...","voice_id":"...","style_id":"...","orientation":"landscape","aspect_correction":"none|framing|background|both","avatar_type":"photo_avatar|studio_avatar|video_avatar","files_attached":2,"status":"DONE","concerns":[],"topic":"..."}
+```
+
+If user wants changes: adjust prompt based on feedback, re-generate. Never retry with the exact same prompt.
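+
+A minimal sketch of the append (values illustrative; the full field set is the JSON line above):
+
+```bash
+# One compact JSON object per line, appended after every generation.
+jq -nc --arg ts "$(date -u +%FT%TZ)" --arg vid "$VIDEO_ID" --arg sid "$SESSION_ID" \
+  '{timestamp:$ts, video_id:$vid, session_id:$sid, status:"DONE", concerns:[]}' \
+  >> heygen-video-log.jsonl
+```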
+
+---
+
+## Best Practices
+
+- **Front-load the hook.** First 5s = 80% of retention.
+- **One idea per video.** Single-topic produces dramatically better results.
+- **Write for the ear.** If you wouldn't say it to a friend, rewrite it.
+
+📖 **Known issues → [references/troubleshooting.md](references/troubleshooting.md)**
diff --git a/plugins/heygen/skills/heygen-video/agents/openai.yaml b/plugins/heygen/skills/heygen-video/agents/openai.yaml
new file mode 100644
index 00000000..b25fde39
--- /dev/null
+++ b/plugins/heygen/skills/heygen-video/agents/openai.yaml
@@ -0,0 +1,4 @@
+interface:
+  display_name: "HeyGen Video"
+  short_description: "Generate presenter-led HeyGen videos"
+  default_prompt: "Create a 30-second presenter video for my team about this week's progress. Help me shape the script, choose the avatar and voice, and generate the final HeyGen video."
diff --git a/plugins/heygen/skills/heygen-video/references/asset-routing.md b/plugins/heygen/skills/heygen-video/references/asset-routing.md
new file mode 100644
index 00000000..04921e91
--- /dev/null
+++ b/plugins/heygen/skills/heygen-video/references/asset-routing.md
@@ -0,0 +1,86 @@
+# Asset Handling — The Classification Engine
+
+When the user provides files, URLs, or references, route each asset to the right path. The user should NEVER have to think about this.
+
+## Two Paths
+
+| Path | What happens | When to use |
+|------|-------------|-------------|
+| **A: Contextualize → Prompt** | Read/analyze the asset, extract key info, bake into script. Video Agent never sees the original. | Reference material, auth-walled content, documents where the *information* matters more than the *visual*. |
+| **B: Attach to API** | Upload the raw file via `files[]`. Video Agent analyzes, extracts graphics, uses as frames/B-roll. | Screenshots, branded assets, PDFs with important visual layouts, images the viewer should literally see. |
+| **A+B: Both** | Contextualize for script quality AND attach for visual use. | Long docs where you need to summarize but Video Agent should also have the full source. |
+
+## Classification Flow
+
+```
+1. Can Video Agent access this directly?
+   - Public URL (no auth, no paywall) → YES
+   - Private/internal URL → NO
+   - Local file → NO (must upload first)
+
+2. Should the viewer SEE this asset?
+   - Screenshot, logo, product image, chart → YES → Path B
+   - Research doc, article, context material → NO → Path A
+   - Ambiguous → Path A+B
+
+3. Is the content too long for the prompt?
+   - Short (< 500 words) → fits in prompt
+   - Long (> 500 words) → summarize key points, attach full doc
+```
+
+## Decision Matrix
+
+| Asset Type | Publicly Accessible? | Show On Screen? | Route |
+|-----------|---------------------|----------------|-------|
+| Screenshot / image | N/A | Yes | **B: Attach** + describe in prompt as B-roll |
+| Logo / brand asset | N/A | Yes | **B: Attach** + anchor to intro/outro |
+| Public URL to file (PDF, image, video) | Yes | Maybe | **B: Download → upload via `/v3/assets` → pass `asset_id`** + summarize |
+| Public URL to web page (HTML) | Yes | No | **A: Fetch and contextualize only.** Do NOT pass HTML URLs in `files[]`. |
+| Auth-walled URL (requires login) | No | No | **A: Ask the user to paste the content.** Never fabricate. |
+| PDF (short, text-heavy) | N/A | No | **A+B: Extract key points** + attach |
+| PDF (long, visual-rich) | N/A | Maybe | **B: Attach** + summarize top points |
+| Raw data / spreadsheet | N/A | Partially | **A: Analyze and describe** key stats. Attach if charts should appear. |
+
+## Executing Routes
+
+### Path A (Contextualize)
+- URLs: retrieve publicly accessible content with the environment's standard web/content fetch capability
+- For auth-walled content you cannot access: ask the user to paste the text directly
+- Extract 3-5 most important points relevant to the video
+- Weave naturally into the script. Don't dump. Integrate.
+
+### Path B (Attach)
+Upload to HeyGen:
+
+**App:** upload through the HeyGen app's asset flow when available.
+**CLI:** `heygen asset create --file /path/to/file.png`
+
+Max 32MB per file. Returns JSON with the new `asset_id`.
+
+Or pass inline in `files[]`:
+```json
+{"type": "url", "url": "https://example.com/image.png"}
+{"type": "asset_id", "asset_id": "<asset_id>"}
+{"type": "base64", "data": "<base64-encoded bytes>", "content_type": "image/png"}
+```
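+
+A minimal sketch of the upload-then-reference flow in CLI mode. The `.data.asset_id` path is an assumption about the response shape:
+
+```bash
+# Upload a local screenshot and capture its asset_id for files[].
+asset_id=$(heygen asset create --file ./dashboard.png | jq -r '.data.asset_id')
+# Then pass {"type": "asset_id", "asset_id": "$asset_id"} as a files[] entry.
+```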
+
+### Describe Asset Usage in Prompt
+Be SPECIFIC:
+- "Use the uploaded dashboard screenshot as B-roll when discussing analytics"
+- "Display the company logo in the intro and end card"
+
+### Log Classification
+In the learning log entry, record:
+```json
+"assets_classified": [{"type": "image", "route": "attach", "accessible": true, "reason": "product screenshot"}]
+```
+
+## Rules
+
+- **Never ask the user which path unless genuinely 50/50.** You're the producer. Make the call.
+- **When in doubt, do both (A+B).** Over-providing costs nothing.
+- **Always describe attached assets in the prompt.** Uploading without description = ignored.
+- **Auth-walled content is YOUR job.** Bridge the gap between your access and Video Agent's.
+- **URLs that fail:** Try the environment's standard web/content fetch capability. If login/paywall/404 → tell the user, ask for content directly. Never silently fabricate.
+- **HTML URLs cannot go in `files[]`.** Video Agent rejects `text/html`. Web pages are ALWAYS Path A only.
+- **Prefer download→upload→asset_id** over `files[]{url}`. Requests from HeyGen's servers are often blocked by CDN/WAF rules.
diff --git a/plugins/heygen/skills/heygen-video/references/avatar-discovery.md b/plugins/heygen/skills/heygen-video/references/avatar-discovery.md
new file mode 100644
index 00000000..b8be275d
--- /dev/null
+++ b/plugins/heygen/skills/heygen-video/references/avatar-discovery.md
@@ -0,0 +1,179 @@
+# Avatar Discovery & Voice Selection (heygen-video)
+
+This guide covers *avatar discovery for video generation* — how heygen-video
+finds an appropriate presenter (or skips presenter entirely) before calling
+the Video Agent. For *avatar creation*, see `heygen-avatar` and
+[`../../heygen-avatar/references/avatar-creation.md`](../../heygen-avatar/references/avatar-creation.md).
+
+## Path 0: Resolve workspace AVATAR files first
+
+Before any HeyGen catalog lookup, check the workspace root for an
+applicable `AVATAR-*.md` file. These are written by `heygen-avatar`
+and contain `Group ID` + `Voice ID` ready to use, with no API call
+needed.
+
+Resolution precedence:
+
+| Request signal | File to read |
+|---|---|
+| Named subject ("video with Eve", "Cleo's update") | `AVATAR-<NAME>.md` |
+| Agent self-reference ("video of yourself", "give us your update") | `AVATAR-AGENT.md` (symlink) |
+| User self-reference ("video of me", "my video update") | `AVATAR-USER.md` (symlink) |
+| No subject in request | Skip to Path A |
+
+`AVATAR-AGENT.md` and `AVATAR-USER.md` are role-based symlinks maintained
+by `heygen-avatar` Phase 5; they resolve to the current agent's / user's
+named AVATAR file at read time. Treat them like any other AVATAR file
+once read.
+
+If the resolved file has a populated HeyGen section, extract `Group ID`
+and `Voice ID` and proceed to Frame Check. Skip Path A entirely. If the
+file exists but the HeyGen section is empty, run `heygen-avatar` Phase 2
+first.
+
+If no file applies (no name match, no role alias, generic catalog
+browsing requested) — fall through to Path A below.
+
+## Path A: Discover Existing Avatars
+
+### A1: Check for private avatars first
+
+**If user specifies an avatar by name** (e.g. "use Eve's Podcast look"), take the fast path:
+
+**App:** use the HeyGen app to browse private avatar looks and filter client-side by name match.
+**CLI:**
+```bash
+heygen avatar looks list --ownership private --limit 50
+```
+Avoids the 2-call group→looks pattern.
+
+**If user wants to browse**, use the group-first flow:
+
+**App:**
+1. Browse private avatar groups (each group = one person)
+2. Open a group to view its available looks
+
+**CLI:**
+```bash
+heygen avatar list --ownership private --limit 50
+heygen avatar looks list --group-id <group_id> --limit 50
+```
+
+Each look has an `id` — this is the `avatar_id` you pass downstream.
+
+Avatar types: `studio_avatar`, `video_avatar`, `photo_avatar`. Photo avatars support `motion_prompt` and `expressiveness`.
+
+**ALWAYS show the preview image** when presenting an avatar look. Each look response includes `preview_image_url` — display inline.
+
+### A2: Check last-used avatar
+
+Check `heygen-video-log.jsonl` for last used avatar_id. If found:
+
+**App:** inspect the selected look in the HeyGen app.
+**CLI:** `heygen avatar looks get --look-id <look_id>`
+
+Show preview image: "Last time you used [Avatar Name]. Use them again?"
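+
+A minimal sketch of that lookup, assuming the log format written by SKILL.md's Self-Evaluation Log step:
+
+```bash
+# The most recent generation is the last line of the learning log.
+last_avatar=$(tail -n 1 heygen-video-log.jsonl | jq -r '.avatar_id')
+```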
+
+### A3: Avatar conversation
+
+Ask: "Do you want a visible presenter, or voice-over only?"
+
+If voice-over only → no `avatar_id`. State in prompt: "Voice-over narration only."
+
+If presenter wanted, present private avatars first. For public/stock avatars, browse by group:
+
+**App:** use the HeyGen app to browse public avatar groups.
+**CLI:**
+```bash
+heygen avatar list --ownership public --limit 20
+```
+
+Show group names + one representative image. Let the user pick a person.
+
+**App:** use the HeyGen app to inspect looks for the selected avatar group.
+**CLI:**
+```bash
+heygen avatar looks list --group-id <group_id> --limit 10
+```
+
+**Why group-first:** The flat `heygen avatar looks list --ownership public` call returns 50+ results for only 3 unique people per page. Group-level browsing (2 calls) gives much better discovery UX.
+
+### A4: Voice direction
+
+After avatar is settled, confirm voice preferences (accent, delivery style, language).
+
+**ALWAYS show a playable voice preview.** Each voice response includes `preview_audio_url` — share it.
+
+**Handling missing/broken previews:** Some voices may not expose a usable preview URL and can return `null`. When this happens: note "(no preview available)" and offer to generate a short TTS sample via the app's preview flow or `heygen voice speech create --text "<sample text>" --voice-id <voice_id> --input-type plain_text --language en --locale en-US` (CLI).
+
+---
+
+## Path B: Create a New Avatar
+
+If no existing avatar fits and the user wants one created, route to the
+`heygen-avatar` skill. See
+[`../../heygen-avatar/references/avatar-creation.md`](../../heygen-avatar/references/avatar-creation.md)
+for the full creation API surface (photo / prompt / digital twin), file
+input formats, and identity field mappings.
+
+After `heygen-avatar` finishes, an `AVATAR-<NAME>.md` file is written and
+heygen-video resumes here at Path 0 to pick it up.
+
+---
+
+## Path C: Direct Image (Simplest for One-Off)
+
+Skip avatar creation. Pass `image_url` directly:
+
+**App:** use the HeyGen app's image-to-video flow when available.
+**CLI:**
+```bash
+heygen video create -d '{
+  "image_url": "https://example.com/headshot.jpg",
+  "script": "