// diff --git a/src/common/inlineScriptMetadata.ts b/src/common/inlineScriptMetadata.ts
// new file mode 100644
// index 00000000..5082eddf
// --- /dev/null
// +++ b/src/common/inlineScriptMetadata.ts
// @@ -0,0 +1,406 @@

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

import * as tomljs from '@iarna/toml';
import * as fs from 'fs/promises';
import { Uri } from 'vscode';
import { traceVerbose, traceWarn } from './logging';

/**
 * Parsed and validated PEP 723 `script` metadata block.
 *
 * See: https://packaging.python.org/en/latest/specifications/inline-script-metadata/
 */
export interface InlineScriptMetadata {
    /** Parsed value of `requires-python`, if present. */
    readonly requiresPython?: string;
    /** Parsed value of `dependencies`, if present. */
    readonly dependencies?: readonly string[];
    /** Parsed `[tool]` table, opaque to this parser. */
    readonly tool?: tomljs.JsonMap;
    /**
     * Character offsets of the `# /// script` … `# ///` block in the
     * normalized text that was parsed (leading BOM removed, CRLF / lone CR
     * converted to LF):
     *   - `start` is inclusive: the first character of the `# /// script` line;
     *   - `end` is exclusive: immediately after the closing `# ///` line's
     *     terminating newline, or end of string if there is no trailing
     *     newline.
     */
    readonly range: { readonly start: number; readonly end: number };
}

/**
 * Maximum bytes read from the head of a file when looking for inline
 * script metadata. PEP 723 blocks live at the top of files, so reading
 * the first chunk is sufficient. Callers that need to handle scripts
 * with very large leading shebang / comment blocks should know that
 * anything past this byte boundary is invisible to the detector.
 */
export const MAX_HEADER_BYTES = 8 * 1024;

// Canonical block regex from the PEP 723 spec, translated to JavaScript
// (Python's `(?P<name>...)` becomes `(?<name>...)` in JS).
/**
 * Canonical block regex from the PEP 723 spec, translated to JavaScript
 * (Python's `(?P<name>...)` becomes `(?<name>...)` in JS). The flag
 * combination `gm` is required so `^` / `$` anchor on line boundaries
 * and so `String.prototype.matchAll` can iterate every candidate block.
 *
 * Named groups:
 *   - `type`    — the block type that follows `# /// ` on the opening line;
 *   - `content` — every content line (each `#` or `# …`), newlines included.
 *
 * Important: this regex assumes line endings have already been
 * normalized to `\n`. In Python's `re` module `.` matches `\r`, but in
 * JavaScript it does not, so a literal CRLF file would behave
 * inconsistently against this pattern. `readInlineScriptMetadata`
 * normalizes line endings before applying the regex.
 *
 * The pattern is consumed exclusively via `text.matchAll(BLOCK_RE)`,
 * which constructs a fresh iterator each call and does NOT mutate the
 * regex's `lastIndex`. Do not call `BLOCK_RE.exec` directly — that
 * would reintroduce the stateful-lastIndex footgun.
 */
const BLOCK_RE = /^# \/\/\/ (?<type>[a-zA-Z0-9-]+)$\s(?<content>(^#(| .*)$\s)+)^# \/\/\/$/gm;

/**
 * Parse PEP 723 `script` metadata from script source text.
 *
 * Encoding: input is processed as UTF-8 text. The `# -*- coding: ... -*-`
 * declaration is not honored (the spec permits but does not require it).
 *
 * @param scriptText Full or leading text of a Python script.
 * @returns
 *   - the parsed metadata if the text contains exactly one well-formed
 *     `script` block;
 *   - `undefined` if the input is empty, if there is no `script` block,
 *     if there are multiple `script` blocks (per spec this MUST error),
 *     if the block's TOML payload is malformed, or if a known field
 *     (`requires-python`, `dependencies`, `tool`) has the wrong type.
 */
export function readInlineScriptMetadata(scriptText: string): InlineScriptMetadata | undefined {
    if (!scriptText) {
        return undefined;
    }

    // Strip a single leading UTF-8 BOM (\uFEFF). Files saved as
    // "UTF-8 with BOM" on Windows have this; without stripping it the
    // first line becomes "\uFEFF# /// script" and the regex fails to
    // match.
    let text = scriptText.charCodeAt(0) === 0xfeff ? scriptText.slice(1) : scriptText;

    // Normalize CRLF and lone CR to LF so the canonical regex (which
    // was authored assuming `.` matches `\r`, true in Python's re but
    // not in JavaScript) behaves consistently. The offsets in `range`
    // refer to this normalized text.
    text = text.replace(/\r\n?/g, '\n');

    // Collect ALL matches first so we can detect the "multiple script
    // blocks" error case the spec requires us to surface. `matchAll`
    // uses its own iterator state, so the shared regex is never mutated.
    const scriptMatches: RegExpMatchArray[] = [];
    for (const m of text.matchAll(BLOCK_RE)) {
        // Per spec, tools MUST NOT read non-standardized block types.
        // The only standardized type today is `script`.
        if (m.groups?.type === 'script') {
            scriptMatches.push(m);
        }
    }

    if (scriptMatches.length === 0) {
        traceVerbose('inline script metadata: no `# /// script` block found');
        return undefined;
    }
    if (scriptMatches.length > 1) {
        traceWarn(
            `inline script metadata: ${scriptMatches.length} \`# /// script\` blocks found; per PEP 723 multiple blocks of the same type MUST be an error.`,
        );
        return undefined;
    }

    const match = scriptMatches[0];
    const rawContent = match.groups?.content;
    const matchStart = match.index;
    if (rawContent === undefined || matchStart === undefined) {
        // Not expected in practice: BLOCK_RE always defines `content`, and
        // `matchAll` populates `index`. The lib typings mark both optional,
        // so narrow here instead of using non-null assertions.
        return undefined;
    }

    // Validate each content line and reconstruct the TOML payload,
    // applying the spec's content-extraction rule:
    //   if line[1] === ' ' drop 2 chars, else drop 1 char (the leading '#').
    // The canonical regex already restricts content lines to '#' or
    // '# <text>', but we walk the lines explicitly here both for
    // safety against regex-engine quirks and to keep the
    // reconstruction logic obvious.
    const reconstructed: string[] = [];
    const contentLines = rawContent.split('\n');
    for (const line of contentLines) {
        if (line.length === 0) {
            // Final element after splitting on the trailing '\n' that
            // belongs to the last content line. Not a real line.
            continue;
        }
        if (line[0] !== '#') {
            traceWarn(`inline script metadata: invalid content line (must start with '#'): ${JSON.stringify(line)}`);
            return undefined;
        }
        if (line.length === 1) {
            // Bare '#': a blank content line within the block.
            reconstructed.push('');
            continue;
        }
        if (line[1] !== ' ') {
            // Per spec, content lines are exactly '#' or '# <text>'.
            // '##foo', '#\tfoo', '#foo' are not valid.
            traceWarn(`inline script metadata: invalid content line (expected '#' or '# '): ${JSON.stringify(line)}`);
            return undefined;
        }
        reconstructed.push(line.slice(2));
    }

    let parsed: tomljs.JsonMap;
    try {
        parsed = tomljs.parse(reconstructed.join('\n'));
    } catch (err) {
        traceWarn('inline script metadata: failed to parse TOML in `# /// script` block:', err);
        return undefined;
    }

    // Validate the small set of known fields. Unknown top-level keys
    // are tolerated — the spec reserves room for future tool tables
    // and we don't want to be brittle.
    let requiresPython: string | undefined;
    const rawRequiresPython = parsed['requires-python'];
    if (rawRequiresPython !== undefined) {
        if (typeof rawRequiresPython !== 'string') {
            traceWarn(
                `inline script metadata: 'requires-python' must be a string, got ${typeof rawRequiresPython}`,
            );
            return undefined;
        }
        requiresPython = rawRequiresPython;
    }

    let dependencies: readonly string[] | undefined;
    if (parsed.dependencies !== undefined) {
        if (!Array.isArray(parsed.dependencies)) {
            traceWarn('inline script metadata: `dependencies` must be an array of strings');
            return undefined;
        }
        for (const dep of parsed.dependencies) {
            if (typeof dep !== 'string') {
                traceWarn('inline script metadata: each entry in `dependencies` must be a string');
                return undefined;
            }
        }
        // Defensive copy + freeze so consumers cannot mutate the array
        // handed out in the returned metadata.
        dependencies = Object.freeze((parsed.dependencies as string[]).slice());
    }

    let tool: tomljs.JsonMap | undefined;
    if (parsed.tool !== undefined) {
        if (typeof parsed.tool !== 'object' || Array.isArray(parsed.tool) || parsed.tool === null) {
            traceWarn('inline script metadata: `tool` must be a table');
            return undefined;
        }
        tool = parsed.tool as tomljs.JsonMap;
    }

    // Range end: position immediately AFTER the closing `# ///` line's
    // newline. The regex's `$` anchor stops before the newline, so we
    // step over it explicitly when present.
    let end = matchStart + match[0].length;
    if (text.charAt(end) === '\n') {
        end += 1;
    }

    return {
        requiresPython,
        dependencies,
        tool,
        range: { start: matchStart, end },
    };
}

// Read PEP 723 metadata from a file. Reads only the first
// `MAX_HEADER_BYTES` bytes of the file — PEP 723 blocks live at the
// top of files, so reading the whole file would be wasteful when this
// is invoked across many candidate `.py` files.
/**
 * Read PEP 723 metadata from a file. Reads only the first
 * `MAX_HEADER_BYTES` bytes of the file — PEP 723 blocks live at the
 * top of files, so reading the whole file would be wasteful when this
 * is invoked across many candidate `.py` files.
 *
 * Returns `undefined` for:
 *   - any URI scheme other than `file:` (notebook cells, untitled
 *     buffers, git: revisions, etc. are out of scope);
 *   - any I/O error (logged at `traceVerbose`);
 *   - any of the malformed-metadata cases handled by
 *     `readInlineScriptMetadata`.
 *
 * @param uri Location of the candidate script.
 */
export async function readInlineScriptMetadataFromFile(uri: Uri): Promise<InlineScriptMetadata | undefined> {
    if (uri.scheme !== 'file') {
        traceVerbose(`inline script metadata: skipping non-file URI scheme '${uri.scheme}'`);
        return undefined;
    }

    let text: string;
    try {
        const handle = await fs.open(uri.fsPath, 'r');
        try {
            const buf = Buffer.alloc(MAX_HEADER_BYTES);
            // Read from offset 0 of the file; `bytesRead` may be smaller
            // than the buffer for short files, so decode only that prefix.
            const { bytesRead } = await handle.read(buf, 0, MAX_HEADER_BYTES, 0);
            text = buf.toString('utf-8', 0, bytesRead);
        } finally {
            // Always release the descriptor, including when the read throws.
            await handle.close();
        }
    } catch (err) {
        traceVerbose(`inline script metadata: failed to read ${uri.fsPath}:`, err);
        return undefined;
    }

    return readInlineScriptMetadata(text);
}

/**
 * Test whether a Python `version` (e.g. "3.12.4") satisfies a PEP 440
 * version specifier (e.g. ">=3.11"). Implements the subset of PEP 440
 * needed by `requires-python`:
 *
 *   - operators `==`, `!=`, `>=`, `<=`, `>`, `<`, `~=`, `===`;
 *   - comma-separated clauses are AND-ed;
 *   - wildcard `==X.Y.*` (and the negated `!=X.Y.*`) is supported;
 *   - release segments are zero-padded when compared, so "3.11" and
 *     "3.11.0" are the same release (this also applies to wildcard and
 *     `~=` prefix matching);
 *   - pre-release / dev / post / local version semantics are NOT
 *     modeled (script `requires-python` is almost always a simple
 *     lower bound; suffixes on the input version are truncated to
 *     the release segments).
 *
 * @param requiresPython Specifier string, e.g. ">=3.11,<3.14".
 * @param version Candidate interpreter version, e.g. "3.12.4".
 * @returns `true` when every clause is satisfied. Returns `false` (and
 *   logs a `traceWarn`) on an unparseable specifier — safer than
 *   defaulting to "any version goes". Also `false` when either
 *   argument is empty.
 */
export function matchesPythonVersion(requiresPython: string, version: string): boolean {
    if (!requiresPython || !version) {
        return false;
    }
    const clauses = requiresPython
        .split(',')
        .map((c) => c.trim())
        .filter((c) => c.length > 0);
    if (clauses.length === 0) {
        return false;
    }
    return clauses.every((clause) => matchSingleClause(clause, version));
}

// Longest-match-first order matters: `===` must beat `==`, `~=` and
// `>=` / `<=` / `!=` must beat the single-char operators.
const SPECIFIER_RE = /^(===|~=|==|!=|>=|<=|>|<)\s*(.+)$/;

/**
 * True when `version` starts with the release segments in `prefix`.
 * Missing trailing segments of `version` count as 0, matching PEP 440's
 * zero-padding rule (so `3.11` has the prefix `3.11.0`).
 */
function hasReleasePrefix(version: readonly number[], prefix: readonly number[]): boolean {
    return prefix.every((seg, i) => (version[i] ?? 0) === seg);
}

/**
 * Evaluate one `<operator><version>` clause against `version`.
 * Returns `false` (after a `traceWarn`) when the clause or either
 * version cannot be parsed.
 */
function matchSingleClause(clause: string, version: string): boolean {
    const m = clause.match(SPECIFIER_RE);
    if (!m) {
        traceWarn(`inline script metadata: unrecognized requires-python clause: ${JSON.stringify(clause)}`);
        return false;
    }
    const op = m[1];
    const specVersion = m[2].trim();

    if (op === '===') {
        // Arbitrary-equality: exact string comparison after stripping
        // a leading 'v' (which PEP 440 permits).
        const normSpec = specVersion.replace(/^v/i, '');
        const normVer = version.replace(/^v/i, '');
        return normSpec === normVer;
    }

    if (specVersion.endsWith('.*')) {
        if (op !== '==' && op !== '!=') {
            traceWarn(
                `inline script metadata: wildcard versions are only valid with '==' or '!=': ${JSON.stringify(clause)}`,
            );
            return false;
        }
        const prefix = parseRelease(specVersion.slice(0, -2));
        const ver = parseRelease(version);
        if (prefix === undefined || ver === undefined) {
            traceWarn(`inline script metadata: cannot parse version for clause ${JSON.stringify(clause)}`);
            return false;
        }
        const isPrefixMatch = hasReleasePrefix(ver, prefix);
        return op === '==' ? isPrefixMatch : !isPrefixMatch;
    }

    const specSegs = parseRelease(specVersion);
    const verSegs = parseRelease(version);
    if (specSegs === undefined || verSegs === undefined) {
        traceWarn(`inline script metadata: cannot parse version for clause ${JSON.stringify(clause)}`);
        return false;
    }

    const cmp = compareReleases(verSegs, specSegs);
    switch (op) {
        case '==':
            return cmp === 0;
        case '!=':
            return cmp !== 0;
        case '>=':
            return cmp >= 0;
        case '<=':
            return cmp <= 0;
        case '>':
            return cmp > 0;
        case '<':
            return cmp < 0;
        case '~=': {
            // Compatible release. `~=X.Y` is equivalent to
            // `>= X.Y, == X.*`; `~=X.Y.Z` is `>= X.Y.Z, == X.Y.*`.
            // PEP 440 requires at least two release segments here.
            if (specSegs.length < 2) {
                traceWarn(
                    `inline script metadata: '~=' requires at least two release segments: ${JSON.stringify(clause)}`,
                );
                return false;
            }
            if (cmp < 0) {
                return false;
            }
            return hasReleasePrefix(verSegs, specSegs.slice(0, -1));
        }
        default:
            // Unreachable — SPECIFIER_RE only matches the operators above.
            return false;
    }
}

/**
 * Extract the numeric release segments of a version string.
 * A leading `v` and an epoch prefix (`N!`) are discarded, as is
 * everything after the leading dotted-integer run.
 *
 * @returns The segments (e.g. `[3, 12, 4]`), or `undefined` when the
 *   string does not start with an integer after the optional prefixes.
 */
function parseRelease(v: string): number[] | undefined {
    let s = v.trim().replace(/^v/i, '');
    // Strip optional epoch prefix `N!`.
    const epoch = s.match(/^(\d+)!(.*)$/);
    if (epoch) {
        s = epoch[2];
    }
    // Take only the leading dotted-integer segments; PEP 440 release
    // segments must be integers. Pre/post/dev/local suffixes are
    // dropped, which is sufficient for `requires-python` matching.
    const m = s.match(/^(\d+(?:\.\d+)*)/);
    if (!m) {
        return undefined;
    }
    return m[1].split('.').map((x) => parseInt(x, 10));
}

/**
 * Three-way comparison of two release-segment lists. The shorter list
 * is treated as zero-padded, so `[3, 11]` equals `[3, 11, 0]`.
 *
 * @returns -1 when `a < b`, 1 when `a > b`, 0 when equal.
 */
function compareReleases(a: readonly number[], b: readonly number[]): number {
    const n = Math.max(a.length, b.length);
    for (let i = 0; i < n; i++) {
        const av = a[i] ?? 0;
        const bv = b[i] ?? 0;
        if (av < bv) {
            return -1;
        }
        if (av > bv) {
            return 1;
        }
    }
    return 0;
}

// diff --git a/src/common/telemetry/constants.ts b/src/common/telemetry/constants.ts
// index 858529e7..55a012aa 100644
// --- a/src/common/telemetry/constants.ts
// +++ b/src/common/telemetry/constants.ts
// @@ -209,6 +209,27 @@ export enum EventNames {
//       * - errorType: string (only when outcome === 'failed')
//       */
//      MIGRATION_SYSTEM_ENV_MANAGER = 'MIGRATION.SYSTEM_ENV_MANAGER',
// +    /**
// +     * Telemetry event fired once per session, per URI, the first time a `.py`
// +     * file with a valid PEP 723 `# /// script` block is observed by the lazy
// +     * detector. Used to size the population of users who actually see PEP 723
// +     * files — the denominator for the "view vs edit" question.
// +     * Properties:
// +     * - trigger: 'open' | 'save' (which workspace event surfaced the file)
// +     * - hasRequiresPython: boolean (whether the block declares `requires-python`)
// +     * Measures:
// +     * - dependencyCount: number (number of entries in the `dependencies` list)
// +     */
// +    PEP723_DETECTED = 'PEP723.DETECTED',
// +    /**
// +     * Telemetry event fired once per session, per URI, the first time a `.py`
// +     * file that previously raised a `PEP723.DETECTED` event receives a real
// +     * text edit. Together with `PEP723.DETECTED` this measures the fraction
// +     * of users who do more than view PEP 723 scripts.
// +     * Measures:
// +     * - duration: number (ms between the detection and the first edit)
// +     */
// +    PEP723_EDITED = 'PEP723.EDITED',
//  }
//
//  // Map all events to their properties
// @@ -657,4 +678,25 @@ export interface IEventNamePropertyMapping {
//          outcome: 'removed' | 'partial' | 'not_set' | 'failed';
//          errorType?: string;
//      };
// +
// +    /* __GDPR__
// +       "pep723.detected": {
// +          "trigger": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "owner": "StellaHuang95" },
// +          "hasRequiresPython": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "owner": "StellaHuang95" },
// +          "dependencyCount": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "isMeasurement": true, "owner": "StellaHuang95" }
// +       }
// +    */
// +    [EventNames.PEP723_DETECTED]: {
// +        trigger: 'open' | 'save';
// +        hasRequiresPython: boolean;
// +        // Goes through the measures payload (numeric); listed here for GDPR only.
// +        dependencyCount?: number;
// +    };
// +
// +    /* __GDPR__
// +       "pep723.edited": {
// +          "": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "isMeasurement": true, "owner": "StellaHuang95" }
// +       }
// +    */
// NOTE(review): the measure key in the "pep723.edited" GDPR block above is
// blank here; presumably it names the `duration` measure — confirm against
// the original patch.
// +    [EventNames.PEP723_EDITED]: never | undefined;
//  }
// diff --git a/src/common/workspace.apis.ts b/src/common/workspace.apis.ts
// index c009cd6d..d571bcf2 100644
// --- a/src/common/workspace.apis.ts
// +++ b/src/common/workspace.apis.ts
// @@ -8,6 +8,8 @@ import {
//      FileRenameEvent,
//      FileSystemWatcher,
//      GlobPattern,
// +    TextDocument,
// +    TextDocumentChangeEvent,
//      Uri,
//      workspace,
//      WorkspaceConfiguration,
// @@ -72,3 +74,36 @@ export function onDidRenameFiles(
//  ): Disposable {
//      return workspace.onDidRenameFiles(listener, thisArgs, disposables);
//  }

/**
 * Thin wrapper over `workspace.onDidOpenTextDocument`; forwards all
 * arguments unchanged and returns the subscription's `Disposable`.
 */
export function onDidOpenTextDocument(
    listener: (e: TextDocument) => any,
    thisArgs?: any,
    disposables?: Disposable[],
): Disposable {
    return workspace.onDidOpenTextDocument(listener, thisArgs, disposables);
}

/**
 * Thin wrapper over `workspace.onDidSaveTextDocument`; forwards all
 * arguments unchanged and returns the subscription's `Disposable`.
 */
export function onDidSaveTextDocument(
    listener: (e: TextDocument) => any,
    thisArgs?: any,
    disposables?: Disposable[],
): Disposable {
    return workspace.onDidSaveTextDocument(listener, thisArgs, disposables);
}

/**
 * Thin wrapper over `workspace.onDidChangeTextDocument`; forwards all
 * arguments unchanged and returns the subscription's `Disposable`.
 */
export function onDidChangeTextDocument(
    listener: (e: TextDocumentChangeEvent) => any,
    thisArgs?: any,
    disposables?: Disposable[],
): Disposable {
    return workspace.onDidChangeTextDocument(listener, thisArgs, disposables);
}

/**
 * Snapshot of the text documents VS Code has already opened. Useful
 * for extensions activated by `onLanguage:*` events, which miss the
 * `onDidOpenTextDocument` fired for the activating document.
 */
export function getOpenTextDocuments(): readonly TextDocument[] {
    return workspace.textDocuments;
}

// diff --git a/src/extension.ts b/src/extension.ts
// index 3a28d78f..6f609efe 100644
// --- a/src/extension.ts
// +++ b/src/extension.ts
// @@ -64,6 +64,7 @@ import {
//  } from './features/envCommands';
//  import { PythonEnvironmentManagers } from './features/envManagers';
//  import { EnvVarManager, PythonEnvVariableManager } from './features/execution/envVariableManager';
// +import { InlineScriptLazyDetector } from './features/inlineScriptLazyDetector';
//  import {
//      applyInitialEnvironmentSelection,
//      registerInterpreterSettingsChangeListener,
// @@ -205,6 +206,13 @@ export async function activate(context: ExtensionContext): Promise();
// NOTE(review): the body of the second extension.ts hunk is not present in
// this text (it appears to have been lost in extraction); recover it from
// the original patch.
// diff --git a/src/features/inlineScriptLazyDetector.ts b/src/features/inlineScriptLazyDetector.ts
// new file mode 100644
// index 00000000..f84dc499
// --- /dev/null
// +++ b/src/features/inlineScriptLazyDetector.ts
// @@ -0,0 +1,246 @@

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
import * as path from 'path';
import { Disposable, TextDocument, TextDocumentChangeEvent, Uri } from 'vscode';
import { readInlineScriptMetadataFromFile } from '../common/inlineScriptMetadata';
import { traceVerbose, traceWarn } from '../common/logging';
import { EventNames } from '../common/telemetry/constants';
import { sendTelemetryEvent } from '../common/telemetry/sender';
import {
    getOpenTextDocuments,
    getWorkspaceFolder,
    onDidChangeTextDocument,
    onDidOpenTextDocument,
    onDidSaveTextDocument,
} from '../common/workspace.apis';

/**
 * Silent on-open / on-save detector for `.py` files that declare
 * inline script metadata (PEP 723). The detector parses the head of
 * every eligible `.py` file the user opens or saves and emits two
 * anonymized telemetry events:
 *
 *   - `PEP723.DETECTED` once per (URI, session) the first time a
 *     valid `# /// script` block is observed. This is the denominator
 *     for the "how many users actually see PEP 723 files" question.
 *   - `PEP723.EDITED` once per (URI, session) the first time a
 *     previously-detected file receives a real text edit. Together
 *     with `DETECTED` this distinguishes viewers from editors.
 *
 * No URIs, file paths, or file content are sent. The detector does
 * not register projects, surface UI, or otherwise change extension
 * behavior; it is a pure observer.
 *
 * Detection is cheap (≤ 8 KiB read + regex + TOML parse) and runs
 * only on files the user has already shown intent in.
 */
export class InlineScriptLazyDetector implements Disposable {
    private readonly subscriptions: Disposable[] = [];
    // In-flight reads keyed by `uri.toString()` so rapid open+save
    // doesn't double-process the same file.
    private readonly inFlight = new Map<string, Promise<void>>();
    // URIs (as `uri.toString()`) for which we have already emitted
    // `PEP723.DETECTED` in this session. Used to dedup the detection
    // event across repeat opens/saves and to gate `PEP723.EDITED` so
    // the latter only fires for files we already counted as detected.
    private readonly detectedUris = new Set<string>();
    // URIs for which we have already emitted `PEP723.EDITED` in this
    // session. Each detected file emits at most one edited event.
    private readonly editedUris = new Set<string>();
    // Wall-clock ms (from `Date.now`) at which each URI's detection
    // event fired. Used to compute the `duration` measure on the
    // first-edit event.
    private readonly detectionAtMs = new Map<string, number>();
    // Flips to `true` in `dispose()`. Guards async continuations
    // inside `processOnce` so an in-flight read that completes after
    // disposal does not emit telemetry on a detector the host has
    // already torn down.
    private disposed = false;

    /**
     * Subscribe to workspace text-document events. Safe to call once
     * during extension activation.
     *
     * Listeners return the promise from `handleDocument` rather than
     * void-ing it. VS Code's event bus does not await listener
     * promises (so production behaviour is unchanged — still
     * fire-and-forget), but returning the promise lets tests await
     * the work triggered by a synthetic open/save event.
     *
     * After subscribing we also replay every document already open at
     * activation time. Our `onLanguage:python` activation event fires
     * AFTER VS Code has already opened any restored editors, so the
     * `onDidOpenTextDocument` for the file that triggered activation
     * (the most common case) is gone by the time we subscribe. The
     * replay is deferred via `setImmediate` so VS Code finishes any
     * in-flight document registration first; the per-URI dedup in
     * `handleDocument` keeps it idempotent if a live event happens to
     * arrive too.
     */
    public activate(): void {
        this.subscriptions.push(
            onDidOpenTextDocument((doc) => this.handleDocument(doc, 'open')),
            onDidSaveTextDocument((doc) => this.handleDocument(doc, 'save')),
            onDidChangeTextDocument((e) => this.handleChange(e)),
        );
        // Defer the catch-up pass so we observe `workspace.textDocuments`
        // AFTER VS Code finishes registering the document that triggered
        // our activation. Running the loop synchronously here can race
        // against VS Code's own initialization on `onLanguage:*` activation.
        const handle = setImmediate(() => this.replayOpenDocuments('activate'));
        // Cancel the pending replay if we are disposed before it runs.
        this.subscriptions.push(new Disposable(() => clearImmediate(handle)));
    }

    /**
     * Walk every currently-open text document and run it through
     * `handleDocument` as if a synthetic `open` event had fired. Used
     * for the deferred activation catch-up. The per-URI dedup in
     * `handleDocument` keeps this safe to call repeatedly.
     *
     * @param source Label used only in the trace output.
     */
    private replayOpenDocuments(source: 'activate'): void {
        // Restrict the replay to documents that the per-event handler
        // would actually look at. This keeps the activation log
        // proportional to the work the detector will do — on an
        // editor with many tabs open we would otherwise dump every
        // URI just to throw most of them away inside
        // `handleDocument`.
        const openDocs = getOpenTextDocuments().filter((d) => shouldHandleUri(d.uri));
        if (openDocs.length === 0) {
            traceVerbose(`inlineScriptLazyDetector: ${source} replay found no candidate .py documents`);
            return;
        }
        traceVerbose(
            `inlineScriptLazyDetector: ${source} replay over ${openDocs.length} candidate .py document(s): ` +
                openDocs.map((d) => d.uri.fsPath).join(', '),
        );
        for (const doc of openDocs) {
            void this.handleDocument(doc, 'open');
        }
    }

    /**
     * Tear down all subscriptions (including the pending replay), mark
     * the detector disposed so late async continuations stay silent, and
     * release the per-session tracking state.
     */
    public dispose(): void {
        this.disposed = true;
        this.subscriptions.forEach((s) => s.dispose());
        this.subscriptions.length = 0;
        this.inFlight.clear();
        this.detectedUris.clear();
        this.editedUris.clear();
        this.detectionAtMs.clear();
    }

    /**
     * Entry point for open/save events (and the activation replay).
     * Filters ineligible URIs, skips files already counted this session,
     * and coalesces concurrent events for the same URI onto one read.
     */
    private async handleDocument(doc: TextDocument, trigger: 'open' | 'save'): Promise<void> {
        if (this.disposed) {
            return;
        }
        const uri = doc.uri;
        // Diagnostic: trace every event entering the detector. This
        // is high-frequency (fires on every keystroke-triggered save
        // and on every editor open) so it stays at `traceVerbose` —
        // the `Trace` log level — to avoid flooding the default
        // `Info` channel.
        traceVerbose(`inlineScriptLazyDetector: event received (${trigger}) ${uri.toString()}`);
        if (!shouldHandleUri(uri)) {
            traceVerbose(
                `inlineScriptLazyDetector: skipped (${trigger}) ${uri.toString()} ` +
                    `(scheme='${uri.scheme}', extname='${path.extname(uri.fsPath).toLowerCase()}', ` +
                    `inWorkspace=${getWorkspaceFolder(uri) !== undefined})`,
            );
            return;
        }
        const key = uri.toString();
        if (this.detectedUris.has(key)) {
            // Already counted this file in the current session; a new
            // read could not emit anything, so skip the disk I/O and
            // the parse entirely.
            return;
        }
        const existing = this.inFlight.get(key);
        if (existing) {
            // Coalesce repeated open/save events for the same URI.
            // We only parse for observation (telemetry), so the most
            // recent in-flight read is good enough; there is no
            // cached state downstream that could go stale.
            await existing;
            return;
        }
        const work = this.processOnce(uri, trigger).finally(() => {
            this.inFlight.delete(key);
        });
        this.inFlight.set(key, work);
        await work;
    }

    /**
     * Read and parse one file, and emit `PEP723.DETECTED` the first time
     * a given URI yields valid metadata. Never rejects: unexpected
     * errors are logged via `traceWarn`.
     */
    private async processOnce(uri: Uri, trigger: 'open' | 'save'): Promise<void> {
        try {
            const metadata = await readInlineScriptMetadataFromFile(uri);
            if (this.disposed) {
                return;
            }
            if (metadata === undefined) {
                return;
            }
            const key = uri.toString();
            if (this.detectedUris.has(key)) {
                // Already counted this file in the current session.
                // Subsequent opens/saves of the same URI are silent.
                return;
            }
            this.detectedUris.add(key);
            this.detectionAtMs.set(key, Date.now());
            traceVerbose(`inlineScriptLazyDetector: detected inline script metadata in ${uri.fsPath} (${trigger})`);
            sendTelemetryEvent(
                EventNames.PEP723_DETECTED,
                { dependencyCount: metadata.dependencies?.length ?? 0 },
                {
                    trigger,
                    hasRequiresPython: metadata.requiresPython !== undefined,
                },
            );
        } catch (err) {
            // `readInlineScriptMetadataFromFile` already swallows I/O
            // errors internally. This catch is a defensive net for
            // unexpected synchronous throws (e.g. malformed URI).
            traceWarn(`inlineScriptLazyDetector: unexpected error while reading ${uri.fsPath}:`, err);
        }
    }

    /**
     * Emit `PEP723.EDITED` the first time a previously-detected URI
     * receives a real content change. The handler is hot (fires on
     * every keystroke in every text document workspace-wide) so it
     * bails out as cheaply as possible for the common case where the
     * file is not a tracked PEP 723 script.
     */
    private handleChange(e: TextDocumentChangeEvent): void {
        if (this.disposed) {
            return;
        }
        // `onDidChangeTextDocument` can fire with empty `contentChanges`
        // (e.g. dirty-state toggles); skip those — they aren't user edits.
        if (e.contentChanges.length === 0) {
            return;
        }
        const key = e.document.uri.toString();
        if (!this.detectedUris.has(key)) {
            return;
        }
        if (this.editedUris.has(key)) {
            return;
        }
        this.editedUris.add(key);
        const detectedAt = this.detectionAtMs.get(key);
        // Falls back to 0 if the detection timestamp is missing.
        const duration = detectedAt !== undefined ? Date.now() - detectedAt : 0;
        traceVerbose(
            `inlineScriptLazyDetector: first edit observed on ${e.document.uri.fsPath} (${duration}ms after detection)`,
        );
        sendTelemetryEvent(EventNames.PEP723_EDITED, duration);
    }
}

/**
 * Cheap, side-effect-free gate for which URIs the lazy detector
 * should look at. Filters out non-file schemes, non-`.py`
 * extensions (compared case-insensitively), and files that are not
 * inside an open workspace folder. Exported for test access.
 */
export function shouldHandleUri(uri: Uri): boolean {
    if (uri.scheme !== 'file') {
        return false;
    }
    if (path.extname(uri.fsPath).toLowerCase() !== '.py') {
        return false;
    }
    if (getWorkspaceFolder(uri) === undefined) {
        return false;
    }
    return true;
}

// diff --git a/src/test/common/inlineScriptMetadata.unit.test.ts b/src/test/common/inlineScriptMetadata.unit.test.ts
// new file mode 100644
// index 00000000..cc8971f8
// --- /dev/null
// +++ b/src/test/common/inlineScriptMetadata.unit.test.ts
// @@ -0,0 +1,497 @@
// +// Copyright (c) Microsoft Corporation. All rights reserved.
// +// Licensed under the MIT License.
// +
// +import assert from 'assert';
// +import * as fs from 'fs-extra';
// +import * as os from 'os';
// +import path from 'path';
// +import * as sinon from 'sinon';
// +import { Uri } from 'vscode';
// +import {
// +    InlineScriptMetadata,
// +    MAX_HEADER_BYTES,
// +    matchesPythonVersion,
// +    readInlineScriptMetadata,
// +    readInlineScriptMetadataFromFile,
// +} from '../../common/inlineScriptMetadata';
// +import * as logging from '../../common/logging';
// +
// +// Helper to assemble a script body. Lines are joined with '\n' so each
// +// test controls line endings explicitly via `joiner` when needed.
+function script(lines: string[], joiner = '\n'): string { + return lines.join(joiner); +} + +suite('inlineScriptMetadata', () => { + let traceWarnStub: sinon.SinonStub; + let traceVerboseStub: sinon.SinonStub; + + setup(() => { + traceWarnStub = sinon.stub(logging, 'traceWarn'); + traceVerboseStub = sinon.stub(logging, 'traceVerbose'); + }); + + teardown(() => { + sinon.restore(); + }); + + suite('readInlineScriptMetadata', () => { + test('empty input returns undefined', () => { + assert.strictEqual(readInlineScriptMetadata(''), undefined); + }); + + test('file without any block returns undefined', () => { + const text = script(['#!/usr/bin/env python3', 'import sys', 'print("hello")']); + assert.strictEqual(readInlineScriptMetadata(text), undefined); + }); + + test('minimal valid block (single blank content line) parses with empty metadata', () => { + // The canonical PEP 723 regex requires AT LEAST ONE content + // line between the open and close markers (the `+` quantifier + // in `(^#(| .*)$\s)+`). A bare `#` line counts as a blank + // content line and is the minimal accepted form. + const text = script(['# /// script', '#', '# ///', '', 'print("hi")']); + const md = readInlineScriptMetadata(text); + assert.ok(md, 'expected metadata to be defined'); + assert.strictEqual(md.requiresPython, undefined); + assert.strictEqual(md.dependencies, undefined); + assert.strictEqual(md.tool, undefined); + assert.strictEqual(md.range.start, 0); + const expectedBlock = '# /// script\n#\n# ///\n'; + assert.strictEqual(md.range.end, expectedBlock.length); + }); + + test('valid block with requires-python and dependencies', () => { + const text = script([ + '# /// script', + '# requires-python = ">=3.11"', + '# dependencies = [', + '# "requests<3",', + '# "rich",', + '# ]', + '# ///', + 'import requests', + ]); + const md = readInlineScriptMetadata(text); + assert.ok(md); + assert.strictEqual(md.requiresPython, '>=3.11'); + assert.deepStrictEqual([...(md.dependencies ?? 
[])], ['requests<3', 'rich']); + assert.strictEqual(md.tool, undefined); + }); + + test('valid block with [tool] table is opaque', () => { + const text = script([ + '# /// script', + '# dependencies = ["x"]', + '# [tool.mybuild]', + '# extra = "thing"', + '# ///', + ]); + const md = readInlineScriptMetadata(text); + assert.ok(md); + assert.ok(md.tool, 'tool table should be populated'); + assert.deepStrictEqual(md.tool, { mybuild: { extra: 'thing' } }); + }); + + test('multiple `script` blocks returns undefined and logs a warning', () => { + const text = script([ + '# /// script', + '# dependencies = ["a"]', + '# ///', + '', + '# /// script', + '# dependencies = ["b"]', + '# ///', + ]); + assert.strictEqual(readInlineScriptMetadata(text), undefined); + assert.ok(traceWarnStub.called, 'expected a traceWarn for multiple script blocks'); + }); + + test('unclosed block returns undefined', () => { + const text = script(['# /// script', '# dependencies = ["a"]', 'import x']); + assert.strictEqual(readInlineScriptMetadata(text), undefined); + }); + + test('content lines may be bare `#` (blank in metadata)', () => { + const text = script([ + '# /// script', + '# requires-python = ">=3.10"', + '#', + '# dependencies = ["a"]', + '# ///', + ]); + const md = readInlineScriptMetadata(text); + assert.ok(md); + assert.strictEqual(md.requiresPython, '>=3.10'); + assert.deepStrictEqual([...(md.dependencies ?? [])], ['a']); + }); + + test('CRLF line endings parse identically to LF', () => { + const text = script(['# /// script', '# dependencies = ["a"]', '# ///', ''], '\r\n'); + const md = readInlineScriptMetadata(text); + assert.ok(md); + assert.deepStrictEqual([...(md.dependencies ?? [])], ['a']); + }); + + test('lone-CR line endings parse identically to LF', () => { + const text = script(['# /// script', '# dependencies = ["a"]', '# ///', ''], '\r'); + const md = readInlineScriptMetadata(text); + assert.ok(md); + assert.deepStrictEqual([...(md.dependencies ?? 
[])], ['a']); + }); + + test('content with `##` line is rejected', () => { + const text = script(['# /// script', '## not a valid content line', '# ///']); + assert.strictEqual(readInlineScriptMetadata(text), undefined); + }); + + test('content with `#\\t` line is rejected', () => { + const text = script(['# /// script', '#\tnot a valid content line', '# ///']); + assert.strictEqual(readInlineScriptMetadata(text), undefined); + }); + + test('leading UTF-8 BOM is stripped before matching', () => { + const text = '\uFEFF' + script(['# /// script', '# dependencies = ["a"]', '# ///']); + const md = readInlineScriptMetadata(text); + assert.ok(md); + assert.deepStrictEqual([...(md.dependencies ?? [])], ['a']); + }); + + test('shebang before block does not block detection', () => { + const text = script(['#!/usr/bin/env python3', '# /// script', '# dependencies = ["a"]', '# ///']); + const md = readInlineScriptMetadata(text); + assert.ok(md); + assert.deepStrictEqual([...(md.dependencies ?? [])], ['a']); + }); + + test('encoding declaration before block does not block detection', () => { + const text = script(['# -*- coding: utf-8 -*-', '# /// script', '# dependencies = ["a"]', '# ///']); + const md = readInlineScriptMetadata(text); + assert.ok(md); + assert.deepStrictEqual([...(md.dependencies ?? [])], ['a']); + }); + + test('shebang AND encoding declaration before block', () => { + const text = script([ + '#!/usr/bin/env python3', + '# -*- coding: utf-8 -*-', + '# /// script', + '# dependencies = ["a"]', + '# ///', + ]); + const md = readInlineScriptMetadata(text); + assert.ok(md); + assert.deepStrictEqual([...(md.dependencies ?? 
[])], ['a']); + }); + + test('closing `# ///` with trailing whitespace is rejected', () => { + const text = script(['# /// script', '# dependencies = ["a"]', '# /// ']); + assert.strictEqual(readInlineScriptMetadata(text), undefined); + }); + + test('opening `# /// script ` with trailing whitespace is rejected', () => { + const text = script(['# /// script ', '# dependencies = ["a"]', '# ///']); + assert.strictEqual(readInlineScriptMetadata(text), undefined); + }); + + test('malformed TOML returns undefined and logs warn', () => { + const text = script(['# /// script', '# this is = not valid = toml', '# ///']); + assert.strictEqual(readInlineScriptMetadata(text), undefined); + assert.ok(traceWarnStub.called, 'expected a traceWarn for malformed TOML'); + }); + + test('dependencies is not a list returns undefined', () => { + const text = script(['# /// script', '# dependencies = "not a list"', '# ///']); + assert.strictEqual(readInlineScriptMetadata(text), undefined); + }); + + test('dependencies contains non-string returns undefined', () => { + // @iarna/toml will accept a mixed array; we validate downstream. + const text = script(['# /// script', '# dependencies = ["ok", 42]', '# ///']); + assert.strictEqual(readInlineScriptMetadata(text), undefined); + }); + + test('dependencies array with an empty string is passed through', () => { + const text = script(['# /// script', '# dependencies = [""]', '# ///']); + const md = readInlineScriptMetadata(text); + assert.ok(md); + assert.deepStrictEqual([...(md.dependencies ?? 
[])], ['']); + }); + + test('requires-python that is not a string returns undefined', () => { + const text = script(['# /// script', '# requires-python = 311', '# ///']); + assert.strictEqual(readInlineScriptMetadata(text), undefined); + }); + + test('non-`script` block TYPE is ignored', () => { + const text = script(['# /// pyproject', '# foo = "bar"', '# ///']); + assert.strictEqual(readInlineScriptMetadata(text), undefined); + }); + + test('non-`script` block followed by valid script block', () => { + const text = script([ + '# /// pyproject', + '# foo = "bar"', + '# ///', + '', + '# /// script', + '# dependencies = ["a"]', + '# ///', + ]); + const md = readInlineScriptMetadata(text); + assert.ok(md); + assert.deepStrictEqual([...(md.dependencies ?? [])], ['a']); + }); + + test('valid script block followed by non-`script` block', () => { + // Reverse order: a valid `# /// script` block immediately + // before another block whose TYPE is not `script`. The + // parser must still recognize the script block — PEP 723 + // does not require it to be the first or last block in + // the file. + const text = script([ + '# /// script', + '# dependencies = ["b"]', + '# ///', + '', + '# /// pyproject', + '# foo = "bar"', + '# ///', + ]); + const md = readInlineScriptMetadata(text); + assert.ok(md, 'a script block adjacent to a pyproject block must be recognized'); + assert.deepStrictEqual([...(md.dependencies ?? [])], ['b']); + }); + + test('range refers to the normalized text', () => { + // Canonical regex requires at least one content line; use a + // bare `#` so the block is the minimal accepted form. + const text = script(['# /// script', '#', '# ///', 'rest']); + const md = readInlineScriptMetadata(text); + assert.ok(md); + assert.strictEqual(md.range.start, 0); + // start of `# /// script` line through and including the + // closing `# ///`'s terminating newline. 
+ assert.strictEqual(text.slice(md.range.start, md.range.end), '# /// script\n#\n# ///\n'); + }); + + test('dependencies array is frozen (defensive copy)', () => { + const text = script(['# /// script', '# dependencies = ["a"]', '# ///']); + const md = readInlineScriptMetadata(text); + assert.ok(md?.dependencies); + assert.throws(() => { + (md.dependencies as string[]).push('b'); + }); + }); + + test('block beyond MAX_HEADER_BYTES boundary is invisible to file-reader (known limitation)', () => { + // The string parser sees the whole text, so it WILL find a + // block past the byte boundary. This test documents that + // the boundary is only enforced by the file reader. + const padding = 'a'.repeat(MAX_HEADER_BYTES + 100); + const text = padding + '\n' + script(['# /// script', '# dependencies = ["a"]', '# ///']); + const md = readInlineScriptMetadata(text); + assert.ok(md, 'parser ignores byte caps; only the file reader enforces them'); + assert.deepStrictEqual([...(md.dependencies ?? [])], ['a']); + }); + + // The plan calls out a "nested-looking line" test case (e.g. a + // `# ///` that appears inside what is morally a TOML multi-line + // string). The canonical regex is greedy with backtracking, so + // the LAST `# ///` line wins as the closing marker. We assert + // that the well-formed text round-trips. + test('content containing a `# ///` mid-block: last `# ///` wins as closer', () => { + // The "mid-block" `# ///` is a real `# ///` line inside the + // block. Per the regex's greedy backtracking, the parser + // treats it as content and the trailing `# ///` as the + // close. + const text = script(['# /// script', '# dependencies = ["a"]', '# # /// inside as comment', '# ///']); + const md = readInlineScriptMetadata(text); + assert.ok(md); + assert.deepStrictEqual([...(md.dependencies ?? [])], ['a']); + }); + + // ensure traceVerboseStub silences "no block" log noise — also + // a sanity check that the verbose log fires on the negative path. 
+ test('no-block path logs only at verbose level', () => { + readInlineScriptMetadata('print("hi")'); + assert.strictEqual(traceWarnStub.called, false); + assert.ok(traceVerboseStub.called); + }); + }); + + suite('readInlineScriptMetadataFromFile', () => { + let tmpDir: string; + + setup(async () => { + tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), 'ism-test-')); + }); + + teardown(async () => { + await fs.remove(tmpDir); + }); + + test('returns metadata for a real on-disk .py file', async () => { + const filePath = path.join(tmpDir, 'script.py'); + await fs.writeFile( + filePath, + script(['# /// script', '# requires-python = ">=3.11"', '# dependencies = ["a"]', '# ///', 'print(1)']), + ); + const md = await readInlineScriptMetadataFromFile(Uri.file(filePath)); + assert.ok(md); + assert.strictEqual(md.requiresPython, '>=3.11'); + }); + + test('returns undefined for a file without a block', async () => { + const filePath = path.join(tmpDir, 'plain.py'); + await fs.writeFile(filePath, 'print("hi")\n'); + const md = await readInlineScriptMetadataFromFile(Uri.file(filePath)); + assert.strictEqual(md, undefined); + }); + + test('returns undefined for a non-file URI scheme', async () => { + const md = await readInlineScriptMetadataFromFile(Uri.parse('untitled:foo.py')); + assert.strictEqual(md, undefined); + }); + + test('returns undefined when the file does not exist', async () => { + const md = await readInlineScriptMetadataFromFile(Uri.file(path.join(tmpDir, 'does-not-exist.py'))); + assert.strictEqual(md, undefined); + }); + + test('block past MAX_HEADER_BYTES boundary is NOT found (cap is enforced)', async () => { + const filePath = path.join(tmpDir, 'big.py'); + // Pad with a comment that fills more than MAX_HEADER_BYTES, + // then put a valid block AFTER the cap. The reader should + // see only the padding and return undefined. 
+ const padding = '# ' + 'a'.repeat(MAX_HEADER_BYTES); + const body = script([padding, '# /// script', '# dependencies = ["a"]', '# ///']); + await fs.writeFile(filePath, body); + const md = await readInlineScriptMetadataFromFile(Uri.file(filePath)); + assert.strictEqual(md, undefined); + }); + + test('block at top of a >MAX_HEADER_BYTES file is still found', async () => { + const filePath = path.join(tmpDir, 'big-top.py'); + const trailing = 'x'.repeat(MAX_HEADER_BYTES * 2); + const body = script(['# /// script', '# dependencies = ["a"]', '# ///', trailing]); + await fs.writeFile(filePath, body); + const md = await readInlineScriptMetadataFromFile(Uri.file(filePath)); + assert.ok(md); + assert.deepStrictEqual([...(md.dependencies ?? [])], ['a']); + }); + + test('reader returns a parsed block for a minimal on-disk file', async () => { + // The negative case above ("block past MAX_HEADER_BYTES + // boundary is NOT found") proves the byte cap is enforced. + // This test just sanity-checks the happy path on a minimal + // file: the reader returns the parsed block. + const filePath = path.join(tmpDir, 'spy.py'); + await fs.writeFile(filePath, script(['# /// script', '# dependencies = ["a"]', '# ///'])); + const md = await readInlineScriptMetadataFromFile(Uri.file(filePath)); + assert.ok(md); + assert.deepStrictEqual([...(md.dependencies ?? 
[])], ['a']); + }); + }); + + suite('matchesPythonVersion', () => { + test('>=3.11 vs 3.10/3.11/3.12', () => { + assert.strictEqual(matchesPythonVersion('>=3.11', '3.10'), false); + assert.strictEqual(matchesPythonVersion('>=3.11', '3.11'), true); + assert.strictEqual(matchesPythonVersion('>=3.11', '3.11.4'), true); + assert.strictEqual(matchesPythonVersion('>=3.11', '3.12'), true); + }); + + test('==3.12.* wildcard prefix match', () => { + assert.strictEqual(matchesPythonVersion('==3.12.*', '3.12'), true); + assert.strictEqual(matchesPythonVersion('==3.12.*', '3.12.4'), true); + assert.strictEqual(matchesPythonVersion('==3.12.*', '3.11.4'), false); + assert.strictEqual(matchesPythonVersion('==3.12.*', '3.13.0'), false); + }); + + test('!=3.12.* wildcard prefix anti-match', () => { + assert.strictEqual(matchesPythonVersion('!=3.12.*', '3.12.4'), false); + assert.strictEqual(matchesPythonVersion('!=3.12.*', '3.11.4'), true); + assert.strictEqual(matchesPythonVersion('!=3.12.*', '3.13.0'), true); + }); + + test('multi-clause >=3.10,<3.13 is AND-ed', () => { + assert.strictEqual(matchesPythonVersion('>=3.10,<3.13', '3.9.0'), false); + assert.strictEqual(matchesPythonVersion('>=3.10,<3.13', '3.10'), true); + assert.strictEqual(matchesPythonVersion('>=3.10,<3.13', '3.12.4'), true); + assert.strictEqual(matchesPythonVersion('>=3.10,<3.13', '3.13'), false); + }); + + test('~=3.11 (compatible release at minor level)', () => { + assert.strictEqual(matchesPythonVersion('~=3.11', '3.10.0'), false); + assert.strictEqual(matchesPythonVersion('~=3.11', '3.11.0'), true); + assert.strictEqual(matchesPythonVersion('~=3.11', '3.12.4'), true); + assert.strictEqual(matchesPythonVersion('~=3.11', '4.0.0'), false); + }); + + test('~=3.11.2 (compatible release at patch level)', () => { + assert.strictEqual(matchesPythonVersion('~=3.11.2', '3.11.1'), false); + assert.strictEqual(matchesPythonVersion('~=3.11.2', '3.11.2'), true); + assert.strictEqual(matchesPythonVersion('~=3.11.2', 
'3.11.10'), true); + assert.strictEqual(matchesPythonVersion('~=3.11.2', '3.12.0'), false); + }); + + test('== exact match (PEP 440 release-segment equality with zero padding)', () => { + // Per PEP 440 §"Version matching": when comparing release + // segments of different lengths, the shorter is padded + // with zeros. So `==3.11` matches both `3.11` and `3.11.0`. + // Users who want strict-shape equality use `===`. + assert.strictEqual(matchesPythonVersion('==3.11', '3.11'), true); + assert.strictEqual(matchesPythonVersion('==3.11', '3.11.0'), true); + assert.strictEqual(matchesPythonVersion('==3.11.0', '3.11.0'), true); + assert.strictEqual(matchesPythonVersion('==3.11', '3.12'), false); + assert.strictEqual(matchesPythonVersion('==3.11', '3.11.1'), false); + }); + + test('!= inequality', () => { + assert.strictEqual(matchesPythonVersion('!=3.11', '3.11'), false); + assert.strictEqual(matchesPythonVersion('!=3.11', '3.12'), true); + }); + + test('===X (arbitrary equality) is string match', () => { + assert.strictEqual(matchesPythonVersion('===3.11.0', '3.11.0'), true); + assert.strictEqual(matchesPythonVersion('===3.11.0', '3.11'), false); + }); + + test('input version with pre/dev suffix is truncated to release', () => { + assert.strictEqual(matchesPythonVersion('>=3.11', '3.11.0rc1'), true); + assert.strictEqual(matchesPythonVersion('>=3.11', '3.10.0rc1'), false); + }); + + test('invalid specifier returns false and logs warn', () => { + assert.strictEqual(matchesPythonVersion('weird-thing', '3.11'), false); + assert.ok(traceWarnStub.called); + }); + + test('empty specifier or version returns false', () => { + assert.strictEqual(matchesPythonVersion('', '3.11'), false); + assert.strictEqual(matchesPythonVersion('>=3.11', ''), false); + }); + + test('whitespace around clauses is tolerated', () => { + assert.strictEqual(matchesPythonVersion(' >= 3.11 , < 3.13 ', '3.12.4'), true); + }); + + test('wildcard with non-equality operator is invalid', () => { + 
assert.strictEqual(matchesPythonVersion('>=3.12.*', '3.12.4'), false); + assert.ok(traceWarnStub.called); + }); + }); + + // Type-only assertion: make sure InlineScriptMetadata is exported + // and structurally compatible with downstream use. + test('InlineScriptMetadata interface is exported and structurally correct', () => { + const sample: InlineScriptMetadata = { + requiresPython: '>=3.11', + dependencies: ['x'], + tool: { mykey: 'v' }, + range: { start: 0, end: 10 }, + }; + assert.strictEqual(sample.requiresPython, '>=3.11'); + }); +}); diff --git a/src/test/features/inlineScriptLazyDetector.unit.test.ts b/src/test/features/inlineScriptLazyDetector.unit.test.ts new file mode 100644 index 00000000..44e83e21 --- /dev/null +++ b/src/test/features/inlineScriptLazyDetector.unit.test.ts @@ -0,0 +1,454 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import assert from 'assert'; +import * as path from 'path'; +import * as sinon from 'sinon'; +import { Disposable, TextDocument, TextDocumentChangeEvent, TextDocumentContentChangeEvent, Uri } from 'vscode'; +import * as ism from '../../common/inlineScriptMetadata'; +import { EventNames } from '../../common/telemetry/constants'; +import * as telemetrySender from '../../common/telemetry/sender'; +import * as wapi from '../../common/workspace.apis'; +import { InlineScriptLazyDetector, shouldHandleUri } from '../../features/inlineScriptLazyDetector'; + +// Build a minimal TextDocument stub. Only the `uri` field is read by +// the detector; the rest exists to satisfy the type. +function makeDoc(uri: Uri): TextDocument { + return { uri } as TextDocument; +} + +// A non-empty change event payload. The actual content of the +// changes is not inspected by the detector; only `contentChanges.length` +// matters. 
const NON_EMPTY_CHANGES: readonly TextDocumentContentChangeEvent[] = [
    { range: undefined as never, rangeOffset: 0, rangeLength: 0, text: 'x' },
];

/**
 * Build a minimal `TextDocumentChangeEvent` for `uri`.
 *
 * @param uri URI of the document the change is reported for.
 * @param changes Content changes carried by the event; defaults to a
 * single non-empty change. Pass `[]` to simulate a no-op change event.
 */
function makeChange(
    uri: Uri,
    changes: readonly TextDocumentContentChangeEvent[] = NON_EMPTY_CHANGES,
): TextDocumentChangeEvent {
    return {
        document: makeDoc(uri),
        contentChanges: changes,
        reason: undefined,
    } as TextDocumentChangeEvent;
}

/**
 * Canned metadata returned by the stubbed file reader in tests that
 * need a "script block found" outcome: two dependencies and a declared
 * `requires-python`.
 */
const VALID_METADATA: ism.InlineScriptMetadata = {
    requiresPython: '>=3.11',
    dependencies: ['requests', 'rich'],
    tool: undefined,
    range: { start: 0, end: 40 },
};

suite('InlineScriptLazyDetector', () => {
    let onDidOpenStub: sinon.SinonStub;
    let onDidSaveStub: sinon.SinonStub;
    let onDidChangeStub: sinon.SinonStub;
    let getOpenTextDocumentsStub: sinon.SinonStub;
    let getWorkspaceFolderStub: sinon.SinonStub;
    let readMetadataStub: sinon.SinonStub;
    let sendTelemetryStub: sinon.SinonStub;
    // Listeners captured from the stubbed workspace event APIs so the
    // tests can fire events by hand. Each is reset to `undefined` when
    // the corresponding Disposable is disposed.
    let openListener: ((doc: TextDocument) => unknown) | undefined;
    let saveListener: ((doc: TextDocument) => unknown) | undefined;
    let changeListener: ((e: TextDocumentChangeEvent) => unknown) | undefined;

    setup(() => {
        openListener = undefined;
        saveListener = undefined;
        changeListener = undefined;

        onDidOpenStub = sinon.stub(wapi, 'onDidOpenTextDocument');
        onDidOpenStub.callsFake((listener: (doc: TextDocument) => unknown) => {
            openListener = listener;
            return new Disposable(() => {
                openListener = undefined;
            });
        });

        onDidSaveStub = sinon.stub(wapi, 'onDidSaveTextDocument');
        onDidSaveStub.callsFake((listener: (doc: TextDocument) => unknown) => {
            saveListener = listener;
            return new Disposable(() => {
                saveListener = undefined;
            });
        });

        onDidChangeStub = sinon.stub(wapi, 'onDidChangeTextDocument');
        onDidChangeStub.callsFake((listener: (e: TextDocumentChangeEvent) => unknown) => {
            changeListener = listener;
            return new Disposable(() => {
                changeListener = undefined;
            });
        });

        // Default to an empty list of open documents. Tests that
        // exercise the catch-up replay override this.
        getOpenTextDocumentsStub = sinon.stub(wapi, 'getOpenTextDocuments');
        getOpenTextDocumentsStub.returns([]);

        getWorkspaceFolderStub = sinon.stub(wapi, 'getWorkspaceFolder');
        // By default, every URI is treated as being inside a workspace
        // folder. Tests that want to exercise the "not in workspace"
        // branch override this.
        getWorkspaceFolderStub.callsFake((uri: Uri) => ({
            uri: Uri.file(path.dirname(uri.fsPath)),
            name: 'mockWorkspace',
            index: 0,
        }));

        readMetadataStub = sinon.stub(ism, 'readInlineScriptMetadataFromFile');
        readMetadataStub.resolves(undefined);

        sendTelemetryStub = sinon.stub(telemetrySender, 'sendTelemetryEvent');
    });

    teardown(() => {
        sinon.restore();
    });

    // Fire a synthetic "document opened" event and wait for whatever
    // the captured listener returns.
    async function fireOpen(uri: Uri): Promise<void> {
        const listener = openListener;
        assert.ok(listener, 'open listener should be registered after activate()');
        await listener(makeDoc(uri));
    }

    // Fire a synthetic "document saved" event and wait for whatever
    // the captured listener returns.
    async function fireSave(uri: Uri): Promise<void> {
        const listener = saveListener;
        assert.ok(listener, 'save listener should be registered after activate()');
        await listener(makeDoc(uri));
    }

    // Fire a synthetic "document changed" event. The listener's return
    // value is deliberately not awaited.
    function fireChange(uri: Uri, changes: readonly TextDocumentContentChangeEvent[] = NON_EMPTY_CHANGES): void {
        const listener = changeListener;
        assert.ok(listener, 'change listener should be registered after activate()');
        listener(makeChange(uri, changes));
    }

    // Filter `sendTelemetryStub.getCalls()` to a single PEP 723 event name.
    function callsFor(name: EventNames): sinon.SinonSpyCall[] {
        return sendTelemetryStub.getCalls().filter((c) => c.args[0] === name);
    }

    test('activate() subscribes to onDidOpen, onDidSave, and onDidChange', () => {
        const detector = new InlineScriptLazyDetector();
        detector.activate();
        assert.ok(onDidOpenStub.calledOnce, 'should subscribe to onDidOpenTextDocument');
        assert.ok(onDidSaveStub.calledOnce, 'should subscribe to onDidSaveTextDocument');
        assert.ok(onDidChangeStub.calledOnce, 'should subscribe to onDidChangeTextDocument');
        detector.dispose();
    });

    test('skips non-file URI schemes', async () => {
        const detector = new InlineScriptLazyDetector();
        detector.activate();
        await fireOpen(Uri.parse('untitled:foo.py'));
        assert.ok(readMetadataStub.notCalled, 'should not read metadata for non-file URI');
        detector.dispose();
    });

    test('skips non-.py files', async () => {
        const detector = new InlineScriptLazyDetector();
        detector.activate();
        await fireOpen(Uri.file(path.resolve('/ws/foo.txt')));
        assert.ok(readMetadataStub.notCalled, 'should not read metadata for non-.py files');
        detector.dispose();
    });

    test('skips files outside any workspace folder', async () => {
        getWorkspaceFolderStub.returns(undefined);
        const detector = new InlineScriptLazyDetector();
        detector.activate();
        await fireOpen(Uri.file(path.resolve('/elsewhere/foo.py')));
        assert.ok(readMetadataStub.notCalled, 'should not read metadata for out-of-workspace files');
        detector.dispose();
    });

    test('reads metadata for an in-workspace .py file on open', async () => {
        const uri = Uri.file(path.resolve('/ws/foo.py'));
        readMetadataStub.resolves(VALID_METADATA);
        const detector = new InlineScriptLazyDetector();
        detector.activate();
        await fireOpen(uri);
        assert.strictEqual(readMetadataStub.callCount, 1, 'open should trigger exactly one read');
        assert.strictEqual((readMetadataStub.firstCall.args[0] as Uri).toString(), uri.toString());
        detector.dispose();
    });

    test('reads metadata for an in-workspace .py file on save', async () => {
        const uri = Uri.file(path.resolve('/ws/bar.py'));
        readMetadataStub.resolves(VALID_METADATA);
        const detector = new InlineScriptLazyDetector();
        detector.activate();
        await fireSave(uri);
        assert.strictEqual(readMetadataStub.callCount, 1, 'save should trigger exactly one read');
        detector.dispose();
    });

    test('concurrent open + open coalesces to a single read', async () => {
        const uri = Uri.file(path.resolve('/ws/dedup.py'));
        readMetadataStub.resolves(VALID_METADATA);
        const detector = new InlineScriptLazyDetector();
        detector.activate();
        await Promise.all([fireOpen(uri), fireOpen(uri)]);
        assert.strictEqual(readMetadataStub.callCount, 1, 'open+open should coalesce to a single read');
        detector.dispose();
    });

    test('concurrent open + save coalesces to a single read', async () => {
        const uri = Uri.file(path.resolve('/ws/race.py'));
        readMetadataStub.resolves(VALID_METADATA);
        const detector = new InlineScriptLazyDetector();
        detector.activate();
        await Promise.all([fireOpen(uri), fireSave(uri)]);
        // The slim observer has no cached state to keep fresh, so
        // simple URI-level dedup is sufficient: a save concurrent
        // with an in-flight open coalesces with it.
        assert.strictEqual(readMetadataStub.callCount, 1, 'concurrent open+save should coalesce to a single read');
        detector.dispose();
    });

    test('dispose() during an in-flight read bails out before emitting telemetry', async () => {
        const uri = Uri.file(path.resolve('/ws/disposed.py'));
        let resolveRead: ((meta: ism.InlineScriptMetadata) => void) | undefined;
        readMetadataStub.returns(
            new Promise<ism.InlineScriptMetadata>((resolve) => {
                resolveRead = resolve;
            }),
        );

        const detector = new InlineScriptLazyDetector();
        detector.activate();
        const listener = openListener;
        assert.ok(listener, 'open listener should be registered after activate()');
        // Kick off the open without awaiting it; the read is parked
        // on our manual resolver above.
        const inFlight = listener(makeDoc(uri)) as Promise<unknown> | undefined;
        // Tear the detector down BEFORE the read settles.
        detector.dispose();
        // Now let the in-flight read complete with metadata. The
        // `disposed` guard inside processOnce must prevent any
        // further work — including the detection telemetry event.
        resolveRead!(VALID_METADATA);
        await assert.doesNotReject(inFlight ?? Promise.resolve());
        assert.strictEqual(callsFor(EventNames.PEP723_DETECTED).length, 0, 'no detection event after dispose');
    });

    // ---------- catch-up replay over `getOpenTextDocuments` ----------

    // Resolve on the next `setImmediate` slot, so the deferred
    // catch-up replay (and any microtasks queued before it) can run
    // before assertions.
    function flushImmediate(): Promise<void> {
        return new Promise<void>((resolve) => setImmediate(resolve));
    }

    test('activate() replays already-open .py documents via setImmediate', async () => {
        const uriWithMeta = Uri.file(path.resolve('/ws/withMeta.py'));
        const uriPlain = Uri.file(path.resolve('/ws/plain.py'));
        const uriNonPy = Uri.file(path.resolve('/ws/skip.txt'));
        readMetadataStub.callsFake(async (u: Uri) =>
            u.toString() === uriWithMeta.toString() ? VALID_METADATA : undefined,
        );
        getOpenTextDocumentsStub.returns([makeDoc(uriWithMeta), makeDoc(uriPlain), makeDoc(uriNonPy)]);

        const detector = new InlineScriptLazyDetector();
        detector.activate();
        // Wait for the deferred catch-up.
        await flushImmediate();
        // Then await any in-flight reads kicked off by the replay.
        await flushImmediate();

        // The non-`.py` URI must be filtered out by `shouldHandleUri`
        // BEFORE the read is attempted.
        assert.strictEqual(readMetadataStub.callCount, 2, 'should read each candidate .py document exactly once');
        const readUris = readMetadataStub.getCalls().map((c) => (c.args[0] as Uri).toString());
        assert.ok(readUris.includes(uriWithMeta.toString()));
        assert.ok(readUris.includes(uriPlain.toString()));
        assert.ok(!readUris.includes(uriNonPy.toString()), 'should not read non-.py URI during replay');
        detector.dispose();
    });

    test('dispose() cancels the pending catch-up replay', async () => {
        getOpenTextDocumentsStub.returns([makeDoc(Uri.file(path.resolve('/ws/never.py')))]);
        const detector = new InlineScriptLazyDetector();
        detector.activate();
        // Tear down BEFORE the `setImmediate` slot fires.
        detector.dispose();
        await flushImmediate();
        assert.ok(readMetadataStub.notCalled, 'dispose() must clear the pending setImmediate handle');
    });

    // ---------- PEP723.DETECTED telemetry ----------

    test('PEP723.DETECTED fires once with trigger=open + dependencyCount + hasRequiresPython', async () => {
        const uri = Uri.file(path.resolve('/ws/detect.py'));
        readMetadataStub.resolves(VALID_METADATA);
        const detector = new InlineScriptLazyDetector();
        detector.activate();
        await fireOpen(uri);

        const detectedCalls = callsFor(EventNames.PEP723_DETECTED);
        assert.strictEqual(detectedCalls.length, 1, 'detection event should fire exactly once');
        const [, measures, properties] = detectedCalls[0].args;
        assert.deepStrictEqual(measures, { dependencyCount: 2 });
        assert.deepStrictEqual(properties, { trigger: 'open', hasRequiresPython: true });
        detector.dispose();
    });

    test('PEP723.DETECTED fires with trigger=save when surfaced by a save event', async () => {
        const uri = Uri.file(path.resolve('/ws/detectOnSave.py'));
        readMetadataStub.resolves(VALID_METADATA);
        const detector = new InlineScriptLazyDetector();
        detector.activate();
        await fireSave(uri);

        const detectedCalls = callsFor(EventNames.PEP723_DETECTED);
        assert.strictEqual(detectedCalls.length, 1);
        assert.strictEqual(detectedCalls[0].args[2].trigger, 'save');
        detector.dispose();
    });

    test('PEP723.DETECTED does not fire when the file has no metadata block', async () => {
        const uri = Uri.file(path.resolve('/ws/plain.py'));
        readMetadataStub.resolves(undefined);
        const detector = new InlineScriptLazyDetector();
        detector.activate();
        await fireOpen(uri);
        assert.strictEqual(callsFor(EventNames.PEP723_DETECTED).length, 0);
        detector.dispose();
    });

    test('PEP723.DETECTED is deduplicated across repeated opens and saves of the same URI', async () => {
        const uri = Uri.file(path.resolve('/ws/repeat.py'));
        readMetadataStub.resolves(VALID_METADATA);
        const detector = new InlineScriptLazyDetector();
        detector.activate();
        await fireOpen(uri);
        await fireSave(uri);
        await fireSave(uri);
        await fireOpen(uri);
        assert.strictEqual(callsFor(EventNames.PEP723_DETECTED).length, 1, 'detection event must dedup per session');
        detector.dispose();
    });

    test('PEP723.DETECTED reports hasRequiresPython=false when not declared', async () => {
        const uri = Uri.file(path.resolve('/ws/noPython.py'));
        readMetadataStub.resolves({
            requiresPython: undefined,
            dependencies: [],
            tool: undefined,
            range: { start: 0, end: 20 },
        } satisfies ism.InlineScriptMetadata);
        const detector = new InlineScriptLazyDetector();
        detector.activate();
        await fireOpen(uri);

        const [, measures, properties] = callsFor(EventNames.PEP723_DETECTED)[0].args;
        assert.deepStrictEqual(measures, { dependencyCount: 0 });
        assert.deepStrictEqual(properties, { trigger: 'open', hasRequiresPython: false });
        detector.dispose();
    });

    // ---------- PEP723.EDITED telemetry ----------

    test('PEP723.EDITED fires once on first content change after detection', async () => {
        const uri = Uri.file(path.resolve('/ws/edit.py'));
        readMetadataStub.resolves(VALID_METADATA);
        const detector = new InlineScriptLazyDetector();
        detector.activate();
        await fireOpen(uri);
        fireChange(uri);

        const editedCalls = callsFor(EventNames.PEP723_EDITED);
        assert.strictEqual(editedCalls.length, 1, 'edited event should fire exactly once');
        // Second arg is the measure. This test requires the plain
        // numeric form (latency in ms); an object-shaped measure such
        // as `{ duration }` would fail the `typeof` check below.
        const measureArg = editedCalls[0].args[1];
        assert.strictEqual(typeof measureArg, 'number', 'measure should be a number (latency ms)');
        assert.ok((measureArg as number) >= 0, 'duration should be non-negative');
        detector.dispose();
    });

    test('PEP723.EDITED is deduplicated across repeated edits of the same URI', async () => {
        const uri = Uri.file(path.resolve('/ws/multiEdit.py'));
        readMetadataStub.resolves(VALID_METADATA);
        const detector = new InlineScriptLazyDetector();
        detector.activate();
        await fireOpen(uri);
        fireChange(uri);
        fireChange(uri);
        fireChange(uri);
        assert.strictEqual(callsFor(EventNames.PEP723_EDITED).length, 1);
        detector.dispose();
    });

    test('PEP723.EDITED does not fire for changes on a URI that was never detected', async () => {
        const uri = Uri.file(path.resolve('/ws/notDetected.py'));
        readMetadataStub.resolves(undefined);
        const detector = new InlineScriptLazyDetector();
        detector.activate();
        await fireOpen(uri);
        fireChange(uri);
        assert.strictEqual(callsFor(EventNames.PEP723_EDITED).length, 0);
        detector.dispose();
    });

    test('PEP723.EDITED ignores change events with no content changes', async () => {
        const uri = Uri.file(path.resolve('/ws/noOpChange.py'));
        readMetadataStub.resolves(VALID_METADATA);
        const detector = new InlineScriptLazyDetector();
        detector.activate();
        await fireOpen(uri);
        // VS Code can fire a change event with an empty contentChanges
        // array for things like dirty-state toggles; that's not a user
        // edit and must not count.
        fireChange(uri, []);
        assert.strictEqual(callsFor(EventNames.PEP723_EDITED).length, 0);
        // A real edit still counts after the no-op was ignored.
        fireChange(uri);
        assert.strictEqual(callsFor(EventNames.PEP723_EDITED).length, 1);
        detector.dispose();
    });

    test('PEP723.EDITED is suppressed after dispose()', async () => {
        const uri = Uri.file(path.resolve('/ws/disposedEdit.py'));
        readMetadataStub.resolves(VALID_METADATA);
        const detector = new InlineScriptLazyDetector();
        detector.activate();
        await fireOpen(uri);
        // Hold on to the listener: dispose() clears `changeListener`,
        // and the point of the test is to call the stale reference.
        const grabbedChangeListener = changeListener;
        assert.ok(grabbedChangeListener, 'change listener should be registered after activate()');
        detector.dispose();
        grabbedChangeListener(makeChange(uri));
        assert.strictEqual(callsFor(EventNames.PEP723_EDITED).length, 0);
    });
});

suite('shouldHandleUri', () => {
    let getWorkspaceFolderStub: sinon.SinonStub;

    setup(() => {
        // Treat every URI as living in a workspace folder rooted at
        // its parent directory unless a test overrides the stub.
        getWorkspaceFolderStub = sinon.stub(wapi, 'getWorkspaceFolder');
        getWorkspaceFolderStub.callsFake((uri: Uri) => ({
            uri: Uri.file(path.dirname(uri.fsPath)),
            name: 'ws',
            index: 0,
        }));
    });

    teardown(() => {
        sinon.restore();
    });

    test('accepts .py file in workspace folder', () => {
        assert.strictEqual(shouldHandleUri(Uri.file(path.resolve('/ws/a.py'))), true);
    });

    test('accepts .PY (uppercase) file', () => {
        assert.strictEqual(shouldHandleUri(Uri.file(path.resolve('/ws/A.PY'))), true);
    });

    test('rejects non-.py extension', () => {
        assert.strictEqual(shouldHandleUri(Uri.file(path.resolve('/ws/a.txt'))), false);
    });

    test('rejects non-file scheme', () => {
        assert.strictEqual(shouldHandleUri(Uri.parse('untitled:a.py')), false);
    });

    test('rejects file outside any workspace folder', () => {
        getWorkspaceFolderStub.returns(undefined);
        assert.strictEqual(shouldHandleUri(Uri.file(path.resolve('/elsewhere/a.py'))), false);
    });
});