From 8cbeef151dc7638b9d676309b49e3ecd841a1db2 Mon Sep 17 00:00:00 2001 From: morluto Date: Tue, 30 Jun 2026 23:13:59 +0000 Subject: [PATCH 01/13] fix: respect ignore files in glob --- .../agent-core/src/tools/builtin/file/glob.ts | 58 +++++++++++++---- packages/agent-core/test/tools/glob.test.ts | 63 +++++++++++++++++++ 2 files changed, 109 insertions(+), 12 deletions(-) diff --git a/packages/agent-core/src/tools/builtin/file/glob.ts b/packages/agent-core/src/tools/builtin/file/glob.ts index 69d01c602..bae43b068 100644 --- a/packages/agent-core/src/tools/builtin/file/glob.ts +++ b/packages/agent-core/src/tools/builtin/file/glob.ts @@ -25,7 +25,7 @@ */ import type { Kaos } from '@moonshot-ai/kaos'; -import { normalize, resolve } from 'pathe'; +import { dirname, join, normalize, resolve } from 'pathe'; import { z } from 'zod'; import type { BuiltinTool } from '../../../agent/tool'; @@ -201,17 +201,17 @@ export class GlobTool implements BuiltinTool { // Running from the search root makes glob matching relative to it. const execKaos = this.kaos.withCwd(searchRoot); - let runResult = await runRipgrepOnce( - execKaos, - buildRgArgs(rgPath, args), - signal, - { abortedMessage: 'Glob aborted' }, - ); + const insideGitRepo = + args.include_ignored === true ? true : await isInsideGitRepo(this.kaos, searchRoot); + + let runResult = await runRipgrepOnce(execKaos, buildRgArgs(rgPath, args, insideGitRepo), signal, { + abortedMessage: 'Glob aborted', + }); if (runResult.kind === 'tool-error') return runResult.result; if (shouldRetryRipgrepEagain(runResult)) { runResult = await runRipgrepOnce( execKaos, - buildRgArgs(rgPath, args, true), + buildRgArgs(rgPath, args, insideGitRepo, true), signal, { abortedMessage: 'Glob aborted' }, ); @@ -312,16 +312,27 @@ export class GlobTool implements BuiltinTool { } } -function buildRgArgs(rgPath: string, args: GlobInput, singleThreaded = false): string[] { +function buildRgArgs( + rgPath: string, + args: GlobInput, + insideGitRepo: boolean, + singleThreaded = false, +): string[] { const cmd: string[] = [rgPath]; if (singleThreaded) cmd.push('-j', '1'); cmd.push('--files', '--hidden', '--sortr=modified'); + if (!insideGitRepo && args.include_ignored !== true) { + cmd.push('--no-require-git'); + } for (const dir of VCS_DIRECTORIES_TO_EXCLUDE) { cmd.push('--glob', `!${dir}`); } - // Positive pattern first, then sensitive-file exclusions so a broad - // pattern cannot re-include a sensitive path. - cmd.push('--glob', args.pattern); + // Positive pattern before sensitive-file exclusions unless it is broad + // enough to enumerate everything. Broad positive globs re-include files + // that ignore files exclude, so let `rg --files` walk those directly. + if (!isBroadPattern(args.pattern)) { + cmd.push('--glob', args.pattern); + } for (const glob of SENSITIVE_GLOBS_TO_EXCLUDE) { cmd.push('--glob', `!${glob}`); } @@ -332,6 +343,29 @@ function buildRgArgs(rgPath: string, args: GlobInput, singleThreaded = false): s return cmd; } +function isBroadPattern(pattern: string): boolean { + return pattern === '*' || pattern === '**' || pattern === '**/*'; +} + +async function isInsideGitRepo(kaos: Kaos, searchRoot: string): Promise { + let current = kaos.normpath(searchRoot); + for (;;) { + if (await pathExists(kaos, join(current, '.git'))) return true; + const parent = dirname(current); + if (parent === current) return false; + current = parent; + } +} + +async function pathExists(kaos: Kaos, path: string): Promise { + try { + await kaos.stat(path); + return true; + } catch { + return false; + } +} + function formatGlobError(searchRoot: string, stderr: string): string { const trimmed = stderr.trim(); if (/no such file or directory/i.test(trimmed)) { diff --git a/packages/agent-core/test/tools/glob.test.ts b/packages/agent-core/test/tools/glob.test.ts index dbe0119a5..d8707a616 100644 --- a/packages/agent-core/test/tools/glob.test.ts +++ b/packages/agent-core/test/tools/glob.test.ts @@ -250,6 +250,44 @@ describe('GlobTool', () => { expect(execArgs(exec)).not.toContain('--no-ignore'); }); + it('does not emit a positive --glob for broad all-file patterns', async () => { + for (const pattern of ['*', '**', '**/*'] as const) { + const exec = execReturning('/workspace/a.ts\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + await executeTool(tool, context({ pattern })); + + const args = execArgs(exec); + expect(args).not.toContain(pattern); + expect(args).toContain('--glob'); + expect(args.some((arg) => arg.startsWith('!'))).toBe(true); + } + }); + + it('keeps a positive --glob for anchored patterns', async () => { + const exec = execReturning('/workspace/src/a.ts\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + await executeTool(tool, context({ pattern: 'src/**/*.ts' })); + + expect(execArgs(exec)).toContain('src/**/*.ts'); + }); + + it('adds --no-require-git when the search root is outside a git repo', async () => { + const exec = execReturning('/workspace/a.ts\n'); + const stat = vi.fn(async (candidate: string) => { + if (candidate.endsWith('/.git')) { + throw Object.assign(new Error('ENOENT: no such file or directory'), { code: 'ENOENT' }); + } + return dirStat(); + }); + const tool = new GlobTool(createFakeKaos({ exec, stat }), workspace); + + await executeTool(tool, context({ pattern: '*.ts', path: '/workspace' })); + + expect(execArgs(exec)).toContain('--no-require-git'); + }); + it('caps returned matches and surfaces the truncation header', async () => { const stdout = Array.from({ length: MAX_MATCHES + 1 }, (_, i) => `/workspace/${String(i)}.ts`).join('\n') + @@ -727,4 +765,29 @@ describe('GlobTool integration (real ripgrep)', () => { await fs.rm(externalDir, { recursive: true, force: true }); } }); + + it('respects .gitignore by default for broad patterns in a git repo', async () => { + await touch('kept.ts', new Date('2024-01-01T00:00:00Z')); + await touch('ignored.log', new Date('2024-01-01T00:00:00Z')); + await fs.writeFile(path.join(tmpDir!, '.gitignore'), '*.log\n'); + await fs.mkdir(path.join(tmpDir!, '.git'), { recursive: true }); + const tool = new GlobTool(kaos, ws()); + + const result = await executeTool(tool, context({ pattern: '*', path: tmpDir! })); + + expect(result.output).toContain('kept.ts'); + expect(result.output).not.toContain('ignored.log'); + }); + + it('respects .gitignore by default in a non-git directory', async () => { + await touch('kept.ts', new Date('2024-01-01T00:00:00Z')); + await touch('ignored.log', new Date('2024-01-01T00:00:00Z')); + await fs.writeFile(path.join(tmpDir!, '.gitignore'), '*.log\n'); + const tool = new GlobTool(kaos, ws()); + + const result = await executeTool(tool, context({ pattern: '*', path: tmpDir! })); + + expect(result.output).toContain('kept.ts'); + expect(result.output).not.toContain('ignored.log'); + }); }); From 3fc6b3a8ae42bb7951de34dfdae61fbdded9e966 Mon Sep 17 00:00:00 2001 From: morluto Date: Tue, 30 Jun 2026 23:14:03 +0000 Subject: [PATCH 02/13] chore: add glob ignore changeset --- .changeset/glob-ignore-files.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/glob-ignore-files.md diff --git a/.changeset/glob-ignore-files.md b/.changeset/glob-ignore-files.md new file mode 100644 index 000000000..cb9f37251 --- /dev/null +++ b/.changeset/glob-ignore-files.md @@ -0,0 +1,5 @@ +--- +"@moonshot-ai/kimi-code": patch +--- + +Fix glob file searches so broad patterns continue to respect ignore files. From f01f7b42d052cf633a18be2cbbc5cb07f3c50465 Mon Sep 17 00:00:00 2001 From: morluto Date: Wed, 1 Jul 2026 02:11:05 +0000 Subject: [PATCH 03/13] fix: filter glob patterns in-process to respect ignore files A positive ripgrep --glob "always overrides any other ignore logic," so passing any non-broad user pattern (e.g. `*.ts`, `dist/**/*.js`) as --glob re-included files excluded by .gitignore/.ignore/.rgignore. In a repo with .gitignore containing `*.ts`, Glob({ pattern: '*.ts' }) would surface ignored.ts despite the documented default ignore behavior. Stop passing the user's pattern as a positive --glob. Instead, let rg --files enumerate non-ignored files and filter the results in-process with picomatch, preserving both the pattern match and the ignore-file respect. Broad patterns (`*`, `**`, `**/*`) still skip the filter since they match everything. --- .changeset/glob-ignore-files.md | 2 +- .../agent-core/src/tools/builtin/file/glob.ts | 56 ++++++++++++-- packages/agent-core/test/tools/glob.test.ts | 77 ++++++++++++++----- 3 files changed, 108 insertions(+), 27 deletions(-) diff --git a/.changeset/glob-ignore-files.md b/.changeset/glob-ignore-files.md index cb9f37251..0a836ca85 100644 --- a/.changeset/glob-ignore-files.md +++ b/.changeset/glob-ignore-files.md @@ -2,4 +2,4 @@ "@moonshot-ai/kimi-code": patch --- -Fix glob file searches so broad patterns continue to respect ignore files. +Fix glob file searches so all patterns continue to respect ignore files. A positive ripgrep `--glob` always overrides ignore logic, so any non-broad user pattern (e.g. `*.ts`, `dist/**/*.js`) could re-include files excluded by `.gitignore`. The pattern is now filtered in-process after `rg --files` enumerates non-ignored files. diff --git a/packages/agent-core/src/tools/builtin/file/glob.ts b/packages/agent-core/src/tools/builtin/file/glob.ts index bae43b068..783830cf0 100644 --- a/packages/agent-core/src/tools/builtin/file/glob.ts +++ b/packages/agent-core/src/tools/builtin/file/glob.ts @@ -25,7 +25,8 @@ */ import type { Kaos } from '@moonshot-ai/kaos'; -import { dirname, join, normalize, resolve } from 'pathe'; +import { dirname, join, normalize, relative, resolve } from 'pathe'; +import picomatch from 'picomatch'; import { z } from 'zod'; import type { BuiltinTool } from '../../../agent/tool'; @@ -250,10 +251,16 @@ export class GlobTool implements BuiltinTool { resolve(searchRoot, p), ); + // Filter by the user's positive glob pattern in-process. A positive + // --glob would override ignore-file logic, so the pattern is not passed + // to rg; instead rg --files enumerates non-ignored files and we filter + // here to preserve both the pattern match and the ignore-file respect. + const patternMatched = filterByPattern(rawPaths, searchRoot, args.pattern); + // Authoritative sensitive-file check (the rg prefilter is conservative). const kept: string[] = []; let filteredSensitive = 0; - for (const p of rawPaths) { + for (const p of patternMatched) { if (isSensitiveFile(p)) { filteredSensitive++; } else { @@ -327,12 +334,11 @@ function buildRgArgs( for (const dir of VCS_DIRECTORIES_TO_EXCLUDE) { cmd.push('--glob', `!${dir}`); } - // Positive pattern before sensitive-file exclusions unless it is broad - // enough to enumerate everything. Broad positive globs re-include files - // that ignore files exclude, so let `rg --files` walk those directly. - if (!isBroadPattern(args.pattern)) { - cmd.push('--glob', args.pattern); - } + // The user's positive pattern is NOT passed as --glob. A positive --glob + // "always overrides any other ignore logic" (ripgrep docs), so it would + // re-include files excluded by .gitignore/.ignore/.rgignore. Instead, let + // rg --files enumerate non-ignored files and filter the results + // in-process via matchUserPattern(). for (const glob of SENSITIVE_GLOBS_TO_EXCLUDE) { cmd.push('--glob', `!${glob}`); } @@ -347,6 +353,40 @@ function isBroadPattern(pattern: string): boolean { return pattern === '*' || pattern === '**' || pattern === '**/*'; } +/** + * Match a file path against the user's glob pattern, using the same + * semantics ripgrep's `--glob` would have applied: + * - Patterns without `/` match the basename at any depth. + * - Patterns with `/` match the relative path from the search root. + * - `**` matches zero or more directory levels. + * - Dotfiles are matched (rg runs with `--hidden`). + * - Brace expansion (`*.{ts,tsx}`) is handled by picomatch natively. + */ +function matchUserPattern(relPath: string, pattern: string): boolean { + const opts = { dot: true, nocase: true }; + if (!pattern.includes('/')) { + const basename = relPath.split('/').pop()!; + return picomatch.isMatch(basename, pattern, opts); + } + return picomatch.isMatch(relPath, pattern, opts); +} + +/** + * Filter absolute paths from `rg --files` against the user's positive glob + * pattern. Broad patterns (star, double-star, star-slash-star) match + * everything, so the filter is skipped for those. Returns the filtered list + * in the same order. + */ +function filterByPattern(absPaths: string[], searchRoot: string, pattern: string): string[] { + if (isBroadPattern(pattern)) return absPaths; + const result: string[] = []; + for (const absPath of absPaths) { + const rel = relative(searchRoot, absPath); + if (matchUserPattern(rel, pattern)) result.push(absPath); + } + return result; +} + async function isInsideGitRepo(kaos: Kaos, searchRoot: string): Promise { let current = kaos.normpath(searchRoot); for (;;) { diff --git a/packages/agent-core/test/tools/glob.test.ts b/packages/agent-core/test/tools/glob.test.ts index d8707a616..6b903afeb 100644 --- a/packages/agent-core/test/tools/glob.test.ts +++ b/packages/agent-core/test/tools/glob.test.ts @@ -162,7 +162,7 @@ describe('GlobTool', () => { additionalDirs: [], }); - const result = await executeTool(tool, context({ pattern: 'src/**/*.ts', path: 'C:\\WORKSPACE' })); + const result = await executeTool(tool, context({ pattern: 'src/**/*.ts', path: 'C:\\workspace' })); // pathe.normalize renders Windows paths with forward slashes, so the // relativized result keeps `/` regardless of the backend path class. @@ -183,31 +183,37 @@ describe('GlobTool', () => { expect(result.output).toContain(`[Truncated at ${String(MAX_MATCHES)} matches`); }); - it('passes a brace pattern through to a single rg --glob', async () => { - const exec = execReturning('/workspace/a.ts\n/workspace/shared.ts\n/workspace/shared.tsx\n'); + it('filters brace patterns in-process without passing them as a positive --glob', async () => { + const exec = execReturning( + '/workspace/a.ts\n/workspace/shared.ts\n/workspace/shared.tsx\n/workspace/b.js\n', + ); const tool = new GlobTool(kaosWithExec(exec), workspace); const result = await executeTool(tool, context({ pattern: '*.{ts,tsx}' })); expect(result.isError).toBeFalsy(); - expect(execArgs(exec)).toContain('*.{ts,tsx}'); + // The pattern must NOT be passed as a positive --glob — that would + // override ignore-file logic. Filtering happens in-process. + expect(execArgs(exec)).not.toContain('*.{ts,tsx}'); expect(result.output).toContain('a.ts'); expect(result.output).toContain('shared.ts'); expect(result.output).toContain('shared.tsx'); + expect(result.output).not.toContain('b.js'); }); - it('passes an escaped-brace pattern through unchanged so literal-brace files stay matchable', async () => { + it('matches an escaped-brace pattern in-process so literal-brace files stay matchable', async () => { // `\{a,b\}.ts` opts out of brace expansion — the user wants a file - // literally named `{a,b}.ts`. The pattern must reach rg with the escapes - // intact (the tool must not strip or reinterpret the backslashes). - const exec = execReturning('/workspace/{a,b}.ts\n'); + // literally named `{a,b}.ts`. The pattern is matched in-process, so the + // escapes are handled by picomatch, not rg. + const exec = execReturning('/workspace/{a,b}.ts\n/workspace/other.ts\n'); const tool = new GlobTool(kaosWithExec(exec), workspace); const result = await executeTool(tool, context({ pattern: '\\{a,b\\}.ts' })); expect(result.isError).toBeFalsy(); - expect(execArgs(exec)).toContain('\\{a,b\\}.ts'); + expect(execArgs(exec)).not.toContain('\\{a,b\\}.ts'); expect(result.output).toContain('{a,b}.ts'); + expect(result.output).not.toContain('other.ts'); }); it('searches only the current workspace when path is omitted', async () => { @@ -264,13 +270,15 @@ describe('GlobTool', () => { } }); - it('keeps a positive --glob for anchored patterns', async () => { - const exec = execReturning('/workspace/src/a.ts\n'); + it('filters anchored patterns in-process without a positive --glob', async () => { + const exec = execReturning('/workspace/src/a.ts\n/workspace/other/b.ts\n'); const tool = new GlobTool(kaosWithExec(exec), workspace); - await executeTool(tool, context({ pattern: 'src/**/*.ts' })); + const result = await executeTool(tool, context({ pattern: 'src/**/*.ts' })); - expect(execArgs(exec)).toContain('src/**/*.ts'); + expect(execArgs(exec)).not.toContain('src/**/*.ts'); + expect(result.output).toContain('src/a.ts'); + expect(result.output).not.toContain('other/b.ts'); }); it('adds --no-require-git when the search root is outside a git repo', async () => { @@ -329,7 +337,7 @@ describe('GlobTool', () => { }); it('filters sensitive files from results', async () => { - const exec = execReturning('/workspace/.env\n/workspace/src/a.ts\n'); + const exec = execReturning('/workspace/src/.env\n/workspace/src/a.ts\n'); const tool = new GlobTool(kaosWithExec(exec), workspace); const result = await executeTool(tool, context({ pattern: 'src/**' })); @@ -396,15 +404,16 @@ describe('GlobTool', () => { }); it('walks "**/" prefix patterns with a literal anchor', async () => { - const exec = execReturning('/workspace/a.py\n/workspace/sub/b.py\n'); + const exec = execReturning('/workspace/a.py\n/workspace/sub/b.py\n/workspace/other.txt\n'); const tool = new GlobTool(kaosWithExec(exec), workspace); const result = await executeTool(tool, context({ pattern: '**/*.py' })); expect(result.isError).toBeFalsy(); - expect(execArgs(exec)).toContain('**/*.py'); + expect(execArgs(exec)).not.toContain('**/*.py'); expect(result.output).toContain('a.py'); expect(result.output).toContain('sub/b.py'); + expect(result.output).not.toContain('other.txt'); }); it('walks safe recursive patterns with a literal subdirectory anchor', async () => { @@ -494,14 +503,15 @@ describe('GlobTool', () => { }); it('walks "**/" patterns with literal subdirectory anchors after the prefix', async () => { - const exec = execReturning('/workspace/src/main/app.py\n'); + const exec = execReturning('/workspace/src/main/app.py\n/workspace/other/x.py\n'); const tool = new GlobTool(kaosWithExec(exec), workspace); const result = await executeTool(tool, context({ pattern: '**/main/*.py' })); expect(result.isError).toBeFalsy(); - expect(execArgs(exec)).toContain('**/main/*.py'); + expect(execArgs(exec)).not.toContain('**/main/*.py'); expect(result.output).toContain('src/main/app.py'); + expect(result.output).not.toContain('other/x.py'); }); it('matches dotfiles like .gitlab-ci.yml under a simple "*.yml" pattern', async () => { @@ -790,4 +800,35 @@ describe('GlobTool integration (real ripgrep)', () => { expect(result.output).toContain('kept.ts'); expect(result.output).not.toContain('ignored.log'); }); + + it('respects .gitignore for specific patterns that would re-include ignored files', async () => { + // A positive --glob overrides ignore logic in ripgrep, so + // Glob({ pattern: '*.ts' }) in a repo with .gitignore containing + // *.ts would surface ignored.ts. The in-process filter avoids this + // by letting rg --files enumerate non-ignored files first. + await touch('kept.ts', new Date('2024-01-01T00:00:00Z')); + await touch('ignored.ts', new Date('2024-01-01T00:00:00Z')); + await fs.writeFile(path.join(tmpDir!, '.gitignore'), '*.ts\n'); + await fs.mkdir(path.join(tmpDir!, '.git'), { recursive: true }); + const tool = new GlobTool(kaos, ws()); + + const result = await executeTool(tool, context({ pattern: '*.ts', path: tmpDir! })); + + expect(result.output).toContain('No matches'); + expect(result.output).not.toContain('kept.ts'); + expect(result.output).not.toContain('ignored.ts'); + }); + + it('respects .gitignore for specific patterns in a non-git directory', async () => { + await touch('kept.ts', new Date('2024-01-01T00:00:00Z')); + await touch('ignored.ts', new Date('2024-01-01T00:00:00Z')); + await fs.writeFile(path.join(tmpDir!, '.gitignore'), '*.ts\n'); + const tool = new GlobTool(kaos, ws()); + + const result = await executeTool(tool, context({ pattern: '*.ts', path: tmpDir! })); + + expect(result.output).toContain('No matches'); + expect(result.output).not.toContain('kept.ts'); + expect(result.output).not.toContain('ignored.ts'); + }); }); From 41ecf63991b5786cdcc7b15ea040935693e5795c Mon Sep 17 00:00:00 2001 From: morluto Date: Wed, 1 Jul 2026 02:50:54 +0000 Subject: [PATCH 04/13] fix: harden glob in-process filtering against cap, case, and malformed patterns runRipgrepOnce: add optional lineFilter that filters stdout lines as they stream in, before they count toward the MAX_OUTPUT_BYTES cap. GlobTool uses this so a selective pattern (e.g. rare/**/*.ts) is not starved by a large prefix of non-matching paths when rg --files returns >10MB of output. GlobTool: - Apply the user's glob pattern as a streaming lineFilter on rg --files output so matching paths survive the stdout cap. The post-cap filterByPattern re-check remains as a safety net. - Remove nocase:true from picomatch options. ripgrep's default glob matching is case-sensitive (--glob-case-insensitive is a separate flag), so the previous setting caused *.TS to match foo.ts on POSIX, diverging from rg --glob behaviour. - Normalize the search root and absolute paths case-insensitively on Windows before computing the relative path in filterByPattern, so a user-supplied root with different casing than rg returns doesn't produce escaped paths that fail to match. - Validate glob patterns for unclosed [ and { before running rg. picomatch silently treats malformed globs as literals and reports "No matches found", whereas rg --glob used to exit 2 with a hard error. The new validateGlobPattern restores that error behaviour. --- .../agent-core/src/tools/builtin/file/glob.ts | 103 ++++++++++++++++-- .../agent-core/src/tools/support/run-rg.ts | 57 ++++++++-- packages/agent-core/test/tools/glob.test.ts | 51 ++++++++- 3 files changed, 190 insertions(+), 21 deletions(-) diff --git a/packages/agent-core/src/tools/builtin/file/glob.ts b/packages/agent-core/src/tools/builtin/file/glob.ts index 783830cf0..f1f331a2b 100644 --- a/packages/agent-core/src/tools/builtin/file/glob.ts +++ b/packages/agent-core/src/tools/builtin/file/glob.ts @@ -16,7 +16,7 @@ * files (e.g. build outputs, `node_modules`). Sensitive files such * as `.env` are always filtered out. * - Brace expansion (`*.{ts,tsx}`, `{src,test}/**`) is handled by - * ripgrep's glob engine. + * picomatch in-process. * - `path` is validated by `resolvePathAccess` in `absolute-outside-allowed` * mode. Explicit absolute paths outside the workspace are allowed; relative * paths that escape the workspace stay rejected. @@ -25,7 +25,7 @@ */ import type { Kaos } from '@moonshot-ai/kaos'; -import { dirname, join, normalize, relative, resolve } from 'pathe'; +import { dirname, isAbsolute, join, normalize, relative, resolve } from 'pathe'; import picomatch from 'picomatch'; import { z } from 'zod'; @@ -161,6 +161,11 @@ export class GlobTool implements BuiltinTool { ): Promise { const searchRoot = searchRoots[0] ?? this.workspace.workspaceDir; + const patternError = validateGlobPattern(args.pattern); + if (patternError !== undefined) { + return { isError: true, output: patternError }; + } + // Validate the search root is a directory. `rg --files ` exits 0 // and lists the file itself, so without this check a file root would be // returned as its own match instead of rejected. A missing root surfaces @@ -205,8 +210,12 @@ export class GlobTool implements BuiltinTool { const insideGitRepo = args.include_ignored === true ? true : await isInsideGitRepo(this.kaos, searchRoot); + const pathClass = this.kaos.pathClass(); + const lineFilter = makeLineFilter(args.pattern, pathClass, searchRoot); + let runResult = await runRipgrepOnce(execKaos, buildRgArgs(rgPath, args, insideGitRepo), signal, { abortedMessage: 'Glob aborted', + lineFilter, }); if (runResult.kind === 'tool-error') return runResult.result; if (shouldRetryRipgrepEagain(runResult)) { @@ -214,7 +223,7 @@ export class GlobTool implements BuiltinTool { execKaos, buildRgArgs(rgPath, args, insideGitRepo, true), signal, - { abortedMessage: 'Glob aborted' }, + { abortedMessage: 'Glob aborted', lineFilter }, ); if (runResult.kind === 'tool-error') return runResult.result; } @@ -255,7 +264,10 @@ export class GlobTool implements BuiltinTool { // --glob would override ignore-file logic, so the pattern is not passed // to rg; instead rg --files enumerates non-ignored files and we filter // here to preserve both the pattern match and the ignore-file respect. - const patternMatched = filterByPattern(rawPaths, searchRoot, args.pattern); + // The streaming lineFilter already applied this filter before the cap, + // but we re-check here as a safety net for any paths that slipped + // through (e.g. broad patterns where the filter is skipped). + const patternMatched = filterByPattern(rawPaths, searchRoot, args.pattern, pathClass); // Authoritative sensitive-file check (the rg prefilter is conservative). const kept: string[] = []; @@ -284,7 +296,6 @@ export class GlobTool implements BuiltinTool { // save tokens, but only for the primary workspace. Relative paths are // later resolved against workspaceDir, so additionalDir matches stay // absolute to keep follow-up Read/Edit calls on the same file. - const pathClass = this.kaos.pathClass(); const shouldRelativize = isWithinDirectory(searchRoot, this.workspace.workspaceDir, pathClass); const displayLines = limited.map((p) => shouldRelativize ? relativizeIfUnder(p, searchRoot, pathClass) : p, @@ -360,10 +371,12 @@ function isBroadPattern(pattern: string): boolean { * - Patterns with `/` match the relative path from the search root. * - `**` matches zero or more directory levels. * - Dotfiles are matched (rg runs with `--hidden`). + * - Matching is case-sensitive, matching ripgrep's default (use + * `--glob-case-insensitive` / `--iglob` for case-insensitive mode). * - Brace expansion (`*.{ts,tsx}`) is handled by picomatch natively. */ function matchUserPattern(relPath: string, pattern: string): boolean { - const opts = { dot: true, nocase: true }; + const opts = { dot: true }; if (!pattern.includes('/')) { const basename = relPath.split('/').pop()!; return picomatch.isMatch(basename, pattern, opts); @@ -375,18 +388,90 @@ function matchUserPattern(relPath: string, pattern: string): boolean { * Filter absolute paths from `rg --files` against the user's positive glob * pattern. Broad patterns (star, double-star, star-slash-star) match * everything, so the filter is skipped for those. Returns the filtered list - * in the same order. + * in the same order. On Windows, the search root and absolute paths are + * case-normalized before computing the relative path so that a user-supplied + * root with different casing than rg returns doesn't produce escaped paths. */ -function filterByPattern(absPaths: string[], searchRoot: string, pattern: string): string[] { +function filterByPattern( + absPaths: string[], + searchRoot: string, + pattern: string, + pathClass: PathClass, +): string[] { if (isBroadPattern(pattern)) return absPaths; const result: string[] = []; + const normRoot = normalize(searchRoot); + const comparableRoot = pathClass === 'win32' ? normRoot.toLowerCase() : normRoot; for (const absPath of absPaths) { - const rel = relative(searchRoot, absPath); + const normAbs = normalize(absPath); + const comparableAbs = pathClass === 'win32' ? normAbs.toLowerCase() : normAbs; + const rel = relative(comparableRoot, comparableAbs); if (matchUserPattern(rel, pattern)) result.push(absPath); } return result; } +/** + * Build a streaming line filter for `runRipgrepOnce` that applies the user's + * glob pattern to each path line before it counts toward the output cap. + * Returns `undefined` for broad patterns (no filtering needed) so the + * original byte-level cap path is used. + * + * rg runs with cwd pinned to the search root, so each line is normally a + * relative path like `./src/a.ts` (POSIX) or `.\src\a.ts` (Windows). The + * filter strips the leading `./` or `.\` and normalizes backslashes to + * forward slashes on Windows before matching. If the line is an absolute + * path (e.g. from a test mock or an external root), it is made relative to + * the search root first. + */ +function makeLineFilter( + pattern: string, + pathClass: PathClass, + searchRoot: string, +): ((line: string) => boolean) | undefined { + if (isBroadPattern(pattern)) return undefined; + const normRoot = normalize(searchRoot); + const comparableRoot = pathClass === 'win32' ? normRoot.toLowerCase() : normRoot; + return (line: string): boolean => { + let relPath = line; + if (pathClass === 'win32') relPath = relPath.replace(/\\/g, '/'); + if (relPath.startsWith('./') || relPath.startsWith('.\\')) { + relPath = relPath.slice(2); + } else if (isAbsolute(relPath)) { + const normAbs = pathClass === 'win32' ? normalize(relPath).toLowerCase() : normalize(relPath); + relPath = relative(comparableRoot, normAbs); + } + return matchUserPattern(relPath, pattern); + }; +} + +/** + * Validate a glob pattern for common malformed syntax (unclosed `[` or `{`). + * Returns an error message if the pattern is invalid, or `undefined` if it + * is well-formed. This mirrors ripgrep's globset parser, which rejects + * unclosed character classes and brace expansions with a hard error — + * picomatch would silently treat them as literals and report "No matches + * found" instead. + */ +function validateGlobPattern(pattern: string): string | undefined { + let bracketDepth = 0; + let braceDepth = 0; + for (let i = 0; i < pattern.length; i++) { + const ch = pattern[i]; + if (ch === '\\') { + i++; + continue; + } + if (ch === '[') bracketDepth++; + else if (ch === ']' && bracketDepth > 0) bracketDepth--; + else if (ch === '{') braceDepth++; + else if (ch === '}' && braceDepth > 0) braceDepth--; + } + if (bracketDepth > 0) return `Invalid glob pattern: unclosed '[' in "${pattern}"`; + if (braceDepth > 0) return `Invalid glob pattern: unclosed '{' in "${pattern}"`; + return undefined; +} + async function isInsideGitRepo(kaos: Kaos, searchRoot: string): Promise { let current = kaos.normpath(searchRoot); for (;;) { diff --git a/packages/agent-core/src/tools/support/run-rg.ts b/packages/agent-core/src/tools/support/run-rg.ts index f8713cb02..77cf2a6fe 100644 --- a/packages/agent-core/src/tools/support/run-rg.ts +++ b/packages/agent-core/src/tools/support/run-rg.ts @@ -58,6 +58,15 @@ export type RipgrepRunOutcome = export interface RunRipgrepOptions { /** Message surfaced when the run is aborted via `signal`. Defaults to `"Aborted"`. */ readonly abortedMessage?: string; + /** + * Optional line-level filter applied to stdout as lines stream in. When + * provided, only lines that pass the filter count toward the output cap, + * so a selective pattern can't be starved by a large prefix of non-matching + * lines (e.g. `rg --files` returning >10MB of paths before the user's glob + * matches). Lines are split on `\n`; the filter receives the line without + * the trailing newline. + */ + readonly lineFilter?: (line: string) => boolean; } async function disposeProcess(proc: KaosProcess): Promise { @@ -75,6 +84,7 @@ export async function runRipgrepOnce( options: RunRipgrepOptions = {}, ): Promise { const abortedMessage = options.abortedMessage ?? 'Aborted'; + const lineFilter = options.lineFilter; if (signal.aborted) { return { kind: 'tool-error', result: { isError: true, output: abortedMessage } }; } @@ -165,7 +175,7 @@ export async function runRipgrepOnce( try { const isTerminating = (): boolean => timedOut || aborted || killed; const [stdoutResult, stderrResult, code] = await Promise.all([ - readStreamWithCap(proc.stdout, MAX_OUTPUT_BYTES, isTerminating), + readStreamWithCap(proc.stdout, MAX_OUTPUT_BYTES, isTerminating, lineFilter), readStreamWithCap(proc.stderr, MAX_OUTPUT_BYTES, isTerminating), proc.wait(), ]); @@ -229,29 +239,58 @@ async function readStreamWithCap( stream: Readable, maxBytes: number, suppressPrematureClose?: () => boolean, + lineFilter?: (line: string) => boolean, ): Promise { const chunks: Buffer[] = []; let total = 0; let truncated = false; + let lineBuffer = ''; try { for await (const chunk of stream) { const buf: Buffer = typeof chunk === 'string' ? Buffer.from(chunk, 'utf8') : (chunk as Buffer); if (truncated) continue; - if (total + buf.length > maxBytes) { - const remaining = maxBytes - total; - if (remaining > 0) chunks.push(buf.subarray(0, remaining)); - total = maxBytes; - truncated = true; - continue; + if (lineFilter) { + lineBuffer += buf.toString('utf8'); + const lines = lineBuffer.split('\n'); + lineBuffer = lines.pop()!; + for (const line of lines) { + if (!lineFilter(line)) continue; + const lineBytes = Buffer.from(line + '\n', 'utf8'); + if (total + lineBytes.length > maxBytes) { + truncated = true; + break; + } + chunks.push(lineBytes); + total += lineBytes.length; + } + } else { + if (total + buf.length > maxBytes) { + const remaining = maxBytes - total; + if (remaining > 0) chunks.push(buf.subarray(0, remaining)); + total = maxBytes; + truncated = true; + continue; + } + chunks.push(buf); + total += buf.length; } - chunks.push(buf); - total += buf.length; } } catch (error) { if (!isPrematureCloseError(error) || suppressPrematureClose?.() !== true) { throw error; } } + if (!truncated && lineBuffer.length > 0) { + if (!lineFilter || lineFilter(lineBuffer)) { + const lineBytes = Buffer.from(lineBuffer, 'utf8'); + if (total + lineBytes.length > maxBytes) { + truncated = true; + } else { + chunks.push(lineBytes); + total += lineBytes.length; + } + } + } return { text: Buffer.concat(chunks).toString('utf8'), truncated }; } diff --git a/packages/agent-core/test/tools/glob.test.ts b/packages/agent-core/test/tools/glob.test.ts index 6b903afeb..6e33acdbe 100644 --- a/packages/agent-core/test/tools/glob.test.ts +++ b/packages/agent-core/test/tools/glob.test.ts @@ -449,6 +449,28 @@ describe('GlobTool', () => { expect(result.output).toContain('No matches found'); }); + it('filters the pattern before the stdout cap so rare matches are not starved', async () => { + // Simulate >10MB of non-matching paths followed by a matching path. + // Without the streaming line filter, the cap would truncate before the + // match and the tool would report "No matches found". + const nonMatching = Array.from( + { length: 200_000 }, + (_, i) => `/workspace/noise_${String(i)}.txt`, + ).join('\n'); + const stdout = nonMatching + '\n/workspace/rare/deep/match.ts\n'; + const exec = execReturning(stdout); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: 'rare/**/*.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('rare/deep/match.ts'); + expect(result.output).not.toContain('noise_'); + }); + it('keeps complete paths and surfaces a warning when rg exits 2 after traversal errors', async () => { const exec = execReturning( '/workspace/a.ts\n/workspace/src/b.ts\n', @@ -466,14 +488,37 @@ describe('GlobTool', () => { expect(result.output).toContain('Permission denied'); }); - it('keeps ripgrep errors hard failures when no complete path is produced', async () => { - const exec = execReturning('', 'error: invalid glob', 2); + it('rejects malformed glob patterns before running ripgrep', async () => { + const exec = vi.fn(); const tool = new GlobTool(kaosWithExec(exec), workspace); const result = await executeTool(tool, context({ pattern: '[', path: '/workspace' })); expect(result).toMatchObject({ isError: true }); - expect(result.output).toContain('Glob failed: error: invalid glob'); + expect(result.output).toContain('Invalid glob pattern'); + expect(result.output).toContain('unclosed'); + expect(exec).not.toHaveBeenCalled(); + }); + + it('rejects malformed glob patterns with unclosed braces', async () => { + const exec = vi.fn(); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool(tool, context({ pattern: '*.{ts,tsx', path: '/workspace' })); + + expect(result).toMatchObject({ isError: true }); + expect(result.output).toContain('unclosed'); + expect(exec).not.toHaveBeenCalled(); + }); + + it('surfaces ripgrep errors when no complete path is produced', async () => { + const exec = execReturning('', 'error: something went wrong', 2); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool(tool, context({ pattern: '*.ts', path: '/workspace' })); + + expect(result).toMatchObject({ isError: true }); + expect(result.output).toContain('Glob failed: error: something went wrong'); }); it('reports "does not exist" when the search directory is missing', async () => { From 6f176103bbb3091a6d221c6f32269d18ae11ed53 Mon Sep 17 00:00:00 2001 From: morluto Date: Wed, 1 Jul 2026 03:20:33 +0000 Subject: [PATCH 05/13] fix: handle rooted globs, multibyte streaming, bracket braces, Windows case matchUserPattern(): - Strip a leading `/` from the pattern before matching. ripgrep's --glob treats `/src/*.ts` as rooted at the search root (equivalent to `src/*.ts`), but the in-process matcher compared the rootless relative path directly against the rooted pattern and reported no matches. readStreamWithCap() in run-rg.ts: - Use a streaming TextDecoder with `{ stream: true }` instead of buf.toString('utf8') per chunk. A multibyte filename split across chunk boundaries would otherwise produce replacement characters, corrupting the path or failing the pattern match. validateGlobPattern(): - Track whether the scanner is inside a `[...]` character class and skip brace depth counting while inside one. ripgrep accepts `[{]foo.ts` as a valid pattern (the `{` is literal inside the bracket), but the previous scanner incremented braceDepth for it and rejected the pattern before ripgrep ran. filterByPattern() / makeLineFilter(): - Preserve the original case of the relative path on Windows. The previous code lowercased the entire path before deriving the relative path, so case-sensitive patterns like `SRC/**/*.TS` would fail against rg's original-case output. The new `relativePath()` helper compares case-insensitively to find the boundary but returns the original-case suffix for pattern matching. --- .../agent-core/src/tools/builtin/file/glob.ts | 69 +++++++++++++------ .../agent-core/src/tools/support/run-rg.ts | 10 ++- packages/agent-core/test/tools/glob.test.ts | 52 ++++++++++++++ 3 files changed, 109 insertions(+), 22 deletions(-) diff --git a/packages/agent-core/src/tools/builtin/file/glob.ts b/packages/agent-core/src/tools/builtin/file/glob.ts index f1f331a2b..3be2698b6 100644 --- a/packages/agent-core/src/tools/builtin/file/glob.ts +++ b/packages/agent-core/src/tools/builtin/file/glob.ts @@ -369,6 +369,8 @@ function isBroadPattern(pattern: string): boolean { * semantics ripgrep's `--glob` would have applied: * - Patterns without `/` match the basename at any depth. * - Patterns with `/` match the relative path from the search root. + * - A leading `/` is stripped — ripgrep treats `/src/*.ts` as rooted at + * the search root, equivalent to `src/*.ts`. * - `**` matches zero or more directory levels. * - Dotfiles are matched (rg runs with `--hidden`). * - Matching is case-sensitive, matching ripgrep's default (use @@ -377,11 +379,12 @@ function isBroadPattern(pattern: string): boolean { */ function matchUserPattern(relPath: string, pattern: string): boolean { const opts = { dot: true }; - if (!pattern.includes('/')) { + const normalizedPattern = pattern.startsWith('/') ? pattern.slice(1) : pattern; + if (!normalizedPattern.includes('/')) { const basename = relPath.split('/').pop()!; - return picomatch.isMatch(basename, pattern, opts); + return picomatch.isMatch(basename, normalizedPattern, opts); } - return picomatch.isMatch(relPath, pattern, opts); + return picomatch.isMatch(relPath, normalizedPattern, opts); } /** @@ -389,8 +392,8 @@ function matchUserPattern(relPath: string, pattern: string): boolean { * pattern. Broad patterns (star, double-star, star-slash-star) match * everything, so the filter is skipped for those. Returns the filtered list * in the same order. On Windows, the search root and absolute paths are - * case-normalized before computing the relative path so that a user-supplied - * root with different casing than rg returns doesn't produce escaped paths. + * case-normalized only to compute the relative path boundary — the original- + * case relative path is then used for case-sensitive pattern matching. */ function filterByPattern( absPaths: string[], @@ -400,17 +403,36 @@ function filterByPattern( ): string[] { if (isBroadPattern(pattern)) return absPaths; const result: string[] = []; - const normRoot = normalize(searchRoot); - const comparableRoot = pathClass === 'win32' ? normRoot.toLowerCase() : normRoot; for (const absPath of absPaths) { - const normAbs = normalize(absPath); - const comparableAbs = pathClass === 'win32' ? normAbs.toLowerCase() : normAbs; - const rel = relative(comparableRoot, comparableAbs); + const rel = relativePath(searchRoot, absPath, pathClass); + if (rel === undefined) continue; if (matchUserPattern(rel, pattern)) result.push(absPath); } return result; } +/** + * Compute the relative path from `searchRoot` to `absPath`, preserving the + * original casing of `absPath`. On Windows, the root and path are compared + * case-insensitively to find the boundary, but the returned relative path + * keeps the original case so case-sensitive glob matching works correctly. + * Returns `undefined` if `absPath` is not under `searchRoot`. + */ +function relativePath(searchRoot: string, absPath: string, pathClass: PathClass): string | undefined { + const normAbs = normalize(absPath); + const normRoot = normalize(searchRoot); + if (pathClass !== 'win32') { + const rel = relative(normRoot, normAbs); + return rel.startsWith('..') ? undefined : rel; + } + const lowerAbs = normAbs.toLowerCase(); + const lowerRoot = normRoot.toLowerCase(); + if (lowerAbs === lowerRoot) return '.'; + const prefix = lowerRoot.endsWith('/') ? lowerRoot : lowerRoot + '/'; + if (!lowerAbs.startsWith(prefix)) return undefined; + return normAbs.slice(prefix.length); +} + /** * Build a streaming line filter for `runRipgrepOnce` that applies the user's * glob pattern to each path line before it counts toward the output cap. @@ -422,7 +444,7 @@ function filterByPattern( * filter strips the leading `./` or `.\` and normalizes backslashes to * forward slashes on Windows before matching. If the line is an absolute * path (e.g. from a test mock or an external root), it is made relative to - * the search root first. + * the search root first, preserving original case for pattern matching. */ function makeLineFilter( pattern: string, @@ -430,16 +452,15 @@ function makeLineFilter( searchRoot: string, ): ((line: string) => boolean) | undefined { if (isBroadPattern(pattern)) return undefined; - const normRoot = normalize(searchRoot); - const comparableRoot = pathClass === 'win32' ? normRoot.toLowerCase() : normRoot; return (line: string): boolean => { let relPath = line; if (pathClass === 'win32') relPath = relPath.replace(/\\/g, '/'); if (relPath.startsWith('./') || relPath.startsWith('.\\')) { relPath = relPath.slice(2); } else if (isAbsolute(relPath)) { - const normAbs = pathClass === 'win32' ? normalize(relPath).toLowerCase() : normalize(relPath); - relPath = relative(comparableRoot, normAbs); + const rel = relativePath(searchRoot, relPath, pathClass); + if (rel === undefined) return false; + relPath = rel; } return matchUserPattern(relPath, pattern); }; @@ -451,9 +472,11 @@ function makeLineFilter( * is well-formed. This mirrors ripgrep's globset parser, which rejects * unclosed character classes and brace expansions with a hard error — * picomatch would silently treat them as literals and report "No matches - * found" instead. + * found" instead. Braces inside a character class (`[{]foo`) are literal + * characters and do not count toward brace depth. */ function validateGlobPattern(pattern: string): string | undefined { + let inBracket = false; let bracketDepth = 0; let braceDepth = 0; for (let i = 0; i < pattern.length; i++) { @@ -462,10 +485,16 @@ function validateGlobPattern(pattern: string): string | undefined { i++; continue; } - if (ch === '[') bracketDepth++; - else if (ch === ']' && bracketDepth > 0) bracketDepth--; - else if (ch === '{') braceDepth++; - else if (ch === '}' && braceDepth > 0) braceDepth--; + if (ch === '[') { + inBracket = true; + bracketDepth++; + } else if (ch === ']' && bracketDepth > 0) { + inBracket = false; + bracketDepth--; + } else if (!inBracket) { + if (ch === '{') braceDepth++; + else if (ch === '}' && braceDepth > 0) braceDepth--; + } } if (bracketDepth > 0) return `Invalid glob pattern: unclosed '[' in "${pattern}"`; if (braceDepth > 0) return `Invalid glob pattern: unclosed '{' in "${pattern}"`; diff --git a/packages/agent-core/src/tools/support/run-rg.ts b/packages/agent-core/src/tools/support/run-rg.ts index 77cf2a6fe..510d2157f 100644 --- a/packages/agent-core/src/tools/support/run-rg.ts +++ b/packages/agent-core/src/tools/support/run-rg.ts @@ -244,14 +244,18 @@ async function readStreamWithCap( const chunks: Buffer[] = []; let total = 0; let truncated = false; + // Use a streaming TextDecoder so multibyte UTF-8 sequences split across + // chunk boundaries are decoded correctly instead of producing replacement + // characters. + const decoder = lineFilter ? new TextDecoder('utf8', { fatal: false }) : undefined; let lineBuffer = ''; try { for await (const chunk of stream) { const buf: Buffer = typeof chunk === 'string' ? Buffer.from(chunk, 'utf8') : (chunk as Buffer); if (truncated) continue; - if (lineFilter) { - lineBuffer += buf.toString('utf8'); + if (lineFilter && decoder) { + lineBuffer += decoder.decode(buf, { stream: true }); const lines = lineBuffer.split('\n'); lineBuffer = lines.pop()!; for (const line of lines) { @@ -282,6 +286,8 @@ async function readStreamWithCap( } } if (!truncated && lineBuffer.length > 0) { + // Flush any remaining bytes from the decoder. + if (decoder) lineBuffer += decoder.decode(); if (!lineFilter || lineFilter(lineBuffer)) { const lineBytes = Buffer.from(lineBuffer, 'utf8'); if (total + lineBytes.length > maxBytes) { diff --git a/packages/agent-core/test/tools/glob.test.ts b/packages/agent-core/test/tools/glob.test.ts index 6e33acdbe..8249642af 100644 --- a/packages/agent-core/test/tools/glob.test.ts +++ b/packages/agent-core/test/tools/glob.test.ts @@ -511,6 +511,58 @@ describe('GlobTool', () => { expect(exec).not.toHaveBeenCalled(); }); + it('accepts brace-in-bracket patterns like [{]foo.ts', async () => { + const exec = execReturning('/workspace/{foo.ts\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '[{]foo.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('{foo.ts'); + }); + + it('matches rooted patterns with a leading slash', async () => { + const exec = execReturning('/workspace/src/a.ts\n/workspace/other/b.ts\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '/src/*.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('src/a.ts'); + expect(result.output).not.toContain('other/b.ts'); + }); + + it('decodes multibyte filenames split across stream chunks', async () => { + // Split a multibyte filename across two chunks so naive buf.toString + // would produce a replacement character. + const fullLine = '/workspace/src/é.ts\n'; + const fullBuf = Buffer.from(fullLine, 'utf8'); + const splitPoint = fullBuf.indexOf(0xc3); // first byte of é (0xc3 0xa9) + const chunk1 = fullBuf.subarray(0, splitPoint + 1); // splits the multibyte char + const chunk2 = fullBuf.subarray(splitPoint + 1); + const stdoutStream = Readable.from([chunk1, chunk2]); + const exec = vi.fn().mockResolvedValue({ + ...processWithOutput(''), + stdout: stdoutStream, + }); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: 'src/*.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('src/é.ts'); + expect(result.output).not.toContain('\uFFFD'); + }); + it('surfaces ripgrep errors when no complete path is produced', async () => { const exec = execReturning('', 'error: something went wrong', 2); const tool = new GlobTool(kaosWithExec(exec), workspace); From a600edbf35286dd7503665ccc57158e2e5a17fa8 Mon Sep 17 00:00:00 2001 From: morluto Date: Wed, 1 Jul 2026 06:27:16 +0000 Subject: [PATCH 06/13] perf: compile glob matcher once per search instead of per line Replace matchUserPattern (which called picomatch.isMatch on every line, reparsing/compiling the pattern each time) with compileGlobMatcher, which compiles the picomatch matcher once and returns a reusable test function. filterByPattern and makeLineFilter now build the matcher once per search and call the compiled function for each path line. In large trees this avoids hundreds of thousands of redundant pattern compilations that could burn the Glob timeout in JS rather than in rg. --- .../agent-core/src/tools/builtin/file/glob.ts | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/packages/agent-core/src/tools/builtin/file/glob.ts b/packages/agent-core/src/tools/builtin/file/glob.ts index 3be2698b6..3490c11c5 100644 --- a/packages/agent-core/src/tools/builtin/file/glob.ts +++ b/packages/agent-core/src/tools/builtin/file/glob.ts @@ -365,8 +365,8 @@ function isBroadPattern(pattern: string): boolean { } /** - * Match a file path against the user's glob pattern, using the same - * semantics ripgrep's `--glob` would have applied: + * Compile a glob matcher for the user's pattern, using the same semantics + * ripgrep's `--glob` would have applied: * - Patterns without `/` match the basename at any depth. * - Patterns with `/` match the relative path from the search root. * - A leading `/` is stripped — ripgrep treats `/src/*.ts` as rooted at @@ -376,15 +376,20 @@ function isBroadPattern(pattern: string): boolean { * - Matching is case-sensitive, matching ripgrep's default (use * `--glob-case-insensitive` / `--iglob` for case-insensitive mode). * - Brace expansion (`*.{ts,tsx}`) is handled by picomatch natively. + * + * Returns a function that takes a relative path and returns whether it + * matches. The picomatch matcher is compiled once so large trees don't + * reparse the pattern on every line. */ -function matchUserPattern(relPath: string, pattern: string): boolean { +function compileGlobMatcher(pattern: string): (relPath: string) => boolean { const opts = { dot: true }; const normalizedPattern = pattern.startsWith('/') ? pattern.slice(1) : pattern; if (!normalizedPattern.includes('/')) { - const basename = relPath.split('/').pop()!; - return picomatch.isMatch(basename, normalizedPattern, opts); + const matcher = picomatch(normalizedPattern, opts); + return (relPath: string) => matcher(relPath.split('/').pop()!); } - return picomatch.isMatch(relPath, normalizedPattern, opts); + const matcher = picomatch(normalizedPattern, opts); + return (relPath: string) => matcher(relPath); } /** @@ -402,11 +407,12 @@ function filterByPattern( pathClass: PathClass, ): string[] { if (isBroadPattern(pattern)) return absPaths; + const matches = compileGlobMatcher(pattern); const result: string[] = []; for (const absPath of absPaths) { const rel = relativePath(searchRoot, absPath, pathClass); if (rel === undefined) continue; - if (matchUserPattern(rel, pattern)) result.push(absPath); + if (matches(rel)) result.push(absPath); } return result; } @@ -452,6 +458,7 @@ function makeLineFilter( searchRoot: string, ): ((line: string) => boolean) | undefined { if (isBroadPattern(pattern)) return undefined; + const matches = compileGlobMatcher(pattern); return (line: string): boolean => { let relPath = line; if (pathClass === 'win32') relPath = relPath.replace(/\\/g, '/'); @@ -462,7 +469,7 @@ function makeLineFilter( if (rel === undefined) return false; relPath = rel; } - return matchUserPattern(relPath, pattern); + return matches(relPath); }; } From d810f4e976d8f0148ceb5d84156dba6493362757 Mon Sep 17 00:00:00 2001 From: morluto Date: Wed, 1 Jul 2026 06:56:34 +0000 Subject: [PATCH 07/13] fix: preserve root anchoring for leading-slash basename globs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A rooted basename pattern like /foo.ts was stripped to foo.ts, which has no / and therefore fell into the basename-only branch — matching sub/foo.ts and deep/nested/foo.ts as well as the root file. rg treats /foo.ts as a gitignore-style rooted glob that only matches at the search root. Track whether the original pattern was rooted and use full-relative-path matching in that case so the in-process matcher matches rg's behavior. --- .../agent-core/src/tools/builtin/file/glob.ts | 8 ++++++-- packages/agent-core/test/tools/glob.test.ts | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/packages/agent-core/src/tools/builtin/file/glob.ts b/packages/agent-core/src/tools/builtin/file/glob.ts index 3490c11c5..e111e02f0 100644 --- a/packages/agent-core/src/tools/builtin/file/glob.ts +++ b/packages/agent-core/src/tools/builtin/file/glob.ts @@ -383,8 +383,12 @@ function isBroadPattern(pattern: string): boolean { */ function compileGlobMatcher(pattern: string): (relPath: string) => boolean { const opts = { dot: true }; - const normalizedPattern = pattern.startsWith('/') ? pattern.slice(1) : pattern; - if (!normalizedPattern.includes('/')) { + const rooted = pattern.startsWith('/'); + const normalizedPattern = rooted ? pattern.slice(1) : pattern; + // A pattern without `/` matches the basename at any depth — unless the + // original pattern was rooted with a leading `/`, in which case it + // matches only at the search root (gitignore-style rooted globs). + if (!normalizedPattern.includes('/') && !rooted) { const matcher = picomatch(normalizedPattern, opts); return (relPath: string) => matcher(relPath.split('/').pop()!); } diff --git a/packages/agent-core/test/tools/glob.test.ts b/packages/agent-core/test/tools/glob.test.ts index 8249642af..8554abb25 100644 --- a/packages/agent-core/test/tools/glob.test.ts +++ b/packages/agent-core/test/tools/glob.test.ts @@ -538,6 +538,23 @@ describe('GlobTool', () => { expect(result.output).not.toContain('other/b.ts'); }); + it('rooted basename pattern only matches at the search root', async () => { + const exec = execReturning( + '/workspace/foo.ts\n/workspace/sub/foo.ts\n/workspace/deep/nested/foo.ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '/foo.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('foo.ts'); + expect(result.output).not.toContain('sub/foo.ts'); + expect(result.output).not.toContain('deep/nested/foo.ts'); + }); + it('decodes multibyte filenames split across stream chunks', async () => { // Split a multibyte filename across two chunks so naive buf.toString // would produce a replacement character. From 248855a554c2286aa8eb89bd0e92dd6c917abfec Mon Sep 17 00:00:00 2001 From: morluto Date: Wed, 1 Jul 2026 07:38:03 +0000 Subject: [PATCH 08/13] fix: accept bracket escapes, disable picomatch extglob/range braces, narrow search paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit validateGlobPattern(): - Inside a character class, [ is a literal character, not a second opening bracket. Previously [[]foo.ts was rejected as unclosed because the inner [ incremented bracketDepth a second time and the single ] only decremented it once. Now only ] closes a character class, so [[]foo.ts is accepted as a valid gitignore-style escape that matches a literal [. compileGlobMatcher() / escapePicomatchExtensions(): - picomatch's extglob: false and rangeBraces: false options have no effect — the patterns are still parsed as extglob/range. Preprocess the pattern to escape these constructs so picomatch treats them as literal, matching rg --glob behavior: @(a|b) -> [@]\(a\|b\) (literal, not extglob) {1..2} -> \{1..2\} (literal, not range expansion) Brace alternatives like {ts,tsx} are left intact (supported by both). buildRgArgs() / deriveSearchPath(): - Derive a safe literal search path from the pattern prefix and pass it to rg instead of always searching from '.'. For example, src/**/*.ts now searches 'src' instead of '.', so rg only walks the relevant subtree instead of collecting and sorting unrelated files before the streaming filter can see anything. Falls back to '.' when no literal prefix can be derived (e.g. *.ts, **/*.ts). --- .../agent-core/src/tools/builtin/file/glob.ts | 95 ++++++++++++++++--- packages/agent-core/test/tools/glob.test.ts | 61 +++++++++++- 2 files changed, 143 insertions(+), 13 deletions(-) diff --git a/packages/agent-core/src/tools/builtin/file/glob.ts b/packages/agent-core/src/tools/builtin/file/glob.ts index e111e02f0..c02571e2d 100644 --- a/packages/agent-core/src/tools/builtin/file/glob.ts +++ b/packages/agent-core/src/tools/builtin/file/glob.ts @@ -354,12 +354,50 @@ function buildRgArgs( cmd.push('--glob', `!${glob}`); } if (args.include_ignored) cmd.push('--no-ignore'); - // Search path is `.` because the process cwd is pinned to the search root - // (see execution()); this keeps `--glob` matching relative to that root. - cmd.push('.'); + // Derive a safe literal search path from the pattern so rg only walks + // the relevant subtree instead of the entire root. For example, + // `src/**/*.ts` → search path `src` instead of `.`. This prevents + // anchored searches from collecting and sorting unrelated files before + // the streaming filter can see anything. + const searchPath = deriveSearchPath(args.pattern); + cmd.push(searchPath); return cmd; } +/** + * Extract the leading literal path component(s) from a glob pattern, + * before any glob metacharacter. Returns the path relative to the search + * root, or '.' if no literal prefix can be derived. + * + * Examples: + * src/.../.ts -> src (double-star in the middle) + * /src/.ts -> src (leading slash stripped, rooted) + * src/a/b.ts -> src/a (no metacharacters, strip trailing file) + * star.ts -> . (starts with a metacharacter) + * src/{a,b}.ts -> src (brace is a metacharacter) + */ +function deriveSearchPath(pattern: string): string { + const rooted = pattern.startsWith('/'); + const normalized = rooted ? pattern.slice(1) : pattern; + // Find the first glob metacharacter: *, ?, [, {, or a ** segment. + const metaIndex = normalized.search(/[*?\[{]/); + if (metaIndex === -1) { + // No metacharacters — the whole pattern is a literal path. But if it + // contains no `/`, it's a basename pattern that could match at any + // depth, so we can't narrow the search path. + if (!normalized.includes('/')) return '.'; + // Strip a trailing filename (last path component) since the parent + // directory is the narrowest safe search path for a single file. + const lastSlash = normalized.lastIndexOf('/'); + return lastSlash > 0 ? normalized.slice(0, lastSlash) : '.'; + } + // Narrow to the literal prefix before the first metacharacter. + const prefix = normalized.slice(0, metaIndex); + const lastSlash = prefix.lastIndexOf('/'); + if (lastSlash <= 0) return '.'; + return prefix.slice(0, lastSlash); +} + function isBroadPattern(pattern: string): boolean { return pattern === '*' || pattern === '**' || pattern === '**/*'; } @@ -385,17 +423,45 @@ function compileGlobMatcher(pattern: string): (relPath: string) => boolean { const opts = { dot: true }; const rooted = pattern.startsWith('/'); const normalizedPattern = rooted ? pattern.slice(1) : pattern; + // Escape picomatch-only extensions that rg --glob does not support: + // - extglob: @(a|b), !(a), +(a), ?(a), *(a) — rg treats these as + // literal characters, not extended glob patterns. + // - range braces: {1..2} — rg treats {1..2} as a literal, not a range. + // Brace alternatives like {ts,tsx} are still supported by both rg and + // picomatch, so they are left intact. + const escapedPattern = escapePicomatchExtensions(normalizedPattern); // A pattern without `/` matches the basename at any depth — unless the // original pattern was rooted with a leading `/`, in which case it // matches only at the search root (gitignore-style rooted globs). if (!normalizedPattern.includes('/') && !rooted) { - const matcher = picomatch(normalizedPattern, opts); + const matcher = picomatch(escapedPattern, opts); return (relPath: string) => matcher(relPath.split('/').pop()!); } - const matcher = picomatch(normalizedPattern, opts); + const matcher = picomatch(escapedPattern, opts); return (relPath: string) => matcher(relPath); } +/** + * Escape picomatch-only glob extensions that rg --glob does not support, + * so the in-process matcher matches the same files rg would. + * + * - extglob constructs like @(a|b), !(a), +(a), ?(a), *(a) are escaped + * so picomatch treats them as literal characters. + * - range braces like {1..2} are escaped so picomatch treats them as + * literal strings instead of expanding to 1, 2. + * - brace alternatives like {ts,tsx} are left intact (supported by both). + */ +function escapePicomatchExtensions(pattern: string): string { + let result = pattern; + // Escape extglob: [@!?+*](...) → literal + result = result.replace(/([@!?+*])\(([^)]*)\)/g, (_match, prefix: string, inner: string) => { + return `[${prefix}]\\(${inner.replace(/\|/g, '\\|')}\\)`; + }); + // Escape range braces: {N..M} → literal + result = result.replace(/\{(\d+)\.\.(\d+)\}/g, '\\{$1..$2\\}'); + return result; +} + /** * Filter absolute paths from `rg --files` against the user's positive glob * pattern. Broad patterns (star, double-star, star-slash-star) match @@ -496,15 +562,20 @@ function validateGlobPattern(pattern: string): string | undefined { i++; continue; } - if (ch === '[') { + if (inBracket) { + // Inside a character class, [ is a literal (e.g. [[] matches a + // literal [). Only ] closes the class. + if (ch === ']') { + inBracket = false; + bracketDepth--; + } + } else if (ch === '[') { inBracket = true; bracketDepth++; - } else if (ch === ']' && bracketDepth > 0) { - inBracket = false; - bracketDepth--; - } else if (!inBracket) { - if (ch === '{') braceDepth++; - else if (ch === '}' && braceDepth > 0) braceDepth--; + } else if (ch === '{') { + braceDepth++; + } else if (ch === '}' && braceDepth > 0) { + braceDepth--; } } if (bracketDepth > 0) return `Invalid glob pattern: unclosed '[' in "${pattern}"`; diff --git a/packages/agent-core/test/tools/glob.test.ts b/packages/agent-core/test/tools/glob.test.ts index 8554abb25..cf7e0c322 100644 --- a/packages/agent-core/test/tools/glob.test.ts +++ b/packages/agent-core/test/tools/glob.test.ts @@ -235,7 +235,8 @@ describe('GlobTool', () => { const result = await executeTool(tool, context({ pattern: 'pkg/**/*.ts', path: '/extra' })); expect(result.output).toBe('/extra/pkg/a.ts'); - expect(execArgs(exec).at(-1)).toBe('.'); + // The search path is derived from the pattern prefix: pkg/**/*.ts -> pkg + expect(execArgs(exec).at(-1)).toBe('pkg'); }); it('adds --no-ignore when include_ignored is true', async () => { @@ -524,6 +525,64 @@ describe('GlobTool', () => { expect(result.output).toContain('{foo.ts'); }); + it('accepts bracket-in-bracket patterns like [[]foo.ts', async () => { + const exec = execReturning('/workspace/[foo.ts\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '[[]foo.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('[foo.ts'); + }); + + it('treats extglob syntax as literal, matching rg --glob behavior', async () => { + const exec = execReturning('/workspace/@(a|b).ts\n/workspace/a.ts\n/workspace/b.ts\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '@(a|b).ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('@(a|b).ts'); + expect(result.output).not.toContain('a.ts'); + expect(result.output).not.toContain('b.ts'); + }); + + it('treats range braces as literal, matching rg --glob behavior', async () => { + const exec = execReturning('/workspace/{1..2}.ts\n/workspace/1.ts\n/workspace/2.ts\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '{1..2}.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('{1..2}.ts'); + expect(result.output).not.toContain('1.ts'); + expect(result.output).not.toContain('2.ts'); + }); + + it('derives a search path from anchored patterns to narrow rg traversal', async () => { + const exec = execReturning('/workspace/src/a.ts\n/workspace/other/b.ts\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: 'src/**/*.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('src/a.ts'); + // The search path should be 'src', not '.' + expect(execArgs(exec).at(-1)).toBe('src'); + }); + it('matches rooted patterns with a leading slash', async () => { const exec = execReturning('/workspace/src/a.ts\n/workspace/other/b.ts\n'); const tool = new GlobTool(kaosWithExec(exec), workspace); From 71c9af3113c12a48c14f313cfc774dc16b811d32 Mon Sep 17 00:00:00 2001 From: morluto Date: Wed, 1 Jul 2026 08:55:52 +0000 Subject: [PATCH 09/13] fix: harden glob search path, extglob/range escaping, and pattern validation Address PR #1245 review comments: - Remove deriveSearchPath and always search from `.`. A derived prefix passed as the rg PATH overrides ignore rules (command-line paths are authoritative in rg), allows `../outside/**` to escape the authorized tree, and errors out on non-existent prefixes. The streaming line filter already prevents the output cap from being exhausted. - Fix relativePath to only reject actual escapes (`..` or `../`), not files whose names start with two dots (e.g. `..config/a.ts`). - Preserve `*` and `?` wildcards before literal parentheses. rg treats them as regular wildcards and `(` as a literal character, not as extglob prefixes. Only `@`, `!`, `+` are escaped as literal prefixes. - Match rg's range-brace behavior: `{N..M}` and `{foo}` are single brace alternatives with the braces removed (matching `N..M` / `foo`), not literal braces or picomatch ranges. - Reject malformed character classes (`[]`, `[!]`, `[^]`, `[z-a]`) and dangling trailing backslashes before running ripgrep, matching rg's hard parse errors instead of falling through to picomatch. --- .changeset/glob-ignore-files.md | 2 +- .../agent-core/src/tools/builtin/file/glob.ts | 155 +++++++----- packages/agent-core/test/tools/glob.test.ts | 237 +++++++++++++++++- 3 files changed, 319 insertions(+), 75 deletions(-) diff --git a/.changeset/glob-ignore-files.md b/.changeset/glob-ignore-files.md index 0a836ca85..0758faf87 100644 --- a/.changeset/glob-ignore-files.md +++ b/.changeset/glob-ignore-files.md @@ -2,4 +2,4 @@ "@moonshot-ai/kimi-code": patch --- -Fix glob file searches so all patterns continue to respect ignore files. A positive ripgrep `--glob` always overrides ignore logic, so any non-broad user pattern (e.g. `*.ts`, `dist/**/*.js`) could re-include files excluded by `.gitignore`. The pattern is now filtered in-process after `rg --files` enumerates non-ignored files. +Fix glob file searches so all patterns continue to respect ignore files. A positive ripgrep `--glob` always overrides ignore logic, so any non-broad user pattern (e.g. `*.ts`, `dist/**/*.js`) could re-include files excluded by `.gitignore`. The pattern is now filtered in-process after `rg --files` enumerates non-ignored files. Also fixes several glob edge cases: the search path is always `.` so derived prefixes cannot override ignore rules or escape the authorized tree; `*` and `?` before literal parentheses are preserved as wildcards (not treated as extglob prefixes); range braces `{N..M}` match rg's single-alternative behavior; malformed character classes (`[]`, `[!]`, `[z-a]`, dangling `\`) are rejected before running ripgrep; and files whose names start with `..` are no longer dropped as escapes. diff --git a/packages/agent-core/src/tools/builtin/file/glob.ts b/packages/agent-core/src/tools/builtin/file/glob.ts index c02571e2d..fb24a9b75 100644 --- a/packages/agent-core/src/tools/builtin/file/glob.ts +++ b/packages/agent-core/src/tools/builtin/file/glob.ts @@ -354,50 +354,19 @@ function buildRgArgs( cmd.push('--glob', `!${glob}`); } if (args.include_ignored) cmd.push('--no-ignore'); - // Derive a safe literal search path from the pattern so rg only walks - // the relevant subtree instead of the entire root. For example, - // `src/**/*.ts` → search path `src` instead of `.`. This prevents - // anchored searches from collecting and sorting unrelated files before - // the streaming filter can see anything. - const searchPath = deriveSearchPath(args.pattern); - cmd.push(searchPath); + // Search path is `.` because the process cwd is pinned to the search root + // (see execution()). Passing a derived subdirectory as the rg PATH would + // override ignore rules — rg treats command-line paths as authoritative, + // so `rg --files dist` traverses a gitignored `dist/` even with `--glob + // !dist`. It would also allow patterns like `../outside/**` to escape the + // authorized search tree, and error out on non-existent prefixes. The + // streaming line filter (makeLineFilter) already prevents the output cap + // from being exhausted by non-matching paths, so narrowing the traversal + // is not needed for correctness. + cmd.push('.'); return cmd; } -/** - * Extract the leading literal path component(s) from a glob pattern, - * before any glob metacharacter. Returns the path relative to the search - * root, or '.' if no literal prefix can be derived. - * - * Examples: - * src/.../.ts -> src (double-star in the middle) - * /src/.ts -> src (leading slash stripped, rooted) - * src/a/b.ts -> src/a (no metacharacters, strip trailing file) - * star.ts -> . (starts with a metacharacter) - * src/{a,b}.ts -> src (brace is a metacharacter) - */ -function deriveSearchPath(pattern: string): string { - const rooted = pattern.startsWith('/'); - const normalized = rooted ? pattern.slice(1) : pattern; - // Find the first glob metacharacter: *, ?, [, {, or a ** segment. - const metaIndex = normalized.search(/[*?\[{]/); - if (metaIndex === -1) { - // No metacharacters — the whole pattern is a literal path. But if it - // contains no `/`, it's a basename pattern that could match at any - // depth, so we can't narrow the search path. - if (!normalized.includes('/')) return '.'; - // Strip a trailing filename (last path component) since the parent - // directory is the narrowest safe search path for a single file. - const lastSlash = normalized.lastIndexOf('/'); - return lastSlash > 0 ? normalized.slice(0, lastSlash) : '.'; - } - // Narrow to the literal prefix before the first metacharacter. - const prefix = normalized.slice(0, metaIndex); - const lastSlash = prefix.lastIndexOf('/'); - if (lastSlash <= 0) return '.'; - return prefix.slice(0, lastSlash); -} - function isBroadPattern(pattern: string): boolean { return pattern === '*' || pattern === '**' || pattern === '**/*'; } @@ -445,20 +414,41 @@ function compileGlobMatcher(pattern: string): (relPath: string) => boolean { * Escape picomatch-only glob extensions that rg --glob does not support, * so the in-process matcher matches the same files rg would. * - * - extglob constructs like @(a|b), !(a), +(a), ?(a), *(a) are escaped - * so picomatch treats them as literal characters. - * - range braces like {1..2} are escaped so picomatch treats them as - * literal strings instead of expanding to 1, 2. + * - extglob constructs like @(a|b), !(a), +(a) are escaped so picomatch + * treats the prefix character and parentheses as literal. `@`, `!`, `+` + * are literal characters in rg's glob engine. + * - `*` and `?` before `(` are NOT extglob prefixes in rg — they are + * regular wildcards, and `(` is a literal character. Only the + * parentheses are escaped so picomatch doesn't interpret `*(...)` or + * `?(...)` as extglobs, but the wildcard semantics of `*` and `?` are + * preserved. + * - range braces like {1..2} are reduced to `1..2` (braces removed) to + * match rg, which treats a brace group with a single alternative (no + * comma) as that alternative with the braces stripped. * - brace alternatives like {ts,tsx} are left intact (supported by both). */ function escapePicomatchExtensions(pattern: string): string { let result = pattern; - // Escape extglob: [@!?+*](...) → literal - result = result.replace(/([@!?+*])\(([^)]*)\)/g, (_match, prefix: string, inner: string) => { - return `[${prefix}]\\(${inner.replace(/\|/g, '\\|')}\\)`; + // Escape extglob: [@!+](...) → literal prefix + literal parens. + // `*` and `?` are wildcards in rg, so only the parens are escaped. + result = result.replaceAll(/([@!?+*])\(([^)]*)\)/g, (_match, prefix: string, inner: string) => { + const escapedInner = inner.replaceAll('|', '\\|'); + if (prefix === '*' || prefix === '?') { + return `${prefix}\\(${escapedInner}\\)`; + } + return `[${prefix}]\\(${escapedInner}\\)`; + }); + // Range braces and single-alternative braces: rg treats `{N..M}` and + // `{foo}` as a single brace alternative, removing the braces (matching + // `N..M` or `foo`, not `{N..M}` or `{foo}`). picomatch 4.x either expands + // `{a..z}` as a range or treats `{foo}` as a literal — neither matches + // rg. Reduce to the inner text so picomatch matches the same literal. + // Escaped braces (`\{...\}`) are left intact via the lookbehind. + result = result.replaceAll(/(? { + // Brace alternatives with commas are expanded by both rg and picomatch. + if (inner.includes(',')) return _match; + return inner; }); - // Escape range braces: {N..M} → literal - result = result.replace(/\{(\d+)\.\.(\d+)\}/g, '\\{$1..$2\\}'); return result; } @@ -499,7 +489,11 @@ function relativePath(searchRoot: string, absPath: string, pathClass: PathClass) const normRoot = normalize(searchRoot); if (pathClass !== 'win32') { const rel = relative(normRoot, normAbs); - return rel.startsWith('..') ? undefined : rel; + // Only reject paths that actually escape the root: `..` (the parent + // itself) or `../…` (something inside the parent). A file whose name + // starts with two dots — e.g. `..config/a.ts` — is under the root and + // must be kept. + return rel === '..' || rel.startsWith('../') ? undefined : rel; } const lowerAbs = normAbs.toLowerCase(); const lowerRoot = normRoot.toLowerCase(); @@ -531,7 +525,7 @@ function makeLineFilter( const matches = compileGlobMatcher(pattern); return (line: string): boolean => { let relPath = line; - if (pathClass === 'win32') relPath = relPath.replace(/\\/g, '/'); + if (pathClass === 'win32') relPath = relPath.replaceAll('\\', '/'); if (relPath.startsWith('./') || relPath.startsWith('.\\')) { relPath = relPath.slice(2); } else if (isAbsolute(relPath)) { @@ -544,34 +538,71 @@ function makeLineFilter( } /** - * Validate a glob pattern for common malformed syntax (unclosed `[` or `{`). - * Returns an error message if the pattern is invalid, or `undefined` if it - * is well-formed. This mirrors ripgrep's globset parser, which rejects - * unclosed character classes and brace expansions with a hard error — - * picomatch would silently treat them as literals and report "No matches - * found" instead. Braces inside a character class (`[{]foo`) are literal - * characters and do not count toward brace depth. + * Validate a glob pattern for common malformed syntax. Returns an error + * message if the pattern is invalid, or `undefined` if it is well-formed. + * This mirrors ripgrep's globset parser, which rejects these with a hard + * error — picomatch would silently treat them as literals and report "No + * matches found" instead. + * + * Detected errors: + * - Unclosed `[` or `{` (balanced tracking). + * - Empty character class: `[]`, `[!]`, `[^]` — a `]` right after the + * opening bracket prefix (`[`, `[!`, `[^`) is a literal `]`, not the + * terminator, so the class is unclosed. + * - Invalid range: `[z-a]` where the start character is greater than the + * end character. + * - Dangling backslash: a trailing `\` with no character to escape. + * + * Braces inside a character class (`[{]foo`) are literal characters and do + * not count toward brace depth. */ function validateGlobPattern(pattern: string): string | undefined { let inBracket = false; let bracketDepth = 0; let braceDepth = 0; + // Position of the first content character inside the current bracket + // (after `[`, `[!`, or `[^`). A `]` at this position is a literal, not + // the class terminator. + let bracketContentStart = -1; + // Last unescaped character seen inside the current bracket, for range + // validation. Reset to '' on bracket open or after an escape. + let bracketPrevChar = ''; for (let i = 0; i < pattern.length; i++) { - const ch = pattern[i]; + const ch = pattern[i]!; if (ch === '\\') { + if (i === pattern.length - 1) { + return `Invalid glob pattern: dangling '\\' in "${pattern}"`; + } i++; + bracketPrevChar = ''; continue; } if (inBracket) { - // Inside a character class, [ is a literal (e.g. [[] matches a - // literal [). Only ] closes the class. if (ch === ']') { + if (i === bracketContentStart) { + // Literal ] — the class is still open. + bracketPrevChar = ']'; + continue; + } inBracket = false; bracketDepth--; + bracketPrevChar = ''; + } else if (ch === '-' && bracketPrevChar !== '' && bracketPrevChar !== '-') { + const nextCh = pattern[i + 1]; + if (nextCh !== undefined && nextCh !== ']' && nextCh !== '\\') { + if (bracketPrevChar > nextCh) { + return `Invalid glob pattern: invalid range '${bracketPrevChar}-${nextCh}' in "${pattern}"`; + } + } + } else { + bracketPrevChar = ch; } } else if (ch === '[') { inBracket = true; bracketDepth++; + const next = pattern[i + 1]; + bracketContentStart = next === '!' || next === '^' ? i + 2 : i + 1; + bracketPrevChar = ''; } else if (ch === '{') { braceDepth++; } else if (ch === '}' && braceDepth > 0) { diff --git a/packages/agent-core/test/tools/glob.test.ts b/packages/agent-core/test/tools/glob.test.ts index cf7e0c322..e7e8eacb4 100644 --- a/packages/agent-core/test/tools/glob.test.ts +++ b/packages/agent-core/test/tools/glob.test.ts @@ -235,8 +235,9 @@ describe('GlobTool', () => { const result = await executeTool(tool, context({ pattern: 'pkg/**/*.ts', path: '/extra' })); expect(result.output).toBe('/extra/pkg/a.ts'); - // The search path is derived from the pattern prefix: pkg/**/*.ts -> pkg - expect(execArgs(exec).at(-1)).toBe('pkg'); + // The search path is always `.` (pinned to the search root via cwd). + // A derived subdirectory path would override ignore rules in rg. + expect(execArgs(exec).at(-1)).toBe('.'); }); it('adds --no-ignore when include_ignored is true', async () => { @@ -512,6 +513,66 @@ describe('GlobTool', () => { expect(exec).not.toHaveBeenCalled(); }); + it('rejects empty character classes like []', async () => { + const exec = vi.fn(); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool(tool, context({ pattern: '[]', path: '/workspace' })); + + expect(result).toMatchObject({ isError: true }); + expect(result.output).toContain('Invalid glob pattern'); + expect(result.output).toContain('unclosed'); + expect(exec).not.toHaveBeenCalled(); + }); + + it('rejects negated empty character classes like [!]', async () => { + const exec = vi.fn(); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool(tool, context({ pattern: '[!]', path: '/workspace' })); + + expect(result).toMatchObject({ isError: true }); + expect(result.output).toContain('Invalid glob pattern'); + expect(result.output).toContain('unclosed'); + expect(exec).not.toHaveBeenCalled(); + }); + + it('rejects caret empty character classes like [^]', async () => { + const exec = vi.fn(); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool(tool, context({ pattern: '[^]', path: '/workspace' })); + + expect(result).toMatchObject({ isError: true }); + expect(result.output).toContain('Invalid glob pattern'); + expect(result.output).toContain('unclosed'); + expect(exec).not.toHaveBeenCalled(); + }); + + it('rejects invalid character ranges like [z-a]', async () => { + const exec = vi.fn(); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool(tool, context({ pattern: '[z-a]', path: '/workspace' })); + + expect(result).toMatchObject({ isError: true }); + expect(result.output).toContain('Invalid glob pattern'); + expect(result.output).toContain('invalid range'); + expect(exec).not.toHaveBeenCalled(); + }); + + it('rejects a dangling trailing backslash', async () => { + const exec = vi.fn(); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool(tool, context({ pattern: 'foo\\', path: '/workspace' })); + + expect(result).toMatchObject({ isError: true }); + expect(result.output).toContain('Invalid glob pattern'); + expect(result.output).toContain('dangling'); + expect(exec).not.toHaveBeenCalled(); + }); + it('accepts brace-in-bracket patterns like [{]foo.ts', async () => { const exec = execReturning('/workspace/{foo.ts\n'); const tool = new GlobTool(kaosWithExec(exec), workspace); @@ -553,22 +614,80 @@ describe('GlobTool', () => { expect(result.output).not.toContain('b.ts'); }); - it('treats range braces as literal, matching rg --glob behavior', async () => { - const exec = execReturning('/workspace/{1..2}.ts\n/workspace/1.ts\n/workspace/2.ts\n'); + it('preserves * and ? wildcards before literal parentheses, matching rg --glob', async () => { + // rg treats `*` and `?` as regular wildcards and `(` as a literal + // character — NOT as extglob prefixes. So `*(bar).ts` matches any file + // ending in `(bar).ts` (the `*` matches any prefix), including + // `foo123(bar).ts` which the old `[*]\(bar\).ts` escape would NOT match. + const exec = execReturning( + '/workspace/foo*(bar).ts\n/workspace/foo123(bar).ts\n/workspace/x(bar).ts\n/workspace/(bar).ts\n/workspace/other.ts\n', + ); const tool = new GlobTool(kaosWithExec(exec), workspace); const result = await executeTool( tool, - context({ pattern: '{1..2}.ts', path: '/workspace' }), + context({ pattern: '*(bar).ts', path: '/workspace' }), ); expect(result.isError).toBeFalsy(); - expect(result.output).toContain('{1..2}.ts'); - expect(result.output).not.toContain('1.ts'); - expect(result.output).not.toContain('2.ts'); - }); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('foo*(bar).ts'); + expect(lines).toContain('foo123(bar).ts'); + expect(lines).toContain('x(bar).ts'); + expect(lines).toContain('(bar).ts'); + expect(lines).not.toContain('other.ts'); + }); + + it('preserves ? wildcard before literal parentheses, matching rg --glob', async () => { + // `?(bar).ts` — `?` is a single-char wildcard, `(` is literal. Matches + // `x(bar).ts` (one char) and `?(bar).ts` (literal ?), but not + // `yz(bar).ts` (two chars) or `(bar).ts` (zero chars). + const exec = execReturning( + '/workspace/x(bar).ts\n/workspace/?(bar).ts\n/workspace/yz(bar).ts\n/workspace/(bar).ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '?(bar).ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('x(bar).ts'); + expect(lines).toContain('?(bar).ts'); + expect(lines).not.toContain('yz(bar).ts'); + expect(lines).not.toContain('(bar).ts'); + }); + + it('treats range braces as a single alternative, matching rg --glob behavior', async () => { + // rg treats `{1..2}` as a single brace alternative, removing the braces + // (matching `1..2`, not `{1..2}` and not the range 1..2). picomatch 4.x + // would either expand the range or treat the braces as literal, so the + // in-process matcher collapses single-alternative braces to match rg. + const exec = execReturning( + '/workspace/1..2.ts\n/workspace/{1..2}.ts\n/workspace/1.ts\n/workspace/2.ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); - it('derives a search path from anchored patterns to narrow rg traversal', async () => { + const result = await executeTool( + tool, + context({ pattern: '{1..2}.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('1..2.ts'); + expect(result.output).not.toContain('{1..2}.ts'); + // `1.ts` and `2.ts` (standalone) should not match — only `1..2.ts`. + // Use line-level checks to avoid substring false positives. + const lines = (result.output as string).split('\n'); + expect(lines).toContain('1..2.ts'); + expect(lines).not.toContain('1.ts'); + expect(lines).not.toContain('2.ts'); + expect(lines).not.toContain('{1..2}.ts'); + }); + + it('always searches from `.` so derived paths cannot override ignore rules', async () => { const exec = execReturning('/workspace/src/a.ts\n/workspace/other/b.ts\n'); const tool = new GlobTool(kaosWithExec(exec), workspace); @@ -579,8 +698,9 @@ describe('GlobTool', () => { expect(result.isError).toBeFalsy(); expect(result.output).toContain('src/a.ts'); - // The search path should be 'src', not '.' - expect(execArgs(exec).at(-1)).toBe('src'); + // The search path is always `.` — a derived subdirectory path would + // override rg's ignore rules (command-line paths are authoritative). + expect(execArgs(exec).at(-1)).toBe('.'); }); it('matches rooted patterns with a leading slash', async () => { @@ -697,6 +817,21 @@ describe('GlobTool', () => { expect(result.output).toContain('config.yml'); }); + it('keeps files whose names start with two dots under the search root', async () => { + // A file like `..config/a.ts` is under the root — its relative path + // starts with `..` but is not `..` or `../`. The old `startsWith('..')` + // check would drop it; the fixed check only rejects actual escapes. + const exec = execReturning('/workspace/..config/a.ts\n/workspace/..foo.ts\n/workspace/src/b.ts\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool(tool, context({ pattern: '**/*.ts', path: '/workspace' })); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('..config/a.ts'); + expect(result.output).toContain('..foo.ts'); + expect(result.output).toContain('src/b.ts'); + }); + it('descends into hidden directories under a recursive pattern', async () => { const exec = execReturning('/workspace/src/.config/settings.yml\n'); const tool = new GlobTool(kaosWithExec(exec), workspace); @@ -1004,4 +1139,82 @@ describe('GlobTool integration (real ripgrep)', () => { expect(result.output).not.toContain('kept.ts'); expect(result.output).not.toContain('ignored.ts'); }); + + it('respects .gitignore for anchored patterns pointing at an ignored dir', async () => { + // A pattern like `dist/**/*.js` must not surface files from a gitignored + // `dist/`. Passing the derived prefix `dist` as the rg PATH would override + // ignore rules (command-line paths are authoritative in rg), so the tool + // always searches from `.` and lets the in-process filter narrow results. + await touch('src/a.ts', new Date('2024-01-01T00:00:00Z')); + await touch('dist/bundle.js', new Date('2024-01-01T00:00:00Z')); + await fs.writeFile(path.join(tmpDir!, '.gitignore'), 'dist/\n'); + await fs.mkdir(path.join(tmpDir!, '.git'), { recursive: true }); + const tool = new GlobTool(kaos, ws()); + + const result = await executeTool(tool, context({ pattern: 'dist/**/*.js', path: tmpDir! })); + + expect(result.output).toContain('No matches'); + expect(result.output).not.toContain('dist/bundle.js'); + }); + + it('respects .gitignore for anchored patterns pointing at an ignored dir (include_ignored surfaces them)', async () => { + await touch('src/a.ts', new Date('2024-01-01T00:00:00Z')); + await touch('dist/bundle.js', new Date('2024-01-01T00:00:00Z')); + await fs.writeFile(path.join(tmpDir!, '.gitignore'), 'dist/\n'); + await fs.mkdir(path.join(tmpDir!, '.git'), { recursive: true }); + const tool = new GlobTool(kaos, ws()); + + const result = await executeTool( + tool, + context({ pattern: 'dist/**/*.js', path: tmpDir!, include_ignored: true }), + ); + + expect(result.output).toContain('dist/bundle.js'); + }); + + it('returns no matches for an anchored pattern whose prefix does not exist', async () => { + // `src/**/*.ts` in a repo with no `src` must return "No matches", not + // error out with "does not exist" (which happened when the missing + // prefix was passed as the rg PATH). + await touch('other/a.ts', new Date('2024-01-01T00:00:00Z')); + const tool = new GlobTool(kaos, ws()); + + const result = await executeTool(tool, context({ pattern: 'src/**/*.ts', path: tmpDir! })); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('No matches'); + }); + + it('matches wildcard-before-paren patterns like rg --glob (integration)', async () => { + // rg treats `*` as a wildcard and `(` as a literal — `*(bar).ts` matches + // any file ending in `(bar).ts` (the `*` matches any prefix). + await touch('foo*(bar).ts', new Date('2024-01-01T00:00:00Z')); + await touch('foo123(bar).ts', new Date('2023-01-01T00:00:00Z')); + await touch('x(bar).ts', new Date('2022-01-01T00:00:00Z')); + await touch('other.ts', new Date('2021-01-01T00:00:00Z')); + const tool = new GlobTool(kaos, ws()); + + const result = await executeTool(tool, context({ pattern: '*(bar).ts', path: tmpDir! })); + + expect(result.output).toContain('foo*(bar).ts'); + expect(result.output).toContain('foo123(bar).ts'); + expect(result.output).toContain('x(bar).ts'); + expect(result.output).not.toContain('other.ts'); + }); + + it('matches range-brace patterns as a single alternative like rg --glob (integration)', async () => { + // rg treats `{1..2}` as a single brace alternative, matching `1..2` + // (braces removed), not `{1..2}` and not the range 1..2. + await touch('1..2.ts', new Date('2024-01-01T00:00:00Z')); + await touch('{1..2}.ts', new Date('2023-01-01T00:00:00Z')); + await touch('1.ts', new Date('2022-01-01T00:00:00Z')); + const tool = new GlobTool(kaos, ws()); + + const result = await executeTool(tool, context({ pattern: '{1..2}.ts', path: tmpDir! })); + + const lines = (result.output as string).split('\n'); + expect(lines).toContain('1..2.ts'); + expect(lines).not.toContain('{1..2}.ts'); + expect(lines).not.toContain('1.ts'); + }); }); From daf49705dd8aa211a57591610cff0e9ebce48a34 Mon Sep 17 00:00:00 2001 From: morluto Date: Wed, 1 Jul 2026 09:41:41 +0000 Subject: [PATCH 10/13] fix: preserve braces in char classes, drop empty alternatives, treat empty glob as broad MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - rewriteBraces now tracks character-class state so braces inside `[]` (e.g. `[{a}].ts`) are left intact — rg treats them as literal class members, not brace-group delimiters - empty brace alternatives (`ab{,c}`, `ab{c,,d}`, `ab{,}`) are filtered out to match rg, which ignores them; picomatch would expand the empty arm and surface extra files - empty pattern `''` is now treated as a broad all-files glob, matching rg's `-g ''` behavior; previously picomatch threw on the empty string --- .../agent-core/src/tools/builtin/file/glob.ts | 123 ++++++++++++++++-- packages/agent-core/test/tools/glob.test.ts | 86 ++++++++++++ 2 files changed, 196 insertions(+), 13 deletions(-) diff --git a/packages/agent-core/src/tools/builtin/file/glob.ts b/packages/agent-core/src/tools/builtin/file/glob.ts index fb24a9b75..7a5ff2e1d 100644 --- a/packages/agent-core/src/tools/builtin/file/glob.ts +++ b/packages/agent-core/src/tools/builtin/file/glob.ts @@ -368,7 +368,9 @@ function buildRgArgs( } function isBroadPattern(pattern: string): boolean { - return pattern === '*' || pattern === '**' || pattern === '**/*'; + // rg treats an empty --glob as matching all files (respecting ignores), + // so skip picomatch compilation for it — picomatch throws on empty input. + return pattern === '' || pattern === '*' || pattern === '**' || pattern === '**/*'; } /** @@ -425,7 +427,11 @@ function compileGlobMatcher(pattern: string): (relPath: string) => boolean { * - range braces like {1..2} are reduced to `1..2` (braces removed) to * match rg, which treats a brace group with a single alternative (no * comma) as that alternative with the braces stripped. - * - brace alternatives like {ts,tsx} are left intact (supported by both). + * - brace alternatives like {ts,tsx} are left intact (supported by both), + * but empty alternatives ({,c} or {a,,b}) are dropped to match rg, which + * ignores them. + * - braces inside character classes ([{a}]) are left intact — inside `[]` + * `{` and `}` are literal class members, not brace-group delimiters. */ function escapePicomatchExtensions(pattern: string): string { let result = pattern; @@ -438,17 +444,108 @@ function escapePicomatchExtensions(pattern: string): string { } return `[${prefix}]\\(${escapedInner}\\)`; }); - // Range braces and single-alternative braces: rg treats `{N..M}` and - // `{foo}` as a single brace alternative, removing the braces (matching - // `N..M` or `foo`, not `{N..M}` or `{foo}`). picomatch 4.x either expands - // `{a..z}` as a range or treats `{foo}` as a literal — neither matches - // rg. Reduce to the inner text so picomatch matches the same literal. - // Escaped braces (`\{...\}`) are left intact via the lookbehind. - result = result.replaceAll(/(? { - // Brace alternatives with commas are expanded by both rg and picomatch. - if (inner.includes(',')) return _match; - return inner; - }); + // Rewrite brace groups outside character classes. Inside `[]`, braces + // are literal class members and must be preserved. rg also drops empty + // alternatives (e.g. `ab{,c}` matches `abc` only, not `ab`), so they are + // filtered out here to prevent picomatch from expanding them. + result = rewriteBraces(result); + return result; +} + +/** + * Rewrite brace groups in `pattern` to match rg's semantics: + * - Single-alternative braces `{foo}` → `foo` (braces stripped). + * - Comma alternatives with empty arms `{a,,b}` → `{a,b}` (empty arms + * dropped; if only one non-empty arm remains, braces are removed). + * - All-empty `{,,}` → empty string (braces removed, no content). + * - Nested brace groups `{a,{b,c}}` are left intact (both rg and + * picomatch expand them). + * - Escaped braces `\{...\}` are left intact. + * - Braces inside character classes `[{a}]` are left intact — inside + * `[]` they are literal class members, not group delimiters. + */ +function rewriteBraces(pattern: string): string { + let result = ''; + let i = 0; + let inBracket = false; + let bracketContentStart = -1; + while (i < pattern.length) { + const ch = pattern[i]!; + if (inBracket) { + result += ch; + if (ch === '\\' && i + 1 < pattern.length) { + result += pattern[i + 1]!; + i += 2; + continue; + } + if (ch === ']') { + // A `]` at the first content position is a literal, not the + // terminator (mirrors validateGlobPattern logic). + if (i !== bracketContentStart) inBracket = false; + } + i++; + continue; + } + if (ch === '[') { + inBracket = true; + const next = pattern[i + 1]; + bracketContentStart = next === '!' || next === '^' ? i + 2 : i + 1; + result += ch; + i++; + continue; + } + if (ch === '\\' && i + 1 < pattern.length) { + result += ch + (pattern[i + 1] ?? ''); + i += 2; + continue; + } + if (ch === '{') { + // Scan forward to the matching `}`, respecting escaped chars. + let depth = 1; + let j = i + 1; + while (j < pattern.length && depth > 0) { + const cj = pattern[j]!; + if (cj === '\\' && j + 1 < pattern.length) { + j += 2; + continue; + } + if (cj === '{') depth++; + else if (cj === '}') depth--; + if (depth > 0) j++; + } + if (depth !== 0) { + // Unclosed brace — keep as-is (validator will catch it). + result += ch; + i++; + continue; + } + const inner = pattern.slice(i + 1, j); + // Nested brace groups are left intact (both rg and picomatch + // expand them). + if (inner.includes('{') || inner.includes('}')) { + result += pattern.slice(i, j + 1); + i = j + 1; + continue; + } + if (inner.includes(',')) { + const nonEmpty = inner.split(',').filter((a) => a !== ''); + if (nonEmpty.length === 0) { + // All alternatives empty — rg strips to empty string. + } else if (nonEmpty.length === 1) { + result += nonEmpty[0]!; + } else { + result += `{${nonEmpty.join(',')}}`; + } + } else { + // Single alternative — rg strips the braces. + result += inner; + } + i = j + 1; + continue; + } + result += ch; + i++; + } return result; } diff --git a/packages/agent-core/test/tools/glob.test.ts b/packages/agent-core/test/tools/glob.test.ts index e7e8eacb4..81852d901 100644 --- a/packages/agent-core/test/tools/glob.test.ts +++ b/packages/agent-core/test/tools/glob.test.ts @@ -272,6 +272,20 @@ describe('GlobTool', () => { } }); + it('treats an empty pattern as a broad all-files glob, matching rg --glob', async () => { + // rg treats -g '' as matching all files (respecting ignores). picomatch + // throws on an empty string, so the tool must short-circuit before + // compiling the matcher. + const exec = execReturning('/workspace/a.ts\n/workspace/b.ts\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool(tool, context({ pattern: '', path: '/workspace' })); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('a.ts'); + expect(result.output).toContain('b.ts'); + }); + it('filters anchored patterns in-process without a positive --glob', async () => { const exec = execReturning('/workspace/src/a.ts\n/workspace/other/b.ts\n'); const tool = new GlobTool(kaosWithExec(exec), workspace); @@ -687,6 +701,78 @@ describe('GlobTool', () => { expect(lines).not.toContain('{1..2}.ts'); }); + it('preserves braces inside character classes, matching rg --glob behavior', async () => { + // rg treats `[{a}].ts` as a character class containing `{`, `a`, `}` — + // matching `{.ts`, `}.ts`, and `a.ts`, but NOT `{a}.ts`. The brace + // rewrite must not strip braces inside `[]`. + const exec = execReturning( + '/workspace/{.ts\n/workspace/}.ts\n/workspace/a.ts\n/workspace/{a}.ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '[{a}].ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('{.ts'); + expect(lines).toContain('}.ts'); + expect(lines).toContain('a.ts'); + expect(lines).not.toContain('{a}.ts'); + }); + + it('drops empty brace alternatives, matching rg --glob behavior', async () => { + // rg drops empty alternatives: `ab{,c}` matches `abc` only, not `ab`. + // picomatch expands the empty arm, so the rewrite must filter it out. + const exec = execReturning('/workspace/ab\n/workspace/abc\n/workspace/abd\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: 'ab{,c}', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('abc'); + expect(lines).not.toContain('ab'); + }); + + it('drops multiple empty brace alternatives, matching rg --glob behavior', async () => { + // `ab{c,,d}` — rg keeps only `c` and `d`, dropping the empty arm. + const exec = execReturning('/workspace/ab\n/workspace/abc\n/workspace/abd\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: 'ab{c,,d}', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('abc'); + expect(lines).toContain('abd'); + expect(lines).not.toContain('ab'); + }); + + it('collapses all-empty brace alternatives to the prefix, matching rg', async () => { + // `ab{,}` — all alternatives empty, rg strips to `ab`. + const exec = execReturning('/workspace/ab\n/workspace/abc\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: 'ab{,}', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('ab'); + expect(lines).not.toContain('abc'); + }); + it('always searches from `.` so derived paths cannot override ignore rules', async () => { const exec = execReturning('/workspace/src/a.ts\n/workspace/other/b.ts\n'); const tool = new GlobTool(kaosWithExec(exec), workspace); From f7d0f50c55616b86403e01801fe87a6a6f3b565d Mon Sep 17 00:00:00 2001 From: morluto Date: Wed, 1 Jul 2026 11:59:48 +0000 Subject: [PATCH 11/13] fix: escape POSIX classes, extglob in char classes, bare parens, negated classes, range arms, ./ prefix, unmatched braces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unified the picomatch escape into a single character-class-aware scanner: - POSIX bracket classes `[:digit:]` inside `[]` are escaped (`[:` → `[\:`) so picomatch treats them as literal, matching rg - Extglob rewrites and bare parenthesis alternation `(a|b)` are now skipped inside character classes — `[@(a)].ts` preserves all chars as literal class members - Bare `(a|b)` outside char classes has parens/pipe escaped so picomatch treats them as literal, matching rg - `[!` is converted to `[^` for picomatch's negation syntax - Range-like arms in brace alternatives (`{1..2,3}`) have dots replaced with `[.]` to prevent picomatch range expansion - `./` prefix is stripped from patterns so the in-process filter matches rg's un-prefixed glob subject - Unmatched `}` is now rejected by the validator, matching rg's error --- .../agent-core/src/tools/builtin/file/glob.ts | 181 ++++++++++++------ packages/agent-core/test/tools/glob.test.ts | 138 +++++++++++++ 2 files changed, 256 insertions(+), 63 deletions(-) diff --git a/packages/agent-core/src/tools/builtin/file/glob.ts b/packages/agent-core/src/tools/builtin/file/glob.ts index 7a5ff2e1d..9ce226282 100644 --- a/packages/agent-core/src/tools/builtin/file/glob.ts +++ b/packages/agent-core/src/tools/builtin/file/glob.ts @@ -393,14 +393,16 @@ function isBroadPattern(pattern: string): boolean { function compileGlobMatcher(pattern: string): (relPath: string) => boolean { const opts = { dot: true }; const rooted = pattern.startsWith('/'); - const normalizedPattern = rooted ? pattern.slice(1) : pattern; - // Escape picomatch-only extensions that rg --glob does not support: - // - extglob: @(a|b), !(a), +(a), ?(a), *(a) — rg treats these as - // literal characters, not extended glob patterns. - // - range braces: {1..2} — rg treats {1..2} as a literal, not a range. - // Brace alternatives like {ts,tsx} are still supported by both rg and - // picomatch, so they are left intact. - const escapedPattern = escapePicomatchExtensions(normalizedPattern); + let normalizedPattern = rooted ? pattern.slice(1) : pattern; + // Strip a leading `./` — rg's glob subject never has it, and picomatch + // treats `./` as optional, which would broaden matches. Normalizing both + // sides to the un-prefixed form keeps the in-process filter consistent. + if (normalizedPattern.startsWith('./')) { + normalizedPattern = normalizedPattern.slice(2); + } + // Escape picomatch-only extensions that rg --glob does not support, + // so the in-process matcher matches the same files rg would. + const escapedPattern = escapeForPicomatch(normalizedPattern); // A pattern without `/` matches the basename at any depth — unless the // original pattern was rooted with a leading `/`, in which case it // matches only at the search root (gitignore-style rooted globs). @@ -416,61 +418,36 @@ function compileGlobMatcher(pattern: string): (relPath: string) => boolean { * Escape picomatch-only glob extensions that rg --glob does not support, * so the in-process matcher matches the same files rg would. * - * - extglob constructs like @(a|b), !(a), +(a) are escaped so picomatch - * treats the prefix character and parentheses as literal. `@`, `!`, `+` - * are literal characters in rg's glob engine. - * - `*` and `?` before `(` are NOT extglob prefixes in rg — they are - * regular wildcards, and `(` is a literal character. Only the - * parentheses are escaped so picomatch doesn't interpret `*(...)` or - * `?(...)` as extglobs, but the wildcard semantics of `*` and `?` are - * preserved. - * - range braces like {1..2} are reduced to `1..2` (braces removed) to - * match rg, which treats a brace group with a single alternative (no - * comma) as that alternative with the braces stripped. - * - brace alternatives like {ts,tsx} are left intact (supported by both), - * but empty alternatives ({,c} or {a,,b}) are dropped to match rg, which - * ignores them. - * - braces inside character classes ([{a}]) are left intact — inside `[]` - * `{` and `}` are literal class members, not brace-group delimiters. - */ -function escapePicomatchExtensions(pattern: string): string { - let result = pattern; - // Escape extglob: [@!+](...) → literal prefix + literal parens. - // `*` and `?` are wildcards in rg, so only the parens are escaped. - result = result.replaceAll(/([@!?+*])\(([^)]*)\)/g, (_match, prefix: string, inner: string) => { - const escapedInner = inner.replaceAll('|', '\\|'); - if (prefix === '*' || prefix === '?') { - return `${prefix}\\(${escapedInner}\\)`; - } - return `[${prefix}]\\(${escapedInner}\\)`; - }); - // Rewrite brace groups outside character classes. Inside `[]`, braces - // are literal class members and must be preserved. rg also drops empty - // alternatives (e.g. `ab{,c}` matches `abc` only, not `ab`), so they are - // filtered out here to prevent picomatch from expanding them. - result = rewriteBraces(result); - return result; -} - -/** - * Rewrite brace groups in `pattern` to match rg's semantics: - * - Single-alternative braces `{foo}` → `foo` (braces stripped). - * - Comma alternatives with empty arms `{a,,b}` → `{a,b}` (empty arms - * dropped; if only one non-empty arm remains, braces are removed). - * - All-empty `{,,}` → empty string (braces removed, no content). - * - Nested brace groups `{a,{b,c}}` are left intact (both rg and - * picomatch expand them). - * - Escaped braces `\{...\}` are left intact. - * - Braces inside character classes `[{a}]` are left intact — inside - * `[]` they are literal class members, not group delimiters. + * This is a single-pass, character-class-aware scanner that handles: + * + * - **Extglob** (`[@!+](...)`): rg treats `@`, `!`, `+` as literal chars + * and `(`, `)` as literal. The prefix is wrapped in `[...]` and the + * parens/pipe are escaped. `*` and `?` before `(` are wildcards in rg, + * so only the parens/pipe are escaped. + * - **Bare parenthesis alternation** (`(a|b)`): rg treats `(`, `|`, `)` as + * literal. All three are escaped so picomatch does too. + * - **Range braces** (`{1..2}`): rg treats a single-alternative brace as + * the inner text with braces removed. Reduced to `1..2`. + * - **Brace alternatives** (`{ts,tsx}`): left intact (supported by both). + * Empty alternatives (`{,c}`, `{a,,b}`) are dropped to match rg. + * Range-like arms (`{1..2,3}`) have their dots escaped so picomatch + * treats them as literal, not as a range expansion. + * - **Nested braces** (`{a,{b,c}}`): left intact (both expand them). + * - **Character classes**: inside `[]`, braces and extglob constructs are + * literal class members and are never rewritten. `[!` is converted to + * `[^` (picomatch's negation syntax). `[:` is escaped to `[\:` to + * prevent picomatch from interpreting POSIX classes like `[:digit:]`. + * - **Escaped chars** (`\x`): passed through unchanged. */ -function rewriteBraces(pattern: string): string { +function escapeForPicomatch(pattern: string): string { let result = ''; let i = 0; let inBracket = false; let bracketContentStart = -1; while (i < pattern.length) { const ch = pattern[i]!; + + // --- Inside a character class --- if (inBracket) { result += ch; if (ch === '\\' && i + 1 < pattern.length) { @@ -483,22 +460,78 @@ function rewriteBraces(pattern: string): string { // terminator (mirrors validateGlobPattern logic). if (i !== bracketContentStart) inBracket = false; } + // Escape `:` after `[` inside a char class to prevent picomatch + // from interpreting `[:class:]` as a POSIX character class. + if (ch === ':' && result.length >= 2 && result.at(-2) === '[') { + // Replace the just-added `:` with `\:` + result = result.slice(0, -1) + '\\:'; + } i++; continue; } + + // --- Escaped character (outside char class) --- + if (ch === '\\' && i + 1 < pattern.length) { + result += ch + (pattern[i + 1] ?? ''); + i += 2; + continue; + } + + // --- Character class opening --- if (ch === '[') { inBracket = true; const next = pattern[i + 1]; bracketContentStart = next === '!' || next === '^' ? i + 2 : i + 1; + // Convert `[!` to `[^` — picomatch uses `[^` for negation, while rg + // (gitignore semantics) uses `[!`. + if (next === '!') { + result += '[^'; + i += 2; + continue; + } result += ch; i++; continue; } - if (ch === '\\' && i + 1 < pattern.length) { - result += ch + (pattern[i + 1] ?? ''); - i += 2; + + // --- Opening parenthesis (extglob or bare alternation) --- + if (ch === '(') { + // Find the matching `)`, respecting escaped chars. + let depth = 1; + let j = i + 1; + while (j < pattern.length && depth > 0) { + const cj = pattern[j]!; + if (cj === '\\' && j + 1 < pattern.length) { + j += 2; + continue; + } + if (cj === '(') depth++; + else if (cj === ')') depth--; + if (depth > 0) j++; + } + if (depth !== 0) { + // Unclosed — treat `(` as literal (validator will catch it). + result += '\\('; + i++; + continue; + } + const inner = pattern.slice(i + 1, j); + const escapedInner = inner.replaceAll('|', '\\|'); + // Check if the previous char in result is an extglob prefix. + const prev = result.length > 0 ? result.at(-1) : ''; + if (prev === '@' || prev === '!' || prev === '+') { + // Extglob: replace the prefix with [prefix] and escape parens. + result = result.slice(0, -1) + `[${prev}]`; + } + // For `*` and `?` prefixes, the wildcard is preserved and only + // the parens/pipe are escaped. For bare parens (no extglob prefix), + // also just escape the parens/pipe. + result += `\\(${escapedInner}\\)`; + i = j + 1; continue; } + + // --- Brace group --- if (ch === '{') { // Scan forward to the matching `}`, respecting escaped chars. let depth = 1; @@ -532,9 +565,13 @@ function rewriteBraces(pattern: string): string { if (nonEmpty.length === 0) { // All alternatives empty — rg strips to empty string. } else if (nonEmpty.length === 1) { - result += nonEmpty[0]!; + // Single non-empty arm — braces removed. + result += escapeRangeArms(nonEmpty[0]!); } else { - result += `{${nonEmpty.join(',')}}`; + // Multiple arms — escape range-like arms (containing `..`) so + // picomatch treats them as literal, not as a range expansion. + const escaped = nonEmpty.map(escapeRangeArms); + result += `{${escaped.join(',')}}`; } } else { // Single alternative — rg strips the braces. @@ -543,12 +580,26 @@ function rewriteBraces(pattern: string): string { i = j + 1; continue; } + result += ch; i++; } return result; } +/** + * Escape dots in a range-like brace arm (e.g. `1..2`) so picomatch treats + * them as literal characters, not as a range expansion. rg treats `{1..2}` + * as the literal string `1..2`, so inside a comma brace group each arm + * that contains `..` must have its dots escaped. Uses `[.]` instead of + * `\.` because picomatch's brace expansion strips backslashes before + * matching, but preserves bracket classes. + */ +function escapeRangeArms(arm: string): string { + if (!arm.includes('..')) return arm; + return arm.replaceAll('.', '[.]'); +} + /** * Filter absolute paths from `rg --files` against the user's positive glob * pattern. Broad patterns (star, double-star, star-slash-star) match @@ -702,8 +753,12 @@ function validateGlobPattern(pattern: string): string | undefined { bracketPrevChar = ''; } else if (ch === '{') { braceDepth++; - } else if (ch === '}' && braceDepth > 0) { - braceDepth--; + } else if (ch === '}') { + if (braceDepth > 0) { + braceDepth--; + } else { + return `Invalid glob pattern: unopened '}' in "${pattern}"`; + } } } if (bracketDepth > 0) return `Invalid glob pattern: unclosed '[' in "${pattern}"`; diff --git a/packages/agent-core/test/tools/glob.test.ts b/packages/agent-core/test/tools/glob.test.ts index 81852d901..dbbebce9c 100644 --- a/packages/agent-core/test/tools/glob.test.ts +++ b/packages/agent-core/test/tools/glob.test.ts @@ -773,6 +773,144 @@ describe('GlobTool', () => { expect(lines).not.toContain('abc'); }); + it('escapes POSIX bracket classes, matching rg --glob behavior', async () => { + // rg treats `[:` inside `[]` as literal characters, not a POSIX class. + // So `[[:digit:]].ts` matches `d].ts` and `:].ts` (char class `[ : d i g i t ]` + // then literal `]`), but NOT `1.ts`. picomatch would interpret `[:digit:]` + // as a POSIX digit class, so the `:` after `[` is escaped. + const exec = execReturning( + '/workspace/1.ts\n/workspace/a.ts\n/workspace/d].ts\n/workspace/:].ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '[[:digit:]].ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('d].ts'); + expect(lines).toContain(':].ts'); + expect(lines).not.toContain('1.ts'); + }); + + it('skips extglob rewrites inside character classes, matching rg', async () => { + // `[@(a)].ts` — rg treats `@`, `(`, `a`, `)` as literal class members. + // The extglob rewrite must not fire inside `[]`. + const exec = execReturning( + '/workspace/@.ts\n/workspace/(.ts\n/workspace/a.ts\n/workspace/).ts\n/workspace/@(a).ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '[@(a)].ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('@.ts'); + expect(lines).toContain('(.ts'); + expect(lines).toContain('a.ts'); + expect(lines).toContain(').ts'); + expect(lines).not.toContain('@(a).ts'); + }); + + it('escapes range arms inside brace alternatives, matching rg', async () => { + // `{1..2,3}.ts` — rg matches `1..2.ts` and `3.ts`, treating `1..2` as + // a literal arm. picomatch would expand `1..2` as a range (1, 2), so + // the dots are replaced with `[.]` to prevent range expansion. + const exec = execReturning( + '/workspace/1.ts\n/workspace/2.ts\n/workspace/3.ts\n/workspace/1..2.ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '{1..2,3}.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('1..2.ts'); + expect(lines).toContain('3.ts'); + expect(lines).not.toContain('1.ts'); + expect(lines).not.toContain('2.ts'); + }); + + it('converts [! to [^ for negated character classes, matching rg', async () => { + // rg (gitignore semantics) uses `[!` for negation; picomatch uses `[^`. + // `[!a].ts` should match `b.ts`, `c.ts`, etc. but NOT `a.ts`. + const exec = execReturning( + '/workspace/a.ts\n/workspace/b.ts\n/workspace/c.ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '[!a].ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('b.ts'); + expect(lines).toContain('c.ts'); + expect(lines).not.toContain('a.ts'); + }); + + it('rejects unmatched closing braces, matching rg', async () => { + // rg errors on `a}.ts` — unopened alternate group. + const exec = execReturning('/workspace/a.ts\n/workspace/a}.ts\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: 'a}.ts', path: '/workspace' }), + ); + + expect(result.isError).toBe(true); + expect(result.output).toContain('unopened'); + }); + + it('strips ./ prefix from glob patterns, matching rg --glob behavior', async () => { + // rg treats `./src/*.ts` as not matching `src/a.ts` because the glob + // subject is `src/a.ts` (no `./` prefix). Stripping `./` from the + // pattern and matching against the un-prefixed relative path keeps + // the in-process filter consistent with rg. + const exec = execReturning('/workspace/src/a.ts\n/workspace/other/b.ts\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: './src/*.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('src/a.ts'); + expect(result.output).not.toContain('other/b.ts'); + }); + + it('escapes bare parenthesis alternation, matching rg --glob behavior', async () => { + // rg treats `(`, `|`, `)` as literal characters. `(a|b).ts` matches + // only the literal filename `(a|b).ts`, not `a.ts` or `b.ts`. + const exec = execReturning( + '/workspace/(a|b).ts\n/workspace/a.ts\n/workspace/b.ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '(a|b).ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('(a|b).ts'); + expect(lines).not.toContain('a.ts'); + expect(lines).not.toContain('b.ts'); + }); + it('always searches from `.` so derived paths cannot override ignore rules', async () => { const exec = execReturning('/workspace/src/a.ts\n/workspace/other/b.ts\n'); const tool = new GlobTool(kaosWithExec(exec), workspace); From ea6426ceb9ca8b96b47441e7b8a40d45cde66dba Mon Sep 17 00:00:00 2001 From: morluto Date: Wed, 1 Jul 2026 12:41:44 +0000 Subject: [PATCH 12/13] fix: preserve leading ! exclusion globs and escaped comma brace arms - compileGlobMatcher now detects a leading `!` as rg's glob exclusion marker (not a picomatch extglob prefix), strips it, and negates the matcher so `!(a).ts` excludes files matching `(a).ts` and includes everything else, matching rg --glob behavior - isBroadPattern handles `!` prefix: `!` alone is broad (exclude nothing), but `!*` / `!**` are not broad (exclude everything) - Brace arm splitting now uses splitBraceArms which splits on unescaped commas only, so `{\,,a}.ts` correctly produces two arms (`\,` and `a`) instead of corrupting the escaped comma into an empty segment --- .../agent-core/src/tools/builtin/file/glob.ts | 59 +++++++++++++++-- packages/agent-core/test/tools/glob.test.ts | 63 +++++++++++++++++++ 2 files changed, 117 insertions(+), 5 deletions(-) diff --git a/packages/agent-core/src/tools/builtin/file/glob.ts b/packages/agent-core/src/tools/builtin/file/glob.ts index 9ce226282..3723470a2 100644 --- a/packages/agent-core/src/tools/builtin/file/glob.ts +++ b/packages/agent-core/src/tools/builtin/file/glob.ts @@ -370,6 +370,13 @@ function buildRgArgs( function isBroadPattern(pattern: string): boolean { // rg treats an empty --glob as matching all files (respecting ignores), // so skip picomatch compilation for it — picomatch throws on empty input. + // A leading `!` (rg's exclusion marker) followed by nothing means + // "exclude nothing" → keep all files → broad. But `!*` or `!**/*` means + // "exclude everything" → NOT broad (needs compilation to negate). + if (pattern.startsWith('!')) { + const rest = pattern.slice(1); + return rest === ''; + } return pattern === '' || pattern === '*' || pattern === '**' || pattern === '**/*'; } @@ -400,18 +407,30 @@ function compileGlobMatcher(pattern: string): (relPath: string) => boolean { if (normalizedPattern.startsWith('./')) { normalizedPattern = normalizedPattern.slice(2); } + // rg treats a leading `!` as a glob exclusion marker: `!(a).ts` means + // "exclude files matching `(a).ts`". Strip the `!` and negate the + // matcher so the in-process filter excludes those files instead of + // treating `!` as a picomatch extglob prefix. + let negated = false; + if (normalizedPattern.startsWith('!')) { + negated = true; + normalizedPattern = normalizedPattern.slice(1); + } // Escape picomatch-only extensions that rg --glob does not support, // so the in-process matcher matches the same files rg would. const escapedPattern = escapeForPicomatch(normalizedPattern); // A pattern without `/` matches the basename at any depth — unless the // original pattern was rooted with a leading `/`, in which case it // matches only at the search root (gitignore-style rooted globs). + let matcher: (relPath: string) => boolean; if (!normalizedPattern.includes('/') && !rooted) { - const matcher = picomatch(escapedPattern, opts); - return (relPath: string) => matcher(relPath.split('/').pop()!); + const fn = picomatch(escapedPattern, opts); + matcher = (relPath: string) => fn(relPath.split('/').pop()!); + } else { + const fn = picomatch(escapedPattern, opts); + matcher = (relPath: string) => fn(relPath); } - const matcher = picomatch(escapedPattern, opts); - return (relPath: string) => matcher(relPath); + return negated ? (relPath: string) => !matcher(relPath) : matcher; } /** @@ -561,7 +580,11 @@ function escapeForPicomatch(pattern: string): string { continue; } if (inner.includes(',')) { - const nonEmpty = inner.split(',').filter((a) => a !== ''); + // Split on unescaped commas only — `\,` is a literal comma arm, + // not a separator. rg treats `{\,,a}.ts` as matching both `,.ts` + // and `a.ts`. + const arms = splitBraceArms(inner); + const nonEmpty = arms.filter((a) => a !== ''); if (nonEmpty.length === 0) { // All alternatives empty — rg strips to empty string. } else if (nonEmpty.length === 1) { @@ -600,6 +623,32 @@ function escapeRangeArms(arm: string): string { return arm.replaceAll('.', '[.]'); } +/** + * Split a brace group's inner text on unescaped commas only. A `\,` is a + * literal comma within an arm, not a separator — rg treats `{\,,a}` as + * two arms: `\,` (literal comma) and `a`. + */ +function splitBraceArms(inner: string): string[] { + const arms: string[] = []; + let current = ''; + for (let k = 0; k < inner.length; k++) { + const ck = inner[k]!; + if (ck === '\\' && k + 1 < inner.length) { + current += ck + inner[k + 1]!; + k++; + continue; + } + if (ck === ',') { + arms.push(current); + current = ''; + continue; + } + current += ck; + } + arms.push(current); + return arms; +} + /** * Filter absolute paths from `rg --files` against the user's positive glob * pattern. Broad patterns (star, double-star, star-slash-star) match diff --git a/packages/agent-core/test/tools/glob.test.ts b/packages/agent-core/test/tools/glob.test.ts index dbbebce9c..dd464cc38 100644 --- a/packages/agent-core/test/tools/glob.test.ts +++ b/packages/agent-core/test/tools/glob.test.ts @@ -1156,6 +1156,69 @@ describe('GlobTool', () => { expect(tool.description).toContain('C:\\Users\\foo'); expect(tool.description).toContain('/c/Users/foo'); }); + + it('treats leading ! as rg exclusion marker, not extglob prefix', async () => { + // rg --glob '!(a).ts' excludes files matching `(a).ts` and includes + // everything else. The leading `!` is an exclusion marker, not a + // picomatch extglob prefix. + const exec = execReturning( + '/workspace/(a).ts\n/workspace/b.ts\n/workspace/c.ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '!(a).ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('b.ts'); + expect(lines).toContain('c.ts'); + expect(lines).not.toContain('(a).ts'); + }); + + it('negated glob with * excludes all matching files', async () => { + // !*.ts should exclude all .ts files and include only non-.ts files. + const exec = execReturning( + '/workspace/a.ts\n/workspace/b.ts\n/workspace/c.js\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '!*.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('c.js'); + expect(lines).not.toContain('a.ts'); + expect(lines).not.toContain('b.ts'); + }); + + it('preserves escaped comma as a literal brace arm, matching rg', async () => { + // rg treats `{\,,a}.ts` as two arms: `\,` (literal comma) and `a`. + // It matches `,.ts` and `a.ts`. The naive split on `,` would break + // the escaped comma into an empty arm and corrupt the group. + const exec = execReturning( + '/workspace/,.ts\n/workspace/a.ts\n/workspace/b.ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '{\\,,a}.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('a.ts'); + // The literal-comma arm matches `,.ts` — picomatch treats `\,` as + // a literal comma after brace expansion. + expect(lines).toContain(',.ts'); + expect(lines).not.toContain('b.ts'); + }); }); describe('splitCompletePaths', () => { From 3b60ca9f8f06052c2e1f7707dd36a40ff9d66227 Mon Sep 17 00:00:00 2001 From: morluto Date: Wed, 1 Jul 2026 13:28:25 +0000 Subject: [PATCH 13/13] fix(agent-core): align glob filtering with gitignore syntax --- .../agent-core/src/tools/builtin/file/glob.ts | 83 ++++++++++++---- packages/agent-core/test/tools/glob.test.ts | 96 +++++++++++++++++++ 2 files changed, 162 insertions(+), 17 deletions(-) diff --git a/packages/agent-core/src/tools/builtin/file/glob.ts b/packages/agent-core/src/tools/builtin/file/glob.ts index 3723470a2..7b71e3710 100644 --- a/packages/agent-core/src/tools/builtin/file/glob.ts +++ b/packages/agent-core/src/tools/builtin/file/glob.ts @@ -368,15 +368,19 @@ function buildRgArgs( } function isBroadPattern(pattern: string): boolean { + const preprocessed = preprocessGitignoreGlobPattern(pattern); + if (preprocessed === undefined) return true; + pattern = preprocessed; // rg treats an empty --glob as matching all files (respecting ignores), // so skip picomatch compilation for it — picomatch throws on empty input. // A leading `!` (rg's exclusion marker) followed by nothing means - // "exclude nothing" → keep all files → broad. But `!*` or `!**/*` means - // "exclude everything" → NOT broad (needs compilation to negate). - if (pattern.startsWith('!')) { - const rest = pattern.slice(1); - return rest === ''; - } + // "exclude the empty glob" → no files → NOT broad. `!*` or `!**/*` also + // needs compilation to negate. + if (pattern.startsWith('!')) return false; + return isBroadPositivePattern(pattern); +} + +function isBroadPositivePattern(pattern: string): boolean { return pattern === '' || pattern === '*' || pattern === '**' || pattern === '**/*'; } @@ -399,14 +403,9 @@ function isBroadPattern(pattern: string): boolean { */ function compileGlobMatcher(pattern: string): (relPath: string) => boolean { const opts = { dot: true }; - const rooted = pattern.startsWith('/'); - let normalizedPattern = rooted ? pattern.slice(1) : pattern; - // Strip a leading `./` — rg's glob subject never has it, and picomatch - // treats `./` as optional, which would broaden matches. Normalizing both - // sides to the un-prefixed form keeps the in-process filter consistent. - if (normalizedPattern.startsWith('./')) { - normalizedPattern = normalizedPattern.slice(2); - } + const preprocessed = preprocessGitignoreGlobPattern(pattern); + if (preprocessed === undefined) return () => true; + let normalizedPattern = preprocessed; // rg treats a leading `!` as a glob exclusion marker: `!(a).ts` means // "exclude files matching `(a).ts`". Strip the `!` and negate the // matcher so the in-process filter excludes those files instead of @@ -416,6 +415,19 @@ function compileGlobMatcher(pattern: string): (relPath: string) => boolean { negated = true; normalizedPattern = normalizedPattern.slice(1); } + if (negated && normalizedPattern === '') { + return () => false; + } + const rooted = normalizedPattern.startsWith('/'); + if (rooted) { + normalizedPattern = normalizedPattern.slice(1); + } + // Strip a leading `./` — rg's glob subject never has it, and picomatch + // treats `./` as optional, which would broaden matches. Normalizing both + // sides to the un-prefixed form keeps the in-process filter consistent. + if (normalizedPattern.startsWith('./')) { + normalizedPattern = normalizedPattern.slice(2); + } // Escape picomatch-only extensions that rg --glob does not support, // so the in-process matcher matches the same files rg would. const escapedPattern = escapeForPicomatch(normalizedPattern); @@ -433,6 +445,23 @@ function compileGlobMatcher(pattern: string): (relPath: string) => boolean { return negated ? (relPath: string) => !matcher(relPath) : matcher; } +function preprocessGitignoreGlobPattern(pattern: string): string | undefined { + if (pattern.startsWith('#')) return undefined; + let end = pattern.length; + while (end > 0 && pattern[end - 1] === ' ' && !isEscaped(pattern, end - 1)) { + end--; + } + return pattern.slice(0, end); +} + +function isEscaped(pattern: string, index: number): boolean { + let slashCount = 0; + for (let i = index - 1; i >= 0 && pattern[i] === '\\'; i--) { + slashCount++; + } + return slashCount % 2 === 1; +} + /** * Escape picomatch-only glob extensions that rg --glob does not support, * so the in-process matcher matches the same files rg would. @@ -456,7 +485,8 @@ function compileGlobMatcher(pattern: string): (relPath: string) => boolean { * literal class members and are never rewritten. `[!` is converted to * `[^` (picomatch's negation syntax). `[:` is escaped to `[\:` to * prevent picomatch from interpreting POSIX classes like `[:digit:]`. - * - **Escaped chars** (`\x`): passed through unchanged. + * - **Escaped chars** (`\x`): gitignore removes escapes before ordinary + * characters; escapes before picomatch syntax are preserved as literals. */ function escapeForPicomatch(pattern: string): string { let result = ''; @@ -491,7 +521,8 @@ function escapeForPicomatch(pattern: string): string { // --- Escaped character (outside char class) --- if (ch === '\\' && i + 1 < pattern.length) { - result += ch + (pattern[i + 1] ?? ''); + const escaped = pattern[i + 1]!; + result += shouldPreserveEscapeForPicomatch(escaped) ? ch + escaped : escaped; i += 2; continue; } @@ -535,7 +566,7 @@ function escapeForPicomatch(pattern: string): string { continue; } const inner = pattern.slice(i + 1, j); - const escapedInner = inner.replaceAll('|', '\\|'); + const escapedInner = escapePicomatchParenthesisInner(inner); // Check if the previous char in result is an extglob prefix. const prev = result.length > 0 ? result.at(-1) : ''; if (prev === '@' || prev === '!' || prev === '+') { @@ -610,6 +641,24 @@ function escapeForPicomatch(pattern: string): string { return result; } +function shouldPreserveEscapeForPicomatch(ch: string): boolean { + return '*?[]{}()!+@,|\\'.includes(ch); +} + +function escapePicomatchParenthesisInner(inner: string): string { + let result = ''; + for (let i = 0; i < inner.length; i++) { + const ch = inner[i]!; + if (ch === '\\' && i + 1 < inner.length) { + result += ch + inner[i + 1]!; + i++; + continue; + } + result += ch === '(' || ch === ')' || ch === '|' ? `\\${ch}` : ch; + } + return result; +} + /** * Escape dots in a range-like brace arm (e.g. `1..2`) so picomatch treats * them as literal characters, not as a range expansion. rg treats `{1..2}` diff --git a/packages/agent-core/test/tools/glob.test.ts b/packages/agent-core/test/tools/glob.test.ts index dd464cc38..7622f87e4 100644 --- a/packages/agent-core/test/tools/glob.test.ts +++ b/packages/agent-core/test/tools/glob.test.ts @@ -1197,6 +1197,102 @@ describe('GlobTool', () => { expect(lines).not.toContain('b.ts'); }); + it('treats a bare ! as an empty exclusion glob', async () => { + const exec = execReturning('/workspace/a.ts\n/workspace/b.js\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '!', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('No matches'); + expect(result.output).not.toContain('a.ts'); + expect(result.output).not.toContain('b.js'); + }); + + it('preserves rooted exclusions after stripping the exclusion marker', async () => { + const exec = execReturning( + '/workspace/foo.ts\n/workspace/src/foo.ts\n/workspace/bar.ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '!/foo.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('src/foo.ts'); + expect(lines).toContain('bar.ts'); + expect(lines).not.toContain('foo.ts'); + }); + + it('drops escapes before ordinary gitignore glob characters', async () => { + const exec = execReturning( + '/workspace/foobar\n/workspace/foo\\bar\n/workspace/foo/bar\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: 'foo\\bar', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('foobar'); + expect(lines).not.toContain('foo\\bar'); + expect(lines).not.toContain('foo/bar'); + }); + + it('escapes nested literal parentheses and pipes like rg glob syntax', async () => { + const exec = execReturning( + '/workspace/(a(b|c)).ts\n/workspace/(ab|c).ts\n/workspace/ac.ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '(a(b|c)).ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('(a(b|c)).ts'); + expect(lines).not.toContain('(ab|c).ts'); + expect(lines).not.toContain('ac.ts'); + }); + + it('preprocesses comment and trailing-space globs like gitignore lines', async () => { + const commentExec = execReturning('/workspace/#foo\n/workspace/foo\n'); + const commentTool = new GlobTool(kaosWithExec(commentExec), workspace); + + const commentResult = await executeTool( + commentTool, + context({ pattern: '#foo', path: '/workspace' }), + ); + + expect(commentResult.isError).toBeFalsy(); + expect(commentResult.output).toContain('#foo'); + expect(commentResult.output).toContain('foo'); + + const spaceExec = execReturning('/workspace/foo\n/workspace/foo \n'); + const spaceTool = new GlobTool(kaosWithExec(spaceExec), workspace); + + const spaceResult = await executeTool( + spaceTool, + context({ pattern: 'foo ', path: '/workspace' }), + ); + + expect(spaceResult.isError).toBeFalsy(); + const lines = (spaceResult.output as string).split('\n'); + expect(lines).toContain('foo'); + expect(lines).not.toContain('foo '); + }); + it('preserves escaped comma as a literal brace arm, matching rg', async () => { // rg treats `{\,,a}.ts` as two arms: `\,` (literal comma) and `a`. // It matches `,.ts` and `a.ts`. The naive split on `,` would break