From 28fc96bfadfc8f1077b8356bc819e4196e240e44 Mon Sep 17 00:00:00 2001 From: kccarlos <110118511+kccarlos@users.noreply.github.com> Date: Sun, 7 Sep 2025 01:05:11 -0700 Subject: [PATCH 1/6] feat: status bar message during token counting --- src/web/src/App.tsx | 50 ++++++++++++++++++++++++++++++++-- src/web/src/types/appStatus.ts | 2 +- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/src/web/src/App.tsx b/src/web/src/App.tsx index 54cb0c1..4cdc9ce 100644 --- a/src/web/src/App.tsx +++ b/src/web/src/App.tsx @@ -25,6 +25,7 @@ import { debounce } from './utils/debounce' function App() { const [appStatus, setAppStatus] = useState({ state: 'IDLE' }) + // note: we will temporarily set task='tokens' while counting, see effect below const [currentDir, setCurrentDir] = useState(null) const { @@ -377,6 +378,7 @@ function App() { // Selected files token counts come from hook; compute extras for file tree and assemble total const [fileTreeTokens, setFileTreeTokens] = useState(0) const [treeFilter, setTreeFilter] = useState('') + const [treeTokensBusy, setTreeTokensBusy] = useState(false) function generateSelectedTreeString(paths: string[]): string { // Build a minimal tree of selected files only @@ -449,12 +451,17 @@ function App() { ;(async () => { if (!includeFileTree) { setFileTreeTokens(0) + setTreeTokensBusy(false) return } + setTreeTokensBusy(true) const list = Array.from(selectedPaths) const treeStr = generateSelectedTreeString(list) const n = await countTokens(treeStr) - if (!cancelled) setFileTreeTokens(n) + if (!cancelled) { + setFileTreeTokens(n) + setTreeTokensBusy(false) + } })() return () => { cancelled = true } // eslint-disable-next-line react-hooks/exhaustive-deps @@ -542,7 +549,10 @@ function App() { }, [fileTree]) // Total tokens for selected files (same hook used in SelectedFilesPanel) - const { total: selectedFilesTokensTotal } = useTokenCounts({ + const { + total: selectedFilesTokensTotal, + busy: selectedFilesTokensBusy, + } = useTokenCounts({ gitClient, baseRef: baseBranch, compareRef: compareBranch, @@ -550,6 +560,42 @@ function App() { statusByPath, diffContextLines, }) + + // ---- Status bar integration for token counting --------------------------------- + // Show an indeterminate status while we (re)count tokens, but don't override other LOADING tasks. + useEffect(() => { + const anotherTaskLoading = + appStatus.state === 'LOADING' && + 'task' in appStatus && + appStatus.task !== 'tokens' + + const tokenWorkActive = selectedFilesTokensBusy || treeTokensBusy + if (tokenWorkActive) { + // Only show token counting status when a repository is loaded + if (!anotherTaskLoading && currentDir !== null) { + const files = selectedPaths.size + const msg = + files > 0 + ? `Counting tokens for ${files.toLocaleString()} selected file${files === 1 ? '' : 's'}…` + : 'Counting tokens…' + setAppStatus({ + state: 'LOADING', + task: 'tokens', + message: msg, + progress: 'indeterminate', + }) + try { console.info('[app-status]', { state: 'LOADING', task: 'tokens', message: msg, progress: 'indeterminate' }) } catch {} + } + } else { + // only clear if we were the ones showing the tokens task AND a repository is loaded + if (appStatus.state === 'LOADING' && 'task' in appStatus && appStatus.task === 'tokens' && currentDir !== null) { + setAppStatus({ state: 'READY', message: 'Token counts updated.' }) + try { console.info('[app-status]', { state: 'READY', message: 'Token counts updated.' }) } catch {} + } + } + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [selectedFilesTokensBusy, treeTokensBusy, selectedPaths.size, currentDir]) + const headerRight = ( Date: Sun, 7 Sep 2025 01:11:05 -0700 Subject: [PATCH 2/6] feat: token counting progress --- src/web/src/App.tsx | 35 ++++++++++++++++++++++------- src/web/src/hooks/useTokenCounts.ts | 18 ++++++++++++++- 2 files changed, 44 insertions(+), 9 deletions(-) diff --git a/src/web/src/App.tsx b/src/web/src/App.tsx index 4cdc9ce..fc1ae74 100644 --- a/src/web/src/App.tsx +++ b/src/web/src/App.tsx @@ -379,6 +379,7 @@ function App() { const [fileTreeTokens, setFileTreeTokens] = useState(0) const [treeFilter, setTreeFilter] = useState('') const [treeTokensBusy, setTreeTokensBusy] = useState(false) + const [tokenProgress, setTokenProgress] = useState(0) // 0..100 for selected-files counting function generateSelectedTreeString(paths: string[]): string { // Build a minimal tree of selected files only @@ -559,8 +560,18 @@ function App() { selectedPaths, statusByPath, diffContextLines, + onBatch: (done, total) => { + // Clamp and convert to 0..100; handle total=0 safely. + const pct = total <= 0 ? 100 : Math.max(0, Math.min(100, Math.round((done / total) * 100))) + setTokenProgress(pct) + }, }) + // Reset numeric progress when we begin a new selected-files counting pass + useEffect(() => { + if (selectedFilesTokensBusy) setTokenProgress(0) + }, [selectedFilesTokensBusy]) + // ---- Status bar integration for token counting --------------------------------- // Show an indeterminate status while we (re)count tokens, but don't override other LOADING tasks. useEffect(() => { @@ -570,21 +581,29 @@ function App() { appStatus.task !== 'tokens' const tokenWorkActive = selectedFilesTokensBusy || treeTokensBusy + // Build a combined percent so the bar advances smoothly. + // Reserve 85% for selected-files counting, and 15% for the (quick) file-tree tokenization. + const withTree = includeFileTree + const selectedWeight = withTree ? 85 : 100 + const treeWeight = withTree ? 15 : 0 + const selectedPortion = Math.round((Math.max(0, Math.min(100, tokenProgress)) * selectedWeight) / 100) + const treePortion = treeTokensBusy ? 0 : treeWeight + const overallPercent = Math.max(0, Math.min(100, selectedPortion + treePortion)) + if (tokenWorkActive) { // Only show token counting status when a repository is loaded if (!anotherTaskLoading && currentDir !== null) { const files = selectedPaths.size - const msg = - files > 0 - ? `Counting tokens for ${files.toLocaleString()} selected file${files === 1 ? '' : 's'}…` - : 'Counting tokens…' + const msg = files > 0 + ? `Counting tokens for ${files.toLocaleString()} selected file${files === 1 ? '' : 's'}…` + : 'Counting tokens…' setAppStatus({ state: 'LOADING', task: 'tokens', - message: msg, - progress: 'indeterminate', + message: `${msg} ${overallPercent}%`, + progress: overallPercent, }) - try { console.info('[app-status]', { state: 'LOADING', task: 'tokens', message: msg, progress: 'indeterminate' }) } catch {} + try { console.info('[app-status]', { state: 'LOADING', task: 'tokens', message: `${msg} ${overallPercent}%`, progress: overallPercent }) } catch {} } } else { // only clear if we were the ones showing the tokens task AND a repository is loaded @@ -594,7 +613,7 @@ function App() { } } // eslint-disable-next-line react-hooks/exhaustive-deps - }, [selectedFilesTokensBusy, treeTokensBusy, selectedPaths.size, currentDir]) + }, [selectedFilesTokensBusy, treeTokensBusy, selectedPaths.size, tokenProgress, includeFileTree, currentDir]) const headerRight = ( diffContextLines: number tokenizer?: TokenizerEngine + onBatch?: (completed: number, total: number) => void } -export function useTokenCounts({ gitClient, baseRef, compareRef, selectedPaths, statusByPath, diffContextLines, tokenizer }: Args) { +export function useTokenCounts({ gitClient, baseRef, compareRef, selectedPaths, statusByPath, diffContextLines, tokenizer, onBatch }: Args) { const [counts, setCounts] = useState(new Map()) const [busy, setBusy] = useState(false) const tok: TokenizerEngine = useMemo(() => tokenizer ?? createTokenizer(), [tokenizer]) @@ -28,11 +29,17 @@ export function useTokenCounts({ gitClient, baseRef, compareRef, selectedPaths, async function run() { if (!gitClient || !baseRef || !compareRef) { setCounts(new Map()) + // If caller wants progress, mark as "complete" when there's nothing to do. + try { onBatch?.(1, 1) } catch {} return } setBusy(true) try { const next = new Map() + const totalFiles = selectedList.length + let completed = 0 + // initial tick + try { onBatch?.(totalFiles === 0 ? 1 : 0, totalFiles === 0 ? 1 : totalFiles) } catch {} // Limit concurrent requests to prevent overwhelming the worker const BATCH_SIZE = 10 @@ -71,10 +78,19 @@ export function useTokenCounts({ gitClient, baseRef, compareRef, selectedPaths, next.set(path, n) }), ) + // batch finished; advance progress + completed += batch.length + try { + onBatch?.(Math.min(completed, totalFiles), totalFiles || 1) + } catch {} } if (!cancelled) setCounts(next) } finally { if (!cancelled) setBusy(false) + // ensure we always end at 100% + try { + onBatch?.(selectedList.length || 1, selectedList.length || 1) + } catch {} } } run() From 62d2c56a5ea0463202211f4484f5920911f8f24c Mon Sep 17 00:00:00 2001 From: kccarlos <110118511+kccarlos@users.noreply.github.com> Date: Sun, 7 Sep 2025 01:28:29 -0700 Subject: [PATCH 3/6] fix: single token counting pass app wide --- src/web/src/App.tsx | 179 +++++++++++------- src/web/src/components/SelectedFilesPanel.tsx | 11 +- src/web/src/context/TokenCountsContext.tsx | 61 ++++++ 3 files changed, 170 insertions(+), 81 deletions(-) create mode 100644 src/web/src/context/TokenCountsContext.tsx diff --git a/src/web/src/App.tsx b/src/web/src/App.tsx index fc1ae74..220470c 100644 --- a/src/web/src/App.tsx +++ b/src/web/src/App.tsx @@ -1,4 +1,4 @@ -import { useEffect, useMemo, useRef, useState } from 'react' +import { useCallback, useEffect, useMemo, useRef, useState } from 'react' import './App.css' import { Folder, ChevronsDown, ChevronsUp, CheckSquare, Square, Copy, Sun, Moon, ArrowLeftRight, MessageCircleMore, FolderGit2, ListChecks } from 'lucide-react' import BrowserSupportGate from './components/BrowserSupportGate' @@ -19,7 +19,8 @@ import type { ModelInfo } from './types/models' import type { AppStatus } from './types/appStatus' import { buildUnifiedDiffForStatus } from './utils/diff' import { countTokens } from './utils/tokenizer' -import { useTokenCounts } from './hooks/useTokenCounts' +// Globally shared token counts +import { TokenCountsProvider, useTokenCountsContext } from './context/TokenCountsContext' import { logError } from './utils/logger' import { debounce } from './utils/debounce' @@ -379,7 +380,7 @@ function App() { const [fileTreeTokens, setFileTreeTokens] = useState(0) const [treeFilter, setTreeFilter] = useState('') const [treeTokensBusy, setTreeTokensBusy] = useState(false) - const [tokenProgress, setTokenProgress] = useState(0) // 0..100 for selected-files counting + // (moved into TokenCountsContext) function generateSelectedTreeString(paths: string[]): string { // Build a minimal tree of selected files only @@ -549,71 +550,11 @@ function App() { return m }, [fileTree]) - // Total tokens for selected files (same hook used in SelectedFilesPanel) - const { - total: selectedFilesTokensTotal, - busy: selectedFilesTokensBusy, - } = useTokenCounts({ - gitClient, - baseRef: baseBranch, - compareRef: compareBranch, - selectedPaths, - statusByPath, - diffContextLines, - onBatch: (done, total) => { - // Clamp and convert to 0..100; handle total=0 safely. - const pct = total <= 0 ? 100 : Math.max(0, Math.min(100, Math.round((done / total) * 100))) - setTokenProgress(pct) - }, - }) + // Token counting is now provided globally via . - // Reset numeric progress when we begin a new selected-files counting pass - useEffect(() => { - if (selectedFilesTokensBusy) setTokenProgress(0) - }, [selectedFilesTokensBusy]) + // (progress is handled in the TokenCountsProvider; no local reset needed) - // ---- Status bar integration for token counting --------------------------------- - // Show an indeterminate status while we (re)count tokens, but don't override other LOADING tasks. - useEffect(() => { - const anotherTaskLoading = - appStatus.state === 'LOADING' && - 'task' in appStatus && - appStatus.task !== 'tokens' - - const tokenWorkActive = selectedFilesTokensBusy || treeTokensBusy - // Build a combined percent so the bar advances smoothly. - // Reserve 85% for selected-files counting, and 15% for the (quick) file-tree tokenization. - const withTree = includeFileTree - const selectedWeight = withTree ? 85 : 100 - const treeWeight = withTree ? 15 : 0 - const selectedPortion = Math.round((Math.max(0, Math.min(100, tokenProgress)) * selectedWeight) / 100) - const treePortion = treeTokensBusy ? 0 : treeWeight - const overallPercent = Math.max(0, Math.min(100, selectedPortion + treePortion)) - - if (tokenWorkActive) { - // Only show token counting status when a repository is loaded - if (!anotherTaskLoading && currentDir !== null) { - const files = selectedPaths.size - const msg = files > 0 - ? `Counting tokens for ${files.toLocaleString()} selected file${files === 1 ? '' : 's'}…` - : 'Counting tokens…' - setAppStatus({ - state: 'LOADING', - task: 'tokens', - message: `${msg} ${overallPercent}%`, - progress: overallPercent, - }) - try { console.info('[app-status]', { state: 'LOADING', task: 'tokens', message: `${msg} ${overallPercent}%`, progress: overallPercent }) } catch {} - } - } else { - // only clear if we were the ones showing the tokens task AND a repository is loaded - if (appStatus.state === 'LOADING' && 'task' in appStatus && appStatus.task === 'tokens' && currentDir !== null) { - setAppStatus({ state: 'READY', message: 'Token counts updated.' }) - try { console.info('[app-status]', { state: 'READY', message: 'Token counts updated.' }) } catch {} - } - } - // eslint-disable-next-line react-hooks/exhaustive-deps - }, [selectedFilesTokensBusy, treeTokensBusy, selectedPaths.size, tokenProgress, includeFileTree, currentDir]) + // (moved status-bar tie-in to a small bridge component below) const headerRight = ( lower.endsWith(ext)) } + // Small helper: use context to feed TokenUsage without prop-drilling + function TokenUsageWithContext({ + filesCount, + instructionsTokens, + fileTreeTokens, + limit, + }: { + filesCount: number + instructionsTokens: number + fileTreeTokens: number + limit: number + }) { + const { total } = useTokenCountsContext() + const src = useCallback(() => total, [total]) + return ( + + ) + } + + // Bridge: keeps your StatusBar messages/progress exactly as before, now fed by the context. + function TokenCountingStatusBridge({ + includeTree, + treeBusy, + }: { + includeTree: boolean + treeBusy: boolean + }) { + const { busy, progress } = useTokenCountsContext() + useEffect(() => { + const anotherTaskLoading = + appStatus.state === 'LOADING' && 'task' in appStatus && appStatus.task !== 'tokens' + + const tokenWorkActive = busy || treeBusy + const selectedWeight = includeTree ? 85 : 100 + const treeWeight = includeTree ? 15 : 0 + const selectedPortion = Math.round((Math.max(0, Math.min(100, progress.percent)) * selectedWeight) / 100) + const treePortion = treeBusy ? 0 : treeWeight + const overallPercent = Math.max(0, Math.min(100, selectedPortion + treePortion)) + + if (tokenWorkActive) { + if (!anotherTaskLoading && currentDir !== null) { + const files = selectedPaths.size + const msg = + files > 0 + ? `Counting tokens for ${files.toLocaleString()} selected file${files === 1 ? '' : 's'}…` + : 'Counting tokens…' + setAppStatus({ + state: 'LOADING', + task: 'tokens', + message: `${msg} ${overallPercent}%`, + progress: overallPercent, + }) + try { + console.info('[app-status]', { + state: 'LOADING', + task: 'tokens', + message: `${msg} ${overallPercent}%`, + progress: overallPercent, + }) + } catch {} + } + } else { + if ( + appStatus.state === 'LOADING' && + 'task' in appStatus && + appStatus.task === 'tokens' && + currentDir !== null + ) { + setAppStatus({ state: 'READY', message: 'Token counts updated.' }) + try { + console.info('[app-status]', { state: 'READY', message: 'Token counts updated.' }) + } catch {} + } + } + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [busy, treeBusy, progress.percent, includeTree, currentDir, selectedPaths.size]) + return null + } + return ( +
@@ -1156,9 +1190,8 @@ function App() {
- {/* Token usage summary */} - selectedFilesTokensTotal} + {/* Token usage summary (fed from global context) */} + toggleSelect(path)} onPreview={(path, status) => previewFile(path, status)} refreshing={repoStatus.state === 'loading'} @@ -1224,6 +1253,10 @@ function App() {
)} + + {/* Keep StatusBar updates synchronized with global token counting */} + +
) } diff --git a/src/web/src/components/SelectedFilesPanel.tsx b/src/web/src/components/SelectedFilesPanel.tsx index 4c0b595..d7ed653 100644 --- a/src/web/src/components/SelectedFilesPanel.tsx +++ b/src/web/src/components/SelectedFilesPanel.tsx @@ -1,7 +1,6 @@ import { useMemo, useState } from 'react' import { ArrowUpDown, Search, X, FilePenLine, FilePlus2, FileMinus2, File as FileIcon, FileArchive } from 'lucide-react' -import type { GitWorkerClient } from '../utils/gitWorkerClient' -import { useTokenCounts } from '../hooks/useTokenCounts' +import { useTokenCountsContext } from '../context/TokenCountsContext' import type { FileDiffStatus } from '../hooks/useFileTree' type SelectedEntry = { @@ -15,21 +14,17 @@ type SelectedEntry = { type SortKey = 'tokens-desc' | 'tokens-asc' | 'name-asc' | 'name-desc' type Props = { - gitClient: GitWorkerClient | null - baseRef: string - compareRef: string selectedPaths: Set statusByPath: Map - diffContextLines: number onUnselect: (path: string) => void onPreview: (path: string, status: FileDiffStatus) => void refreshing?: boolean filterText?: string } -export function SelectedFilesPanel({ gitClient, baseRef, compareRef, selectedPaths, statusByPath, diffContextLines, onUnselect, onPreview, refreshing, filterText }: Props) { +export function SelectedFilesPanel({ selectedPaths, statusByPath, onUnselect, onPreview, refreshing, filterText }: Props) { const [sortKey, setSortKey] = useState('tokens-desc') - const { counts, busy } = useTokenCounts({ gitClient, baseRef, compareRef, selectedPaths, statusByPath, diffContextLines }) + const { counts, busy } = useTokenCountsContext() const effectiveBusy = !!refreshing || busy const items = useMemo(() => { diff --git a/src/web/src/context/TokenCountsContext.tsx b/src/web/src/context/TokenCountsContext.tsx new file mode 100644 index 0000000..cfda8e0 --- /dev/null +++ b/src/web/src/context/TokenCountsContext.tsx @@ -0,0 +1,61 @@ +import React, { createContext, useContext, useMemo, useState } from 'react' +import type { GitEngine } from '../platform/types' +import type { FileDiffStatus } from '../hooks/useFileTree' +import { useTokenCounts } from '../hooks/useTokenCounts' + +type ProgressState = { completed: number; total: number; percent: number } +type Ctx = { + counts: Map + total: number + busy: boolean + progress: ProgressState +} + +const TokenCountsContext = createContext(undefined) + +type ProviderProps = { + gitClient: GitEngine | null + baseRef: string + compareRef: string + selectedPaths: Set + statusByPath: Map + diffContextLines: number + children: React.ReactNode +} + +export function TokenCountsProvider({ + gitClient, + baseRef, + compareRef, + selectedPaths, + statusByPath, + diffContextLines, + children, +}: ProviderProps) { + const [progress, setProgress] = useState({ completed: 0, total: 0, percent: 0 }) + + const { counts, total, busy } = useTokenCounts({ + gitClient, + baseRef, + compareRef, + selectedPaths, + statusByPath, + diffContextLines, + onBatch: (done, totalFiles) => { + const pct = + totalFiles <= 0 + ? 100 + : Math.max(0, Math.min(100, Math.round((done / totalFiles) * 100))) + setProgress({ completed: done, total: totalFiles, percent: pct }) + }, + }) + + const value = useMemo(() => ({ counts, total, busy, progress }), [counts, total, busy, progress]) + return {children} +} + +export function useTokenCountsContext(): Ctx { + const v = useContext(TokenCountsContext) + if (!v) throw new Error('useTokenCountsContext must be used within a TokenCountsProvider') + return v +} From e9302a2b1341c5ff863c8bd92c195f4f479a12cf Mon Sep 17 00:00:00 2001 From: kccarlos <110118511+kccarlos@users.noreply.github.com> Date: Sun, 7 Sep 2025 01:39:41 -0700 Subject: [PATCH 4/6] fix: file tree not updated after workspace switch --- src/web/src/App.tsx | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/web/src/App.tsx b/src/web/src/App.tsx index 220470c..d1e9736 100644 --- a/src/web/src/App.tsx +++ b/src/web/src/App.tsx @@ -522,6 +522,13 @@ function App() { // Keep a local mirror of currentDir for legacy display setCurrentDir(repoDir) }, [repoDir]) + + // Invalidate the memoized diff key whenever the underlying repository changes. + // Without this, switching to a different workspace with the same branch names + // (e.g. __WORKDIR__ → main) won't trigger a recompute and the tree stays stale. + useEffect(() => { + lastDiffKeyRef.current = '' + }, [repoDir, gitClient]) // Keep the main UI visible during refresh; only hide when no project selected @@ -1003,7 +1010,7 @@ function App() { Date: Sun, 7 Sep 2025 01:57:44 -0700 Subject: [PATCH 5/6] fix: binary file handling --- src/electron/workers/nodeGitWorker.ts | 27 +++++++ src/web/src/App.tsx | 40 ++++++---- src/web/src/components/FileTreeView.tsx | 3 +- src/web/src/components/SelectedFilesPanel.tsx | 8 +- src/web/src/context/TokenCountsContext.tsx | 3 + src/web/src/hooks/useFileTree.ts | 8 +- src/web/src/hooks/useTokenCounts.ts | 80 +++++++++++++------ src/web/src/utils/binary.ts | 18 +++++ src/web/src/workers/gitWorker.ts | 29 +++++++ 9 files changed, 167 insertions(+), 49 deletions(-) create mode 100644 src/web/src/utils/binary.ts diff --git a/src/electron/workers/nodeGitWorker.ts b/src/electron/workers/nodeGitWorker.ts index faa923e..0ee6929 100644 --- a/src/electron/workers/nodeGitWorker.ts +++ b/src/electron/workers/nodeGitWorker.ts @@ -17,6 +17,19 @@ const blobCache = new LRUCache let blobCacheHits = 0 const gitCache: Record = Object.create(null) const WORKDIR = '__WORKDIR__' +const BINARY_EXTS = [ + '.png','.jpg','.jpeg','.gif','.bmp','.webp','.ico', + '.pdf','.zip','.rar','.7z','.tar','.gz','.tgz', + '.mp3','.wav','.flac', + '.mp4','.mov','.avi','.mkv','.webm', + '.exe','.dll','.bin','.dmg','.pkg','.iso', + '.woff','.woff2','.ttf','.otf', + '.svg' +] +function isBinaryPathLocal(p: string): boolean { + const lower = p.toLowerCase() + return BINARY_EXTS.some(ext => lower.endsWith(ext)) +} // Helper function to parse packed-refs async function parsePackedRefs(repoPath: string): Promise { @@ -127,6 +140,20 @@ parentPort?.on('message', async (m: Msg) => { return } case 'readFile': { + // Fast path: known-binary extension => no content read + if (isBinaryPathLocal(m.filepath)) { + if (m.ref !== WORKDIR) { + ok(m.id, { binary: true, text: null, notFound: false }); return + } + // WORKDIR existence without reading file + const fileAbs = path.join(repoPath, m.filepath) + const exists = await fs.promises + .stat(fileAbs) + .then(() => true) + .catch(() => false) + ok(m.id, { binary: exists, text: null, notFound: !exists }) + return + } if (m.ref !== WORKDIR) { const commitOid = await git.resolveRef({ fs, dir: repoPath, ref: m.ref }).catch(() => null as any) if (!commitOid) { ok(m.id, { binary: false, text: null, notFound: true }); return } diff --git a/src/web/src/App.tsx b/src/web/src/App.tsx index d1e9736..3079538 100644 --- a/src/web/src/App.tsx +++ b/src/web/src/App.tsx @@ -21,6 +21,7 @@ import { buildUnifiedDiffForStatus } from './utils/diff' import { countTokens } from './utils/tokenizer' // Globally shared token counts import { TokenCountsProvider, useTokenCountsContext } from './context/TokenCountsContext' +import { isBinaryPath } from './utils/binary' import { logError } from './utils/logger' import { debounce } from './utils/debounce' @@ -580,6 +581,11 @@ function App() { async function previewFile(path: string, status: FileDiffStatus): Promise { if (!gitClient) return + // Hard guard: no previews for binary files (prevents heavy reads) + if (isBinaryPath(path)) { + setNotif('Binary file preview is not supported.') + return + } try { const toFetchBase = status !== 'add' const toFetchCompare = status !== 'remove' @@ -589,6 +595,13 @@ function App() { toFetchCompare && compareBranch ? gitClient.readFile(compareBranch, path) : Promise.resolve(undefined), ]) + // If worker reports binary, bail out as well + const baseBin = (baseRes as any)?.binary + const compareBin = (compareRes as any)?.binary + if (baseBin || compareBin) { + setNotif('Binary file preview is not supported.') + return + } setPreviewPath(path) setPreviewStatus(status) setPreviewData({ @@ -654,11 +667,19 @@ function App() { // File sections const fileSections: string[] = [] const includeBinaryNow = (includeBinaryCheckboxRef.current?.checked ?? includeBinaryAsPathsRef.current) - const pathsToProcess = includeBinaryNow ? selected : selected.filter((p) => !isLikelyBinaryPath(p)) + const pathsToProcess = includeBinaryNow ? selected : selected.filter((p) => !isBinaryPath(p)) const fileReadPromises = pathsToProcess.map((path) => { const status = statusByPath.get(path) ?? 'unchanged' const needBase = status !== 'add' const needCompare = status !== 'remove' + // Avoid heavy reads for binary paths — we only emit a header line + if (isBinaryPath(path)) { + return Promise.resolve({ + path, status, + baseRes: { binary: true, text: null }, + compareRes: { binary: true, text: null }, + }) + } return Promise.all([ needBase ? gitClient.readFile(baseBranch, path) : Promise.resolve(undefined), needCompare ? gitClient.readFile(compareBranch, path) : Promise.resolve(undefined), @@ -666,7 +687,7 @@ function App() { }) const fileContents = await Promise.all(fileReadPromises) for (const { path, status, baseRes, compareRes } of fileContents) { - const isBinary = (baseRes as { binary?: boolean } | undefined)?.binary || (compareRes as { binary?: boolean } | undefined)?.binary || isLikelyBinaryPath(path) + const isBinary = (baseRes as { binary?: boolean } | undefined)?.binary || (compareRes as { binary?: boolean } | undefined)?.binary || isBinaryPath(path) const header = `## FILE: ${path} (${status.toUpperCase()})\n\n` if (isBinary) { // When we filtered out likely-binary paths earlier and still hit binary here (e.g. unknown ext), @@ -741,17 +762,7 @@ function App() { if (lower.endsWith('.html') || lower.endsWith('.htm')) return 'html' return '' } - function isLikelyBinaryPath(p: string): boolean { - const lower = p.toLowerCase() - const binaryExts = [ - '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.webp', '.ico', - '.pdf', '.zip', '.rar', '.7z', '.tar', '.gz', '.tgz', - '.mp3', '.wav', '.flac', '.mp4', '.mov', '.avi', '.mkv', '.webm', - '.exe', '.dll', '.bin', '.dmg', '.pkg', '.iso', - '.woff', '.woff2', '.ttf', '.otf' - ] - return binaryExts.some((ext) => lower.endsWith(ext)) - } + // binary detection now centralized in utils/binary.ts (isBinaryPath) // Small helper: use context to feed TokenUsage without prop-drilling function TokenUsageWithContext({ @@ -847,6 +858,7 @@ function App() { selectedPaths={selectedPaths} statusByPath={statusByPath} diffContextLines={diffContextLines} + includeBinaryPaths={includeBinaryAsPaths} >
@@ -1135,7 +1147,7 @@ function App() { const curr = Array.from(selectedPathsRef.current) const removed: string[] = [] for (const p of curr) { - if (isLikelyBinaryPath(p)) { + if (isBinaryPath(p)) { removed.push(p) toggleSelect(p) } diff --git a/src/web/src/components/FileTreeView.tsx b/src/web/src/components/FileTreeView.tsx index cae2644..abc5498 100644 --- a/src/web/src/components/FileTreeView.tsx +++ b/src/web/src/components/FileTreeView.tsx @@ -151,8 +151,9 @@ export function FileTreeView({ type="button" onClick={() => onPreviewFile(node.path, st)} className="btn btn-ghost btn-icon ml-auto" - title="Preview" + title={node.isLikelyBinary ? 'Preview disabled for binary files' : 'Preview'} aria-label="Preview" + disabled={node.isLikelyBinary} > diff --git a/src/web/src/components/SelectedFilesPanel.tsx b/src/web/src/components/SelectedFilesPanel.tsx index d7ed653..0e76711 100644 --- a/src/web/src/components/SelectedFilesPanel.tsx +++ b/src/web/src/components/SelectedFilesPanel.tsx @@ -2,6 +2,7 @@ import { useMemo, useState } from 'react' import { ArrowUpDown, Search, X, FilePenLine, FilePlus2, FileMinus2, File as FileIcon, FileArchive } from 'lucide-react' import { useTokenCountsContext } from '../context/TokenCountsContext' import type { FileDiffStatus } from '../hooks/useFileTree' +import { isBinaryPath } from '../utils/binary' type SelectedEntry = { path: string @@ -33,9 +34,7 @@ export function SelectedFilesPanel({ selectedPaths, statusByPath, onUnselect, on const st = statusByPath.get(path) ?? 'unchanged' const tokens = counts.get(path) ?? 0 const name = path.includes('/') ? path.slice(path.lastIndexOf('/') + 1) : path - const lower = path.toLowerCase() - const exts = ['.png','.jpg','.jpeg','.gif','.webp','.svg','.ico','.pdf','.zip','.gz','.tgz','.rar','.7z','.mp4','.mp3','.wav','.mov','.avi','.mkv','.woff','.woff2','.ttf'] - const isLikelyBinary = exts.some((e) => lower.endsWith(e)) + const isLikelyBinary = isBinaryPath(path) entries.push({ path, name, status: st, tokens, isLikelyBinary }) } const q = (filterText || '').trim().toLowerCase() @@ -123,9 +122,10 @@ export function SelectedFilesPanel({ selectedPaths, statusByPath, onUnselect, on diff --git a/src/web/src/context/TokenCountsContext.tsx b/src/web/src/context/TokenCountsContext.tsx index cfda8e0..2adf632 100644 --- a/src/web/src/context/TokenCountsContext.tsx +++ b/src/web/src/context/TokenCountsContext.tsx @@ -20,6 +20,7 @@ type ProviderProps = { selectedPaths: Set statusByPath: Map diffContextLines: number + includeBinaryPaths?: boolean children: React.ReactNode } @@ -30,6 +31,7 @@ export function TokenCountsProvider({ selectedPaths, statusByPath, diffContextLines, + includeBinaryPaths = true, children, }: ProviderProps) { const [progress, setProgress] = useState({ completed: 0, total: 0, percent: 0 }) @@ -41,6 +43,7 @@ export function TokenCountsProvider({ selectedPaths, statusByPath, diffContextLines, + includeBinaryPaths, onBatch: (done, totalFiles) => { const pct = totalFiles <= 0 diff --git a/src/web/src/hooks/useFileTree.ts b/src/web/src/hooks/useFileTree.ts index f7a7958..9fa5b3a 100644 --- a/src/web/src/hooks/useFileTree.ts +++ b/src/web/src/hooks/useFileTree.ts @@ -1,6 +1,7 @@ import { useCallback, useState } from 'react' import type { GitWorkerClient } from '../utils/gitWorkerClient' import type { AppStatus } from '../types/appStatus' +import { isBinaryPath } from '../utils/binary' export type FileDiffStatus = 'modify' | 'add' | 'remove' | 'unchanged' @@ -39,12 +40,7 @@ export function useFileTree(setAppStatus?: (s: AppStatus) => void) { return node } - const likelyBinary = (p: string): boolean => { - const lower = p.toLowerCase() - // Heuristic similar to App.tsx; keep in sync - const exts = ['.png','.jpg','.jpeg','.gif','.webp','.svg','.ico','.pdf','.zip','.gz','.tgz','.rar','.7z','.mp4','.mp3','.wav','.mov','.avi','.mkv','.woff','.woff2','.ttf'] - return exts.some((e) => lower.endsWith(e)) - } + const likelyBinary = (p: string): boolean => isBinaryPath(p) for (const fullPath of allPaths) { const parts = fullPath.split('/') diff --git a/src/web/src/hooks/useTokenCounts.ts b/src/web/src/hooks/useTokenCounts.ts index 0a6b94f..16e5ca1 100644 --- a/src/web/src/hooks/useTokenCounts.ts +++ b/src/web/src/hooks/useTokenCounts.ts @@ -3,6 +3,7 @@ import type { GitEngine, TokenizerEngine } from '../platform/types' import { createTokenizer } from '../platform/tokenizerFactory' import type { FileDiffStatus } from './useFileTree' import { buildUnifiedDiffForStatus } from '../utils/diff' +import { isBinaryPath } from '../utils/binary' export type TokenCounts = Map @@ -13,11 +14,22 @@ type Args = { selectedPaths: Set statusByPath: Map diffContextLines: number + includeBinaryPaths?: boolean tokenizer?: TokenizerEngine onBatch?: (completed: number, total: number) => void } -export function useTokenCounts({ gitClient, baseRef, compareRef, selectedPaths, statusByPath, diffContextLines, tokenizer, onBatch }: Args) { +export function useTokenCounts({ + gitClient, + baseRef, + compareRef, + selectedPaths, + statusByPath, + diffContextLines, + includeBinaryPaths = true, + tokenizer, + onBatch, +}: Args) { const [counts, setCounts] = useState(new Map()) const [busy, setBusy] = useState(false) const tok: TokenizerEngine = useMemo(() => tokenizer ?? createTokenizer(), [tokenizer]) @@ -49,31 +61,51 @@ export function useTokenCounts({ gitClient, baseRef, compareRef, selectedPaths, const batch = selectedList.slice(i, i + BATCH_SIZE) await Promise.all( batch.map(async (path) => { - const status = statusByPath.get(path) ?? 'unchanged' - const needBase = status !== 'add' - const needCompare = status !== 'remove' - const [baseRes, compareRes] = await Promise.all([ - needBase && baseRef ? gitClient.readFile(baseRef, path) : Promise.resolve(undefined as any), - needCompare && compareRef ? gitClient.readFile(compareRef, path) : Promise.resolve(undefined as any), - ]) - // Mirror final output generation logic - const MAX_CONTEXT = 999 - const ctx = diffContextLines >= MAX_CONTEXT ? Number.MAX_SAFE_INTEGER : diffContextLines - let textForCount = '' - if (status === 'modify' || status === 'add' || status === 'remove') { - const isBinary = Boolean((baseRes as any)?.binary) || Boolean((compareRes as any)?.binary) - if (isBinary) { - textForCount = '' - } else if (status === 'add' && ctx === Number.MAX_SAFE_INTEGER) { - textForCount = (compareRes as { text?: string } | undefined)?.text ?? '' + const status = statusByPath.get(path) ?? 'unchanged' + const looksBinary = isBinaryPath(path) + let textForCount = '' + + // Fast path: known-binary files never load content + if (looksBinary) { + if (includeBinaryPaths) { + // Mirror the exact header we output during copy + const header = `## FILE: ${path} (${(status || 'unchanged').toUpperCase()})\n\n` + textForCount = header + } else { + textForCount = '' + } } else { - textForCount = buildUnifiedDiffForStatus(status, path, baseRes as any, compareRes as any, { context: ctx }) || '' + // Textual path -> maybe fetch content/diff + const needBase = status !== 'add' + const needCompare = status !== 'remove' + const [baseRes, compareRes] = await Promise.all([ + needBase && baseRef ? gitClient.readFile(baseRef, path) : Promise.resolve(undefined as any), + needCompare && compareRef ? gitClient.readFile(compareRef, path) : Promise.resolve(undefined as any), + ]) + // Mirror final output generation logic + const MAX_CONTEXT = 999 + const ctx = diffContextLines >= MAX_CONTEXT ? Number.MAX_SAFE_INTEGER : diffContextLines + if (status === 'modify' || status === 'add' || status === 'remove') { + const isBinary = Boolean((baseRes as any)?.binary) || Boolean((compareRes as any)?.binary) + if (isBinary) { + // Edge: unknown ext but worker says binary; treat same as looksBinary + if (includeBinaryPaths) { + const header = `## FILE: ${path} (${(status || 'unchanged').toUpperCase()})\n\n` + textForCount = header + } else { + textForCount = '' + } + } else if (status === 'add' && ctx === Number.MAX_SAFE_INTEGER) { + textForCount = (compareRes as { text?: string } | undefined)?.text ?? '' + } else { + textForCount = buildUnifiedDiffForStatus(status, path, baseRes as any, compareRes as any, { context: ctx }) || '' + } + } else { + const isBinary = Boolean((baseRes as any)?.binary) + const oldText = isBinary || (baseRes as any)?.notFound ? '' : (baseRes as any)?.text ?? '' + textForCount = oldText + } } - } else { - const isBinary = Boolean((baseRes as any)?.binary) - const oldText = isBinary || (baseRes as any)?.notFound ? '' : (baseRes as any)?.text ?? '' - textForCount = oldText - } const n = textForCount ? await tok.count(textForCount) : 0 next.set(path, n) }), diff --git a/src/web/src/utils/binary.ts b/src/web/src/utils/binary.ts new file mode 100644 index 0000000..ad42ca2 --- /dev/null +++ b/src/web/src/utils/binary.ts @@ -0,0 +1,18 @@ +// Centralized binary-file heuristics to keep UI, workers, and counters in sync. +// Note: We treat SVG as binary here for safety/perf (often very large). +const BINARY_EXTS = [ + '.png','.jpg','.jpeg','.gif','.bmp','.webp','.ico', + '.pdf','.zip','.rar','.7z','.tar','.gz','.tgz', + '.mp3','.wav','.flac', + '.mp4','.mov','.avi','.mkv','.webm', + '.exe','.dll','.bin','.dmg','.pkg','.iso', + '.woff','.woff2','.ttf','.otf', + '.svg' +] + +export function isBinaryPath(p: string): boolean { + const lower = p.toLowerCase() + return BINARY_EXTS.some(ext => lower.endsWith(ext)) +} + +export { BINARY_EXTS } diff --git a/src/web/src/workers/gitWorker.ts b/src/web/src/workers/gitWorker.ts index 2e00e79..6698555 100644 --- a/src/web/src/workers/gitWorker.ts +++ b/src/web/src/workers/gitWorker.ts @@ -16,6 +16,20 @@ import LightningFS from '@isomorphic-git/lightning-fs' import * as BufferModule from 'buffer' import ProcessModule from 'process' import * as GIT from 'isomorphic-git' +// Lightweight duplicate; workers can't import app utils directly. Keep in sync with utils/binary.ts. +const BINARY_EXTS = [ + '.png','.jpg','.jpeg','.gif','.bmp','.webp','.ico', + '.pdf','.zip','.rar','.7z','.tar','.gz','.tgz', + '.mp3','.wav','.flac', + '.mp4','.mov','.avi','.mkv','.webm', + '.exe','.dll','.bin','.dmg','.pkg','.iso', + '.woff','.woff2','.ttf','.otf', + '.svg' +] +function isBinaryPathLocal(p: string): boolean { + const lower = p.toLowerCase() + return BINARY_EXTS.some((ext) => lower.endsWith(ext)) +} ;(self as any).Buffer = (self as any).Buffer || (BufferModule as any).Buffer ;(self as any).process = (self as any).process || (ProcessModule as any) @@ -449,6 +463,21 @@ async function handleReadFile( ): Promise { if (!pfs) throw new Error('Repository is not initialized in worker') + // Skip heavy reads for known-binary extensions + if (isBinaryPathLocal(filepath)) { + if (ref === WORKDIR_SENTINEL) { + // Best-effort existence check without loading file contents + try { + await pfs.stat('/' + filepath) + return { id, type: 'ok', data: { binary: true, text: null, notFound: false } } + } catch { + return { id, type: 'ok', data: { binary: false, text: null, notFound: true } } + } + } + // For commit refs we assume existence (paths come from list/diff); avoid blob load + return { id, type: 'ok', data: { binary: true, text: null, notFound: false } } + } + // Read raw to detect binary first if (ref === WORKDIR_SENTINEL) { try { From 0b2e6d0393a2c216daf0f7f08427ac8bec552210 Mon Sep 17 00:00:00 2001 From: kccarlos <110118511+kccarlos@users.noreply.github.com> Date: Sun, 7 Sep 2025 02:09:28 -0700 Subject: [PATCH 6/6] fix: binary file detection --- src/electron/shared/binary.ts | 125 ++++++++++++++++++++++++++ src/electron/workers/nodeGitWorker.ts | 50 +++++------ src/web/src/shared/binary.ts | 125 ++++++++++++++++++++++++++ src/web/src/utils/binary.ts | 20 +---- src/web/src/workers/gitWorker.ts | 55 +++--------- 5 files changed, 285 insertions(+), 90 deletions(-) create mode 100644 src/electron/shared/binary.ts create mode 100644 src/web/src/shared/binary.ts diff --git a/src/electron/shared/binary.ts b/src/electron/shared/binary.ts new file mode 100644 index 0000000..f91a55f --- /dev/null +++ b/src/electron/shared/binary.ts @@ -0,0 +1,125 @@ +// Shared binary detection (web + desktop workers + UI). +// Strategy: +// 1) Extension hints (cheap). +// 2) Magic-byte signatures. +// 3) UTF-8 text heuristic on a small sample. +// 4) SVG special-case (XML-ish text). +// +// Keep fast and dependency-free. + +export const SNIFF_BYTES = 8192 + +// Pragmatic denylist; not authoritative, just an early-out. +const BINARY_EXTS = new Set([ + '.png','.jpg','.jpeg','.gif','.bmp','.webp','.ico', + '.pdf','.zip','.rar','.7z','.tar','.gz','.tgz', + '.mp3','.wav','.flac', + '.mp4','.mov','.avi','.mkv','.webm', + '.exe','.dll','.bin','.dmg','.pkg','.iso', + '.woff','.woff2','.ttf','.otf', + '.so','.dylib','.class','.jar', + '.psd','.ai','.sketch', + '.wasm', + // SVG is special-cased below; keep here so we early-out unless the content proves XML-ish + '.svg', +]) + +export function isBinaryPath(path: string): boolean { + const i = path.lastIndexOf('.') + if (i < 0) return false + const ext = path.slice(i).toLowerCase() + return BINARY_EXTS.has(ext) +} + +function startsWith(bytes: Uint8Array, ascii: string, offset = 0): boolean { + if (offset + ascii.length > bytes.length) return false + for (let i = 0; i < ascii.length; i++) { + if (bytes[offset + i] !== ascii.charCodeAt(i)) return false + } + return true +} + +// Spot common binary formats by signature (magic bytes). +export function hasBinaryMagic(bytes: Uint8Array): boolean { + const b = bytes + const len = b.length + if (len >= 8) { + // PNG + if ( + b[0] === 0x89 && b[1] === 0x50 && b[2] === 0x4E && b[3] === 0x47 && + b[4] === 0x0D && b[5] === 0x0A && b[6] === 0x1A && b[7] === 0x0A + ) return true + } + if (len >= 3) { + // JPEG + if (b[0] === 0xFF && b[1] === 0xD8 && b[2] === 0xFF) return true + } + // GIF + if (startsWith(b, 'GIF87a') || startsWith(b, 'GIF89a')) return true + // PDF + if (startsWith(b, '%PDF-')) return true + // ZIP (also covers many Office docs, apk, jar) + if (len >= 4 && b[0] === 0x50 && b[1] === 0x4B && (b[2] === 0x03 || b[2] === 0x05 || b[2] === 0x07) && (b[3] === 0x04 || b[3] === 0x06 || b[3] === 0x08)) return true + // GZIP + if (len >= 3 && b[0] === 0x1F && b[1] === 0x8B && b[2] === 0x08) return true + // MP3 (ID3) + if (startsWith(b, 'ID3')) return true + // MP4/ISO BMFF + if (len >= 12 && startsWith(b, 'ftyp', 4)) return true + // OGG + if (startsWith(b, 'OggS')) return true + // Matroska/WebM + if (len >= 4 && b[0] === 0x1A && b[1] === 0x45 && b[2] === 0xDF && b[3] === 0xA3) return true + // WOFF/WOFF2 + if (startsWith(b, 'wOFF') || startsWith(b, 'wOF2')) return true + // TTF/OTF + if (len >= 4 && ((b[0] === 0x00 && b[1] === 0x01 && b[2] === 0x00 && b[3] === 0x00) || startsWith(b, 'OTTO'))) return true + // Windows MZ / ELF + if (startsWith(b, 'MZ') || (len >= 4 && b[0] === 0x7F && b[1] === 0x45 && b[2] === 0x4C && b[3] === 0x46)) return true + return false +} + +// SVG often lives in repos as text; detect text-y SVG even if extension is .svg. +function isXmlSvgText(bytes: Uint8Array): boolean { + // Skip leading whitespace / BOM, then expect '<' + let i = 0 + while (i < bytes.length && (bytes[i] === 0xEF || bytes[i] === 0xBB || bytes[i] === 0xBF || bytes[i] <= 0x20)) i++ + if (i >= bytes.length || bytes[i] !== 0x3C /* '<' */) return false + // Look for " 13 && c < 32)) suspicious++ + } + return suspicious / n > 0.30 +} + +export function detectBinaryByContent(sample: Uint8Array, path?: string): boolean { + // path hint + if (path && isBinaryPath(path)) { + // Allow SVG override if it looks like XML + if (path.toLowerCase().endsWith('.svg') && isXmlSvgText(sample)) return false + return true + } + if (hasBinaryMagic(sample)) return true + // SVG override if no magic but looks like XML text + if (isXmlSvgText(sample)) return false + return looksBinaryHeuristic(sample) +} + +// Convenience: decide with or without sample. +export function shouldTreatAsBinary(path: string, sample?: Uint8Array): boolean { + if (!sample) return isBinaryPath(path) + return detectBinaryByContent(sample, path) +} + +export { BINARY_EXTS } diff --git a/src/electron/workers/nodeGitWorker.ts b/src/electron/workers/nodeGitWorker.ts index 0ee6929..00706f0 100644 --- a/src/electron/workers/nodeGitWorker.ts +++ b/src/electron/workers/nodeGitWorker.ts @@ -3,6 +3,7 @@ import * as fs from 'fs' import * as path from 'path' import * as git from 'isomorphic-git' import { LRUCache } from 'lru-cache' +import { isBinaryPath, detectBinaryByContent, SNIFF_BYTES } from '../shared/binary' type Msg = | { id: number; type: 'loadRepo'; repoPath: string } @@ -17,19 +18,6 @@ const blobCache = new LRUCache let blobCacheHits = 0 const gitCache: Record = Object.create(null) const WORKDIR = '__WORKDIR__' -const BINARY_EXTS = [ - '.png','.jpg','.jpeg','.gif','.bmp','.webp','.ico', - '.pdf','.zip','.rar','.7z','.tar','.gz','.tgz', - '.mp3','.wav','.flac', - '.mp4','.mov','.avi','.mkv','.webm', - '.exe','.dll','.bin','.dmg','.pkg','.iso', - '.woff','.woff2','.ttf','.otf', - '.svg' -] -function isBinaryPathLocal(p: string): boolean { - const lower = p.toLowerCase() - return BINARY_EXTS.some(ext => lower.endsWith(ext)) -} // Helper function to parse packed-refs async function parsePackedRefs(repoPath: string): Promise { @@ -58,14 +46,7 @@ function ok(id: number, data?: any) { send({ id, type: 'ok', data }) } function err(id: number, error: string) { send({ id, type: 'error', error }) } function progress(id: number, message: string) { send({ id, type: 'progress', message }) } -function looksBinary(buf: Buffer): boolean { - const len = Math.min(buf.length, 8000) - for (let i = 0; i < len; i++) { - const c = buf[i] - if (c === 0) return true - } - return false -} +// (content sniffing comes from shared helper) parentPort?.on('message', async (m: Msg) => { try { @@ -141,7 +122,7 @@ parentPort?.on('message', async (m: Msg) => { } case 'readFile': { // Fast path: known-binary extension => no content read - if (isBinaryPathLocal(m.filepath)) { + if (isBinaryPath(m.filepath)) { if (m.ref !== WORKDIR) { ok(m.id, { binary: true, text: null, notFound: false }); return } @@ -168,17 +149,32 @@ parentPort?.on('message', async (m: Msg) => { const res = await git.readBlob({ fs, dir: repoPath, oid: commitOid, filepath: m.filepath }).catch(() => null) if (!res) { ok(m.id, { binary: false, text: null, notFound: true }); return } const buf = Buffer.from(res.blob) - const binary = looksBinary(buf) + const sample = buf.subarray(0, SNIFF_BYTES) + const binary = detectBinaryByContent(sample, m.filepath) const value = { binary, text: binary ? null : buf.toString('utf8') } blobCache.set(cacheKey, value) ok(m.id, { ...value, notFound: false }) return } + // Partial read to sniff type without pulling whole large files const fileAbs = path.join(repoPath, m.filepath) - const buf = await fs.promises.readFile(fileAbs).catch(() => null as any) - if (!buf) { ok(m.id, { binary: false, text: null, notFound: true }); return } - const binary = looksBinary(buf as Buffer) - ok(m.id, { binary, text: binary ? null : (buf as Buffer).toString('utf8') }) + const fd = await fs.promises.open(fileAbs, 'r').catch(() => null as any) + if (!fd) { ok(m.id, { binary: false, text: null, notFound: true }); return } + try { + const probe = Buffer.allocUnsafe(SNIFF_BYTES) + const { bytesRead } = await fd.read(probe, 0, SNIFF_BYTES, 0) + const sample = probe.subarray(0, bytesRead) + const binary = detectBinaryByContent(sample, m.filepath) + if (binary) { + ok(m.id, { binary: true, text: null, notFound: false }) + } else { + // Now read full text if needed + const full = await fs.promises.readFile(fileAbs, 'utf8') + ok(m.id, { binary: false, text: full, notFound: false }) + } + } finally { + await fd.close().catch(() => {}) + } return } case 'listFiles': { diff --git a/src/web/src/shared/binary.ts b/src/web/src/shared/binary.ts new file mode 100644 index 0000000..f91a55f --- /dev/null +++ b/src/web/src/shared/binary.ts @@ -0,0 +1,125 @@ +// Shared binary detection (web + desktop workers + UI). +// Strategy: +// 1) Extension hints (cheap). +// 2) Magic-byte signatures. +// 3) UTF-8 text heuristic on a small sample. +// 4) SVG special-case (XML-ish text). +// +// Keep fast and dependency-free. + +export const SNIFF_BYTES = 8192 + +// Pragmatic denylist; not authoritative, just an early-out. +const BINARY_EXTS = new Set([ + '.png','.jpg','.jpeg','.gif','.bmp','.webp','.ico', + '.pdf','.zip','.rar','.7z','.tar','.gz','.tgz', + '.mp3','.wav','.flac', + '.mp4','.mov','.avi','.mkv','.webm', + '.exe','.dll','.bin','.dmg','.pkg','.iso', + '.woff','.woff2','.ttf','.otf', + '.so','.dylib','.class','.jar', + '.psd','.ai','.sketch', + '.wasm', + // SVG is special-cased below; keep here so we early-out unless the content proves XML-ish + '.svg', +]) + +export function isBinaryPath(path: string): boolean { + const i = path.lastIndexOf('.') + if (i < 0) return false + const ext = path.slice(i).toLowerCase() + return BINARY_EXTS.has(ext) +} + +function startsWith(bytes: Uint8Array, ascii: string, offset = 0): boolean { + if (offset + ascii.length > bytes.length) return false + for (let i = 0; i < ascii.length; i++) { + if (bytes[offset + i] !== ascii.charCodeAt(i)) return false + } + return true +} + +// Spot common binary formats by signature (magic bytes). +export function hasBinaryMagic(bytes: Uint8Array): boolean { + const b = bytes + const len = b.length + if (len >= 8) { + // PNG + if ( + b[0] === 0x89 && b[1] === 0x50 && b[2] === 0x4E && b[3] === 0x47 && + b[4] === 0x0D && b[5] === 0x0A && b[6] === 0x1A && b[7] === 0x0A + ) return true + } + if (len >= 3) { + // JPEG + if (b[0] === 0xFF && b[1] === 0xD8 && b[2] === 0xFF) return true + } + // GIF + if (startsWith(b, 'GIF87a') || startsWith(b, 'GIF89a')) return true + // PDF + if (startsWith(b, '%PDF-')) return true + // ZIP (also covers many Office docs, apk, jar) + if (len >= 4 && b[0] === 0x50 && b[1] === 0x4B && (b[2] === 0x03 || b[2] === 0x05 || b[2] === 0x07) && (b[3] === 0x04 || b[3] === 0x06 || b[3] === 0x08)) return true + // GZIP + if (len >= 3 && b[0] === 0x1F && b[1] === 0x8B && b[2] === 0x08) return true + // MP3 (ID3) + if (startsWith(b, 'ID3')) return true + // MP4/ISO BMFF + if (len >= 12 && startsWith(b, 'ftyp', 4)) return true + // OGG + if (startsWith(b, 'OggS')) return true + // Matroska/WebM + if (len >= 4 && b[0] === 0x1A && b[1] === 0x45 && b[2] === 0xDF && b[3] === 0xA3) return true + // WOFF/WOFF2 + if (startsWith(b, 'wOFF') || startsWith(b, 'wOF2')) return true + // TTF/OTF + if (len >= 4 && ((b[0] === 0x00 && b[1] === 0x01 && b[2] === 0x00 && b[3] === 0x00) || startsWith(b, 'OTTO'))) return true + // Windows MZ / ELF + if (startsWith(b, 'MZ') || (len >= 4 && b[0] === 0x7F && b[1] === 0x45 && b[2] === 0x4C && b[3] === 0x46)) return true + return false +} + +// SVG often lives in repos as text; detect text-y SVG even if extension is .svg. +function isXmlSvgText(bytes: Uint8Array): boolean { + // Skip leading whitespace / BOM, then expect '<' + let i = 0 + while (i < bytes.length && (bytes[i] === 0xEF || bytes[i] === 0xBB || bytes[i] === 0xBF || bytes[i] <= 0x20)) i++ + if (i >= bytes.length || bytes[i] !== 0x3C /* '<' */) return false + // Look for " 13 && c < 32)) suspicious++ + } + return suspicious / n > 0.30 +} + +export function detectBinaryByContent(sample: Uint8Array, path?: string): boolean { + // path hint + if (path && isBinaryPath(path)) { + // Allow SVG override if it looks like XML + if (path.toLowerCase().endsWith('.svg') && isXmlSvgText(sample)) return false + return true + } + if (hasBinaryMagic(sample)) return true + // SVG override if no magic but looks like XML text + if (isXmlSvgText(sample)) return false + return looksBinaryHeuristic(sample) +} + +// Convenience: decide with or without sample. +export function shouldTreatAsBinary(path: string, sample?: Uint8Array): boolean { + if (!sample) return isBinaryPath(path) + return detectBinaryByContent(sample, path) +} + +export { BINARY_EXTS } diff --git a/src/web/src/utils/binary.ts b/src/web/src/utils/binary.ts index ad42ca2..f77d049 100644 --- a/src/web/src/utils/binary.ts +++ b/src/web/src/utils/binary.ts @@ -1,18 +1,2 @@ -// Centralized binary-file heuristics to keep UI, workers, and counters in sync. -// Note: We treat SVG as binary here for safety/perf (often very large). -const BINARY_EXTS = [ - '.png','.jpg','.jpeg','.gif','.bmp','.webp','.ico', - '.pdf','.zip','.rar','.7z','.tar','.gz','.tgz', - '.mp3','.wav','.flac', - '.mp4','.mov','.avi','.mkv','.webm', - '.exe','.dll','.bin','.dmg','.pkg','.iso', - '.woff','.woff2','.ttf','.otf', - '.svg' -] - -export function isBinaryPath(p: string): boolean { - const lower = p.toLowerCase() - return BINARY_EXTS.some(ext => lower.endsWith(ext)) -} - -export { BINARY_EXTS } +// Re-export shared detector for web UI code paths. +export * from '../shared/binary' diff --git a/src/web/src/workers/gitWorker.ts b/src/web/src/workers/gitWorker.ts index 6698555..4f265cc 100644 --- a/src/web/src/workers/gitWorker.ts +++ b/src/web/src/workers/gitWorker.ts @@ -16,20 +16,7 @@ import LightningFS from '@isomorphic-git/lightning-fs' import * as BufferModule from 'buffer' import ProcessModule from 'process' import * as GIT from 'isomorphic-git' -// Lightweight duplicate; workers can't import app utils directly. Keep in sync with utils/binary.ts. -const BINARY_EXTS = [ - '.png','.jpg','.jpeg','.gif','.bmp','.webp','.ico', - '.pdf','.zip','.rar','.7z','.tar','.gz','.tgz', - '.mp3','.wav','.flac', - '.mp4','.mov','.avi','.mkv','.webm', - '.exe','.dll','.bin','.dmg','.pkg','.iso', - '.woff','.woff2','.ttf','.otf', - '.svg' -] -function isBinaryPathLocal(p: string): boolean { - const lower = p.toLowerCase() - return BINARY_EXTS.some((ext) => lower.endsWith(ext)) -} +import { detectBinaryByContent, SNIFF_BYTES, isBinaryPath } from '../shared/binary' ;(self as any).Buffer = (self as any).Buffer || (BufferModule as any).Buffer ;(self as any).process = (self as any).process || (ProcessModule as any) @@ -437,24 +424,6 @@ async function handleDiff( return { id, type: 'ok', data: { files } } } -// Heuristic binary detection -function looksBinary(buf: Uint8Array): boolean { - // If there are many zero bytes or high ASCII control chars, treat as binary - const len = buf.length - if (len === 0) return false - let suspicious = 0 - const maxCheck = Math.min(len, 8192) - for (let i = 0; i < maxCheck; i++) { - const c = buf[i] - // Allow common whitespace and ASCII printable range - if (c === 0) { - suspicious += 2 - } else if (c < 7 || (c > 13 && c < 32)) { - suspicious++ - } - } - return suspicious / maxCheck > 0.3 -} async function handleReadFile( id: number, @@ -463,26 +432,21 @@ async function handleReadFile( ): Promise { if (!pfs) throw new Error('Repository is not initialized in worker') - // Skip heavy reads for known-binary extensions - if (isBinaryPathLocal(filepath)) { + // Fast extension short-circuit (no content read) + if (isBinaryPath(filepath)) { if (ref === WORKDIR_SENTINEL) { - // Best-effort existence check without loading file contents - try { - await pfs.stat('/' + filepath) - return { id, type: 'ok', data: { binary: true, text: null, notFound: false } } - } catch { - return { id, type: 'ok', data: { binary: false, text: null, notFound: true } } - } + try { await pfs.stat('/' + filepath); return { id, type: 'ok', data: { binary: true, text: null, notFound: false } } } + catch { return { id, type: 'ok', data: { binary: false, text: null, notFound: true } } } } - // For commit refs we assume existence (paths come from list/diff); avoid blob load return { id, type: 'ok', data: { binary: true, text: null, notFound: false } } } - // Read raw to detect binary first + // Read raw and sniff (LightningFS can only read full files; we still only *inspect* a small prefix) if (ref === WORKDIR_SENTINEL) { try { const raw = (await pfs.readFile('/' + filepath)) as Uint8Array - const binary = looksBinary(raw) + const sample = raw.subarray(0, SNIFF_BYTES) + const binary = detectBinaryByContent(sample, filepath) const text = binary ? null : new TextDecoder('utf-8', { fatal: false }).decode(raw) return { id, type: 'ok', data: { binary, text, notFound: false } } } catch { @@ -506,7 +470,8 @@ async function handleReadFile( return { id, type: 'ok', data: { binary: false, text: null, notFound: true } } } - const binary = looksBinary(raw) + const sample = (raw as Uint8Array).subarray(0, SNIFF_BYTES) + const binary = detectBinaryByContent(sample, filepath) let text: string | null = null if (!binary && raw) { // Decode as UTF-8