perf: optimize large file memory handling

claude · claude · commit adb5e2816e9a · 2025-12-15T14:56:09.000Z
- Avoid repeated split() in ValueExtractor by accepting pre-split lines array
- Add LTTB (Largest Triangle Three Buckets) downsampling algorithm for chart rendering
- Optimize LocalStorage: large files (>500KB) store only metricsData, not raw content
- Add parsing status indicator and large file warning in FileList
- Add i18n translations for new UI states

This should significantly reduce memory usage when handling files with 100k+ lines.
diff --git a/public/locales/en/translation.json b/public/locales/en/translation.json
@@ -12,6 +12,9 @@
   "fileList.disabled": "Disabled",
   "fileList.config": "Configure file {{name}}",
   "fileList.delete": "Remove file {{name}}",
+  "fileList.parsing": "Parsing",
+  "fileList.needsReupload": "Large file - re-upload required to re-parse",
+  "fileList.needsReuploadTip": "File data is cached, but re-upload is required to modify parsing config",
   "comparison.title": "Compare Mode",
   "comparison.select": "Select comparison mode",
   "comparison.multiFileMode": "Multi-file comparison mode",
diff --git a/public/locales/zh/translation.json b/public/locales/zh/translation.json
@@ -12,6 +12,9 @@
   "fileList.disabled": "已禁用",
   "fileList.config": "配置文件 {{name}}",
   "fileList.delete": "删除文件 {{name}}",
+  "fileList.parsing": "解析中",
+  "fileList.needsReupload": "大文件 - 需要重新上传才能重新解析",
+  "fileList.needsReuploadTip": "此文件数据已缓存，但需要重新上传才能修改解析配置",
   "comparison.title": "对比模式",
   "comparison.select": "选择数据对比模式",
   "comparison.multiFileMode": "多文件对比模式",
diff --git a/src/App.jsx b/src/App.jsx
@@ -11,6 +11,9 @@ import { Header } from './components/Header';
 import { PanelLeftClose, PanelLeftOpen } from 'lucide-react';
 import { mergeFilesWithReplacement } from './utils/mergeFiles.js';
 
+// Threshold for "large file" - files above this won't have content persisted
+const LARGE_FILE_THRESHOLD = 500 * 1024; // 500KB of content
+
 // Default global parsing configuration
 export const DEFAULT_GLOBAL_PARSING_CONFIG = {
   metrics: [
@@ -35,7 +38,22 @@ function App() {
   const { t } = useTranslation();
   const [uploadedFiles, setUploadedFiles] = useState(() => {
     const stored = localStorage.getItem('uploadedFiles');
-    return stored ? JSON.parse(stored) : [];
+    if (!stored) return [];
+    try {
+      const parsed = JSON.parse(stored);
+      // Restore files with proper defaults for large files that have metricsData
+      return parsed.map(file => ({
+        ...file,
+        enabled: file.enabled ?? true,
+        isParsing: false,
+        // For large files, metricsData is already stored; for small files it will be re-parsed
+        metricsData: file.metricsData || {},
+        // Mark large files that need re-upload for re-parsing
+        needsReupload: file.isLargeFile && !file.content
+      }));
+    } catch {
+      return [];
+    }
   });
 
   // Global parsing configuration state
@@ -118,16 +136,26 @@ function App() {
   useEffect(() => {
     if (savingDisabledRef.current) return;
     try {
-      const serialized = uploadedFiles.map(({ id, name, enabled, content, config }) => ({
-        id,
-        name,
-        enabled,
-        content,
-        config
-      }));
+      // Smart serialization: for large files, only store metricsData (not raw content)
+      // This allows the app to still display charts after refresh, but re-parsing will need re-upload
+      const serialized = uploadedFiles.map(({ id, name, enabled, content, config, metricsData }) => {
+        const isLargeFile = content && content.length > LARGE_FILE_THRESHOLD;
+        return {
+          id,
+          name,
+          enabled,
+          // For large files, don't store content to save memory/storage
+          content: isLargeFile ? null : content,
+          config,
+          // Store metricsData for large files so charts still work after refresh
+          metricsData: isLargeFile ? metricsData : undefined,
+          // Flag to indicate this file needs re-upload for re-parsing
+          isLargeFile
+        };
+      });
       if (serialized.length > 0) {
         const json = JSON.stringify(serialized);
-        // Avoid filling localStorage with very large files
+        // Avoid filling localStorage with very large data
         if (json.length > 5 * 1024 * 1024) {
           savingDisabledRef.current = true;
           console.warn('Uploaded files exceed storage limit; persistence disabled.');
diff --git a/src/components/ChartContainer.jsx b/src/components/ChartContainer.jsx
@@ -16,6 +16,7 @@ import zoomPlugin from 'chartjs-plugin-zoom';
 import { ImageDown, Copy, FileDown } from 'lucide-react';
 import { getMinSteps } from "../utils/getMinSteps.js";
 import { useTranslation } from 'react-i18next';
+import { adaptiveDownsample } from "../utils/downsample.js";
 
 ChartJS.register(
   CategoryScale,
@@ -272,8 +273,11 @@ export default function ChartContainer({
     }
   }, [parsedData, onXRangeChange]);
 
-  const colors = ['#ef4444', '#3b82f6', '#10b981', '#f59e0b', '#8b5cf6', '#f97316'];
-  const createChartData = dataArray => {
+  // Maximum points to render per dataset - prevents browser crashes on large files
+  const MAX_DISPLAY_POINTS = 3000;
+
+  const colors = useMemo(() => ['#ef4444', '#3b82f6', '#10b981', '#f59e0b', '#8b5cf6', '#f97316'], []);
+  const createChartData = useCallback((dataArray) => {
     // Ensure no duplicate datasets
     const uniqueItems = dataArray.reduce((acc, item) => {
       const exists = acc.find(existing => existing.name === item.name);
@@ -286,9 +290,13 @@ export default function ChartContainer({
     return {
       datasets: uniqueItems.map((item, index) => {
         const color = colors[index % colors.length];
+        // Apply LTTB downsampling for display - preserves trends while reducing memory
+        const displayData = adaptiveDownsample(item.data, MAX_DISPLAY_POINTS);
         return {
           label: item.name?.replace(/\.(log|txt)$/i, '') || `File ${index + 1}`,
-          data: item.data,
+          data: displayData,
+          // Store original data length for reference
+          _originalLength: item.data.length,
           borderColor: color,
           backgroundColor: `${color}33`,
           borderWidth: 2,
@@ -307,7 +315,7 @@ export default function ChartContainer({
         };
       })
     };
-  };
+  }, [colors]);
 
   const getComparisonData = (data1, data2, mode) => {
     const map2 = new Map(data2.map(p => [p.x, p.y]));
diff --git a/src/components/FileList.jsx b/src/components/FileList.jsx
@@ -1,5 +1,5 @@
 import React from 'react';
-import { FileText, X, Settings } from 'lucide-react';
+import { FileText, X, Settings, Loader2, AlertCircle } from 'lucide-react';
 import { useTranslation } from 'react-i18next';
 
   export function FileList({ files, onFileRemove, onFileToggle, onFileConfig }) {
@@ -45,17 +45,34 @@ import { useTranslation } from 'react-i18next';
                     className="checkbox"
                     aria-describedby={`file-status-${file.id}`}
                   />
-                  <FileText
-                    size={14}
-                    className={`${file.enabled !== false ? 'text-blue-600' : 'text-gray-400 dark:text-gray-500'}`}
-                    aria-hidden="true"
-                  />
+                  {file.isParsing ? (
+                    <Loader2
+                      size={14}
+                      className="text-blue-600 animate-spin"
+                      aria-hidden="true"
+                    />
+                  ) : file.needsReupload ? (
+                    <AlertCircle
+                      size={14}
+                      className="text-amber-500"
+                      aria-hidden="true"
+                      title={t('fileList.needsReupload')}
+                    />
+                  ) : (
+                    <FileText
+                      size={14}
+                      className={`${file.enabled !== false ? 'text-blue-600' : 'text-gray-400 dark:text-gray-500'}`}
+                      aria-hidden="true"
+                    />
+                  )}
                   <span
                     className={`text-xs font-medium truncate ${
                       file.enabled !== false ? 'text-gray-700 dark:text-gray-200' : 'text-gray-400 dark:text-gray-500'
                     }`}
+                    title={file.needsReupload ? t('fileList.needsReuploadTip') : file.name}
                   >
                     {file.name}
+                    {file.isParsing && <span className="text-blue-500 ml-1">({t('fileList.parsing')})</span>}
                   </span>
                     <span
                       id={`file-status-${file.id}`}
diff --git a/src/utils/ValueExtractor.js b/src/utils/ValueExtractor.js
@@ -7,20 +7,26 @@ export const MATCH_MODES = {
 
 // Value extractor class
 export class ValueExtractor {
-  // Keyword match
-  static extractByKeyword(content, keyword) {
+  // Helper to get lines array - accepts either content string or pre-split lines array
+  static getLines(contentOrLines) {
+    if (!contentOrLines) return [];
+    if (Array.isArray(contentOrLines)) return contentOrLines;
+    return contentOrLines.split('\n');
+  }
+
+  // Keyword match - now accepts either content string or pre-split lines array
+  static extractByKeyword(contentOrLines, keyword) {
     const results = [];
-    // Handle empty content
-    if (!content) return results;
-    
-    const lines = content.split('\n');
-    
+    const lines = this.getLines(contentOrLines);
+    if (lines.length === 0) return results;
+
     // Number regex supporting scientific notation
     const numberRegex = /[+-]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?/;
-    
+    const keywordLower = keyword.toLowerCase();
+
     lines.forEach((line, lineIndex) => {
       // Find keyword (case-insensitive)
-      const keywordIndex = line.toLowerCase().indexOf(keyword.toLowerCase());
+      const keywordIndex = line.toLowerCase().indexOf(keywordLower);
       if (keywordIndex !== -1) {
         // Find first number after the keyword
         const afterKeyword = line.substring(keywordIndex + keyword.length);
@@ -43,13 +49,12 @@ export class ValueExtractor {
     return results;
   }
 
-  // Column position match
-  static extractByColumn(content, columnIndex, separator = ' ') {
+  // Column position match - now accepts either content string or pre-split lines array
+  static extractByColumn(contentOrLines, columnIndex, separator = ' ') {
     const results = [];
-    if (!content) return results;
+    const lines = this.getLines(contentOrLines);
+    if (lines.length === 0) return results;
 
-    const lines = content.split('\n');
-    
     lines.forEach((line, lineIndex) => {
       if (line.trim()) {
         const columns = separator === ' ' 
@@ -72,13 +77,12 @@ export class ValueExtractor {
     return results;
   }
 
-  // Smart parsing
-  static extractBySmart(content, type = 'loss') {
+  // Smart parsing - now accepts either content string or pre-split lines array
+  static extractBySmart(contentOrLines, type = 'loss') {
     const results = [];
-    if (!content) return results;
+    const lines = this.getLines(contentOrLines);
+    if (lines.length === 0) return results;
 
-    const lines = content.split('\n');
-    
     // Smart keyword list
     const keywords = type === 'loss'
       ? ['loss', 'training_loss', 'train_loss', 'val_loss', 'validation_loss']
@@ -143,13 +147,12 @@ export class ValueExtractor {
     return results;
   }
 
-  // Regex match (original functionality)
-  static extractByRegex(content, regex) {
+  // Regex match (original functionality) - now accepts either content string or pre-split lines array
+  static extractByRegex(contentOrLines, regex) {
     const results = [];
-    if (!content) return results;
+    const lines = this.getLines(contentOrLines);
+    if (lines.length === 0) return results;
 
-    const lines = content.split('\n');
-    
     try {
       const regexObj = new RegExp(regex, 'gi');
       lines.forEach((line, lineIndex) => {
diff --git a/src/utils/downsample.js b/src/utils/downsample.js
diff --git a/src/workers/__tests__/logParser.worker.test.js b/src/workers/__tests__/logParser.worker.test.js
diff --git a/src/workers/logParser.worker.js b/src/workers/logParser.worker.js