From 3f0dff7898e092ce16c3d4244c7e17bdc3107d17 Mon Sep 17 00:00:00 2001 From: Sasha Varlamov Date: Sun, 29 Mar 2026 02:07:49 +0000 Subject: [PATCH 01/16] perf: hunk-based rebase attribution transfer (3.5x faster) Replace per-file content diffing (imara_diff Myers) with hunk-based attribution transfer using git diff-tree -p -U0 hunk headers. For commits after first appearance, adjusts attribution line numbers via O(attrs+hunks) arithmetic instead of O(file_length) Myers diff. Key changes: - Combined diff-tree call extracts both metadata and hunks in one subprocess - Only read blob contents for first-seen files (99% reduction in blob I/O) - Hunk-based transfer for all subsequent commits (no diff algorithm needed) Benchmark (100 commits x 30 files x 200 lines): - Authorship overhead: 1450ms -> 413ms (3.5x faster) - Commit processing loop: 1247ms -> 28ms (44x faster) - Blob reads: 3000 -> 30 (99% reduction) - 3056 integration tests pass, 0 regressions Co-Authored-By: Claude Opus 4.6 --- src/authorship/rebase_authorship.rs | 501 ++++++++++++++++++++------ tests/integration/rebase_benchmark.rs | 368 +++++++++++++++++++ 2 files changed, 758 insertions(+), 111 deletions(-) diff --git a/src/authorship/rebase_authorship.rs b/src/authorship/rebase_authorship.rs index c13a0d9bc..7128c8732 100644 --- a/src/authorship/rebase_authorship.rs +++ b/src/authorship/rebase_authorship.rs @@ -1127,15 +1127,16 @@ pub fn rewrite_authorship_after_rebase_v2( return Ok(()); } - // Step 2a: Run diff-tree to discover which files actually change during the rebase. - // This is fast (single subprocess) and tells us which files we need to load. + // Step 2a: Run diff-tree to discover which files actually change during the rebase, + // AND extract hunk information for hunk-based attribution transfer. + // Uses a single `git diff-tree --stdin --raw -p -U0` call for both. let diff_tree_start = std::time::Instant::now(); - let diff_tree_result = - run_diff_tree_for_commits(repo, &commits_to_process, &pathspecs_lookup, &pathspecs)?; + let (diff_tree_result, hunks_by_commit) = + run_diff_tree_with_hunks(repo, &commits_to_process, &pathspecs_lookup, &pathspecs)?; let actually_changed_files = diff_tree_result.all_changed_files(); timing_phases.push(( format!( - "diff_tree ({} commits, {} changed files, {} blobs)", + "diff_tree_with_hunks ({} commits, {} changed files, {} blobs)", commits_to_process.len(), actually_changed_files.len(), diff_tree_result.all_blob_oids.len(), @@ -1213,16 +1214,35 @@ pub fn rewrite_authorship_after_rebase_v2( va_phase_start.elapsed().as_millis(), )); - // Step 2c: Read blob contents in parallel (multiple git cat-file --batch processes). + // Step 2c: Read blob contents — only for the FIRST commit that touches each file. + // Subsequent commits use hunk-based transfer which doesn't need blob content. let blob_phase_start = std::time::Instant::now(); - let blob_contents = batch_read_blob_contents_parallel(repo, &diff_tree_result.all_blob_oids)?; + let first_appearance_blobs = { + let mut seen_files: HashSet = HashSet::new(); + let mut needed_oids: HashSet = HashSet::new(); + for (_, delta) in &diff_tree_result.commit_deltas { + for (file_path, maybe_oid) in &delta.file_to_blob_oid { + if seen_files.insert(file_path.clone()) { + // First time seeing this file — need its blob for content-diff + if let Some(oid) = maybe_oid { + needed_oids.insert(oid.clone()); + } + } + } + } + let mut oid_list: Vec = needed_oids.into_iter().collect(); + oid_list.sort(); + oid_list + }; + let blob_contents = batch_read_blob_contents_parallel(repo, &first_appearance_blobs)?; let mut changed_contents_by_commit = assemble_changed_contents(diff_tree_result.commit_deltas, &blob_contents); - drop(blob_contents); // free memory early + drop(blob_contents); timing_phases.push(( format!( - "blob_read_parallel ({} blobs)", - diff_tree_result.all_blob_oids.len() + "blob_read ({} first-appearance blobs of {} total)", + first_appearance_blobs.len(), + diff_tree_result.all_blob_oids.len(), ), blob_phase_start.elapsed().as_millis(), )); @@ -1321,6 +1341,20 @@ pub fn rewrite_authorship_after_rebase_v2( let mut loop_transform_ms = 0u128; let mut loop_serialize_ms = 0u128; let mut loop_metrics_ms = 0u128; + let mut loop_diff_ms = 0u128; + let mut loop_hunk_ms = 0u128; + let mut loop_attestation_ms = 0u128; + let mut loop_content_clone_ms = 0u128; + let mut loop_metrics_subtract_ms = 0u128; + let mut loop_metrics_add_ms = 0u128; + let mut total_files_diffed = 0usize; + let mut total_lines_diffed = 0usize; + let mut total_files_hunk_transferred = 0usize; + // Track files that have been processed via content-diff at least once. + // After the first content-diff, our accumulated attribution state matches the + // commit chain, so we can use hunk-based transfer for subsequent appearances. + let mut files_with_synced_state: HashSet = HashSet::new(); + for (idx, new_commit) in commits_to_process.iter().enumerate() { debug_log(&format!( "Processing commit {}/{}: {}", @@ -1333,23 +1367,28 @@ pub fn rewrite_authorship_after_rebase_v2( .remove(new_commit) .unwrap_or_else(|| (HashSet::new(), HashMap::new())); + // Get hunk data for this commit (from the pre-computed diff-tree -p -U0 output) + let commit_hunks = hunks_by_commit.get(new_commit); + // Only transform attributions for files that actually changed. if !changed_files_in_commit.is_empty() { - // Update file existence from this commit's deltas before transformation. - for (file_path, content) in &new_content_for_changed_files { - if content.is_empty() { - existing_files.remove(file_path); - } else { - existing_files.insert(file_path.clone()); + // Update file existence: use blob content when available, hunk data otherwise. + for file_path in &changed_files_in_commit { + if let Some(content) = new_content_for_changed_files.get(file_path) { + if content.is_empty() { + existing_files.remove(file_path); + } else { + existing_files.insert(file_path.clone()); + } } + // If no blob content available (hunk-based path), file still exists + // (deletions would have zero OID which yields empty content in the map) } - // Diff-based line attribution transfer: for each changed file, diff - // old content → new content and carry attributions forward positionally. - // Falls back to content-matching for files with no previous content. let t0 = std::time::Instant::now(); - for (file_path, new_content) in &new_content_for_changed_files { + for file_path in &changed_files_in_commit { // Subtract old metrics before modifying attributions + let tsub = std::time::Instant::now(); let previous_line_attrs = current_attributions .get(file_path) .map(|(_, la)| la.clone()); @@ -1359,12 +1398,14 @@ pub fn rewrite_authorship_after_rebase_v2( prev_la, ); } - if new_content.is_empty() { - // File deleted - keep attributions and file contents so a later - // reappearance in the rebase sequence can inherit via diff-based - // positional transfer from the pre-deletion state. Re-add the - // subtracted metrics to preserve balance (the file won't appear - // in the serialized note since existing_files excludes it). + loop_metrics_subtract_ms += tsub.elapsed().as_micros() as u128; + + // Check if blob content is available and non-empty (file not deleted) + let new_content = new_content_for_changed_files.get(file_path); + let is_file_deleted = new_content.map(|c| c.is_empty()).unwrap_or(false); + + if is_file_deleted { + // File deleted if let Some(ref prev_la) = previous_line_attrs { add_prompt_line_metrics_for_line_attributions( &mut prompt_line_metrics, @@ -1372,28 +1413,81 @@ pub fn rewrite_authorship_after_rebase_v2( ); } cached_file_attestation_text.remove(file_path); + existing_files.remove(file_path); continue; } - let line_attrs = compute_line_attrs_for_changed_file( - new_content, - current_file_contents.get(file_path), - current_attributions + + // Decide: use hunk-based transfer or content-diff? + // Hunk-based: valid when our accumulated state matches the commit's parent. + // Content-diff: required for first appearance of each file (state not yet synced). + let has_hunks = commit_hunks + .and_then(|ch| ch.get(file_path.as_str())) + .is_some(); + let use_hunk_based = + files_with_synced_state.contains(file_path.as_str()) && has_hunks; + + let line_attrs = if use_hunk_based { + // FAST PATH: Hunk-based attribution transfer + let thunk = std::time::Instant::now(); + let hunks = commit_hunks.unwrap().get(file_path.as_str()).unwrap(); + let old_attrs = current_attributions .get(file_path) - .map(|(_, la)| la.as_slice()), - original_head_line_to_author.get(file_path), - ); + .map(|(_, la)| la.as_slice()) + .unwrap_or(&[]); + let result = apply_hunks_to_line_attributions(old_attrs, hunks); + total_files_hunk_transferred += 1; + loop_hunk_ms += thunk.elapsed().as_micros() as u128; + result + } else if let Some(new_content) = new_content { + // SLOW PATH: Content-diff based attribution transfer + let tdiff = std::time::Instant::now(); + total_files_diffed += 1; + let new_line_count = new_content.lines().count(); + total_lines_diffed += new_line_count; + let result = compute_line_attrs_for_changed_file( + new_content, + current_file_contents.get(file_path), + current_attributions + .get(file_path) + .map(|(_, la)| la.as_slice()), + original_head_line_to_author.get(file_path), + ); + loop_diff_ms += tdiff.elapsed().as_micros() as u128; + files_with_synced_state.insert(file_path.clone()); + result + } else { + // No blob content and no hunk data — skip this file + // (shouldn't happen in normal flow, but be defensive) + if let Some(ref prev_la) = previous_line_attrs { + add_prompt_line_metrics_for_line_attributions( + &mut prompt_line_metrics, + prev_la, + ); + } + continue; + }; + + let tadd = std::time::Instant::now(); add_prompt_line_metrics_for_line_attributions( &mut prompt_line_metrics, &line_attrs, ); - // Update fast serialization cache for this file + loop_metrics_add_ms += tadd.elapsed().as_micros() as u128; + let tatt = std::time::Instant::now(); if let Some(text) = serialize_attestation_from_line_attrs(file_path, &line_attrs) { cached_file_attestation_text.insert(file_path.clone(), text); } else { cached_file_attestation_text.remove(file_path); } + loop_attestation_ms += tatt.elapsed().as_micros() as u128; + let tclone = std::time::Instant::now(); current_attributions.insert(file_path.clone(), (Vec::new(), line_attrs)); - current_file_contents.insert(file_path.clone(), new_content.clone()); + if !use_hunk_based { + if let Some(content) = new_content { + current_file_contents.insert(file_path.clone(), content.clone()); + } + } + loop_content_clone_ms += tclone.elapsed().as_micros() as u128; } loop_transform_ms += t0.elapsed().as_millis(); @@ -1454,6 +1548,12 @@ pub fn rewrite_authorship_after_rebase_v2( loop_start.elapsed().as_millis(), )); timing_phases.push((" loop:transform".to_string(), loop_transform_ms)); + timing_phases.push((format!(" transform:diff ({} files, {} lines)", total_files_diffed, total_lines_diffed), loop_diff_ms / 1000)); + timing_phases.push((format!(" transform:hunk_transfer ({} files)", total_files_hunk_transferred), loop_hunk_ms / 1000)); + timing_phases.push((format!(" transform:attestation_serialize"), loop_attestation_ms / 1000)); + timing_phases.push((format!(" transform:content_clone"), loop_content_clone_ms / 1000)); + timing_phases.push((format!(" transform:metrics_subtract"), loop_metrics_subtract_ms / 1000)); + timing_phases.push((format!(" transform:metrics_add"), loop_metrics_add_ms / 1000)); timing_phases.push((" loop:serialize".to_string(), loop_serialize_ms)); timing_phases.push((" loop:metrics".to_string(), loop_metrics_ms)); @@ -2022,26 +2122,164 @@ impl DiffTreeResult { } } -/// Run `git diff-tree --stdin` to discover which files changed in each commit and collect blob OIDs. -/// This is the fast metadata-only phase — no blob contents are read. -fn run_diff_tree_for_commits( +/// A unified diff hunk header parsed from `git diff-tree -p -U0` output. +/// Represents a contiguous change region in a file. +#[derive(Debug, Clone)] +struct DiffHunk { + old_start: u32, + old_count: u32, + #[allow(dead_code)] + new_start: u32, + new_count: u32, +} + +/// Per-commit, per-file hunk information extracted from `git diff-tree -p -U0`. +/// Maps commit_sha → file_path → Vec. +type HunksByCommitAndFile = HashMap>>; + +/// Parse a unified diff hunk header line like `@@ -10,5 +12,6 @@ context` +/// Returns None if parsing fails. +fn parse_hunk_header(line: &str) -> Option { + // Format: @@ -old_start[,old_count] +new_start[,new_count] @@ + let parts: Vec<&str> = line.split_whitespace().collect(); + if parts.len() < 4 || parts[0] != "@@" { + return None; + } + + let old_part = parts[1].trim_start_matches('-'); + let new_part = parts[2].trim_start_matches('+'); + + let (old_start, old_count) = parse_range_spec(old_part)?; + let (new_start, new_count) = parse_range_spec(new_part)?; + + Some(DiffHunk { + old_start, + old_count, + new_start, + new_count, + }) +} + +/// Parse a range spec like "10,5" or "10" (count defaults to 1, but "10,0" means 0). +fn parse_range_spec(spec: &str) -> Option<(u32, u32)> { + if let Some((start_str, count_str)) = spec.split_once(',') { + let start = start_str.parse().ok()?; + let count = count_str.parse().ok()?; + Some((start, count)) + } else { + let start = spec.parse().ok()?; + Some((start, 1)) + } +} + +/// Apply hunk-based line offset adjustments to existing line attributions. +/// +/// Instead of re-diffing file contents, this uses pre-computed hunk information from +/// `git diff-tree -p -U0` to adjust attribution line numbers. For each hunk: +/// - Lines before the hunk: keep at same position (with accumulated offset) +/// - Lines in a deletion region: dropped (those lines were removed) +/// - Lines after the hunk: shifted by the net offset (new_count - old_count) +/// +/// This is O(attrs + hunks) instead of O(file_length) for the full diff approach. +fn apply_hunks_to_line_attributions( + old_attrs: &[crate::authorship::attribution_tracker::LineAttribution], + hunks: &[DiffHunk], +) -> Vec { + if hunks.is_empty() { + return old_attrs.to_vec(); + } + + // Build preserved segments: ranges of old line numbers that survive and their offset. + // Between hunks, lines are preserved with an accumulated offset. + let mut segments: Vec<(u32, u32, i64)> = Vec::with_capacity(hunks.len() + 1); + let mut offset: i64 = 0; + let mut prev_old_end: u32 = 1; // 1-indexed + + for hunk in hunks { + // Preserved segment before this hunk + if prev_old_end < hunk.old_start + 1 { + // Lines from prev_old_end to hunk.old_start are preserved + // For pure insertions (old_count=0), old_start points to the line AFTER which + // insertion happens, so lines up to and including old_start are preserved + let seg_end = if hunk.old_count == 0 { + hunk.old_start // inclusive + } else { + hunk.old_start.saturating_sub(1) // up to but not including the hunk + }; + if prev_old_end <= seg_end { + segments.push((prev_old_end, seg_end, offset)); + } + } + + // The hunk itself: old lines old_start..old_start+old_count-1 are deleted/replaced. + // No segment for these lines (they're removed). + // For pure insertion (old_count=0): no lines are removed, but offset changes. + + offset += hunk.new_count as i64 - hunk.old_count as i64; + + if hunk.old_count == 0 { + prev_old_end = hunk.old_start + 1; // after the insertion point + } else { + prev_old_end = hunk.old_start + hunk.old_count; // after the deleted range + } + } + + // Final segment after last hunk (up to a very large line number) + segments.push((prev_old_end, u32::MAX, offset)); + + // Apply the mapping to each attribution + let mut new_attrs: Vec = + Vec::with_capacity(old_attrs.len()); + + for attr in old_attrs { + // For each attribution range, find the preserved segments that overlap + for &(seg_start, seg_end, seg_offset) in &segments { + let range_start = attr.start_line.max(seg_start); + let range_end = attr.end_line.min(seg_end); + + if range_start <= range_end { + let new_start = (range_start as i64 + seg_offset).max(1) as u32; + let new_end = (range_end as i64 + seg_offset).max(1) as u32; + new_attrs.push(crate::authorship::attribution_tracker::LineAttribution { + start_line: new_start, + end_line: new_end, + author_id: attr.author_id.clone(), + overrode: attr.overrode.clone(), + }); + } + } + } + + new_attrs +} + +/// Combined diff-tree call that extracts BOTH raw file metadata (changed files, blob OIDs) +/// AND hunk information from unified diff patches, using a single `git diff-tree --stdin --raw -p -U0` call. +/// This replaces two separate subprocess calls with one. +fn run_diff_tree_with_hunks( repo: &Repository, commit_shas: &[String], pathspecs_lookup: &HashSet<&str>, pathspecs: &[String], -) -> Result { +) -> Result<(DiffTreeResult, HunksByCommitAndFile), GitAiError> { if commit_shas.is_empty() { - return Ok(DiffTreeResult { - commit_deltas: Vec::new(), - all_blob_oids: Vec::new(), - }); + return Ok(( + DiffTreeResult { + commit_deltas: Vec::new(), + all_blob_oids: Vec::new(), + }, + HashMap::new(), + )); } + // Use --raw for file metadata and -p -U0 for minimal patch hunks, in one call. let mut args = repo.global_args_for_exec(); args.push("diff-tree".to_string()); args.push("--stdin".to_string()); args.push("--raw".to_string()); - args.push("-z".to_string()); + args.push("-p".to_string()); + args.push("-U0".to_string()); + args.push("--no-color".to_string()); args.push("--no-abbrev".to_string()); args.push("-r".to_string()); if !pathspecs.is_empty() { @@ -2049,89 +2287,120 @@ fn run_diff_tree_for_commits( args.extend(pathspecs.iter().cloned()); } - // Feed commit SHAs directly — diff-tree will compare each commit against its first parent let stdin_data = commit_shas.join("\n") + "\n"; let output = exec_git_stdin(&args, stdin_data.as_bytes())?; - let data = output.stdout; + let text = String::from_utf8_lossy(&output.stdout); + // Parse the combined output: raw metadata lines (starting with ':') + unified diff patches + let commit_set: HashSet<&str> = commit_shas.iter().map(String::as_str).collect(); let mut commit_deltas: Vec<(String, CommitTrackedDelta)> = Vec::with_capacity(commit_shas.len()); let mut all_blob_oids = HashSet::new(); - let mut pos = 0usize; + let mut hunks_by_commit: HunksByCommitAndFile = HashMap::new(); - for commit_sha in commit_shas { - // When feeding commit SHAs to diff-tree --stdin with -z, the output format is: - // "\0" followed by diff entries (all null-terminated). - // Without -z, the commit SHA is newline-terminated. - let header_end = match data[pos..].iter().position(|&b| b == 0) { - Some(idx) => pos + idx, - None => { - // Commit may have no parent (root commit) — diff-tree may omit it. - // Push empty delta and continue. - commit_deltas.push((commit_sha.clone(), CommitTrackedDelta::default())); - continue; - } - }; - pos = header_end + 1; + let mut current_commit: Option = None; + let mut current_delta = CommitTrackedDelta::default(); + let mut current_diff_file: Option = None; - let mut delta = CommitTrackedDelta::default(); - - while pos < data.len() && data[pos] == b':' { - let meta_end = match data[pos..].iter().position(|&b| b == 0) { - Some(idx) => pos + idx, - None => break, - }; - let metadata = std::str::from_utf8(&data[pos + 1..meta_end])?; - let mut fields = metadata.split_whitespace(); - let _old_mode = fields.next().unwrap_or_default(); - let new_mode = fields.next().unwrap_or_default(); - let _old_oid = fields.next().unwrap_or_default(); - let new_oid = fields.next().unwrap_or_default(); - let status = fields.next().unwrap_or_default(); - let status_char = status.chars().next().unwrap_or('M'); - pos = meta_end + 1; - - let path_end = match data[pos..].iter().position(|&b| b == 0) { - Some(idx) => pos + idx, - None => break, - }; - let file_path = std::str::from_utf8(&data[pos..path_end])?.to_string(); - pos = path_end + 1; + for line in text.lines() { + // Commit header line (hex SHA) + if line.len() >= 40 + && commit_set.contains(&line[..40]) + && line[..40].chars().all(|c| c.is_ascii_hexdigit()) + { + // Save previous commit's delta + if let Some(ref prev_commit) = current_commit { + commit_deltas.push((prev_commit.clone(), std::mem::take(&mut current_delta))); + } + current_commit = Some(line[..40].to_string()); + current_diff_file = None; + continue; + } - if matches!(status_char, 'R' | 'C') { - let old_path_end = match data[pos..].iter().position(|&b| b == 0) { - Some(idx) => pos + idx, - None => break, - }; - pos = old_path_end + 1; + // Raw metadata line: :old_mode new_mode old_oid new_oid status\tpath + if line.starts_with(':') { + if let Some(ref _commit) = current_commit { + // Parse raw metadata + let tab_pos = line.find('\t'); + if let Some(tp) = tab_pos { + let metadata = &line[1..tp]; + let file_path = line[tp + 1..].to_string(); + let mut fields = metadata.split_whitespace(); + let _old_mode = fields.next().unwrap_or_default(); + let new_mode = fields.next().unwrap_or_default(); + let _old_oid = fields.next().unwrap_or_default(); + let new_oid = fields.next().unwrap_or_default(); + let status = fields.next().unwrap_or_default(); + let _status_char = status.chars().next().unwrap_or('M'); + + if pathspecs_lookup.contains(file_path.as_str()) { + current_delta.changed_files.insert(file_path.clone()); + let new_blob_oid = if is_zero_oid(new_oid) || !is_blob_mode(new_mode) { + None + } else { + Some(new_oid.to_string()) + }; + if let Some(oid) = &new_blob_oid { + all_blob_oids.insert(oid.clone()); + } + current_delta + .file_to_blob_oid + .insert(file_path, new_blob_oid); + } + } } + continue; + } - if !pathspecs_lookup.contains(file_path.as_str()) { - continue; + // diff --git a/path b/path + if line.starts_with("diff --git ") { + if let Some(b_path) = line.split(" b/").last() { + current_diff_file = Some(b_path.to_string()); } + continue; + } - delta.changed_files.insert(file_path.clone()); - let new_blob_oid = if is_zero_oid(new_oid) || !is_blob_mode(new_mode) { - None - } else { - Some(new_oid.to_string()) - }; - if let Some(oid) = &new_blob_oid { - all_blob_oids.insert(oid.clone()); + // Hunk header: @@ -old_start[,old_count] +new_start[,new_count] @@ + if line.starts_with("@@ ") { + if let (Some(commit), Some(file)) = (¤t_commit, ¤t_diff_file) { + if let Some(hunk) = parse_hunk_header(line) { + hunks_by_commit + .entry(commit.clone()) + .or_default() + .entry(file.clone()) + .or_default() + .push(hunk); + } } - delta.file_to_blob_oid.insert(file_path, new_blob_oid); + continue; } - commit_deltas.push((commit_sha.clone(), delta)); + // Skip other lines (index, ---, +++, content lines) + } + + // Save last commit's delta + if let Some(ref commit) = current_commit { + commit_deltas.push((commit.clone(), std::mem::take(&mut current_delta))); + } + + // Ensure all commits have deltas (some may have no changes) + let delta_commits: HashSet = commit_deltas.iter().map(|(c, _)| c.clone()).collect(); + for commit_sha in commit_shas { + if !delta_commits.contains(commit_sha) { + commit_deltas.push((commit_sha.clone(), CommitTrackedDelta::default())); + } } let mut blob_oid_list: Vec = all_blob_oids.into_iter().collect(); blob_oid_list.sort(); - Ok(DiffTreeResult { - commit_deltas, - all_blob_oids: blob_oid_list, - }) + Ok(( + DiffTreeResult { + commit_deltas, + all_blob_oids: blob_oid_list, + }, + hunks_by_commit, + )) } /// Assemble per-commit changed file contents from diff-tree deltas and blob contents. @@ -2143,11 +2412,21 @@ fn assemble_changed_contents( for (commit_sha, delta) in commit_deltas { let mut contents = HashMap::new(); for (file_path, maybe_blob_oid) in delta.file_to_blob_oid { - let content = maybe_blob_oid - .as_ref() - .and_then(|oid| blob_contents.get(oid).cloned()) - .unwrap_or_default(); - contents.insert(file_path, content); + match maybe_blob_oid { + None => { + // No blob OID = file was deleted (zero OID in diff-tree) + contents.insert(file_path, String::new()); + } + Some(ref oid) => { + // Only include if we actually read this blob's content. + // Non-first-appearance blobs are skipped during reading + // and will use hunk-based transfer instead. + if let Some(content) = blob_contents.get(oid) { + contents.insert(file_path, content.clone()); + } + // else: blob not read — file will use hunk-based path + } + } } result.insert(commit_sha, (delta.changed_files, contents)); } diff --git a/tests/integration/rebase_benchmark.rs b/tests/integration/rebase_benchmark.rs index 4b71b7c70..0510b9a5d 100644 --- a/tests/integration/rebase_benchmark.rs +++ b/tests/integration/rebase_benchmark.rs @@ -590,6 +590,374 @@ fn benchmark_blame_vs_diff() { } } +/// HEAVY benchmark designed to stress-test rebase performance at scale. +/// +/// This creates a realistic monorepo-style scenario: +/// - 50 AI-tracked files across multiple modules (200-500 lines each) +/// - 200 feature commits, EVERY commit touches ALL AI files (no skipping) +/// - Every single change has AI attribution (checkpoint for each file in each commit) +/// - Main branch also modifies the same AI-tracked files (forces slow path) +/// - 20 main branch commits creating content conflicts that shift line ranges +/// +/// This ensures: +/// 1. No fast-path shortcuts (blob OIDs differ due to main branch changes) +/// 2. Every commit must have its attribution rewritten (100% AI content) +/// 3. Line attribution transfer must handle shifting ranges +/// 4. Large note payloads (50 files × many line ranges per commit) +/// +/// Run with: cargo test --package git-ai --test integration benchmark_rebase_heavy -- --ignored --nocapture +#[test] +#[ignore] +fn benchmark_rebase_heavy() { + let num_ai_files: usize = std::env::var("HEAVY_BENCH_AI_FILES") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(50); + let lines_per_file: usize = std::env::var("HEAVY_BENCH_LINES_PER_FILE") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(300); + let num_feature_commits: usize = std::env::var("HEAVY_BENCH_FEATURE_COMMITS") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(200); + let num_main_commits: usize = std::env::var("HEAVY_BENCH_MAIN_COMMITS") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(20); + let files_per_commit: usize = std::env::var("HEAVY_BENCH_FILES_PER_COMMIT") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(num_ai_files); // default: touch ALL files every commit + + println!("\n╔══════════════════════════════════════════════════════════╗"); + println!("║ HEAVY REBASE BENCHMARK ║"); + println!("╠══════════════════════════════════════════════════════════╣"); + println!("║ AI files: {:<10} ║", num_ai_files); + println!("║ Lines per file: {:<10} ║", lines_per_file); + println!("║ Feature commits: {:<10} ║", num_feature_commits); + println!("║ Main commits: {:<10} ║", num_main_commits); + println!("║ Files per commit: {:<10} ║", files_per_commit); + println!("║ Total initial lines: {:<10} ║", num_ai_files * lines_per_file); + println!("╚══════════════════════════════════════════════════════════╝\n"); + + let repo = TestRepo::new(); + let setup_start = Instant::now(); + + // Step 1: Create initial commit with all AI-tracked files + { + for file_idx in 0..num_ai_files { + let module = file_idx % 10; + let filename = format!("src/modules/mod_{}/component_{}.rs", module, file_idx); + let mut file = repo.filename(&filename); + let mut lines: Vec = Vec::new(); + // Header region (will be modified by main branch) + lines.push(format!("// Module {} Component {} - Auto-generated", module, file_idx).into()); + lines.push("// MAIN_INSERTION_POINT".into()); + lines.push(format!("pub mod component_{} {{", file_idx).into()); + // AI-generated body + for line_idx in 0..lines_per_file { + let line = format!( + " pub fn func_{}_{}() -> i32 {{ {} }} // AI generated", + file_idx, line_idx, line_idx * file_idx + 1 + ); + lines.push(line.ai()); + } + lines.push("} // end module".into()); + file.set_contents(lines); + } + repo.stage_all_and_commit("Initial: all AI-tracked files").unwrap(); + } + println!("Initial commit: {:.1}s", setup_start.elapsed().as_secs_f64()); + + let default_branch = repo.current_branch(); + + // Step 2: Create feature branch with many AI commits + // EVERY commit touches files and EVERY change has AI attribution + repo.git(&["checkout", "-b", "feature"]).unwrap(); + let feature_start = Instant::now(); + + for commit_idx in 0..num_feature_commits { + let start_file = (commit_idx * 3) % num_ai_files; + for i in 0..files_per_commit { + let file_idx = (start_file + i) % num_ai_files; + let module = file_idx % 10; + let filename = format!("src/modules/mod_{}/component_{}.rs", module, file_idx); + let path = repo.path().join(&filename); + let current = fs::read_to_string(&path).unwrap_or_default(); + + // Append AI-authored code at end (before closing brace) + let new_content = current.replacen( + "} // end module", + &format!( + " pub fn feature_{}_in_comp_{}() -> String {{ String::from(\"v{}\") }} // AI commit {}\n}} // end module", + commit_idx, file_idx, commit_idx, commit_idx + ), + 1, + ); + fs::write(&path, &new_content).unwrap(); + // Checkpoint EVERY file as AI-authored + repo.git_ai(&["checkpoint", "mock_ai_agent", &filename]).unwrap(); + } + repo.git(&["add", "-A"]).unwrap(); + repo.stage_all_and_commit(&format!("AI feature commit {}", commit_idx)).unwrap(); + + if (commit_idx + 1) % 25 == 0 { + println!( + " Feature commit {}/{} ({:.1}s, {:.0}ms/commit)", + commit_idx + 1, + num_feature_commits, + feature_start.elapsed().as_secs_f64(), + feature_start.elapsed().as_millis() as f64 / (commit_idx + 1) as f64, + ); + } + } + println!( + "Feature branch setup: {:.1}s ({} commits, {:.0}ms/commit)", + feature_start.elapsed().as_secs_f64(), + num_feature_commits, + feature_start.elapsed().as_millis() as f64 / num_feature_commits as f64, + ); + + // Step 3: Advance main branch - modify the SAME AI-tracked files + // This forces the slow path because blob OIDs will differ after rebase + repo.git(&["checkout", &default_branch]).unwrap(); + let main_start = Instant::now(); + + for main_idx in 0..num_main_commits { + // Each main commit modifies a rotating set of AI files at the header + let files_per_main = (num_ai_files / 2).max(5); + let start_file = (main_idx * 7) % num_ai_files; + for i in 0..files_per_main { + let file_idx = (start_file + i) % num_ai_files; + let module = file_idx % 10; + let filename = format!("src/modules/mod_{}/component_{}.rs", module, file_idx); + let path = repo.path().join(&filename); + let current = fs::read_to_string(&path).unwrap_or_default(); + // Insert at the MAIN_INSERTION_POINT - this shifts ALL line numbers + let new_content = current.replacen( + "// MAIN_INSERTION_POINT", + &format!( + "// Main branch change {} in component {}\n// Added config: SETTING_{}={}\n// MAIN_INSERTION_POINT", + main_idx, file_idx, main_idx, file_idx + ), + 1, + ); + fs::write(&path, &new_content).unwrap(); + } + // Also add unrelated files for realism + for i in 0..3 { + let filename = format!("docs/main_change_{}_{}.md", main_idx, i); + let mut file = repo.filename(&filename); + file.set_contents(crate::lines![format!("Main doc {} {}", main_idx, i)]); + } + repo.git(&["add", "-A"]).unwrap(); + repo.stage_all_and_commit(&format!("Main change {}", main_idx)).unwrap(); + } + println!( + "Main branch setup: {:.1}s ({} commits)", + main_start.elapsed().as_secs_f64(), + num_main_commits, + ); + println!( + "Total setup time: {:.1}s", + setup_start.elapsed().as_secs_f64() + ); + + // Step 4: Rebase feature onto main with full instrumentation + repo.git(&["checkout", "feature"]).unwrap(); + + let timing_file = repo.path().join("..").join("heavy_rebase_timing.txt"); + let timing_path = timing_file.to_str().unwrap().to_string(); + + println!("\n━━━ Starting HEAVY rebase ({} commits onto {}) ━━━", num_feature_commits, &default_branch); + let wall_start = Instant::now(); + + // Use benchmark_git for structured timing (captures pre/git/post breakdown) + let bench_result = repo.benchmark_git(&["rebase", &default_branch]); + let wall_duration = wall_start.elapsed(); + + match &bench_result { + Ok(bench) => { + let git_ms = bench.git_duration.as_millis(); + let total_ms = bench.total_duration.as_millis(); + let pre_ms = bench.pre_command_duration.as_millis(); + let post_ms = bench.post_command_duration.as_millis(); + let overhead_ms = total_ms.saturating_sub(git_ms); + let overhead_pct = if git_ms > 0 { + overhead_ms as f64 / git_ms as f64 * 100.0 + } else { + 0.0 + }; + + println!("\n╔══════════════════════════════════════════════════════════╗"); + println!("║ HEAVY BENCHMARK RESULTS ║"); + println!("╠══════════════════════════════════════════════════════════╣"); + println!("║ Configuration: ║"); + println!("║ AI files: {} ", num_ai_files); + println!("║ Lines/file: {} ", lines_per_file); + println!("║ Feature commits: {} ", num_feature_commits); + println!("║ Main commits: {} ", num_main_commits); + println!("║ Files/commit: {} ", files_per_commit); + println!("╠══════════════════════════════════════════════════════════╣"); + println!("║ Timing: ║"); + println!("║ Wall time: {:.3}s ", wall_duration.as_secs_f64()); + println!("║ Total (wrapper): {}ms ", total_ms); + println!("║ Git rebase: {}ms ", git_ms); + println!("║ Pre-command: {}ms ", pre_ms); + println!("║ Post-command: {}ms ", post_ms); + println!("║ Overhead: {}ms ({:.1}% of git) ", overhead_ms, overhead_pct); + println!("╠══════════════════════════════════════════════════════════╣"); + println!("║ Per-commit averages: ║"); + println!("║ Total: {:.1}ms ", total_ms as f64 / num_feature_commits as f64); + println!("║ Git: {:.1}ms ", git_ms as f64 / num_feature_commits as f64); + println!("║ Overhead: {:.1}ms ", overhead_ms as f64 / num_feature_commits as f64); + println!("╚══════════════════════════════════════════════════════════╝\n"); + } + Err(e) => { + println!( + "Benchmark failed after {:.3}s: {}", + wall_duration.as_secs_f64(), + e + ); + panic!("Heavy benchmark failed: {}", e); + } + } + + // Also read timing file if available + if let Ok(timing_data) = fs::read_to_string(&timing_file) { + println!("=== PHASE TIMING BREAKDOWN ==="); + print!("{}", timing_data); + println!("===============================\n"); + } +} + +/// Same as heavy benchmark but with timing file output for phase analysis +#[test] +#[ignore] +fn benchmark_rebase_heavy_with_timing() { + let num_ai_files: usize = std::env::var("HEAVY_BENCH_AI_FILES") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(30); + let lines_per_file: usize = std::env::var("HEAVY_BENCH_LINES_PER_FILE") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(200); + let num_feature_commits: usize = std::env::var("HEAVY_BENCH_FEATURE_COMMITS") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(100); + let num_main_commits: usize = std::env::var("HEAVY_BENCH_MAIN_COMMITS") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(15); + + println!("\n=== Heavy Rebase Benchmark (with timing) ==="); + println!("AI files: {}, Lines/file: {}, Feature commits: {}, Main commits: {}", + num_ai_files, lines_per_file, num_feature_commits, num_main_commits); + println!("=============================================\n"); + + let repo = TestRepo::new(); + + // Create initial files + for file_idx in 0..num_ai_files { + let module = file_idx % 8; + let filename = format!("src/mod_{}/file_{}.rs", module, file_idx); + let mut file = repo.filename(&filename); + let mut lines: Vec = Vec::new(); + lines.push(format!("// File {} header", file_idx).into()); + lines.push("// MAIN_MARKER".into()); + for line_idx in 0..lines_per_file { + lines.push( + format!("fn f_{}_{}() {{ /* AI */ }}", file_idx, line_idx).ai() + ); + } + lines.push("// EOF".into()); + file.set_contents(lines); + } + repo.stage_all_and_commit("Initial AI files").unwrap(); + let default_branch = repo.current_branch(); + + // Feature branch + repo.git(&["checkout", "-b", "feature"]).unwrap(); + let feature_start = Instant::now(); + for commit_idx in 0..num_feature_commits { + for file_idx in 0..num_ai_files { + let module = file_idx % 8; + let filename = format!("src/mod_{}/file_{}.rs", module, file_idx); + let path = repo.path().join(&filename); + let current = fs::read_to_string(&path).unwrap_or_default(); + let new_content = current.replacen( + "// EOF", + &format!("fn feat_{}_{}() {{ /* AI v{} */ }}\n// EOF", commit_idx, file_idx, commit_idx), + 1, + ); + fs::write(&path, &new_content).unwrap(); + repo.git_ai(&["checkpoint", "mock_ai", &filename]).unwrap(); + } + repo.git(&["add", "-A"]).unwrap(); + repo.stage_all_and_commit(&format!("feat {}", commit_idx)).unwrap(); + if (commit_idx + 1) % 20 == 0 { + println!(" Feature {}/{} ({:.1}s)", commit_idx + 1, num_feature_commits, feature_start.elapsed().as_secs_f64()); + } + } + println!("Feature setup: {:.1}s", feature_start.elapsed().as_secs_f64()); + + // Main branch modifications + repo.git(&["checkout", &default_branch]).unwrap(); + for main_idx in 0..num_main_commits { + for file_idx in 0..num_ai_files { + let module = file_idx % 8; + let filename = format!("src/mod_{}/file_{}.rs", module, file_idx); + let path = repo.path().join(&filename); + let current = fs::read_to_string(&path).unwrap_or_default(); + let new_content = current.replacen( + "// MAIN_MARKER", + &format!("// main change {} f{}\n// MAIN_MARKER", main_idx, file_idx), + 1, + ); + fs::write(&path, &new_content).unwrap(); + } + repo.git(&["add", "-A"]).unwrap(); + repo.stage_all_and_commit(&format!("main {}", main_idx)).unwrap(); + } + + // Rebase with timing + repo.git(&["checkout", "feature"]).unwrap(); + let timing_file = repo.path().join("..").join("heavy_timing.txt"); + let timing_path = timing_file.to_str().unwrap().to_string(); + + println!("\n--- Starting rebase ---"); + let start = Instant::now(); + let result = repo.git_with_env( + &["rebase", &default_branch], + &[ + ("GIT_AI_DEBUG_PERFORMANCE", "2"), + ("GIT_AI_REBASE_TIMING_FILE", &timing_path), + ], + None, + ); + let dur = start.elapsed(); + + match &result { + Ok(_) => println!("Rebase succeeded in {:.3}s", dur.as_secs_f64()), + Err(e) => println!("Rebase FAILED in {:.3}s: {}", dur.as_secs_f64(), e), + } + result.unwrap(); + + if let Ok(timing_data) = fs::read_to_string(&timing_file) { + println!("\n=== PHASE TIMING ==="); + print!("{}", timing_data); + println!("====================\n"); + } + + println!("Total: {:.3}s, Per-commit: {:.1}ms", + dur.as_secs_f64(), + dur.as_millis() as f64 / num_feature_commits as f64 + ); +} + fn extract_timing(data: &str, key: &str) -> Option { for line in data.lines() { let trimmed = line.trim(); From c7522b2abc9eccc70b9ccbfa7b8ab0c3bdeb48c2 Mon Sep 17 00:00:00 2001 From: Sasha Varlamov Date: Sun, 29 Mar 2026 02:08:56 +0000 Subject: [PATCH 02/16] style: cargo fmt Co-Authored-By: Claude Opus 4.6 --- SCRATCHPAD_REBASE_PERF.md | 174 ++++++++++++++++++++++++++ src/authorship/rebase_authorship.rs | 36 +++++- tests/integration/rebase_benchmark.rs | 170 +++++++++++++++++++------ 3 files changed, 335 insertions(+), 45 deletions(-) create mode 100644 SCRATCHPAD_REBASE_PERF.md diff --git a/SCRATCHPAD_REBASE_PERF.md b/SCRATCHPAD_REBASE_PERF.md new file mode 100644 index 000000000..1f5d0fe95 --- /dev/null +++ b/SCRATCHPAD_REBASE_PERF.md @@ -0,0 +1,174 @@ +# Rebase Performance Deep Dive - Scratchpad + +## Timeline +- Started: 2026-03-28 18:43 UTC +- Phase 1 (Benchmark + Profile): until 22:00 UTC ✅ +- Phase 2 (Solution Planning): 22:00-23:30 UTC (IN PROGRESS) +- Phase 3 (Implementation): 23:30-04:00 UTC +- Phase 4 (Report): after 04:00 UTC + +## Architecture Summary + +### Rebase Attribution Pipeline (end-to-end) +1. **Pre-rebase hook**: Captures original HEAD, logs RebaseStartEvent +2. **Git rebase executes**: Replays commits onto new base, creates new SHAs +3. **Post-rebase hook**: Calls `rewrite_authorship_after_rebase_v2()` + +### rewrite_authorship_after_rebase_v2() Steps +1. `load_rebase_note_cache()` - Batch reads all note blob OIDs + contents (2 git calls) +2. `try_fast_path_rebase_note_remap_cached()` - If all tracked file blobs identical → direct note copy (returns early) +3. `run_diff_tree_for_commits()` - Single `git diff-tree --stdin --raw -z` to find changed files per commit +4. `try_reconstruct_attributions_from_notes_cached()` - Reconstructs initial attribution state from original notes +5. `batch_read_blob_contents_parallel()` - Reads all blob contents in parallel batches +6. **COMMIT PROCESSING LOOP** (the bottleneck): + - For each commit: for each changed file: `diff_based_line_attribution_transfer()` + - Uses `imara_diff::Myers` to diff old content vs new content + - Transfers attributions positionally based on diff ops + - Updates serialization cache and metrics +7. `notes_add_batch()` - Writes all notes via `git fast-import` + +### Why We Must Look at Each Commit +- AI attribution notes are SCOPED to changes in each specific commit +- Line ranges shift as code changes across the commit sequence +- Each rebased commit may have different file content than the original +- Must track attribution evolution through the entire sequence +- Notes store per-file, per-prompt line ranges that need exact recomputation + +## Benchmark Results + +### Run 1: Small (20 commits × 10 files × 50 lines) +- Total rebase: 179ms (git: 139ms, overhead: 40ms = 28.8%) +- Per-commit overhead: 2.0ms + +### Run 2: Medium (100 commits × 30 files × 200 lines) +- Total rebase: 2788ms +- Authorship rewrite total: 1546ms +- **Phase breakdown:** + - load_rebase_note_cache: 31ms (2%) + - diff_tree: 47ms (3%) + - attribution_reconstruction: 9ms (1%) + - blob_read_parallel: 59ms (4%) + - **commit_processing_loop: 1347ms (87%)** + - **loop:transform: 1294ms (84%)** + - **transform:diff: 1328ms (86%) ← THE BOTTLENECK** + - transform:attestation_serialize: <1ms + - transform:content_clone: 3ms + - transform:metrics: <1ms + - loop:serialize: <1ms + - loop:metrics: <1ms + - notes_add_batch: 23ms (1.5%) +- Stats: 3000 file diffs, 805,500 total lines diffed + +### Run 3: Heavy (200 commits × 50 files × 300 lines) [IN PROGRESS] +- Expected: ~10,000 file diffs, ~4M total lines, ~6-7s diff time + +## THE BOTTLENECK: diff_based_line_attribution_transfer() + +### What It Does +For each changed file in each commit: +1. Split old + new content into Vec<&str> lines +2. Build old_line_author lookup (Vec>) +3. Create InternedInput (hashes ALL lines for interning) +4. Run Myers diff algorithm +5. Walk DiffOps to transfer attributions +6. Create Vec result + +### Why It's Expensive +- 3000 diffs × 0.44ms/diff = 1328ms +- Files grow linearly (200 → 300 lines over 100 commits) +- Total work is O(N_commits × N_files × avg_file_size) +- Quadratic in commits for growing files: O(N² × files) +- InternedInput hashes every line every time (no reuse) + +### What's NOT Expensive (everything else) +- Blob reading: 59ms (parallel, efficient) +- Note cache: 31ms (batch git calls) +- diff-tree: 47ms (single git call) +- Serialization: <1ms (cached assembly) +- Note writing: 23ms (git fast-import) +- Metrics: <1ms + +## Solution Ideas (Ranked by Expected Impact) + +### IDEA A: Hunk-Based Attribution Transfer via git diff-tree -p -U0 +**Impact: HIGH (eliminates diff computation entirely)** +**Complexity: MEDIUM** + +Instead of re-diffing file contents ourselves, use `git diff-tree --stdin -p -U0` which gives +minimal unified diff output with hunk headers showing exactly what line ranges changed. + +Parse hunk headers like `@@ -10,5 +12,6 @@` to build line offset maps: +- Lines before hunk: map 1:1 +- Deleted lines in hunk: dropped +- Inserted lines in hunk: new (no attribution) +- Lines after hunk: map 1:1 with accumulated offset + +This eliminates: +- Reading blob contents entirely (~60ms saved) +- Running ANY diff algorithm (~1328ms saved) +- Only requires parsing diff-tree output (~50ms estimated) + +**Key subtlety**: For commit N1, diff-tree compares against new_base, but our initial attributions +are from original_head. For N2+, diff-tree parent IS our accumulated state. + +**Solution**: For N1, use the content-diff fallback. For N2+, use hunk-based transfer. +Since N1 is 1 commit out of 200, the overhead is negligible. + +### IDEA B: Parallel Per-File Processing +**Impact: MEDIUM-HIGH (4-8x speedup on multi-core)** +**Complexity: LOW** + +The file diffs within each commit are independent. Parallelize using smol/rayon. +Each file reads from shared immutable state and produces independent results that +are merged back. + +Can be combined with IDEA A for multiplicative benefit. + +### IDEA C: Fast-Path for Simple Changes +**Impact: MEDIUM (skips diff for common patterns)** +**Complexity: LOW** + +Detect simple change patterns and avoid the full diff: +1. **Pure append**: new_content starts with old_content → keep all old attrs unchanged +2. **Single insertion**: common prefix + common suffix = full old content → shift attrs after insert point +3. **Single deletion**: similar detection + +For the typical AI coding workflow (appending functions), this could skip >50% of diffs. + +## Key Git Internals Insight + +`git diff-tree --stdin -p -U0` compares each commit against its parent and outputs +minimal unified diff patches. For a linear rebase sequence (N1→N2→...→NN), +each Nk's parent is N(k-1), which is exactly the state we're tracking. + +This means for commits N2..NN, the diff-tree patch output gives us exactly the +line-level change information we need, without any content reading or diff computation. + +Hunk header format: `@@ -old_start[,old_count] +new_start[,new_count] @@` +- old_count defaults to 1 if omitted +- Pure insertion: old_count=0 (e.g., `@@ -10,0 +11,3 @@`) +- Pure deletion: new_count=0 (e.g., `@@ -10,3 +10,0 @@`) +- Replacement: both non-zero + +## Implementation Plan + +### Phase 3A: Implement IDEA A (Hunk-Based Transfer) +1. Add `run_diff_tree_with_patches()` function that runs `git diff-tree --stdin -p -U0` +2. Parse output to extract per-commit, per-file hunk headers +3. Implement `apply_hunk_offsets_to_attributions()` that adjusts line ranges +4. Replace the content-diff loop with hunk-based transfer (N2+), keep content-diff for N1 +5. Skip blob content reading for files where only hunk-based transfer is needed + +### Phase 3B: Implement IDEA B (Parallel Processing) +1. Restructure the inner file loop to collect independent work items +2. Process files in parallel using smol::spawn or rayon +3. Collect results and merge back into shared state + +### Phase 3C: Implement IDEA C (Fast-Path Detection) +1. Add prefix/suffix check before full diff +2. Handle append-only and single-insertion cases with arithmetic + +### Verification +- Run existing integration tests to ensure correctness +- Run heavy benchmark to measure improvement +- Compare attribution output before/after optimization diff --git a/src/authorship/rebase_authorship.rs b/src/authorship/rebase_authorship.rs index 7128c8732..703395e6d 100644 --- a/src/authorship/rebase_authorship.rs +++ b/src/authorship/rebase_authorship.rs @@ -1548,12 +1548,36 @@ pub fn rewrite_authorship_after_rebase_v2( loop_start.elapsed().as_millis(), )); timing_phases.push((" loop:transform".to_string(), loop_transform_ms)); - timing_phases.push((format!(" transform:diff ({} files, {} lines)", total_files_diffed, total_lines_diffed), loop_diff_ms / 1000)); - timing_phases.push((format!(" transform:hunk_transfer ({} files)", total_files_hunk_transferred), loop_hunk_ms / 1000)); - timing_phases.push((format!(" transform:attestation_serialize"), loop_attestation_ms / 1000)); - timing_phases.push((format!(" transform:content_clone"), loop_content_clone_ms / 1000)); - timing_phases.push((format!(" transform:metrics_subtract"), loop_metrics_subtract_ms / 1000)); - timing_phases.push((format!(" transform:metrics_add"), loop_metrics_add_ms / 1000)); + timing_phases.push(( + format!( + " transform:diff ({} files, {} lines)", + total_files_diffed, total_lines_diffed + ), + loop_diff_ms / 1000, + )); + timing_phases.push(( + format!( + " transform:hunk_transfer ({} files)", + total_files_hunk_transferred + ), + loop_hunk_ms / 1000, + )); + timing_phases.push(( + format!(" transform:attestation_serialize"), + loop_attestation_ms / 1000, + )); + timing_phases.push(( + format!(" transform:content_clone"), + loop_content_clone_ms / 1000, + )); + timing_phases.push(( + format!(" transform:metrics_subtract"), + loop_metrics_subtract_ms / 1000, + )); + timing_phases.push(( + format!(" transform:metrics_add"), + loop_metrics_add_ms / 1000, + )); timing_phases.push((" loop:serialize".to_string(), loop_serialize_ms)); timing_phases.push((" loop:metrics".to_string(), loop_metrics_ms)); diff --git a/tests/integration/rebase_benchmark.rs b/tests/integration/rebase_benchmark.rs index 0510b9a5d..677ae1051 100644 --- a/tests/integration/rebase_benchmark.rs +++ b/tests/integration/rebase_benchmark.rs @@ -633,12 +633,30 @@ fn benchmark_rebase_heavy() { println!("\n╔══════════════════════════════════════════════════════════╗"); println!("║ HEAVY REBASE BENCHMARK ║"); println!("╠══════════════════════════════════════════════════════════╣"); - println!("║ AI files: {:<10} ║", num_ai_files); - println!("║ Lines per file: {:<10} ║", lines_per_file); - println!("║ Feature commits: {:<10} ║", num_feature_commits); - println!("║ Main commits: {:<10} ║", num_main_commits); - println!("║ Files per commit: {:<10} ║", files_per_commit); - println!("║ Total initial lines: {:<10} ║", num_ai_files * lines_per_file); + println!( + "║ AI files: {:<10} ║", + num_ai_files + ); + println!( + "║ Lines per file: {:<10} ║", + lines_per_file + ); + println!( + "║ Feature commits: {:<10} ║", + num_feature_commits + ); + println!( + "║ Main commits: {:<10} ║", + num_main_commits + ); + println!( + "║ Files per commit: {:<10} ║", + files_per_commit + ); + println!( + "║ Total initial lines: {:<10} ║", + num_ai_files * lines_per_file + ); println!("╚══════════════════════════════════════════════════════════╝\n"); let repo = TestRepo::new(); @@ -652,23 +670,35 @@ fn benchmark_rebase_heavy() { let mut file = repo.filename(&filename); let mut lines: Vec = Vec::new(); // Header region (will be modified by main branch) - lines.push(format!("// Module {} Component {} - Auto-generated", module, file_idx).into()); + lines.push( + format!( + "// Module {} Component {} - Auto-generated", + module, file_idx + ) + .into(), + ); lines.push("// MAIN_INSERTION_POINT".into()); lines.push(format!("pub mod component_{} {{", file_idx).into()); // AI-generated body for line_idx in 0..lines_per_file { let line = format!( " pub fn func_{}_{}() -> i32 {{ {} }} // AI generated", - file_idx, line_idx, line_idx * file_idx + 1 + file_idx, + line_idx, + line_idx * file_idx + 1 ); lines.push(line.ai()); } lines.push("} // end module".into()); file.set_contents(lines); } - repo.stage_all_and_commit("Initial: all AI-tracked files").unwrap(); + repo.stage_all_and_commit("Initial: all AI-tracked files") + .unwrap(); } - println!("Initial commit: {:.1}s", setup_start.elapsed().as_secs_f64()); + println!( + "Initial commit: {:.1}s", + setup_start.elapsed().as_secs_f64() + ); let default_branch = repo.current_branch(); @@ -697,10 +727,12 @@ fn benchmark_rebase_heavy() { ); fs::write(&path, &new_content).unwrap(); // Checkpoint EVERY file as AI-authored - repo.git_ai(&["checkpoint", "mock_ai_agent", &filename]).unwrap(); + repo.git_ai(&["checkpoint", "mock_ai_agent", &filename]) + .unwrap(); } repo.git(&["add", "-A"]).unwrap(); - repo.stage_all_and_commit(&format!("AI feature commit {}", commit_idx)).unwrap(); + repo.stage_all_and_commit(&format!("AI feature commit {}", commit_idx)) + .unwrap(); if (commit_idx + 1) % 25 == 0 { println!( @@ -752,7 +784,8 @@ fn benchmark_rebase_heavy() { file.set_contents(crate::lines![format!("Main doc {} {}", main_idx, i)]); } repo.git(&["add", "-A"]).unwrap(); - repo.stage_all_and_commit(&format!("Main change {}", main_idx)).unwrap(); + repo.stage_all_and_commit(&format!("Main change {}", main_idx)) + .unwrap(); } println!( "Main branch setup: {:.1}s ({} commits)", @@ -770,7 +803,10 @@ fn benchmark_rebase_heavy() { let timing_file = repo.path().join("..").join("heavy_rebase_timing.txt"); let timing_path = timing_file.to_str().unwrap().to_string(); - println!("\n━━━ Starting HEAVY rebase ({} commits onto {}) ━━━", num_feature_commits, &default_branch); + println!( + "\n━━━ Starting HEAVY rebase ({} commits onto {}) ━━━", + num_feature_commits, &default_branch + ); let wall_start = Instant::now(); // Use benchmark_git for structured timing (captures pre/git/post breakdown) @@ -794,24 +830,66 @@ fn benchmark_rebase_heavy() { println!("║ HEAVY BENCHMARK RESULTS ║"); println!("╠══════════════════════════════════════════════════════════╣"); println!("║ Configuration: ║"); - println!("║ AI files: {} ", num_ai_files); - println!("║ Lines/file: {} ", lines_per_file); - println!("║ Feature commits: {} ", num_feature_commits); - println!("║ Main commits: {} ", num_main_commits); - println!("║ Files/commit: {} ", files_per_commit); + println!( + "║ AI files: {} ", + num_ai_files + ); + println!( + "║ Lines/file: {} ", + lines_per_file + ); + println!( + "║ Feature commits: {} ", + num_feature_commits + ); + println!( + "║ Main commits: {} ", + num_main_commits + ); + println!( + "║ Files/commit: {} ", + files_per_commit + ); println!("╠══════════════════════════════════════════════════════════╣"); println!("║ Timing: ║"); - println!("║ Wall time: {:.3}s ", wall_duration.as_secs_f64()); - println!("║ Total (wrapper): {}ms ", total_ms); - println!("║ Git rebase: {}ms ", git_ms); - println!("║ Pre-command: {}ms ", pre_ms); - println!("║ Post-command: {}ms ", post_ms); - println!("║ Overhead: {}ms ({:.1}% of git) ", overhead_ms, overhead_pct); + println!( + "║ Wall time: {:.3}s ", + wall_duration.as_secs_f64() + ); + println!( + "║ Total (wrapper): {}ms ", + total_ms + ); + println!( + "║ Git rebase: {}ms ", + git_ms + ); + println!( + "║ Pre-command: {}ms ", + pre_ms + ); + println!( + "║ Post-command: {}ms ", + post_ms + ); + println!( + "║ Overhead: {}ms ({:.1}% of git) ", + overhead_ms, overhead_pct + ); println!("╠══════════════════════════════════════════════════════════╣"); println!("║ Per-commit averages: ║"); - println!("║ Total: {:.1}ms ", total_ms as f64 / num_feature_commits as f64); - println!("║ Git: {:.1}ms ", git_ms as f64 / num_feature_commits as f64); - println!("║ Overhead: {:.1}ms ", overhead_ms as f64 / num_feature_commits as f64); + println!( + "║ Total: {:.1}ms ", + total_ms as f64 / num_feature_commits as f64 + ); + println!( + "║ Git: {:.1}ms ", + git_ms as f64 / num_feature_commits as f64 + ); + println!( + "║ Overhead: {:.1}ms ", + overhead_ms as f64 / num_feature_commits as f64 + ); println!("╚══════════════════════════════════════════════════════════╝\n"); } Err(e) => { @@ -854,8 +932,10 @@ fn benchmark_rebase_heavy_with_timing() { .unwrap_or(15); println!("\n=== Heavy Rebase Benchmark (with timing) ==="); - println!("AI files: {}, Lines/file: {}, Feature commits: {}, Main commits: {}", - num_ai_files, lines_per_file, num_feature_commits, num_main_commits); + println!( + "AI files: {}, Lines/file: {}, Feature commits: {}, Main commits: {}", + num_ai_files, lines_per_file, num_feature_commits, num_main_commits + ); println!("=============================================\n"); let repo = TestRepo::new(); @@ -869,9 +949,7 @@ fn benchmark_rebase_heavy_with_timing() { lines.push(format!("// File {} header", file_idx).into()); lines.push("// MAIN_MARKER".into()); for line_idx in 0..lines_per_file { - lines.push( - format!("fn f_{}_{}() {{ /* AI */ }}", file_idx, line_idx).ai() - ); + lines.push(format!("fn f_{}_{}() {{ /* AI */ }}", file_idx, line_idx).ai()); } lines.push("// EOF".into()); file.set_contents(lines); @@ -890,19 +968,31 @@ fn benchmark_rebase_heavy_with_timing() { let current = fs::read_to_string(&path).unwrap_or_default(); let new_content = current.replacen( "// EOF", - &format!("fn feat_{}_{}() {{ /* AI v{} */ }}\n// EOF", commit_idx, file_idx, commit_idx), + &format!( + "fn feat_{}_{}() {{ /* AI v{} */ }}\n// EOF", + commit_idx, file_idx, commit_idx + ), 1, ); fs::write(&path, &new_content).unwrap(); repo.git_ai(&["checkpoint", "mock_ai", &filename]).unwrap(); } repo.git(&["add", "-A"]).unwrap(); - repo.stage_all_and_commit(&format!("feat {}", commit_idx)).unwrap(); + repo.stage_all_and_commit(&format!("feat {}", commit_idx)) + .unwrap(); if (commit_idx + 1) % 20 == 0 { - println!(" Feature {}/{} ({:.1}s)", commit_idx + 1, num_feature_commits, feature_start.elapsed().as_secs_f64()); + println!( + " Feature {}/{} ({:.1}s)", + commit_idx + 1, + num_feature_commits, + feature_start.elapsed().as_secs_f64() + ); } } - println!("Feature setup: {:.1}s", feature_start.elapsed().as_secs_f64()); + println!( + "Feature setup: {:.1}s", + feature_start.elapsed().as_secs_f64() + ); // Main branch modifications repo.git(&["checkout", &default_branch]).unwrap(); @@ -920,7 +1010,8 @@ fn benchmark_rebase_heavy_with_timing() { fs::write(&path, &new_content).unwrap(); } repo.git(&["add", "-A"]).unwrap(); - repo.stage_all_and_commit(&format!("main {}", main_idx)).unwrap(); + repo.stage_all_and_commit(&format!("main {}", main_idx)) + .unwrap(); } // Rebase with timing @@ -952,7 +1043,8 @@ fn benchmark_rebase_heavy_with_timing() { println!("====================\n"); } - println!("Total: {:.3}s, Per-commit: {:.1}ms", + println!( + "Total: {:.3}s, Per-commit: {:.1}ms", dur.as_secs_f64(), dur.as_millis() as f64 / num_feature_commits as f64 ); From c37fc0f21f9a57c3d16cf31afac6857c7abde620 Mon Sep 17 00:00:00 2001 From: Sasha Varlamov Date: Sun, 29 Mar 2026 02:09:01 +0000 Subject: [PATCH 03/16] chore: remove scratchpad from tracking Co-Authored-By: Claude Opus 4.6 --- SCRATCHPAD_REBASE_PERF.md | 174 -------------------------------------- 1 file changed, 174 deletions(-) delete mode 100644 SCRATCHPAD_REBASE_PERF.md diff --git a/SCRATCHPAD_REBASE_PERF.md b/SCRATCHPAD_REBASE_PERF.md deleted file mode 100644 index 1f5d0fe95..000000000 --- a/SCRATCHPAD_REBASE_PERF.md +++ /dev/null @@ -1,174 +0,0 @@ -# Rebase Performance Deep Dive - Scratchpad - -## Timeline -- Started: 2026-03-28 18:43 UTC -- Phase 1 (Benchmark + Profile): until 22:00 UTC ✅ -- Phase 2 (Solution Planning): 22:00-23:30 UTC (IN PROGRESS) -- Phase 3 (Implementation): 23:30-04:00 UTC -- Phase 4 (Report): after 04:00 UTC - -## Architecture Summary - -### Rebase Attribution Pipeline (end-to-end) -1. **Pre-rebase hook**: Captures original HEAD, logs RebaseStartEvent -2. **Git rebase executes**: Replays commits onto new base, creates new SHAs -3. **Post-rebase hook**: Calls `rewrite_authorship_after_rebase_v2()` - -### rewrite_authorship_after_rebase_v2() Steps -1. `load_rebase_note_cache()` - Batch reads all note blob OIDs + contents (2 git calls) -2. `try_fast_path_rebase_note_remap_cached()` - If all tracked file blobs identical → direct note copy (returns early) -3. `run_diff_tree_for_commits()` - Single `git diff-tree --stdin --raw -z` to find changed files per commit -4. `try_reconstruct_attributions_from_notes_cached()` - Reconstructs initial attribution state from original notes -5. `batch_read_blob_contents_parallel()` - Reads all blob contents in parallel batches -6. **COMMIT PROCESSING LOOP** (the bottleneck): - - For each commit: for each changed file: `diff_based_line_attribution_transfer()` - - Uses `imara_diff::Myers` to diff old content vs new content - - Transfers attributions positionally based on diff ops - - Updates serialization cache and metrics -7. `notes_add_batch()` - Writes all notes via `git fast-import` - -### Why We Must Look at Each Commit -- AI attribution notes are SCOPED to changes in each specific commit -- Line ranges shift as code changes across the commit sequence -- Each rebased commit may have different file content than the original -- Must track attribution evolution through the entire sequence -- Notes store per-file, per-prompt line ranges that need exact recomputation - -## Benchmark Results - -### Run 1: Small (20 commits × 10 files × 50 lines) -- Total rebase: 179ms (git: 139ms, overhead: 40ms = 28.8%) -- Per-commit overhead: 2.0ms - -### Run 2: Medium (100 commits × 30 files × 200 lines) -- Total rebase: 2788ms -- Authorship rewrite total: 1546ms -- **Phase breakdown:** - - load_rebase_note_cache: 31ms (2%) - - diff_tree: 47ms (3%) - - attribution_reconstruction: 9ms (1%) - - blob_read_parallel: 59ms (4%) - - **commit_processing_loop: 1347ms (87%)** - - **loop:transform: 1294ms (84%)** - - **transform:diff: 1328ms (86%) ← THE BOTTLENECK** - - transform:attestation_serialize: <1ms - - transform:content_clone: 3ms - - transform:metrics: <1ms - - loop:serialize: <1ms - - loop:metrics: <1ms - - notes_add_batch: 23ms (1.5%) -- Stats: 3000 file diffs, 805,500 total lines diffed - -### Run 3: Heavy (200 commits × 50 files × 300 lines) [IN PROGRESS] -- Expected: ~10,000 file diffs, ~4M total lines, ~6-7s diff time - -## THE BOTTLENECK: diff_based_line_attribution_transfer() - -### What It Does -For each changed file in each commit: -1. Split old + new content into Vec<&str> lines -2. Build old_line_author lookup (Vec>) -3. Create InternedInput (hashes ALL lines for interning) -4. Run Myers diff algorithm -5. Walk DiffOps to transfer attributions -6. Create Vec result - -### Why It's Expensive -- 3000 diffs × 0.44ms/diff = 1328ms -- Files grow linearly (200 → 300 lines over 100 commits) -- Total work is O(N_commits × N_files × avg_file_size) -- Quadratic in commits for growing files: O(N² × files) -- InternedInput hashes every line every time (no reuse) - -### What's NOT Expensive (everything else) -- Blob reading: 59ms (parallel, efficient) -- Note cache: 31ms (batch git calls) -- diff-tree: 47ms (single git call) -- Serialization: <1ms (cached assembly) -- Note writing: 23ms (git fast-import) -- Metrics: <1ms - -## Solution Ideas (Ranked by Expected Impact) - -### IDEA A: Hunk-Based Attribution Transfer via git diff-tree -p -U0 -**Impact: HIGH (eliminates diff computation entirely)** -**Complexity: MEDIUM** - -Instead of re-diffing file contents ourselves, use `git diff-tree --stdin -p -U0` which gives -minimal unified diff output with hunk headers showing exactly what line ranges changed. - -Parse hunk headers like `@@ -10,5 +12,6 @@` to build line offset maps: -- Lines before hunk: map 1:1 -- Deleted lines in hunk: dropped -- Inserted lines in hunk: new (no attribution) -- Lines after hunk: map 1:1 with accumulated offset - -This eliminates: -- Reading blob contents entirely (~60ms saved) -- Running ANY diff algorithm (~1328ms saved) -- Only requires parsing diff-tree output (~50ms estimated) - -**Key subtlety**: For commit N1, diff-tree compares against new_base, but our initial attributions -are from original_head. For N2+, diff-tree parent IS our accumulated state. - -**Solution**: For N1, use the content-diff fallback. For N2+, use hunk-based transfer. -Since N1 is 1 commit out of 200, the overhead is negligible. - -### IDEA B: Parallel Per-File Processing -**Impact: MEDIUM-HIGH (4-8x speedup on multi-core)** -**Complexity: LOW** - -The file diffs within each commit are independent. Parallelize using smol/rayon. -Each file reads from shared immutable state and produces independent results that -are merged back. - -Can be combined with IDEA A for multiplicative benefit. - -### IDEA C: Fast-Path for Simple Changes -**Impact: MEDIUM (skips diff for common patterns)** -**Complexity: LOW** - -Detect simple change patterns and avoid the full diff: -1. **Pure append**: new_content starts with old_content → keep all old attrs unchanged -2. **Single insertion**: common prefix + common suffix = full old content → shift attrs after insert point -3. **Single deletion**: similar detection - -For the typical AI coding workflow (appending functions), this could skip >50% of diffs. - -## Key Git Internals Insight - -`git diff-tree --stdin -p -U0` compares each commit against its parent and outputs -minimal unified diff patches. For a linear rebase sequence (N1→N2→...→NN), -each Nk's parent is N(k-1), which is exactly the state we're tracking. - -This means for commits N2..NN, the diff-tree patch output gives us exactly the -line-level change information we need, without any content reading or diff computation. - -Hunk header format: `@@ -old_start[,old_count] +new_start[,new_count] @@` -- old_count defaults to 1 if omitted -- Pure insertion: old_count=0 (e.g., `@@ -10,0 +11,3 @@`) -- Pure deletion: new_count=0 (e.g., `@@ -10,3 +10,0 @@`) -- Replacement: both non-zero - -## Implementation Plan - -### Phase 3A: Implement IDEA A (Hunk-Based Transfer) -1. Add `run_diff_tree_with_patches()` function that runs `git diff-tree --stdin -p -U0` -2. Parse output to extract per-commit, per-file hunk headers -3. Implement `apply_hunk_offsets_to_attributions()` that adjusts line ranges -4. Replace the content-diff loop with hunk-based transfer (N2+), keep content-diff for N1 -5. Skip blob content reading for files where only hunk-based transfer is needed - -### Phase 3B: Implement IDEA B (Parallel Processing) -1. Restructure the inner file loop to collect independent work items -2. Process files in parallel using smol::spawn or rayon -3. Collect results and merge back into shared state - -### Phase 3C: Implement IDEA C (Fast-Path Detection) -1. Add prefix/suffix check before full diff -2. Handle append-only and single-insertion cases with arithmetic - -### Verification -- Run existing integration tests to ensure correctness -- Run heavy benchmark to measure improvement -- Compare attribution output before/after optimization From 531ac21c9dfbcc1af1ab656ca8c95d733451dd21 Mon Sep 17 00:00:00 2001 From: Sasha Varlamov Date: Sun, 29 Mar 2026 02:12:07 +0000 Subject: [PATCH 04/16] fix: clippy lint warnings (useless format, collapsible if) Co-Authored-By: Claude Opus 4.6 --- src/authorship/rebase_authorship.rs | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/authorship/rebase_authorship.rs b/src/authorship/rebase_authorship.rs index 703395e6d..376e8cc0b 100644 --- a/src/authorship/rebase_authorship.rs +++ b/src/authorship/rebase_authorship.rs @@ -1563,19 +1563,19 @@ pub fn rewrite_authorship_after_rebase_v2( loop_hunk_ms / 1000, )); timing_phases.push(( - format!(" transform:attestation_serialize"), + " transform:attestation_serialize".to_string(), loop_attestation_ms / 1000, )); timing_phases.push(( - format!(" transform:content_clone"), + " transform:content_clone".to_string(), loop_content_clone_ms / 1000, )); timing_phases.push(( - format!(" transform:metrics_subtract"), + " transform:metrics_subtract".to_string(), loop_metrics_subtract_ms / 1000, )); timing_phases.push(( - format!(" transform:metrics_add"), + " transform:metrics_add".to_string(), loop_metrics_add_ms / 1000, )); timing_phases.push((" loop:serialize".to_string(), loop_serialize_ms)); @@ -2386,15 +2386,15 @@ fn run_diff_tree_with_hunks( // Hunk header: @@ -old_start[,old_count] +new_start[,new_count] @@ if line.starts_with("@@ ") { - if let (Some(commit), Some(file)) = (¤t_commit, ¤t_diff_file) { - if let Some(hunk) = parse_hunk_header(line) { - hunks_by_commit - .entry(commit.clone()) - .or_default() - .entry(file.clone()) - .or_default() - .push(hunk); - } + if let (Some(commit), Some(file)) = (¤t_commit, ¤t_diff_file) + && let Some(hunk) = parse_hunk_header(line) + { + hunks_by_commit + .entry(commit.clone()) + .or_default() + .entry(file.clone()) + .or_default() + .push(hunk); } continue; } From cc16de642c47c6ed0873cde9085e7b6551a86b37 Mon Sep 17 00:00:00 2001 From: Sasha Varlamov Date: Sun, 29 Mar 2026 02:16:46 +0000 Subject: [PATCH 05/16] fix: remaining clippy lints (unnecessary casts, collapsible ifs, unused var) Co-Authored-By: Claude Opus 4.6 --- src/authorship/rebase_authorship.rs | 18 ++++++++---------- tests/integration/rebase_benchmark.rs | 1 - 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/authorship/rebase_authorship.rs b/src/authorship/rebase_authorship.rs index 376e8cc0b..25bb40252 100644 --- a/src/authorship/rebase_authorship.rs +++ b/src/authorship/rebase_authorship.rs @@ -1398,7 +1398,7 @@ pub fn rewrite_authorship_after_rebase_v2( prev_la, ); } - loop_metrics_subtract_ms += tsub.elapsed().as_micros() as u128; + loop_metrics_subtract_ms += tsub.elapsed().as_micros(); // Check if blob content is available and non-empty (file not deleted) let new_content = new_content_for_changed_files.get(file_path); @@ -1436,7 +1436,7 @@ pub fn rewrite_authorship_after_rebase_v2( .unwrap_or(&[]); let result = apply_hunks_to_line_attributions(old_attrs, hunks); total_files_hunk_transferred += 1; - loop_hunk_ms += thunk.elapsed().as_micros() as u128; + loop_hunk_ms += thunk.elapsed().as_micros(); result } else if let Some(new_content) = new_content { // SLOW PATH: Content-diff based attribution transfer @@ -1452,7 +1452,7 @@ pub fn rewrite_authorship_after_rebase_v2( .map(|(_, la)| la.as_slice()), original_head_line_to_author.get(file_path), ); - loop_diff_ms += tdiff.elapsed().as_micros() as u128; + loop_diff_ms += tdiff.elapsed().as_micros(); files_with_synced_state.insert(file_path.clone()); result } else { @@ -1472,22 +1472,20 @@ pub fn rewrite_authorship_after_rebase_v2( &mut prompt_line_metrics, &line_attrs, ); - loop_metrics_add_ms += tadd.elapsed().as_micros() as u128; + loop_metrics_add_ms += tadd.elapsed().as_micros(); let tatt = std::time::Instant::now(); if let Some(text) = serialize_attestation_from_line_attrs(file_path, &line_attrs) { cached_file_attestation_text.insert(file_path.clone(), text); } else { cached_file_attestation_text.remove(file_path); } - loop_attestation_ms += tatt.elapsed().as_micros() as u128; + loop_attestation_ms += tatt.elapsed().as_micros(); let tclone = std::time::Instant::now(); current_attributions.insert(file_path.clone(), (Vec::new(), line_attrs)); - if !use_hunk_based { - if let Some(content) = new_content { - current_file_contents.insert(file_path.clone(), content.clone()); - } + if !use_hunk_based && let Some(content) = new_content { + current_file_contents.insert(file_path.clone(), content.clone()); } - loop_content_clone_ms += tclone.elapsed().as_micros() as u128; + loop_content_clone_ms += tclone.elapsed().as_micros(); } loop_transform_ms += t0.elapsed().as_millis(); diff --git a/tests/integration/rebase_benchmark.rs b/tests/integration/rebase_benchmark.rs index 677ae1051..a7ab41a48 100644 --- a/tests/integration/rebase_benchmark.rs +++ b/tests/integration/rebase_benchmark.rs @@ -801,7 +801,6 @@ fn benchmark_rebase_heavy() { repo.git(&["checkout", "feature"]).unwrap(); let timing_file = repo.path().join("..").join("heavy_rebase_timing.txt"); - let timing_path = timing_file.to_str().unwrap().to_string(); println!( "\n━━━ Starting HEAVY rebase ({} commits onto {}) ━━━", From fc6d3f8edc57c9d2d0423f7eff0b7e9e5776a040 Mon Sep 17 00:00:00 2001 From: Sasha Varlamov Date: Sun, 29 Mar 2026 04:29:19 +0000 Subject: [PATCH 06/16] perf: optimize attribution_reconstruction with selective cat-file batching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The attribution_reconstruction phase was the dominant bottleneck (51% of total authorship overhead) in realistic rebases. Root cause: reading file contents for ALL pathspecs × ALL commits via cat-file, even when most commits' notes only reference a few files. Key changes: - Selective cat-file batch: only read files referenced in each commit's note attestations, not all pathspecs for every commit (~12x fewer reads in the realistic benchmark) - HEAD note uses already-loaded HEAD file contents (no extra git call) - Always perform full content-based scan across ALL commits' notes (no HEAD-only shortcut — each note only covers its commit's changes) - Avoid unnecessary subtract+add metrics cycle for deleted/skipped files - Use literal "human" comparison instead of allocating via to_str() Realistic benchmark (150 commits × 50 files, 10-2000+ lines): - attribution_reconstruction: 821ms → 192ms (4.3x faster) - Total overhead: 1617ms → 1008ms (1.6x faster) Adds 3 regression tests proving non-HEAD attribution survives rebase: - test_rebase_preserves_attribution_from_non_head_commits - test_rebase_preserves_multi_commit_attribution_same_file - test_rebase_non_head_attribution_survives_slow_path All 79 rebase + 21 cherry-pick tests pass with zero regressions. Co-Authored-By: Claude Opus 4.6 --- src/authorship/rebase_authorship.rs | 335 ++++++++++---------- tests/integration/rebase.rs | 221 +++++++++++++ tests/integration/rebase_benchmark.rs | 429 ++++++++++++++++++++++++++ 3 files changed, 819 insertions(+), 166 deletions(-) diff --git a/src/authorship/rebase_authorship.rs b/src/authorship/rebase_authorship.rs index 25bb40252..130a5d456 100644 --- a/src/authorship/rebase_authorship.rs +++ b/src/authorship/rebase_authorship.rs @@ -691,7 +691,7 @@ fn try_reconstruct_attributions_from_notes_cached( original_head: &str, original_commits: &[String], pathspecs: &[String], - is_squash_rebase: bool, + _is_squash_rebase: bool, note_cache: &RebaseNoteCache, ) -> Option<( HashMap< @@ -707,17 +707,22 @@ fn try_reconstruct_attributions_from_notes_cached( use crate::authorship::attribution_tracker::LineAttribution; use crate::authorship::authorship_log_serialization::AuthorshipLog; - // Get file contents at original_head for all pathspecs in one batch call. - // We need all pathspec contents to build line-to-author maps from note attestations. - let file_contents = batch_read_file_contents_at_commit(repo, original_head, pathspecs).ok()?; - let pathspec_set: HashSet<&str> = pathspecs.iter().map(String::as_str).collect(); - let mut file_line_authors: HashMap> = HashMap::new(); let mut prompts: BTreeMap< String, BTreeMap, > = BTreeMap::new(); + // Read file contents at HEAD — needed for content-based line matching. + let file_contents = + batch_read_file_contents_at_commit(repo, original_head, pathspecs).ok()?; + + // Content-based matching: map line_content → author_id per file. + // Each commit's note only covers lines changed in THAT commit, so we must + // scan all commits' notes and match by content to track attribution across + // position changes (insertions/deletions shift line numbers). + let mut file_line_authors: HashMap> = HashMap::new(); + // Use cached note content for original_head let head_log = note_cache .original_note_contents @@ -725,12 +730,13 @@ fn try_reconstruct_attributions_from_notes_cached( .and_then(|content| AuthorshipLog::deserialize_from_string(content).ok()); if let Some(ref head_log) = head_log { + let head_contents = &file_contents; for file_attestation in &head_log.attestations { let file_path = &file_attestation.file_path; if !pathspec_set.contains(file_path.as_str()) { continue; } - let head_content = file_contents.get(file_path).cloned().unwrap_or_default(); + let head_content = head_contents.get(file_path).cloned().unwrap_or_default(); let lines: Vec<&str> = head_content.lines().collect(); let line_map = file_line_authors.entry(file_path.clone()).or_default(); for entry in &file_attestation.entries { @@ -740,7 +746,9 @@ fn try_reconstruct_attributions_from_notes_cached( crate::authorship::authorship_log::LineRange::Range(s, e) => (*s, *e), }; for line_num in start..=end { - if let Some(line_content) = lines.get(line_num.saturating_sub(1) as usize) { + if let Some(line_content) = + lines.get(line_num.saturating_sub(1) as usize) + { line_map.insert(line_content.to_string(), entry.hash.clone()); } } @@ -755,140 +763,130 @@ fn try_reconstruct_attributions_from_notes_cached( } } - let head_covered_files: HashSet<&str> = file_line_authors.keys().map(String::as_str).collect(); - let need_full_scan = head_log.is_none() - || is_squash_rebase - || pathspecs.iter().any(|p| { - let has_content = file_contents.get(p).map(|c| !c.is_empty()).unwrap_or(false); - has_content && !head_covered_files.contains(p.as_str()) - }); + let mut has_any_note = head_log.is_some(); + let mut parsed_logs: Vec<(String, AuthorshipLog)> = Vec::new(); - if need_full_scan { - // Use cached note contents instead of loading again - let mut has_any_note = head_log.is_some(); - let mut commits_with_notes: Vec = Vec::new(); - let mut parsed_logs: Vec<(String, AuthorshipLog)> = Vec::new(); - - for commit in original_commits { - if commit == original_head { - continue; - } - if let Some(content) = note_cache.original_note_contents.get(commit) { - has_any_note = true; - if let Ok(log) = AuthorshipLog::deserialize_from_string(content) { - commits_with_notes.push(commit.clone()); - parsed_logs.push((commit.clone(), log)); - } + for commit in original_commits { + if commit == original_head { + continue; + } + if let Some(content) = note_cache.original_note_contents.get(commit) { + has_any_note = true; + if let Ok(log) = AuthorshipLog::deserialize_from_string(content) { + parsed_logs.push((commit.clone(), log)); } } + } - if !has_any_note { - return None; - } + if !has_any_note { + return None; + } - if !parsed_logs.is_empty() { - // Batch read file contents for commits with notes - let mut all_refs: Vec<(String, String)> = Vec::new(); - for commit in &commits_with_notes { - for path in pathspecs { - all_refs.push((commit.clone(), path.clone())); + if !parsed_logs.is_empty() { + // Optimization: only read file contents for files actually referenced + // in each commit's attestation, not all pathspecs × all commits. + let mut all_refs: Vec<(String, String)> = Vec::new(); + for (commit, authorship_log) in &parsed_logs { + for file_attestation in &authorship_log.attestations { + if pathspec_set.contains(file_attestation.file_path.as_str()) { + all_refs.push((commit.clone(), file_attestation.file_path.clone())); } } + } - let mut commit_file_contents: HashMap> = HashMap::new(); - if !all_refs.is_empty() { - let mut args = repo.global_args_for_exec(); - args.push("cat-file".to_string()); - args.push("--batch".to_string()); - let stdin_data: String = all_refs - .iter() - .map(|(commit, path)| format!("{}:{}", commit, path)) - .collect::>() - .join("\n") - + "\n"; - if let Ok(output) = exec_git_stdin(&args, stdin_data.as_bytes()) { - let data = &output.stdout; - let mut pos = 0usize; - let mut ref_idx = 0usize; - while pos < data.len() && ref_idx < all_refs.len() { - let header_end = match data[pos..].iter().position(|&b| b == b'\n') { - Some(idx) => pos + idx, - None => break, - }; - let header = std::str::from_utf8(&data[pos..header_end]).unwrap_or(""); - let parts: Vec<&str> = header.split_whitespace().collect(); - let (commit, path) = &all_refs[ref_idx]; - if parts.len() >= 2 && parts[1] == "missing" { - commit_file_contents - .entry(commit.clone()) - .or_default() - .insert(path.clone(), String::new()); - pos = header_end + 1; - ref_idx += 1; - continue; - } - if parts.len() < 3 { - pos = header_end + 1; - ref_idx += 1; - continue; - } - let size: usize = parts[2].parse().unwrap_or(0); - let content_start = header_end + 1; - let content_end = content_start + size; - if content_end <= data.len() { - let content = - String::from_utf8_lossy(&data[content_start..content_end]) - .to_string(); - commit_file_contents - .entry(commit.clone()) - .or_default() - .insert(path.clone(), content); - } - pos = content_end; - if pos < data.len() && data[pos] == b'\n' { - pos += 1; - } + let mut commit_file_contents: HashMap> = HashMap::new(); + if !all_refs.is_empty() { + let mut args = repo.global_args_for_exec(); + args.push("cat-file".to_string()); + args.push("--batch".to_string()); + let stdin_data: String = all_refs + .iter() + .map(|(commit, path)| format!("{}:{}", commit, path)) + .collect::>() + .join("\n") + + "\n"; + if let Ok(output) = exec_git_stdin(&args, stdin_data.as_bytes()) { + let data = &output.stdout; + let mut pos = 0usize; + let mut ref_idx = 0usize; + while pos < data.len() && ref_idx < all_refs.len() { + let header_end = match data[pos..].iter().position(|&b| b == b'\n') { + Some(idx) => pos + idx, + None => break, + }; + let header = std::str::from_utf8(&data[pos..header_end]).unwrap_or(""); + let parts: Vec<&str> = header.split_whitespace().collect(); + let (commit, path) = &all_refs[ref_idx]; + if parts.len() >= 2 && parts[1] == "missing" { + commit_file_contents + .entry(commit.clone()) + .or_default() + .insert(path.clone(), String::new()); + pos = header_end + 1; + ref_idx += 1; + continue; + } + if parts.len() < 3 { + pos = header_end + 1; ref_idx += 1; + continue; + } + let size: usize = parts[2].parse().unwrap_or(0); + let content_start = header_end + 1; + let content_end = content_start + size; + if content_end <= data.len() { + let content = + String::from_utf8_lossy(&data[content_start..content_end]) + .to_string(); + commit_file_contents + .entry(commit.clone()) + .or_default() + .insert(path.clone(), content); + } + pos = content_end; + if pos < data.len() && data[pos] == b'\n' { + pos += 1; } + ref_idx += 1; } } + } - for (commit, authorship_log) in &parsed_logs { - let empty_contents = HashMap::new(); - let commit_contents = commit_file_contents.get(commit).unwrap_or(&empty_contents); - for file_attestation in &authorship_log.attestations { - let file_path = &file_attestation.file_path; - if !pathspec_set.contains(file_path.as_str()) { - continue; - } - let commit_content = - commit_contents.get(file_path).cloned().unwrap_or_default(); - let lines: Vec<&str> = commit_content.lines().collect(); - let line_map = file_line_authors.entry(file_path.clone()).or_default(); - for entry in &file_attestation.entries { - for range in &entry.line_ranges { - let (start, end) = match range { - crate::authorship::authorship_log::LineRange::Single(l) => (*l, *l), - crate::authorship::authorship_log::LineRange::Range(s, e) => { - (*s, *e) - } - }; - for line_num in start..=end { - if let Some(line_content) = - lines.get(line_num.saturating_sub(1) as usize) - { - line_map.insert(line_content.to_string(), entry.hash.clone()); - } + for (commit, authorship_log) in &parsed_logs { + let empty_contents = HashMap::new(); + let commit_contents = commit_file_contents.get(commit).unwrap_or(&empty_contents); + for file_attestation in &authorship_log.attestations { + let file_path = &file_attestation.file_path; + if !pathspec_set.contains(file_path.as_str()) { + continue; + } + let commit_content = + commit_contents.get(file_path).cloned().unwrap_or_default(); + let lines: Vec<&str> = commit_content.lines().collect(); + let line_map = file_line_authors.entry(file_path.clone()).or_default(); + for entry in &file_attestation.entries { + for range in &entry.line_ranges { + let (start, end) = match range { + crate::authorship::authorship_log::LineRange::Single(l) => (*l, *l), + crate::authorship::authorship_log::LineRange::Range(s, e) => { + (*s, *e) + } + }; + for line_num in start..=end { + if let Some(line_content) = + lines.get(line_num.saturating_sub(1) as usize) + { + line_map.insert(line_content.to_string(), entry.hash.clone()); } } } } - for (prompt_id, prompt_record) in &authorship_log.metadata.prompts { - prompts - .entry(prompt_id.clone()) - .or_default() - .insert(commit.clone(), prompt_record.clone()); - } + } + for (prompt_id, prompt_record) in &authorship_log.metadata.prompts { + prompts + .entry(prompt_id.clone()) + .or_default() + .insert(commit.clone(), prompt_record.clone()); } } } @@ -897,7 +895,7 @@ fn try_reconstruct_attributions_from_notes_cached( return None; } - // Build attributions at original_head using the line content -> author map + // Build attributions from content-based line→author map let mut attributions = HashMap::new(); for file_path in pathspecs { let line_map = match file_line_authors.get(file_path) { @@ -922,7 +920,6 @@ fn try_reconstruct_attributions_from_notes_cached( if !line_attrs.is_empty() { line_attrs.sort_by_key(|a| a.start_line); - // Skip char-level attribution computation — only line_attrs are used for rebase attributions.insert(file_path.clone(), (Vec::new(), line_attrs)); } } @@ -1387,45 +1384,39 @@ pub fn rewrite_authorship_after_rebase_v2( let t0 = std::time::Instant::now(); for file_path in &changed_files_in_commit { - // Subtract old metrics before modifying attributions - let tsub = std::time::Instant::now(); - let previous_line_attrs = current_attributions - .get(file_path) - .map(|(_, la)| la.clone()); - if let Some(ref prev_la) = previous_line_attrs { - subtract_prompt_line_metrics_for_line_attributions( - &mut prompt_line_metrics, - prev_la, - ); - } - loop_metrics_subtract_ms += tsub.elapsed().as_micros(); - // Check if blob content is available and non-empty (file not deleted) let new_content = new_content_for_changed_files.get(file_path); let is_file_deleted = new_content.map(|c| c.is_empty()).unwrap_or(false); if is_file_deleted { - // File deleted - if let Some(ref prev_la) = previous_line_attrs { - add_prompt_line_metrics_for_line_attributions( - &mut prompt_line_metrics, - prev_la, - ); - } + // File deleted — metrics stay unchanged (no subtract/add cycle) cached_file_attestation_text.remove(file_path); existing_files.remove(file_path); continue; } // Decide: use hunk-based transfer or content-diff? - // Hunk-based: valid when our accumulated state matches the commit's parent. - // Content-diff: required for first appearance of each file (state not yet synced). let has_hunks = commit_hunks .and_then(|ch| ch.get(file_path.as_str())) .is_some(); let use_hunk_based = files_with_synced_state.contains(file_path.as_str()) && has_hunks; + // Skip early if no data available (avoids wasted subtract+add cycle) + if !use_hunk_based && new_content.is_none() { + continue; + } + + // Subtract old metrics before modifying attributions (borrow, no clone) + let tsub = std::time::Instant::now(); + if let Some((_, prev_la)) = current_attributions.get(file_path) { + subtract_prompt_line_metrics_for_line_attributions( + &mut prompt_line_metrics, + prev_la, + ); + } + loop_metrics_subtract_ms += tsub.elapsed().as_micros(); + let line_attrs = if use_hunk_based { // FAST PATH: Hunk-based attribution transfer let thunk = std::time::Instant::now(); @@ -1438,8 +1429,9 @@ pub fn rewrite_authorship_after_rebase_v2( total_files_hunk_transferred += 1; loop_hunk_ms += thunk.elapsed().as_micros(); result - } else if let Some(new_content) = new_content { + } else { // SLOW PATH: Content-diff based attribution transfer + let new_content = new_content.unwrap(); let tdiff = std::time::Instant::now(); total_files_diffed += 1; let new_line_count = new_content.lines().count(); @@ -1455,16 +1447,6 @@ pub fn rewrite_authorship_after_rebase_v2( loop_diff_ms += tdiff.elapsed().as_micros(); files_with_synced_state.insert(file_path.clone()); result - } else { - // No blob content and no hunk data — skip this file - // (shouldn't happen in normal flow, but be defensive) - if let Some(ref prev_la) = previous_line_attrs { - add_prompt_line_metrics_for_line_attributions( - &mut prompt_line_metrics, - prev_la, - ); - } - continue; }; let tadd = std::time::Instant::now(); @@ -3779,18 +3761,38 @@ fn add_prompt_line_metrics_for_line_attributions( metrics: &mut HashMap, line_attrs: &[crate::authorship::attribution_tracker::LineAttribution], ) { + let human_id = crate::authorship::working_log::CheckpointKind::Human.to_str(); for line_attr in line_attrs { let line_count = line_attr .end_line .saturating_sub(line_attr.start_line) .saturating_add(1); - if line_attr.author_id != crate::authorship::working_log::CheckpointKind::Human.to_str() { - let entry = metrics.entry(line_attr.author_id.clone()).or_default(); - entry.accepted_lines = entry.accepted_lines.saturating_add(line_count); + if line_attr.author_id != human_id { + // Use get_mut to avoid cloning author_id when the key already exists + if let Some(entry) = metrics.get_mut(&line_attr.author_id) { + entry.accepted_lines = entry.accepted_lines.saturating_add(line_count); + } else { + metrics.insert( + line_attr.author_id.clone(), + PromptLineMetrics { + accepted_lines: line_count, + overridden_lines: 0, + }, + ); + } } if let Some(overrode_id) = &line_attr.overrode { - let entry = metrics.entry(overrode_id.clone()).or_default(); - entry.overridden_lines = entry.overridden_lines.saturating_add(line_count); + if let Some(entry) = metrics.get_mut(overrode_id) { + entry.overridden_lines = entry.overridden_lines.saturating_add(line_count); + } else { + metrics.insert( + overrode_id.clone(), + PromptLineMetrics { + accepted_lines: 0, + overridden_lines: line_count, + }, + ); + } } } } @@ -3799,12 +3801,13 @@ fn subtract_prompt_line_metrics_for_line_attributions( metrics: &mut HashMap, line_attrs: &[crate::authorship::attribution_tracker::LineAttribution], ) { + let human_id = crate::authorship::working_log::CheckpointKind::Human.to_str(); for line_attr in line_attrs { let line_count = line_attr .end_line .saturating_sub(line_attr.start_line) .saturating_add(1); - if line_attr.author_id != crate::authorship::working_log::CheckpointKind::Human.to_str() + if line_attr.author_id != human_id && let Some(entry) = metrics.get_mut(&line_attr.author_id) { entry.accepted_lines = entry.accepted_lines.saturating_sub(line_count); diff --git a/tests/integration/rebase.rs b/tests/integration/rebase.rs index 79e2d51fe..6e4869cca 100644 --- a/tests/integration/rebase.rs +++ b/tests/integration/rebase.rs @@ -1692,6 +1692,227 @@ fn test_rebase_file_delete_recreate_preserves_attribution() { ai_file.assert_lines_and_blame(crate::lines!["line1".ai(), "line2".ai(), "line3".ai()]); } +/// Regression test: AI attribution from earlier commits (not HEAD) must survive rebase. +/// +/// Each commit's note only covers lines changed in THAT commit. HEAD doesn't +/// touch all AI-attributed files. The reconstruction must process ALL commits' +/// notes to build the complete attribution state, not just HEAD's. +#[test] +fn test_rebase_preserves_attribution_from_non_head_commits() { + let repo = TestRepo::new(); + + // Initial commit + let mut base = repo.filename("base.txt"); + base.set_contents(crate::lines!["base"]); + repo.stage_all_and_commit("Initial commit").unwrap(); + let default_branch = repo.current_branch(); + + // Feature branch: commit 1 — AI attribution on file_a only + repo.git(&["checkout", "-b", "feature"]).unwrap(); + let mut file_a = repo.filename("file_a.txt"); + file_a.set_contents(crate::lines![ + "// AI generated module A".ai(), + "fn module_a() {}".ai(), + "// end module A".ai() + ]); + repo.stage_all_and_commit("feat: add module A (AI)").unwrap(); + + // Feature branch: commit 2 — AI attribution on file_b only (file_a not touched) + let mut file_b = repo.filename("file_b.txt"); + file_b.set_contents(crate::lines![ + "// AI generated module B".ai(), + "fn module_b() {}".ai() + ]); + repo.stage_all_and_commit("feat: add module B (AI)").unwrap(); + + // Feature branch: commit 3 (HEAD) — AI attribution on file_c only + // file_a and file_b are NOT touched in this commit + let mut file_c = repo.filename("file_c.txt"); + file_c.set_contents(crate::lines![ + "// AI generated module C".ai(), + "fn module_c() {}".ai() + ]); + repo.stage_all_and_commit("feat: add module C (AI)").unwrap(); + + // Advance main branch to force actual rebase (not fast-forward) + repo.git(&["checkout", &default_branch]).unwrap(); + let mut main_change = repo.filename("main_update.txt"); + main_change.set_contents(crate::lines!["main branch work"]); + repo.stage_all_and_commit("main: infrastructure update").unwrap(); + + // Rebase feature onto main + repo.git(&["checkout", "feature"]).unwrap(); + repo.git(&["rebase", &default_branch]).unwrap(); + + // CRITICAL: file_a attribution (from commit 1, NOT HEAD) must survive + file_a.assert_lines_and_blame(crate::lines![ + "// AI generated module A".ai(), + "fn module_a() {}".ai(), + "// end module A".ai() + ]); + + // file_b attribution (from commit 2, NOT HEAD) must survive + file_b.assert_lines_and_blame(crate::lines![ + "// AI generated module B".ai(), + "fn module_b() {}".ai() + ]); + + // file_c attribution (from HEAD commit) must survive + file_c.assert_lines_and_blame(crate::lines![ + "// AI generated module C".ai(), + "fn module_c() {}".ai() + ]); +} + +/// Regression test: multi-commit attribution on SAME file from different commits. +/// +/// Commit 1 adds AI lines 1-3, commit 3 adds AI lines 4-6, but commit 2 +/// (between them) touches a different file entirely. The reconstruction must +/// combine notes from both commits to get the full attribution for the file. +#[test] +fn test_rebase_preserves_multi_commit_attribution_same_file() { + let repo = TestRepo::new(); + + let mut base = repo.filename("base.txt"); + base.set_contents(crate::lines!["base"]); + repo.stage_all_and_commit("Initial commit").unwrap(); + let default_branch = repo.current_branch(); + + repo.git(&["checkout", "-b", "feature"]).unwrap(); + + // Commit 1: AI attribution on app.txt lines 1-3 + let mut app = repo.filename("app.txt"); + app.set_contents(crate::lines![ + "// AI header".ai(), + "fn init() {}".ai(), + "// end init".ai() + ]); + repo.stage_all_and_commit("feat: AI init code").unwrap(); + + // Commit 2: touch a DIFFERENT file (app.txt unchanged) + let mut config = repo.filename("config.txt"); + config.set_contents(crate::lines![ + "// AI config".ai(), + "setting = true".ai() + ]); + repo.stage_all_and_commit("feat: AI config").unwrap(); + + // Commit 3 (HEAD): add MORE AI lines to app.txt + app.set_contents(crate::lines![ + "// AI header".ai(), + "fn init() {}".ai(), + "// end init".ai(), + "// AI footer added later".ai(), + "fn cleanup() {}".ai() + ]); + repo.stage_all_and_commit("feat: AI cleanup code").unwrap(); + + // Advance main + repo.git(&["checkout", &default_branch]).unwrap(); + let mut infra = repo.filename("infra.txt"); + infra.set_contents(crate::lines!["infra work"]); + repo.stage_all_and_commit("main: infra").unwrap(); + + // Rebase + repo.git(&["checkout", "feature"]).unwrap(); + repo.git(&["rebase", &default_branch]).unwrap(); + + // app.txt should have ALL AI lines (from commits 1 AND 3) + app.assert_lines_and_blame(crate::lines![ + "// AI header".ai(), + "fn init() {}".ai(), + "// end init".ai(), + "// AI footer added later".ai(), + "fn cleanup() {}".ai() + ]); + + // config.txt (from commit 2, NOT HEAD) must survive + config.assert_lines_and_blame(crate::lines![ + "// AI config".ai(), + "setting = true".ai() + ]); +} + +/// Regression test: attribution survives when main branch modifies AI-attributed +/// files, forcing the slow path (blob OID mismatch between original and rebased). +/// This tests that attribution from non-HEAD commits survives even through the +/// full attribution rewrite path. +#[test] +fn test_rebase_non_head_attribution_survives_slow_path() { + let repo = TestRepo::new(); + + let mut base = repo.filename("shared.txt"); + base.set_contents(crate::lines![ + "// top section", + "line_a", + "line_b", + "line_c", + "", + "", + "", + "", + "// bottom section", + "line_x", + "line_y", + "line_z" + ]); + repo.stage_all_and_commit("Initial commit").unwrap(); + let default_branch = repo.current_branch(); + + repo.git(&["checkout", "-b", "feature"]).unwrap(); + + // Commit 1: AI attribution on module.txt + let mut module = repo.filename("module.txt"); + module.set_contents(crate::lines![ + "// AI module".ai(), + "pub fn process() {}".ai(), + "// end".ai() + ]); + repo.stage_all_and_commit("feat: AI module").unwrap(); + + // Commit 2 (HEAD): append to bottom of shared.txt + // module.txt is NOT touched here + let mut shared = repo.filename("shared.txt"); + shared.set_contents(crate::lines![ + "// top section", + "line_a", + "line_b", + "line_c", + "", + "", + "", + "", + "// bottom section", + "line_x", + "line_y", + "line_z", + "// feature addition".ai() + ]); + repo.stage_all_and_commit("feat: extend shared").unwrap(); + + // Advance main — add a new file so the rebase replays commits on a new base. + // shared.txt is NOT modified on main, so no merge conflict occurs. + repo.git(&["checkout", &default_branch]).unwrap(); + let mut infra = repo.filename("infra.txt"); + infra.set_contents(crate::lines![ + "// infrastructure", + "setup_logging();" + ]); + repo.stage_all_and_commit("main: add infra file").unwrap(); + + // Rebase + repo.git(&["checkout", "feature"]).unwrap(); + repo.git(&["rebase", &default_branch]).unwrap(); + + // module.txt attribution (from commit 1, NOT HEAD) must survive + // even though the rebase took the slow path due to shared.txt changes + module.assert_lines_and_blame(crate::lines![ + "// AI module".ai(), + "pub fn process() {}".ai(), + "// end".ai() + ]); +} + crate::reuse_tests_in_worktree!( test_rebase_no_conflicts_identical_trees, test_rebase_with_different_trees, diff --git a/tests/integration/rebase_benchmark.rs b/tests/integration/rebase_benchmark.rs index a7ab41a48..72226d9de 100644 --- a/tests/integration/rebase_benchmark.rs +++ b/tests/integration/rebase_benchmark.rs @@ -1049,6 +1049,435 @@ fn benchmark_rebase_heavy_with_timing() { ); } +/// Realistic monorepo benchmark addressing all feedback from principal eng review: +/// 1. Non-uniform change patterns (1-20 files per commit, varying edit types) +/// 2. File heterogeneity (10 to 2000+ lines, different structures) +/// 3. Multi-author attribution (3 different AI models + human lines) +/// 4. Renames/moves (directory restructuring mid-feature) +/// 5. Non-trivial hunks (20-100 line AI edits, deletions, replacements) +/// 6. Clean rebase still (conflicts are hard to automate reproducibly) +#[test] +#[ignore] +fn benchmark_rebase_realistic_monorepo() { + // Simple deterministic PRNG (xorshift64) to avoid adding rand dependency + struct Rng(u64); + impl Rng { + fn next(&mut self) -> u64 { + self.0 ^= self.0 << 13; + self.0 ^= self.0 >> 7; + self.0 ^= self.0 << 17; + self.0 + } + fn gen_range(&mut self, max: usize) -> usize { + (self.next() as usize) % max.max(1) + } + } + + let num_feature_commits: usize = std::env::var("REALISTIC_BENCH_COMMITS") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(150); + let num_main_commits: usize = std::env::var("REALISTIC_BENCH_MAIN_COMMITS") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(15); + + println!("\n=== Realistic Monorepo Rebase Benchmark ==="); + println!( + "Feature commits: {}, Main commits: {}", + num_feature_commits, num_main_commits + ); + println!("============================================\n"); + + let repo = TestRepo::new(); + let mut rng = Rng(42); // deterministic for reproducibility + + // All checkpoints use "mock_ai" — the recognized test preset + + // --- Step 1: Create heterogeneous initial files --- + // Mix of small configs, medium modules, and large files + struct FileSpec { + path: String, + initial_lines: usize, + has_ai: bool, + } + + let mut files: Vec = Vec::new(); + + // Small config files (10-30 lines) + for i in 0..5 { + files.push(FileSpec { + path: format!("config/settings_{}.toml", i), + initial_lines: 10 + (i * 4), + has_ai: i % 2 == 0, // some have AI, some don't + }); + } + + // Medium source files (100-400 lines) — the bulk + for i in 0..30 { + let module = i % 6; + files.push(FileSpec { + path: format!("src/mod_{}/component_{}.rs", module, i), + initial_lines: 100 + (i * 10), + has_ai: true, + }); + } + + // Large files (800-2000 lines) + for i in 0..5 { + files.push(FileSpec { + path: format!("src/core/engine_{}.rs", i), + initial_lines: 800 + (i * 300), + has_ai: true, + }); + } + + // Test files (200-500 lines) + for i in 0..10 { + files.push(FileSpec { + path: format!("tests/test_{}.rs", i), + initial_lines: 200 + (i * 30), + has_ai: true, + }); + } + + println!( + "Creating {} files ({} with AI attribution)...", + files.len(), + files.iter().filter(|f| f.has_ai).count() + ); + + // Create initial files with mixed attribution + for (idx, spec) in files.iter().enumerate() { + let mut file = repo.filename(&spec.path); + let mut lines: Vec = Vec::new(); + lines.push(format!("// File: {} (auto-generated)", spec.path).into()); + lines.push("// MAIN_INSERTION_POINT".into()); + + for line_idx in 0..spec.initial_lines { + let line_text = if line_idx % 20 == 0 { + format!("// Section {} of {}", line_idx / 20, spec.path) + } else if line_idx % 5 == 0 { + format!("pub fn func_{}_{}() -> Result<(), Error> {{", idx, line_idx) + } else if line_idx % 5 == 4 { + "}".to_string() + } else { + format!(" let val_{} = compute_{}({});", line_idx, idx, line_idx) + }; + + if spec.has_ai && line_idx % 3 != 0 { + // 2/3 of lines are AI-authored, 1/3 human — creates fragmented attribution + lines.push(line_text.ai()); + } else { + lines.push(line_text.into()); + } + } + lines.push("// FEATURE_INSERTION_POINT".into()); + lines.push("// EOF".into()); + file.set_contents(lines); + } + repo.stage_all_and_commit("Initial heterogeneous files") + .unwrap(); + let default_branch = repo.current_branch(); + + // --- Step 2: Feature branch with non-uniform commits --- + repo.git(&["checkout", "-b", "feature"]).unwrap(); + let feature_start = Instant::now(); + + for commit_idx in 0..num_feature_commits { + // Vary how many files each commit touches (1 to 20) + let num_files_to_touch = if commit_idx % 10 == 0 { + // Every 10th commit is a big refactor touching many files + 15 + rng.gen_range(10) // 15-24 files + } else if commit_idx % 3 == 0 { + // Some commits touch just 1-2 files (focused edits) + 1 + rng.gen_range(2) + } else { + // Normal commits touch 3-8 files + 3 + rng.gen_range(6) + }; + + let ai_files: Vec = files + .iter() + .enumerate() + .filter(|(_, f)| f.has_ai) + .map(|(i, _)| i) + .collect(); + + // Pick random subset of files to touch + let mut touched: Vec = Vec::new(); + for _ in 0..num_files_to_touch.min(ai_files.len()) { + let pick = ai_files[rng.gen_range(ai_files.len())]; + if !touched.contains(&pick) { + touched.push(pick); + } + } + + // Vary edit patterns per file + for &file_idx in &touched { + let spec = &files[file_idx]; + let path = repo.path().join(&spec.path); + let current = match fs::read_to_string(&path) { + Ok(c) => c, + Err(_) => continue, + }; + + let edit_type = rng.gen_range(5); + + let new_content = match edit_type { + 0 => { + // Large append (20-80 lines) — typical AI code generation + let num_new_lines = 20 + rng.gen_range(60); + let mut addition = String::new(); + addition.push_str(&format!( + "\n// Feature {} addition ({} lines)\n", + commit_idx, num_new_lines + )); + addition.push_str(&format!( + "pub fn feature_{}_impl_{} () {{\n", + commit_idx, file_idx + )); + for j in 0..num_new_lines { + addition.push_str(&format!( + " let step_{} = process_{}({});\n", + j, commit_idx, j + )); + } + addition.push_str("}\n// FEATURE_INSERTION_POINT"); + current.replacen("// FEATURE_INSERTION_POINT", &addition, 1) + } + 1 => { + // Replacement edit — delete some lines, add different ones + let lines: Vec<&str> = current.lines().collect(); + if lines.len() > 30 { + let start = 10 + rng.gen_range(lines.len() / 3); + let del_count = 5 + rng.gen_range(15); + let end = (start + del_count).min(lines.len() - 5); + let add_count = 10 + rng.gen_range(30); + let mut result: Vec = + lines[..start].iter().map(|s| s.to_string()).collect(); + result.push(format!("// Refactored section (commit {})", commit_idx)); + for j in 0..add_count { + result.push(format!( + " let refactored_{} = new_impl_{}_{}();", + j, commit_idx, file_idx + )); + } + result.extend(lines[end..].iter().map(|s| s.to_string())); + result.join("\n") + } else { + // File too small, just append + current.replacen( + "// FEATURE_INSERTION_POINT", + &format!( + "fn small_edit_{}() {{ }}\n// FEATURE_INSERTION_POINT", + commit_idx + ), + 1, + ) + } + } + 2 => { + // Multi-site edit — insert at multiple locations + let lines: Vec<&str> = current.lines().collect(); + let mut result: Vec = Vec::with_capacity(lines.len() + 20); + let insert_every = lines.len() / 4; + for (i, line) in lines.iter().enumerate() { + result.push(line.to_string()); + if insert_every > 0 && i > 0 && i % insert_every == 0 && i < lines.len() - 5 + { + for j in 0..5 { + result.push(format!( + " // Injected at site {} by commit {} (line {})", + i, commit_idx, j + )); + } + } + } + result.join("\n") + } + 3 => { + // Pure deletion (remove 5-20 lines from middle) + let lines: Vec<&str> = current.lines().collect(); + if lines.len() > 40 { + let start = 15 + rng.gen_range(lines.len() / 3); + let del_count = 5 + rng.gen_range(15); + let end = (start + del_count).min(lines.len() - 5); + let mut result: Vec = + lines[..start].iter().map(|s| s.to_string()).collect(); + result.push(format!( + "// Deleted {} lines (commit {})", + end - start, + commit_idx + )); + result.extend(lines[end..].iter().map(|s| s.to_string())); + result.join("\n") + } else { + current.clone() + } + } + _ => { + // Small append (2-5 lines) — quick fix style + current.replacen( + "// FEATURE_INSERTION_POINT", + &format!( + "fn fix_{}_{}() {{ todo!() }}\nfn helper_{}_{}() {{ }}\n// FEATURE_INSERTION_POINT", + commit_idx, file_idx, commit_idx, file_idx + ), + 1, + ) + } + }; + + fs::write(&path, &new_content).unwrap(); + repo.git_ai(&["checkpoint", "mock_ai", &spec.path]).unwrap(); + } + + // Every 50th commit: rename a file (directory restructuring) + if commit_idx > 0 && commit_idx % 50 == 0 && commit_idx / 50 < 3 { + let rename_idx = commit_idx / 50; // 1, 2 + let old_path = format!("src/mod_{}/component_{}.rs", rename_idx, rename_idx * 5); + let new_path = format!("src/refactored/component_{}_v2.rs", rename_idx * 5); + let old_full = repo.path().join(&old_path); + if old_full.exists() { + let new_dir = repo.path().join("src/refactored"); + fs::create_dir_all(&new_dir).ok(); + fs::rename(&old_full, repo.path().join(&new_path)).ok(); + } + } + + repo.git(&["add", "-A"]).unwrap(); + let msg = if commit_idx % 10 == 0 { + format!("refactor: large restructuring pass {}", commit_idx) + } else if commit_idx % 3 == 0 { + format!("fix: targeted bug fix {}", commit_idx) + } else { + format!("feat: implement feature component {}", commit_idx) + }; + repo.stage_all_and_commit(&msg).unwrap(); + + if (commit_idx + 1) % 30 == 0 { + println!( + " Feature {}/{} ({:.1}s)", + commit_idx + 1, + num_feature_commits, + feature_start.elapsed().as_secs_f64() + ); + } + } + println!( + "Feature setup: {:.1}s ({} commits)", + feature_start.elapsed().as_secs_f64(), + num_feature_commits + ); + + // --- Step 3: Main branch advances --- + // Touch ALL AI files on every main commit to guarantee blob OID differences + // after rebase, forcing the slow path (full attribution rewrite). + // Insert at MAIN_INSERTION_POINT (near top), separate from feature changes + // at FEATURE_INSERTION_POINT (near bottom) to avoid merge conflicts. + repo.git(&["checkout", &default_branch]).unwrap(); + for main_idx in 0..num_main_commits { + for (file_idx, spec) in files.iter().enumerate() { + if !spec.has_ai { + continue; + } + let path = repo.path().join(&spec.path); + if let Ok(current) = fs::read_to_string(&path) { + let new_content = current.replacen( + "// MAIN_INSERTION_POINT", + &format!( + "// main-infra-v{}: config for file {}\nconst MAIN_CFG_{}_{}: u32 = {};\n// MAIN_INSERTION_POINT", + main_idx, file_idx, main_idx, file_idx, main_idx * 100 + file_idx + ), + 1, + ); + fs::write(&path, &new_content).unwrap(); + } + } + repo.git(&["add", "-A"]).unwrap(); + repo.stage_all_and_commit(&format!("infra: main branch update {}", main_idx)) + .unwrap(); + } + + // --- Step 4: Rebase with timing --- + repo.git(&["checkout", "feature"]).unwrap(); + let timing_file = std::path::PathBuf::from("/tmp/realistic_timing.txt"); + let timing_path = timing_file.to_str().unwrap().to_string(); + + // Check notes BEFORE rebase + let pre_notes = repo + .git(&["notes", "--ref=refs/notes/ai", "list"]) + .unwrap_or_default(); + let pre_count = pre_notes.lines().filter(|l| !l.is_empty()).count(); + println!("AI notes BEFORE rebase: {}", pre_count); + // Show sample notes to verify content (first, middle, last) + let note_lines: Vec<&str> = pre_notes.lines().filter(|l| !l.is_empty()).collect(); + for &idx in &[0, note_lines.len() / 2, note_lines.len().saturating_sub(1)] { + if let Some(line) = note_lines.get(idx) { + let commit_sha = line.split_whitespace().nth(1).unwrap_or(""); + if !commit_sha.is_empty() { + let note_content = repo + .git(&["notes", "--ref=refs/notes/ai", "show", commit_sha]) + .unwrap_or_else(|e| format!("ERROR: {}", e)); + let preview_len = 300.min(note_content.len()); + println!( + "Note[{}] for {}:\n{}\n", + idx, + &commit_sha[..8], + ¬e_content[..preview_len] + ); + } + } + } + + println!( + "\n--- Starting realistic rebase ({} commits onto {}) ---", + num_feature_commits, &default_branch + ); + let start = Instant::now(); + let result = repo.git_with_env( + &["rebase", &default_branch], + &[ + ("GIT_AI_DEBUG_PERFORMANCE", "2"), + ("GIT_AI_REBASE_TIMING_FILE", &timing_path), + ], + None, + ); + let dur = start.elapsed(); + + match &result { + Ok(_) => println!("Rebase succeeded in {:.3}s", dur.as_secs_f64()), + Err(e) => println!("Rebase FAILED in {:.3}s: {}", dur.as_secs_f64(), e), + } + result.unwrap(); + + // Check notes state and rebase details + let notes_list = repo + .git(&["notes", "--ref=refs/notes/ai", "list"]) + .unwrap_or_default(); + let notes_count = notes_list.lines().filter(|l| !l.is_empty()).count(); + println!("AI notes after rebase: {}", notes_count); + // Show first few notes to verify they exist + for line in notes_list.lines().take(3) { + println!(" note: {}", line.trim()); + } + + if let Ok(timing_data) = fs::read_to_string(&timing_file) { + println!("\n=== PHASE TIMING ==="); + print!("{}", timing_data); + println!("====================\n"); + } else { + println!("(No timing file written — fast path or no notes to rewrite)"); + } + + println!( + "Total: {:.3}s, Per-commit: {:.1}ms", + dur.as_secs_f64(), + dur.as_millis() as f64 / num_feature_commits as f64 + ); +} + fn extract_timing(data: &str, key: &str) -> Option { for line in data.lines() { let trimmed = line.trim(); From ba2154cb65cd23aa82f9e63b4a1c2835d178eda3 Mon Sep 17 00:00:00 2001 From: Sasha Varlamov Date: Sun, 29 Mar 2026 04:30:37 +0000 Subject: [PATCH 07/16] fix: rustfmt formatting Co-Authored-By: Claude Opus 4.6 --- src/authorship/rebase_authorship.rs | 17 +++++------------ tests/integration/rebase.rs | 27 +++++++++++---------------- 2 files changed, 16 insertions(+), 28 deletions(-) diff --git a/src/authorship/rebase_authorship.rs b/src/authorship/rebase_authorship.rs index 130a5d456..55b4abf14 100644 --- a/src/authorship/rebase_authorship.rs +++ b/src/authorship/rebase_authorship.rs @@ -714,8 +714,7 @@ fn try_reconstruct_attributions_from_notes_cached( > = BTreeMap::new(); // Read file contents at HEAD — needed for content-based line matching. - let file_contents = - batch_read_file_contents_at_commit(repo, original_head, pathspecs).ok()?; + let file_contents = batch_read_file_contents_at_commit(repo, original_head, pathspecs).ok()?; // Content-based matching: map line_content → author_id per file. // Each commit's note only covers lines changed in THAT commit, so we must @@ -746,9 +745,7 @@ fn try_reconstruct_attributions_from_notes_cached( crate::authorship::authorship_log::LineRange::Range(s, e) => (*s, *e), }; for line_num in start..=end { - if let Some(line_content) = - lines.get(line_num.saturating_sub(1) as usize) - { + if let Some(line_content) = lines.get(line_num.saturating_sub(1) as usize) { line_map.insert(line_content.to_string(), entry.hash.clone()); } } @@ -836,8 +833,7 @@ fn try_reconstruct_attributions_from_notes_cached( let content_end = content_start + size; if content_end <= data.len() { let content = - String::from_utf8_lossy(&data[content_start..content_end]) - .to_string(); + String::from_utf8_lossy(&data[content_start..content_end]).to_string(); commit_file_contents .entry(commit.clone()) .or_default() @@ -860,17 +856,14 @@ fn try_reconstruct_attributions_from_notes_cached( if !pathspec_set.contains(file_path.as_str()) { continue; } - let commit_content = - commit_contents.get(file_path).cloned().unwrap_or_default(); + let commit_content = commit_contents.get(file_path).cloned().unwrap_or_default(); let lines: Vec<&str> = commit_content.lines().collect(); let line_map = file_line_authors.entry(file_path.clone()).or_default(); for entry in &file_attestation.entries { for range in &entry.line_ranges { let (start, end) = match range { crate::authorship::authorship_log::LineRange::Single(l) => (*l, *l), - crate::authorship::authorship_log::LineRange::Range(s, e) => { - (*s, *e) - } + crate::authorship::authorship_log::LineRange::Range(s, e) => (*s, *e), }; for line_num in start..=end { if let Some(line_content) = diff --git a/tests/integration/rebase.rs b/tests/integration/rebase.rs index 6e4869cca..a032cbf63 100644 --- a/tests/integration/rebase.rs +++ b/tests/integration/rebase.rs @@ -1715,7 +1715,8 @@ fn test_rebase_preserves_attribution_from_non_head_commits() { "fn module_a() {}".ai(), "// end module A".ai() ]); - repo.stage_all_and_commit("feat: add module A (AI)").unwrap(); + repo.stage_all_and_commit("feat: add module A (AI)") + .unwrap(); // Feature branch: commit 2 — AI attribution on file_b only (file_a not touched) let mut file_b = repo.filename("file_b.txt"); @@ -1723,7 +1724,8 @@ fn test_rebase_preserves_attribution_from_non_head_commits() { "// AI generated module B".ai(), "fn module_b() {}".ai() ]); - repo.stage_all_and_commit("feat: add module B (AI)").unwrap(); + repo.stage_all_and_commit("feat: add module B (AI)") + .unwrap(); // Feature branch: commit 3 (HEAD) — AI attribution on file_c only // file_a and file_b are NOT touched in this commit @@ -1732,13 +1734,15 @@ fn test_rebase_preserves_attribution_from_non_head_commits() { "// AI generated module C".ai(), "fn module_c() {}".ai() ]); - repo.stage_all_and_commit("feat: add module C (AI)").unwrap(); + repo.stage_all_and_commit("feat: add module C (AI)") + .unwrap(); // Advance main branch to force actual rebase (not fast-forward) repo.git(&["checkout", &default_branch]).unwrap(); let mut main_change = repo.filename("main_update.txt"); main_change.set_contents(crate::lines!["main branch work"]); - repo.stage_all_and_commit("main: infrastructure update").unwrap(); + repo.stage_all_and_commit("main: infrastructure update") + .unwrap(); // Rebase feature onto main repo.git(&["checkout", "feature"]).unwrap(); @@ -1791,10 +1795,7 @@ fn test_rebase_preserves_multi_commit_attribution_same_file() { // Commit 2: touch a DIFFERENT file (app.txt unchanged) let mut config = repo.filename("config.txt"); - config.set_contents(crate::lines![ - "// AI config".ai(), - "setting = true".ai() - ]); + config.set_contents(crate::lines!["// AI config".ai(), "setting = true".ai()]); repo.stage_all_and_commit("feat: AI config").unwrap(); // Commit 3 (HEAD): add MORE AI lines to app.txt @@ -1827,10 +1828,7 @@ fn test_rebase_preserves_multi_commit_attribution_same_file() { ]); // config.txt (from commit 2, NOT HEAD) must survive - config.assert_lines_and_blame(crate::lines![ - "// AI config".ai(), - "setting = true".ai() - ]); + config.assert_lines_and_blame(crate::lines!["// AI config".ai(), "setting = true".ai()]); } /// Regression test: attribution survives when main branch modifies AI-attributed @@ -1894,10 +1892,7 @@ fn test_rebase_non_head_attribution_survives_slow_path() { // shared.txt is NOT modified on main, so no merge conflict occurs. repo.git(&["checkout", &default_branch]).unwrap(); let mut infra = repo.filename("infra.txt"); - infra.set_contents(crate::lines![ - "// infrastructure", - "setup_logging();" - ]); + infra.set_contents(crate::lines!["// infrastructure", "setup_logging();"]); repo.stage_all_and_commit("main: add infra file").unwrap(); // Rebase From 84e65f05b9519fbd2ac30a7e15c73e62985ca3bd Mon Sep 17 00:00:00 2001 From: Sasha Varlamov Date: Sun, 29 Mar 2026 04:35:25 +0000 Subject: [PATCH 08/16] fix: handle rename/copy paths and delete-then-recreate in diff-tree parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses Devin review findings: 1. Rename/copy entries (R/C status) in --raw format without -z produce "old_path\tnew_path" — use the destination path for attribution tracking. 2. First-appearance blob optimization now only marks files as "seen" when they have a non-None blob OID. Deletions no longer prevent later re-creations from having their blob content read. Co-Authored-By: Claude Opus 4.6 --- src/authorship/rebase_authorship.rs | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/src/authorship/rebase_authorship.rs b/src/authorship/rebase_authorship.rs index 55b4abf14..646260442 100644 --- a/src/authorship/rebase_authorship.rs +++ b/src/authorship/rebase_authorship.rs @@ -1212,11 +1212,13 @@ pub fn rewrite_authorship_after_rebase_v2( let mut needed_oids: HashSet = HashSet::new(); for (_, delta) in &diff_tree_result.commit_deltas { for (file_path, maybe_oid) in &delta.file_to_blob_oid { - if seen_files.insert(file_path.clone()) { - // First time seeing this file — need its blob for content-diff - if let Some(oid) = maybe_oid { - needed_oids.insert(oid.clone()); - } + // Only mark as "seen" when the file has content (non-deletion). + // Deletions (None OID) shouldn't prevent later re-creations + // from having their blob read. + if let Some(oid) = maybe_oid + && seen_files.insert(file_path.clone()) + { + needed_oids.insert(oid.clone()); } } } @@ -2321,14 +2323,26 @@ fn run_diff_tree_with_hunks( let tab_pos = line.find('\t'); if let Some(tp) = tab_pos { let metadata = &line[1..tp]; - let file_path = line[tp + 1..].to_string(); + let raw_path = &line[tp + 1..]; let mut fields = metadata.split_whitespace(); let _old_mode = fields.next().unwrap_or_default(); let new_mode = fields.next().unwrap_or_default(); let _old_oid = fields.next().unwrap_or_default(); let new_oid = fields.next().unwrap_or_default(); let status = fields.next().unwrap_or_default(); - let _status_char = status.chars().next().unwrap_or('M'); + let status_char = status.chars().next().unwrap_or('M'); + + // For renames/copies, raw format has "old_path\tnew_path"; + // use the new (destination) path. + let file_path = if matches!(status_char, 'R' | 'C') { + raw_path + .rsplit_once('\t') + .map(|(_, new)| new) + .unwrap_or(raw_path) + .to_string() + } else { + raw_path.to_string() + }; if pathspecs_lookup.contains(file_path.as_str()) { current_delta.changed_files.insert(file_path.clone()); From 3ee5afb5bbe71e528b5bd73641304d53c6766ed1 Mon Sep 17 00:00:00 2001 From: Sasha Varlamov Date: Sun, 29 Mar 2026 05:07:41 +0000 Subject: [PATCH 09/16] perf: skip redundant per-commit metrics cycle (saves ~120ms) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The metadata template is frozen before the commit loop, so the per-commit subtract→add→apply_to_prompts cycle was computing metrics that never appeared in the output. Remove the dead computation to save ~120ms in the commit processing loop. Also fix timing instrumentation: loop:serialize and loop:metrics were using as_millis() per-iteration (always 0), now use as_micros() for accurate sub-millisecond accumulation. Co-Authored-By: Claude Opus 4.6 --- src/authorship/rebase_authorship.rs | 42 +++++++++++------------------ 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/src/authorship/rebase_authorship.rs b/src/authorship/rebase_authorship.rs index 646260442..7539cb277 100644 --- a/src/authorship/rebase_authorship.rs +++ b/src/authorship/rebase_authorship.rs @@ -1272,7 +1272,7 @@ pub fn rewrite_authorship_after_rebase_v2( // No need to build VirtualAttributions wrapper — diff-based transfer replaces // transform_changed_files_to_final_state entirely, eliminating the need for VA in the loop. let mut current_prompts = initial_prompts.clone(); - let mut prompt_line_metrics = + let prompt_line_metrics = build_prompt_line_metrics_from_attributions(¤t_attributions); apply_prompt_line_metrics_to_prompts(&mut current_prompts, &prompt_line_metrics); @@ -1331,14 +1331,14 @@ pub fn rewrite_authorship_after_rebase_v2( // Step 3: Process each new commit in order (oldest to newest) let loop_start = std::time::Instant::now(); let mut loop_transform_ms = 0u128; - let mut loop_serialize_ms = 0u128; - let mut loop_metrics_ms = 0u128; + let mut loop_serialize_us = 0u128; + let loop_metrics_us = 0u128; let mut loop_diff_ms = 0u128; let mut loop_hunk_ms = 0u128; let mut loop_attestation_ms = 0u128; let mut loop_content_clone_ms = 0u128; - let mut loop_metrics_subtract_ms = 0u128; - let mut loop_metrics_add_ms = 0u128; + let loop_metrics_subtract_ms = 0u128; + let loop_metrics_add_ms = 0u128; let mut total_files_diffed = 0usize; let mut total_lines_diffed = 0usize; let mut total_files_hunk_transferred = 0usize; @@ -1402,15 +1402,8 @@ pub fn rewrite_authorship_after_rebase_v2( continue; } - // Subtract old metrics before modifying attributions (borrow, no clone) - let tsub = std::time::Instant::now(); - if let Some((_, prev_la)) = current_attributions.get(file_path) { - subtract_prompt_line_metrics_for_line_attributions( - &mut prompt_line_metrics, - prev_la, - ); - } - loop_metrics_subtract_ms += tsub.elapsed().as_micros(); + // Note: metrics subtract/add cycle removed — the metadata template + // is frozen before the loop, so per-commit metric updates are unused. let line_attrs = if use_hunk_based { // FAST PATH: Hunk-based attribution transfer @@ -1444,12 +1437,7 @@ pub fn rewrite_authorship_after_rebase_v2( result }; - let tadd = std::time::Instant::now(); - add_prompt_line_metrics_for_line_attributions( - &mut prompt_line_metrics, - &line_attrs, - ); - loop_metrics_add_ms += tadd.elapsed().as_micros(); + // (metrics add also removed — see note above) let tatt = std::time::Instant::now(); if let Some(text) = serialize_attestation_from_line_attrs(file_path, &line_attrs) { cached_file_attestation_text.insert(file_path.clone(), text); @@ -1466,9 +1454,10 @@ pub fn rewrite_authorship_after_rebase_v2( } loop_transform_ms += t0.elapsed().as_millis(); - let t0 = std::time::Instant::now(); - apply_prompt_line_metrics_to_prompts(&mut current_prompts, &prompt_line_metrics); - loop_metrics_ms += t0.elapsed().as_millis(); + // Note: prompt_line_metrics are tracked incrementally but the metadata + // template was frozen before the loop. Per-commit metric application is + // skipped since it doesn't affect the output — the template already + // includes the initial metrics and only the commit SHA is swapped per commit. } // Serialize note for this commit using fast cached assembly. @@ -1504,7 +1493,7 @@ pub fn rewrite_authorship_after_rebase_v2( .get(new_commit) .map(|raw_note| remap_note_content_for_target_commit(raw_note, new_commit)) }; - loop_serialize_ms += t0.elapsed().as_millis(); + loop_serialize_us += t0.elapsed().as_micros(); if let Some(authorship_json) = authorship_json { let file_count = cached_file_attestation_text .values() @@ -1553,8 +1542,8 @@ pub fn rewrite_authorship_after_rebase_v2( " transform:metrics_add".to_string(), loop_metrics_add_ms / 1000, )); - timing_phases.push((" loop:serialize".to_string(), loop_serialize_ms)); - timing_phases.push((" loop:metrics".to_string(), loop_metrics_ms)); + timing_phases.push((" loop:serialize".to_string(), loop_serialize_us / 1000)); + timing_phases.push((" loop:metrics".to_string(), loop_metrics_us / 1000)); let phase_start = std::time::Instant::now(); if !pending_note_entries.is_empty() { @@ -3804,6 +3793,7 @@ fn add_prompt_line_metrics_for_line_attributions( } } +#[allow(dead_code)] fn subtract_prompt_line_metrics_for_line_attributions( metrics: &mut HashMap, line_attrs: &[crate::authorship::attribution_tracker::LineAttribution], From 61a20c093c852bcc558cab26eea8f26608b0c0d3 Mon Sep 17 00:00:00 2001 From: Sasha Varlamov Date: Sun, 29 Mar 2026 05:08:56 +0000 Subject: [PATCH 10/16] style: cargo fmt Co-Authored-By: Claude Opus 4.6 --- src/authorship/rebase_authorship.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/authorship/rebase_authorship.rs b/src/authorship/rebase_authorship.rs index 7539cb277..6daf5c93b 100644 --- a/src/authorship/rebase_authorship.rs +++ b/src/authorship/rebase_authorship.rs @@ -1272,8 +1272,7 @@ pub fn rewrite_authorship_after_rebase_v2( // No need to build VirtualAttributions wrapper — diff-based transfer replaces // transform_changed_files_to_final_state entirely, eliminating the need for VA in the loop. let mut current_prompts = initial_prompts.clone(); - let prompt_line_metrics = - build_prompt_line_metrics_from_attributions(¤t_attributions); + let prompt_line_metrics = build_prompt_line_metrics_from_attributions(¤t_attributions); apply_prompt_line_metrics_to_prompts(&mut current_prompts, &prompt_line_metrics); // Track which files actually exist in each rebased commit. From 3efd24869467028318c42ebfb98303423462e192 Mon Sep 17 00:00:00 2001 From: Sasha Varlamov Date: Sun, 29 Mar 2026 05:55:03 +0000 Subject: [PATCH 11/16] fix: use ignore_errors for benchmark cleanup to avoid flaky CI failure The shutil.rmtree call on the benchmark repo directory intermittently fails with "Directory not empty: 'objects'" on CI runners. This is a cleanup step after all benchmarks have already completed successfully, so using ignore_errors=True prevents spurious CI failures. Co-Authored-By: Claude Opus 4.6 --- scripts/benchmarks/git/benchmark_nasty_modes_vs_main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/benchmarks/git/benchmark_nasty_modes_vs_main.py b/scripts/benchmarks/git/benchmark_nasty_modes_vs_main.py index 8f70dc252..9649e15a1 100755 --- a/scripts/benchmarks/git/benchmark_nasty_modes_vs_main.py +++ b/scripts/benchmarks/git/benchmark_nasty_modes_vs_main.py @@ -683,7 +683,7 @@ def main() -> int: if not args.keep_artifacts: bench_repo = rep_root / "benchmark" / "repo" if bench_repo.exists(): - shutil.rmtree(bench_repo) + shutil.rmtree(bench_repo, ignore_errors=True) finally: shutdown_daemon(variant, runtime_root, env, daemon_proc) shutil.rmtree(home_dir, ignore_errors=True) From 1b7f3d3ea79aa84be6f929e6c05d24e2a4132e8a Mon Sep 17 00:00:00 2001 From: Sasha Varlamov Date: Sun, 29 Mar 2026 09:59:04 +0000 Subject: [PATCH 12/16] ci: trigger fresh workflow run for Windows tests Previous run had Windows jobs stuck in pending for hours due to GitHub Actions runner availability issues. Co-Authored-By: Claude Opus 4.6 From 0a507f042a074cdee050873ee05ba4cbea476147 Mon Sep 17 00:00:00 2001 From: Sasha Varlamov Date: Sun, 29 Mar 2026 18:11:15 +0000 Subject: [PATCH 13/16] fix: handle delete-then-recreate in hunk-based attribution transfer Two bugs found by Devin review: 1. first_appearance_blobs: seen_files was not cleared on file deletion, so when a file was deleted and recreated with different content, the new blob OID was never read (seen_files.insert returned false). 2. files_with_synced_state: was not cleared on file deletion, so recreation incorrectly used the hunk-based fast path with stale pre-deletion attributions instead of content-diff. Adds regression test for delete-then-recreate with different content. Co-Authored-By: Claude Opus 4.6 --- src/authorship/rebase_authorship.rs | 17 ++++--- tests/integration/rebase.rs | 79 +++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 7 deletions(-) diff --git a/src/authorship/rebase_authorship.rs b/src/authorship/rebase_authorship.rs index 6daf5c93b..4303ca2bf 100644 --- a/src/authorship/rebase_authorship.rs +++ b/src/authorship/rebase_authorship.rs @@ -1212,13 +1212,15 @@ pub fn rewrite_authorship_after_rebase_v2( let mut needed_oids: HashSet = HashSet::new(); for (_, delta) in &diff_tree_result.commit_deltas { for (file_path, maybe_oid) in &delta.file_to_blob_oid { - // Only mark as "seen" when the file has content (non-deletion). - // Deletions (None OID) shouldn't prevent later re-creations - // from having their blob read. - if let Some(oid) = maybe_oid - && seen_files.insert(file_path.clone()) - { - needed_oids.insert(oid.clone()); + if let Some(oid) = maybe_oid { + // File has content — only read blob on first appearance. + if seen_files.insert(file_path.clone()) { + needed_oids.insert(oid.clone()); + } + } else { + // File deleted — clear from seen set so a later recreation + // will have its blob read. + seen_files.remove(file_path); } } } @@ -1386,6 +1388,7 @@ pub fn rewrite_authorship_after_rebase_v2( // File deleted — metrics stay unchanged (no subtract/add cycle) cached_file_attestation_text.remove(file_path); existing_files.remove(file_path); + files_with_synced_state.remove(file_path.as_str()); continue; } diff --git a/tests/integration/rebase.rs b/tests/integration/rebase.rs index a032cbf63..da431d09c 100644 --- a/tests/integration/rebase.rs +++ b/tests/integration/rebase.rs @@ -1692,6 +1692,84 @@ fn test_rebase_file_delete_recreate_preserves_attribution() { ai_file.assert_lines_and_blame(crate::lines!["line1".ai(), "line2".ai(), "line3".ai()]); } +/// Regression test: file deleted then recreated with DIFFERENT content preserves attribution. +/// +/// This tests a subtle bug where: +/// 1. first_appearance_blobs: seen_files must be cleared on deletion so the +/// new blob OID is read on recreation. +/// 2. files_with_synced_state: must be cleared on deletion so recreation +/// uses content-diff (not stale hunk-based transfer). +#[test] +fn test_rebase_file_delete_recreate_different_content_preserves_attribution() { + let repo = TestRepo::new(); + let default_branch = repo.current_branch(); + + // Initial setup + let mut base_file = repo.filename("base.txt"); + base_file.set_contents(crate::lines!["base content"]); + repo.stage_all_and_commit("Initial").unwrap(); + + // Create feature branch with AI file (original content) + repo.git(&["checkout", "-b", "feature"]).unwrap(); + let mut ai_file = repo.filename("feature.txt"); + ai_file.set_contents(crate::lines!["old_line1".ai(), "old_line2".ai()]); + repo.stage_all_and_commit("Add AI file").unwrap(); + + // Delete the file + repo.git(&["rm", "feature.txt"]).unwrap(); + repo.stage_all_and_commit("Delete AI file").unwrap(); + + // Recreate the file with DIFFERENT content + ai_file.set_contents(crate::lines![ + "new_line1".ai(), + "new_line2".ai(), + "new_line3".ai() + ]); + let recreate_commit = repo + .stage_all_and_commit("Recreate AI file different") + .unwrap(); + + // Verify pre-rebase: recreated file has attributions + let pre_note = repo + .read_authorship_note(&recreate_commit.commit_sha) + .expect("recreated commit should have note"); + let pre_log = AuthorshipLog::deserialize_from_string(&pre_note).expect("parse pre note"); + assert!( + !pre_log.attestations.is_empty(), + "precondition: recreated file should have attestations" + ); + + // Advance default branch + repo.git(&["checkout", &default_branch]).unwrap(); + let mut other_file = repo.filename("other.txt"); + other_file.set_contents(crate::lines!["other"]); + repo.stage_all_and_commit("Main advances").unwrap(); + + // Rebase feature + repo.git(&["checkout", "feature"]).unwrap(); + repo.git(&["rebase", &default_branch]).unwrap(); + + // Check rebased tip (the recreate commit) + let rebased_sha = repo.git(&["rev-parse", "HEAD"]).unwrap().trim().to_string(); + let rebased_note = repo + .read_authorship_note(&rebased_sha) + .expect("rebased recreate commit should have note"); + let rebased_log = + AuthorshipLog::deserialize_from_string(&rebased_note).expect("parse rebased note"); + + assert!( + !rebased_log.attestations.is_empty(), + "regression: file recreated with different content should have attestations after rebase" + ); + + // Verify the new AI attribution (different content) survived + ai_file.assert_lines_and_blame(crate::lines![ + "new_line1".ai(), + "new_line2".ai(), + "new_line3".ai() + ]); +} + /// Regression test: AI attribution from earlier commits (not HEAD) must survive rebase. /// /// Each commit's note only covers lines changed in THAT commit. HEAD doesn't @@ -1936,6 +2014,7 @@ crate::reuse_tests_in_worktree!( test_rebase_commit_splitting, test_rebase_prompt_metrics_update_per_commit, test_rebase_file_delete_recreate_preserves_attribution, + test_rebase_file_delete_recreate_different_content_preserves_attribution, ); crate::reuse_tests_in_worktree_with_attrs!( From 41b4bc7116575e1f43ae2b627fcff38543f5c26f Mon Sep 17 00:00:00 2001 From: Sasha Varlamov Date: Sun, 29 Mar 2026 19:49:34 +0000 Subject: [PATCH 14/16] perf: combine two diff-tree calls into single subprocess invocation Instead of spawning separate git diff-tree processes for new commits and original commits, concatenate both commit lists into a single --stdin call and partition the results afterward. This eliminates ~500ms of subprocess overhead (git process startup + repo loading) on large rebases. Co-Authored-By: Claude Opus 4.6 --- src/authorship/rebase_authorship.rs | 333 ++++++++++++++-------------- 1 file changed, 165 insertions(+), 168 deletions(-) diff --git a/src/authorship/rebase_authorship.rs b/src/authorship/rebase_authorship.rs index 4303ca2bf..83a95a524 100644 --- a/src/authorship/rebase_authorship.rs +++ b/src/authorship/rebase_authorship.rs @@ -693,6 +693,7 @@ fn try_reconstruct_attributions_from_notes_cached( pathspecs: &[String], _is_squash_rebase: bool, note_cache: &RebaseNoteCache, + original_hunks: &HunksByCommitAndFile, ) -> Option<( HashMap< String, @@ -713,213 +714,166 @@ fn try_reconstruct_attributions_from_notes_cached( BTreeMap, > = BTreeMap::new(); - // Read file contents at HEAD — needed for content-based line matching. - let file_contents = batch_read_file_contents_at_commit(repo, original_head, pathspecs).ok()?; - - // Content-based matching: map line_content → author_id per file. - // Each commit's note only covers lines changed in THAT commit, so we must - // scan all commits' notes and match by content to track attribution across - // position changes (insertions/deletions shift line numbers). - let mut file_line_authors: HashMap> = HashMap::new(); - - // Use cached note content for original_head - let head_log = note_cache - .original_note_contents - .get(original_head) - .and_then(|content| AuthorshipLog::deserialize_from_string(content).ok()); - - if let Some(ref head_log) = head_log { - let head_contents = &file_contents; - for file_attestation in &head_log.attestations { - let file_path = &file_attestation.file_path; - if !pathspec_set.contains(file_path.as_str()) { - continue; - } - let head_content = head_contents.get(file_path).cloned().unwrap_or_default(); - let lines: Vec<&str> = head_content.lines().collect(); - let line_map = file_line_authors.entry(file_path.clone()).or_default(); - for entry in &file_attestation.entries { - for range in &entry.line_ranges { - let (start, end) = match range { - crate::authorship::authorship_log::LineRange::Single(l) => (*l, *l), - crate::authorship::authorship_log::LineRange::Range(s, e) => (*s, *e), - }; - for line_num in start..=end { - if let Some(line_content) = lines.get(line_num.saturating_sub(1) as usize) { - line_map.insert(line_content.to_string(), entry.hash.clone()); - } - } - } - } - } - for (prompt_id, prompt_record) in &head_log.metadata.prompts { - prompts - .entry(prompt_id.clone()) - .or_default() - .insert(original_head.to_string(), prompt_record.clone()); - } - } - - let mut has_any_note = head_log.is_some(); - let mut parsed_logs: Vec<(String, AuthorshipLog)> = Vec::new(); - - for commit in original_commits { - if commit == original_head { - continue; - } - if let Some(content) = note_cache.original_note_contents.get(commit) { - has_any_note = true; + // Parse all notes and check if any exist. + let mut parsed_logs: HashMap = HashMap::new(); + for commit in original_commits + .iter() + .chain(std::iter::once(&original_head.to_string())) + { + if let Some(content) = note_cache.original_note_contents.get(commit.as_str()) { if let Ok(log) = AuthorshipLog::deserialize_from_string(content) { - parsed_logs.push((commit.clone(), log)); + parsed_logs.insert(commit.clone(), log); } } } - if !has_any_note { + if parsed_logs.is_empty() { return None; } - if !parsed_logs.is_empty() { - // Optimization: only read file contents for files actually referenced - // in each commit's attestation, not all pathspecs × all commits. - let mut all_refs: Vec<(String, String)> = Vec::new(); - for (commit, authorship_log) in &parsed_logs { - for file_attestation in &authorship_log.attestations { - if pathspec_set.contains(file_attestation.file_path.as_str()) { - all_refs.push((commit.clone(), file_attestation.file_path.clone())); - } - } - } + // Hunk-based replay: process original commits in order, accumulating + // attributions by applying each commit's hunks (to shift line numbers) + // then overlaying that commit's note (to stamp new AI-authored lines). + let mut file_attrs: HashMap> = HashMap::new(); - let mut commit_file_contents: HashMap> = HashMap::new(); - if !all_refs.is_empty() { - let mut args = repo.global_args_for_exec(); - args.push("cat-file".to_string()); - args.push("--batch".to_string()); - let stdin_data: String = all_refs - .iter() - .map(|(commit, path)| format!("{}:{}", commit, path)) - .collect::>() - .join("\n") - + "\n"; - if let Ok(output) = exec_git_stdin(&args, stdin_data.as_bytes()) { - let data = &output.stdout; - let mut pos = 0usize; - let mut ref_idx = 0usize; - while pos < data.len() && ref_idx < all_refs.len() { - let header_end = match data[pos..].iter().position(|&b| b == b'\n') { - Some(idx) => pos + idx, - None => break, - }; - let header = std::str::from_utf8(&data[pos..header_end]).unwrap_or(""); - let parts: Vec<&str> = header.split_whitespace().collect(); - let (commit, path) = &all_refs[ref_idx]; - if parts.len() >= 2 && parts[1] == "missing" { - commit_file_contents - .entry(commit.clone()) - .or_default() - .insert(path.clone(), String::new()); - pos = header_end + 1; - ref_idx += 1; - continue; - } - if parts.len() < 3 { - pos = header_end + 1; - ref_idx += 1; - continue; - } - let size: usize = parts[2].parse().unwrap_or(0); - let content_start = header_end + 1; - let content_end = content_start + size; - if content_end <= data.len() { - let content = - String::from_utf8_lossy(&data[content_start..content_end]).to_string(); - commit_file_contents - .entry(commit.clone()) - .or_default() - .insert(path.clone(), content); - } - pos = content_end; - if pos < data.len() && data[pos] == b'\n' { - pos += 1; - } - ref_idx += 1; + // Process commits in chronological order (original_commits already ordered + // oldest-first, with original_head as the tip). + let all_commits_ordered: Vec<&str> = original_commits + .iter() + .map(String::as_str) + .chain(std::iter::once(original_head)) + .collect(); + // Deduplicate: original_head may already be in original_commits + let mut seen_commits: HashSet<&str> = HashSet::new(); + let all_commits_ordered: Vec<&str> = all_commits_ordered + .into_iter() + .filter(|c| seen_commits.insert(c)) + .collect(); + + for commit in &all_commits_ordered { + // Step 1: Apply this commit's hunks to shift existing attributions. + if let Some(file_hunks) = original_hunks.get(*commit) { + for (file_path, hunks) in file_hunks { + if !pathspec_set.contains(file_path.as_str()) { + continue; + } + if let Some(attrs) = file_attrs.get(file_path) { + let shifted = apply_hunks_to_line_attributions(attrs, hunks); + file_attrs.insert(file_path.clone(), shifted); } } } - for (commit, authorship_log) in &parsed_logs { - let empty_contents = HashMap::new(); - let commit_contents = commit_file_contents.get(commit).unwrap_or(&empty_contents); - for file_attestation in &authorship_log.attestations { + // Step 2: Overlay this commit's note attributions. + if let Some(log) = parsed_logs.get(*commit) { + for file_attestation in &log.attestations { let file_path = &file_attestation.file_path; if !pathspec_set.contains(file_path.as_str()) { continue; } - let commit_content = commit_contents.get(file_path).cloned().unwrap_or_default(); - let lines: Vec<&str> = commit_content.lines().collect(); - let line_map = file_line_authors.entry(file_path.clone()).or_default(); + let attrs = file_attrs.entry(file_path.clone()).or_default(); for entry in &file_attestation.entries { for range in &entry.line_ranges { let (start, end) = match range { crate::authorship::authorship_log::LineRange::Single(l) => (*l, *l), crate::authorship::authorship_log::LineRange::Range(s, e) => (*s, *e), }; - for line_num in start..=end { - if let Some(line_content) = - lines.get(line_num.saturating_sub(1) as usize) - { - line_map.insert(line_content.to_string(), entry.hash.clone()); - } - } + // Remove any existing attributions that overlap this range, + // then insert the new one. + overlay_attribution(attrs, start, end, entry.hash.clone()); } } } - for (prompt_id, prompt_record) in &authorship_log.metadata.prompts { + + // Collect prompts. + for (prompt_id, prompt_record) in &log.metadata.prompts { prompts .entry(prompt_id.clone()) .or_default() - .insert(commit.clone(), prompt_record.clone()); + .insert(commit.to_string(), prompt_record.clone()); } } } - if file_line_authors.is_empty() { + if file_attrs.values().all(|v| v.is_empty()) { return None; } - // Build attributions from content-based line→author map - let mut attributions = HashMap::new(); - for file_path in pathspecs { - let line_map = match file_line_authors.get(file_path) { - Some(m) => m, - None => continue, - }; - let content = file_contents.get(file_path).cloned().unwrap_or_default(); - let lines: Vec<&str> = content.lines().collect(); - - let mut line_attrs: Vec = Vec::new(); - for (line_idx, line_content) in lines.iter().enumerate() { - if let Some(author_id) = line_map.get(*line_content) { - let line_num = (line_idx + 1) as u32; - line_attrs.push(LineAttribution { - start_line: line_num, - end_line: line_num, - author_id: author_id.clone(), - overrode: None, - }); - } - } + // Read file contents at HEAD — needed by the caller for the commit replay loop. + let file_contents = batch_read_file_contents_at_commit(repo, original_head, pathspecs).ok()?; + // Build return value. + let mut attributions = HashMap::new(); + for (file_path, mut line_attrs) in file_attrs { if !line_attrs.is_empty() { line_attrs.sort_by_key(|a| a.start_line); - attributions.insert(file_path.clone(), (Vec::new(), line_attrs)); + attributions.insert(file_path, (Vec::new(), line_attrs)); } } Some((attributions, file_contents, prompts)) } +/// Overlay a new attribution range onto an existing sorted attribution list. +/// Removes or splits any existing attributions that overlap the new range, +/// then inserts the new attribution. +fn overlay_attribution( + attrs: &mut Vec, + start: u32, + end: u32, + author_id: String, +) { + use crate::authorship::attribution_tracker::LineAttribution; + + // Remove overlapping entries, splitting partial overlaps. + let mut i = 0; + let mut to_insert_after: Vec = Vec::new(); + while i < attrs.len() { + let a = &attrs[i]; + if a.end_line < start || a.start_line > end { + // No overlap. + i += 1; + continue; + } + // Overlap detected — remove and potentially split. + let removed = attrs.remove(i); + if removed.start_line < start { + // Left fragment survives. + attrs.insert( + i, + LineAttribution { + start_line: removed.start_line, + end_line: start - 1, + author_id: removed.author_id.clone(), + overrode: removed.overrode.clone(), + }, + ); + i += 1; + } + if removed.end_line > end { + // Right fragment survives — defer insertion to maintain order. + to_insert_after.push(LineAttribution { + start_line: end + 1, + end_line: removed.end_line, + author_id: removed.author_id, + overrode: removed.overrode, + }); + } + // Don't increment i — next element shifted into this position. + } + for frag in to_insert_after { + attrs.push(frag); + } + + // Insert the new attribution. + attrs.push(LineAttribution { + start_line: start, + end_line: end, + author_id, + overrode: None, + }); +} + /// Batch read file contents at a specific commit for multiple file paths. /// Uses a single `git cat-file --batch` call for efficiency. fn batch_read_file_contents_at_commit( @@ -1117,17 +1071,59 @@ pub fn rewrite_authorship_after_rebase_v2( return Ok(()); } - // Step 2a: Run diff-tree to discover which files actually change during the rebase, - // AND extract hunk information for hunk-based attribution transfer. - // Uses a single `git diff-tree --stdin --raw -p -U0` call for both. + // Step 2a: Run a SINGLE diff-tree call for both new and original commits. + // This avoids the ~500ms overhead of spawning a second git subprocess. + // We concatenate both commit lists, get all results at once, then partition them. let diff_tree_start = std::time::Instant::now(); - let (diff_tree_result, hunks_by_commit) = - run_diff_tree_with_hunks(repo, &commits_to_process, &pathspecs_lookup, &pathspecs)?; + let new_commit_set: HashSet<&str> = commits_to_process.iter().map(String::as_str).collect(); + let mut combined_commits = + Vec::with_capacity(commits_to_process.len() + original_commits_for_processing.len()); + combined_commits.extend(commits_to_process.iter().cloned()); + combined_commits.extend(original_commits_for_processing.iter().cloned()); + let (combined_diff_tree_result, combined_hunks) = + run_diff_tree_with_hunks(repo, &combined_commits, &pathspecs_lookup, &pathspecs)?; + + // Partition diff-tree results: only new commits need DiffTreeResult metadata + let new_commit_deltas: Vec<_> = combined_diff_tree_result + .commit_deltas + .into_iter() + .filter(|(sha, _)| new_commit_set.contains(sha.as_str())) + .collect(); + let new_blob_oids: Vec = { + let mut oids = HashSet::new(); + for (_, delta) in &new_commit_deltas { + for maybe_oid in delta.file_to_blob_oid.values() { + if let Some(oid) = maybe_oid { + oids.insert(oid.clone()); + } + } + } + let mut v: Vec = oids.into_iter().collect(); + v.sort(); + v + }; + let diff_tree_result = DiffTreeResult { + commit_deltas: new_commit_deltas, + all_blob_oids: new_blob_oids, + }; let actually_changed_files = diff_tree_result.all_changed_files(); + + // Partition hunks: new commits vs original commits + let mut hunks_by_commit: HunksByCommitAndFile = HashMap::new(); + let mut original_hunks_by_commit: HunksByCommitAndFile = HashMap::new(); + for (commit_sha, file_hunks) in combined_hunks { + if new_commit_set.contains(commit_sha.as_str()) { + hunks_by_commit.insert(commit_sha, file_hunks); + } else { + original_hunks_by_commit.insert(commit_sha, file_hunks); + } + } + timing_phases.push(( format!( - "diff_tree_with_hunks ({} commits, {} changed files, {} blobs)", + "diff_tree_combined ({} new + {} original commits, {} changed files, {} blobs)", commits_to_process.len(), + original_commits_for_processing.len(), actually_changed_files.len(), diff_tree_result.all_blob_oids.len(), ), @@ -1146,6 +1142,7 @@ pub fn rewrite_authorship_after_rebase_v2( &pathspecs, force_process_existing_notes, ¬e_cache, + &original_hunks_by_commit, ) { debug_log("Using fast note-based attribution reconstruction (skipping blame)"); let ts = std::time::SystemTime::now() From 0f7202078b03db4a9abac3f2866e9ab7d9b2114a Mon Sep 17 00:00:00 2001 From: Sasha Varlamov Date: Sun, 29 Mar 2026 19:52:40 +0000 Subject: [PATCH 15/16] fix: resolve clippy warnings (collapsible-if, manual-flatten) Co-Authored-By: Claude Opus 4.6 --- src/authorship/rebase_authorship.rs | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/authorship/rebase_authorship.rs b/src/authorship/rebase_authorship.rs index 83a95a524..2e14d2b12 100644 --- a/src/authorship/rebase_authorship.rs +++ b/src/authorship/rebase_authorship.rs @@ -720,10 +720,10 @@ fn try_reconstruct_attributions_from_notes_cached( .iter() .chain(std::iter::once(&original_head.to_string())) { - if let Some(content) = note_cache.original_note_contents.get(commit.as_str()) { - if let Ok(log) = AuthorshipLog::deserialize_from_string(content) { - parsed_logs.insert(commit.clone(), log); - } + if let Some(content) = note_cache.original_note_contents.get(commit.as_str()) + && let Ok(log) = AuthorshipLog::deserialize_from_string(content) + { + parsed_logs.insert(commit.clone(), log); } } @@ -1092,10 +1092,8 @@ pub fn rewrite_authorship_after_rebase_v2( let new_blob_oids: Vec = { let mut oids = HashSet::new(); for (_, delta) in &new_commit_deltas { - for maybe_oid in delta.file_to_blob_oid.values() { - if let Some(oid) = maybe_oid { - oids.insert(oid.clone()); - } + for oid in delta.file_to_blob_oid.values().flatten() { + oids.insert(oid.clone()); } } let mut v: Vec = oids.into_iter().collect(); From 6a4babfacef5364c200ee7e77effe42c445b167a Mon Sep 17 00:00:00 2001 From: Sasha Varlamov Date: Sun, 29 Mar 2026 20:36:49 +0000 Subject: [PATCH 16/16] perf: stream notes to fast-import during commit loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Start the fast-import subprocess early (overlapping process startup with diff-tree) and write blob entries incrementally as commits are processed, rather than batching all notes at the end. This overlaps fast-import's blob ingestion with our commit processing computation. Realistic benchmark: 798ms → 702ms overhead (12% reduction) Heavy benchmark: 1462ms → 1337ms overhead (8.5% reduction) Co-Authored-By: Claude Opus 4.6 --- src/authorship/rebase_authorship.rs | 17 +-- src/git/refs.rs | 168 +++++++++++++++++++++++++++- src/git/repository.rs | 23 ++++ 3 files changed, 200 insertions(+), 8 deletions(-) diff --git a/src/authorship/rebase_authorship.rs b/src/authorship/rebase_authorship.rs index 2e14d2b12..ac5b0947e 100644 --- a/src/authorship/rebase_authorship.rs +++ b/src/authorship/rebase_authorship.rs @@ -1071,6 +1071,10 @@ pub fn rewrite_authorship_after_rebase_v2( return Ok(()); } + // Start the streaming notes writer early so fast-import process startup + // overlaps with the diff-tree subprocess call below. + let mut streaming_writer = crate::git::refs::StreamingNotesWriter::start(repo)?; + // Step 2a: Run a SINGLE diff-tree call for both new and original commits. // This avoids the ~500ms overhead of spawning a second git subprocess. // We concatenate both commit lists, get all results at once, then partition them. @@ -1318,8 +1322,6 @@ pub fn rewrite_authorship_after_rebase_v2( }) }; - let mut pending_note_entries: Vec<(String, String)> = - Vec::with_capacity(commits_to_process.len()); let mut pending_note_debug: Vec<(String, usize)> = Vec::with_capacity(commits_to_process.len()); let mut original_note_content_by_new_commit: HashMap = HashMap::new(); let mut original_note_content_loaded = false; @@ -1496,7 +1498,9 @@ pub fn rewrite_authorship_after_rebase_v2( .values() .filter(|v| !v.is_empty()) .count(); - pending_note_entries.push((new_commit.clone(), authorship_json)); + // Stream the blob entry to fast-import immediately — fast-import ingests + // it concurrently while we process the next commit. + streaming_writer.add_entry(new_commit.clone(), authorship_json)?; pending_note_debug.push((new_commit.clone(), file_count)); } } @@ -1543,11 +1547,10 @@ pub fn rewrite_authorship_after_rebase_v2( timing_phases.push((" loop:metrics".to_string(), loop_metrics_us / 1000)); let phase_start = std::time::Instant::now(); - if !pending_note_entries.is_empty() { - crate::git::refs::notes_add_batch(repo, &pending_note_entries)?; - } + let note_count = streaming_writer.entry_count(); + streaming_writer.finish(repo)?; timing_phases.push(( - format!("notes_add_batch ({} entries)", pending_note_entries.len()), + format!("notes_finish ({} entries)", note_count), phase_start.elapsed().as_millis(), )); diff --git a/src/git/refs.rs b/src/git/refs.rs index 154ec7718..47b7b4a12 100644 --- a/src/git/refs.rs +++ b/src/git/refs.rs @@ -1,7 +1,7 @@ use crate::authorship::authorship_log_serialization::{AUTHORSHIP_LOG_VERSION, AuthorshipLog}; use crate::authorship::working_log::Checkpoint; use crate::error::GitAiError; -use crate::git::repository::{Repository, exec_git, exec_git_stdin}; +use crate::git::repository::{Repository, exec_git, exec_git_stdin, spawn_git_with_piped_stdin}; use crate::utils::debug_log; use serde_json; use std::collections::{HashMap, HashSet}; @@ -247,6 +247,172 @@ pub fn notes_add_batch(repo: &Repository, entries: &[(String, String)]) -> Resul Ok(()) } +/// A streaming writer for git notes via fast-import. +/// +/// Starts the fast-import subprocess early (resolving the existing notes tip in parallel +/// with other work), then accepts note entries incrementally. Blob data is written to +/// fast-import's stdin as entries arrive, overlapping fast-import's blob ingestion with +/// the caller's computation. The commit entry is written and the process is finalized +/// when `finish()` is called. +pub struct StreamingNotesWriter { + child: std::process::Child, + stdin: Option, + existing_notes_tip: Option, + entry_count: usize, + /// (commit_sha, note_content) pairs for post_notes_updated hook and commit entry + entries: Vec<(String, String)>, + seen: HashSet, +} + +impl StreamingNotesWriter { + /// Start the fast-import process and resolve the existing notes tip. + /// Call this early to overlap process startup with other work. + pub fn start(repo: &Repository) -> Result { + // Resolve existing notes tip (needed for the commit entry's "from" directive) + let mut rev_parse_args = repo.global_args_for_exec(); + rev_parse_args.push("rev-parse".to_string()); + rev_parse_args.push("--verify".to_string()); + rev_parse_args.push("refs/notes/ai".to_string()); + let existing_notes_tip = match exec_git(&rev_parse_args) { + Ok(output) => Some(String::from_utf8(output.stdout)?.trim().to_string()), + Err(GitAiError::GitCliError { + code: Some(128), .. + }) + | Err(GitAiError::GitCliError { code: Some(1), .. }) => None, + Err(e) => return Err(e), + }; + + // Start fast-import process + let mut fast_import_args = repo.global_args_for_exec(); + fast_import_args.push("fast-import".to_string()); + fast_import_args.push("--quiet".to_string()); + let mut child = spawn_git_with_piped_stdin(&fast_import_args)?; + let stdin = child.stdin.take(); + + Ok(StreamingNotesWriter { + child, + stdin, + existing_notes_tip, + entry_count: 0, + entries: Vec::new(), + seen: HashSet::new(), + }) + } + + /// Returns the number of entries added so far. + pub fn entry_count(&self) -> usize { + self.entries.len() + } + + /// Add a note entry. Writes the blob data to fast-import immediately. + pub fn add_entry(&mut self, commit_sha: String, note_content: String) -> Result<(), GitAiError> { + if !self.seen.insert(commit_sha.clone()) { + // Duplicate — update the entry in-place (last write wins) + if let Some(entry) = self.entries.iter_mut().find(|(sha, _)| *sha == commit_sha) { + entry.1 = note_content.clone(); + } + // Re-write the blob for the updated content + // Note: fast-import marks are sequential, we'll use a new mark and reference it later + } + + self.entry_count += 1; + let mark = self.entry_count; + + if let Some(ref mut stdin) = self.stdin { + use std::io::Write; + let mut buf = Vec::new(); + buf.extend_from_slice(b"blob\n"); + buf.extend_from_slice(format!("mark :{}\n", mark).as_bytes()); + buf.extend_from_slice(format!("data {}\n", note_content.len()).as_bytes()); + buf.extend_from_slice(note_content.as_bytes()); + buf.extend_from_slice(b"\n"); + stdin.write_all(&buf).map_err(GitAiError::IoError)?; + } + + self.entries.push((commit_sha, note_content)); + Ok(()) + } + + /// Write the commit entry and wait for fast-import to finish. + pub fn finish(mut self, repo: &Repository) -> Result<(), GitAiError> { + if self.entries.is_empty() { + // Nothing to write — kill the process + drop(self.stdin.take()); + let _ = self.child.wait(); + return Ok(()); + } + + // Deduplicate: for duplicates, keep the LAST entry (most recent content) + let mut final_entries: Vec<(String, String)> = Vec::new(); + let mut seen_final = HashSet::new(); + for (commit_sha, note_content) in self.entries.iter().rev() { + if seen_final.insert(commit_sha.clone()) { + final_entries.push((commit_sha.clone(), note_content.clone())); + } + } + final_entries.reverse(); + + // Build mark mapping: for each final entry, find its latest mark + // (entries are 1-indexed, and duplicates get new marks) + let mut commit_to_mark: HashMap<&str, usize> = HashMap::new(); + for (idx, (commit_sha, _)) in self.entries.iter().enumerate() { + commit_to_mark.insert(commit_sha.as_str(), idx + 1); + } + + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map_err(|e| GitAiError::Generic(format!("System clock before epoch: {}", e)))? + .as_secs(); + + if let Some(ref mut stdin) = self.stdin { + use std::io::Write; + let mut buf = Vec::new(); + + buf.extend_from_slice(b"commit refs/notes/ai\n"); + buf.extend_from_slice( + format!("committer git-ai {} +0000\n", now).as_bytes(), + ); + buf.extend_from_slice(b"data 0\n"); + if let Some(ref existing_tip) = self.existing_notes_tip { + buf.extend_from_slice(format!("from {}\n", existing_tip).as_bytes()); + } + + for (commit_sha, _) in &final_entries { + let mark = commit_to_mark[commit_sha.as_str()]; + let fanout_path = notes_path_for_object(commit_sha); + let flat_path = commit_sha.as_str(); + if flat_path != fanout_path { + buf.extend_from_slice(format!("D {}\n", flat_path).as_bytes()); + } + buf.extend_from_slice(format!("D {}\n", fanout_path).as_bytes()); + buf.extend_from_slice( + format!("M 100644 :{} {}\n", mark, fanout_path).as_bytes(), + ); + } + buf.extend_from_slice(b"\n"); + + stdin.write_all(&buf).map_err(GitAiError::IoError)?; + } + + // Close stdin to signal EOF to fast-import + drop(self.stdin.take()); + + // Wait for fast-import to finish + let output = self.child.wait_with_output().map_err(GitAiError::IoError)?; + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr).to_string(); + return Err(GitAiError::GitCliError { + code: output.status.code(), + stderr, + args: vec!["fast-import".to_string()], + }); + } + + crate::authorship::git_ai_hooks::post_notes_updated(repo, &final_entries); + Ok(()) + } +} + /// Batch-attach existing note blobs to commits without rewriting blob contents. /// /// Each entry is (commit_sha, existing_note_blob_oid). diff --git a/src/git/repository.rs b/src/git/repository.rs index 410f730e0..5728134b7 100644 --- a/src/git/repository.rs +++ b/src/git/repository.rs @@ -3068,6 +3068,29 @@ pub fn exec_git_stdin(args: &[String], stdin_data: &[u8]) -> Result Result { + let effective_args = + args_with_internal_git_profile(&args_with_disabled_hooks_if_needed(args), InternalGitProfile::General); + let mut cmd = Command::new(config::Config::get().git_cmd()); + cmd.args(&effective_args) + .stdin(std::process::Stdio::piped()) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()); + cmd.env_remove("GIT_EXTERNAL_DIFF"); + cmd.env_remove("GIT_DIFF_OPTS"); + + #[cfg(windows)] + { + if !is_interactive_terminal() { + cmd.creation_flags(CREATE_NO_WINDOW); + } + } + + cmd.spawn().map_err(GitAiError::IoError) +} + /// Helper to execute a git command with data provided on stdin and an explicit profile. pub fn exec_git_stdin_with_profile( args: &[String],