diff --git a/table/metadata.go b/table/metadata.go index 92b976cae..2b61c4efb 100644 --- a/table/metadata.go +++ b/table/metadata.go @@ -485,6 +485,19 @@ func (b *MetadataBuilder) RemoveSnapshots(snapshotIds []int64) error { return errors.New("current snapshot cannot be removed") } + // Filter out snapshots that are referenced by any branch or tag. + // This prevents a race condition where a concurrent commit adds a ref + // to a snapshot that is being expired. Without this check, we could + // remove a snapshot that was just linked to a new branch/tag, leaving + // the metadata in a corrupt state with a dangling reference. + snapshotIds = slices.DeleteFunc(snapshotIds, func(id int64) bool { + return b.isSnapshotReferenced(id) + }) + + if len(snapshotIds) == 0 { + return nil + } + b.snapshotList = slices.DeleteFunc(b.snapshotList, func(e Snapshot) bool { return slices.Contains(snapshotIds, e.SnapshotID) }) @@ -492,19 +505,23 @@ func (b *MetadataBuilder) RemoveSnapshots(snapshotIds []int64) error { return slices.Contains(snapshotIds, e.SnapshotID) }) - newRefs := make(map[string]SnapshotRef) - for name, ref := range b.refs { - if _, err := b.SnapshotByID(ref.SnapshotID); err == nil { - newRefs[name] = ref - } - } - b.refs = newRefs - b.updates = append(b.updates, NewRemoveSnapshotsUpdate(snapshotIds)) return nil } +// isSnapshotReferenced returns true if the given snapshot ID is referenced +// by any branch or tag in the current metadata state. +func (b *MetadataBuilder) isSnapshotReferenced(snapshotID int64) bool { + for _, ref := range b.refs { + if ref.SnapshotID == snapshotID { + return true + } + } + + return false +} + func (b *MetadataBuilder) AddSortOrder(sortOrder *SortOrder) error { curSchema := b.CurrentSchema() if curSchema == nil { diff --git a/table/metadata_builder_internal_test.go b/table/metadata_builder_internal_test.go index 6c9fed32e..aaa76345f 100644 --- a/table/metadata_builder_internal_test.go +++ b/table/metadata_builder_internal_test.go @@ -570,7 +570,7 @@ func TestSetBranchSnapshotCreatesBranchIfNotExists(t *testing.T) { require.Equal(t, int64(2), builder.updates[1].(*setSnapshotRefUpdate).SnapshotID) } -func TestRemoveSnapshotRemovesBranch(t *testing.T) { +func TestRemoveSnapshotSkipsReferencedSnapshot(t *testing.T) { builder := builderWithoutChanges(2) schemaID := 0 snapshot := Snapshot{ @@ -607,19 +607,92 @@ func TestRemoveSnapshotRemovesBranch(t *testing.T) { require.Equal(t, BranchRef, builder.updates[1].(*setSnapshotRefUpdate).RefType) require.Equal(t, int64(2), builder.updates[1].(*setSnapshotRefUpdate).SnapshotID) + // Attempting to remove a snapshot that is referenced by a branch should + // be a no-op. The snapshot and its ref must remain intact to prevent + // race conditions where a concurrent commit links a ref to a snapshot + // being expired by a maintenance job. newBuilder, err := MetadataBuilderFromBase(meta, "") require.NoError(t, err) require.NoError(t, newBuilder.RemoveSnapshots([]int64{snapshot.SnapshotID})) newMeta, err := newBuilder.Build() require.NoError(t, err) require.NotNil(t, newMeta) - require.Len(t, newMeta.(*metadataV2).SnapshotRefs, 0) - require.Len(t, newBuilder.updates, 1) - require.Equal(t, newBuilder.updates[0].(*removeSnapshotsUpdate).SnapshotIDs[0], snapshot.SnapshotID) - for k, r := range newMeta.Refs() { - require.NotEqual(t, r.SnapshotID, snapshot.SnapshotID) - require.NotEqual(t, k, "new_branch") + require.Len(t, newMeta.(*metadataV2).SnapshotRefs, 1) + require.Equal(t, int64(2), newMeta.(*metadataV2).SnapshotRefs["new_branch"].SnapshotID) + require.Len(t, newBuilder.updates, 0) + require.NotNil(t, newMeta.SnapshotByID(snapshot.SnapshotID)) +} + +// TestRemoveSnapshotsConcurrentRefRace simulates the race condition between +// snapshot expiration maintenance and a concurrent client commit that adds a +// branch ref to a snapshot targeted for expiration. +// +// Timeline: +// 1. Table has snap-1, snap-2, snap-3; main → snap-3 +// 2. Client commits: feature-branch → snap-2 +// 3. Maintenance loads fresh metadata (sees feature-branch → snap-2) +// 4. Maintenance calls RemoveSnapshots([snap-1, snap-2]) +// +// Expected: snap-1 is removed (unreferenced), snap-2 is skipped (referenced). +func TestRemoveSnapshotsConcurrentRefRace(t *testing.T) { + schemaID := 0 + newSnapshot := func(id int64, ts int64) Snapshot { + return Snapshot{ + SnapshotID: id, + ParentSnapshotID: nil, + SequenceNumber: 0, + TimestampMs: ts, + ManifestList: fmt.Sprintf("/snap-%d.avro", id), + Summary: &Summary{ + Operation: OpAppend, + Properties: map[string]string{}, + }, + SchemaID: &schemaID, + } } + + // Step 1: Build initial metadata with three snapshots, main → snap-3. + builder := builderWithoutChanges(2) + snap1 := newSnapshot(1, builder.base.LastUpdatedMillis()+1) + snap2 := newSnapshot(2, builder.base.LastUpdatedMillis()+2) + snap3 := newSnapshot(3, builder.base.LastUpdatedMillis()+3) + require.NoError(t, builder.AddSnapshot(&snap1)) + require.NoError(t, builder.AddSnapshot(&snap2)) + require.NoError(t, builder.AddSnapshot(&snap3)) + require.NoError(t, builder.SetSnapshotRef(MainBranch, 3, BranchRef)) + baseMeta, err := builder.Build() + require.NoError(t, err) + + // Step 2: Simulate client commit that links feature-branch → snap-2. + clientBuilder, err := MetadataBuilderFromBase(baseMeta, "") + require.NoError(t, err) + require.NoError(t, clientBuilder.SetSnapshotRef("feature-branch", 2, BranchRef)) + postClientMeta, err := clientBuilder.Build() + require.NoError(t, err) + + // Step 3: Maintenance loads fresh metadata (post-client commit) and + // tries to expire snap-1 and snap-2. + maintBuilder, err := MetadataBuilderFromBase(postClientMeta, "") + require.NoError(t, err) + require.NoError(t, maintBuilder.RemoveSnapshots([]int64{snap1.SnapshotID, snap2.SnapshotID})) + resultMeta, err := maintBuilder.Build() + require.NoError(t, err) + + // snap-1 must be removed: it is unreferenced. + require.Nil(t, resultMeta.SnapshotByID(snap1.SnapshotID)) + + // snap-2 must survive: it is referenced by feature-branch. + require.NotNil(t, resultMeta.SnapshotByID(snap2.SnapshotID)) + ref := resultMeta.(*metadataV2).SnapshotRefs["feature-branch"] + require.Equal(t, snap2.SnapshotID, ref.SnapshotID) + + // snap-3 must survive: it is referenced by main. + require.NotNil(t, resultMeta.SnapshotByID(snap3.SnapshotID)) + + // Only one remove-snapshots update should be emitted, containing only snap-1. + require.Len(t, maintBuilder.updates, 1) + removeUpd := maintBuilder.updates[0].(*removeSnapshotsUpdate) + require.Equal(t, []int64{snap1.SnapshotID}, removeUpd.SnapshotIDs) } func TestExpireMetadataLog(t *testing.T) {