Skip to content

Commit 7ded96c

Browse files
bilby91 and claude authored
runtime: normalize tar headers for reproducible build contexts (#86)
tarDirectory stamps mtime/uid/gid into headers from live FileInfo, so synthesized contexts (uid-reconcile, etc.) carry wall-clock mtimes that shift BuildKit's COPY vertex digest across invocations of byte-identical content. Downstream consumers that snapshot a workspace after one Up and restore it for a later Up hit full cache misses and re-extract GBs of image layers into new snapshotter dirs.

Normalize ModTime to epoch, zero AccessTime/ChangeTime, and clear uid/gid/uname/gname in every tar header. Also pin useruid's temp context files to the epoch via os.Chtimes so determinism is a local property of the synthesizer.

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent a9e2d70 commit 7ded96c

3 files changed

Lines changed: 132 additions & 2 deletions

File tree

runtime/docker/build.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"os"
1111
"path/filepath"
1212
"strings"
13+
"time"
1314

1415
"github.com/moby/moby/api/types/build"
1516
"github.com/moby/moby/client"
@@ -168,6 +169,12 @@ func streamBuildOutput(ctx context.Context, body io.ReadCloser, events chan<- ru
168169
}
169170
}
170171

// tarEpoch is the modification time tarDirectory writes into every tar
// header it emits, replacing the file's real mtime. It is spelled out as
// the Unix epoch instead of the zero time.Time{} so the intent is
// explicit and no reader has to special-case the year-1 sentinel.
var tarEpoch = time.Unix(0, 0)
177+
171178
// tarDirectory writes the contents of dir (recursively) into w as a
172179
// non-gzipped tar archive. Symlinks are preserved as tar TypeSymlink
173180
// entries with their original target text; the daemon-side BuildKit
@@ -217,6 +224,22 @@ func tarDirectory(dir string, w io.Writer) error {
217224
if d.IsDir() {
218225
hdr.Name = rel + "/"
219226
}
227+
// Normalize metadata so the tar stream — and therefore
228+
// BuildKit's COPY vertex digest — is reproducible across
229+
// invocations and machines. Wall-clock mtimes from
230+
// os.WriteFile (e.g. useruid's synthesized context) and
231+
// host-specific uid/gid would otherwise perturb the digest of
232+
// byte-identical content, causing cache misses and forcing
233+
// re-extraction of downstream image layers. BuildKit hashes
234+
// content for cache purposes, so erasing these fields doesn't
235+
// lose information it relies on.
236+
hdr.ModTime = tarEpoch
237+
hdr.AccessTime = time.Time{}
238+
hdr.ChangeTime = time.Time{}
239+
hdr.Uid = 0
240+
hdr.Gid = 0
241+
hdr.Uname = ""
242+
hdr.Gname = ""
220243

221244
if err := tw.WriteHeader(hdr); err != nil {
222245
return err

runtime/docker/build_test.go

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
11
package docker
22

33
import (
4+
"archive/tar"
5+
"bytes"
6+
"io"
7+
"os"
8+
"path/filepath"
49
"reflect"
510
"testing"
11+
"time"
612
)
713

814
func TestExtractBaseImages(t *testing.T) {
@@ -92,6 +98,91 @@ FROM alpine:3.20 AS b`,
9298
}
9399
}
94100

101+
// TestTarDirectoryNormalizesMetadata guards against the BuildKit
102+
// COPY-cache regression caused by wall-clock mtimes in synthesized
103+
// build contexts (uid-reconcile, etc.) leaking into the tar stream and
104+
// perturbing the vertex digest of byte-identical content.
105+
func TestTarDirectoryNormalizesMetadata(t *testing.T) {
106+
dir := t.TempDir()
107+
if err := os.WriteFile(filepath.Join(dir, "file.txt"), []byte("hello"), 0o644); err != nil {
108+
t.Fatalf("write: %v", err)
109+
}
110+
if err := os.Mkdir(filepath.Join(dir, "sub"), 0o755); err != nil {
111+
t.Fatalf("mkdir: %v", err)
112+
}
113+
if err := os.WriteFile(filepath.Join(dir, "sub", "nested.txt"), []byte("world"), 0o644); err != nil {
114+
t.Fatalf("write nested: %v", err)
115+
}
116+
117+
var buf bytes.Buffer
118+
if err := tarDirectory(dir, &buf); err != nil {
119+
t.Fatalf("tarDirectory: %v", err)
120+
}
121+
122+
tr := tar.NewReader(&buf)
123+
entries := 0
124+
for {
125+
hdr, err := tr.Next()
126+
if err == io.EOF {
127+
break
128+
}
129+
if err != nil {
130+
t.Fatalf("tar.Next: %v", err)
131+
}
132+
entries++
133+
if !hdr.ModTime.Equal(time.Unix(0, 0)) {
134+
t.Errorf("%s: ModTime not epoch: %v", hdr.Name, hdr.ModTime)
135+
}
136+
if !hdr.AccessTime.IsZero() {
137+
t.Errorf("%s: AccessTime not zero: %v", hdr.Name, hdr.AccessTime)
138+
}
139+
if !hdr.ChangeTime.IsZero() {
140+
t.Errorf("%s: ChangeTime not zero: %v", hdr.Name, hdr.ChangeTime)
141+
}
142+
if hdr.Uid != 0 || hdr.Gid != 0 {
143+
t.Errorf("%s: uid/gid not zero: uid=%d gid=%d", hdr.Name, hdr.Uid, hdr.Gid)
144+
}
145+
if hdr.Uname != "" || hdr.Gname != "" {
146+
t.Errorf("%s: uname/gname not empty: uname=%q gname=%q", hdr.Name, hdr.Uname, hdr.Gname)
147+
}
148+
}
149+
if entries == 0 {
150+
t.Fatal("no tar entries read")
151+
}
152+
}
153+
154+
// TestTarDirectoryDeterministic asserts that taring the same content
155+
// twice with diverging wall-clock mtimes produces byte-identical
156+
// streams — the property BuildKit's COPY cache relies on.
157+
func TestTarDirectoryDeterministic(t *testing.T) {
158+
mkContext := func(t *testing.T, mtime time.Time) string {
159+
t.Helper()
160+
dir := t.TempDir()
161+
p := filepath.Join(dir, "uid-fix.sh")
162+
if err := os.WriteFile(p, []byte("#!/bin/sh\nexit 0\n"), 0o755); err != nil {
163+
t.Fatalf("write: %v", err)
164+
}
165+
if err := os.Chtimes(p, mtime, mtime); err != nil {
166+
t.Fatalf("chtimes: %v", err)
167+
}
168+
return dir
169+
}
170+
171+
a := mkContext(t, time.Unix(1_700_000_000, 0))
172+
b := mkContext(t, time.Unix(1_800_000_000, 0))
173+
174+
var bufA, bufB bytes.Buffer
175+
if err := tarDirectory(a, &bufA); err != nil {
176+
t.Fatalf("tarDirectory a: %v", err)
177+
}
178+
if err := tarDirectory(b, &bufB); err != nil {
179+
t.Fatalf("tarDirectory b: %v", err)
180+
}
181+
if !bytes.Equal(bufA.Bytes(), bufB.Bytes()) {
182+
t.Fatalf("tar streams differ despite identical content (mtime leaked)")
183+
}
184+
}
185+
95186
func TestSubstituteArgs(t *testing.T) {
96187
args := map[string]string{"X": "alpine", "Y": "3.20"}
97188
cases := []struct {

useruid.go

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"runtime"
99
"strconv"
1010
"syscall"
11+
"time"
1112

1213
"github.com/crunchloop/devcontainer/config"
1314
"github.com/crunchloop/devcontainer/events"
@@ -84,11 +85,26 @@ func (e *Engine) reconcileRemoteUserUID(ctx context.Context, cfg *config.Resolve
8485
}
8586
defer os.RemoveAll(tmp)
8687

87-
if err := os.WriteFile(filepath.Join(tmp, "uid-fix.sh"), []byte(uidReconcileScript), 0o755); err != nil {
88+
// Pin synthesized-context file times to the epoch so the tar
89+
// stream is stable across invocations regardless of wall-clock.
90+
// runtime/docker/build.go also normalizes tar headers as a
91+
// defense-in-depth measure, but keeping the on-disk mtimes
92+
// deterministic here means this context's reproducibility is a
93+
// local property — independent of any consumer's tar pipeline.
94+
epoch := time.Unix(0, 0)
95+
uidFix := filepath.Join(tmp, "uid-fix.sh")
96+
if err := os.WriteFile(uidFix, []byte(uidReconcileScript), 0o755); err != nil {
97+
return "", err
98+
}
99+
if err := os.Chtimes(uidFix, epoch, epoch); err != nil {
88100
return "", err
89101
}
90102
df := generateUIDDockerfile(finalImage, user, hostUID, hostGID)
91-
if err := os.WriteFile(filepath.Join(tmp, "Dockerfile"), []byte(df), 0o644); err != nil {
103+
dfPath := filepath.Join(tmp, "Dockerfile")
104+
if err := os.WriteFile(dfPath, []byte(df), 0o644); err != nil {
105+
return "", err
106+
}
107+
if err := os.Chtimes(dfPath, epoch, epoch); err != nil {
92108
return "", err
93109
}
94110
opts.bus.Emit(events.BuildStartEvent{Source: events.BuildSourceUIDReconcile, Ref: tag})

0 commit comments

Comments
 (0)