From 89a14ac3f7318142f8f79335135074ba04b6dbce Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Wed, 25 Mar 2026 11:31:52 +0100 Subject: [PATCH 1/7] Handle hardlinks in tarfiles In bootc images, the typical layout for a layer tar is: ``` sysroot/ostree/repo/objects/9f/a74817a833dd0b4cefd91da9072006dde770bff03166a75f8e0f2e6b795c9e.file usr/bin/bash link to sysroot/ostree/repo/objects/9f/a74817a833dd0b4cefd91da9072006dde770bff03166a75f8e0f2e6b795c9e.file ``` In the tar file this makes the sha256 name a "real" file object, and the actual file a hardlink referencing it. When diffing such a layer we're only looking at the path/basename of the "real" file, which means we will never find the right source to delta against. To fix this we record *all* the names for each file, and compare against them. Comparing an OCI layer with this gives a large boost: -rw-r--r--. 1 alex alex 17M 25 mar 10.58 image1-layer.tar -rw-r--r--. 1 alex alex 17M 25 mar 10.58 image2-layer.tar -rw-r--r--. 1 alex alex 17M 25 mar 10.59 old-result.tardiff -rw-r--r--. 
1 alex alex 3,0M 25 mar 11.19 new-result.tardiff Signed-off-by: Alexander Larsson --- pkg/tar-diff/analysis.go | 69 +++++++++++++------ pkg/tar-diff/analysis_test.go | 122 ++++++++++++++++++++++++++++++++++ pkg/tar-diff/diff.go | 6 +- 3 files changed, 175 insertions(+), 22 deletions(-) diff --git a/pkg/tar-diff/analysis.go b/pkg/tar-diff/analysis.go index 2c6067a..fddbbf1 100644 --- a/pkg/tar-diff/analysis.go +++ b/pkg/tar-diff/analysis.go @@ -16,9 +16,10 @@ import ( ) type tarFileInfo struct { - index int - basename string - path string + index int + // Hard-linked files have multiple names/basenames + basenames []string + paths []string size int64 sha1 string blobs []rollsumBlob @@ -177,17 +178,26 @@ func analyzeTar(tarMaybeCompressed io.Reader) (*tarInfo, error) { } fileInfo := tarFileInfo{ - index: index, - basename: path.Base(pathname), - path: pathname, - size: hdr.Size, - sha1: hex.EncodeToString(h.Sum(nil)), - blobs: r.GetBlobs(), + index: index, + basenames: []string{path.Base(pathname)}, + paths: []string{pathname}, + size: hdr.Size, + sha1: hex.EncodeToString(h.Sum(nil)), + blobs: r.GetBlobs(), } infoByPath[pathname] = len(files) files = append(files, fileInfo) } + // Add hardlink paths and basenames to their target files + for i := range hardlinks { + hl := &hardlinks[i] + if fileIndex, ok := infoByPath[hl.linkname]; ok { + files[fileIndex].paths = append(files[fileIndex].paths, hl.path) + files[fileIndex].basenames = append(files[fileIndex].basenames, path.Base(hl.path)) + } + } + info := tarInfo{files: files, hardlinks: hardlinks} return &info, nil } @@ -198,21 +208,33 @@ func isDeltaCandidate(file *tarFileInfo) bool { // Look for known non-delta-able files (currently just compression) // NB: We explicitly don't have .gz here in case someone might be // using --rsyncable for that. 
- if strings.HasPrefix(file.basename, ".xz") || - strings.HasPrefix(file.basename, ".bz2") { - return false + for _, basename := range file.basenames { + if strings.HasPrefix(basename, ".xz") || + strings.HasPrefix(basename, ".bz2") { + return false + } } return true } func nameIsSimilar(a *tarFileInfo, b *tarFileInfo, fuzzy int) bool { - if fuzzy == 0 { - return a.basename == b.basename + for _, aBasename := range a.basenames { + for _, bBasename := range b.basenames { + if fuzzy == 0 { + if aBasename == bBasename { + return true + } + } else { + aa := strings.SplitAfterN(aBasename, ".", 2)[0] + bb := strings.SplitAfterN(bBasename, ".", 2)[0] + if aa == bb { + return true + } + } + } } - aa := strings.SplitAfterN(a.basename, ".", 2)[0] - bb := strings.SplitAfterN(b.basename, ".", 2)[0] - return aa == bb + return false } // Check that two files are not wildly dissimilar in size. @@ -283,7 +305,9 @@ func analyzeForDelta(old *tarInfo, newTar *tarInfo, oldFile io.Reader) (*deltaAn s := &sourceInfos[i] if !s.file.overwritten { sourceBySha1[s.file.sha1] = s - sourceByPath[s.file.path] = s + for _, p := range s.file.paths { + sourceByPath[p] = s + } sourceByIndex[s.file.index] = s } } @@ -303,7 +327,14 @@ func analyzeForDelta(old *tarInfo, newTar *tarInfo, oldFile io.Reader) (*deltaAn if source == nil && isDeltaCandidate(file) { // No exact match, try to find a useful source - s := sourceByPath[file.path] + // Check if any of the target file's paths match a source file + var s *sourceInfo + for _, p := range file.paths { + if matchedSource := sourceByPath[p]; matchedSource != nil { + s = matchedSource + break + } + } if s != nil && isDeltaCandidate(s.file) && sizeIsSimilar(file, s.file) { usedForDelta = true diff --git a/pkg/tar-diff/analysis_test.go b/pkg/tar-diff/analysis_test.go index f885244..16bfd17 100644 --- a/pkg/tar-diff/analysis_test.go +++ b/pkg/tar-diff/analysis_test.go @@ -214,3 +214,125 @@ func TestAnalyzeForDelta_HardlinksInTargetInfo(t *testing.T) { 
t.Errorf("Expected linkname 'file.txt', got %q", hlInfo.hardlink.linkname) } } + +func TestAnalyzeTar_HardlinksAddMultiplePaths(t *testing.T) { + entries := []tarEntry{ + {name: "blobs/sha256/abc123", typeflag: tar.TypeReg, data: []byte("content")}, + {name: "real/file.txt", typeflag: tar.TypeLink, linkname: "blobs/sha256/abc123"}, + {name: "other/link.txt", typeflag: tar.TypeLink, linkname: "blobs/sha256/abc123"}, + } + tarFile, err := createTestTar(entries) + if err != nil { + t.Fatalf("Failed to create test tar: %v", err) + } + + info, err := analyzeTar(tarFile) + if err != nil { + t.Fatalf("analyzeTar failed: %v", err) + } + + if len(info.files) != 1 { + t.Fatalf("Expected 1 file, got %d", len(info.files)) + } + + file := &info.files[0] + + expectedPaths := []string{"blobs/sha256/abc123", "real/file.txt", "other/link.txt"} + if len(file.paths) != len(expectedPaths) { + t.Fatalf("Expected %d paths, got %d", len(expectedPaths), len(file.paths)) + } + for i, expected := range expectedPaths { + if file.paths[i] != expected { + t.Errorf("Path %d: expected %q, got %q", i, expected, file.paths[i]) + } + } + + expectedBasenames := []string{"abc123", "file.txt", "link.txt"} + if len(file.basenames) != len(expectedBasenames) { + t.Fatalf("Expected %d basenames, got %d", len(expectedBasenames), len(file.basenames)) + } + for i, expected := range expectedBasenames { + if file.basenames[i] != expected { + t.Errorf("Basename %d: expected %q, got %q", i, expected, file.basenames[i]) + } + } +} + +func TestAnalyzeForDelta_MatchViaHardlinkPath(t *testing.T) { + // Old tar: file with sha256 name and real name hardlink + oldEntries := []tarEntry{ + {name: "blobs/sha256/abc123", typeflag: tar.TypeReg, data: []byte("version 1 content")}, + {name: "real/file.txt", typeflag: tar.TypeLink, linkname: "blobs/sha256/abc123"}, + } + oldTar, err := createTestTar(oldEntries) + if err != nil { + t.Fatalf("Failed to create old tar: %v", err) + } + + // New tar: file with different sha256 name 
but same real name + newEntries := []tarEntry{ + {name: "blobs/sha256/def456", typeflag: tar.TypeReg, data: []byte("version 2 content")}, + {name: "real/file.txt", typeflag: tar.TypeLink, linkname: "blobs/sha256/def456"}, + } + newTar, err := createTestTar(newEntries) + if err != nil { + t.Fatalf("Failed to create new tar: %v", err) + } + + if _, err := oldTar.Seek(0, 0); err != nil { + t.Fatalf("oldTar.Seek: %v", err) + } + if _, err := newTar.Seek(0, 0); err != nil { + t.Fatalf("newTar.Seek: %v", err) + } + + oldInfo, err := analyzeTar(oldTar) + if err != nil { + t.Fatalf("analyzeTar (old) failed: %v", err) + } + + newInfo, err := analyzeTar(newTar) + if err != nil { + t.Fatalf("analyzeTar (new) failed: %v", err) + } + + if _, err := oldTar.Seek(0, 0); err != nil { + t.Fatalf("oldTar.Seek: %v", err) + } + + analysis, err := analyzeForDelta(oldInfo, newInfo, oldTar) + if err != nil { + t.Fatalf("analyzeForDelta failed: %v", err) + } + defer func() { + if err := analysis.Close(); err != nil { + t.Fatalf("analysis.Close failed: %v", err) + } + }() + + // The new file should have matched the old file via the "real/file.txt" path + targetInfo := &analysis.targetInfos[0] + if targetInfo.source == nil { + t.Fatal("Expected target file to find a source match") + } + + // The source should be the old file (which has "real/file.txt" as one of its paths) + if len(targetInfo.source.file.paths) < 2 { + t.Fatal("Expected source file to have multiple paths") + } + foundRealPath := false + for _, p := range targetInfo.source.file.paths { + if p == "real/file.txt" { + foundRealPath = true + break + } + } + if !foundRealPath { + t.Error("Expected source file to have 'real/file.txt' in its paths") + } + + // The primary path (paths[0]) should be the sha256 path (first regular file entry) + if targetInfo.source.file.paths[0] != "blobs/sha256/abc123" { + t.Errorf("Expected primary source path to be 'blobs/sha256/abc123', got %q", targetInfo.source.file.paths[0]) + } +} diff --git 
a/pkg/tar-diff/diff.go b/pkg/tar-diff/diff.go index 9ee2c1c..d41e9a1 100644 --- a/pkg/tar-diff/diff.go +++ b/pkg/tar-diff/diff.go @@ -70,7 +70,7 @@ func (g *deltaGenerator) generateForFileWithBsdiff(info *targetInfo) error { file := info.file source := info.source - err := g.deltaWriter.SetCurrentFile(source.file.path) + err := g.deltaWriter.SetCurrentFile(source.file.paths[0]) if err != nil { return err } @@ -104,7 +104,7 @@ func (g *deltaGenerator) generateForFileWithrollsums(info *targetInfo) error { matches := info.rollsumMatches.matches pos := int64(0) - err := g.deltaWriter.SetCurrentFile(source.file.path) + err := g.deltaWriter.SetCurrentFile(source.file.paths[0]) if err != nil { return err } @@ -160,7 +160,7 @@ func (g *deltaGenerator) generateForFile(info *targetInfo) error { switch { case sourceFile.sha1 == file.sha1 && sourceFile.size == file.size: // Reuse exact file from old tar - if err := g.deltaWriter.WriteOldFile(sourceFile.path, uint64(sourceFile.size)); err != nil { + if err := g.deltaWriter.WriteOldFile(sourceFile.paths[0], uint64(sourceFile.size)); err != nil { return err } From 85737710236d39d182df62714c9c4bf083f5954c Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Wed, 25 Mar 2026 11:38:59 +0100 Subject: [PATCH 2/7] analysis: Fix the detection of compressed files We need to use HasSuffix, not HasPrefix. Signed-off-by: Alexander Larsson --- pkg/tar-diff/analysis.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/tar-diff/analysis.go b/pkg/tar-diff/analysis.go index fddbbf1..b8c1cef 100644 --- a/pkg/tar-diff/analysis.go +++ b/pkg/tar-diff/analysis.go @@ -209,8 +209,8 @@ func isDeltaCandidate(file *tarFileInfo) bool { // NB: We explicitly don't have .gz here in case someone might be // using --rsyncable for that. 
for _, basename := range file.basenames { - if strings.HasPrefix(basename, ".xz") || - strings.HasPrefix(basename, ".bz2") { + if strings.HasSuffix(basename, ".xz") || + strings.HasSuffix(basename, ".bz2") { return false } } From 7a58e5546c29920831d6614a00a6ce223e9545ba Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Wed, 25 Mar 2026 15:36:32 +0100 Subject: [PATCH 3/7] Support multiple "old" tar files Sometimes you have multiple tar files as source for delta information. In particular, this is common when you are diffing OCI container image layers. For example, when generating a delta for one layer in a new image you don't necessarily know which layer has the original files, because layer indexes are not stable, especially with bootc style OCI images that get rechunked. This is mostly trivial code that makes oldTars an array, but there is some complexity in how you have to handle filenames that conflict in the old tars. We assume they have been extracted in the order given, so any files in an earlier tar-file that has been overwritten by a file from a later tar-file will be marked overwritten and not used as delta source. Signed-off-by: Alexander Larsson --- README.md | 21 +++++- cmd/tar-diff/main.go | 28 ++++--- pkg/tar-diff/analysis.go | 101 ++++++++++++++++--- pkg/tar-diff/analysis_test.go | 4 +- pkg/tar-diff/diff.go | 32 +++++--- pkg/tar-diff/diff_test.go | 5 +- pkg/tar-diff/multifile_test.go | 131 +++++++++++++++++++++++++++++++++ 7 files changed, 263 insertions(+), 59 deletions(-) create mode 100644 pkg/tar-diff/multifile_test.go diff --git a/README.md b/README.md index a6bead9..7bf0608 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,9 @@ `tar-diff` is a golang library and set of commandline tools to diff and patch tar files. -`pkg/tar-diff` and the `tar-diff` tool take two (optionally compressed) tar files and generate a single file representing the delta between them (a tardiff file).
+`pkg/tar-diff` and the `tar-diff` tool take one or more old tar files (optionally compressed) and a new tar file to generate a single file representing the delta between them (a tardiff file). -`pkg/tar-patch` takes a tardiff file and the uncompressed contents (such as an extracted directory) of the first tar file and reconstructs (binary identically) the second tar file (uncompressed). +`pkg/tar-patch` takes a tardiff file and the uncompressed contents (such as an extracted directory) of the old tar file(s) and reconstructs (binary identically) the new tar file (uncompressed). ## Example ``` @@ -15,6 +15,21 @@ $ zcat new.tar.gz | shasum $ shasum reconstructed.tar ``` +## Multi-file example + +It is sometimes useful to have multiple sources for delta information, such as for example when the +sources are container image layers. In this case, you need to provide the old tar files in +the order they will be extracted when applying: + +``` +$ tar-diff layer1.tar layer2.tar layer3.tar new-layer.tar delta.tardiff +$ tar xf layer1.tar -C extracted/ +$ tar xf layer2.tar -C extracted/ +$ tar xf layer3.tar -C extracted/ +$ tar-patch delta.tardiff extracted/ reconstructed.tar +``` + +This handles the case where a file in a later tar file overwrites another. ## Build requirements @@ -40,4 +55,4 @@ The `tar-diff` file format is described in [file-format.md](file-format.md). ## License `tar-diff` is licensed under the Apache License, Version 2.0. See -[LICENSE](LICENSE) for the full license text. \ No newline at end of file +[LICENSE](LICENSE) for the full license text. 
diff --git a/cmd/tar-diff/main.go b/cmd/tar-diff/main.go index d998426..07ddb14 100644 --- a/cmd/tar-diff/main.go +++ b/cmd/tar-diff/main.go @@ -4,6 +4,7 @@ package main import ( "flag" "fmt" + "io" "log" "os" "path" @@ -19,7 +20,7 @@ var maxBsdiffSize = flag.Int("max-bsdiff-size", 192, "Max file size in megabytes func main() { flag.Usage = func() { - _, _ = fmt.Fprintf(flag.CommandLine.Output(), "Usage: %s [OPTION] old.tar.gz new.tar.gz result.tardiff\n", path.Base(os.Args[0])) + _, _ = fmt.Fprintf(flag.CommandLine.Output(), "Usage: %s [OPTION] old1.tar.gz [old2.tar.gz ...] new.tar.gz result.tardiff\n", path.Base(os.Args[0])) _, _ = fmt.Fprintf(flag.CommandLine.Output(), "Options:\n") flag.PrintDefaults() } @@ -31,35 +32,44 @@ func main() { return } - if flag.NArg() != 3 { + if flag.NArg() < 3 { flag.Usage() os.Exit(1) } - oldFilename := flag.Arg(0) - newFilename := flag.Arg(1) - deltaFilename := flag.Arg(2) + args := flag.Args() + numOldFiles := len(args) - 2 + oldFilenames := args[0:numOldFiles] + newFilename := args[numOldFiles] + deltaFilename := args[numOldFiles+1] - oldFile, err := os.Open(oldFilename) - if err != nil { - log.Fatalf("Error: %s", err) + oldFiles := make([]io.ReadSeeker, numOldFiles) + for i, oldFilename := range oldFilenames { + file, err := os.Open(oldFilename) + if err != nil { + log.Fatalf("Error: %s", err) + } + defer file.Close() + oldFiles[i] = file } newFile, err := os.Open(newFilename) if err != nil { log.Fatalf("Error: %s", err) } + defer newFile.Close() deltaFile, err := os.Create(deltaFilename) if err != nil { log.Fatalf("Error: %s", err) } + defer deltaFile.Close() options := tardiff.NewOptions() options.SetCompressionLevel(*compressionLevel) options.SetMaxBsdiffFileSize(int64(*maxBsdiffSize) * 1024 * 1024) - err = tardiff.Diff(oldFile, newFile, deltaFile, options) + err = tardiff.Diff(oldFiles, newFile, deltaFile, options) if err != nil { log.Fatalf("Error: %s", err) } diff --git a/pkg/tar-diff/analysis.go 
b/pkg/tar-diff/analysis.go index b8c1cef..c78c8cf 100644 --- a/pkg/tar-diff/analysis.go +++ b/pkg/tar-diff/analysis.go @@ -46,9 +46,10 @@ type targetInfo struct { } type sourceInfo struct { - file *tarFileInfo - usedForDelta bool - offset int64 + file *tarFileInfo + usedForDelta bool + offset int64 + sourceTarFileIndex int } type deltaAnalysis struct { @@ -251,36 +252,43 @@ func sizeIsSimilar(a *tarFileInfo, b *tarFileInfo) bool { return a.size < 10*b.size && b.size < 10*a.size } -func extractDeltaData(tarMaybeCompressed io.Reader, sourceByIndex map[int]*sourceInfo, dest *os.File) error { - offset := int64(0) +type indexKey struct { + fileIndex int + entryIndex int +} - tarFile, _, err := compression.AutoDecompress(tarMaybeCompressed) - if err != nil { - return err - } - defer func() { - if err := tarFile.Close(); err != nil { - log.Printf("close tar file: %v", err) - } - }() +func extractDeltaData(tarMaybeCompressedFiles []io.ReadSeeker, sourceByIndex map[indexKey]*sourceInfo, dest *os.File) error { + offset := int64(0) - rdr := tar.NewReader(tarFile) - for index := 0; true; index++ { - var hdr *tar.Header - hdr, err = rdr.Next() + for fileIndex, tarMaybeCompressed := range tarMaybeCompressedFiles { + tarFile, _, err := compression.AutoDecompress(tarMaybeCompressed) if err != nil { - if err == io.EOF { - break // Expected error - } return err } - info := sourceByIndex[index] - if info != nil && info.usedForDelta { - info.offset = offset - offset += hdr.Size - if _, err := io.Copy(dest, rdr); err != nil { + defer func() { + if err := tarFile.Close(); err != nil { + log.Printf("close tar file: %v", err) + } + }() + + rdr := tar.NewReader(tarFile) + for index := 0; true; index++ { + var hdr *tar.Header + hdr, err = rdr.Next() + if err != nil { + if err == io.EOF { + break // Expected error + } return err } + info := sourceByIndex[indexKey{fileIndex: fileIndex, entryIndex: index}] + if info != nil && info.usedForDelta { + info.offset = offset + offset += hdr.Size + if 
_, err := io.Copy(dest, rdr); err != nil { + return err + } + } } } return nil @@ -292,15 +300,42 @@ func abs(n int64) int64 { } return n } -func analyzeForDelta(old *tarInfo, newTar *tarInfo, oldFile io.Reader) (*deltaAnalysis, error) { - sourceInfos := make([]sourceInfo, 0, len(old.files)) - for i := range old.files { - sourceInfos = append(sourceInfos, sourceInfo{file: &old.files[i]}) + +func buildSourceInfos(oldInfos []*tarInfo) []sourceInfo { + sourceInfos := make([]sourceInfo, 0) + pathToFileIndex := make(map[string]int) + + for fileIdx, oldInfo := range oldInfos { + for i := range oldInfo.files { + file := &oldInfo.files[i] + + // Check if any path from this file conflicts with existing files + for _, p := range file.paths { + if existingIdx, exists := pathToFileIndex[p]; exists { + sourceInfos[existingIdx].file.overwritten = true + } + } + + // Add the primary path of this file (which is the one used as delta source) + currentFileIndex := len(sourceInfos) + pathToFileIndex[file.paths[0]] = currentFileIndex + + sourceInfos = append(sourceInfos, sourceInfo{ + file: file, + sourceTarFileIndex: fileIdx, + }) + } } + return sourceInfos +} + +func analyzeForDelta(oldInfos []*tarInfo, newTar *tarInfo, oldFiles []io.ReadSeeker) (*deltaAnalysis, error) { + sourceInfos := buildSourceInfos(oldInfos) + sourceBySha1 := make(map[string]*sourceInfo) sourceByPath := make(map[string]*sourceInfo) - sourceByIndex := make(map[int]*sourceInfo) + sourceByIndex := make(map[indexKey]*sourceInfo) for i := range sourceInfos { s := &sourceInfos[i] if !s.file.overwritten { @@ -308,7 +343,7 @@ func analyzeForDelta(old *tarInfo, newTar *tarInfo, oldFile io.Reader) (*deltaAn for _, p := range s.file.paths { sourceByPath[p] = s } - sourceByIndex[s.file.index] = s + sourceByIndex[indexKey{fileIndex: s.sourceTarFileIndex, entryIndex: s.file.index}] = s } } @@ -399,7 +434,7 @@ func analyzeForDelta(old *tarInfo, newTar *tarInfo, oldFile io.Reader) (*deltaAn return nil, err } - err = 
extractDeltaData(oldFile, sourceByIndex, tmpfile) + err = extractDeltaData(oldFiles, sourceByIndex, tmpfile) if err != nil { _ = os.Remove(tmpfile.Name()) return nil, err diff --git a/pkg/tar-diff/analysis_test.go b/pkg/tar-diff/analysis_test.go index 16bfd17..ec8bfae 100644 --- a/pkg/tar-diff/analysis_test.go +++ b/pkg/tar-diff/analysis_test.go @@ -189,7 +189,7 @@ func TestAnalyzeForDelta_HardlinksInTargetInfo(t *testing.T) { t.Fatalf("oldTar.Seek: %v", err) } - analysis, err := analyzeForDelta(oldInfo, newInfo, oldTar) + analysis, err := analyzeForDelta([]*tarInfo{oldInfo}, newInfo, []io.ReadSeeker{oldTar}) if err != nil { t.Fatalf("analyzeForDelta failed: %v", err) } @@ -300,7 +300,7 @@ func TestAnalyzeForDelta_MatchViaHardlinkPath(t *testing.T) { t.Fatalf("oldTar.Seek: %v", err) } - analysis, err := analyzeForDelta(oldInfo, newInfo, oldTar) + analysis, err := analyzeForDelta([]*tarInfo{oldInfo}, newInfo, []io.ReadSeeker{oldTar}) if err != nil { t.Fatalf("analyzeForDelta failed: %v", err) } diff --git a/pkg/tar-diff/diff.go b/pkg/tar-diff/diff.go index d41e9a1..32a527b 100644 --- a/pkg/tar-diff/diff.go +++ b/pkg/tar-diff/diff.go @@ -3,6 +3,7 @@ package tardiff import ( "archive/tar" "bytes" + "fmt" "io" "log" @@ -281,17 +282,26 @@ func NewOptions() *Options { } } -// Diff creates a binary difference between two tar archives. 
-func Diff(oldTarFile io.ReadSeeker, newTarFile io.ReadSeeker, diffFile io.Writer, options *Options) error { +// Diff creates a binary difference between a set of tar archives and a new tar archive +// oldTarFiles contains one or more old tar files, in extraction order +func Diff(oldTarFiles []io.ReadSeeker, newTarFile io.ReadSeeker, diffFile io.Writer, options *Options) error { if options == nil { options = NewOptions() } - // First analyze both tarfiles by themselves - oldInfo, err := analyzeTar(oldTarFile) - if err != nil { - return err + if len(oldTarFiles) == 0 { + return fmt.Errorf("at least one old tar file is required") + } + + // First analyze all tarfiles by themselves + oldInfos := make([]*tarInfo, len(oldTarFiles)) + for i, oldTarFile := range oldTarFiles { + oldInfo, err := analyzeTar(oldTarFile) + if err != nil { + return err + } + oldInfos[i] = oldInfo } newInfo, err := analyzeTar(newTarFile) @@ -300,9 +310,11 @@ func Diff(oldTarFile io.ReadSeeker, newTarFile io.ReadSeeker, diffFile io.Writer } // Reset tar.gz for re-reading - _, err = oldTarFile.Seek(0, 0) - if err != nil { - return err + for _, oldTarFile := range oldTarFiles { + _, err = oldTarFile.Seek(0, 0) + if err != nil { + return err + } } _, err = newTarFile.Seek(0, 0) if err != nil { @@ -310,7 +322,7 @@ func Diff(oldTarFile io.ReadSeeker, newTarFile io.ReadSeeker, diffFile io.Writer } // Compare new and old for delta information - analysis, err := analyzeForDelta(oldInfo, newInfo, oldTarFile) + analysis, err := analyzeForDelta(oldInfos, newInfo, oldTarFiles) if err != nil { return err } diff --git a/pkg/tar-diff/diff_test.go b/pkg/tar-diff/diff_test.go index e8fc674..c9b5f77 100644 --- a/pkg/tar-diff/diff_test.go +++ b/pkg/tar-diff/diff_test.go @@ -3,6 +3,7 @@ package tardiff import ( "archive/tar" "bytes" + "io" "testing" ) @@ -47,7 +48,7 @@ func TestGenerateDelta_Hardlinks(t *testing.T) { if _, err := oldTar.Seek(0, 0); err != nil { t.Fatalf("oldTar.Seek: %v", err) } - analysis, err := 
analyzeForDelta(oldInfo, newInfo, oldTar) + analysis, err := analyzeForDelta([]*tarInfo{oldInfo}, newInfo, []io.ReadSeeker{oldTar}) if err != nil { t.Fatalf("analyzeForDelta failed: %v", err) } @@ -135,7 +136,7 @@ func TestGenerateDelta_MixedHardlinksAndDuplicates(t *testing.T) { if _, err := oldTar.Seek(0, 0); err != nil { t.Fatalf("oldTar.Seek: %v", err) } - analysis, err := analyzeForDelta(oldInfo, newInfo, oldTar) + analysis, err := analyzeForDelta([]*tarInfo{oldInfo}, newInfo, []io.ReadSeeker{oldTar}) if err != nil { t.Fatalf("analyzeForDelta failed: %v", err) } diff --git a/pkg/tar-diff/multifile_test.go b/pkg/tar-diff/multifile_test.go new file mode 100644 index 0000000..e2d334b --- /dev/null +++ b/pkg/tar-diff/multifile_test.go @@ -0,0 +1,131 @@ +package tardiff + +import ( + "archive/tar" + "testing" +) + +func TestBuildSourceInfos(t *testing.T) { + // Create two tar infos + tar1Entries := []tarEntry{ + {name: "file1.txt", typeflag: tar.TypeReg, data: []byte("content1")}, + {name: "file2.txt", typeflag: tar.TypeReg, data: []byte("content2")}, + } + tar1, err := createTestTar(tar1Entries) + if err != nil { + t.Fatalf("Failed to create tar1: %v", err) + } + + tar2Entries := []tarEntry{ + {name: "file2.txt", typeflag: tar.TypeReg, data: []byte("content2-override")}, + {name: "file3.txt", typeflag: tar.TypeReg, data: []byte("content3")}, + } + tar2, err := createTestTar(tar2Entries) + if err != nil { + t.Fatalf("Failed to create tar2: %v", err) + } + + info1, err := analyzeTar(tar1) + if err != nil { + t.Fatalf("Failed to analyze tar1: %v", err) + } + + info2, err := analyzeTar(tar2) + if err != nil { + t.Fatalf("Failed to analyze tar2: %v", err) + } + + sourceInfos := buildSourceInfos([]*tarInfo{info1, info2}) + + // Should have 4 files total (file1, file2-orig, file2-override, file3) + // But file2-orig should be marked as overwritten + if len(sourceInfos) != 4 { + t.Fatalf("Expected 4 source infos, got %d", len(sourceInfos)) + } + + // Check that first
file2 is marked as overwritten + var file2FromTar1 *sourceInfo + var file2FromTar2 *sourceInfo + for i := range sourceInfos { + s := &sourceInfos[i] + if s.file.paths[0] == "file2.txt" { + switch s.sourceTarFileIndex { + case 0: + file2FromTar1 = s + case 1: + file2FromTar2 = s + } + } + } + + if file2FromTar1 == nil { + t.Fatal("file2.txt from tar1 not found") + } + if file2FromTar2 == nil { + t.Fatal("file2.txt from tar2 not found") + } + + if !file2FromTar1.file.overwritten { + t.Error("file2.txt from tar1 should be marked as overwritten") + } + if file2FromTar2.file.overwritten { + t.Error("file2.txt from tar2 should NOT be marked as overwritten") + } +} + +func TestBuildSourceInfos_HardlinkConflicts(t *testing.T) { + // Layer 1: + // * sha256 file with hardlink to real name, + // * a file that will be overwritten via a hardlink + tar1Entries := []tarEntry{ + {name: "blobs/sha256/abc123", typeflag: tar.TypeReg, data: []byte("version1")}, + {name: "files/app.bin", typeflag: tar.TypeLink, linkname: "blobs/sha256/abc123"}, + {name: "files/replace-me", typeflag: tar.TypeReg, data: []byte("version1")}, + } + tar1, err := createTestTar(tar1Entries) + if err != nil { + t.Fatalf("Failed to create tar1: %v", err) + } + + // Layer 2: + // * different sha256 file with same hardlink name, will not overwrite old blob + // * a hardlink that overwrites replace-me + tar2Entries := []tarEntry{ + {name: "blobs/sha256/def456", typeflag: tar.TypeReg, data: []byte("version2")}, + {name: "files/app.bin", typeflag: tar.TypeLink, linkname: "blobs/sha256/def456"}, + {name: "files/other-file", typeflag: tar.TypeReg, data: []byte("version1")}, + {name: "files/replace-me", typeflag: tar.TypeLink, linkname: "files/other-file"}, + } + tar2, err := createTestTar(tar2Entries) + if err != nil { + t.Fatalf("Failed to create tar2: %v", err) + } + + info1, err := analyzeTar(tar1) + if err != nil { + t.Fatalf("Failed to analyze tar1: %v", err) + } + + info2, err := analyzeTar(tar2) + if err != nil 
{ + t.Fatalf("Failed to analyze tar2: %v", err) + } + + sourceInfos := buildSourceInfos([]*tarInfo{info1, info2}) + + // Should have 4 files (two from each layer) + if len(sourceInfos) != 4 { + t.Fatalf("Expected 4 source infos, got %d", len(sourceInfos)) + } + + // The layer 1 blob file should not be marked as overwritten, even though its hardlink path + // "files/app.bin" conflicts with layer 2's hardlink path. + if sourceInfos[0].file.overwritten { + t.Error("Layer 1 file should not be marked as overwritten due to hardlink path conflict") + } + + // But the replace-me file should be overwritten, by the replace-me hardlink + if !sourceInfos[1].file.overwritten { + t.Error("Layer 1 replace-me file should be marked as overwritten by the layer 2 hardlink") + } +} From 8a178b8817f23cfe170a90b2901253abe369fed9 Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Wed, 25 Mar 2026 16:30:29 +0100 Subject: [PATCH 4/7] tar-diff: Add --source-prefix If this is specified, only files with that prefix are used as sources for deltas. This can be useful if you only have a partially extracted version of the tar files on the system when applying the patch. This is particularly useful for bootc images, because only the files in /sysroot/ostree/repo/objects/ are easily available. Signed-off-by: Alexander Larsson --- README.md | 17 +++ cmd/tar-diff/main.go | 16 +++ pkg/tar-diff/analysis.go | 43 +++++-- pkg/tar-diff/analysis_test.go | 4 +- pkg/tar-diff/diff.go | 10 +- pkg/tar-diff/diff_test.go | 4 +- pkg/tar-diff/prefix_filter_test.go | 178 +++++++++++++++++++++++++++++ 7 files changed, 260 insertions(+), 12 deletions(-) create mode 100644 pkg/tar-diff/prefix_filter_test.go diff --git a/README.md b/README.md index 7bf0608..47ac347 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,23 @@ $ tar-patch delta.tardiff extracted/ reconstructed.tar This handles the case where a file in a later tar file overwrites another.
+### Partial extraction with prefix filtering + +If you only plan to extract certain directories from the old tar files on the target system, +you can use `--source-prefix` to restrict which files can be used as delta sources: + +``` +$ tar-diff --source-prefix=blobs/ --source-prefix=config/ old.tar new.tar delta.tardiff +$ tar xf old.tar blobs/ config/ -C extracted/ +$ tar-patch delta.tardiff extracted/ reconstructed.tar +``` + +This ensures the delta only references files that will be available in the extracted directory. + +This is particularly useful for e.g. bootc images, where only the files in the ostree repo +will be available on the system. For that case you would run tar-diff with +`--source-prefix=sysroot/ostree/repo/objects/` + ## Build requirements - golang >= 1.25 (see [`go.mod`](go.mod)) diff --git a/cmd/tar-diff/main.go b/cmd/tar-diff/main.go index 07ddb14..be1e34f 100644 --- a/cmd/tar-diff/main.go +++ b/cmd/tar-diff/main.go @@ -13,11 +13,24 @@ import ( tardiff "github.com/containers/tar-diff/pkg/tar-diff" ) +type prefixList []string + +func (p *prefixList) String() string { + return fmt.Sprintf("%v", *p) +} + +func (p *prefixList) Set(value string) error { + *p = append(*p, value) + return nil +} + var version = flag.Bool("version", false, "Show version") var compressionLevel = flag.Int("compression-level", 3, "zstd compression level") var maxBsdiffSize = flag.Int("max-bsdiff-size", 192, "Max file size in megabytes to consider using bsdiff, or 0 for no limit") +var sourcePrefixes prefixList func main() { + flag.Var(&sourcePrefixes, "source-prefix", "Only use source files with this path prefix for delta (can be specified multiple times)") flag.Usage = func() { _, _ = fmt.Fprintf(flag.CommandLine.Output(), "Usage: %s [OPTION] old1.tar.gz [old2.tar.gz ...] 
new.tar.gz result.tardiff\n", path.Base(os.Args[0])) @@ -68,6 +81,9 @@ func main() { options := tardiff.NewOptions() options.SetCompressionLevel(*compressionLevel) options.SetMaxBsdiffFileSize(int64(*maxBsdiffSize) * 1024 * 1024) + if len(sourcePrefixes) > 0 { + options.SetSourcePrefixes(sourcePrefixes) + } err = tardiff.Diff(oldFiles, newFile, deltaFile, options) if err != nil { diff --git a/pkg/tar-diff/analysis.go b/pkg/tar-diff/analysis.go index c78c8cf..db815d1 100644 --- a/pkg/tar-diff/analysis.go +++ b/pkg/tar-diff/analysis.go @@ -330,7 +330,31 @@ func buildSourceInfos(oldInfos []*tarInfo) []sourceInfo { return sourceInfos } -func analyzeForDelta(oldInfos []*tarInfo, newTar *tarInfo, oldFiles []io.ReadSeeker) (*deltaAnalysis, error) { +func matchesAnyPrefix(path string, prefixes []string) bool { + if len(prefixes) == 0 { + return true + } + for _, prefix := range prefixes { + if strings.HasPrefix(path, prefix) { + return true + } + } + return false +} + +func isDeltaSourceCandidate(s *sourceInfo, options *Options) bool { + if s.file.overwritten { + return false + } + primaryPath := s.file.paths[0] + return matchesAnyPrefix(primaryPath, options.sourcePrefixes) +} + +func analyzeForDelta(oldInfos []*tarInfo, newTar *tarInfo, oldFiles []io.ReadSeeker, options *Options) (*deltaAnalysis, error) { + if options == nil { + options = NewOptions() + } + sourceInfos := buildSourceInfos(oldInfos) sourceBySha1 := make(map[string]*sourceInfo) @@ -338,13 +362,14 @@ func analyzeForDelta(oldInfos []*tarInfo, newTar *tarInfo, oldFiles []io.ReadSee sourceByIndex := make(map[indexKey]*sourceInfo) for i := range sourceInfos { s := &sourceInfos[i] - if !s.file.overwritten { - sourceBySha1[s.file.sha1] = s - for _, p := range s.file.paths { - sourceByPath[p] = s - } - sourceByIndex[indexKey{fileIndex: s.sourceTarFileIndex, entryIndex: s.file.index}] = s + if !isDeltaSourceCandidate(s, options) { + continue + } + sourceBySha1[s.file.sha1] = s + for _, p := range s.file.paths { + 
sourceByPath[p] = s } + sourceByIndex[indexKey{fileIndex: s.sourceTarFileIndex, entryIndex: s.file.index}] = s } targetInfos := make([]targetInfo, 0, len(newTar.files)+len(newTar.hardlinks)) @@ -380,6 +405,10 @@ func analyzeForDelta(oldInfos []*tarInfo, newTar *tarInfo, oldFiles []io.ReadSee for j := range sourceInfos { s = &sourceInfos[j] + // Skip files that we're not allowed to use + if !isDeltaSourceCandidate(s, options) { + continue + } // Skip files that make no sense to delta (like compressed files) if !isDeltaCandidate(s.file) { continue diff --git a/pkg/tar-diff/analysis_test.go b/pkg/tar-diff/analysis_test.go index ec8bfae..e896a83 100644 --- a/pkg/tar-diff/analysis_test.go +++ b/pkg/tar-diff/analysis_test.go @@ -189,7 +189,7 @@ func TestAnalyzeForDelta_HardlinksInTargetInfo(t *testing.T) { t.Fatalf("oldTar.Seek: %v", err) } - analysis, err := analyzeForDelta([]*tarInfo{oldInfo}, newInfo, []io.ReadSeeker{oldTar}) + analysis, err := analyzeForDelta([]*tarInfo{oldInfo}, newInfo, []io.ReadSeeker{oldTar}, nil) if err != nil { t.Fatalf("analyzeForDelta failed: %v", err) } @@ -300,7 +300,7 @@ func TestAnalyzeForDelta_MatchViaHardlinkPath(t *testing.T) { t.Fatalf("oldTar.Seek: %v", err) } - analysis, err := analyzeForDelta([]*tarInfo{oldInfo}, newInfo, []io.ReadSeeker{oldTar}) + analysis, err := analyzeForDelta([]*tarInfo{oldInfo}, newInfo, []io.ReadSeeker{oldTar}, nil) if err != nil { t.Fatalf("analyzeForDelta failed: %v", err) } diff --git a/pkg/tar-diff/diff.go b/pkg/tar-diff/diff.go index 32a527b..6464cbd 100644 --- a/pkg/tar-diff/diff.go +++ b/pkg/tar-diff/diff.go @@ -262,6 +262,7 @@ func generateDelta(newFile io.ReadSeeker, deltaFile io.Writer, analysis *deltaAn type Options struct { compressionLevel int maxBsdiffSize int64 + sourcePrefixes []string } // SetCompressionLevel sets the compression level for the output diff file. 
@@ -274,11 +275,18 @@ func (o *Options) SetMaxBsdiffFileSize(maxBsdiffSize int64) { o.maxBsdiffSize = maxBsdiffSize } +// SetSourcePrefixes sets path prefixes to filter which source files can be used for delta. +// Only files whose primary path starts with one of these prefixes will be used as delta sources. +func (o *Options) SetSourcePrefixes(prefixes []string) { + o.sourcePrefixes = prefixes +} + // NewOptions creates a new Options struct with default values. func NewOptions() *Options { return &Options{ compressionLevel: 3, maxBsdiffSize: defaultMaxBsdiffSize, + sourcePrefixes: nil, } } @@ -322,7 +330,7 @@ func Diff(oldTarFiles []io.ReadSeeker, newTarFile io.ReadSeeker, diffFile io.Wri } // Compare new and old for delta information - analysis, err := analyzeForDelta(oldInfos, newInfo, oldTarFiles) + analysis, err := analyzeForDelta(oldInfos, newInfo, oldTarFiles, options) if err != nil { return err } diff --git a/pkg/tar-diff/diff_test.go b/pkg/tar-diff/diff_test.go index c9b5f77..946fe4f 100644 --- a/pkg/tar-diff/diff_test.go +++ b/pkg/tar-diff/diff_test.go @@ -48,7 +48,7 @@ func TestGenerateDelta_Hardlinks(t *testing.T) { if _, err := oldTar.Seek(0, 0); err != nil { t.Fatalf("oldTar.Seek: %v", err) } - analysis, err := analyzeForDelta([]*tarInfo{oldInfo}, newInfo, []io.ReadSeeker{oldTar}) + analysis, err := analyzeForDelta([]*tarInfo{oldInfo}, newInfo, []io.ReadSeeker{oldTar}, nil) if err != nil { t.Fatalf("analyzeForDelta failed: %v", err) } @@ -136,7 +136,7 @@ func TestGenerateDelta_MixedHardlinksAndDuplicates(t *testing.T) { if _, err := oldTar.Seek(0, 0); err != nil { t.Fatalf("oldTar.Seek: %v", err) } - analysis, err := analyzeForDelta([]*tarInfo{oldInfo}, newInfo, []io.ReadSeeker{oldTar}) + analysis, err := analyzeForDelta([]*tarInfo{oldInfo}, newInfo, []io.ReadSeeker{oldTar}, nil) if err != nil { t.Fatalf("analyzeForDelta failed: %v", err) } diff --git a/pkg/tar-diff/prefix_filter_test.go b/pkg/tar-diff/prefix_filter_test.go new file mode 100644 index 
0000000..a63b0e4 --- /dev/null +++ b/pkg/tar-diff/prefix_filter_test.go @@ -0,0 +1,178 @@ +package tardiff + +import ( + "archive/tar" + "io" + "testing" +) + +func TestMatchesAnyPrefix(t *testing.T) { + tests := []struct { + path string + prefixes []string + want bool + }{ + {"blobs/sha256/abc123", []string{"blobs/"}, true}, + {"config/app.conf", []string{"blobs/"}, false}, + {"data/file.txt", []string{"blobs/"}, false}, + {"blobs/sha256/abc123", []string{"blobs/", "config/"}, true}, + {"config/app.conf", []string{"blobs/", "config/"}, true}, + {"data/file.txt", []string{"blobs/", "config/"}, false}, + {"anything", []string{}, true}, // empty prefixes means match all + {"anything", nil, true}, // nil prefixes means match all + } + + for _, tt := range tests { + got := matchesAnyPrefix(tt.path, tt.prefixes) + if got != tt.want { + t.Errorf("matchesAnyPrefix(%q, %v) = %v, want %v", tt.path, tt.prefixes, got, tt.want) + } + } +} + +func setupPrefixFilterTestData(t *testing.T) (oldTar io.ReadSeeker, oldTarInfo *tarInfo, newTar io.ReadSeeker, newInfo *tarInfo) { + oldEntries := []tarEntry{ + {name: "blobs/sha256/abc123", typeflag: tar.TypeReg, data: []byte("blob-content")}, + {name: "config/app.conf", typeflag: tar.TypeReg, data: []byte("config-v1")}, + {name: "data/file.txt", typeflag: tar.TypeReg, data: []byte("data-v1")}, + } + oldTar, err := createTestTar(oldEntries) + if err != nil { + t.Fatalf("Failed to create oldTar: %v", err) + } + + // New tar: same names, but all files modified + newEntries := []tarEntry{ + {name: "blobs/sha256/abc123", typeflag: tar.TypeReg, data: []byte("blob-content-modified")}, + {name: "config/app.conf", typeflag: tar.TypeReg, data: []byte("config-v2")}, + {name: "data/file.txt", typeflag: tar.TypeReg, data: []byte("data-v2")}, + } + newTar, err = createTestTar(newEntries) + if err != nil { + t.Fatalf("Failed to create new tar: %v", err) + } + + oldTarInfo, err = analyzeTar(oldTar) + if err != nil { + t.Fatalf("Failed to analyze oldTar: 
%v", err) + } + if _, err := oldTar.Seek(0, 0); err != nil { + t.Fatalf("oldTar.Seek: %v", err) + } + + newInfo, err = analyzeTar(newTar) + if err != nil { + t.Fatalf("Failed to analyze new tar: %v", err) + } + if _, err := newTar.Seek(0, 0); err != nil { + t.Fatalf("newTar.Seek: %v", err) + } + + return oldTar, oldTarInfo, newTar, newInfo +} + +func TestDiff_SourcePrefix(t *testing.T) { + old, oldInfo, _, newInfo := setupPrefixFilterTestData(t) + + options := NewOptions() + options.SetSourcePrefixes([]string{"blobs/"}) + + analysis, err := analyzeForDelta([]*tarInfo{oldInfo}, newInfo, []io.ReadSeeker{old}, options) + if err != nil { + t.Fatalf("analyzeForDelta failed: %v", err) + } + defer func() { _ = analysis.Close() }() + + // Verify that only files with blobs/ prefix can be used as delta sources + if len(analysis.targetInfos) != 3 { + t.Fatalf("Expected 3 target infos, got %d", len(analysis.targetInfos)) + } + + for i := range analysis.targetInfos { + target := &analysis.targetInfos[i] + if target.file == nil { + continue + } + + fileName := target.file.paths[0] + source := target.source + + switch fileName { + case "blobs/sha256/abc123": + // Should have a source (matches prefix) + if source == nil { + t.Error("blobs/sha256/abc123 should have a source (matches prefix)") + } else if !source.usedForDelta { + t.Error("blobs/sha256/abc123 source should be usedForDelta") + } + + default: + // Should NOT have a source (doesn't match prefix) + if source != nil { + t.Errorf("%s should NOT have a source (doesn't match prefix)", fileName) + } + } + } +} + +func TestDiff_SourceMultiplePrefixes(t *testing.T) { + old, oldInfo, _, newInfo := setupPrefixFilterTestData(t) + + options := NewOptions() + options.SetSourcePrefixes([]string{"blobs/", "config/"}) + + analysis, err := analyzeForDelta([]*tarInfo{oldInfo}, newInfo, []io.ReadSeeker{old}, options) + if err != nil { + t.Fatalf("analyzeForDelta failed: %v", err) + } + defer func() { _ = analysis.Close() }() + + // Verify 
correct filtering + for i := range analysis.targetInfos { + target := &analysis.targetInfos[i] + if target.file == nil { + continue + } + + fileName := target.file.paths[0] + source := target.source + + switch fileName { + case "blobs/sha256/abc123", "config/app.conf": + // Should have a source (matches one of the prefixes) + if source == nil { + t.Errorf("%s should have a source (matches prefix)", fileName) + } + + default: + // Should NOT have a source (doesn't match any prefix) + if source != nil { + t.Error("data/file.txt should NOT have a source (doesn't match any prefix)") + } + } + } +} + +func TestDiff_NoPrefixFilter(t *testing.T) { + old, oldInfo, _, newInfo := setupPrefixFilterTestData(t) + + // No prefix filter (default) - pass nil for default options + analysis, err := analyzeForDelta([]*tarInfo{oldInfo}, newInfo, []io.ReadSeeker{old}, nil) + if err != nil { + t.Fatalf("analyzeForDelta failed: %v", err) + } + defer func() { _ = analysis.Close() }() + + // All files should have sources + sourcesFound := 0 + for i := range analysis.targetInfos { + target := &analysis.targetInfos[i] + if target.file != nil && target.source != nil { + sourcesFound++ + } + } + + if sourcesFound != 3 { + t.Errorf("Expected 3 files to have sources, got %d", sourcesFound) + } +} From 7869b15f852166c7003fc3661a686d48ad79a66e Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Thu, 26 Mar 2026 09:55:38 +0100 Subject: [PATCH 5/7] main: Fix up defer vs os.Exit warnings We're getting lint errors like: cmd/tar-diff/main.go:71:3: exitAfterDefer: log.Fatalf will exit, and `defer file.Close()` will not run (gocritic) So, lets use a realMain() wrapper that use a return value, and then do the os.Exit() inside the real main(). This lets us safely use defer. 
--- cmd/tar-diff/main.go | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/cmd/tar-diff/main.go b/cmd/tar-diff/main.go index be1e34f..d7555f3 100644 --- a/cmd/tar-diff/main.go +++ b/cmd/tar-diff/main.go @@ -29,7 +29,7 @@ var compressionLevel = flag.Int("compression-level", 3, "zstd compression level" var maxBsdiffSize = flag.Int("max-bsdiff-size", 192, "Max file size in megabytes to consider using bsdiff, or 0 for no limit") var sourcePrefixes prefixList -func main() { +func realMain() int { flag.Var(&sourcePrefixes, "source-prefix", "Only use source files with this path prefix for delta (can be specified multiple times)") flag.Usage = func() { @@ -42,12 +42,12 @@ func main() { if *version { fmt.Printf("%s %s\n", path.Base(os.Args[0]), protocol.VERSION) - return + return 0 } if flag.NArg() < 3 { flag.Usage() - os.Exit(1) + return 1 } args := flag.Args() @@ -60,7 +60,8 @@ func main() { for i, oldFilename := range oldFilenames { file, err := os.Open(oldFilename) if err != nil { - log.Fatalf("Error: %s", err) + log.Printf("Error: %s", err) + return 1 } defer file.Close() oldFiles[i] = file @@ -68,13 +69,15 @@ func main() { newFile, err := os.Open(newFilename) if err != nil { - log.Fatalf("Error: %s", err) + log.Printf("Error: %s", err) + return 1 } defer newFile.Close() deltaFile, err := os.Create(deltaFilename) if err != nil { - log.Fatalf("Error: %s", err) + log.Printf("Error: %s", err) + return 1 } defer deltaFile.Close() @@ -87,7 +90,13 @@ func main() { err = tardiff.Diff(oldFiles, newFile, deltaFile, options) if err != nil { - log.Fatalf("Error: %s", err) + log.Printf("Error: %s", err) + return 1 } + return 0 +} +// We wrap a function that has a return value so we can safely use defer +func main() { + os.Exit(realMain()) } From 8c31577c43e39367887317f8f86c7c1a7af9ba65 Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Thu, 26 Mar 2026 09:58:19 +0100 Subject: [PATCH 6/7] main: Fix lint errors about not checking the 
return value of Close These are not critical (the files are just read and will be closed at process termination anyway), so this is not fatal, but we log something so people are aware that something is weird. --- cmd/tar-diff/main.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/cmd/tar-diff/main.go b/cmd/tar-diff/main.go index d7555f3..9100eaa 100644 --- a/cmd/tar-diff/main.go +++ b/cmd/tar-diff/main.go @@ -29,6 +29,12 @@ var compressionLevel = flag.Int("compression-level", 3, "zstd compression level" var maxBsdiffSize = flag.Int("max-bsdiff-size", 192, "Max file size in megabytes to consider using bsdiff, or 0 for no limit") var sourcePrefixes prefixList +func closeAndWarn(file *os.File) { + if err := file.Close(); err != nil { + log.Printf("Failed to close file: %v", err) + } +} + func realMain() int { flag.Var(&sourcePrefixes, "source-prefix", "Only use source files with this path prefix for delta (can be specified multiple times)") @@ -63,7 +69,7 @@ func realMain() int { log.Printf("Error: %s", err) return 1 } - defer file.Close() + defer closeAndWarn(file) oldFiles[i] = file } @@ -72,14 +78,14 @@ func realMain() int { log.Printf("Error: %s", err) return 1 } - defer newFile.Close() + defer closeAndWarn(newFile) deltaFile, err := os.Create(deltaFilename) if err != nil { log.Printf("Error: %s", err) return 1 } - defer deltaFile.Close() + defer closeAndWarn(deltaFile) options := tardiff.NewOptions() options.SetCompressionLevel(*compressionLevel) From 1b1b2678214cc5885c41e83ffa61f10cb56f6e0d Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Thu, 26 Mar 2026 10:16:27 +0100 Subject: [PATCH 7/7] analysis: Move code into findFuzzyDeltaSource() helper This makes no functional difference, but fixes this lint warning: pkg/tar-diff/analysis.go:353:1: cyclomatic complexity 31 of func `analyzeForDelta` is high (> 30) (gocyclo) --- pkg/tar-diff/analysis.go | 69 +++++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 29 
deletions(-) diff --git a/pkg/tar-diff/analysis.go b/pkg/tar-diff/analysis.go index db815d1..fb3f168 100644 --- a/pkg/tar-diff/analysis.go +++ b/pkg/tar-diff/analysis.go @@ -350,6 +350,43 @@ func isDeltaSourceCandidate(s *sourceInfo, options *Options) bool { return matchesAnyPrefix(primaryPath, options.sourcePrefixes) } +func findFuzzyDeltaSource(sourceInfos []sourceInfo, targetFile *tarFileInfo, options *Options) *sourceInfo { + // Check for moved (first) or renamed (second) versions + for fuzzy := 0; fuzzy < 2; fuzzy++ { + var source *sourceInfo + for j := range sourceInfos { + s := &sourceInfos[j] + + // Skip files that we're not allowed to use + if !isDeltaSourceCandidate(s, options) { + continue + } + // Skip files that make no sense to delta (like compressed files) + if !isDeltaCandidate(s.file) { + continue + } + // We're looking for moved files, or renames to "similar names" + if !nameIsSimilar(targetFile, s.file, fuzzy) { + continue + } + // Skip files that are wildly dissimilar in size, such as binaries replaced by shellscripts + if !sizeIsSimilar(targetFile, s.file) { + continue + } + // Choose the matching source that has the most similar size to the new file + if source != nil && abs(source.file.size-targetFile.size) < abs(s.file.size-targetFile.size) { + continue + } + + source = s + } + if source != nil { + return source + } + } + return nil +} + func analyzeForDelta(oldInfos []*tarInfo, newTar *tarInfo, oldFiles []io.ReadSeeker, options *Options) (*deltaAnalysis, error) { if options == nil { options = NewOptions() @@ -400,35 +437,9 @@ func analyzeForDelta(oldInfos []*tarInfo, newTar *tarInfo, oldFiles []io.ReadSee usedForDelta = true source = s } else { - // Check for moved (first) or renamed (second) versions - for fuzzy := 0; fuzzy < 2 && source == nil; fuzzy++ { - for j := range sourceInfos { - s = &sourceInfos[j] - - // Skip files that we're not allowed to use - if !isDeltaSourceCandidate(s, options) { - continue - } - // Skip files that make no 
sense to delta (like compressed files) - if !isDeltaCandidate(s.file) { - continue - } - // We're looking for moved files, or renames to "similar names" - if !nameIsSimilar(file, s.file, fuzzy) { - continue - } - // Skip files that are wildly dissimilar in size, such as binaries replaces by shellscripts - if !sizeIsSimilar(file, s.file) { - continue - } - // Choose the matching source that have most similar size to the new file - if source != nil && abs(source.file.size-file.size) < abs(s.file.size-file.size) { - continue - } - - usedForDelta = true - source = s - } + source = findFuzzyDeltaSource(sourceInfos, file, options) + if source != nil { + usedForDelta = true } } }