diff --git a/README.md b/README.md index a6bead9..47ac347 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,9 @@ `tar-diff` is a golang library and set of commandline tools to diff and patch tar files. -`pkg/tar-diff` and the `tar-diff` tool take two (optionally compressed) tar files and generate a single file representing the delta between them (a tardiff file). +`pkg/tar-diff` and the `tar-diff` tool take one or more old tar files (optionally compressed) and a new tar file to generate a single file representing the delta between them (a tardiff file). -`pkg/tar-patch` takes a tardiff file and the uncompressed contents (such as an extracted directory) of the first tar file and reconstructs (binary identically) the second tar file (uncompressed). +`pkg/tar-patch` takes a tardiff file and the uncompressed contents (such as an extracted directory) of the old tar file(s) and reconstructs (binary identically) the new tar file (uncompressed). ## Example ``` @@ -15,6 +15,38 @@ $ zcat new.tar.gz | shasum $ shasum reconstructed.tar ``` +## Multi-file example + +It is sometimes useful to have multiple sources for delta information, such as for example when the +sources are container image layers. In this case, you need to provide the old tar files in +the order they will be extracted when applying: + +``` +$ tar-diff layer1.tar layer2.tar layer3.tar new-layer.tar delta.tardiff +$ tar xf layer1.tar -C extracted/ +$ tar xf layer2.tar -C extracted/ +$ tar xf layer3.tar -C extracted/ +$ tar-patch delta.tardiff extracted/ reconstructed.tar +``` + +This handles the case where a file in a later tar file overwrites another. 
+ +### Partial extraction with prefix filtering + +If you only plan to extract certain directories from the old tar files on the target system, +you can use `--source-prefix` to restrict which files can be used as delta sources: + +``` +$ tar-diff --source-prefix=blobs/ --source-prefix=config/ old.tar new.tar delta.tardiff +$ tar xf old.tar -C extracted/ blobs/ config/ +$ tar-patch delta.tardiff extracted/ reconstructed.tar +``` + +This ensures the delta only references files that will be available in the extracted directory. + +This is particularly useful for e.g. bootc images, where only the files in the ostree repo +will be available on the system. For that case you would run tar-diff with +`--source-prefix=sysroot/ostree/repo/objects/`. ## Build requirements @@ -40,4 +72,4 @@ The `tar-diff` file format is described in [file-format.md](file-format.md). ## License `tar-diff` is licensed under the Apache License, Version 2.0. See -[LICENSE](LICENSE) for the full license text. \ No newline at end of file +[LICENSE](LICENSE) for the full license text. 
diff --git a/cmd/tar-diff/main.go b/cmd/tar-diff/main.go index d998426..be1e34f 100644 --- a/cmd/tar-diff/main.go +++ b/cmd/tar-diff/main.go @@ -4,6 +4,7 @@ package main import ( "flag" "fmt" + "io" "log" "os" "path" @@ -12,14 +13,27 @@ import ( tardiff "github.com/containers/tar-diff/pkg/tar-diff" ) +type prefixList []string + +func (p *prefixList) String() string { + return fmt.Sprintf("%v", *p) +} + +func (p *prefixList) Set(value string) error { + *p = append(*p, value) + return nil +} + var version = flag.Bool("version", false, "Show version") var compressionLevel = flag.Int("compression-level", 3, "zstd compression level") var maxBsdiffSize = flag.Int("max-bsdiff-size", 192, "Max file size in megabytes to consider using bsdiff, or 0 for no limit") +var sourcePrefixes prefixList func main() { + flag.Var(&sourcePrefixes, "source-prefix", "Only use source files with this path prefix for delta (can be specified multiple times)") flag.Usage = func() { - _, _ = fmt.Fprintf(flag.CommandLine.Output(), "Usage: %s [OPTION] old.tar.gz new.tar.gz result.tardiff\n", path.Base(os.Args[0])) + _, _ = fmt.Fprintf(flag.CommandLine.Output(), "Usage: %s [OPTION] old1.tar.gz [old2.tar.gz ...] 
new.tar.gz result.tardiff\n", path.Base(os.Args[0])) _, _ = fmt.Fprintf(flag.CommandLine.Output(), "Options:\n") flag.PrintDefaults() } @@ -31,35 +45,47 @@ func main() { return } - if flag.NArg() != 3 { + if flag.NArg() < 3 { flag.Usage() os.Exit(1) } - oldFilename := flag.Arg(0) - newFilename := flag.Arg(1) - deltaFilename := flag.Arg(2) + args := flag.Args() + numOldFiles := len(args) - 2 + oldFilenames := args[0:numOldFiles] + newFilename := args[numOldFiles] + deltaFilename := args[numOldFiles+1] - oldFile, err := os.Open(oldFilename) - if err != nil { - log.Fatalf("Error: %s", err) + oldFiles := make([]io.ReadSeeker, numOldFiles) + for i, oldFilename := range oldFilenames { + file, err := os.Open(oldFilename) + if err != nil { + log.Fatalf("Error: %s", err) + } + defer file.Close() + oldFiles[i] = file } newFile, err := os.Open(newFilename) if err != nil { log.Fatalf("Error: %s", err) } + defer newFile.Close() deltaFile, err := os.Create(deltaFilename) if err != nil { log.Fatalf("Error: %s", err) } + defer deltaFile.Close() options := tardiff.NewOptions() options.SetCompressionLevel(*compressionLevel) options.SetMaxBsdiffFileSize(int64(*maxBsdiffSize) * 1024 * 1024) + if len(sourcePrefixes) > 0 { + options.SetSourcePrefixes(sourcePrefixes) + } - err = tardiff.Diff(oldFile, newFile, deltaFile, options) + err = tardiff.Diff(oldFiles, newFile, deltaFile, options) if err != nil { log.Fatalf("Error: %s", err) } diff --git a/pkg/tar-diff/analysis.go b/pkg/tar-diff/analysis.go index 2c6067a..fa89f23 100644 --- a/pkg/tar-diff/analysis.go +++ b/pkg/tar-diff/analysis.go @@ -16,9 +16,10 @@ import ( ) type tarFileInfo struct { - index int - basename string - path string + index int + // Hardlinked files have multiple names/basenames + basenames []string + paths []string size int64 sha1 string blobs []rollsumBlob @@ -45,9 +46,10 @@ type targetInfo struct { } type sourceInfo struct { - file *tarFileInfo - usedForDelta bool - offset int64 + file *tarFileInfo + usedForDelta 
bool + offset int64 + sourceTarFileIndex int } type deltaAnalysis struct { @@ -177,17 +179,26 @@ func analyzeTar(tarMaybeCompressed io.Reader) (*tarInfo, error) { } fileInfo := tarFileInfo{ - index: index, - basename: path.Base(pathname), - path: pathname, - size: hdr.Size, - sha1: hex.EncodeToString(h.Sum(nil)), - blobs: r.GetBlobs(), + index: index, + basenames: []string{path.Base(pathname)}, + paths: []string{pathname}, + size: hdr.Size, + sha1: hex.EncodeToString(h.Sum(nil)), + blobs: r.GetBlobs(), } infoByPath[pathname] = len(files) files = append(files, fileInfo) } + // Add hardlink paths and basenames to their target files + for i := range hardlinks { + hl := &hardlinks[i] + if fileIndex, ok := infoByPath[hl.linkname]; ok { + files[fileIndex].paths = append(files[fileIndex].paths, hl.path) + files[fileIndex].basenames = append(files[fileIndex].basenames, path.Base(hl.path)) + } + } + info := tarInfo{files: files, hardlinks: hardlinks} return &info, nil } @@ -198,21 +209,33 @@ func isDeltaCandidate(file *tarFileInfo) bool { // Look for known non-delta-able files (currently just compression) // NB: We explicitly don't have .gz here in case someone might be // using --rsyncable for that. 
- if strings.HasPrefix(file.basename, ".xz") || - strings.HasPrefix(file.basename, ".bz2") { - return false + for _, basename := range file.basenames { + if strings.HasSuffix(basename, ".xz") || + strings.HasSuffix(basename, ".bz2") { + return false + } } return true } func nameIsSimilar(a *tarFileInfo, b *tarFileInfo, fuzzy int) bool { - if fuzzy == 0 { - return a.basename == b.basename + for _, aBasename := range a.basenames { + for _, bBasename := range b.basenames { + if fuzzy == 0 { + if aBasename == bBasename { + return true + } + } else { + aa := strings.SplitAfterN(aBasename, ".", 2)[0] + bb := strings.SplitAfterN(bBasename, ".", 2)[0] + if aa == bb { + return true + } + } + } } - aa := strings.SplitAfterN(a.basename, ".", 2)[0] - bb := strings.SplitAfterN(b.basename, ".", 2)[0] - return aa == bb + return false } // Check that two files are not wildly dissimilar in size. @@ -229,36 +252,43 @@ func sizeIsSimilar(a *tarFileInfo, b *tarFileInfo) bool { return a.size < 10*b.size && b.size < 10*a.size } -func extractDeltaData(tarMaybeCompressed io.Reader, sourceByIndex map[int]*sourceInfo, dest *os.File) error { - offset := int64(0) +type indexKey struct { + fileIndex int + entryIndex int +} - tarFile, _, err := compression.AutoDecompress(tarMaybeCompressed) - if err != nil { - return err - } - defer func() { - if err := tarFile.Close(); err != nil { - log.Printf("close tar file: %v", err) - } - }() +func extractDeltaData(tarMaybeCompressedFiles []io.ReadSeeker, sourceByIndex map[indexKey]*sourceInfo, dest *os.File) error { + offset := int64(0) - rdr := tar.NewReader(tarFile) - for index := 0; true; index++ { - var hdr *tar.Header - hdr, err = rdr.Next() + for fileIndex, tarMaybeCompressed := range tarMaybeCompressedFiles { + tarFile, _, err := compression.AutoDecompress(tarMaybeCompressed) if err != nil { - if err == io.EOF { - break // Expected error - } return err } - info := sourceByIndex[index] - if info != nil && info.usedForDelta { - info.offset = offset 
- offset += hdr.Size - if _, err := io.Copy(dest, rdr); err != nil { + defer func() { + if err := tarFile.Close(); err != nil { + log.Printf("close tar file: %v", err) + } + }() + + rdr := tar.NewReader(tarFile) + for index := 0; true; index++ { + var hdr *tar.Header + hdr, err = rdr.Next() + if err != nil { + if err == io.EOF { + break // Expected error + } return err } + info := sourceByIndex[indexKey{fileIndex: fileIndex, entryIndex: index}] + if info != nil && info.usedForDelta { + info.offset = offset + offset += hdr.Size + if _, err := io.Copy(dest, rdr); err != nil { + return err + } + } } } return nil @@ -270,22 +300,76 @@ func abs(n int64) int64 { } return n } -func analyzeForDelta(old *tarInfo, newTar *tarInfo, oldFile io.Reader) (*deltaAnalysis, error) { - sourceInfos := make([]sourceInfo, 0, len(old.files)) - for i := range old.files { - sourceInfos = append(sourceInfos, sourceInfo{file: &old.files[i]}) + +func buildSourceInfos(oldInfos []*tarInfo) []sourceInfo { + sourceInfos := make([]sourceInfo, 0) + pathToFileIndex := make(map[string]int) + + for fileIdx, oldInfo := range oldInfos { + for i := range oldInfo.files { + file := &oldInfo.files[i] + + // Check if any path from this file conflicts with existing files + for _, p := range file.paths { + if existingIdx, exists := pathToFileIndex[p]; exists { + sourceInfos[existingIdx].file.overwritten = true + } + } + + // Add the primary path of this file (which is the one used as delta source) + currentFileIndex := len(sourceInfos) + pathToFileIndex[file.paths[0]] = currentFileIndex + + sourceInfos = append(sourceInfos, sourceInfo{ + file: file, + sourceTarFileIndex: fileIdx, + }) + } } + return sourceInfos +} + +func matchesAnyPrefix(path string, prefixes []string) bool { + if len(prefixes) == 0 { + return true + } + for _, prefix := range prefixes { + if strings.HasPrefix(path, prefix) { + return true + } + } + return false +} + +func isDeltaSourceCandidate(s *sourceInfo, options *Options) bool { + if 
s.file.overwritten { + return false + } + primaryPath := s.file.paths[0] + return matchesAnyPrefix(primaryPath, options.sourcePrefixes) +} + +func analyzeForDelta(oldInfos []*tarInfo, newTar *tarInfo, oldFiles []io.ReadSeeker, options *Options) (*deltaAnalysis, error) { + if options == nil { + options = NewOptions() + } + + sourceInfos := buildSourceInfos(oldInfos) + sourceBySha1 := make(map[string]*sourceInfo) sourceByPath := make(map[string]*sourceInfo) - sourceByIndex := make(map[int]*sourceInfo) + sourceByIndex := make(map[indexKey]*sourceInfo) for i := range sourceInfos { s := &sourceInfos[i] - if !s.file.overwritten { - sourceBySha1[s.file.sha1] = s - sourceByPath[s.file.path] = s - sourceByIndex[s.file.index] = s + if !isDeltaSourceCandidate(s, options) { + continue } + sourceBySha1[s.file.sha1] = s + for _, p := range s.file.paths { + sourceByPath[p] = s + } + sourceByIndex[indexKey{fileIndex: s.sourceTarFileIndex, entryIndex: s.file.index}] = s } targetInfos := make([]targetInfo, 0, len(newTar.files)+len(newTar.hardlinks)) @@ -303,7 +387,14 @@ func analyzeForDelta(old *tarInfo, newTar *tarInfo, oldFile io.Reader) (*deltaAn if source == nil && isDeltaCandidate(file) { // No exact match, try to find a useful source - s := sourceByPath[file.path] + // Check if any of the target file's paths match a source file + var s *sourceInfo + for _, p := range file.paths { + if matchedSource := sourceByPath[p]; matchedSource != nil { + s = matchedSource + break + } + } if s != nil && isDeltaCandidate(s.file) && sizeIsSimilar(file, s.file) { usedForDelta = true @@ -314,6 +405,10 @@ func analyzeForDelta(old *tarInfo, newTar *tarInfo, oldFile io.Reader) (*deltaAn for j := range sourceInfos { s = &sourceInfos[j] + // Skip files that we're not allowed to use + if !isDeltaSourceCandidate(s, options) { + continue + } // Skip files that make no sense to delta (like compressed files) if !isDeltaCandidate(s.file) { continue @@ -368,7 +463,7 @@ func analyzeForDelta(old *tarInfo, 
newTar *tarInfo, oldFile io.Reader) (*deltaAn return nil, err } - err = extractDeltaData(oldFile, sourceByIndex, tmpfile) + err = extractDeltaData(oldFiles, sourceByIndex, tmpfile) if err != nil { _ = os.Remove(tmpfile.Name()) return nil, err diff --git a/pkg/tar-diff/analysis_test.go b/pkg/tar-diff/analysis_test.go index f885244..e896a83 100644 --- a/pkg/tar-diff/analysis_test.go +++ b/pkg/tar-diff/analysis_test.go @@ -189,7 +189,7 @@ func TestAnalyzeForDelta_HardlinksInTargetInfo(t *testing.T) { t.Fatalf("oldTar.Seek: %v", err) } - analysis, err := analyzeForDelta(oldInfo, newInfo, oldTar) + analysis, err := analyzeForDelta([]*tarInfo{oldInfo}, newInfo, []io.ReadSeeker{oldTar}, nil) if err != nil { t.Fatalf("analyzeForDelta failed: %v", err) } @@ -214,3 +214,125 @@ func TestAnalyzeForDelta_HardlinksInTargetInfo(t *testing.T) { t.Errorf("Expected linkname 'file.txt', got %q", hlInfo.hardlink.linkname) } } + +func TestAnalyzeTar_HardlinksAddMultiplePaths(t *testing.T) { + entries := []tarEntry{ + {name: "blobs/sha256/abc123", typeflag: tar.TypeReg, data: []byte("content")}, + {name: "real/file.txt", typeflag: tar.TypeLink, linkname: "blobs/sha256/abc123"}, + {name: "other/link.txt", typeflag: tar.TypeLink, linkname: "blobs/sha256/abc123"}, + } + tarFile, err := createTestTar(entries) + if err != nil { + t.Fatalf("Failed to create test tar: %v", err) + } + + info, err := analyzeTar(tarFile) + if err != nil { + t.Fatalf("analyzeTar failed: %v", err) + } + + if len(info.files) != 1 { + t.Fatalf("Expected 1 file, got %d", len(info.files)) + } + + file := &info.files[0] + + expectedPaths := []string{"blobs/sha256/abc123", "real/file.txt", "other/link.txt"} + if len(file.paths) != len(expectedPaths) { + t.Fatalf("Expected %d paths, got %d", len(expectedPaths), len(file.paths)) + } + for i, expected := range expectedPaths { + if file.paths[i] != expected { + t.Errorf("Path %d: expected %q, got %q", i, expected, file.paths[i]) + } + } + + expectedBasenames := 
[]string{"abc123", "file.txt", "link.txt"} + if len(file.basenames) != len(expectedBasenames) { + t.Fatalf("Expected %d basenames, got %d", len(expectedBasenames), len(file.basenames)) + } + for i, expected := range expectedBasenames { + if file.basenames[i] != expected { + t.Errorf("Basename %d: expected %q, got %q", i, expected, file.basenames[i]) + } + } +} + +func TestAnalyzeForDelta_MatchViaHardlinkPath(t *testing.T) { + // Old tar: file with sha256 name and real name hardlink + oldEntries := []tarEntry{ + {name: "blobs/sha256/abc123", typeflag: tar.TypeReg, data: []byte("version 1 content")}, + {name: "real/file.txt", typeflag: tar.TypeLink, linkname: "blobs/sha256/abc123"}, + } + oldTar, err := createTestTar(oldEntries) + if err != nil { + t.Fatalf("Failed to create old tar: %v", err) + } + + // New tar: file with different sha256 name but same real name + newEntries := []tarEntry{ + {name: "blobs/sha256/def456", typeflag: tar.TypeReg, data: []byte("version 2 content")}, + {name: "real/file.txt", typeflag: tar.TypeLink, linkname: "blobs/sha256/def456"}, + } + newTar, err := createTestTar(newEntries) + if err != nil { + t.Fatalf("Failed to create new tar: %v", err) + } + + if _, err := oldTar.Seek(0, 0); err != nil { + t.Fatalf("oldTar.Seek: %v", err) + } + if _, err := newTar.Seek(0, 0); err != nil { + t.Fatalf("newTar.Seek: %v", err) + } + + oldInfo, err := analyzeTar(oldTar) + if err != nil { + t.Fatalf("analyzeTar (old) failed: %v", err) + } + + newInfo, err := analyzeTar(newTar) + if err != nil { + t.Fatalf("analyzeTar (new) failed: %v", err) + } + + if _, err := oldTar.Seek(0, 0); err != nil { + t.Fatalf("oldTar.Seek: %v", err) + } + + analysis, err := analyzeForDelta([]*tarInfo{oldInfo}, newInfo, []io.ReadSeeker{oldTar}, nil) + if err != nil { + t.Fatalf("analyzeForDelta failed: %v", err) + } + defer func() { + if err := analysis.Close(); err != nil { + t.Fatalf("analysis.Close failed: %v", err) + } + }() + + // The new file should have matched the old 
file via the "real/file.txt" path + targetInfo := &analysis.targetInfos[0] + if targetInfo.source == nil { + t.Fatal("Expected target file to find a source match") + } + + // The source should be the old file (which has "real/file.txt" as one of its paths) + if len(targetInfo.source.file.paths) < 2 { + t.Fatal("Expected source file to have multiple paths") + } + foundRealPath := false + for _, p := range targetInfo.source.file.paths { + if p == "real/file.txt" { + foundRealPath = true + break + } + } + if !foundRealPath { + t.Error("Expected source file to have 'real/file.txt' in its paths") + } + + // The primary path (paths[0]) should be the sha256 path (first regular file entry) + if targetInfo.source.file.paths[0] != "blobs/sha256/abc123" { + t.Errorf("Expected primary source path to be 'blobs/sha256/abc123', got %q", targetInfo.source.file.paths[0]) + } +} diff --git a/pkg/tar-diff/diff.go b/pkg/tar-diff/diff.go index 9ee2c1c..6464cbd 100644 --- a/pkg/tar-diff/diff.go +++ b/pkg/tar-diff/diff.go @@ -3,6 +3,7 @@ package tardiff import ( "archive/tar" "bytes" + "fmt" "io" "log" @@ -70,7 +71,7 @@ func (g *deltaGenerator) generateForFileWithBsdiff(info *targetInfo) error { file := info.file source := info.source - err := g.deltaWriter.SetCurrentFile(source.file.path) + err := g.deltaWriter.SetCurrentFile(source.file.paths[0]) if err != nil { return err } @@ -104,7 +105,7 @@ func (g *deltaGenerator) generateForFileWithrollsums(info *targetInfo) error { matches := info.rollsumMatches.matches pos := int64(0) - err := g.deltaWriter.SetCurrentFile(source.file.path) + err := g.deltaWriter.SetCurrentFile(source.file.paths[0]) if err != nil { return err } @@ -160,7 +161,7 @@ func (g *deltaGenerator) generateForFile(info *targetInfo) error { switch { case sourceFile.sha1 == file.sha1 && sourceFile.size == file.size: // Reuse exact file from old tar - if err := g.deltaWriter.WriteOldFile(sourceFile.path, uint64(sourceFile.size)); err != nil { + if err := 
g.deltaWriter.WriteOldFile(sourceFile.paths[0], uint64(sourceFile.size)); err != nil { return err } @@ -261,6 +262,7 @@ func generateDelta(newFile io.ReadSeeker, deltaFile io.Writer, analysis *deltaAn type Options struct { compressionLevel int maxBsdiffSize int64 + sourcePrefixes []string } // SetCompressionLevel sets the compression level for the output diff file. @@ -273,25 +275,41 @@ func (o *Options) SetMaxBsdiffFileSize(maxBsdiffSize int64) { o.maxBsdiffSize = maxBsdiffSize } +// SetSourcePrefixes sets path prefixes to filter which source files can be used for delta. +// Only files whose primary path starts with one of these prefixes will be used as delta sources. +func (o *Options) SetSourcePrefixes(prefixes []string) { + o.sourcePrefixes = prefixes +} + // NewOptions creates a new Options struct with default values. func NewOptions() *Options { return &Options{ compressionLevel: 3, maxBsdiffSize: defaultMaxBsdiffSize, + sourcePrefixes: nil, } } -// Diff creates a binary difference between two tar archives. 
-func Diff(oldTarFile io.ReadSeeker, newTarFile io.ReadSeeker, diffFile io.Writer, options *Options) error { +// Diff creates a binary difference between a set of tar archives and a new tar archive +// oldTarFiles contains one or more old tar files, in extraction order +func Diff(oldTarFiles []io.ReadSeeker, newTarFile io.ReadSeeker, diffFile io.Writer, options *Options) error { if options == nil { options = NewOptions() } - // First analyze both tarfiles by themselves - oldInfo, err := analyzeTar(oldTarFile) - if err != nil { - return err + if len(oldTarFiles) == 0 { + return fmt.Errorf("at least one old tar file is required") + } + + // First analyze all tarfiles by themselves + oldInfos := make([]*tarInfo, len(oldTarFiles)) + for i, oldTarFile := range oldTarFiles { + oldInfo, err := analyzeTar(oldTarFile) + if err != nil { + return err + } + oldInfos[i] = oldInfo } newInfo, err := analyzeTar(newTarFile) @@ -300,9 +318,11 @@ func Diff(oldTarFile io.ReadSeeker, newTarFile io.ReadSeeker, diffFile io.Writer } // Reset tar.gz for re-reading - _, err = oldTarFile.Seek(0, 0) - if err != nil { - return err + for _, oldTarFile := range oldTarFiles { + _, err = oldTarFile.Seek(0, 0) + if err != nil { + return err + } } _, err = newTarFile.Seek(0, 0) if err != nil { @@ -310,7 +330,7 @@ func Diff(oldTarFile io.ReadSeeker, newTarFile io.ReadSeeker, diffFile io.Writer } // Compare new and old for delta information - analysis, err := analyzeForDelta(oldInfo, newInfo, oldTarFile) + analysis, err := analyzeForDelta(oldInfos, newInfo, oldTarFiles, options) if err != nil { return err } diff --git a/pkg/tar-diff/diff_test.go b/pkg/tar-diff/diff_test.go index e8fc674..946fe4f 100644 --- a/pkg/tar-diff/diff_test.go +++ b/pkg/tar-diff/diff_test.go @@ -3,6 +3,7 @@ package tardiff import ( "archive/tar" "bytes" + "io" "testing" ) @@ -47,7 +48,7 @@ func TestGenerateDelta_Hardlinks(t *testing.T) { if _, err := oldTar.Seek(0, 0); err != nil { t.Fatalf("oldTar.Seek: %v", err) } - analysis, 
err := analyzeForDelta(oldInfo, newInfo, oldTar) + analysis, err := analyzeForDelta([]*tarInfo{oldInfo}, newInfo, []io.ReadSeeker{oldTar}, nil) if err != nil { t.Fatalf("analyzeForDelta failed: %v", err) } @@ -135,7 +136,7 @@ func TestGenerateDelta_MixedHardlinksAndDuplicates(t *testing.T) { if _, err := oldTar.Seek(0, 0); err != nil { t.Fatalf("oldTar.Seek: %v", err) } - analysis, err := analyzeForDelta(oldInfo, newInfo, oldTar) + analysis, err := analyzeForDelta([]*tarInfo{oldInfo}, newInfo, []io.ReadSeeker{oldTar}, nil) if err != nil { t.Fatalf("analyzeForDelta failed: %v", err) } diff --git a/pkg/tar-diff/multifile_test.go b/pkg/tar-diff/multifile_test.go new file mode 100644 index 0000000..f3a964e --- /dev/null +++ b/pkg/tar-diff/multifile_test.go @@ -0,0 +1,130 @@ +package tardiff + +import ( + "archive/tar" + "testing" +) + +func TestBuildSourceInfos(t *testing.T) { + // Create two tar infos + tar1Entries := []tarEntry{ + {name: "file1.txt", typeflag: tar.TypeReg, data: []byte("content1")}, + {name: "file2.txt", typeflag: tar.TypeReg, data: []byte("content2")}, + } + tar1, err := createTestTar(tar1Entries) + if err != nil { + t.Fatalf("Failed to create tar1: %v", err) + } + + tar2Entries := []tarEntry{ + {name: "file2.txt", typeflag: tar.TypeReg, data: []byte("content2-override")}, + {name: "file3.txt", typeflag: tar.TypeReg, data: []byte("content3")}, + } + tar2, err := createTestTar(tar2Entries) + if err != nil { + t.Fatalf("Failed to create tar2: %v", err) + } + + info1, err := analyzeTar(tar1) + if err != nil { + t.Fatalf("Failed to analyze tar1: %v", err) + } + + info2, err := analyzeTar(tar2) + if err != nil { + t.Fatalf("Failed to analyze tar2: %v", err) + } + + sourceInfos := buildSourceInfos([]*tarInfo{info1, info2}) + + // Should have 4 files total (file1, file2-orig, file2-override, file3) + // But file2-orig should be marked as overwritten + if len(sourceInfos) != 4 { + t.Fatalf("Expected 4 source infos, got %d", len(sourceInfos)) + } + + // 
Check that first file2 is marked as overwritten + var file2FromTar1 *sourceInfo + var file2FromTar2 *sourceInfo + for i := range sourceInfos { + s := &sourceInfos[i] + if s.file.paths[0] == "file2.txt" { + if s.sourceTarFileIndex == 0 { + file2FromTar1 = s + } else if s.sourceTarFileIndex == 1 { + file2FromTar2 = s + } + } + } + + if file2FromTar1 == nil { + t.Fatal("file2.txt from tar1 not found") + } + if file2FromTar2 == nil { + t.Fatal("file2.txt from tar2 not found") + } + + if !file2FromTar1.file.overwritten { + t.Error("file2.txt from tar1 should be marked as overwritten") + } + if file2FromTar2.file.overwritten { + t.Error("file2.txt from tar2 should NOT be marked as overwritten") + } +} + +func TestBuildSourceInfos_HardlinkConflicts(t *testing.T) { + // Layer 1: + // * sha256 file with hardlink to real name, + // * a file that will be overwritten via a hardlink + tar1Entries := []tarEntry{ + {name: "blobs/sha256/abc123", typeflag: tar.TypeReg, data: []byte("version1")}, + {name: "files/app.bin", typeflag: tar.TypeLink, linkname: "blobs/sha256/abc123"}, + {name: "files/replace-me", typeflag: tar.TypeReg, data: []byte("version1")}, + } + tar1, err := createTestTar(tar1Entries) + if err != nil { + t.Fatalf("Failed to create tar1: %v", err) + } + + // Layer 2: + // * different sha256 file with same hardlink name, will not overwrite old blob + // * a hardlink that overwrites replace-me + tar2Entries := []tarEntry{ + {name: "blobs/sha256/def456", typeflag: tar.TypeReg, data: []byte("version2")}, + {name: "files/app.bin", typeflag: tar.TypeLink, linkname: "blobs/sha256/def456"}, + {name: "files/other-file", typeflag: tar.TypeReg, data: []byte("version1")}, + {name: "files/replace-me", typeflag: tar.TypeLink, linkname: "files/other-file"}, + } + tar2, err := createTestTar(tar2Entries) + if err != nil { + t.Fatalf("Failed to create tar2: %v", err) + } + + info1, err := analyzeTar(tar1) + if err != nil { + t.Fatalf("Failed to analyze tar1: %v", err) + } + + info2, 
err := analyzeTar(tar2) + if err != nil { + t.Fatalf("Failed to analyze tar2: %v", err) + } + + sourceInfos := buildSourceInfos([]*tarInfo{info1, info2}) + + // Should have 4 files (two from each layer) + if len(sourceInfos) != 4 { + t.Fatalf("Expected 4 source infos, got %d", len(sourceInfos)) + } + + // The layer 1 blob file should not be marked as overwritten, even though its hardlink path + // "files/app.bin" conflicts with layer 2's hardlink path. + if sourceInfos[0].file.overwritten { + t.Error("Layer 1 file should not be marked as overwritten due to hardlink path conflict") + } + + // But the replace-me file should be overwritten, by the replace-me hardlink + if !sourceInfos[1].file.overwritten { + t.Error("Layer 1 replace-me file should be marked as overwritten by the layer 2 hardlink") + } +} diff --git a/pkg/tar-diff/prefix_filter_test.go b/pkg/tar-diff/prefix_filter_test.go new file mode 100644 index 0000000..a63b0e4 --- /dev/null +++ b/pkg/tar-diff/prefix_filter_test.go @@ -0,0 +1,178 @@ +package tardiff + +import ( + "archive/tar" + "io" + "testing" +) + +func TestMatchesAnyPrefix(t *testing.T) { + tests := []struct { + path string + prefixes []string + want bool + }{ + {"blobs/sha256/abc123", []string{"blobs/"}, true}, + {"config/app.conf", []string{"blobs/"}, false}, + {"data/file.txt", []string{"blobs/"}, false}, + {"blobs/sha256/abc123", []string{"blobs/", "config/"}, true}, + {"config/app.conf", []string{"blobs/", "config/"}, true}, + {"data/file.txt", []string{"blobs/", "config/"}, false}, + {"anything", []string{}, true}, // empty prefixes means match all + {"anything", nil, true}, // nil prefixes means match all + } + + for _, tt := range tests { + got := matchesAnyPrefix(tt.path, tt.prefixes) + if got != tt.want { + t.Errorf("matchesAnyPrefix(%q, %v) = %v, want %v", tt.path, tt.prefixes, got, tt.want) + } + } +} + +func setupPrefixFilterTestData(t *testing.T) (oldTar io.ReadSeeker, oldTarInfo *tarInfo, newTar io.ReadSeeker, newInfo *tarInfo) { 
+ oldEntries := []tarEntry{ + {name: "blobs/sha256/abc123", typeflag: tar.TypeReg, data: []byte("blob-content")}, + {name: "config/app.conf", typeflag: tar.TypeReg, data: []byte("config-v1")}, + {name: "data/file.txt", typeflag: tar.TypeReg, data: []byte("data-v1")}, + } + oldTar, err := createTestTar(oldEntries) + if err != nil { + t.Fatalf("Failed to create oldTar: %v", err) + } + + // New tar: same names, but all files modified + newEntries := []tarEntry{ + {name: "blobs/sha256/abc123", typeflag: tar.TypeReg, data: []byte("blob-content-modified")}, + {name: "config/app.conf", typeflag: tar.TypeReg, data: []byte("config-v2")}, + {name: "data/file.txt", typeflag: tar.TypeReg, data: []byte("data-v2")}, + } + newTar, err = createTestTar(newEntries) + if err != nil { + t.Fatalf("Failed to create new tar: %v", err) + } + + oldTarInfo, err = analyzeTar(oldTar) + if err != nil { + t.Fatalf("Failed to analyze oldTar: %v", err) + } + if _, err := oldTar.Seek(0, 0); err != nil { + t.Fatalf("oldTar.Seek: %v", err) + } + + newInfo, err = analyzeTar(newTar) + if err != nil { + t.Fatalf("Failed to analyze new tar: %v", err) + } + if _, err := newTar.Seek(0, 0); err != nil { + t.Fatalf("newTar.Seek: %v", err) + } + + return oldTar, oldTarInfo, newTar, newInfo +} + +func TestDiff_SourcePrefix(t *testing.T) { + old, oldInfo, _, newInfo := setupPrefixFilterTestData(t) + + options := NewOptions() + options.SetSourcePrefixes([]string{"blobs/"}) + + analysis, err := analyzeForDelta([]*tarInfo{oldInfo}, newInfo, []io.ReadSeeker{old}, options) + if err != nil { + t.Fatalf("analyzeForDelta failed: %v", err) + } + defer func() { _ = analysis.Close() }() + + // Verify that only files with blobs/ prefix can be used as delta sources + if len(analysis.targetInfos) != 3 { + t.Fatalf("Expected 3 target infos, got %d", len(analysis.targetInfos)) + } + + for i := range analysis.targetInfos { + target := &analysis.targetInfos[i] + if target.file == nil { + continue + } + + fileName := 
target.file.paths[0] + source := target.source + + switch fileName { + case "blobs/sha256/abc123": + // Should have a source (matches prefix) + if source == nil { + t.Error("blobs/sha256/abc123 should have a source (matches prefix)") + } else if !source.usedForDelta { + t.Error("blobs/sha256/abc123 source should be usedForDelta") + } + + default: + // Should NOT have a source (doesn't match prefix) + if source != nil { + t.Errorf("%s should NOT have a source (doesn't match prefix)", fileName) + } + } + } +} + +func TestDiff_SourceMultiplePrefixes(t *testing.T) { + old, oldInfo, _, newInfo := setupPrefixFilterTestData(t) + + options := NewOptions() + options.SetSourcePrefixes([]string{"blobs/", "config/"}) + + analysis, err := analyzeForDelta([]*tarInfo{oldInfo}, newInfo, []io.ReadSeeker{old}, options) + if err != nil { + t.Fatalf("analyzeForDelta failed: %v", err) + } + defer func() { _ = analysis.Close() }() + + // Verify correct filtering + for i := range analysis.targetInfos { + target := &analysis.targetInfos[i] + if target.file == nil { + continue + } + + fileName := target.file.paths[0] + source := target.source + + switch fileName { + case "blobs/sha256/abc123", "config/app.conf": + // Should have a source (matches one of the prefixes) + if source == nil { + t.Errorf("%s should have a source (matches prefix)", fileName) + } + + default: + // Should NOT have a source (doesn't match any prefix) + if source != nil { + t.Error("data/file.txt should NOT have a source (doesn't match any prefix)") + } + } + } +} + +func TestDiff_NoPrefixFilter(t *testing.T) { + old, oldInfo, _, newInfo := setupPrefixFilterTestData(t) + + // No prefix filter (default) - pass nil for default options + analysis, err := analyzeForDelta([]*tarInfo{oldInfo}, newInfo, []io.ReadSeeker{old}, nil) + if err != nil { + t.Fatalf("analyzeForDelta failed: %v", err) + } + defer func() { _ = analysis.Close() }() + + // All files should have sources + sourcesFound := 0 + for i := range 
analysis.targetInfos { + target := &analysis.targetInfos[i] + if target.file != nil && target.source != nil { + sourcesFound++ + } + } + + if sourcesFound != 3 { + t.Errorf("Expected 3 files to have sources, got %d", sourcesFound) + } +}