Skip to content

Commit 9fb3ee1

Browse files
committed
updated tmpfile to prefer disk-based temp files.\
Fixes #9
1 parent d065bf9 commit 9fb3ee1

File tree

12 files changed

+728
-130
lines changed

12 files changed

+728
-130
lines changed

.cspell.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,9 @@
3232
"strconv",
3333
"tempfile",
3434
"tempfiles",
35-
"timsort"
35+
"timsort",
36+
"tmpfs",
37+
"writability"
3638
],
3739
"ignoreWords": [],
3840
"import": []

README.md

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -185,12 +185,18 @@ config := &extsort.Config{
185185
NumWorkers: 4, // Parallel sorting/merging workers (default: 2)
186186
ChanBuffSize: 10, // Channel buffer size (default: 1)
187187
SortedChanBuffSize: 1000, // Output channel buffer (default: 1000)
188-
TempFilesDir: "/tmp", // Temporary files directory (default: OS temp)
188+
TempFilesDir: "/var/tmp", // Temporary files directory (default: intelligent selection)
189189
}
190190

191191
sorter, outputChan, errChan := extsort.Ordered(inputChan, config)
192192
```
193193

194+
### Temporary Directory Selection
195+
196+
When `TempFilesDir` is empty (default), the library intelligently selects temporary directories that prefer disk-backed locations over potentially memory-backed filesystems. On Linux systems where `/tmp` may be mounted as tmpfs (memory-backed), this helps prevent out-of-memory issues when sorting datasets larger than available RAM.
197+
198+
**For production use with large datasets, it's recommended to explicitly set `TempFilesDir` to a known disk-backed directory** (such as `/var/tmp` on Unix systems) to ensure optimal performance and avoid memory limitations.
199+
194200
## Legacy Interface-Based API
195201

196202
The library maintains backward compatibility with the original interface-based API:
@@ -456,7 +462,10 @@ func main() {
456462

457463
- **Memory Usage**: Configure `ChunkSize` based on available memory (larger chunks = less I/O, more memory)
458464
- **Parallelism**: Increase `NumWorkers` on multi-core systems
459-
- **I/O Performance**: Use fast storage for `TempFilesDir` (SSD recommended for large datasets)
465+
- **Temporary Storage**:
466+
- Explicitly set `TempFilesDir` to a known disk-backed directory for large datasets
467+
- On Linux, prefer `/var/tmp` over `/tmp` (which may be tmpfs/memory-backed)
468+
- Use fast storage (SSD recommended) for temporary files
460469
- **Channel Buffers**: Tune buffer sizes based on your producer/consumer patterns
461470

462471
## Error Handling

config.go

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,17 @@ type Config struct {
2424
SortedChanBuffSize int
2525

2626
// TempFilesDir specifies the directory for temporary files during sorting.
27-
// Empty string uses the OS default temporary directory (e.g., /tmp on Unix).
28-
// Default: "" (OS default).
27+
// When empty (default), the library uses intelligent directory selection that
28+
// prefers disk-backed locations over potentially memory-backed filesystems
29+
// (like tmpfs on Linux). This helps prevent out-of-memory issues when sorting
30+
// datasets larger than available RAM.
31+
//
32+
// For production use with large datasets, it's recommended to explicitly set
33+
// this to a known disk-backed directory (such as "/var/tmp" on Unix systems)
34+
// to ensure optimal performance and avoid memory limitations. On Linux systems,
35+
// prefer "/var/tmp" over "/tmp" since "/tmp" may be mounted as tmpfs.
36+
//
37+
// Default: "" (intelligent selection).
2938
TempFilesDir string
3039
}
3140

sort_generic.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ func (s *GenericSorter[E]) initMemoryPools() *memoryPools {
159159
func Generic[E any](input <-chan E, fromBytes FromBytesGeneric[E], toBytes ToBytesGeneric[E], compareFunc CompareGeneric[E], config *Config) (*GenericSorter[E], <-chan E, <-chan error) {
160160
var err error
161161
s := newSorter(input, fromBytes, toBytes, compareFunc, config)
162-
s.tempWriter, err = tempfile.New(s.config.TempFilesDir)
162+
s.tempWriter, err = tempfile.New(s.config.TempFilesDir, true)
163163
if err != nil {
164164
s.mergeErrChan <- err
165165
close(s.mergeErrChan)

tempfile/cleanup_test.go

Lines changed: 7 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ package tempfile_test
22

33
import (
44
"os"
5-
"path/filepath"
65
"runtime"
76
"testing"
87

@@ -14,7 +13,7 @@ import (
1413
func TestRobustCleanup(t *testing.T) {
1514
// Test normal cleanup path
1615
t.Run("NormalCleanup", func(t *testing.T) {
17-
writer, err := tempfile.New("")
16+
writer, err := tempfile.New("", true)
1817
if err != nil {
1918
t.Fatalf("Failed to create temp file: %v", err)
2019
}
@@ -62,7 +61,7 @@ func TestRobustCleanup(t *testing.T) {
6261

6362
// Test cleanup when writer is closed directly (abort case)
6463
t.Run("WriterAbortCleanup", func(t *testing.T) {
65-
writer, err := tempfile.New("")
64+
writer, err := tempfile.New("", true)
6665
if err != nil {
6766
t.Fatalf("Failed to create temp file: %v", err)
6867
}
@@ -89,40 +88,27 @@ func TestRobustCleanup(t *testing.T) {
8988
})
9089
}
9190

92-
// TestCleanupBehaviorDifferences documents the platform differences
91+
// TestCleanupBehaviorDifferences verifies platform-specific cleanup behavior
9392
func TestCleanupBehaviorDifferences(t *testing.T) {
94-
writer, err := tempfile.New("")
93+
writer, err := tempfile.New("", true)
9594
if err != nil {
9695
t.Fatalf("Failed to create temp file: %v", err)
9796
}
9897

9998
filename := writer.Name()
100-
tempDir := filepath.Dir(filename)
10199

102100
// Check if file exists initially
103101
_, err = os.Stat(filename)
104102
initialExists := err == nil
105103

106-
t.Logf("Platform: %s", runtime.GOOS)
107-
t.Logf("Temp file: %s", filename)
108-
t.Logf("Temp dir: %s", tempDir)
109-
t.Logf("File exists after creation: %t", initialExists)
110-
111-
if runtime.GOOS != "windows" {
112-
// On Unix, file should be unlinked immediately, so stat should fail
113-
if initialExists {
114-
t.Logf("NOTE: File still visible after creation (expected on some systems)")
115-
} else {
116-
t.Logf("File unlinked immediately after creation (Unix behavior)")
117-
}
118-
} else {
104+
if runtime.GOOS == "windows" {
119105
// On Windows, file should exist until explicitly closed
120106
if !initialExists {
121107
t.Errorf("Expected file to exist on Windows after creation")
122-
} else {
123-
t.Logf("File exists until close (Windows behavior)")
124108
}
125109
}
110+
// On Unix, file may or may not be visible due to immediate unlinking
111+
// This is implementation detail and both behaviors are acceptable
126112

127113
// Clean up
128114
_ = writer.Close()

tempfile/directory_test.go

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
package tempfile_test
2+
3+
import (
4+
"os"
5+
"path/filepath"
6+
"testing"
7+
8+
"github.com/lanrat/extsort/tempfile"
9+
)
10+
11+
func TestDirectorySharing(t *testing.T) {
12+
// Test that multiple writers share the same directory when using intelligent directory selection
13+
14+
// Create first writer - this will use the intelligent directory selection
15+
writer1, err := tempfile.New("", true)
16+
if err != nil {
17+
t.Fatalf("Failed to create first tempfile writer: %v", err)
18+
}
19+
20+
// Get the directory that was actually used
21+
dir1 := filepath.Dir(writer1.Name())
22+
t.Logf("First writer using directory: %s", dir1)
23+
24+
// Create second writer with same parameters - should use same directory if it's cached
25+
writer2, err := tempfile.New("", true)
26+
if err != nil {
27+
t.Fatalf("Failed to create second tempfile writer: %v", err)
28+
}
29+
30+
dir2 := filepath.Dir(writer2.Name())
31+
t.Logf("Second writer using directory: %s", dir2)
32+
33+
// Both should use the same directory due to caching
34+
if dir1 != dir2 {
35+
t.Errorf("Expected both writers to use same directory, got %s and %s", dir1, dir2)
36+
}
37+
38+
// The directory should exist
39+
if _, err := os.Stat(dir1); os.IsNotExist(err) {
40+
t.Fatalf("Expected directory %s to exist", dir1)
41+
}
42+
43+
// Close first writer
44+
err = writer1.Close()
45+
if err != nil {
46+
t.Fatalf("Failed to close first writer: %v", err)
47+
}
48+
49+
// Directory should still exist
50+
if _, err := os.Stat(dir1); os.IsNotExist(err) {
51+
t.Errorf("Expected directory %s to still exist after closing first writer", dir1)
52+
}
53+
54+
// Close second writer
55+
err = writer2.Close()
56+
if err != nil {
57+
t.Fatalf("Failed to close second writer: %v", err)
58+
}
59+
60+
// Check final state - the behavior depends on whether we created the directory
61+
_, err = os.Stat(dir1)
62+
dirStillExists := !os.IsNotExist(err)
63+
64+
t.Logf("Directory %s still exists after all writers closed: %t", dir1, dirStillExists)
65+
66+
// This test documents the behavior rather than asserting specific cleanup
67+
// The important guarantee is that temp files are cleaned up, not necessarily the directories
68+
}
69+
70+
func TestDirectoryReferenceCountingCleanup(t *testing.T) {
71+
// Test directory reference counting behavior with user-specified directories
72+
73+
// Create a base test directory
74+
baseDir, err := os.MkdirTemp("", "extsort-cleanup-test-")
75+
if err != nil {
76+
t.Fatalf("Failed to create test base dir: %v", err)
77+
}
78+
defer os.RemoveAll(baseDir)
79+
80+
// Create a subdirectory that matches our process-specific pattern
81+
// This simulates what would happen in the fallback case
82+
testDir := filepath.Join(baseDir, "test-extsort-dir")
83+
84+
// Don't create the directory - let tempfile.New create it
85+
86+
// Create first writer in the test directory
87+
writer1, err := tempfile.New(testDir, true)
88+
if err != nil {
89+
t.Fatalf("Failed to create first tempfile writer: %v", err)
90+
}
91+
92+
// Verify directory was created
93+
if _, err := os.Stat(testDir); os.IsNotExist(err) {
94+
t.Fatalf("Expected directory %s to be created", testDir)
95+
}
96+
97+
// Create second writer in the same directory
98+
writer2, err := tempfile.New(testDir, true)
99+
if err != nil {
100+
t.Fatalf("Failed to create second tempfile writer: %v", err)
101+
}
102+
103+
// Both writers should be in the same directory
104+
dir1 := filepath.Dir(writer1.Name())
105+
dir2 := filepath.Dir(writer2.Name())
106+
if dir1 != testDir || dir2 != testDir {
107+
t.Errorf("Expected both writers in %s, got %s and %s", testDir, dir1, dir2)
108+
}
109+
110+
// Close first writer - directory should still exist
111+
err = writer1.Close()
112+
if err != nil {
113+
t.Fatalf("Failed to close first writer: %v", err)
114+
}
115+
116+
// Directory should still exist because second writer is using it
117+
if _, err := os.Stat(testDir); os.IsNotExist(err) {
118+
t.Errorf("Expected directory %s to still exist after closing first writer", testDir)
119+
}
120+
121+
// Close second writer
122+
err = writer2.Close()
123+
if err != nil {
124+
t.Fatalf("Failed to close second writer: %v", err)
125+
}
126+
127+
// Since this isn't a process-specific extsort directory, it should still exist
128+
// (We only clean up directories that match our specific naming pattern)
129+
if _, err := os.Stat(testDir); os.IsNotExist(err) {
130+
t.Errorf("Expected directory %s to still exist after closing all writers (not an extsort-specific dir)", testDir)
131+
}
132+
}

tempfile/mockfile.go

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -60,15 +60,18 @@ func (w *MockFileWriter) WriteString(s string) (int, error) {
6060
return w.data.WriteString(s)
6161
}
6262

63-
// Next stops writing the the current section/file and prepares the tempWriter for the next one
63+
// Next finalizes the current virtual file section and prepares for writing the next section.
64+
// It records the section boundary for later reading and returns the offset where the next section begins.
6465
func (w *MockFileWriter) Next() (int64, error) {
6566
// save offsets
6667
pos := w.data.Len()
6768
w.sections = append(w.sections, pos)
6869
return int64(pos), nil
6970
}
7071

71-
// Save stops allowing new writes and returns a TempReader for reading the data back
72+
// Save finalizes all virtual file sections and returns a TempReader for accessing the data.
73+
// After calling Save(), the MockFileWriter can no longer be used for writing.
74+
// The returned TempReader allows concurrent access to any virtual file section.
7275
func (w *MockFileWriter) Save() (TempReader, error) {
7376
_, err := w.Next()
7477
if err != nil {
@@ -77,6 +80,8 @@ func (w *MockFileWriter) Save() (TempReader, error) {
7780
return newMockTempReader(w.sections, w.data.Bytes())
7881
}
7982

83+
// newMockTempReader creates a TempReader from in-memory data with section boundaries.
84+
// This allows reading back data written by MockFileWriter in the same sectioned manner.
8085
func newMockTempReader(sections []int, data []byte) (*mockFileReader, error) {
8186
// create TempReader
8287
var r mockFileReader
@@ -94,19 +99,21 @@ func newMockTempReader(sections []int, data []byte) (*mockFileReader, error) {
9499
return &r, nil
95100
}
96101

97-
// Close does nothing much on a MockTempWriter
102+
// Close releases memory used by the mockFileReader.
103+
// This should be called after all reading operations are complete.
98104
func (r *mockFileReader) Close() error {
99105
r.readers = nil
100106
r.data = nil
101107
return nil
102108
}
103109

104-
// Size returns the number of sections/files in the reader
110+
// Size returns the number of virtual file sections available for reading.
105111
func (r *mockFileReader) Size() int {
106112
return len(r.readers)
107113
}
108114

109-
// Read returns a reader for the provided section
115+
// Read returns a buffered reader for the specified virtual file section.
116+
// Panics if the section index is out of range.
110117
func (r *mockFileReader) Read(i int) *bufio.Reader {
111118
if i < 0 || i >= len(r.readers) {
112119
panic("tempfile: read request out of range")

0 commit comments

Comments
 (0)