fileprep/errors.go at main · nao1215/fileprep · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
package fileprep

import (
	"errors"
	"fmt"
	"strings"

	"github.com/nao1215/fileparser"
)

// Sentinel errors for fileprep. They are plain errors.New values, so callers
// should compare against them with errors.Is rather than by message text.
var (
	// ErrStructSlicePointer is returned when the value is not a pointer to a struct slice.
	ErrStructSlicePointer = errors.New("value must be a pointer to a struct slice")
	// ErrUnsupportedFileType is returned when the file type is not supported.
	// wrapParseError also wraps fileparser's "unsupported file type" error with it.
	ErrUnsupportedFileType = errors.New("unsupported file type")
	// ErrEmptyFile is returned when the file is empty. wrapParseError wraps the
	// fileparser messages that indicate empty input with this sentinel.
	ErrEmptyFile = errors.New("file is empty")
	// ErrInvalidTagFormat is returned when the tag format is invalid.
	ErrInvalidTagFormat = errors.New("invalid tag format")
	// ErrInvalidJSONAfterPrep is returned when preprocessing destroys JSON structure
	// in the "data" column of a JSON/JSONL file. This is a hard error because
	// invalid JSON lines in JSONL output cause downstream parsers to fail.
	ErrInvalidJSONAfterPrep = errors.New("preprocessing produced invalid JSON")
	// ErrEmptyJSONOutput is returned when all JSON/JSONL rows are empty or invalid
	// after preprocessing, resulting in no output lines. An empty JSONL output is
	// unparseable by downstream consumers.
	ErrEmptyJSONOutput = errors.New("JSON/JSONL output has no valid rows after preprocessing")
	// ErrNilWriter is returned when a nil io.Writer is passed to ProcessToWriter.
	ErrNilWriter = errors.New("writer must not be nil")
	// ErrNilReader is returned when a nil io.Reader is passed to Process or ProcessToWriter.
	// wrapParseError also wraps fileparser's "reader cannot be nil" error with it.
	ErrNilReader = errors.New("reader must not be nil")
)

// ValidationError describes one value that failed a validation rule, along
// with the row and column it came from.
//
// Example:
//
//	for _, ve := range result.ValidationErrors() {
//	    fmt.Printf("Row %d, Column %q: %s (value=%q)\n",
//	        ve.Row, ve.Column, ve.Message, ve.Value)
//	}
type ValidationError struct {
	// Row is the 1-based row number (excluding header).
	Row int
	// Column is the column name.
	Column string
	// Field is the struct field name.
	Field string
	// Value is the value that failed validation.
	Value string
	// Tag is the validation tag that failed.
	Tag string
	// Message is the human-readable error message.
	Message string
}

// Error implements the error interface. The rendered text contains the row,
// the quoted column, the field, the message, the quoted value and the tag.
func (e *ValidationError) Error() string {
	const layout = "row %d, column %q (field %s): %s (value=%q, tag=%s)"
	return fmt.Sprintf(layout, e.Row, e.Column, e.Field, e.Message, e.Value, e.Tag)
}

// newValidationError allocates a ValidationError and copies every argument
// into the field of the same name.
func newValidationError(row int, column, field, value, tag, message string) *ValidationError {
	ve := new(ValidationError)
	ve.Row = row
	ve.Column = column
	ve.Field = field
	ve.Value = value
	ve.Tag = tag
	ve.Message = message
	return ve
}

// PrepError describes a preprocessing step that failed for a particular
// row and column.
//
// Example:
//
//	for _, pe := range result.PrepErrors() {
//	    fmt.Printf("Row %d, Column %q: %s (tag=%q)\n",
//	        pe.Row, pe.Column, pe.Message, pe.Tag)
//	}
type PrepError struct {
	// Row is the 1-based row number.
	Row int
	// Column is the column name.
	Column string
	// Field is the struct field name.
	Field string
	// Tag is the prep tag that failed.
	Tag string
	// Message is the human-readable error message.
	Message string
}

// Error implements the error interface. The rendered text contains the row,
// the quoted column, the field, the message and the tag.
func (e *PrepError) Error() string {
	const layout = "row %d, column %q (field %s): prep error - %s (tag=%s)"
	return fmt.Sprintf(layout, e.Row, e.Column, e.Field, e.Message, e.Tag)
}

// newPrepError allocates a PrepError and copies every argument into the
// field of the same name.
func newPrepError(row int, column, field, tag, message string) *PrepError {
	pe := new(PrepError)
	pe.Row = row
	pe.Column = column
	pe.Field = field
	pe.Tag = tag
	pe.Message = message
	return pe
}

// ProcessResult contains the results of processing a file: the collected
// errors, row counters, header columns, and the detected input format.
//
// Check the returned err before using the result: the methods dereference
// the receiver, so they must not be called on a nil *ProcessResult.
//
// Example:
//
//	reader, result, err := processor.Process(input, &records)
//	if err != nil {
//	    return err
//	}
//	if result.HasErrors() {
//	    for _, ve := range result.ValidationErrors() {
//	        fmt.Printf("Row %d: %s\n", ve.Row, ve.Message)
//	    }
//	}
//	fmt.Printf("Valid: %d/%d rows\n", result.ValidRowCount, result.RowCount)
type ProcessResult struct {
	// Errors contains all validation and preprocessing errors.
	// Use ValidationErrors or PrepErrors to filter by error type.
	Errors []error
	// RowCount is the total number of data rows processed (excluding header).
	RowCount int
	// ValidRowCount is the number of rows that passed all validations.
	ValidRowCount int
	// Columns contains the column names from the header.
	Columns []string
	// OriginalFormat is the file type that was processed.
	OriginalFormat fileparser.FileType

	// validRecords holds rows that passed validation when validRowsOnly is
	// enabled. This is an internal field used between processRecords and
	// the output stage; it is cleared after output is written.
	validRecords [][]string
}

// InvalidRowCount reports how many rows failed validation, computed as
// RowCount minus ValidRowCount.
func (r *ProcessResult) InvalidRowCount() int {
	total, valid := r.RowCount, r.ValidRowCount
	return total - valid
}

// HasErrors reports whether Errors holds at least one entry.
func (r *ProcessResult) HasErrors() bool {
	return len(r.Errors) != 0
}

// ValidationErrors returns, in their original order, the entries of Errors
// that errors.As can extract as a *ValidationError. The result is nil when
// there are none.
func (r *ProcessResult) ValidationErrors() []*ValidationError {
	var matched []*ValidationError
	for i := range r.Errors {
		if target := (*ValidationError)(nil); errors.As(r.Errors[i], &target) {
			matched = append(matched, target)
		}
	}
	return matched
}

// PrepErrors returns, in their original order, the entries of Errors that
// errors.As can extract as a *PrepError. The result is nil when there are
// none.
func (r *ProcessResult) PrepErrors() []*PrepError {
	var matched []*PrepError
	for i := range r.Errors {
		if target := (*PrepError)(nil); errors.As(r.Errors[i], &target) {
			matched = append(matched, target)
		}
	}
	return matched
}

// emptyFileMessages lists the exact error messages returned by
// fileparser v0.5.1 that indicate the input file/data is empty.
// These are matched exactly (case-insensitive) to avoid false positives:
// wrapParseError lowercases the incoming message before the lookup, so
// every key here must be written in lowercase.
//
// fileparser generates some messages dynamically via fmt.Errorf:
//   - "empty %s data" where %s is CSV, TSV, JSON, JSONL, LTSV etc.
//   - "empty parquet file", "empty XLSX sheet"
//
// This table holds only the static messages. The dynamic "empty ... data"
// and "empty ... array" patterns are matched separately in wrapParseError.
var emptyFileMessages = map[string]struct{}{ //nolint:gochecknoglobals // constant-like lookup table
	"empty parquet file":           {},
	"empty xlsx sheet":             {},
	"no valid ltsv records found":  {},
	"no sheets found in xlsx file": {},
	"no headers found in xlsx":     {},
}

// wrapParseError wraps errors returned by fileparser.Parse with the
// appropriate fileprep sentinel error so that callers can use errors.Is.
// fileparser does not export sentinel errors, so we match on the exact
// error message text from fileparser v0.5.1.
func wrapParseError(err error) error {
	if err == nil {
		return nil
	}
	msg := strings.ToLower(err.Error())

	if msg == "unsupported file type" {
		return fmt.Errorf("%w: %w", ErrUnsupportedFileType, err)
	}
	if msg == "reader cannot be nil" {
		return fmt.Errorf("%w: %w", ErrNilReader, err)
	}
	if _, ok := emptyFileMessages[msg]; ok {
		return fmt.Errorf("%w: %w", ErrEmptyFile, err)
	}
	// Match "empty <format> data" pattern (e.g. "empty csv data", "empty json data")
	if strings.HasPrefix(msg, "empty ") && strings.HasSuffix(msg, " data") {
		return fmt.Errorf("%w: %w", ErrEmptyFile, err)
	}
	// Match "empty <format> array" pattern (e.g. "empty json array")
	if strings.HasPrefix(msg, "empty ") && strings.HasSuffix(msg, " array") {
		return fmt.Errorf("%w: %w", ErrEmptyFile, err)
	}
	return err
}