From 8d35248bb3c9c0c15319e70eeff5223bc5e1f61c Mon Sep 17 00:00:00 2001 From: Jeffrey Faer Date: Mon, 9 Mar 2026 10:41:21 -0600 Subject: [PATCH 1/5] fix: Apply regexes to the actual line content (separated by newlines) instead of the joinedLines. Now users can craft regexes that would match just the first line of the group or block (or any arbitrary line they desire) which is pretty hard to do right now. --- keepsorted/line_group.go | 48 +++++++++++++++++++++++++++++++++++----- keepsorted/options.go | 3 ++- 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/keepsorted/line_group.go b/keepsorted/line_group.go index a637de8..25f6fb9 100644 --- a/keepsorted/line_group.go +++ b/keepsorted/line_group.go @@ -119,7 +119,7 @@ func groupLines(lines []string, metadata blockMetadata) []*lineGroup { // Returns another boolean indicating whether the group should be ending // after that line if so. shouldAddToRegexDelimitedGroup := func(l string) (addToGroup bool, finishGroupAfter bool) { - if metadata.opts.GroupStartRegex != nil { + if metadata.opts.GroupStartRegex != nil { // For GroupStartRegex, all non-regex-matching lines should be // part of the group including prior lines. return !matchesAnyRegex(l, metadata.opts.GroupStartRegex), false @@ -349,7 +349,7 @@ func (cb *codeBlock) append(s string, opts blockOptions) { cb.braceCounts = make(map[string]int) } - // TODO(jfalgout): Does this need to handle runes more correctly? + // TODO: jfaer - Does this need to handle runes more correctly? for i := 0; i < len(s); { if cb.expectedQuote == "" { // We do not appear to be inside a string literal. @@ -419,8 +419,18 @@ func (lg *lineGroup) commentOnly() bool { } func (lg *lineGroup) regexTokens() []regexToken { - // TODO: jfaer - Should we match regexes on the original content? - regexMatches := lg.opts.matchRegexes(lg.internalJoinedLines()) + var regexMatches []regexMatch + if len(lg.opts.ByRegex) == 0 { + // We apply other options on top of what the regex extracts, but if the user + // didn't set by_regex, we should fall back to the behavior they would've + // expected before we started supporting by_regex. + // Namely: We would apply the other options on top of the + // internalJoinedLines() instead of the raw content. + regexMatches = []regexMatch{{lg.internalJoinedLines()}} + } else { + regexMatches = lg.opts.matchRegexes(lg.regexJoinedLines()) + } + ret := make([]regexToken, len(regexMatches)) if lg.access.regexTokens == nil { lg.access.regexTokens = make([]regexTokenAccessRecorder, len(regexMatches)) @@ -453,8 +463,15 @@ func (lg *lineGroup) regexTokens() []regexToken { return ret } -// internalJoinedLines calculates the same thing as joinedLines, except it -// doesn't record that it was used in the accessRecorder. +// internalJoinedLines attempts to concatenate all of this lineGroup's content +// the same way a reasonable human might. +// +// If the previous line ends with a "word" character and the current line starts +// with a "word" character, the two lines will be separated by a space. +// Otherwise, the lines are concatenated without any separation. +// +// Unlike joinedLines, this method does not record that it was used in the +// accessRecorder. func (lg *lineGroup) internalJoinedLines() string { if len(lg.lines) == 0 { return "" @@ -475,6 +492,25 @@ func (lg *lineGroup) internalJoinedLines() string { return s.String() } +// regexJoinedLines concatenates all of this lineGroup's content in a way that +// friendlier to regexes than internalJoinedLines. +// +// Primarily, this method still strips leading whitespace but it uses a real +// newline character to separate lines instead of the "word" character logic of +// internalJoinedLines. +func (lg *lineGroup) regexJoinedLines() string { + if len(lg.lines) == 0 { + return "" + } + lines := make([]string, len(lg.lines)) + for i, l := range lg.lines { + lines[i] = strings.TrimLeftFunc(l, unicode.IsSpace) + } + return strings.Join(lines, "\n") +} + +// joinedLines returns internalJoinedLines and records that it was called in the +// accessRecorder. func (lg *lineGroup) joinedLines() string { lg.access.joinedLines = true return lg.internalJoinedLines() diff --git a/keepsorted/options.go b/keepsorted/options.go index e36d20d..ea4e867 100644 --- a/keepsorted/options.go +++ b/keepsorted/options.go @@ -437,13 +437,14 @@ func (opts blockOptions) trimIgnorePrefix(s string) string { // the resulting slice. func (opts blockOptions) matchRegexes(s string) []regexMatch { if len(opts.ByRegex) == 0 { - return []regexMatch{{s}} + return nil } var ret []regexMatch for _, p := range opts.ByRegex { regex := p.Pattern + // TODO: jfaer - It'd be nice if these two branches were unified. if p.Template != nil { var result []byte m := regex.FindAllStringSubmatchIndex(s, -1) From d7a4f840d06ee37c57e0c39f03fee3272f8ca08c Mon Sep 17 00:00:00 2001 From: Jeffrey Faer Date: Mon, 9 Mar 2026 13:27:31 -0600 Subject: [PATCH 2/5] set (?s) on all by_regexes to minimize disruption. --- keepsorted/options.go | 23 ++++++++++++++++------- keepsorted/options_parser.go | 17 ++++++++++++++--- keepsorted/options_parser_test.go | 16 ++++++++-------- keepsorted/options_test.go | 17 +++++++++-------- 4 files changed, 47 insertions(+), 26 deletions(-) diff --git a/keepsorted/options.go b/keepsorted/options.go index ea4e867..c7b4b47 100644 --- a/keepsorted/options.go +++ b/keepsorted/options.go @@ -238,21 +238,26 @@ func formatValue(val reflect.Value) (string, error) { return strconv.Itoa(int(val.Int())), nil case reflect.TypeFor[[]ByRegexOption](): opts := val.Interface().([]ByRegexOption) - vals := make([]string, 0, len(opts)) + vals := make([]any, len(opts)) seenTemplate := false - for _, opt := range opts { + for i, opt := range opts { if opt.Template != nil { seenTemplate = true - vals = append(vals, fmt.Sprintf(`%q: %q`, opt.Pattern.String(), *opt.Template)) + vals[i] = map[string]string{opt.Pattern.String(): *opt.Template} continue } - vals = append(vals, opt.Pattern.String()) + vals[i] = opt.Pattern.String() } + if seenTemplate { - // always presented as a yaml sequence to preserve any `k:v` items - return fmt.Sprintf("[%s]", strings.Join(vals, ", ")), nil + return formatYAMLList(vals) } - return formatList(vals) + + s := make([]string, len(vals)) + for i, val := range vals { + s[i] = val.(string) + } + return formatList(s) case reflect.TypeFor[[]*regexp.Regexp](): regexps := val.Interface().([]*regexp.Regexp) vals := make([]string, len(regexps)) @@ -282,6 +287,10 @@ func formatList(vals []string) (string, error) { return strings.Join(vals, ","), nil } + return formatYAMLList(vals) +} + +func formatYAMLList[T any](vals []T) (string, error) { node := new(yaml.Node) if err := node.Encode(vals); err != nil { return "", fmt.Errorf("while converting list to YAML: %w", err) diff --git a/keepsorted/options_parser.go b/keepsorted/options_parser.go index 4943842..b87c7ca 100644 --- a/keepsorted/options_parser.go +++ b/keepsorted/options_parser.go @@ -125,7 +125,7 @@ func (p *parser) popIntOrBool() (IntOrBool, error) { func (ar *ByRegexOption) UnmarshalYAML(node *yaml.Node) error { switch node.Tag { case "!!str": - pat, err := regexp.Compile(node.Value) + pat, err := compileByRegex(node.Value) if err != nil { return err } @@ -141,7 +141,7 @@ func (ar *ByRegexOption) UnmarshalYAML(node *yaml.Node) error { return fmt.Errorf("by_regex map item must have exactly one key-value pair, but got %d", len(m)) } for pattern, template := range m { - pat, err := regexp.Compile(pattern) + pat, err := compileByRegex(pattern) if err != nil { return fmt.Errorf("invalid regex pattern %q: %w", pattern, err) } @@ -191,7 +191,7 @@ func (p *parser) popList() ([]string, error) { func (p *parser) popByRegexOption() ([]ByRegexOption, error) { return popListValue(p, func(s string) (ByRegexOption, error) { - pat, err := regexp.Compile(s) + pat, err := compileByRegex(s) return ByRegexOption{Pattern: pat}, err }) } @@ -321,3 +321,14 @@ func (iter *runeIter) pop() (rune, bool) { } return ch, ok } + +func compileByRegex(re string) (*regexp.Regexp, error) { + if !strings.HasPrefix(re, "(?s)") { + // The initial version of by_regex ran on top of lineGroup.joinedLines. This + // meant that users wrote regexes that didn't need to handle newlines. + // To minimize disruption, we automatically set dotall flag so that their + // regexes might still work after we start using real newlines. + re = "(?s)" + re + } + return regexp.Compile(re) +} diff --git a/keepsorted/options_parser_test.go b/keepsorted/options_parser_test.go index 284cab5..051a9a1 100644 --- a/keepsorted/options_parser_test.go +++ b/keepsorted/options_parser_test.go @@ -218,7 +218,7 @@ func TestPopValue(t *testing.T) { name: "Regex", input: ".*", - want: []ByRegexOption{{regexp.MustCompile(".*"), nil}}, + want: []ByRegexOption{{regexp.MustCompile("(?s).*"), nil}}, }, { name: "MultipleRegex", @@ -226,9 +226,9 @@ func TestPopValue(t *testing.T) { input: `[.*, abcd, '(?:efgh)ijkl']`, allowYAMLList: true, want: []ByRegexOption{ - {regexp.MustCompile(".*"), nil}, - {regexp.MustCompile("abcd"), nil}, - {regexp.MustCompile("(?:efgh)ijkl"), nil}, + {regexp.MustCompile("(?s).*"), nil}, + {regexp.MustCompile("(?s)abcd"), nil}, + {regexp.MustCompile("(?s)(?:efgh)ijkl"), nil}, }, }, { @@ -237,10 +237,10 @@ func TestPopValue(t *testing.T) { input: `[.*, Mon: 0, '\b(\d{2})/(\d{2})/(\d{4})\b': '${3}-${1}-${2}', "0: 1": 2]`, allowYAMLList: true, want: []ByRegexOption{ - {regexp.MustCompile(".*"), nil}, - {regexp.MustCompile("Mon"), &([]string{"0"})[0]}, - {regexp.MustCompile(`\b(\d{2})/(\d{2})/(\d{4})\b`), &([]string{"${3}-${1}-${2}"})[0]}, - {regexp.MustCompile(`0: 1`), &([]string{"2"})[0]}, + {regexp.MustCompile("(?s).*"), nil}, + {regexp.MustCompile("(?s)Mon"), &([]string{"0"})[0]}, + {regexp.MustCompile(`(?s)\b(\d{2})/(\d{2})/(\d{4})\b`), &([]string{"${3}-${1}-${2}"})[0]}, + {regexp.MustCompile(`(?s)0: 1`), &([]string{"2"})[0]}, }, }, { diff --git a/keepsorted/options_test.go b/keepsorted/options_test.go index dccfd75..20acd7c 100644 --- a/keepsorted/options_test.go +++ b/keepsorted/options_test.go @@ -194,7 +194,8 @@ func TestBlockOptions(t *testing.T) { want: blockOptions{ AllowYAMLLists: true, ByRegex: []ByRegexOption{ - {regexp.MustCompile("(?:abcd)"), nil}, {regexp.MustCompile("efg.*"), nil}, + {regexp.MustCompile("(?s)(?:abcd)"), nil}, + {regexp.MustCompile("(?s)efg.*"), nil}, }, }, }, @@ -206,8 +207,8 @@ func TestBlockOptions(t *testing.T) { want: blockOptions{ AllowYAMLLists: true, ByRegex: []ByRegexOption{ - {Pattern: regexp.MustCompile(`.*`)}, - {Pattern: regexp.MustCompile(`\b(\d{2})/(\d{2})/(\d{4})\b`), + {Pattern: regexp.MustCompile(`(?s).*`)}, + {Pattern: regexp.MustCompile(`(?s)\b(\d{2})/(\d{2})/(\d{4})\b`), Template: &[]string{"${3}-${1}-${2}"}[0]}, }, }, @@ -253,13 +254,13 @@ func TestBlockOptions(t *testing.T) { got, warns := parseBlockOptions(tc.commentMarker, tc.in, tc.defaultOptions) if err := errors.Join(warns...); err != nil { if tc.wantErr == "" { - t.Errorf("parseBlockOptions(%q, %q) = %v", tc.commentMarker, tc.in, err) + t.Errorf("parseBlockOptions(%#v, %#v) = %v", tc.commentMarker, tc.in, err) } else if !strings.Contains(err.Error(), tc.wantErr) { - t.Errorf("parseBlockOptions(%q, %q) = %v, expected to contain %q", tc.commentMarker, tc.in, err, tc.wantErr) + t.Errorf("parseBlockOptions(%#v, %#v) = %v, expected to contain %q", tc.commentMarker, tc.in, err, tc.wantErr) } } if diff := cmp.Diff(tc.want, got, cmp.AllowUnexported(blockOptions{}), cmpRegexp); diff != "" { - t.Errorf("parseBlockOptions(%q, %q) mismatch (-want +got):\n%s", tc.commentMarker, tc.in, diff) + t.Errorf("parseBlockOptions(%#v, %#v) mismatch (-want +got):\n%s", tc.commentMarker, tc.in, diff) } if tc.wantErr == "" { @@ -267,10 +268,10 @@ func TestBlockOptions(t *testing.T) { s := got.String() got2, warns := parseBlockOptions(tc.commentMarker, s, tc.defaultOptions) if err := errors.Join(warns...); err != nil { - t.Errorf("parseBlockOptions(%q, %q) = %v", tc.commentMarker, s, err) + t.Errorf("parseBlockOptions(%#v, %#v) = %v", tc.commentMarker, s, err) } if diff := cmp.Diff(got, got2, cmp.AllowUnexported(blockOptions{}), cmpRegexp); diff != "" { - t.Errorf("parseBlockOptions(%q, %q) mismatch (-want +got):\n%s", tc.commentMarker, s, diff) + t.Errorf("parseBlockOptions(%#v, %#v) mismatch (-want +got):\n%s", tc.commentMarker, s, diff) } }) } From 849e102e03eaab6b66f6dc506b750fa27a6bc9b9 Mon Sep 17 00:00:00 2001 From: Jeffrey Faer Date: Mon, 9 Mar 2026 16:01:37 -0600 Subject: [PATCH 3/5] fix the new test --- keepsorted/options_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/keepsorted/options_test.go b/keepsorted/options_test.go index abc7153..8ca18d7 100644 --- a/keepsorted/options_test.go +++ b/keepsorted/options_test.go @@ -221,8 +221,8 @@ func TestBlockOptions(t *testing.T) { want: blockOptions{ AllowYAMLLists: true, ByRegex: []ByRegexOption{ - {Pattern: regexp.MustCompile(`foo, bar`)}, - {Pattern: regexp.MustCompile(`\b(\d{2})/(\d{2})/(\d{4})\b`), + {Pattern: regexp.MustCompile(`(?s)foo, bar`)}, + {Pattern: regexp.MustCompile(`(?s)\b(\d{2})/(\d{2})/(\d{4})\b`), Template: &[]string{"${3}-${1}-${2}"}[0]}, }, }, From 90a0537177859076c9f7fb3445dc14c01c3854d7 Mon Sep 17 00:00:00 2001 From: Jeffrey Faer Date: Mon, 9 Mar 2026 16:07:58 -0600 Subject: [PATCH 4/5] also exempt regexes that start with (?-s). Strictly this isn't necessary since (?s)(?-s) would simplify to (?-s), but it feels nice to skip adding (?s) if it's obvious that's not wanted. --- keepsorted/options_parser.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keepsorted/options_parser.go b/keepsorted/options_parser.go index b87c7ca..2273631 100644 --- a/keepsorted/options_parser.go +++ b/keepsorted/options_parser.go @@ -323,7 +323,7 @@ func (iter *runeIter) pop() (rune, bool) { } func compileByRegex(re string) (*regexp.Regexp, error) { - if !strings.HasPrefix(re, "(?s)") { + if !strings.HasPrefix(re, "(?s)") && !strings.HasPrefix(re, "(?-s)") { // The initial version of by_regex ran on top of lineGroup.joinedLines. This // meant that users wrote regexes that didn't need to handle newlines. // To minimize disruption, we automatically set dotall flag so that their From 2075edf501d6ac590dbed5cbca4fb921cd97bca7 Mon Sep 17 00:00:00 2001 From: Jeffrey Faer Date: Mon, 9 Mar 2026 16:21:12 -0600 Subject: [PATCH 5/5] fix comment typo --- keepsorted/line_group.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keepsorted/line_group.go b/keepsorted/line_group.go index 25f6fb9..d45a740 100644 --- a/keepsorted/line_group.go +++ b/keepsorted/line_group.go @@ -492,7 +492,7 @@ func (lg *lineGroup) internalJoinedLines() string { return s.String() } -// regexJoinedLines concatenates all of this lineGroup's content in a way that +// regexJoinedLines concatenates all of this lineGroup's content in a way that's // friendlier to regexes than internalJoinedLines. // // Primarily, this method still strips leading whitespace but it uses a real