diff --git a/keepsorted/line_group.go b/keepsorted/line_group.go index a637de8..d45a740 100644 --- a/keepsorted/line_group.go +++ b/keepsorted/line_group.go @@ -119,7 +119,7 @@ func groupLines(lines []string, metadata blockMetadata) []*lineGroup { // Returns another boolean indicating whether the group should be ending // after that line if so. shouldAddToRegexDelimitedGroup := func(l string) (addToGroup bool, finishGroupAfter bool) { - if metadata.opts.GroupStartRegex != nil { + if metadata.opts.GroupStartRegex != nil { // For GroupStartRegex, all non-regex-matching lines should be // part of the group including prior lines. return !matchesAnyRegex(l, metadata.opts.GroupStartRegex), false @@ -349,7 +349,7 @@ func (cb *codeBlock) append(s string, opts blockOptions) { cb.braceCounts = make(map[string]int) } - // TODO(jfalgout): Does this need to handle runes more correctly? + // TODO: jfaer - Does this need to handle runes more correctly? for i := 0; i < len(s); { if cb.expectedQuote == "" { // We do not appear to be inside a string literal. @@ -419,8 +419,18 @@ func (lg *lineGroup) commentOnly() bool { } func (lg *lineGroup) regexTokens() []regexToken { - // TODO: jfaer - Should we match regexes on the original content? - regexMatches := lg.opts.matchRegexes(lg.internalJoinedLines()) + var regexMatches []regexMatch + if len(lg.opts.ByRegex) == 0 { + // We apply other options on top of what the regex extracts, but if the user + // didn't set by_regex, we should fall back to the behavior they would've + // expected before we started supporting by_regex. + // Namely: We would apply the other options on top of the + // internalJoinedLines() instead of the raw content. + regexMatches = []regexMatch{{lg.internalJoinedLines()}} + } else { + regexMatches = lg.opts.matchRegexes(lg.regexJoinedLines()) + } + ret := make([]regexToken, len(regexMatches)) if lg.access.regexTokens == nil { lg.access.regexTokens = make([]regexTokenAccessRecorder, len(regexMatches)) @@ -453,8 +463,15 @@ func (lg *lineGroup) regexTokens() []regexToken { return ret } -// internalJoinedLines calculates the same thing as joinedLines, except it -// doesn't record that it was used in the accessRecorder. +// internalJoinedLines attempts to concatenate all of this lineGroup's content +// the same way a reasonable human might. +// +// If the previous line ends with a "word" character and the current line starts +// with a "word" character, the two lines will be separated by a space. +// Otherwise, the lines are concatenated without any separation. +// +// Unlike joinedLines, this method does not record that it was used in the +// accessRecorder. func (lg *lineGroup) internalJoinedLines() string { if len(lg.lines) == 0 { return "" @@ -475,6 +492,25 @@ func (lg *lineGroup) internalJoinedLines() string { return s.String() } +// regexJoinedLines concatenates all of this lineGroup's content in a way that's +// friendlier to regexes than internalJoinedLines. +// +// Primarily, this method still strips leading whitespace but it uses a real +// newline character to separate lines instead of the "word" character logic of +// internalJoinedLines. +func (lg *lineGroup) regexJoinedLines() string { + if len(lg.lines) == 0 { + return "" + } + lines := make([]string, len(lg.lines)) + for i, l := range lg.lines { + lines[i] = strings.TrimLeftFunc(l, unicode.IsSpace) + } + return strings.Join(lines, "\n") +} + +// joinedLines returns internalJoinedLines and records that it was called in the +// accessRecorder. func (lg *lineGroup) joinedLines() string { lg.access.joinedLines = true return lg.internalJoinedLines() diff --git a/keepsorted/options_parser.go b/keepsorted/options_parser.go index 4943842..2273631 100644 --- a/keepsorted/options_parser.go +++ b/keepsorted/options_parser.go @@ -125,7 +125,7 @@ func (p *parser) popIntOrBool() (IntOrBool, error) { func (ar *ByRegexOption) UnmarshalYAML(node *yaml.Node) error { switch node.Tag { case "!!str": - pat, err := regexp.Compile(node.Value) + pat, err := compileByRegex(node.Value) if err != nil { return err } @@ -141,7 +141,7 @@ func (ar *ByRegexOption) UnmarshalYAML(node *yaml.Node) error { return fmt.Errorf("by_regex map item must have exactly one key-value pair, but got %d", len(m)) } for pattern, template := range m { - pat, err := regexp.Compile(pattern) + pat, err := compileByRegex(pattern) if err != nil { return fmt.Errorf("invalid regex pattern %q: %w", pattern, err) } @@ -191,7 +191,7 @@ func (p *parser) popList() ([]string, error) { func (p *parser) popByRegexOption() ([]ByRegexOption, error) { return popListValue(p, func(s string) (ByRegexOption, error) { - pat, err := regexp.Compile(s) + pat, err := compileByRegex(s) return ByRegexOption{Pattern: pat}, err }) } @@ -321,3 +321,14 @@ func (iter *runeIter) pop() (rune, bool) { } return ch, ok } + +func compileByRegex(re string) (*regexp.Regexp, error) { + if !strings.HasPrefix(re, "(?s)") && !strings.HasPrefix(re, "(?-s)") { + // The initial version of by_regex ran on top of lineGroup.joinedLines. This + // meant that users wrote regexes that didn't need to handle newlines. + // To minimize disruption, we automatically set dotall flag so that their + // regexes might still work after we start using real newlines. + re = "(?s)" + re + } + return regexp.Compile(re) +} diff --git a/keepsorted/options_parser_test.go b/keepsorted/options_parser_test.go index 284cab5..051a9a1 100644 --- a/keepsorted/options_parser_test.go +++ b/keepsorted/options_parser_test.go @@ -218,7 +218,7 @@ func TestPopValue(t *testing.T) { name: "Regex", input: ".*", - want: []ByRegexOption{{regexp.MustCompile(".*"), nil}}, + want: []ByRegexOption{{regexp.MustCompile("(?s).*"), nil}}, }, { name: "MultipleRegex", @@ -226,9 +226,9 @@ func TestPopValue(t *testing.T) { input: `[.*, abcd, '(?:efgh)ijkl']`, allowYAMLList: true, want: []ByRegexOption{ - {regexp.MustCompile(".*"), nil}, - {regexp.MustCompile("abcd"), nil}, - {regexp.MustCompile("(?:efgh)ijkl"), nil}, + {regexp.MustCompile("(?s).*"), nil}, + {regexp.MustCompile("(?s)abcd"), nil}, + {regexp.MustCompile("(?s)(?:efgh)ijkl"), nil}, }, }, { @@ -237,10 +237,10 @@ func TestPopValue(t *testing.T) { input: `[.*, Mon: 0, '\b(\d{2})/(\d{2})/(\d{4})\b': '${3}-${1}-${2}', "0: 1": 2]`, allowYAMLList: true, want: []ByRegexOption{ - {regexp.MustCompile(".*"), nil}, - {regexp.MustCompile("Mon"), &([]string{"0"})[0]}, - {regexp.MustCompile(`\b(\d{2})/(\d{2})/(\d{4})\b`), &([]string{"${3}-${1}-${2}"})[0]}, - {regexp.MustCompile(`0: 1`), &([]string{"2"})[0]}, + {regexp.MustCompile("(?s).*"), nil}, + {regexp.MustCompile("(?s)Mon"), &([]string{"0"})[0]}, + {regexp.MustCompile(`(?s)\b(\d{2})/(\d{2})/(\d{4})\b`), &([]string{"${3}-${1}-${2}"})[0]}, + {regexp.MustCompile(`(?s)0: 1`), &([]string{"2"})[0]}, }, }, { diff --git a/keepsorted/options_test.go b/keepsorted/options_test.go index bdcfa56..8ca18d7 100644 --- a/keepsorted/options_test.go +++ b/keepsorted/options_test.go @@ -194,7 +194,8 @@ func TestBlockOptions(t *testing.T) { want: blockOptions{ AllowYAMLLists: true, ByRegex: []ByRegexOption{ - {regexp.MustCompile("(?:abcd)"), nil}, {regexp.MustCompile("efg.*"), nil}, + {regexp.MustCompile("(?s)(?:abcd)"), nil}, + {regexp.MustCompile("(?s)efg.*"), nil}, }, }, }, @@ -206,8 +207,8 @@ func TestBlockOptions(t *testing.T) { want: blockOptions{ AllowYAMLLists: true, ByRegex: []ByRegexOption{ - {Pattern: regexp.MustCompile(`.*`)}, - {Pattern: regexp.MustCompile(`\b(\d{2})/(\d{2})/(\d{4})\b`), + {Pattern: regexp.MustCompile(`(?s).*`)}, + {Pattern: regexp.MustCompile(`(?s)\b(\d{2})/(\d{2})/(\d{4})\b`), Template: &[]string{"${3}-${1}-${2}"}[0]}, }, }, @@ -220,8 +221,8 @@ func TestBlockOptions(t *testing.T) { want: blockOptions{ AllowYAMLLists: true, ByRegex: []ByRegexOption{ - {Pattern: regexp.MustCompile(`foo, bar`)}, - {Pattern: regexp.MustCompile(`\b(\d{2})/(\d{2})/(\d{4})\b`), + {Pattern: regexp.MustCompile(`(?s)foo, bar`)}, + {Pattern: regexp.MustCompile(`(?s)\b(\d{2})/(\d{2})/(\d{4})\b`), Template: &[]string{"${3}-${1}-${2}"}[0]}, }, },