From d39d920af7f9c40aef2d2de8c9eb7247fef213a2 Mon Sep 17 00:00:00 2001 From: Dennis Liu Date: Wed, 26 Feb 2025 10:12:52 +0800 Subject: [PATCH] Clarify where non-American English appears Previously, warnings about non-American English words in commit messages did not indicate the specific word or its exact location. This commit improves the hook by outputting the precise line number and word that triggered the warning. A new function, 'get_all_match_positions', was introduced to locate each target word in a multi-line string; each search resumes just after the start of the previous match, so the words are expected in order of appearance. We now preserve blank lines in the commit message by using the variable 'FULL_COMMIT_MSG_WITH_SPACE', so that the line search functionality operates correctly. This change makes it easier for users to quickly identify and correct non-American English words in their commit messages. Change-Id: I8a2b7eb3984b06b0be6506ca4f410ca857fe50a7 --- scripts/commit-msg.hook | 55 +++++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/scripts/commit-msg.hook b/scripts/commit-msg.hook index 0b588e4f8..d1863d085 100755 --- a/scripts/commit-msg.hook +++ b/scripts/commit-msg.hook @@ -124,6 +124,41 @@ read_commit_message() { done < $COMMIT_MSG_FILE } +# Locate each target word in a multiline string (the column is tracked internally only). +# Output format: "target: line" +get_all_match_positions() { + local text="$1" + local targets="$2" + local start_line=1 + local start_col=1 + + while IFS= read -r target; do + # search for the target string + local result + result=$( + awk -v t="$target" -v sl="$start_line" -v sc="$start_col" '{ + if (NR < sl) next + pos = index(NR == sl ? substr($0, sc) : $0, t) + if (pos) { + print NR, (NR == sl ?
pos + sc - 1 : pos) + exit + } + }' <<< "$text" + ) + + # skip if the target is not found + [ -z "$result" ] && continue + + # output and update states + local line col + read -r line col <<< "$result" + echo "$target: $line" + start_line="$line" + start_col=$((col + 1)) + + done <<< "$targets" +} + # # Validate the contents of the commmit msg agains the good commit guidelines. # @@ -348,8 +383,10 @@ done # 12. Avoid abusive language in commit message content # ------------------------------------------------------------------------------ - FULL_COMMIT_MSG=$(sed '/^#/d;/^[[:space:]]*$/d;/^[[:space:]]*Change-Id:/d' "$COMMIT_MSG_FILE" | \ - sed -E "s@${URL_REGEX#^}@@g") + FULL_COMMIT_MSG_WITH_SPACE=$(sed '/^#/d;/^[[:space:]]*Change-Id:/d' "$COMMIT_MSG_FILE" | \ + sed -E "s@${URL_REGEX#^}@@g") + FULL_COMMIT_MSG=$(echo "$FULL_COMMIT_MSG_WITH_SPACE" | sed '/^[[:space:]]*$/d') + # Extended list of abusive words (case-insensitive). # Adjust the list as needed. ABUSIVE_WORDS_REGEX='\b(fuck|fucking|dick|shit|bitch|asshole|cunt|motherfucker|damn|crap|dumbass|piss)\b' @@ -367,16 +404,20 @@ done add_warning 1 "Commit message appears to be written in Chinese: $MISSPELLED_WORDS" fi - # Remove quoted text and commit hashes from $FULL_COMMIT_MSG for spell checking. - # Handles commit references like "commit 7d05741" (short) or full 40-char hashes. - MSG_FOR_SPELLCHECK=$(echo "$FULL_COMMIT_MSG" | sed -E \ + MSG_FOR_SPELLCHECK_LINE_FINDING=$(echo "$FULL_COMMIT_MSG_WITH_SPACE" | sed -E \ -e "s/(['\"][^'\"]*['\"])//g" \ -e "s/\bcommit[[:space:]]+[0-9a-fA-F]{7,40}\b/commit/g") - + MSG_FOR_SPELLCHECK=$(echo "$MSG_FOR_SPELLCHECK_LINE_FINDING" | sed '/^[[:space:]]*$/d') + + # Use aspell to list misspelled words according to American English, ignoring quoted text. 
MISSPELLED_WORDS=$(echo "$MSG_FOR_SPELLCHECK" | $ASPELL --lang=en --list --home-dir=scripts --personal=aspell-pws) if [ -n "$MISSPELLED_WORDS" ]; then - add_warning 1 "Avoid using non-American English words" + results=$(get_all_match_positions "$MSG_FOR_SPELLCHECK_LINE_FINDING" "$MISSPELLED_WORDS") + + while read -r result; do + add_warning "${result#*:}" "Avoid using non-American English words: ${result%%:*}" + done <<< "$results" fi }