Skip to content

Commit ac909a0

Browse files
bobbai00 and claude authored
fix(operator): correct regex escaping in WordCloud operator (#4261)
### What changes were proposed in this PR?

Fixed two issues in `WordCloudOpDesc.scala`:

1. **Regex escaping bug**: The `pyb` refactor in #4189 changed `manipulateTable()` from `s"..."` to `pyb"""..."""`, but the regex `\\w` was not adjusted. In `s"..."`, `\\` is an escape sequence, so `\\w` produces `\w`. In triple-quoted `pyb"""..."""`, backslashes are literal, so `\\w` stays as `\\w` — producing `r'\\w'` in Python, which matches a literal backslash + `w` instead of word characters. This caused all rows to be filtered out, resulting in: *"text column does not contain words or contains only nulls."* Fixed by changing to `\w`.
2. **Duplicate statement**: Removed a duplicate `Map(...)` line in `getOutputSchemas`.

Added unit tests to verify the regex pattern is correct.

### Any related issues, documentation, discussions?

Regression introduced by #4189.

### How was this PR tested?

Added `WordCloudOpDescSpec` with tests that verify:

- `manipulateTable()` uses `r'\w'` (not `r'\\w'`)
- Text column name appears in generated code

All tests pass.

### Was this PR authored or co-authored using generative AI tooling?

Generated-by: Claude Code (Claude Opus 4.6)

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 206f3f3 commit ac909a0

2 files changed

Lines changed: 58 additions & 2 deletions

File tree

common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/visualization/wordCloud/WordCloudOpDesc.scala

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -52,7 +52,6 @@ class WordCloudOpDesc extends PythonOperatorDescriptor {
5252
val outputSchema = Schema()
5353
.add("html-content", AttributeType.STRING)
5454
Map(operatorInfo.outputPorts.head.id -> outputSchema)
55-
Map(operatorInfo.outputPorts.head.id -> outputSchema)
5655
}
5756

5857
override def operatorInfo: OperatorInfo =
@@ -67,7 +66,7 @@ class WordCloudOpDesc extends PythonOperatorDescriptor {
6766
def manipulateTable(): PythonTemplateBuilder = {
6867
pyb"""
6968
| table.dropna(subset = [$textColumn], inplace = True) #remove missing values
70-
| table = table[table[$textColumn].str.contains(r'\\w', regex=True)]
69+
| table = table[table[$textColumn].str.contains(r'\w', regex=True)]
7170
|"""
7271
}
7372

Original file line number | Diff line number | Diff line change
@@ -0,0 +1,57 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.texera.amber.operator.visualization.wordCloud
21+
22+
import org.scalatest.BeforeAndAfter
23+
import org.scalatest.flatspec.AnyFlatSpec
24+
25+
class WordCloudOpDescSpec extends AnyFlatSpec with BeforeAndAfter {
26+
27+
var opDesc: WordCloudOpDesc = _
28+
29+
before {
30+
opDesc = new WordCloudOpDesc()
31+
}
32+
33+
it should "use correct regex pattern to match word characters" in {
34+
opDesc.textColumn = "text_col"
35+
val code = opDesc.manipulateTable().plain
36+
assert(
37+
code.contains("""r'\w'"""),
38+
"regex should use single backslash \\w to match word characters"
39+
)
40+
assert(
41+
!code.contains("""r'\\w'"""),
42+
"regex should not use double backslash \\\\w which matches literal backslash+w"
43+
)
44+
}
45+
46+
it should "include the text column in manipulateTable" in {
47+
opDesc.textColumn = "my_text"
48+
val code = opDesc.manipulateTable().plain
49+
assert(code.contains("my_text"))
50+
}
51+
52+
it should "include the text column in createWordCloudFigure" in {
53+
opDesc.textColumn = "my_text"
54+
val code = opDesc.createWordCloudFigure().plain
55+
assert(code.contains("my_text"))
56+
}
57+
}

0 commit comments

Comments
 (0)