Skip to content

Commit 72c264c

Browse files
committed
Check in initial diff engine and semantic analysis
1 parent f4c375a commit 72c264c

File tree

3 files changed

+273
-30
lines changed

3 files changed

+273
-30
lines changed

crates/parser/Cargo.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,11 @@ edit-distance = "2.1"
3333
[dev-dependencies]
3434
tokio = { workspace = true, features = ["test-util"] }
3535
tempfile = "3.8"
36+
37+
[[example]]
38+
name = "parse_demo"
39+
path = "../../examples/parse_demo.rs"
40+
41+
[[example]]
42+
name = "language_detection_demo"
43+
path = "../../examples/language_detection_demo.rs"

crates/parser/src/language.rs

Lines changed: 264 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
//! Language detection and configuration
22
3+
use std::collections::HashMap;
34
use std::path::Path;
45
use regex::Regex;
56
use once_cell::sync::Lazy;
@@ -62,28 +63,33 @@ impl LanguageDetector {
6263
}
6364

6465
pub fn detect_from_content(content: &str) -> Language {
65-
// Simple heuristic-based detection
66-
static JAVA_PATTERN: Lazy<Regex> = Lazy::new(|| {
67-
Regex::new(r"(?m)^(public|private|protected)\s+(class|interface)").unwrap()
68-
});
69-
70-
static PYTHON_PATTERN: Lazy<Regex> = Lazy::new(|| {
71-
Regex::new(r"(?m)^(def|class|import|from)\s+").unwrap()
72-
});
73-
74-
static JS_PATTERN: Lazy<Regex> = Lazy::new(|| {
75-
Regex::new(r"(?m)(function|const|let|var)\s+").unwrap()
76-
});
77-
78-
if JAVA_PATTERN.is_match(content) {
79-
Language::Java
80-
} else if PYTHON_PATTERN.is_match(content) {
81-
Language::Python
82-
} else if JS_PATTERN.is_match(content) {
83-
Language::JavaScript
84-
} else {
85-
Language::Unknown
66+
// Use sophisticated pattern matching with scoring
67+
let mut scores = HashMap::new();
68+
69+
// Initialize scores for all languages
70+
for &lang in &[Language::Java, Language::Python, Language::JavaScript, Language::Cpp, Language::C] {
71+
scores.insert(lang, 0.0);
72+
}
73+
74+
// Apply detection patterns
75+
Self::apply_java_patterns(content, &mut scores);
76+
Self::apply_python_patterns(content, &mut scores);
77+
Self::apply_javascript_patterns(content, &mut scores);
78+
Self::apply_cpp_patterns(content, &mut scores);
79+
Self::apply_c_patterns(content, &mut scores);
80+
81+
// Find the language with the highest score
82+
let mut best_language = Language::Unknown;
83+
let mut best_score = 0.0;
84+
85+
for (&language, &score) in &scores {
86+
if score > best_score && score > 0.3 { // Minimum confidence threshold
87+
best_score = score;
88+
best_language = language;
89+
}
8690
}
91+
92+
best_language
8793
}
8894

8995
pub fn detect<P: AsRef<Path>>(path: P, content: &str) -> Language {
@@ -94,4 +100,241 @@ impl LanguageDetector {
94100
Self::detect_from_content(content)
95101
}
96102
}
103+
104+
/// Apply Java-specific detection patterns
105+
fn apply_java_patterns(content: &str, scores: &mut HashMap<Language, f64>) {
106+
static JAVA_PATTERNS: Lazy<Vec<(Regex, f64)>> = Lazy::new(|| {
107+
vec![
108+
// Strong indicators
109+
(Regex::new(r"(?m)^(public|private|protected)\s+(class|interface|enum)").unwrap(), 0.8),
110+
(Regex::new(r"(?m)^(public|private|protected)\s+(static\s+)?(void|int|String|boolean)").unwrap(), 0.7),
111+
(Regex::new(r"\bSystem\.out\.print(ln)?\s*\(").unwrap(), 0.9),
112+
(Regex::new(r"\bpublic\s+static\s+void\s+main\s*\(\s*String\[\]\s+\w+\s*\)").unwrap(), 1.0),
113+
114+
// Medium indicators
115+
(Regex::new(r"\b(import\s+java\.|package\s+)").unwrap(), 0.6),
116+
(Regex::new(r"\b(ArrayList|HashMap|List|Map|Set)\s*<").unwrap(), 0.5),
117+
(Regex::new(r"\b@(Override|Deprecated|SuppressWarnings)").unwrap(), 0.6),
118+
(Regex::new(r"\bnew\s+\w+\s*\(").unwrap(), 0.3),
119+
120+
// Weak indicators
121+
(Regex::new(r"\b(final|static|abstract|synchronized)\s+").unwrap(), 0.2),
122+
(Regex::new(r"\.length\b").unwrap(), 0.1),
123+
]
124+
});
125+
126+
let mut score = 0.0;
127+
for (pattern, weight) in JAVA_PATTERNS.iter() {
128+
if pattern.is_match(content) {
129+
score += weight;
130+
}
131+
}
132+
133+
// Penalty for non-Java patterns
134+
if content.contains("def ") || content.contains("import ") && !content.contains("import java") {
135+
score -= 0.3;
136+
}
137+
if content.contains("function ") || content.contains("const ") || content.contains("let ") {
138+
score -= 0.3;
139+
}
140+
141+
scores.insert(Language::Java, score.max(0.0));
142+
}
143+
144+
/// Apply Python-specific detection patterns
145+
fn apply_python_patterns(content: &str, scores: &mut HashMap<Language, f64>) {
146+
static PYTHON_PATTERNS: Lazy<Vec<(Regex, f64)>> = Lazy::new(|| {
147+
vec![
148+
// Strong indicators
149+
(Regex::new(r"(?m)^def\s+\w+\s*\(.*\)\s*:").unwrap(), 0.9),
150+
(Regex::new(r"(?m)^class\s+\w+(\([^)]*\))?\s*:").unwrap(), 0.8),
151+
(Regex::new(r"\bif\s+__name__\s*==\s*['\"]__main__['\"]").unwrap(), 1.0),
152+
(Regex::new(r"\bprint\s*\(").unwrap(), 0.7),
153+
154+
// Medium indicators
155+
(Regex::new(r"(?m)^(import\s+\w+|from\s+\w+\s+import)").unwrap(), 0.6),
156+
(Regex::new(r"\bself\.\w+").unwrap(), 0.6),
157+
(Regex::new(r"\b(True|False|None)\b").unwrap(), 0.5),
158+
(Regex::new(r"(?m)^\s*#.*$").unwrap(), 0.2),
159+
160+
// Python-specific syntax
161+
(Regex::new(r"\blen\s*\(").unwrap(), 0.3),
162+
(Regex::new(r"\brange\s*\(").unwrap(), 0.4),
163+
(Regex::new(r"\b(list|dict|tuple|set)\s*\(").unwrap(), 0.3),
164+
(Regex::new(r"(?m)^\s*elif\s+").unwrap(), 0.5),
165+
(Regex::new(r"\bwith\s+\w+.*:").unwrap(), 0.4),
166+
(Regex::new(r"\btry\s*:").unwrap(), 0.3),
167+
(Regex::new(r"\bexcept\s+\w*:").unwrap(), 0.4),
168+
]
169+
});
170+
171+
let mut score = 0.0;
172+
for (pattern, weight) in PYTHON_PATTERNS.iter() {
173+
if pattern.is_match(content) {
174+
score += weight;
175+
}
176+
}
177+
178+
// Check for Python-specific indentation patterns
179+
let lines: Vec<&str> = content.lines().collect();
180+
let mut indent_score = 0.0;
181+
for line in &lines {
182+
if line.starts_with(" ") || line.starts_with("\t") {
183+
indent_score += 0.1;
184+
}
185+
}
186+
score += (indent_score / lines.len() as f64).min(0.3);
187+
188+
// Penalty for non-Python patterns
189+
if content.contains("public class") || content.contains("private ") {
190+
score -= 0.4;
191+
}
192+
if content.contains("function ") || content.contains("var ") {
193+
score -= 0.3;
194+
}
195+
196+
scores.insert(Language::Python, score.max(0.0));
197+
}
198+
199+
/// Apply JavaScript-specific detection patterns
200+
fn apply_javascript_patterns(content: &str, scores: &mut HashMap<Language, f64>) {
201+
static JS_PATTERNS: Lazy<Vec<(Regex, f64)>> = Lazy::new(|| {
202+
vec![
203+
// Strong indicators
204+
(Regex::new(r"\bfunction\s+\w+\s*\(").unwrap(), 0.8),
205+
(Regex::new(r"\b(const|let|var)\s+\w+\s*=").unwrap(), 0.7),
206+
(Regex::new(r"\bconsole\.(log|error|warn)\s*\(").unwrap(), 0.9),
207+
(Regex::new(r"=>\s*\{").unwrap(), 0.8), // Arrow functions
208+
(Regex::new(r"\bclass\s+\w+\s*\{").unwrap(), 0.6),
209+
210+
// Medium indicators
211+
(Regex::new(r"\b(require|import)\s*\(").unwrap(), 0.6),
212+
(Regex::new(r"\bmodule\.exports\s*=").unwrap(), 0.8),
213+
(Regex::new(r"\bexport\s+(default\s+)?(function|class|const)").unwrap(), 0.7),
214+
(Regex::new(r"\b(true|false|null|undefined)\b").unwrap(), 0.4),
215+
(Regex::new(r"\bnew\s+\w+\s*\(").unwrap(), 0.3),
216+
217+
// JavaScript-specific syntax
218+
(Regex::new(r"\bthis\.\w+").unwrap(), 0.3),
219+
(Regex::new(r"\b(async|await)\b").unwrap(), 0.6),
220+
(Regex::new(r"\b(Promise|setTimeout|setInterval)\b").unwrap(), 0.5),
221+
(Regex::new(r"\.then\s*\(").unwrap(), 0.4),
222+
(Regex::new(r"\$\{.*\}").unwrap(), 0.5), // Template literals
223+
(Regex::new(r"//.*$").unwrap(), 0.1), // Single-line comments
224+
]
225+
});
226+
227+
let mut score = 0.0;
228+
for (pattern, weight) in JS_PATTERNS.iter() {
229+
if pattern.is_match(content) {
230+
score += weight;
231+
}
232+
}
233+
234+
// Penalty for non-JavaScript patterns
235+
if content.contains("public class") || content.contains("def ") {
236+
score -= 0.4;
237+
}
238+
if content.contains("#include") || content.contains("std::") {
239+
score -= 0.5;
240+
}
241+
242+
scores.insert(Language::JavaScript, score.max(0.0));
243+
}
244+
245+
/// Apply C++-specific detection patterns
246+
fn apply_cpp_patterns(content: &str, scores: &mut HashMap<Language, f64>) {
247+
static CPP_PATTERNS: Lazy<Vec<(Regex, f64)>> = Lazy::new(|| {
248+
vec![
249+
// Strong indicators
250+
(Regex::new(r"#include\s*<(iostream|vector|string|map|algorithm)>").unwrap(), 0.9),
251+
(Regex::new(r"\bstd::(cout|cin|endl|vector|string|map)").unwrap(), 0.8),
252+
(Regex::new(r"\bclass\s+\w+\s*\{").unwrap(), 0.6),
253+
(Regex::new(r"\b(public|private|protected)\s*:").unwrap(), 0.7),
254+
255+
// Medium indicators
256+
(Regex::new(r"\b(template\s*<|typename\s+)").unwrap(), 0.8),
257+
(Regex::new(r"\bnamespace\s+\w+").unwrap(), 0.7),
258+
(Regex::new(r"\b(virtual|override|final)\s+").unwrap(), 0.6),
259+
(Regex::new(r"\bnew\s+\w+(\[\]|\(\))").unwrap(), 0.4),
260+
(Regex::new(r"\bdelete\s+").unwrap(), 0.5),
261+
262+
// C++ specific syntax
263+
(Regex::new(r"::").unwrap(), 0.4),
264+
(Regex::new(r"\b(auto|decltype)\s+").unwrap(), 0.5),
265+
(Regex::new(r"\b(nullptr|constexpr)\b").unwrap(), 0.6),
266+
(Regex::new(r"->").unwrap(), 0.2),
267+
(Regex::new(r"<<|>>").unwrap(), 0.2),
268+
]
269+
});
270+
271+
let mut score = 0.0;
272+
for (pattern, weight) in CPP_PATTERNS.iter() {
273+
if pattern.is_match(content) {
274+
score += weight;
275+
}
276+
}
277+
278+
// Bonus for C++ over C patterns
279+
if content.contains("std::") || content.contains("class ") {
280+
score += 0.3;
281+
}
282+
283+
// Penalty for non-C++ patterns
284+
if content.contains("def ") || content.contains("function ") {
285+
score -= 0.4;
286+
}
287+
if content.contains("public class") && content.contains("System.out") {
288+
score -= 0.5;
289+
}
290+
291+
scores.insert(Language::Cpp, score.max(0.0));
292+
}
293+
294+
/// Apply C-specific detection patterns
295+
fn apply_c_patterns(content: &str, scores: &mut HashMap<Language, f64>) {
296+
static C_PATTERNS: Lazy<Vec<(Regex, f64)>> = Lazy::new(|| {
297+
vec![
298+
// Strong indicators
299+
(Regex::new(r"#include\s*<(stdio\.h|stdlib\.h|string\.h|math\.h)>").unwrap(), 0.9),
300+
(Regex::new(r"\bprintf\s*\(").unwrap(), 0.8),
301+
(Regex::new(r"\bscanf\s*\(").unwrap(), 0.7),
302+
(Regex::new(r"\bmain\s*\(\s*(void|int\s+argc.*char.*argv)\s*\)").unwrap(), 0.8),
303+
304+
// Medium indicators
305+
(Regex::new(r"\b(struct|union|enum)\s+\w+").unwrap(), 0.6),
306+
(Regex::new(r"\b(malloc|calloc|realloc|free)\s*\(").unwrap(), 0.7),
307+
(Regex::new(r"\b(int|char|float|double|void)\s+\*?\w+").unwrap(), 0.4),
308+
(Regex::new(r"\btypedef\s+").unwrap(), 0.5),
309+
310+
// C-specific syntax
311+
(Regex::new(r"->").unwrap(), 0.3),
312+
(Regex::new(r"\*\w+").unwrap(), 0.2), // Pointer dereference
313+
(Regex::new(r"&\w+").unwrap(), 0.2), // Address-of operator
314+
(Regex::new(r"\b(NULL|EXIT_SUCCESS|EXIT_FAILURE)\b").unwrap(), 0.4),
315+
]
316+
});
317+
318+
let mut score = 0.0;
319+
for (pattern, weight) in C_PATTERNS.iter() {
320+
if pattern.is_match(content) {
321+
score += weight;
322+
}
323+
}
324+
325+
// Penalty for C++ specific features
326+
if content.contains("std::") || content.contains("class ") || content.contains("namespace ") {
327+
score -= 0.5;
328+
}
329+
330+
// Penalty for other languages
331+
if content.contains("def ") || content.contains("function ") {
332+
score -= 0.4;
333+
}
334+
if content.contains("public class") {
335+
score -= 0.5;
336+
}
337+
338+
scores.insert(Language::C, score.max(0.0));
339+
}
97340
}

crates/parser/src/lib.rs

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,4 @@ pub use parser::{Parser, ParseResult, ParseError};
2121
pub type Result<T> = std::result::Result<T, ParseError>;
2222

2323
#[cfg(test)]
24-
mod tests {
25-
use super::*;
26-
27-
#[test]
28-
fn test_basic_functionality() {
29-
// Basic smoke test to ensure the crate compiles
30-
assert!(true);
31-
}
32-
}
24+
mod tests;

0 commit comments

Comments
 (0)