11//! Language detection and configuration
22
3+ use std:: collections:: HashMap ;
34use std:: path:: Path ;
45use regex:: Regex ;
56use once_cell:: sync:: Lazy ;
@@ -62,28 +63,33 @@ impl LanguageDetector {
6263 }
6364
6465 pub fn detect_from_content ( content : & str ) -> Language {
65- // Simple heuristic-based detection
66- static JAVA_PATTERN : Lazy < Regex > = Lazy :: new ( || {
67- Regex :: new ( r"(?m)^(public|private|protected)\s+(class|interface)" ) . unwrap ( )
68- } ) ;
69-
70- static PYTHON_PATTERN : Lazy < Regex > = Lazy :: new ( || {
71- Regex :: new ( r"(?m)^(def|class|import|from)\s+" ) . unwrap ( )
72- } ) ;
73-
74- static JS_PATTERN : Lazy < Regex > = Lazy :: new ( || {
75- Regex :: new ( r"(?m)(function|const|let|var)\s+" ) . unwrap ( )
76- } ) ;
77-
78- if JAVA_PATTERN . is_match ( content) {
79- Language :: Java
80- } else if PYTHON_PATTERN . is_match ( content) {
81- Language :: Python
82- } else if JS_PATTERN . is_match ( content) {
83- Language :: JavaScript
84- } else {
85- Language :: Unknown
66+ // Use sophisticated pattern matching with scoring
67+ let mut scores = HashMap :: new ( ) ;
68+
69+ // Initialize scores for all languages
70+ for & lang in & [ Language :: Java , Language :: Python , Language :: JavaScript , Language :: Cpp , Language :: C ] {
71+ scores. insert ( lang, 0.0 ) ;
72+ }
73+
74+ // Apply detection patterns
75+ Self :: apply_java_patterns ( content, & mut scores) ;
76+ Self :: apply_python_patterns ( content, & mut scores) ;
77+ Self :: apply_javascript_patterns ( content, & mut scores) ;
78+ Self :: apply_cpp_patterns ( content, & mut scores) ;
79+ Self :: apply_c_patterns ( content, & mut scores) ;
80+
81+ // Find the language with the highest score
82+ let mut best_language = Language :: Unknown ;
83+ let mut best_score = 0.0 ;
84+
85+ for ( & language, & score) in & scores {
86+ if score > best_score && score > 0.3 { // Minimum confidence threshold
87+ best_score = score;
88+ best_language = language;
89+ }
8690 }
91+
92+ best_language
8793 }
8894
8995 pub fn detect < P : AsRef < Path > > ( path : P , content : & str ) -> Language {
@@ -94,4 +100,241 @@ impl LanguageDetector {
94100 Self :: detect_from_content ( content)
95101 }
96102 }
103+
104+ /// Apply Java-specific detection patterns
105+ fn apply_java_patterns ( content : & str , scores : & mut HashMap < Language , f64 > ) {
106+ static JAVA_PATTERNS : Lazy < Vec < ( Regex , f64 ) > > = Lazy :: new ( || {
107+ vec ! [
108+ // Strong indicators
109+ ( Regex :: new( r"(?m)^(public|private|protected)\s+(class|interface|enum)" ) . unwrap( ) , 0.8 ) ,
110+ ( Regex :: new( r"(?m)^(public|private|protected)\s+(static\s+)?(void|int|String|boolean)" ) . unwrap( ) , 0.7 ) ,
111+ ( Regex :: new( r"\bSystem\.out\.print(ln)?\s*\(" ) . unwrap( ) , 0.9 ) ,
112+ ( Regex :: new( r"\bpublic\s+static\s+void\s+main\s*\(\s*String\[\]\s+\w+\s*\)" ) . unwrap( ) , 1.0 ) ,
113+
114+ // Medium indicators
115+ ( Regex :: new( r"\b(import\s+java\.|package\s+)" ) . unwrap( ) , 0.6 ) ,
116+ ( Regex :: new( r"\b(ArrayList|HashMap|List|Map|Set)\s*<" ) . unwrap( ) , 0.5 ) ,
117+ ( Regex :: new( r"\b@(Override|Deprecated|SuppressWarnings)" ) . unwrap( ) , 0.6 ) ,
118+ ( Regex :: new( r"\bnew\s+\w+\s*\(" ) . unwrap( ) , 0.3 ) ,
119+
120+ // Weak indicators
121+ ( Regex :: new( r"\b(final|static|abstract|synchronized)\s+" ) . unwrap( ) , 0.2 ) ,
122+ ( Regex :: new( r"\.length\b" ) . unwrap( ) , 0.1 ) ,
123+ ]
124+ } ) ;
125+
126+ let mut score = 0.0 ;
127+ for ( pattern, weight) in JAVA_PATTERNS . iter ( ) {
128+ if pattern. is_match ( content) {
129+ score += weight;
130+ }
131+ }
132+
133+ // Penalty for non-Java patterns
134+ if content. contains ( "def " ) || content. contains ( "import " ) && !content. contains ( "import java" ) {
135+ score -= 0.3 ;
136+ }
137+ if content. contains ( "function " ) || content. contains ( "const " ) || content. contains ( "let " ) {
138+ score -= 0.3 ;
139+ }
140+
141+ scores. insert ( Language :: Java , score. max ( 0.0 ) ) ;
142+ }
143+
144+ /// Apply Python-specific detection patterns
145+ fn apply_python_patterns ( content : & str , scores : & mut HashMap < Language , f64 > ) {
146+ static PYTHON_PATTERNS : Lazy < Vec < ( Regex , f64 ) > > = Lazy :: new ( || {
147+ vec ! [
148+ // Strong indicators
149+ ( Regex :: new( r"(?m)^def\s+\w+\s*\(.*\)\s*:" ) . unwrap( ) , 0.9 ) ,
150+ ( Regex :: new( r"(?m)^class\s+\w+(\([^)]*\))?\s*:" ) . unwrap( ) , 0.8 ) ,
151+ ( Regex :: new( r"\bif\s+__name__\s*==\s*['\" ] __main__[ ' \" ] ").unwrap(), 1.0),
152+ (Regex::new(r"\b print\s *\( " ) . unwrap( ) , 0.7 ) ,
153+
154+ // Medium indicators
155+ ( Regex :: new( r"(?m)^(import\s+\w+|from\s+\w+\s+import)" ) . unwrap( ) , 0.6 ) ,
156+ ( Regex :: new( r"\bself\.\w+" ) . unwrap( ) , 0.6 ) ,
157+ ( Regex :: new( r"\b(True|False|None)\b" ) . unwrap( ) , 0.5 ) ,
158+ ( Regex :: new( r"(?m)^\s*#.*$" ) . unwrap( ) , 0.2 ) ,
159+
160+ // Python-specific syntax
161+ ( Regex :: new( r"\blen\s*\(" ) . unwrap( ) , 0.3 ) ,
162+ ( Regex :: new( r"\brange\s*\(" ) . unwrap( ) , 0.4 ) ,
163+ ( Regex :: new( r"\b(list|dict|tuple|set)\s*\(" ) . unwrap( ) , 0.3 ) ,
164+ ( Regex :: new( r"(?m)^\s*elif\s+" ) . unwrap( ) , 0.5 ) ,
165+ ( Regex :: new( r"\bwith\s+\w+.*:" ) . unwrap( ) , 0.4 ) ,
166+ ( Regex :: new( r"\btry\s*:" ) . unwrap( ) , 0.3 ) ,
167+ ( Regex :: new( r"\bexcept\s+\w*:" ) . unwrap( ) , 0.4 ) ,
168+ ]
169+ } ) ;
170+
171+ let mut score = 0.0 ;
172+ for ( pattern, weight) in PYTHON_PATTERNS . iter ( ) {
173+ if pattern. is_match ( content) {
174+ score += weight;
175+ }
176+ }
177+
178+ // Check for Python-specific indentation patterns
179+ let lines: Vec < & str > = content. lines ( ) . collect ( ) ;
180+ let mut indent_score = 0.0 ;
181+ for line in & lines {
182+ if line. starts_with ( " " ) || line. starts_with ( "\t " ) {
183+ indent_score += 0.1 ;
184+ }
185+ }
186+ score += ( indent_score / lines. len ( ) as f64 ) . min ( 0.3 ) ;
187+
188+ // Penalty for non-Python patterns
189+ if content. contains ( "public class" ) || content. contains ( "private " ) {
190+ score -= 0.4 ;
191+ }
192+ if content. contains ( "function " ) || content. contains ( "var " ) {
193+ score -= 0.3 ;
194+ }
195+
196+ scores. insert ( Language :: Python , score. max ( 0.0 ) ) ;
197+ }
198+
199+ /// Apply JavaScript-specific detection patterns
200+ fn apply_javascript_patterns ( content : & str , scores : & mut HashMap < Language , f64 > ) {
201+ static JS_PATTERNS : Lazy < Vec < ( Regex , f64 ) > > = Lazy :: new ( || {
202+ vec ! [
203+ // Strong indicators
204+ ( Regex :: new( r"\bfunction\s+\w+\s*\(" ) . unwrap( ) , 0.8 ) ,
205+ ( Regex :: new( r"\b(const|let|var)\s+\w+\s*=" ) . unwrap( ) , 0.7 ) ,
206+ ( Regex :: new( r"\bconsole\.(log|error|warn)\s*\(" ) . unwrap( ) , 0.9 ) ,
207+ ( Regex :: new( r"=>\s*\{" ) . unwrap( ) , 0.8 ) , // Arrow functions
208+ ( Regex :: new( r"\bclass\s+\w+\s*\{" ) . unwrap( ) , 0.6 ) ,
209+
210+ // Medium indicators
211+ ( Regex :: new( r"\b(require|import)\s*\(" ) . unwrap( ) , 0.6 ) ,
212+ ( Regex :: new( r"\bmodule\.exports\s*=" ) . unwrap( ) , 0.8 ) ,
213+ ( Regex :: new( r"\bexport\s+(default\s+)?(function|class|const)" ) . unwrap( ) , 0.7 ) ,
214+ ( Regex :: new( r"\b(true|false|null|undefined)\b" ) . unwrap( ) , 0.4 ) ,
215+ ( Regex :: new( r"\bnew\s+\w+\s*\(" ) . unwrap( ) , 0.3 ) ,
216+
217+ // JavaScript-specific syntax
218+ ( Regex :: new( r"\bthis\.\w+" ) . unwrap( ) , 0.3 ) ,
219+ ( Regex :: new( r"\b(async|await)\b" ) . unwrap( ) , 0.6 ) ,
220+ ( Regex :: new( r"\b(Promise|setTimeout|setInterval)\b" ) . unwrap( ) , 0.5 ) ,
221+ ( Regex :: new( r"\.then\s*\(" ) . unwrap( ) , 0.4 ) ,
222+ ( Regex :: new( r"\$\{.*\}" ) . unwrap( ) , 0.5 ) , // Template literals
223+ ( Regex :: new( r"//.*$" ) . unwrap( ) , 0.1 ) , // Single-line comments
224+ ]
225+ } ) ;
226+
227+ let mut score = 0.0 ;
228+ for ( pattern, weight) in JS_PATTERNS . iter ( ) {
229+ if pattern. is_match ( content) {
230+ score += weight;
231+ }
232+ }
233+
234+ // Penalty for non-JavaScript patterns
235+ if content. contains ( "public class" ) || content. contains ( "def " ) {
236+ score -= 0.4 ;
237+ }
238+ if content. contains ( "#include" ) || content. contains ( "std::" ) {
239+ score -= 0.5 ;
240+ }
241+
242+ scores. insert ( Language :: JavaScript , score. max ( 0.0 ) ) ;
243+ }
244+
245+ /// Apply C++-specific detection patterns
246+ fn apply_cpp_patterns ( content : & str , scores : & mut HashMap < Language , f64 > ) {
247+ static CPP_PATTERNS : Lazy < Vec < ( Regex , f64 ) > > = Lazy :: new ( || {
248+ vec ! [
249+ // Strong indicators
250+ ( Regex :: new( r"#include\s*<(iostream|vector|string|map|algorithm)>" ) . unwrap( ) , 0.9 ) ,
251+ ( Regex :: new( r"\bstd::(cout|cin|endl|vector|string|map)" ) . unwrap( ) , 0.8 ) ,
252+ ( Regex :: new( r"\bclass\s+\w+\s*\{" ) . unwrap( ) , 0.6 ) ,
253+ ( Regex :: new( r"\b(public|private|protected)\s*:" ) . unwrap( ) , 0.7 ) ,
254+
255+ // Medium indicators
256+ ( Regex :: new( r"\b(template\s*<|typename\s+)" ) . unwrap( ) , 0.8 ) ,
257+ ( Regex :: new( r"\bnamespace\s+\w+" ) . unwrap( ) , 0.7 ) ,
258+ ( Regex :: new( r"\b(virtual|override|final)\s+" ) . unwrap( ) , 0.6 ) ,
259+ ( Regex :: new( r"\bnew\s+\w+(\[\]|\(\))" ) . unwrap( ) , 0.4 ) ,
260+ ( Regex :: new( r"\bdelete\s+" ) . unwrap( ) , 0.5 ) ,
261+
262+ // C++ specific syntax
263+ ( Regex :: new( r"::" ) . unwrap( ) , 0.4 ) ,
264+ ( Regex :: new( r"\b(auto|decltype)\s+" ) . unwrap( ) , 0.5 ) ,
265+ ( Regex :: new( r"\b(nullptr|constexpr)\b" ) . unwrap( ) , 0.6 ) ,
266+ ( Regex :: new( r"->" ) . unwrap( ) , 0.2 ) ,
267+ ( Regex :: new( r"<<|>>" ) . unwrap( ) , 0.2 ) ,
268+ ]
269+ } ) ;
270+
271+ let mut score = 0.0 ;
272+ for ( pattern, weight) in CPP_PATTERNS . iter ( ) {
273+ if pattern. is_match ( content) {
274+ score += weight;
275+ }
276+ }
277+
278+ // Bonus for C++ over C patterns
279+ if content. contains ( "std::" ) || content. contains ( "class " ) {
280+ score += 0.3 ;
281+ }
282+
283+ // Penalty for non-C++ patterns
284+ if content. contains ( "def " ) || content. contains ( "function " ) {
285+ score -= 0.4 ;
286+ }
287+ if content. contains ( "public class" ) && content. contains ( "System.out" ) {
288+ score -= 0.5 ;
289+ }
290+
291+ scores. insert ( Language :: Cpp , score. max ( 0.0 ) ) ;
292+ }
293+
294+ /// Apply C-specific detection patterns
295+ fn apply_c_patterns ( content : & str , scores : & mut HashMap < Language , f64 > ) {
296+ static C_PATTERNS : Lazy < Vec < ( Regex , f64 ) > > = Lazy :: new ( || {
297+ vec ! [
298+ // Strong indicators
299+ ( Regex :: new( r"#include\s*<(stdio\.h|stdlib\.h|string\.h|math\.h)>" ) . unwrap( ) , 0.9 ) ,
300+ ( Regex :: new( r"\bprintf\s*\(" ) . unwrap( ) , 0.8 ) ,
301+ ( Regex :: new( r"\bscanf\s*\(" ) . unwrap( ) , 0.7 ) ,
302+ ( Regex :: new( r"\bmain\s*\(\s*(void|int\s+argc.*char.*argv)\s*\)" ) . unwrap( ) , 0.8 ) ,
303+
304+ // Medium indicators
305+ ( Regex :: new( r"\b(struct|union|enum)\s+\w+" ) . unwrap( ) , 0.6 ) ,
306+ ( Regex :: new( r"\b(malloc|calloc|realloc|free)\s*\(" ) . unwrap( ) , 0.7 ) ,
307+ ( Regex :: new( r"\b(int|char|float|double|void)\s+\*?\w+" ) . unwrap( ) , 0.4 ) ,
308+ ( Regex :: new( r"\btypedef\s+" ) . unwrap( ) , 0.5 ) ,
309+
310+ // C-specific syntax
311+ ( Regex :: new( r"->" ) . unwrap( ) , 0.3 ) ,
312+ ( Regex :: new( r"\*\w+" ) . unwrap( ) , 0.2 ) , // Pointer dereference
313+ ( Regex :: new( r"&\w+" ) . unwrap( ) , 0.2 ) , // Address-of operator
314+ ( Regex :: new( r"\b(NULL|EXIT_SUCCESS|EXIT_FAILURE)\b" ) . unwrap( ) , 0.4 ) ,
315+ ]
316+ } ) ;
317+
318+ let mut score = 0.0 ;
319+ for ( pattern, weight) in C_PATTERNS . iter ( ) {
320+ if pattern. is_match ( content) {
321+ score += weight;
322+ }
323+ }
324+
325+ // Penalty for C++ specific features
326+ if content. contains ( "std::" ) || content. contains ( "class " ) || content. contains ( "namespace " ) {
327+ score -= 0.5 ;
328+ }
329+
330+ // Penalty for other languages
331+ if content. contains ( "def " ) || content. contains ( "function " ) {
332+ score -= 0.4 ;
333+ }
334+ if content. contains ( "public class" ) {
335+ score -= 0.5 ;
336+ }
337+
338+ scores. insert ( Language :: C , score. max ( 0.0 ) ) ;
339+ }
97340}
0 commit comments