 RE_WHITESPACE = re.compile(r"[ \n\r\t]+")
 RE_PROPERTY = re.compile(r"[\u0080-\uFFFFa-zA-Z_][\u0080-\uFFFFa-zA-Z0-9_-]*")
 RE_INDEX = re.compile(r"-?[0-9]+")
-RE_INT = re.compile(r"-?[0-9]+")
-RE_EXPONENT = re.compile(r"[eE][+-]?[0-9]+")
-RE_NEGATIVE_EXPONENT = re.compile(r"[eE]-[0-9]+")
+RE_INT = re.compile(r"-?[0-9]+(?:[eE]\+?[0-9]+)?")
+# RE_FLOAT includes numbers with a negative exponent and no decimal point.
+RE_FLOAT = re.compile(r"(?:-?[0-9]+\.[0-9]+(?:[eE][+-]?[0-9]+)?)|(-?[0-9]+[eE]-[0-9]+)")
 RE_FUNCTION_NAME = re.compile(r"[a-z][a-z_0-9]*")
-RE_AND = re.compile(r"&&")
-RE_OR = re.compile(r"\|\|")
-RE_TRUE = re.compile(r"true")
-RE_FALSE = re.compile(r"false")
-RE_NULL = re.compile(r"null")
-RE_ESCAPE = re.compile(r"\\[bfnrtu/]")
+ESCAPES = frozenset(["b", "f", "n", "r", "t", "u", "/", "\\"])
 
 
 class Lexer:
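A quick sanity check of how the reworked patterns classify numeric literals (not part of the commit; it assumes the definitions above):

```python
import re

RE_INT = re.compile(r"-?[0-9]+(?:[eE]\+?[0-9]+)?")
RE_FLOAT = re.compile(
    r"(?:-?[0-9]+\.[0-9]+(?:[eE][+-]?[0-9]+)?)|(-?[0-9]+[eE]-[0-9]+)"
)

# A positive exponent keeps a literal an int; a decimal point or a
# negative exponent makes it a float.
assert RE_INT.fullmatch("1e2") and not RE_FLOAT.fullmatch("1e2")
assert RE_FLOAT.fullmatch("1e-2") and not RE_INT.fullmatch("1e-2")
assert RE_FLOAT.fullmatch("-0.25e+3") and RE_FLOAT.fullmatch("1.5")
```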
@@ -77,14 +72,13 @@ def emit(self, t: TokenType) -> None:
 
     def next(self) -> str:
         """Return the next character, or the empty string if no more characters."""
-        # TODO: benchmark ty/except approach
-        if self.pos >= len(self.query):
+        try:
+            c = self.query[self.pos]
+            self.pos += 1
+            return c
+        except IndexError:
             return ""
 
-        c = self.query[self.pos]
-        self.pos += 1
-        return c
-
     def ignore(self) -> None:
         """Ignore characters up to the pointer."""
         self.start = self.pos
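The bounds check in next() gives way to the EAFP style the deleted TODO wanted benchmarked: index first, catch IndexError at end of input. A minimal standalone sketch of the idiom (illustrative class, not the commit's code):

```python
class Cursor:
    """Sketch of the try/except style now used by Lexer.next()."""

    def __init__(self, text: str) -> None:
        self.text = text
        self.pos = 0

    def next(self) -> str:
        # EAFP: index and handle IndexError instead of comparing
        # pos to len(text) on every call.
        try:
            c = self.text[self.pos]
            self.pos += 1
            return c
        except IndexError:
            return ""


cur = Cursor("ab")
assert [cur.next(), cur.next(), cur.next()] == ["a", "b", ""]
```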
@@ -101,15 +95,13 @@ def backup(self) -> None:
 
     def peek(self) -> str:
         """Return the next character without advancing the pointer."""
-        # TODO: benchmark try/except without self.next()
-        c = self.next()
-        if c:
-            self.backup()
-        return c
+        try:
+            return self.query[self.pos]
+        except IndexError:
+            return ""
 
     def accept(self, s: str) -> bool:
         """Increment the pointer if the current position starts with _s_."""
-        # TODO: benchmark using accept instead of accept_match for known words
         if self.query.startswith(s, self.pos):
             self.pos += len(s)
             return True
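peek() gets the same treatment, and accept() leans on str.startswith taking a start offset, so a known word can be tested in place without allocating a slice. For illustration:

```python
query = "@.price && @.title"
pos = 8  # hypothetical pointer position, chosen for this example
# Tests "&&" at the offset without building query[pos:pos + 2].
assert query.startswith("&&", pos)
assert not query.startswith("||", pos)
```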
@@ -141,20 +133,25 @@ def ignore_whitespace(self) -> bool:
 
     def error(self, msg: str) -> None:
         """Emit an error token."""
-        # TODO: move msg out of Token.value. We'll need the value too when implementing
         # better error messages.
-        self.tokens.append(Token(TokenType.ERROR, msg, self.pos, self.query))
+        self.tokens.append(
+            Token(
+                TokenType.ERROR,
+                self.query[self.start : self.pos],
+                self.start,
+                self.query,
+                msg,
+            )
+        )
 
 
 StateFn = Callable[[Lexer], Optional["StateFn"]]
 
 
 def lex_root(l: Lexer) -> Optional[StateFn]:  # noqa: D103
-    # TODO: benchmark peek/next instead of next/backup
     c = l.next()
 
     if c != "$":
-        l.backup()
         l.error(f"expected '$', found {c!r}")
         return None
 
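error() now records the partial token value, its start index, and the message as separate fields. The Token class itself isn't part of this hunk; a sketch consistent with the constructor call above might look like this (field names are assumptions, not the project's definitions):

```python
from dataclasses import dataclass
from enum import Enum, auto
from typing import Optional


class TokenType(Enum):  # trimmed to the one member this sketch needs
    ERROR = auto()


@dataclass
class Token:
    # Field order mirrors the Token(...) call in error() above.
    type_: TokenType
    value: str
    index: int
    query: str
    message: Optional[str] = None
```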
@@ -184,9 +181,8 @@ def lex_segment(l: Lexer) -> Optional[StateFn]: # noqa: D103, PLR0911
         l.emit(TokenType.LBRACKET)
         return lex_inside_bracketed_segment
 
-    # default
-    l.backup()
     if l.filter_depth:
+        l.backup()
         return lex_inside_filter
 
     l.error(f"expected '.', '..' or a bracketed selection, found {c!r}")
@@ -208,21 +204,21 @@ def lex_descendant_segment(l: Lexer) -> Optional[StateFn]: # noqa: D103
         l.emit(TokenType.LBRACKET)
         return lex_inside_bracketed_segment
 
-    # default
     l.backup()
 
     if l.accept_match(RE_PROPERTY):
         l.emit(TokenType.PROPERTY)
         return lex_segment
 
+    l.next()
     l.error(f"unexpected descendant selection token {c!r}")
     return None
 
 
 def lex_shorthand_selector(l: Lexer) -> Optional[StateFn]:  # noqa: D103
     l.ignore()  # ignore dot
 
-    if l.ignore_whitespace():
+    if l.accept_match(RE_WHITESPACE):
         l.error("unexpected whitespace after dot")
         return None
 
@@ -322,11 +318,9 @@ def lex_inside_filter(l: Lexer) -> Optional[StateFn]: # noqa: D103, PLR0915, PL
             return lex_inside_bracketed_segment
 
         if c == "'":
-            # String literal
             return lex_single_quoted_string_inside_filter_expression
 
         if c == '"':
-            # String literal
             return lex_double_quoted_string_inside_filter_expression
 
         if c == "(":
@@ -392,62 +386,31 @@ def lex_inside_filter(l: Lexer) -> Optional[StateFn]: # noqa: D103, PLR0915, PL
             l.emit(TokenType.GT)
             continue
 
-        # default
         l.backup()
 
-        # numbers
-        # TODO: try accept_match(RE_FLOAT), including negative exponent
-        if l.accept_match(RE_INT):
-            if l.peek() == ".":
-                # A float
-                l.next()
-                if not l.accept_match(RE_INT):
-                    l.error("a fractional digit is required after a decimal point")
-                    return None
-
-                l.accept_match(RE_EXPONENT)
-                l.emit(TokenType.FLOAT)
-                continue
-
-            # An int, or float if exponent is negative
-            if l.accept_match(RE_NEGATIVE_EXPONENT):
-                l.emit(TokenType.FLOAT)
-            else:
-                l.accept_match(RE_EXPONENT)
-                l.emit(TokenType.INT)
-            continue
-
-        if l.accept_match(RE_AND):
+        if l.accept("&&"):
             l.emit(TokenType.AND)
-            continue
-
-        if l.accept_match(RE_OR):
+        elif l.accept("||"):
             l.emit(TokenType.OR)
-            continue
-
-        if l.accept_match(RE_TRUE):
+        elif l.accept("true"):
             l.emit(TokenType.TRUE)
-            continue
-
-        if l.accept_match(RE_FALSE):
+        elif l.accept("false"):
             l.emit(TokenType.FALSE)
-            continue
-
-        if l.accept_match(RE_NULL):
+        elif l.accept("null"):
             l.emit(TokenType.NULL)
-            continue
-
-        # functions
-        if l.accept_match(RE_FUNCTION_NAME) and l.peek() == "(":
+        elif l.accept_match(RE_FLOAT):
+            l.emit(TokenType.FLOAT)
+        elif l.accept_match(RE_INT):
+            l.emit(TokenType.INT)
+        elif l.accept_match(RE_FUNCTION_NAME) and l.peek() == "(":
             # Keep track of parentheses for this function call.
             l.paren_stack.append(1)
             l.emit(TokenType.FUNCTION)
             l.next()
             l.ignore()  # ignore LPAREN
-            continue
-
-        l.error(f"unexpected filter selector token {c!r}")
-        return None
+        else:
+            l.error(f"unexpected filter selector token {c!r}")
+            return None
 
 
 def lex_string_factory(quote: str, state: StateFn) -> StateFn:
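Note the ordering in the new elif chain: RE_FLOAT must be tried before RE_INT, because the integer pattern would happily consume the leading digits of a float. A small demonstration (again assuming the patterns from the top of the file):

```python
import re

RE_INT = re.compile(r"-?[0-9]+(?:[eE]\+?[0-9]+)?")
RE_FLOAT = re.compile(
    r"(?:-?[0-9]+\.[0-9]+(?:[eE][+-]?[0-9]+)?)|(-?[0-9]+[eE]-[0-9]+)"
)

# Matching "1.5" with RE_INT first would stop at "1", stranding ".5".
assert RE_INT.match("1.5").group() == "1"
assert RE_FLOAT.match("1.5").group() == "1.5"
```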
@@ -472,17 +435,15 @@ def _lex_string(l: Lexer) -> Optional[StateFn]:
             return state
 
         while True:
-            head = l.query[l.pos : l.pos + 2]
             c = l.next()
 
-            if head in ("\\\\", f"\\{quote}"):
-                l.next()
-                continue
-
-            # TODO: replace use of `head` with peek
-            if c == "\\" and not RE_ESCAPE.match(head):
-                l.error("invalid escape")
-                return None
+            if c == "\\":
+                peeked = l.peek()
+                if peeked in ESCAPES or peeked == quote:
+                    l.next()
+                else:
+                    l.error("invalid escape")
+                    return None
 
             if not c:
                 l.error(f"unclosed string starting at index {l.start}")
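RE_ESCAPE and the two-character head lookahead are replaced by a set membership test on the single peeked character. The check reduces to something like this (illustrative helper, not the commit's code):

```python
ESCAPES = frozenset(["b", "f", "n", "r", "t", "u", "/", "\\"])


def is_valid_escape(peeked: str, quote: str) -> bool:
    # The character after a backslash must be a recognised escape
    # sequence or the active quote character.
    return peeked in ESCAPES or peeked == quote


assert is_valid_escape("n", "'")
assert is_valid_escape("'", "'")  # escaped quote of the same kind
assert not is_valid_escape("x", "'")
```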
@@ -528,6 +489,6 @@ def tokenize(query: str) -> List[Token]:
     lexer.run()
 
     if tokens and tokens[-1].type_ == TokenType.ERROR:
-        raise JSONPathSyntaxError(tokens[-1].value, token=tokens[-1])
+        raise JSONPathSyntaxError(tokens[-1].message, token=tokens[-1])
 
     return tokens
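With the message now stored on the token, tokenize() raises using it directly. A hedged usage sketch, assuming tokenize and JSONPathSyntaxError are importable from this module:

```python
tokens = tokenize("$.store.book[?@.price < 10]")  # example query

try:
    tokenize("%.store")  # does not start with '$'
except JSONPathSyntaxError as err:
    # The exception carries the error token's message and position.
    print(err)
```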