|
1 | 1 | -module(elixir_tokenizer). |
2 | 2 | -include("elixir.hrl"). |
| 3 | +-include("elixir_tokenizer.hrl"). |
3 | 4 | -export([tokenize/1, tokenize/3, tokenize/4, invalid_do_error/1]). |
4 | 5 |
|
5 | | -%% Numbers |
6 | | --define(is_hex(S), (?is_digit(S) orelse (S >= $A andalso S =< $F) orelse (S >= $a andalso S =< $f))). |
7 | | --define(is_bin(S), (S >= $0 andalso S =< $1)). |
8 | | --define(is_octal(S), (S >= $0 andalso S =< $7)). |
9 | | - |
10 | | -%% Digits and letters |
11 | | --define(is_digit(S), (S >= $0 andalso S =< $9)). |
12 | | --define(is_upcase(S), (S >= $A andalso S =< $Z)). |
13 | | --define(is_downcase(S), (S >= $a andalso S =< $z)). |
14 | | - |
15 | | -%% Others |
16 | | --define(is_quote(S), (S =:= $" orelse S =:= $')). |
17 | | --define(is_sigil(S), (S =:= $/ orelse S =:= $< orelse S =:= $" orelse S =:= $' orelse |
18 | | - S =:= $[ orelse S =:= $( orelse S =:= ${ orelse S =:= $|)). |
19 | | - |
20 | | -%% Spaces |
21 | | --define(is_horizontal_space(S), (S =:= $\s orelse S =:= $\t)). |
22 | | --define(is_vertical_space(S), (S =:= $\r orelse S =:= $\n)). |
23 | | --define(is_space(S), (?is_horizontal_space(S) orelse ?is_vertical_space(S))). |
24 | | - |
25 | | -%% Operators |
26 | 6 | -define(at_op(T), |
27 | 7 | T =:= $@). |
28 | 8 |
|
@@ -198,9 +178,13 @@ tokenize([$0, $o, H | T], Line, Column, Scope, Tokens) when ?is_octal(H) -> |
198 | 178 | % Comments |
199 | 179 |
|
200 | 180 | tokenize([$# | String], Line, Column, Scope, Tokens) -> |
201 | | - {Rest, Comment} = tokenize_comment(String, [$#]), |
202 | | - preserve_comments(Line, Column, Tokens, Comment, Rest, Scope), |
203 | | - tokenize(Rest, Line, Column, Scope, reset_eol(Tokens)); |
| 181 | + case tokenize_comment(String, [$#]) of |
| 182 | + {error, Char} -> |
| 183 | + error_comment(Char, [$# | String], Line, Column, Scope, Tokens); |
| 184 | + {Rest, Comment} -> |
| 185 | + preserve_comments(Line, Column, Tokens, Comment, Rest, Scope), |
| 186 | + tokenize(Rest, Line, Column, Scope, reset_eol(Tokens)) |
| 187 | + end; |
204 | 188 |
|
205 | 189 | % Sigils |
206 | 190 |
|
@@ -645,9 +629,7 @@ tokenize([$% | T], Line, Column, Scope, Tokens) -> |
645 | 629 | tokenize(T, Line, Column + 1, Scope, [{'%', {Line, Column, nil}} | Tokens]); |
646 | 630 |
|
647 | 631 | tokenize([$. | T], Line, Column, Scope, Tokens) -> |
648 | | - DotInfo = {Line, Column, nil}, |
649 | | - {Rest, EndLine, EndColumn} = strip_dot_space(T, Line, Column + 1, [{'.', DotInfo}| Tokens], Scope), |
650 | | - handle_dot([$. | Rest], EndLine, EndColumn, DotInfo, Scope, Tokens); |
| 632 | + tokenize_dot(T, Line, Column + 1, {Line, Column, nil}, Scope, Tokens); |
651 | 633 |
|
652 | 634 | % Identifiers |
653 | 635 |
|
@@ -719,18 +701,23 @@ strip_horizontal_space([H | T], Counter) when ?is_horizontal_space(H) -> |
719 | 701 | strip_horizontal_space(T, Counter) -> |
720 | 702 | {T, Counter}. |
721 | 703 |
|
722 | | -strip_dot_space(T, Line, Column, Tokens, Scope) -> |
| 704 | +tokenize_dot(T, Line, Column, DotInfo, Scope, Tokens) -> |
723 | 705 | case strip_horizontal_space(T, 0) of |
724 | | - {"#" ++ R, _} -> |
725 | | - {Rest, Comment} = tokenize_comment(R, [$#]), |
726 | | - preserve_comments(Line, Column, Tokens, Comment, Rest, Scope), |
727 | | - strip_dot_space(Rest, Line, 1, reset_eol(Tokens), Scope); |
| 706 | + {[$# | R], _} -> |
| 707 | + case tokenize_comment(R, [$#]) of |
| 708 | + {error, Char} -> |
| 709 | + error_comment(Char, [$# | R], Line, Column, Scope, Tokens); |
| 710 | + |
| 711 | + {Rest, Comment} -> |
| 712 | + preserve_comments(Line, Column, Tokens, Comment, Rest, Scope), |
| 713 | + tokenize_dot(Rest, Line, 1, DotInfo, Scope, Tokens) |
| 714 | + end; |
728 | 715 | {"\r\n" ++ Rest, _} -> |
729 | | - strip_dot_space(Rest, Line + 1, 1, eol(Line, Column, Tokens), Scope); |
| 716 | + tokenize_dot(Rest, Line + 1, 1, DotInfo, Scope, Tokens); |
730 | 717 | {"\n" ++ Rest, _} -> |
731 | | - strip_dot_space(Rest, Line + 1, 1, eol(Line, Column, Tokens), Scope); |
| 718 | + tokenize_dot(Rest, Line + 1, 1, DotInfo, Scope, Tokens); |
732 | 719 | {Rest, Length} -> |
733 | | - {Rest, Line, Column + Length} |
| 720 | + handle_dot([$. | Rest], Line, Column + Length, DotInfo, Scope, Tokens) |
734 | 721 | end. |
735 | 722 |
|
736 | 723 | handle_char(0) -> {"\\0", "null byte"}; |
@@ -1171,11 +1158,18 @@ tokenize_comment("\r\n" ++ _ = Rest, Acc) -> |
1171 | 1158 | {Rest, lists:reverse(Acc)}; |
1172 | 1159 | tokenize_comment("\n" ++ _ = Rest, Acc) -> |
1173 | 1160 | {Rest, lists:reverse(Acc)}; |
| 1161 | +tokenize_comment([H | _Rest], _) when ?bidi(H) -> |
| 1162 | + {error, H}; |
1174 | 1163 | tokenize_comment([H | Rest], Acc) -> |
1175 | 1164 | tokenize_comment(Rest, [H | Acc]); |
1176 | 1165 | tokenize_comment([], Acc) -> |
1177 | 1166 | {[], lists:reverse(Acc)}. |
1178 | 1167 |
|
| 1168 | +error_comment(H, Comment, Line, Column, Scope, Tokens) -> |
| 1169 | + Token = io_lib:format("\\u~4.16.0B", [H]), |
| 1170 | + Reason = {Line, Column, "invalid bidirectional formatting character in comment: ", Token}, |
| 1171 | + error(Reason, Comment, Scope, Tokens). |
| 1172 | + |
1179 | 1173 | preserve_comments(Line, Column, Tokens, Comment, Rest, Scope) -> |
1180 | 1174 | case Scope#elixir_tokenizer.preserve_comments of |
1181 | 1175 | Fun when is_function(Fun) -> |
|
0 commit comments