Revision | 376428dbc496a40f77de92078a24ef954d49832f (tree) |
---|---|
Time | 2023-04-03 02:42:49 |
Author | dyknon <dyknon@user...> |
Commiter | dyknon |
Optimized
@@ -0,0 +1,4 @@ | ||
1 | +test: | |
2 | + perl -mlib=lib t/empty_root_object.t | |
3 | + perl -mlib=lib t/additional_string.t | |
4 | + perl -mlib=lib t/hjson_official.t |
@@ -4,6 +4,9 @@ use warnings; | ||
4 | 4 | use strict; |
5 | 5 | use feature qw(postderef); |
6 | 6 | use Carp; |
7 | +use Text::Hjson::Grammar; | |
8 | + | |
9 | +our $grammar = "Text::Hjson::Grammar"; | |
7 | 10 | |
8 | 11 | our $ws = qr/[\t\r\n ]/; |
9 | 12 | our $inline_ws = qr/[\t\r ]/; |
@@ -29,11 +32,11 @@ sub null($){ | ||
29 | 32 | my ($self) = @_; |
30 | 33 | undef |
31 | 34 | } |
32 | -sub array(@){ | |
35 | +sub array($@){ | |
33 | 36 | my ($self, @inner) = @_; |
34 | 37 | \@inner |
35 | 38 | } |
36 | -sub object(@){ | |
39 | +sub object($@){ | |
37 | 40 | my ($self, @inner) = @_; |
38 | 41 | my %ret; |
39 | 42 | for(@inner){ |
@@ -41,176 +44,72 @@ sub object(@){ | ||
41 | 44 | } |
42 | 45 | \%ret |
43 | 46 | } |
47 | +sub string($$){ | |
48 | + my ($self, $str) = @_; | |
49 | + $str | |
50 | +} | |
51 | +sub number($$){ | |
52 | + my ($self, $num) = @_; | |
53 | + $num | |
54 | +} | |
55 | +sub decode_quoted_string($$){ | |
56 | + my ($self, $str) = @_; | |
44 | 57 | |
45 | -our %element = ( | |
46 | - literal => { | |
47 | - re => qr/$wsc(true|false|null)$litend/, | |
48 | - act => sub{ | |
49 | - my ($self, $match) = @_; | |
50 | - { | |
51 | - true => $self->true, | |
52 | - false => $self->false, | |
53 | - null => $self->null, | |
54 | - }->{$match->[0]} | |
55 | - } | |
56 | - }, | |
57 | - separator => {re => qr/$separator/}, | |
58 | - optional_sep => {re => qr/$separator?/}, | |
59 | - # XXX: not docuemnted, but tests say quoteless string | |
60 | - # starting with quote is not allowed. | |
61 | - quoteless_guard => {re => qr/$wsc(?!["'])/}, | |
62 | - value => { | |
63 | - any => [qw(literal object array number string)], | |
64 | - act => sub{$_[2]} | |
65 | - }, | |
66 | - root => { | |
67 | - any => [qw(object_member_arr value)], | |
68 | - act => sub{$_[2]} | |
69 | - }, | |
70 | - object => { | |
71 | - seq => [qw(begin_object object_inner end_object)], | |
72 | - act => sub{$_[2]} | |
73 | - }, | |
74 | - begin_object => {re => qr/$wsc\{/}, | |
75 | - end_object => {re => qr/$wsc\}/}, | |
76 | - object_inner => { | |
77 | - opt => "object_member_arr", | |
78 | - act => sub{ | |
79 | - my ($self, $match) = @_; | |
80 | - $match // $self->object; | |
81 | - } | |
82 | - }, | |
83 | - object_member_arr => { | |
84 | - seq => [qw(object_member object_members optional_sep)], | |
85 | - act => sub{ | |
86 | - my ($self, $first, $left) = @_; | |
87 | - $self->object($first, @$left) | |
88 | - } | |
89 | - }, | |
90 | - object_member => { | |
91 | - seq => [qw(object_key object_key_sep value)], | |
92 | - act => sub{ | |
93 | - my ($self, $key, $sep, $val) = @_; | |
94 | - [$key, $val] | |
95 | - } | |
96 | - }, | |
97 | - object_members => { | |
98 | - many => "separated_object_member", | |
99 | - act => sub{shift; [@_]} | |
100 | - }, | |
101 | - separated_object_member => { | |
102 | - seq => [qw(separator object_member)], | |
103 | - act => sub{$_[2]} | |
104 | - }, | |
105 | - object_key => { | |
106 | - any => [qw(string_quoted object_key_noquote)], | |
107 | - act => sub{$_[2]} | |
108 | - }, | |
109 | - object_key_noquote => { | |
110 | - seq => [qw(quoteless_guard object_key_noquote_body)], | |
111 | - act => sub{$_[2]} | |
112 | - }, | |
113 | - object_key_noquote_body => { | |
114 | - re => qr/($nonpunct+)/, | |
115 | - act => sub{$_[1][0]} | |
116 | - }, | |
117 | - object_key_sep => {re => qr/$wsc:/}, | |
118 | - array => { | |
119 | - seq => [qw(begin_array array_inner end_array)], | |
120 | - act => sub{$_[2]} | |
121 | - }, | |
122 | - begin_array => {re => qr/$wsc\[/}, | |
123 | - end_array => {re => qr/$wsc\]/}, | |
124 | - array_inner => { | |
125 | - opt => "array_member_arr", | |
126 | - act => sub{ | |
127 | - my ($self, $match) = @_; | |
128 | - $match // $self->array; | |
129 | - } | |
130 | - }, | |
131 | - array_member_arr => { | |
132 | - seq => [qw(value array_members optional_sep)], | |
133 | - act => sub{ | |
134 | - my ($self, $first, $left) = @_; | |
135 | - $self->array($first, @$left) | |
136 | - } | |
137 | - }, | |
138 | - array_members => { | |
139 | - many => "separated_array_member", | |
140 | - act => sub{shift; [@_]} | |
141 | - }, | |
142 | - separated_array_member => { | |
143 | - seq => [qw(separator value)], | |
144 | - act => sub{$_[2]} | |
145 | - }, | |
146 | - number => { | |
147 | - re => qr/$wsc(-)?+((?:0|[1-9][0-9]*+)(?:\.[0-9]++)?+)(?:[eE]([-+]?+[0-9]++))?+$litend/, | |
148 | - act => sub{ | |
149 | - my ($self, $match) = @_; | |
150 | - my $v = $match->[1] * 10**($match->[2] // 0); | |
151 | - $match->[0] ? -$v : $v | |
152 | - } | |
153 | - }, | |
154 | - string => { | |
155 | - any => [qw(string_multiline string_quoted string_noquote)], | |
156 | - act => sub{$_[2]} | |
157 | - }, | |
158 | - # XXX: not in RFC but quote by ' is allowed. | |
159 | - string_quoted => { | |
160 | - re => qr/$wsc(["'])((?:(?!\g{1})[^\0-\x1f\\]|\\["'\\\/bfnrt]|\\u[0-9A-Fa-f]{4})*+)\g{1}/, | |
161 | - act => sub{ | |
162 | - my ($self, $match) = @_; | |
163 | - $match->[1] =~ s{\\(u(.{4})|.)}{ | |
164 | - { | |
165 | - '"' => '"', | |
166 | - "'" => "'", | |
167 | - "\\" => "\\", | |
168 | - "/" => "/", | |
169 | - "b" => "\b", | |
170 | - "f" => "\f", | |
171 | - "n" => "\n", | |
172 | - "r" => "\r", | |
173 | - "t" => "\t", | |
174 | - }->{$1} // chr(hex($2)) | |
175 | - }ger; | |
176 | - } | |
177 | - }, | |
178 | - string_noquote => { | |
179 | - seq => [qw(quoteless_guard string_noquote_body)], | |
180 | - act => sub{$_[2]} | |
181 | - }, | |
182 | - string_noquote_body => { | |
183 | - re => qr/(${nonpunct}(?:\t|[^\0-\x1f])*)/, | |
184 | - act => sub{ | |
185 | - my ($self, $match) = @_; | |
186 | - $match->[0] =~ s/$inline_ws*+\z//r; | |
58 | + # process \uxxxx sequences | |
59 | + my $newstr = ""; | |
60 | + my $pos = 0; | |
61 | + while($str =~ /\\u/g){ | |
62 | + $newstr .= substr($str, $pos, $-[0] - $pos); | |
63 | + my $start_hex = $+[0]; | |
64 | + my $hex = substr($str, $start_hex, 4); | |
65 | + die "imcomplete \\u sequence" if(4 != length $hex); | |
66 | + die "non-hex char in \\u sequence" if($hex =~ /[^0-9A-Fa-f]/); | |
67 | + my $code = hex($hex); | |
68 | + die "low surrogate prefixed by no high surrogate in \\u sequence" | |
69 | + if(0xdc00 <= $code < 0xe000); | |
70 | + if(0xd800 <= $code < 0xdc00){ | |
71 | + die "high surrogate suffixed by no \\u in \\u sequence" | |
72 | + if(substr($str, $start_hex + 4, 2) ne "\\u"); | |
73 | + my $hex = substr($str, $start_hex + 6, 4); | |
74 | + die "imcomplete \\u sequence" if(4 != length $hex); | |
75 | + die "non-hex char in \\u sequence" if($hex =~ /[^0-9A-Fa-f]/); | |
76 | + my $lowcode = hex($hex); | |
77 | + die "high surrogate suffixed by no low surrogate in \\u sequence" | |
78 | + unless(0xdc00 <= $lowcode < 0xe000); | |
79 | + $newstr .= chr((($code & 0x3ff) + 0x40) << 10 | $lowcode & 0x3ff); | |
80 | + pos($str) = $pos = $start_hex + 10; | |
81 | + }else{ | |
82 | + $newstr .= chr($code); | |
83 | + pos($str) = $pos = $start_hex + 4; | |
187 | 84 | } |
188 | - }, | |
189 | - string_multiline => { | |
190 | - re => qr/($wsc)'''((?:(?!''')$any)*+)'''/, | |
191 | - act => sub{ | |
192 | - my ($self, $match) = @_; | |
193 | - my $indent = length($match->[0] =~ s/.*\n//r); | |
194 | - my $s = $match->[1] =~ s/\r//r; | |
85 | + } | |
86 | + $str = $newstr . substr($str, $pos) if(length $newstr); | |
195 | 87 | |
196 | - # XXX: What is "column" | |
197 | - $s =~ s/(?<=\n)${inline_ws}{0,$indent}//g; | |
198 | - # XXX: RFC says: Whitespace on the first line is ignored. | |
199 | - # my understanding: first line is ignored if it contains only ws. | |
200 | - $s =~ s/^$inline_ws*\n//; | |
88 | + # process other escape sequences | |
89 | + my %escape = ( | |
90 | + '"' => '"', | |
91 | + "'" => "'", | |
92 | + "\\" => "\\", | |
93 | + "/" => "/", | |
94 | + "b" => "\b", | |
95 | + "f" => "\f", | |
96 | + "n" => "\n", | |
97 | + "r" => "\r", | |
98 | + "t" => "\t", | |
99 | + ); | |
100 | + $str =~ s{\\(.)}{$escape{$1} // die "invalid escape sequence \\$1";}eg; | |
201 | 101 | |
202 | - $s =~ s/\n\z//r | |
203 | - } | |
204 | - }, | |
205 | -); | |
102 | + $str | |
103 | +} | |
206 | 104 | |
207 | 105 | sub consume($$$$){ |
208 | 106 | my ($self, $what, $in, $at) = @_; |
209 | - my $act = sub{scalar ($element{$what}{act} // sub{})->($self, @_)}; | |
210 | - croak "unknown consume type: $what" if(!$element{$what}); | |
211 | - if($element{$what}{re}){ | |
107 | + croak "unknown grammar: $what" if(!$grammar->can($what)); | |
108 | + my %g = $grammar->$what(); | |
109 | + my $act = sub{scalar ($g{act} // sub{})->($self, @_)}; | |
110 | + if($g{re}){ | |
212 | 111 | pos($in) = $at; |
213 | - if($in =~ /\G$element{$what}{re}/g){ | |
112 | + if($in =~ /\G$g{re}/g){ | |
214 | 113 | ( |
215 | 114 | pos($in), |
216 | 115 | $act->([@{^CAPTURE}]) |
@@ -218,47 +117,266 @@ sub consume($$$$){ | ||
218 | 117 | }else{ |
219 | 118 | () |
220 | 119 | } |
221 | - }elsif($element{$what}{seq}){ | |
120 | + }elsif($g{seq}){ | |
222 | 121 | my @got; |
223 | - for($element{$what}{seq}->@*){ | |
122 | + for($g{seq}->@*){ | |
224 | 123 | ($at, my $read) = $self->consume($_, $in, $at); |
225 | 124 | return () if(!defined $at); |
226 | 125 | push @got, $read; |
227 | 126 | } |
228 | 127 | ($at, $act->(@got)) |
229 | - }elsif($element{$what}{any}){ | |
128 | + }elsif($g{any}){ | |
230 | 129 | my ($i, $bt, $read); |
231 | - for $i(0 .. $element{$what}{any}->$#*){ | |
232 | - ($bt, $read) = $self->consume($element{$what}{any}[$i], $in, $at); | |
130 | + for $i(0 .. $g{any}->$#*){ | |
131 | + ($bt, $read) = $self->consume($g{any}[$i], $in, $at); | |
233 | 132 | if(defined $bt){ |
234 | 133 | return ($bt, $act->($i, $read)); |
235 | 134 | } |
236 | 135 | } |
237 | 136 | () |
238 | - }elsif($element{$what}{many}){ | |
137 | + }elsif($g{many}){ | |
239 | 138 | my $bt; |
240 | 139 | my @got; |
241 | 140 | while(1){ |
242 | - ($bt, my $read) = $self->consume($element{$what}{many}, $in, $at); | |
141 | + ($bt, my $read) = $self->consume($g{many}, $in, $at); | |
243 | 142 | last if(!defined $bt); |
244 | 143 | die "infinite loop detected in $what" if($at == $bt); |
245 | 144 | $at = $bt; |
246 | 145 | push @got, $read; |
247 | 146 | } |
248 | 147 | ($at, $act->(@got)) |
249 | - }elsif($element{$what}{opt}){ | |
250 | - my ($bt, $read) = $self->consume($element{$what}{opt}, $in, $at); | |
148 | + }elsif($g{opt}){ | |
149 | + my ($bt, $read) = $self->consume($g{opt}, $in, $at); | |
251 | 150 | ($bt//$at, $act->($read)) |
252 | 151 | }else{ |
253 | - die "what is $what"; | |
152 | + die "broken grammar"; | |
153 | + } | |
154 | +} | |
155 | + | |
156 | +sub consume_value($$$){ | |
157 | + my ($self, $in, $at) = @_; | |
158 | + my $s = substr($in, $at, 1); | |
159 | + my @try; | |
160 | + if($s eq "{"){ | |
161 | + @try = qw(consume_object); | |
162 | + }elsif($s eq "["){ | |
163 | + @try = qw(consume_array); | |
164 | + }elsif(ord(0) <= ord($s) <= ord(9) || $s eq "-"){ | |
165 | + @try = qw(consume_number consume_string); | |
166 | + }elsif($s eq "t" || $s eq "f" || $s eq "n"){ | |
167 | + @try = qw(consume_literal consume_string); | |
168 | + }else{ | |
169 | + @try = qw(consume_string); | |
170 | + } | |
171 | + for(@try){ | |
172 | + my ($bt, $value) = $self->$_($in, $at); | |
173 | + return ($bt, $value) if(defined $bt); | |
174 | + } | |
175 | + () | |
176 | +} | |
177 | + | |
178 | +sub consume_root($$$){ | |
179 | + my ($self, $in, $at) = @_; | |
180 | + my $s = substr($in, $at, 1); | |
181 | + my @try; | |
182 | + if($s eq "{"){ | |
183 | + @try = qw(consume_object); | |
184 | + }elsif($s eq "["){ | |
185 | + @try = qw(consume_array); | |
186 | + }elsif(ord(0) <= ord($s) <= ord(1) || $s eq "-"){ | |
187 | + @try = qw(consume_root_object consume_number consume_string); | |
188 | + }elsif($s eq "t" || $s eq "f" || $s eq "n"){ | |
189 | + @try = qw(consume_root_object consume_literal consume_string); | |
190 | + }else{ | |
191 | + @try = qw(consume_root_object consume_string); | |
192 | + } | |
193 | + for(@try){ | |
194 | + my ($bt, $value) = $self->$_($in, $at); | |
195 | + return ($bt, $value) if(defined $bt); | |
196 | + } | |
197 | + () | |
198 | +} | |
199 | + | |
200 | +sub consume_literal($$$){ | |
201 | + my ($self, $in, $at) = @_; | |
202 | + $self->{cre_literal} //= qr/\G(true|false|null)$litend/; | |
203 | + pos($in) = $at; | |
204 | + if($in =~ /$self->{cre_literal}/g){ | |
205 | + (pos($in), { | |
206 | + true => $self->true, | |
207 | + false => $self->false, | |
208 | + null => $self->null, | |
209 | + }->{$1}) | |
210 | + }else{()} | |
211 | +} | |
212 | + | |
213 | +sub consume_number($$$){ | |
214 | + my ($self, $in, $at) = @_; | |
215 | + $self->{cre_number} //= qr/\G(-)?+((?:0|[1-9][0-9]*+)(?:\.[0-9]++)?+)(?:[eE]([-+]?+[0-9]++))?+$litend/; | |
216 | + | |
217 | + pos($in) = $at; | |
218 | + if($in =~ /$self->{cre_number}/g){ | |
219 | + my $v = $2 * 10**($3 // 0); | |
220 | + (pos($in), $self->number($1 ? -$v : $v)) | |
221 | + }else{()} | |
222 | +} | |
223 | + | |
224 | +sub consume_string($$$){ | |
225 | + my ($self, $in, $at) = @_; | |
226 | + $self->{cre_dquoted_string} //= qr/\G"((?:[^\0-\x1f\\"]|\\.)*+)"/; | |
227 | + $self->{cre_squoted_string} //= qr/\G'((?:[^\0-\x1f\\']|\\.)*+)'/; | |
228 | + $self->{cre_quoteless_string} //= qr/\G([^'"\0-\x20,:\[\]{}](?:\t|[^\0-\x1f])*)/; | |
229 | + my $start1 = substr($in, $at, 1); | |
230 | + if($start1 eq "'"){ | |
231 | + my $start3 = substr($in, $at, 3); | |
232 | + if($start3 eq "'''"){ | |
233 | + pos($in) = $at; | |
234 | + die if($in !~ /($inline*)\G/); | |
235 | + my $column = length($1); | |
236 | + pos($in) = $at + 3; | |
237 | + return if($in !~ /\G($any*?)'''/g); | |
238 | + my $str = $1; | |
239 | + $str =~ s/\r//g; | |
240 | + # XXX: it may not compatible with js-Hjson. | |
241 | + # but I don't understand 'column' well. | |
242 | + $str =~ s/(?<=\n)${inline_ws}{0,$column}//g; | |
243 | + # XXX: it is not compatible with js-Hjson. but I think its better. | |
244 | + $str =~ s/^$inline_ws*\n//; | |
245 | + $str =~ s/\n\z//; | |
246 | + (pos($in), $str) | |
247 | + }else{ | |
248 | + pos($in) = $at; | |
249 | + return if($in !~ /$self->{cre_squoted_string}/g); | |
250 | + (pos($in), $self->decode_quoted_string($1)) | |
251 | + } | |
252 | + }elsif($start1 eq '"'){ | |
253 | + pos($in) = $at; | |
254 | + return if($in !~ /$self->{cre_dquoted_string}/g); | |
255 | + (pos($in), $self->decode_quoted_string($1)) | |
256 | + }else{ | |
257 | + pos($in) = $at; | |
258 | + return if($in !~ /$self->{cre_quoteless_string}/g); | |
259 | + (pos($in), $1 =~ s/$inline_ws*+\z//r) | |
260 | + } | |
261 | +} | |
262 | + | |
263 | +sub consume_object($$$){ | |
264 | + my ($self, $in, $at) = @_; | |
265 | + $self->{cre_wsc} //= qr/\G$wsc/; | |
266 | + | |
267 | + return if(substr($in, $at++, 1) ne "{"); | |
268 | + pos($in) = $at; | |
269 | + die if($in !~ /$self->{cre_wsc}/g); | |
270 | + ($at, my @content) = $self->consume_object_inner($in, pos($in)); | |
271 | + return if(!defined $at); | |
272 | + pos($in) = $at; | |
273 | + die if($in !~ /$self->{cre_wsc}/g); | |
274 | + return if(substr($in, pos($in), 1) ne "}"); | |
275 | + (pos($in)+1, $self->object(@content)) | |
276 | +} | |
277 | + | |
278 | +sub consume_root_object($$$){ | |
279 | + my ($self, $in, $at) = @_; | |
280 | + ($at, my @content) = $self->consume_object_inner($in, $at); | |
281 | + return if(!defined $at || !@content); | |
282 | + ($at, $self->object(@content)) | |
283 | +} | |
284 | + | |
285 | +sub consume_object_inner($$$){ | |
286 | + my ($self, $in, $at) = @_; | |
287 | + $self->{cre_wsc} //= qr/\G$wsc/; | |
288 | + $self->{cre_quoteless_key} //= qr/\G([^'"\0-\x20,:\[\]{}][^\0-\x20,:\[\]{}]*)/; | |
289 | + $self->{cre_dquoted_string} //= qr/\G"((?:[^\0-\x1f\\"]|\\.)*+)"/; | |
290 | + $self->{cre_squoted_string} //= qr/\G'((?:[^\0-\x1f\\']|\\.)*+)'/; | |
291 | + $self->{cre_value_separator} //= qr/\G$separator/; | |
292 | + | |
293 | + my @ret; | |
294 | + pos($in) = $at; | |
295 | + while(1){ | |
296 | + my $keystart = substr($in, pos($in), 1); | |
297 | + return ($at, @ret) if($keystart eq "}" || $keystart eq ""); | |
298 | + $at = pos($in); | |
299 | + my $key; | |
300 | + if($keystart eq '"'){ | |
301 | + return if($in !~ /$self->{cre_dquoted_string}/g); $at = pos($in); | |
302 | + $key = $self->decode_quoted_string($1); | |
303 | + }elsif($keystart eq "'"){ | |
304 | + return if($in !~ /$self->{cre_squoted_string}/g); $at = pos($in); | |
305 | + $key = $self->decode_quoted_string($1); | |
306 | + }else{ | |
307 | + return if($in !~ /$self->{cre_quoteless_key}/g); $at = pos($in); | |
308 | + $key = $1; | |
309 | + } | |
310 | + | |
311 | + pos($in) = $at; die if($in !~ /$self->{cre_wsc}/g); $at = pos($in); | |
312 | + return if(substr($in, $at++, 1) ne ":"); | |
313 | + pos($in) = $at; die if($in !~ /$self->{cre_wsc}/g); $at = pos($in); | |
314 | + | |
315 | + ($at, my $value) = $self->consume_value($in, $at); pos($in) = $at; | |
316 | + return if(!defined $at); | |
317 | + | |
318 | + push(@ret, [$key, $value]); | |
319 | + last if($in !~ /$self->{cre_value_separator}/g); $at = pos($in); | |
320 | + die if($in !~ /$self->{cre_wsc}/g); | |
321 | + } | |
322 | + ($at, @ret) | |
323 | +} | |
324 | + | |
325 | +sub consume_array($$$){ | |
326 | + my ($self, $in, $at) = @_; | |
327 | + $self->{cre_wsc} //= qr/\G$wsc/; | |
328 | + $self->{cre_value_separator} //= qr/\G$separator/; | |
329 | + | |
330 | + return if(substr($in, $at++, 1) ne "["); | |
331 | + pos($in) = $at; | |
332 | + die if($in !~ /$self->{cre_wsc}/g); $at = pos($in); | |
333 | + | |
334 | + my @content; | |
335 | + while(1){ | |
336 | + my $nextstart = substr($in, pos($in), 1); | |
337 | + if($nextstart eq "]"){ | |
338 | + return (pos($in)+1, $self->array(@content)); | |
339 | + } | |
340 | + return if($nextstart eq ""); | |
341 | + $at = pos($in); | |
342 | + | |
343 | + ($at, my $value) = $self->consume_value($in, $at); pos($in) = $at; | |
344 | + return if(!defined $at); | |
345 | + | |
346 | + push(@content, $value); | |
347 | + if($in !~ /$self->{cre_value_separator}/g){ | |
348 | + pos($in) = $at; | |
349 | + die if($in !~ /$self->{cre_wsc}/g); | |
350 | + return if(substr($in, pos($in), 1) ne "]"); | |
351 | + return (pos($in)+1, $self->array(@content)); | |
352 | + } | |
353 | + $at = pos($in); | |
354 | + die if($in !~ /$self->{cre_wsc}/g); | |
355 | + } | |
356 | +} | |
357 | + | |
358 | +sub consume_wsc($$$){ | |
359 | + my ($self, $in, $at) = @_; | |
360 | + $self->{cre_wsc} //= qr/\G$wsc/; | |
361 | + pos($in) = $at; | |
362 | + if($in =~ /$self->{cre_wsc}/g){ | |
363 | + (pos($in), undef) | |
364 | + }else{ | |
365 | + die "bug"; | |
254 | 366 | } |
255 | 367 | } |
256 | 368 | |
257 | 369 | sub decode($$){ |
258 | 370 | my ($self, $in) = @_; |
259 | - my ($end, $value) = $self->consume("root", $in, 0); | |
260 | - croak "Invalid Hjson string" if(!defined $end); | |
261 | - pos($in) = $end; | |
262 | - croak "Garbage after Hjson value" if($in !~ /\G$wsc\z/g); | |
371 | + $self = bless {}, $self if(!length ref $self); | |
372 | + my ($p,) = $self->consume_wsc($in, 0); | |
373 | + die if(!defined $p); | |
374 | + ($p, my $value) = $self->consume_root($in, $p); | |
375 | + croak "Invalid Hjson string" if(!defined $p); | |
376 | + ($p,) = $self->consume_wsc($in, $p); | |
377 | + die if(!defined $p); | |
378 | + croak "Garbage after Hjson value" if($p != length($in)); | |
263 | 379 | $value |
264 | 380 | } |
381 | + | |
382 | +1; |
@@ -0,0 +1,181 @@ | ||
1 | +package Text::Hjson::Grammar; | |
2 | + | |
3 | +use warnings; | |
4 | +use strict; | |
5 | +use feature qw(postderef); | |
6 | + | |
7 | +our $ws = qr/[\t\r\n ]/; | |
8 | +our $inline_ws = qr/[\t\r ]/; | |
9 | +our $inline = qr/$inline_ws|[^\0-\x20]/; | |
10 | +our $any = qr/$ws|[^\0-\x20]/; | |
11 | +our $line_comment = qr/(?:#|\/\/)$inline*+/; | |
12 | +our $block_comment = qr/\/\*(?:[^*]|\*(?!\/))*+\*\//; | |
13 | +our $comment = qr/$line_comment|$block_comment/; | |
14 | +our $wsc = qr/(?:$ws|$comment)*+/; | |
15 | +our $litend = qr/(?![ \t]*+[^\0-\x20#,\/\[\]{}])/; | |
16 | +our $separator = qr/$wsc,|(?:$inline_ws|$comment)*+\n/; | |
17 | +our $nonpunct = qr/[^\0-\x20,:\[\]{}]/; | |
18 | + | |
19 | +sub literal{ | |
20 | + re => qr/$wsc(true|false|null)$litend/, | |
21 | + act => sub{ | |
22 | + my ($self, $match) = @_; | |
23 | + { | |
24 | + true => $self->true, | |
25 | + false => $self->false, | |
26 | + null => $self->null, | |
27 | + }->{$match->[0]} | |
28 | + } | |
29 | +} | |
30 | +sub separator{re => qr/$separator/} | |
31 | +sub optional_sep{re => qr/$separator?/} | |
32 | +# XXX: not docuemnted, but tests say quoteless string | |
33 | +# starting with quote is not allowed. | |
34 | +sub quoteless_guard{re => qr/$wsc(?!["'])/} | |
35 | +sub value{ | |
36 | + any => [qw(literal object array number string)], | |
37 | + act => sub{$_[2]} | |
38 | +} | |
39 | +sub root{ | |
40 | + any => [qw(object_member_arr value)], | |
41 | + act => sub{$_[2]} | |
42 | +} | |
43 | +sub object{ | |
44 | + seq => [qw(begin_object object_inner end_object)], | |
45 | + act => sub{$_[2]} | |
46 | +} | |
47 | +sub begin_object{re => qr/$wsc\{/} | |
48 | +sub end_object{re => qr/$wsc\}/} | |
49 | +sub object_inner{ | |
50 | + opt => "object_member_arr", | |
51 | + act => sub{ | |
52 | + my ($self, $match) = @_; | |
53 | + $match // $self->object | |
54 | + } | |
55 | +} | |
56 | +sub object_member_arr{ | |
57 | + seq => [qw(object_member object_members optional_sep)], | |
58 | + act => sub{ | |
59 | + my ($self, $first, $left) = @_; | |
60 | + $self->object($first, @$left) | |
61 | + } | |
62 | +} | |
63 | +sub object_member{ | |
64 | + seq => [qw(object_key object_key_sep value)], | |
65 | + act => sub{ | |
66 | + my ($self, $key, $sep, $val) = @_; | |
67 | + [$key, $val] | |
68 | + } | |
69 | +} | |
70 | +sub object_members{ | |
71 | + many => "separated_object_member", | |
72 | + act => sub{shift; [@_]} | |
73 | +} | |
74 | +sub separated_object_member{ | |
75 | + seq => [qw(separator object_member)], | |
76 | + act => sub{$_[2]} | |
77 | +} | |
78 | +sub object_key{ | |
79 | + any => [qw(string_quoted object_key_noquote)], | |
80 | + act => sub{$_[2]} | |
81 | +} | |
82 | +sub object_key_noquote{ | |
83 | + seq => [qw(quoteless_guard object_key_noquote_body)], | |
84 | + act => sub{$_[2]} | |
85 | +} | |
86 | +sub object_key_noquote_body{ | |
87 | + re => qr/($nonpunct+)/, | |
88 | + act => sub{$_[1][0]} | |
89 | +} | |
90 | +sub wsc{re => qr/$wsc/} | |
91 | +sub object_key_sep{re => qr/$wsc:/} | |
92 | +sub array{ | |
93 | + seq => [qw(begin_array array_inner end_array)], | |
94 | + act => sub{$_[2]} | |
95 | +} | |
96 | +sub begin_array{re => qr/$wsc\[/} | |
97 | +sub end_array{re => qr/$wsc\]/} | |
98 | +sub array_inner{ | |
99 | + opt => "array_member_arr", | |
100 | + act => sub{ | |
101 | + my ($self, $match) = @_; | |
102 | + $match // $self->array; | |
103 | + } | |
104 | +} | |
105 | +sub array_member_arr{ | |
106 | + seq => [qw(value array_members optional_sep)], | |
107 | + act => sub{ | |
108 | + my ($self, $first, $left) = @_; | |
109 | + $self->array($first, @$left) | |
110 | + } | |
111 | +} | |
112 | +sub array_members{ | |
113 | + many => "separated_array_member", | |
114 | + act => sub{shift; [@_]} | |
115 | +} | |
116 | +sub separated_array_member{ | |
117 | + seq => [qw(separator value)], | |
118 | + act => sub{$_[2]} | |
119 | +} | |
120 | +sub number{ | |
121 | + re => qr/$wsc(-)?+((?:0|[1-9][0-9]*+)(?:\.[0-9]++)?+)(?:[eE]([-+]?+[0-9]++))?+$litend/, | |
122 | + act => sub{ | |
123 | + my ($self, $match) = @_; | |
124 | + my $v = $match->[1] * 10**($match->[2] // 0); | |
125 | + $match->[0] ? -$v : $v | |
126 | + } | |
127 | +} | |
128 | +sub string{ | |
129 | + any => [qw(string_multiline string_quoted string_noquote)], | |
130 | + act => sub{$_[2]} | |
131 | +} | |
132 | +# XXX: not in RFC but quote by ' is allowed. | |
133 | +sub string_quoted{ | |
134 | + re => qr/$wsc(["'])((?:(?!\g{1})[^\0-\x1f\\]|\\.)*+)\g{1}/, | |
135 | + act => sub{ | |
136 | + my ($self, $match) = @_; | |
137 | + $self->string($self->decode_quoted_string($match->[1])) | |
138 | + #$match->[1] =~ s{\\(u(.{4})|.)}{ | |
139 | + # { | |
140 | + # '"' => '"', | |
141 | + # "'" => "'", | |
142 | + # "\\" => "\\", | |
143 | + # "/" => "/", | |
144 | + # "b" => "\b", | |
145 | + # "f" => "\f", | |
146 | + # "n" => "\n", | |
147 | + # "r" => "\r", | |
148 | + # "t" => "\t", | |
149 | + # }->{$1} // chr(hex($2)) | |
150 | + #}ger; | |
151 | + } | |
152 | +} | |
153 | +sub string_noquote{ | |
154 | + seq => [qw(quoteless_guard string_noquote_body)], | |
155 | + act => sub{$_[2]} | |
156 | +} | |
157 | +sub string_noquote_body{ | |
158 | + re => qr/(${nonpunct}(?:\t|[^\0-\x1f])*)/, | |
159 | + act => sub{ | |
160 | + my ($self, $match) = @_; | |
161 | + $match->[0] =~ s/$inline_ws*+\z//r; | |
162 | + } | |
163 | +} | |
164 | +sub string_multiline{ | |
165 | + re => qr/($wsc)'''((?:(?!''')$any)*+)'''/, | |
166 | + act => sub{ | |
167 | + my ($self, $match) = @_; | |
168 | + my $indent = length($match->[0] =~ s/.*\n//r); | |
169 | + my $s = $match->[1] =~ s/\r//r; | |
170 | + | |
171 | + # XXX: What is "column" | |
172 | + $s =~ s/(?<=\n)${inline_ws}{0,$indent}//g; | |
173 | + # XXX: RFC says: Whitespace on the first line is ignored. | |
174 | + # my understanding: first line is ignored if it contains only ws. | |
175 | + $s =~ s/^$inline_ws*\n//; | |
176 | + | |
177 | + $s =~ s/\n\z//r | |
178 | + } | |
179 | +} | |
180 | + | |
181 | +1; |
@@ -0,0 +1,61 @@ | ||
1 | +#!/usr/bin/perl | |
2 | + | |
3 | +use warnings; | |
4 | +use strict; | |
5 | +use utf8; | |
6 | +use Test2::V0; | |
7 | +use Text::Hjson; | |
8 | +use Data::Dumper; | |
9 | + | |
10 | +my $v; | |
11 | + | |
12 | +# quoteless strings with quotation marks | |
13 | +eval{Text::Hjson->decode("{v: 'invalid\n}")}; | |
14 | +ok($@, "quoteless string starting with single quote"); | |
15 | +eval{Text::Hjson->decode("{v: \"invalid\n}")}; | |
16 | +ok($@, "quoteless string starting with double quote"); | |
17 | +$v = eval{Text::Hjson->decode("{v: valid\"\n}")}; | |
18 | +is($v, {v => 'valid"'}, "quoteless string with quote at the end"); | |
19 | + | |
20 | +# quoteless object keys with quotation marks | |
21 | +eval{Text::Hjson->decode("{'invalid: v\n}")}; | |
22 | +ok($@, "quoteless object key starting with single quote"); | |
23 | +eval{Text::Hjson->decode("{\"invalid: v\n}")}; | |
24 | +ok($@, "quoteless object key starting with double quote"); | |
25 | +$v = eval{Text::Hjson->decode("{valid\": v\n}")}; | |
26 | +is($v, {'valid"' => "v"}, "quoteless object key with quote at the end"); | |
27 | + | |
28 | +# invalid escape sequence | |
29 | +eval{Text::Hjson->decode('"\z"')}; | |
30 | +ok($@, "invalid sequence \\z"); | |
31 | +eval{Text::Hjson->decode('"\!"')}; | |
32 | +ok($@, "invalid sequence \\!"); | |
33 | + | |
34 | +# surrogates | |
35 | +eval{Text::Hjson->decode('"\ud835"')}; | |
36 | +ok($@, "only high surrogate"); | |
37 | +eval{Text::Hjson->decode('"\ud835!"')}; | |
38 | +ok($@, "high surrogate before non-surrogate"); | |
39 | +eval{Text::Hjson->decode('"\ud835\u003f"')}; | |
40 | +ok($@, "high surrogate before non-surrogate \\u sequence"); | |
41 | +eval{Text::Hjson->decode('"HH:\ud835\ud835"')}; | |
42 | +ok($@, "repeating high surrogate"); | |
43 | +eval{Text::Hjson->decode('"L:\udc2c"')}; | |
44 | +ok($@, "only low surrogate"); | |
45 | +eval{Text::Hjson->decode('"L:\udc2c\udc2c"')}; | |
46 | +ok($@, "repeating low surrogate"); | |
47 | +eval{Text::Hjson->decode('"L:\udc2c\ud835\udc2c"')}; | |
48 | +ok($@, "low surrogate before high surrogate"); | |
49 | +eval{Text::Hjson->decode("'\\ud835\x{dc2c}'")}; | |
50 | +ok($@, "\\u encode only a half of a surrogate pair"); | |
51 | +$v = eval{Text::Hjson->decode('"pairs: \ud835\udc12\ud835\udc2e\ud835\udc2b\ud835\udc2b\ud835\udc28\ud835\udc20\ud835\udc1a\ud835\udc2d\ud835\udc1e\ud835\udc2c!!"')}; | |
52 | +print($@); | |
53 | +is($v, "pairs: 𝐒𝐮𝐫𝐫𝐨𝐠𝐚𝐭𝐞𝐬!!", "Correct surrogate pairs"); | |
54 | + | |
55 | +# CR | |
56 | +$v = eval{Text::Hjson->decode(" '''abc\r\nde\rf\n\rghi\rjkl\n\n'''")}; | |
57 | +is($v, "abc\ndef\nghijkl\n", "\\r in multiline string"); | |
58 | +$v = eval{Text::Hjson->decode("abc\r\n")}; | |
59 | +is($v, "abc", "\\r at the end of a quoteless string"); | |
60 | + | |
61 | +done_testing |
@@ -0,0 +1,15 @@ | ||
1 | +#!/usr/bin/perl | |
2 | + | |
3 | +use warnings; | |
4 | +use strict; | |
5 | +use utf8; | |
6 | +use Test2::V0; | |
7 | +use Text::Hjson; | |
8 | + | |
9 | +eval{Text::Hjson->decode("")}; | |
10 | +ok($@, "empty toplevel object"); | |
11 | + | |
12 | +eval{Text::Hjson->decode("#test\n")}; | |
13 | +ok($@, "only a comment"); | |
14 | + | |
15 | +done_testing; |