-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathTokenStream.h
2987 lines (2562 loc) · 107 KB
/
TokenStream.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
* vim: set ts=8 sts=2 et sw=2 tw=80:
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/*
* Streaming access to the raw tokens of JavaScript source.
*
* Because JS tokenization is context-sensitive -- a '/' could be either a
* regular expression *or* a division operator depending on context -- the
* various token stream classes are mostly not useful outside of the Parser
* where they reside. We should probably eventually merge the two concepts.
*/
#ifndef frontend_TokenStream_h
#define frontend_TokenStream_h
/*
* [SMDOC] Parser Token Stream
*
* A token stream exposes the raw tokens -- operators, names, numbers,
* keywords, and so on -- of JavaScript source code.
*
* These are the components of the overall token stream concept:
* TokenStreamShared, TokenStreamAnyChars, TokenStreamCharsBase<Unit>,
* TokenStreamChars<Unit>, and TokenStreamSpecific<Unit, AnyCharsAccess>.
*
* == TokenStreamShared → ∅ ==
*
* Certain aspects of tokenizing are used everywhere:
*
* * modifiers (used to select which context-sensitive interpretation of a
* character should be used to decide what token it is) and modifier
* assertion handling;
* * flags on the overall stream (have we encountered any characters on this
* line? have we hit a syntax error? and so on);
* * and certain token-count constants.
*
* These are all defined in TokenStreamShared. (They could be namespace-
* scoped, but it seems tentatively better not to clutter the namespace.)
*
* == TokenStreamAnyChars → TokenStreamShared ==
*
* Certain aspects of tokenizing have meaning independent of the character type
* of the source text being tokenized: line/column number information, tokens
* in lookahead from determining the meaning of a prior token, compilation
* options, the filename, flags, source map URL, access to details of the
* current and next tokens (is the token of the given type? what name or
* number is contained in the token? and other queries), and others.
*
* All this data/functionality *could* be duplicated for both single-byte and
* double-byte tokenizing, but there are two problems. First, it's potentially
* wasteful if the compiler doesn't recognize it can unify the concepts. (And
* if any-character concepts are intermixed with character-specific concepts,
* potentially the compiler *can't* unify them because offsets into the
* hypothetical TokenStream<Unit>s would differ.) Second, some of this stuff
* needs to be accessible in ParserBase, the aspects of JS language parsing
* that have meaning independent of the character type of the source text being
* parsed. So we need a separate data structure that ParserBase can hold on to
* for it. (ParserBase isn't the only instance of this, but it's certainly the
* biggest case of it.) Ergo, TokenStreamAnyChars.
*
* == TokenStreamCharsShared → ∅ ==
*
* Some functionality has meaning independent of character type, yet has no use
* *unless* you know the character type in actual use. It *could* live in
* TokenStreamAnyChars, but it makes more sense to live in a separate class
* that character-aware token information can simply inherit.
*
* This class currently exists only to contain a char16_t buffer, transiently
* used to accumulate strings in tricky cases that can't just be read directly
* from source text. It's not used outside character-aware tokenizing, so it
* doesn't make sense in TokenStreamAnyChars.
*
* == TokenStreamCharsBase<Unit> → TokenStreamCharsShared ==
*
* Certain data structures in tokenizing are character-type-specific: namely,
* the various pointers identifying the source text (including current offset
* and end).
*
* Additionally, some functions operating on this data are defined the same way
* no matter what character type you have (e.g. current offset in code units
* into the source text) or share a common interface regardless of character
* type (e.g. consume the next code unit if it has a given value).
*
* All such functionality lives in TokenStreamCharsBase<Unit>.
*
* == SpecializedTokenStreamCharsBase<Unit> → TokenStreamCharsBase<Unit> ==
*
* Certain tokenizing functionality is specific to a single character type.
* For example, JS's UTF-16 encoding recognizes no coding errors, because lone
* surrogates are not an error; but a UTF-8 encoding must recognize a variety
* of validation errors. Such functionality is defined only in the appropriate
* SpecializedTokenStreamCharsBase specialization.
*
* == GeneralTokenStreamChars<Unit, AnyCharsAccess> →
* SpecializedTokenStreamCharsBase<Unit> ==
*
* Some functionality operates differently on different character types, just
* as for TokenStreamCharsBase, but additionally requires access to character-
* type-agnostic information in TokenStreamAnyChars. For example, getting the
* next character performs different steps for different character types and
* must access TokenStreamAnyChars to update line break information.
*
* Such functionality, if it can be defined using the same algorithm for all
* character types, lives in GeneralTokenStreamChars<Unit, AnyCharsAccess>.
* The AnyCharsAccess parameter provides a way for a GeneralTokenStreamChars
* instance to access its corresponding TokenStreamAnyChars, without inheriting
* from it.
*
* GeneralTokenStreamChars<Unit, AnyCharsAccess> is just functionality, no
* actual member data.
*
* Such functionality all lives in TokenStreamChars<Unit, AnyCharsAccess>, a
* declared-but-not-defined template class whose specializations have a common
* public interface (plus whatever private helper functions are desirable).
*
* == TokenStreamChars<Unit, AnyCharsAccess> →
* GeneralTokenStreamChars<Unit, AnyCharsAccess> ==
*
* Some functionality is like that in GeneralTokenStreamChars, *but* it's
* defined entirely differently for different character types.
*
* For example, consider "match a multi-code unit code point" (hypothetically:
* we've only implemented two-byte tokenizing right now):
*
* * For two-byte text, there must be two code units to get, the leading code
* unit must be a UTF-16 lead surrogate, and the trailing code unit must be
* a UTF-16 trailing surrogate. (If any of these fail to hold, a next code
* unit encodes that code point and is not multi-code unit.)
* * For single-byte Latin-1 text, there are no multi-code unit code points.
* * For single-byte UTF-8 text, the first code unit must have N > 1 of its
* highest bits set (and the next unset), and |N - 1| successive code units
* must have their high bit set and next-highest bit unset, *and*
* concatenating all unconstrained bits together must not produce a code
* point value that could have been encoded in fewer code units.
*
* This functionality can't be implemented as member functions in
* GeneralTokenStreamChars because we'd need to *partially specialize* those
* functions -- hold Unit constant while letting AnyCharsAccess vary. But
* C++ forbids function template partial specialization like this: either you
* fix *all* parameters or you fix none of them.
*
* Fortunately, C++ *does* allow *class* template partial specialization. So
* TokenStreamChars is a template class with one specialization per Unit.
* Functions can be defined differently in the different specializations,
* because AnyCharsAccess as the only template parameter on member functions
* *can* vary.
*
* All TokenStreamChars<Unit, AnyCharsAccess> specializations, one per Unit,
* are just functionality, no actual member data.
*
* == TokenStreamSpecific<Unit, AnyCharsAccess> →
* TokenStreamChars<Unit, AnyCharsAccess>, TokenStreamShared,
* ErrorReporter ==
*
* TokenStreamSpecific is operations that are parametrized on character type
* but implement the *general* idea of tokenizing, without being intrinsically
* tied to character type. Notably, this includes all operations that can
* report warnings or errors at particular offsets, because we include a line
* of context with such errors -- and that necessarily accesses the raw
* characters of their specific type.
*
* Much TokenStreamSpecific operation depends on functionality in
* TokenStreamAnyChars. The obvious solution is to inherit it -- but this
* doesn't work in Parser: its ParserBase base class needs some
* TokenStreamAnyChars functionality without knowing character type.
*
* The AnyCharsAccess type parameter is a class that statically converts from a
* TokenStreamSpecific* to its corresponding TokenStreamAnyChars. The
* TokenStreamSpecific in Parser<ParseHandler, Unit> can then specify a class
* that properly converts from TokenStreamSpecific Parser::tokenStream to
* TokenStreamAnyChars ParserBase::anyChars.
*
* Could we hardcode one set of offset calculations for this and eliminate
* AnyCharsAccess? No. Offset calculations possibly could be hardcoded if
* TokenStreamSpecific were present in Parser before Parser::handler, assuring
* the same offsets in all Parser-related cases. But there's still a separate
* TokenStream class, that requires different offset calculations. So even if
* we wanted to hardcode this (it's not clear we would, because forcing the
* TokenStreamSpecific declarer to specify this is more explicit), we couldn't.
*/
#include "mozilla/ArrayUtils.h"
#include "mozilla/Assertions.h"
#include "mozilla/Attributes.h"
#include "mozilla/Casting.h"
#include "mozilla/DebugOnly.h"
#include "mozilla/Maybe.h"
#include "mozilla/MemoryChecking.h"
#include "mozilla/PodOperations.h"
#include "mozilla/Span.h"
#include "mozilla/TextUtils.h"
#include "mozilla/Unused.h"
#include "mozilla/Utf8.h"
#include <algorithm>
#include <stdarg.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <type_traits>
#include "jspubtd.h"
#include "frontend/CompilationInfo.h"
#include "frontend/ErrorReporter.h"
#include "frontend/Token.h"
#include "frontend/TokenKind.h"
#include "js/CompileOptions.h"
#include "js/HashTable.h" // js::HashMap
#include "js/RegExpFlags.h" // JS::RegExpFlags
#include "js/UniquePtr.h"
#include "js/Vector.h"
#include "util/Text.h"
#include "util/Unicode.h"
#include "vm/ErrorReporting.h"
#include "vm/JSAtom.h"
#include "vm/StringType.h"
struct JS_PUBLIC_API JSContext;
struct KeywordInfo;
namespace js {
class AutoKeepAtoms;
namespace frontend {
extern TokenKind ReservedWordTokenKind(PropertyName* str);
extern const char* ReservedWordToCharZ(PropertyName* str);
extern const char* ReservedWordToCharZ(TokenKind tt);
/**
 * Single-bit status flags describing the overall state of a token stream.
 */
struct TokenStreamFlags {
  /** The end of the file has been hit. */
  bool isEOF : 1;

  /** Non-whitespace has been seen since the start of the current line. */
  bool isDirtyLine : 1;

  /** An octal character escape or a 0-prefixed octal literal was seen. */
  bool sawDeprecatedOctal : 1;

  /** A syntax error was hit, either at the start of or during a token. */
  bool hadError : 1;

  /** Start with every flag cleared. */
  TokenStreamFlags()
      : isEOF{false},
        isDirtyLine{false},
        sawDeprecatedOctal{false},
        hadError{false} {}
};
template <typename Unit>
class TokenStreamPosition;
/**
 * Types and constants needed by both TokenStreamAnyChars and
 * TokenStreamSpecific.
 *
 * This class must stay empty (see the static_assert following it): never add
 * a non-static data member here!
 */
class TokenStreamShared {
 protected:
  // One current token plus two tokens of lookahead is three; the count is
  // rounded up to a power of two so that wrapping an index is a cheap mask
  // rather than a divmod by 3.
  static constexpr size_t ntokens = 4;
  static constexpr unsigned ntokensMask = ntokens - 1;

  template <typename Unit>
  friend class TokenStreamPosition;

 public:
  static constexpr unsigned maxLookahead = 2;

  // Re-export the Token modifier type and its values under this class's name.
  using Modifier = Token::Modifier;
  static constexpr Modifier SlashIsDiv = Token::SlashIsDiv;
  static constexpr Modifier SlashIsRegExp = Token::SlashIsRegExp;
  static constexpr Modifier SlashIsInvalid = Token::SlashIsInvalid;

  /**
   * Assert that |modifier| is compatible with the modifier recorded in
   * |nextToken|: either the two are equal, or |modifier| is SlashIsInvalid.
   */
  static void verifyConsistentModifier(Modifier modifier,
                                       const Token& nextToken) {
    MOZ_ASSERT(
        modifier == nextToken.modifier || modifier == SlashIsInvalid,
        "This token was scanned with both SlashIsRegExp and SlashIsDiv, "
        "indicating the parser is confused about how to handle a slash here. "
        "See comment at Token::Modifier.");
  }
};

static_assert(std::is_empty_v<TokenStreamShared>,
              "TokenStreamShared shouldn't bloat classes that inherit from it");
template <typename Unit, class AnyCharsAccess>
class TokenStreamSpecific;
/**
 * Stack-only record of token stream state for source text whose code units
 * have type |Unit|. It is built from a TokenStreamSpecific and, per the
 * friend declaration below, is meant to be read back by
 * TokenStreamSpecific::seek.
 */
template <typename Unit>
class MOZ_STACK_CLASS TokenStreamPosition final {
 public:
  // JS_HAZ_ROOTED (on the closing brace) is acceptable because:
  //
  //   1) Token is the only field here that can keep GC things alive;
  //   2) atoms are the only GC things a Token can keep alive; and
  //   3) the AutoKeepAtoms& taken by this constructor represents that
  //      collection of atoms is disabled while atoms in Tokens in this
  //      Position are alive.
  //
  // DON'T ADD NON-ATOM GC THING POINTERS HERE!  They would create a rooting
  // hazard that JS_HAZ_ROOTED will cause to be ignored.
  template <class AnyCharsAccess>
  inline TokenStreamPosition(
      AutoKeepAtoms& keepAtoms,
      TokenStreamSpecific<Unit, AnyCharsAccess>& tokenStream);

 private:
  // Positions are not copyable.
  TokenStreamPosition(const TokenStreamPosition&) = delete;

  // Strictly speaking only TokenStreamSpecific<Unit, AnyCharsAccess>::seek
  // (Unit fixed, AnyCharsAccess varying) needs friendship.  But 1) befriending
  // a single function of a template class is hard, and 2) C++ has no partial
  // friend specialization that would target just that one class.
  template <typename Char, class AnyCharsAccess>
  friend class TokenStreamSpecific;

  // The fields below are filled in by the constructor, which is defined
  // elsewhere.  Several share names with TokenStreamAnyChars fields and
  // presumably hold their saved values -- confirm against the constructor.
  const Unit* buf;
  TokenStreamFlags flags;
  unsigned lineno;
  size_t linebase;
  size_t prevLinebase;
  Token currentToken;
  unsigned lookahead;
  Token lookaheadTokens[TokenStreamShared::maxLookahead];
} JS_HAZ_ROOTED;
template <typename Unit>
class SourceUnits;
/**
 * This class maps:
 *
 *   * a sourceUnits offset (a 0-indexed count of code units)
 *
 * to
 *
 *   * a (1-indexed) line number and
 *   * a (0-indexed) offset in code *units* (not code points, not bytes) into
 *     that line,
 *
 * for either |Unit = Utf8Unit| or |Unit = char16_t|.
 *
 * Note that the latter quantity is *not* the same as a column number, which is
 * a count of code *points*.  Computing a column number requires the offset
 * within the line and the source units of that line (including what type |Unit|
 * is, to know how to decode them).  If you need a column number, functions in
 * |GeneralTokenStreamChars<Unit>| will consult this and source units to compute
 * it.
 */
class SourceCoords {
  // For a given buffer holding source code, |lineStartOffsets_| has one
  // element per line of source code, plus one sentinel element.  Each
  // non-sentinel element holds the buffer offset for the start of the
  // corresponding line of source code.  For this example script,
  // assuming an initialLineOffset of 0:
  //
  //   1  // xyz            [line starts at offset 0]
  //   2  var x;            [line starts at offset 7]
  //   3                    [line starts at offset 14]
  //   4  var y;            [line starts at offset 15]
  //
  // |lineStartOffsets_| is:
  //
  //   [0, 7, 14, 15, MAX_PTR]
  //
  // To convert a "line number" to an "index" into |lineStartOffsets_|,
  // subtract |initialLineNum_|.  E.g. line 3's index is
  // (3 - initialLineNum_), which is 2.  Therefore lineStartOffsets_[2]
  // holds the buffer offset for the start of line 3, which is 14.  (Note
  // that |initialLineNum_| is often 1, but not always.)
  //
  // The first element is always initialLineOffset, passed to the
  // constructor, and the last element is always the MAX_PTR sentinel.
  //
  // Offset-to-{line,offset-into-line} lookups are O(log n) in the worst
  // case (binary search), but in practice they're heavily clustered and
  // we do better than that by using the previous lookup's result
  // (lastIndex_) as a starting point.
  //
  // Checking if an offset lies within a particular line number
  // (isOnThisLine()) is O(1).
  //
  Vector<uint32_t, 128> lineStartOffsets_;

  /** The line number on which the source text begins. */
  uint32_t initialLineNum_;

  /**
   * The index corresponding to the last offset lookup -- used so that if
   * offset lookups proceed in increasing order, and the offset appears
   * in the next couple lines from the last offset, we can avoid a full
   * binary-search.
   *
   * This is mutable because it's modified on every search, but that fact
   * isn't visible outside this class.
   */
  mutable uint32_t lastIndex_;

  uint32_t indexFromOffset(uint32_t offset) const;

  /** Sentinel stored as the final element of |lineStartOffsets_|. */
  static constexpr uint32_t MAX_PTR = UINT32_MAX;

  /** Convert an index into |lineStartOffsets_| to a line number. */
  uint32_t lineNumberFromIndex(uint32_t index) const {
    return index + initialLineNum_;
  }

  /**
   * Convert a line number to an index into |lineStartOffsets_|.
   *
   * The subtraction is unsigned, so a |lineNum| smaller than
   * |initialLineNum_| wraps around to a very large index; callers that can't
   * rule that out must check for it themselves.
   */
  uint32_t indexFromLineNumber(uint32_t lineNum) const {
    return lineNum - initialLineNum_;
  }

 public:
  SourceCoords(JSContext* cx, uint32_t initialLineNumber,
               uint32_t initialOffset);

  MOZ_MUST_USE bool add(uint32_t lineNum, uint32_t lineStartOffset);
  MOZ_MUST_USE bool fill(const SourceCoords& other);

  /**
   * Determine whether |offset| lies within line |lineNum|.
   *
   * Returns false, leaving |*onThisLine| untouched, when no start offset has
   * been recorded both for |lineNum| and for the element after it (i.e.
   * |lineNum| precedes the first line, or lies at or beyond the sentinel).
   * Otherwise stores the answer in |*onThisLine| and returns true.
   */
  bool isOnThisLine(uint32_t offset, uint32_t lineNum, bool* onThisLine) const {
    // A line before the first one would wrap around in indexFromLineNumber();
    // reject it up front rather than rely on the range check below.
    if (lineNum < initialLineNum_) {
      return false;
    }

    const uint32_t index = indexFromLineNumber(lineNum);

    // Both |index| and |index + 1| (the next line start, or the sentinel) must
    // be valid elements.  Compare against |length - 1| rather than computing
    // |index + 1| first, so that an index of UINT32_MAX can't wrap to zero and
    // slip past the check.
    const size_t length = lineStartOffsets_.length();
    if (length < 2 || index >= length - 1) {
      return false;
    }

    *onThisLine = lineStartOffsets_[index] <= offset &&
                  offset < lineStartOffsets_[index + 1];
    return true;
  }

  /**
   * A token, computed for an offset in source text, that can be used to
   * access line number and line-offset information for that offset.
   *
   * LineToken *alone* exposes whether the corresponding offset is in the
   * first line of source (which may not be 1, depending on
   * |initialLineNumber|), and whether it's in the same line as
   * another LineToken.
   */
  class LineToken {
    uint32_t index;
#ifdef DEBUG
    uint32_t offset_;  // stored for consistency-of-use assertions
#endif

    friend class SourceCoords;

   public:
    LineToken(uint32_t index, uint32_t offset)
        : index(index)
#ifdef DEBUG
          ,
          offset_(offset)
#endif
    {
    }

    bool isFirstLine() const { return index == 0; }

    bool isSameLine(LineToken other) const { return index == other.index; }

    /** Assert that |offset| is the offset this token was created with. */
    void assertConsistentOffset(uint32_t offset) const {
      MOZ_ASSERT(offset_ == offset);
    }
  };

  /**
   * Compute a token usable to access information about the line at the
   * given offset.
   *
   * The only information directly accessible in a token is whether it
   * corresponds to the first line of source text (which may not be line
   * 1, depending on the |initialLineNumber| value used to construct
   * this).  Use |lineNumber(LineToken)| to compute the actual line
   * number (incorporating the contribution of |initialLineNumber|).
   */
  LineToken lineToken(uint32_t offset) const;

  /** Compute the line number for the given token. */
  uint32_t lineNumber(LineToken lineToken) const {
    return lineNumberFromIndex(lineToken.index);
  }

  /** Return the offset of the start of the line for |lineToken|. */
  uint32_t lineStart(LineToken lineToken) const {
    MOZ_ASSERT(lineToken.index + 1 < lineStartOffsets_.length(),
               "recorded line-start information must be available");
    return lineStartOffsets_[lineToken.index];
  }
};
/**
 * Classification stored per ChunkInfo, kept in a single byte.  The numeric
 * values matter: ChunkInfo stores them as raw |unsigned char|s and asserts
 * that they are 0 or 1.
 */
enum class UnitsType : unsigned char {
  /** The pessimistic default; ChunkInfo may later upgrade it. */
  PossiblyMultiUnit = 0,

  /** Set once single units have been guaranteed. */
  GuaranteedSingleUnit = 1,
};
/**
 * A column number paired with a UnitsType, laid out entirely in
 * |unsigned char|s so that instances pack.
 */
class ChunkInfo {
 private:
  // The column is kept as raw bytes, not as a |uint32_t|, so everything packs.
  unsigned char column_[sizeof(uint32_t)];
  unsigned char unitsType_;

 public:
  /** Record column |col| together with units type |type|. */
  ChunkInfo(uint32_t col, UnitsType type)
      : unitsType_(static_cast<unsigned char>(type)) {
    memcpy(column_, &col, sizeof(column_));
  }

  /** The stored column number, reassembled from its bytes. */
  uint32_t column() const {
    uint32_t result;
    memcpy(&result, column_, sizeof(result));
    return result;
  }

  /** The stored units type. */
  UnitsType unitsType() const {
    MOZ_ASSERT(unitsType_ <= 1, "unitsType_ must be 0 or 1");
    return static_cast<UnitsType>(unitsType_);
  }

  /**
   * Upgrade from PossiblyMultiUnit to GuaranteedSingleUnit.  Asserts that
   * the current type is still PossiblyMultiUnit.
   */
  void guaranteeSingleUnits() {
    MOZ_ASSERT(unitsType() == UnitsType::PossiblyMultiUnit,
               "should only be setting to possibly optimize from the "
               "pessimistic case");
    unitsType_ = static_cast<unsigned char>(UnitsType::GuaranteedSingleUnit);
  }
};
/** The kinds of invalid character escape that can be recorded. */
enum class InvalidEscapeType {
  /** No invalid character escapes. */
  None,

  /** A malformed \x escape. */
  Hexadecimal,

  /** A malformed \u escape. */
  Unicode,

  /**
   * An otherwise well-formed \u escape which represents a
   * codepoint > 10FFFF.
   */
  UnicodeOverflow,

  /** An octal escape in a template token. */
  Octal,
};
class TokenStreamAnyChars : public TokenStreamShared {
private:
// Constant-at-construction fields.
JSContext* const cx;
/** Options used for parsing/tokenizing. */
const JS::ReadOnlyCompileOptions& options_;
/**
* Pointer used internally to test whether in strict mode. Use |strictMode()|
* instead of this field.
*/
StrictModeGetter* const strictModeGetter_;
/** Input filename or null. */
const char* const filename_;
// Column number computation fields.
/**
* A map of (line number => sequence of the column numbers at
* |ColumnChunkLength|-unit boundaries rewound [if needed] to the nearest code
* point boundary). (|TokenStreamAnyChars::computePartialColumn| is the sole
* user of |ColumnChunkLength| and therefore contains its definition.)
*
* Entries appear in this map only when a column computation of sufficient
* distance is performed on a line -- and only when the column is beyond the
* first |ColumnChunkLength| units. Each line's vector is lazily filled as
* greater offsets require column computations.
*/
mutable HashMap<uint32_t, Vector<ChunkInfo>> longLineColumnInfo_;
// Computing accurate column numbers requires at *some* point linearly
// iterating through prior source units in the line, to properly account for
// multi-unit code points. This is quadratic if counting happens repeatedly.
//
// But usually we need columns for advancing offsets through scripts. By
// caching the last ((line number, offset) => relative column) mapping (in
// similar manner to how |SourceCoords::lastIndex_| is used to cache
// (offset => line number) mappings) we can usually avoid re-iterating through
// the common line prefix.
//
// Additionally, we avoid hash table lookup costs by caching the
// |Vector<ChunkInfo>*| for the line of the last lookup. (|nullptr| means we
// must look it up -- or it hasn't been created yet.) This pointer is nulled
// when a lookup on a new line occurs, but as it's not a pointer at literal,
// reallocatable element data, it's *not* invalidated when new entries are
// added to such a vector.
/**
* The line in which the last column computation occurred, or UINT32_MAX if
* no prior computation has yet happened.
*/
mutable uint32_t lineOfLastColumnComputation_ = UINT32_MAX;
/**
* The chunk vector of the line for that last column computation. This is
* null if the chunk vector needs to be recalculated or initially created.
*/
mutable Vector<ChunkInfo>* lastChunkVectorForLine_ = nullptr;
/**
* The offset (in code units) of the last column computation performed,
* relative to source start.
*/
mutable uint32_t lastOffsetOfComputedColumn_ = UINT32_MAX;
/**
* The column number for the offset (in code units) of the last column
* computation performed, relative to source start.
*/
mutable uint32_t lastComputedColumn_ = 0;
// Intra-token fields.
/**
* The offset of the first invalid escape in a template literal. (If there is
* one -- if not, the value of this field is meaningless.)
*
* See also |invalidTemplateEscapeType|.
*/
uint32_t invalidTemplateEscapeOffset = 0;
/**
* The type of the first invalid escape in a template literal. (If there
* isn't one, this will be |None|.)
*
* See also |invalidTemplateEscapeOffset|.
*/
InvalidEscapeType invalidTemplateEscapeType = InvalidEscapeType::None;
// Fields with values relevant across tokens (and therefore potentially across
// function boundaries, such that lazy function parsing and stream-seeking
// must take care in saving and restoring them).
/** Line number and offset-to-line mapping information. */
SourceCoords srcCoords;
/** Circular token buffer of gotten tokens that have been ungotten. */
Token tokens[ntokens] = {};
/** The index in |tokens| of the last parsed token. */
unsigned cursor_ = 0;
/** The number of tokens in |tokens| available to be gotten. */
unsigned lookahead = 0;
/** The current line number. */
unsigned lineno;
/** Various flag bits (see above). */
TokenStreamFlags flags = {};
/** The offset of the start of the current line. */
size_t linebase = 0;
/** The start of the previous line, or |size_t(-1)| on the first line. */
size_t prevLinebase = size_t(-1);
/** The user's requested source URL. Null if none has been set. */
UniqueTwoByteChars displayURL_ = nullptr;
/** The URL of the source map for this script. Null if none has been set. */
UniqueTwoByteChars sourceMapURL_ = nullptr;
// Assorted boolean fields, none of which require maintenance across tokens,
// stored at class end to minimize padding.
/**
* Whether syntax errors should or should not contain details about the
* precise nature of the error. (This is intended for use in suppressing
* content-revealing details about syntax errors in cross-origin scripts on
* the web.)
*/
const bool mutedErrors;
/**
* An array storing whether a TokenKind observed while attempting to extend
* a valid AssignmentExpression into an even longer AssignmentExpression
* (e.g., extending '3' to '3 + 5') will terminate it without error.
*
* For example, ';' always ends an AssignmentExpression because it ends a
* Statement or declaration. '}' always ends an AssignmentExpression
* because it terminates BlockStatement, FunctionBody, and embedded
* expressions in TemplateLiterals. Therefore both entries are set to true
* in TokenStreamAnyChars construction.
*
* But e.g. '+' *could* extend an AssignmentExpression, so its entry here
* is false. Meanwhile 'this' can't extend an AssignmentExpression, but
* it's only valid after a line break, so its entry here must be false.
*
* NOTE: This array could be static, but without C99's designated
* initializers it's easier zeroing here and setting the true entries
* in the constructor body. (Having this per-instance might also aid
* locality.) Don't worry! Initialization time for each TokenStream
* is trivial. See bug 639420.
*/
bool isExprEnding[size_t(TokenKind::Limit)] = {}; // all-false initially
// End of fields.
public:
TokenStreamAnyChars(JSContext* cx, const JS::ReadOnlyCompileOptions& options,
StrictModeGetter* smg);
template <typename Unit, class AnyCharsAccess>
friend class GeneralTokenStreamChars;
template <typename Unit, class AnyCharsAccess>
friend class TokenStreamChars;
template <typename Unit, class AnyCharsAccess>
friend class TokenStreamSpecific;
template <typename Unit>
friend class TokenStreamPosition;
// Accessors.
unsigned cursor() const { return cursor_; }
unsigned nextCursor() const { return (cursor_ + 1) & ntokensMask; }
unsigned aheadCursor(unsigned steps) const {
return (cursor_ + steps) & ntokensMask;
}
/** The token stored at the cursor. */
const Token& currentToken() const { return tokens[cursor_]; }

/** Whether the token at the cursor has kind |type|. */
bool isCurrentTokenType(TokenKind type) const {
  const Token& tok = currentToken();
  return tok.type == type;
}
MOZ_MUST_USE bool checkOptions();
private:
PropertyName* reservedWordToPropertyName(TokenKind tt) const;
public:
/**
 * The name denoted by the current token.
 *
 * Name and PrivateName tokens carry their name directly.  Any other token
 * must be a possible identifier name (asserted), and its name is looked up
 * from its kind via reservedWordToPropertyName().
 */
PropertyName* currentName() const {
  const Token& tok = currentToken();
  if (tok.type == TokenKind::Name || tok.type == TokenKind::PrivateName) {
    return tok.name();
  }

  MOZ_ASSERT(TokenKindIsPossibleIdentifierName(currentToken().type));
  return reservedWordToPropertyName(tok.type);
}
bool currentNameHasEscapes() const {
if (isCurrentTokenType(TokenKind::Name) ||
isCurrentTokenType(TokenKind::PrivateName)) {
TokenPos pos = currentToken().pos;
return (pos.end - pos.begin) != currentToken().name()->length();
}
MOZ_ASSERT(TokenKindIsPossibleIdentifierName(currentToken().type));
return false;
}
bool isCurrentTokenAssignment() const {
return TokenKindIsAssignment(currentToken().type);
}
// Flag methods.

/** Whether the end of the file has been hit. */
bool isEOF() const { return flags.isEOF; }

/** Whether an octal escape or 0-prefixed octal literal has been seen. */
bool sawDeprecatedOctal() const { return flags.sawDeprecatedOctal; }

/** Whether a syntax error has been hit. */
bool hadError() const { return flags.hadError; }

/** Forget any previously seen deprecated octal. */
void clearSawDeprecatedOctal() { flags.sawDeprecatedOctal = false; }

/** Whether an invalid template escape is currently recorded. */
bool hasInvalidTemplateEscape() const {
  return !(invalidTemplateEscapeType == InvalidEscapeType::None);
}

/**
 * Drop any recorded invalid template escape.  Only the type is reset; the
 * recorded offset is left as it was.
 */
void clearInvalidTemplateEscape() {
  invalidTemplateEscapeType = InvalidEscapeType::None;
}
private:
// Private on purpose: only the tokenizer, while tokenizing, should ask this
// -- not, for example, BytecodeEmitter.
//
// Without a strict-mode getter the answer is always false.
bool strictMode() const {
  if (!strictModeGetter_) {
    return false;
  }
  return strictModeGetter_->strictMode();
}
// Records an invalid template escape of kind |type| at |offset|.
//
// |type| must not be InvalidEscapeType::None (asserted). Only the first
// escape recorded since the last clear is kept: if one is already recorded,
// this call changes nothing.
void setInvalidTemplateEscape(uint32_t offset, InvalidEscapeType type) {
  MOZ_ASSERT(type != InvalidEscapeType::None);
  if (invalidTemplateEscapeType == InvalidEscapeType::None) {
    invalidTemplateEscapeOffset = offset;
    invalidTemplateEscapeType = type;
  }
}
public:
// Call this immediately after parsing an OrExpression to allow scanning the
// next token with SlashIsRegExp without asserting (even though we just
// peeked at it in SlashIsDiv mode).
//
// It's OK to disable the assertion because the places where this is called
// have peeked at the next token in SlashIsDiv mode, and checked that it is
// *not* a Div token.
//
// To see why it is necessary to disable the assertion, consider these two
// programs:
//
// x = arg => q // per spec, this is all one statement, and the
// /a/g; // slashes are division operators
//
// x = arg => {} // per spec, ASI at the end of this line
// /a/g; // and that's a regexp literal
//
// The first program shows why orExpr() has to use SlashIsDiv mode when
// peeking ahead for the next operator after parsing `q`. The second program
// shows why matchOrInsertSemicolon() must use SlashIsRegExp mode when
// scanning ahead for a semicolon.
void allowGettingNextTokenWithSlashIsRegExp() {
#ifdef DEBUG
  // Precondition: the caller has already peeked at the next token in
  // SlashIsDiv mode, and that token is *not* a Div token.
  MOZ_ASSERT(hasLookahead());
  const Token& peeked = nextToken();
  MOZ_ASSERT(peeked.modifier == SlashIsDiv);
  MOZ_ASSERT(peeked.type != TokenKind::Div);

  // Re-tag the buffered lookahead token as SlashIsRegExp. This is debug-only
  // bookkeeping; outside DEBUG builds this function does nothing.
  auto& lookaheadSlot = tokens[nextCursor()];
  lookaheadSlot.modifier = SlashIsRegExp;
#endif
}
#ifdef DEBUG
// Debug-only check: true when no lookahead tokens are buffered.
inline bool debugHasNoLookahead() const {
  return lookahead == 0;
}
#endif
// True when |displayURL_| is non-null.
bool hasDisplayURL() const {
  return displayURL_ != nullptr;
}
// Raw pointer held by |displayURL_|, as returned by its get().
char16_t* displayURL() {
  return displayURL_.get();
}
// True when |sourceMapURL_| is non-null.
bool hasSourceMapURL() const {
  return sourceMapURL_ != nullptr;
}
// Raw pointer held by |sourceMapURL_|, as returned by its get().
char16_t* sourceMapURL() {
  return sourceMapURL_.get();
}
// The JSContext stored in |cx|.
JSContext* context() const {
  return cx;
}
using LineToken = SourceCoords::LineToken;
// Asks |srcCoords| for the LineToken that corresponds to |offset|.
LineToken lineToken(uint32_t offset) const {
  LineToken found = srcCoords.lineToken(offset);
  return found;
}
// Asks |srcCoords| for the line number that corresponds to |lineToken|.
uint32_t lineNumber(LineToken lineToken) const {
  const uint32_t number = srcCoords.lineNumber(lineToken);
  return number;
}
// Asks |srcCoords| for the line start that corresponds to |lineToken|.
uint32_t lineStart(LineToken lineToken) const {
  const uint32_t start = srcCoords.lineStart(lineToken);
  return start;
}
/**
 * Fill in |err|.
 *
 * If the token stream doesn't have location info for this error, use the
 * caller's location (including line/column number) and return false. (No
 * line of context is set.)
 *
 * Otherwise fill in everything in |err| except 1) line/column numbers and
 * 2) line-of-context-related fields and return true. The caller *must*
 * fill in the line/column number; filling the line of context is optional.
 *
 * NOTE(review): |offset| is presumably the source offset the error is
 * attributed to -- confirm against the definition.
 */
bool fillExceptingContext(ErrorMetadata* err, uint32_t offset);
// Clears the |isDirtyLine| flag; per its name, intended for use when an end
// of line is reached.
MOZ_ALWAYS_INLINE void updateFlagsForEOL() {
  flags.isDirtyLine = false;
}
private:
/**
 * Compute the "partial" column number in Unicode code points of the absolute
 * |offset| within source text on the line of |lineToken| (which must have
 * been computed from |offset|).
 *
 * A partial column number on a line that isn't the first line is just the
 * actual column number. But a partial column number on the first line is the
 * column number *ignoring the initial line/column of the script*. For
 * example, consider this HTML with line/column number keys:
 *
 *                 1         2            3
 *       0123456789012345678901234   567890
 *     ------------------------------------
 *   1 | <html>
 *   2 | <head>
 *   3 |   <script>var x = 3;  x &lt; 4;
 *   4 | const y = 7;</script>
 *   5 | </head>
 *   6 | <body></body>
 *   7 | </html>
 *
 * The script would be compiled specifying initial (line, column) of (3, 10)
 * using |JS::ReadOnlyCompileOptions::{lineno,column}|. And the column
 * reported by |computeColumn| for the "v" of |var| would be 10. But the
 * partial column number of the "v" in |var|, that this function returns,
 * would be 0. On the other hand, the column reported by |computeColumn| and
 * the partial column number returned by this function for the "c" in |const|
 * would both be 0, because it's not in the first line of source text.
 *
 * The partial column is with respect *only* to the JavaScript source text as
 * SpiderMonkey sees it. In the example, the "&lt;" is converted to "<" by
 * the browser before SpiderMonkey would see it. So the partial column of the
 * "4" in the inequality would be 16, not 19.
 *
 * Code points are not all equal length, so counting requires *some* kind of
 * linear-time counting from the start of the line. This function attempts
 * various tricks to reduce this cost. If these optimizations succeed,
 * repeated calls to this function on a line will pay a one-time cost linear
 * in the length of the line, then each call pays a separate constant-time
 * cost. If the optimizations do not succeed, this function works in time
 * linear in the length of the line.
 *
 * It's unusual for a function in *this* class to be |Unit|-templated, but
 * while this operation manages |Unit|-agnostic fields in this class and in
 * |srcCoords|, it must *perform* |Unit|-sensitive computations to fill them.
 * And this is the best place to do that.
 */
template <typename Unit>
uint32_t computePartialColumn(const LineToken lineToken,
const uint32_t offset,
const SourceUnits<Unit>& sourceUnits) const;
/**
 * Update line/column information for the start of a new line at
 * |lineStartOffset|.
 *
 * MOZ_MUST_USE: callers must not ignore the returned bool.
 * NOTE(review): presumably false signals failure -- confirm against the
 * definition.
 */
MOZ_MUST_USE MOZ_ALWAYS_INLINE bool internalUpdateLineInfoForEOL(
uint32_t lineStartOffset);
public:
// The buffered token one slot after the cursor. Requires that lookahead is
// available (asserted).
const Token& nextToken() const {
  MOZ_ASSERT(hasLookahead());
  const auto index = nextCursor();
  return tokens[index];
}
// True when at least one lookahead token is buffered.
bool hasLookahead() const {
  return lookahead > 0;
}
// Moves the cursor forward one slot, masked by |ntokensMask|.
void advanceCursor() {
  const auto bumped = cursor_ + 1;
  cursor_ = bumped & ntokensMask;
}
// Moves the cursor back one slot, masked by |ntokensMask|.
void retractCursor() {
  const auto lowered = cursor_ - 1;
  cursor_ = lowered & ntokensMask;
}
// Advances the cursor and hands back the token slot now under it. The slot's
// memory is marked undefined via MOZ_MAKE_MEM_UNDEFINED before it is
// returned, so the caller is expected to fill it in.
Token* allocateToken() {
  advanceCursor();
  Token* const slot = &tokens[cursor()];
  MOZ_MAKE_MEM_UNDEFINED(slot, sizeof(*slot));
  return slot;
}
// Push the last scanned token back into the stream: one more token becomes
// lookahead and the cursor steps back one slot. The lookahead count must be
// below |maxLookahead| on entry (asserted).
void ungetToken() {
  MOZ_ASSERT(lookahead < maxLookahead);
  ++lookahead;
  retractCursor();
}
public:
// Takes over directive-derived URLs from |other|.
//
// If |other| has fresh information from directives, overwrite any previously
// recorded directives. (There is no specification directing that
// last-in-source-order directive controls, sadly. We behave this way in the
// ordinary case, so we ought do so here too.)
//
// Each URL is moved out of |other| only when |other| actually holds one;
// otherwise this stream's existing value is kept.
void adoptState(TokenStreamAnyChars& other) {
  if (other.displayURL_) {
    displayURL_ = std::move(other.displayURL_);
  }

  if (other.sourceMapURL_) {
    sourceMapURL_ = std::move(other.sourceMapURL_);
  }
}
// Compute error metadata for an error at no offset. Defined out of line; the
// result is delivered through |err|.
void computeErrorMetadataNoOffset(ErrorMetadata* err);
// ErrorReporter API helpers.
//
// Provide a minimal set of error-reporting APIs, given that we cannot use
// ErrorReportMixin here. The "report" prefix is added to avoid conflicts with
// ErrorReportMixin methods in the TokenStream class.
//
// Both are defined out of line. |reportErrorNoOffset| takes its message
// arguments variadically; |reportErrorNoOffsetVA| takes them as a |va_list*|.
void reportErrorNoOffset(unsigned errorNumber, ...);
void reportErrorNoOffsetVA(unsigned errorNumber, va_list* args);
// The compile options stored in |options_|.
const JS::ReadOnlyCompileOptions& options() const {
  return options_;
}
// The filename stored in |filename_|.
const char* getFilename() const {
  return filename_;
}
};
// Numeric value of a UTF-16 code unit, which is simply the unit itself.
constexpr char16_t CodeUnitValue(char16_t unit) {
  return unit;
}
// Numeric value of a UTF-8 code unit, obtained through Utf8Unit::toUint8().
constexpr uint8_t CodeUnitValue(mozilla::Utf8Unit unit) {
  const uint8_t value = unit.toUint8();
  return value;
}
template <typename Unit>
class TokenStreamCharsBase;
// Deleted catch-all overload. An argument whose type has no dedicated
// non-template IsLineTerminator overload deduces |T| exactly and selects this
// template, so the call fails to compile instead of being implicitly
// converted to the parameter type of another overload.
template <typename T>
inline bool IsLineTerminator(T) = delete;
inline bool IsLineTerminator(char32_t codePoint) {
return codePoint == '\n' || codePoint == '\r' ||
codePoint == unicode::LINE_SEPARATOR ||
codePoint == unicode::PARA_SEPARATOR;