js/src/frontend/TokenStream.h

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * vim: set ts=8 sts=2 et sw=2 tw=80:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
 * Streaming access to the raw tokens of JavaScript source.
 *
 * Because JS tokenization is context-sensitive -- a '/' could be either a
 * regular expression *or* a division operator depending on context -- the
 * various token stream classes are mostly not useful outside of the Parser
 * where they reside.  We should probably eventually merge the two concepts.
 */
#ifndef frontend_TokenStream_h
#define frontend_TokenStream_h

/*
 * [SMDOC] Parser Token Stream
 *
 * A token stream exposes the raw tokens -- operators, names, numbers,
 * keywords, and so on -- of JavaScript source code.
 *
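 * These are the components of the overall token stream concept:
 * TokenStreamShared, TokenStreamAnyChars, TokenStreamCharsShared,
 * TokenStreamCharsBase<Unit>, SpecializedTokenStreamCharsBase<Unit>,
 * GeneralTokenStreamChars<Unit, AnyCharsAccess>,
 * TokenStreamChars<Unit, AnyCharsAccess>, and
 * TokenStreamSpecific<Unit, AnyCharsAccess>.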
 *
 * == TokenStreamShared → ∅ ==
 *
 * Certain aspects of tokenizing are used everywhere:
 *
 *   * modifiers (used to select which context-sensitive interpretation of a
 *     character should be used to decide what token it is) and modifier
 *     assertion handling;
 *   * flags on the overall stream (have we encountered any characters on this
 *     line?  have we hit a syntax error?  and so on);
 *   * and certain token-count constants.
 *
 * These are all defined in TokenStreamShared.  (They could be namespace-
 * scoped, but it seems tentatively better not to clutter the namespace.)
 *
 * == TokenStreamAnyChars → TokenStreamShared ==
 *
 * Certain aspects of tokenizing have meaning independent of the character type
 * of the source text being tokenized: line/column number information, tokens
 * in lookahead from determining the meaning of a prior token, compilation
 * options, the filename, flags, source map URL, access to details of the
 * current and next tokens (is the token of the given type?  what name or
 * number is contained in the token?  and other queries), and others.
 *
 * All this data/functionality *could* be duplicated for both single-byte and
 * double-byte tokenizing, but there are two problems.  First, it's potentially
 * wasteful if the compiler doesn't recognize it can unify the concepts.  (And
 * if any-character concepts are intermixed with character-specific concepts,
 * potentially the compiler *can't* unify them because offsets into the
 * hypothetical TokenStream<Unit>s would differ.)  Second, some of this stuff
 * needs to be accessible in ParserBase, the aspects of JS language parsing
 * that have meaning independent of the character type of the source text being
 * parsed.  So we need a separate data structure that ParserBase can hold on to
 * for it.  (ParserBase isn't the only instance of this, but it's certainly the
 * biggest case of it.)  Ergo, TokenStreamAnyChars.
 *
 * == TokenStreamCharsShared → ∅ ==
 *
 * Some functionality has meaning independent of character type, yet has no use
 * *unless* you know the character type in actual use.  It *could* live in
 * TokenStreamAnyChars, but it makes more sense to live in a separate class
 * that character-aware token information can simply inherit.
 *
 * This class currently exists only to contain a char16_t buffer, transiently
 * used to accumulate strings in tricky cases that can't just be read directly
 * from source text.  It's not used outside character-aware tokenizing, so it
 * doesn't make sense in TokenStreamAnyChars.
 *
 * == TokenStreamCharsBase<Unit> → TokenStreamCharsShared ==
 *
 * Certain data structures in tokenizing are character-type-specific: namely,
 * the various pointers identifying the source text (including current offset
 * and end).
 *
 * Additionally, some functions operating on this data are defined the same way
 * no matter what character type you have (e.g. current offset in code units
 * into the source text) or share a common interface regardless of character
 * type (e.g. consume the next code unit if it has a given value).
 *
 * All such functionality lives in TokenStreamCharsBase<Unit>.
 *
 * == SpecializedTokenStreamCharsBase<Unit> → TokenStreamCharsBase<Unit> ==
 *
 * Certain tokenizing functionality is specific to a single character type.
 * For example, JS's UTF-16 encoding recognizes no coding errors, because lone
 * surrogates are not an error; but a UTF-8 encoding must recognize a variety
 * of validation errors.  Such functionality is defined only in the appropriate
 * SpecializedTokenStreamCharsBase specialization.
 *
 * == GeneralTokenStreamChars<Unit, AnyCharsAccess> →
 *    SpecializedTokenStreamCharsBase<Unit> ==
 *
 * Some functionality operates differently on different character types, just
 * as for TokenStreamCharsBase, but additionally requires access to character-
 * type-agnostic information in TokenStreamAnyChars.  For example, getting the
 * next character performs different steps for different character types and
 * must access TokenStreamAnyChars to update line break information.
 *
 * Such functionality, if it can be defined using the same algorithm for all
 * character types, lives in GeneralTokenStreamChars<Unit, AnyCharsAccess>.
 * The AnyCharsAccess parameter provides a way for a GeneralTokenStreamChars
 * instance to access its corresponding TokenStreamAnyChars, without inheriting
 * from it.
 *
 * GeneralTokenStreamChars<Unit, AnyCharsAccess> is just functionality, no
 * actual member data.
 *
 * Such functionality all lives in TokenStreamChars<Unit, AnyCharsAccess>, a
 * declared-but-not-defined template class whose specializations have a common
 * public interface (plus whatever private helper functions are desirable).
 *
 * == TokenStreamChars<Unit, AnyCharsAccess> →
 *    GeneralTokenStreamChars<Unit, AnyCharsAccess> ==
 *
 * Some functionality is like that in GeneralTokenStreamChars, *but* it's
 * defined entirely differently for different character types.
 *
 * For example, consider "match a multi-code unit code point" (hypothetically:
 * we've only implemented two-byte tokenizing right now):
 *
 *   * For two-byte text, there must be two code units to get, the leading code
 *     unit must be a UTF-16 lead surrogate, and the trailing code unit must be
 *     a UTF-16 trailing surrogate.  (If any of these fail to hold, a next code
 *     unit encodes that code point and is not multi-code unit.)
 *   * For single-byte Latin-1 text, there are no multi-code unit code points.
 *   * For single-byte UTF-8 text, the first code unit must have N > 1 of its
 *     highest bits set (and the next unset), and |N - 1| successive code units
 *     must have their high bit set and next-highest bit unset, *and*
 *     concatenating all unconstrained bits together must not produce a code
 *     point value that could have been encoded in fewer code units.
 *
 * This functionality can't be implemented as member functions in
 * GeneralTokenStreamChars because we'd need to *partially specialize* those
 * functions -- hold Unit constant while letting AnyCharsAccess vary.  But
 * C++ forbids function template partial specialization like this: either you
 * fix *all* parameters or you fix none of them.
 *
 * Fortunately, C++ *does* allow *class* template partial specialization.  So
 * TokenStreamChars is a template class with one specialization per Unit.
 * Functions can be defined differently in the different specializations,
 * because AnyCharsAccess as the only template parameter on member functions
 * *can* vary.
 *
 * All TokenStreamChars<Unit, AnyCharsAccess> specializations, one per Unit,
 * are just functionality, no actual member data.
 *
 * == TokenStreamSpecific<Unit, AnyCharsAccess> →
 *    TokenStreamChars<Unit, AnyCharsAccess>, TokenStreamShared,
 *    ErrorReporter ==
 *
 * TokenStreamSpecific is operations that are parametrized on character type
 * but implement the *general* idea of tokenizing, without being intrinsically
 * tied to character type.  Notably, this includes all operations that can
 * report warnings or errors at particular offsets, because we include a line
 * of context with such errors -- and that necessarily accesses the raw
 * characters of their specific type.
 *
 * Much TokenStreamSpecific operation depends on functionality in
 * TokenStreamAnyChars.  The obvious solution is to inherit it -- but this
 * doesn't work in Parser: its ParserBase base class needs some
 * TokenStreamAnyChars functionality without knowing character type.
 *
 * The AnyCharsAccess type parameter is a class that statically converts from a
 * TokenStreamSpecific* to its corresponding TokenStreamAnyChars.  The
 * TokenStreamSpecific in Parser<ParseHandler, Unit> can then specify a class
 * that properly converts from TokenStreamSpecific Parser::tokenStream to
 * TokenStreamAnyChars ParserBase::anyChars.
 *
 * Could we hardcode one set of offset calculations for this and eliminate
 * AnyCharsAccess?  No.  Offset calculations possibly could be hardcoded if
 * TokenStreamSpecific were present in Parser before Parser::handler, assuring
 * the same offsets in all Parser-related cases.  But there's still a separate
 * TokenStream class, that requires different offset calculations.  So even if
 * we wanted to hardcode this (it's not clear we would, because forcing the
 * TokenStreamSpecific declarer to specify this is more explicit), we couldn't.
 */

#include "mozilla/ArrayUtils.h"
#include "mozilla/Assertions.h"
#include "mozilla/Attributes.h"
#include "mozilla/Casting.h"
#include "mozilla/DebugOnly.h"
#include "mozilla/Maybe.h"
#include "mozilla/MemoryChecking.h"
#include "mozilla/PodOperations.h"
#include "mozilla/Span.h"
#include "mozilla/TextUtils.h"
#include "mozilla/Unused.h"
#include "mozilla/Utf8.h"

#include <algorithm>
#include <stdarg.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <type_traits>

#include "jspubtd.h"

#include "frontend/CompilationInfo.h"
#include "frontend/ErrorReporter.h"
#include "frontend/Token.h"
#include "frontend/TokenKind.h"
#include "js/CompileOptions.h"
#include "js/HashTable.h"    // js::HashMap
#include "js/RegExpFlags.h"  // JS::RegExpFlags
#include "js/UniquePtr.h"
#include "js/Vector.h"
#include "util/Text.h"
#include "util/Unicode.h"
#include "vm/ErrorReporting.h"
#include "vm/JSAtom.h"
#include "vm/StringType.h"

struct JS_PUBLIC_API JSContext;
struct KeywordInfo;

namespace js {

class AutoKeepAtoms;

namespace frontend {

extern TokenKind ReservedWordTokenKind(PropertyName* str);

extern const char* ReservedWordToCharZ(PropertyName* str);

extern const char* ReservedWordToCharZ(TokenKind tt);

/**
 * Boolean state tracked for a token stream as a whole, packed into one-bit
 * bit-fields.  Every flag is clear in a freshly constructed instance.
 */
struct TokenStreamFlags {
  /** Set once end of file has been hit. */
  bool isEOF : 1;

  /** Set when non-whitespace has been seen since the start of the line. */
  bool isDirtyLine : 1;

  /** Set after an octal character escape or a 0-prefixed octal literal. */
  bool sawDeprecatedOctal : 1;

  /** Set after a syntax error, whether at the start of or during a token. */
  bool hadError : 1;

  TokenStreamFlags()
      : isEOF{false},
        isDirtyLine{false},
        sawDeprecatedOctal{false},
        hadError{false} {}
};

template <typename Unit>
class TokenStreamPosition;

/**
 * Types and constants common to TokenStreamAnyChars and TokenStreamSpecific.
 *
 * This class must stay empty: never give it a non-static data member.  The
 * static_assert after the class definition enforces that.
 */
class TokenStreamShared {
 protected:
  // One current token plus two of lookahead would need three slots; four are
  // used instead (the next power of 2) so that buffer indices can wrap with a
  // mask rather than a divmod by 3.
  static constexpr size_t ntokens = 4;

  static constexpr unsigned ntokensMask = ntokens - 1;

  template <typename Unit>
  friend class TokenStreamPosition;

 public:
  static constexpr unsigned maxLookahead = 2;

  using Modifier = Token::Modifier;
  static constexpr Modifier SlashIsDiv = Token::SlashIsDiv;
  static constexpr Modifier SlashIsRegExp = Token::SlashIsRegExp;
  static constexpr Modifier SlashIsInvalid = Token::SlashIsInvalid;

  /**
   * Debug-assert that |nextToken| may be consumed under |modifier|: either
   * |modifier| is SlashIsInvalid, or the token was scanned with that very
   * modifier.
   */
  static void verifyConsistentModifier(Modifier modifier,
                                       const Token& nextToken) {
    MOZ_ASSERT(
        modifier == SlashIsInvalid || modifier == nextToken.modifier,
        "This token was scanned with both SlashIsRegExp and SlashIsDiv, "
        "indicating the parser is confused about how to handle a slash here. "
        "See comment at Token::Modifier.");
  }
};

static_assert(std::is_empty_v<TokenStreamShared>,
              "TokenStreamShared shouldn't bloat classes that inherit from it");

template <typename Unit, class AnyCharsAccess>
class TokenStreamSpecific;

/**
 * A stack-only, non-copyable record of token stream state: a pointer into the
 * source units, the stream flags, line number and line-base offsets, the
 * current token, and the lookahead tokens.
 *
 * The constructor is only declared here; presumably it captures these values
 * from |tokenStream| -- see its out-of-line definition to confirm.  All fields
 * are private and exposed to TokenStreamSpecific through the friend
 * declaration below.
 */
template <typename Unit>
class MOZ_STACK_CLASS TokenStreamPosition final {
 public:
  // The JS_HAZ_ROOTED is permissible below because: 1) the only field in
  // TokenStreamPosition that can keep GC things alive is Token, 2) the only
  // GC things Token can keep alive are atoms, and 3) the AutoKeepAtoms&
  // passed to the constructor here represents that collection of atoms
  // is disabled while atoms in Tokens in this Position are alive.  DON'T
  // ADD NON-ATOM GC THING POINTERS HERE!  They would create a rooting
  // hazard that JS_HAZ_ROOTED will cause to be ignored.
  template <class AnyCharsAccess>
  inline TokenStreamPosition(
      AutoKeepAtoms& keepAtoms,
      TokenStreamSpecific<Unit, AnyCharsAccess>& tokenStream);

 private:
  // Positions are not copyable.
  TokenStreamPosition(const TokenStreamPosition&) = delete;

  // Technically only TokenStreamSpecific<Unit, AnyCharsAccess>::seek with
  // Unit constant and AnyCharsAccess varying must be friended, but 1) it's
  // hard to friend one function in template classes, and 2) C++ doesn't
  // allow partial friend specialization to target just that single class.
  //
  // (The parameter is spelled |Char| here because a template declaration
  // nested in this class may not reuse the enclosing parameter name |Unit|.)
  template <typename Char, class AnyCharsAccess>
  friend class TokenStreamSpecific;

  // Pointer into the source units.
  const Unit* buf;
  // Stream flags (see TokenStreamFlags).
  TokenStreamFlags flags;
  // Line number, and offsets of the start of that line and the prior line.
  unsigned lineno;
  size_t linebase;
  size_t prevLinebase;
  // The current token.
  Token currentToken;
  // Count of lookahead tokens, and storage for up to |maxLookahead| of them.
  unsigned lookahead;
  Token lookaheadTokens[TokenStreamShared::maxLookahead];
} JS_HAZ_ROOTED;

template <typename Unit>
class SourceUnits;

/**
 * This class maps:
 *
 *   * a sourceUnits offset (a 0-indexed count of code units)
 *
 * to
 *
 *   * a (1-indexed) line number and
 *   * a (0-indexed) offset in code *units* (not code points, not bytes) into
 *     that line,
 *
 * for either |Unit = Utf8Unit| or |Unit = char16_t|.
 *
 * Note that the latter quantity is *not* the same as a column number, which is
 * a count of code *points*.  Computing a column number requires the offset
 * within the line and the source units of that line (including what type |Unit|
 * is, to know how to decode them).  If you need a column number, functions in
 * |GeneralTokenStreamChars<Unit>| will consult this and source units to compute
 * it.
 */
class SourceCoords {
  // For a given buffer holding source code, |lineStartOffsets_| has one
  // element per line of source code, plus one sentinel element.  Each
  // non-sentinel element holds the buffer offset for the start of the
  // corresponding line of source code.  For this example script,
  // assuming an initialLineOffset of 0:
  //
  // 1  // xyz            [line starts at offset 0]
  // 2  var x;            [line starts at offset 7]
  // 3                    [line starts at offset 14]
  // 4  var y;            [line starts at offset 15]
  //
  // |lineStartOffsets_| is:
  //
  //   [0, 7, 14, 15, MAX_PTR]
  //
  // To convert a "line number" to an "index" into |lineStartOffsets_|,
  // subtract |initialLineNum_|.  E.g. line 3's index is
  // (3 - initialLineNum_), which is 2.  Therefore lineStartOffsets_[2]
  // holds the buffer offset for the start of line 3, which is 14.  (Note
  // that |initialLineNum_| is often 1, but not always.)
  //
  // The first element is always initialLineOffset, passed to the
  // constructor, and the last element is always the MAX_PTR sentinel.
  //
  // Offset-to-{line,offset-into-line} lookups are O(log n) in the worst
  // case (binary search), but in practice they're heavily clustered and
  // we do better than that by using the previous lookup's result
  // (lastIndex_) as a starting point.
  //
  // Checking if an offset lies within a particular line number
  // (isOnThisLine()) is O(1).
  //
  Vector<uint32_t, 128> lineStartOffsets_;

  /** The line number on which the source text begins. */
  uint32_t initialLineNum_;

  /**
   * The index corresponding to the last offset lookup -- used so that if
   * offset lookups proceed in increasing order, and the offset appears
   * in the next couple lines from the last offset, we can avoid a full
   * binary-search.
   *
   * This is mutable because it's modified on every search, but that fact
   * isn't visible outside this class.
   */
  mutable uint32_t lastIndex_;

  /** Find the index in |lineStartOffsets_| of the line containing |offset|. */
  uint32_t indexFromOffset(uint32_t offset) const;

  /** Sentinel stored as the final element of |lineStartOffsets_|. */
  static const uint32_t MAX_PTR = UINT32_MAX;

  /** Convert an index into |lineStartOffsets_| to a line number. */
  uint32_t lineNumberFromIndex(uint32_t index) const {
    return index + initialLineNum_;
  }

  /**
   * Convert a line number to an index into |lineStartOffsets_|.  The
   * subtraction is unsigned, so a |lineNum| below |initialLineNum_| wraps
   * around to a very large index.
   */
  uint32_t indexFromLineNumber(uint32_t lineNum) const {
    return lineNum - initialLineNum_;
  }

 public:
  SourceCoords(JSContext* cx, uint32_t initialLineNumber,
               uint32_t initialOffset);

  // NOTE(review): |add| and |fill| are defined out of line; going by their
  // signatures they record a line start and copy from |other| respectively,
  // reporting failure through the must-use result.  Confirm at the
  // definitions.
  MOZ_MUST_USE bool add(uint32_t lineNum, uint32_t lineStartOffset);
  MOZ_MUST_USE bool fill(const SourceCoords& other);

  /**
   * Determine whether |offset| lies on line |lineNum|.
   *
   * Returns false, leaving |*onThisLine| untouched, if no line-start
   * information is recorded for |lineNum|.  Otherwise stores the answer in
   * |*onThisLine| and returns true.
   */
  bool isOnThisLine(uint32_t offset, uint32_t lineNum, bool* onThisLine) const {
    uint32_t index = indexFromLineNumber(lineNum);

    // Widen before adding one.  If |lineNum| is |initialLineNum_ - 1|, |index|
    // has wrapped to UINT32_MAX, and a 32-bit |index + 1| would wrap again to
    // 0 and slip past this bounds check.
    if (uint64_t(index) + 1 >= lineStartOffsets_.length()) {  // +1: sentinel
      return false;
    }
    *onThisLine = lineStartOffsets_[index] <= offset &&
                  offset < lineStartOffsets_[index + 1];
    return true;
  }

  /**
   * A token, computed for an offset in source text, that can be used to
   * access line number and line-offset information for that offset.
   *
   * LineToken *alone* exposes whether the corresponding offset is in the
   * first line of source (which may not be 1, depending on
   * |initialLineNumber|), and whether it's in the same line as
   * another LineToken.
   */
  class LineToken {
    uint32_t index;
#ifdef DEBUG
    uint32_t offset_;  // stored for consistency-of-use assertions
#endif

    friend class SourceCoords;

   public:
    LineToken(uint32_t index, uint32_t offset)
        : index(index)
#ifdef DEBUG
          ,
          offset_(offset)
#endif
    {
    }

    bool isFirstLine() const { return index == 0; }

    bool isSameLine(LineToken other) const { return index == other.index; }

    /** Debug-assert that this token was computed for |offset|. */
    void assertConsistentOffset(uint32_t offset) const {
      MOZ_ASSERT(offset_ == offset);
    }
  };

  /**
   * Compute a token usable to access information about the line at the
   * given offset.
   *
   * The only information directly accessible in a token is whether it
   * corresponds to the first line of source text (which may not be line
   * 1, depending on the |initialLineNumber| value used to construct
   * this).  Use |lineNumber(LineToken)| to compute the actual line
   * number (incorporating the contribution of |initialLineNumber|).
   */
  LineToken lineToken(uint32_t offset) const;

  /** Compute the line number for the given token. */
  uint32_t lineNumber(LineToken lineToken) const {
    return lineNumberFromIndex(lineToken.index);
  }

  /** Return the offset of the start of the line for |lineToken|. */
  uint32_t lineStart(LineToken lineToken) const {
    // Widened for the same reason as in |isOnThisLine|: keep |index + 1| from
    // wrapping to 0 and vacuously satisfying the check.
    MOZ_ASSERT(uint64_t(lineToken.index) + 1 < lineStartOffsets_.length(),
               "recorded line-start information must be available");
    return lineStartOffsets_[lineToken.index];
  }
};

/**
 * Classification stored in each ChunkInfo.
 *
 * NOTE(review): judging by the enumerator names, this presumably records
 * whether code points in a chunk might span multiple code units or are all
 * known to be single units -- confirm against the column computation code
 * that sets it.
 *
 * The explicit 0/1 values matter: |ChunkInfo| stores this as a raw
 * |unsigned char| and asserts the stored value is at most 1.
 */
enum class UnitsType : unsigned char {
  PossiblyMultiUnit = 0,
  GuaranteedSingleUnit = 1,
};

/**
 * A column number paired with a UnitsType.  Both are held in |unsigned char|
 * storage so that the members pack without alignment padding.
 */
class ChunkInfo {
 private:
  // The bytes of a uint32_t column.  They are kept as raw bytes, and copied
  // in and out with memcpy, so everything packs.
  unsigned char column_[sizeof(uint32_t)];
  unsigned char unitsType_;

 public:
  ChunkInfo(uint32_t col, UnitsType type)
      : unitsType_{static_cast<unsigned char>(type)} {
    memcpy(column_, &col, sizeof(column_));
  }

  /** The stored column number, reassembled from its bytes. */
  uint32_t column() const {
    uint32_t result;
    memcpy(&result, column_, sizeof(result));
    return result;
  }

  /** The stored units classification. */
  UnitsType unitsType() const {
    MOZ_ASSERT(unitsType_ <= 1, "unitsType_ must be 0 or 1");
    return static_cast<UnitsType>(unitsType_);
  }

  /**
   * Switch the classification from PossiblyMultiUnit to
   * GuaranteedSingleUnit.  Asserts that the current value is
   * PossiblyMultiUnit.
   */
  void guaranteeSingleUnits() {
    MOZ_ASSERT(unitsType() == UnitsType::PossiblyMultiUnit,
               "should only be setting to possibly optimize from the "
               "pessimistic case");
    unitsType_ = static_cast<unsigned char>(UnitsType::GuaranteedSingleUnit);
  }
};

/**
 * The kinds of invalid character escape that are distinguished.
 * TokenStreamAnyChars uses this to record the type of the first invalid
 * escape in a template literal, with |None| meaning there isn't one.
 */
enum class InvalidEscapeType {
  // No invalid character escapes.
  None,
  // A malformed \x escape.
  Hexadecimal,
  // A malformed \u escape.
  Unicode,
  // An otherwise well-formed \u escape which represents a
  // codepoint > 10FFFF.
  UnicodeOverflow,
  // An octal escape in a template token.
  Octal
};

class TokenStreamAnyChars : public TokenStreamShared {
 private:
  // Constant-at-construction fields.

  JSContext* const cx;

  /** Options used for parsing/tokenizing. */
  const JS::ReadOnlyCompileOptions& options_;

  /**
   * Pointer used internally to test whether in strict mode.  Use |strictMode()|
   * instead of this field.
   */
  StrictModeGetter* const strictModeGetter_;

  /** Input filename or null. */
  const char* const filename_;

  // Column number computation fields.

  /**
   * A map of (line number => sequence of the column numbers at
   * |ColumnChunkLength|-unit boundaries rewound [if needed] to the nearest code
   * point boundary).  (|TokenStreamAnyChars::computePartialColumn| is the sole
   * user of |ColumnChunkLength| and therefore contains its definition.)
   *
   * Entries appear in this map only when a column computation of sufficient
   * distance is performed on a line -- and only when the column is beyond the
   * first |ColumnChunkLength| units.  Each line's vector is lazily filled as
   * greater offsets require column computations.
   */
  mutable HashMap<uint32_t, Vector<ChunkInfo>> longLineColumnInfo_;

  // Computing accurate column numbers requires at *some* point linearly
  // iterating through prior source units in the line, to properly account for
  // multi-unit code points.  This is quadratic if counting happens repeatedly.
  //
  // But usually we need columns for advancing offsets through scripts.  By
  // caching the last ((line number, offset) => relative column) mapping (in
  // similar manner to how |SourceCoords::lastIndex_| is used to cache
  // (offset => line number) mappings) we can usually avoid re-iterating through
  // the common line prefix.
  //
  // Additionally, we avoid hash table lookup costs by caching the
  // |Vector<ChunkInfo>*| for the line of the last lookup.  (|nullptr| means we
  // must look it up -- or it hasn't been created yet.)  This pointer is nulled
  // when a lookup on a new line occurs, but as it's not a pointer at literal,
  // reallocatable element data, it's *not* invalidated when new entries are
  // added to such a vector.

  /**
   * The line in which the last column computation occurred, or UINT32_MAX if
   * no prior computation has yet happened.
   */
  mutable uint32_t lineOfLastColumnComputation_ = UINT32_MAX;

  /**
   * The chunk vector of the line for that last column computation.  This is
   * null if the chunk vector needs to be recalculated or initially created.
   */
  mutable Vector<ChunkInfo>* lastChunkVectorForLine_ = nullptr;

  /**
   * The offset (in code units) of the last column computation performed,
   * relative to source start.
   */
  mutable uint32_t lastOffsetOfComputedColumn_ = UINT32_MAX;

  /**
   * The column number for the offset (in code units) of the last column
   * computation performed, relative to source start.
   */
  mutable uint32_t lastComputedColumn_ = 0;

  // Intra-token fields.

  /**
   * The offset of the first invalid escape in a template literal.  (If there is
   * one -- if not, the value of this field is meaningless.)
   *
   * See also |invalidTemplateEscapeType|.
   */
  uint32_t invalidTemplateEscapeOffset = 0;

  /**
   * The type of the first invalid escape in a template literal.  (If there
   * isn't one, this will be |None|.)
   *
   * See also |invalidTemplateEscapeOffset|.
   */
  InvalidEscapeType invalidTemplateEscapeType = InvalidEscapeType::None;

  // Fields with values relevant across tokens (and therefore potentially across
  // function boundaries, such that lazy function parsing and stream-seeking
  // must take care in saving and restoring them).

  /** Line number and offset-to-line mapping information. */
  SourceCoords srcCoords;

  /** Circular token buffer of gotten tokens that have been ungotten. */
  Token tokens[ntokens] = {};

  /** The index in |tokens| of the last parsed token. */
  unsigned cursor_ = 0;

  /** The number of tokens in |tokens| available to be gotten. */
  unsigned lookahead = 0;

  /** The current line number. */
  unsigned lineno;

  /** Various flag bits (see above). */
  TokenStreamFlags flags = {};

  /** The offset of the start of the current line. */
  size_t linebase = 0;

  /** The start of the previous line, or |size_t(-1)| on the first line. */
  size_t prevLinebase = size_t(-1);

  /** The user's requested source URL.  Null if none has been set. */
  UniqueTwoByteChars displayURL_ = nullptr;

  /** The URL of the source map for this script.  Null if none has been set. */
  UniqueTwoByteChars sourceMapURL_ = nullptr;

  // Assorted boolean fields, none of which require maintenance across tokens,
  // stored at class end to minimize padding.

  /**
   * Whether syntax errors should or should not contain details about the
   * precise nature of the error.  (This is intended for use in suppressing
   * content-revealing details about syntax errors in cross-origin scripts on
   * the web.)
   */
  const bool mutedErrors;

  /**
   * An array storing whether a TokenKind observed while attempting to extend
   * a valid AssignmentExpression into an even longer AssignmentExpression
   * (e.g., extending '3' to '3 + 5') will terminate it without error.
   *
   * For example, ';' always ends an AssignmentExpression because it ends a
   * Statement or declaration.  '}' always ends an AssignmentExpression
   * because it terminates BlockStatement, FunctionBody, and embedded
   * expressions in TemplateLiterals.  Therefore both entries are set to true
   * in TokenStreamAnyChars construction.
   *
   * But e.g. '+' *could* extend an AssignmentExpression, so its entry here
   * is false.  Meanwhile 'this' can't extend an AssignmentExpression, but
   * it's only valid after a line break, so its entry here must be false.
   *
   * NOTE: This array could be static, but without C99's designated
   *       initializers it's easier zeroing here and setting the true entries
   *       in the constructor body.  (Having this per-instance might also aid
   *       locality.)  Don't worry!  Initialization time for each TokenStream
   *       is trivial.  See bug 639420.
   */
  bool isExprEnding[size_t(TokenKind::Limit)] = {};  // all-false initially

  // End of fields.

 public:
  TokenStreamAnyChars(JSContext* cx, const JS::ReadOnlyCompileOptions& options,
                      StrictModeGetter* smg);

  template <typename Unit, class AnyCharsAccess>
  friend class GeneralTokenStreamChars;
  template <typename Unit, class AnyCharsAccess>
  friend class TokenStreamChars;
  template <typename Unit, class AnyCharsAccess>
  friend class TokenStreamSpecific;

  template <typename Unit>
  friend class TokenStreamPosition;

  // Accessors.
  /** The index in |tokens| of the last parsed token. */
  unsigned cursor() const { return cursor_; }
  /** The index in |tokens| one slot after the cursor, wrapping via the mask. */
  unsigned nextCursor() const { return (cursor_ + 1) & ntokensMask; }
  /** The index in |tokens| |steps| slots after the cursor, wrapping likewise. */
  unsigned aheadCursor(unsigned steps) const {
    return (cursor_ + steps) & ntokensMask;
  }

  /** The token stored at the cursor position in |tokens|. */
  const Token& currentToken() const { return tokens[cursor()]; }
  /** Whether the current token's type equals |type|. */
  bool isCurrentTokenType(TokenKind type) const {
    return currentToken().type == type;
  }

  MOZ_MUST_USE bool checkOptions();

 private:
  PropertyName* reservedWordToPropertyName(TokenKind tt) const;

 public:
  /**
   * The name denoted by the current token.  Name and PrivateName tokens carry
   * it directly.  Any other token is asserted to be a possible identifier
   * name, and its name is obtained from the token's kind.
   */
  PropertyName* currentName() const {
    const Token& token = currentToken();
    if (token.type == TokenKind::Name ||
        token.type == TokenKind::PrivateName) {
      return token.name();
    }

    MOZ_ASSERT(TokenKindIsPossibleIdentifierName(token.type));
    return reservedWordToPropertyName(token.type);
  }

  bool currentNameHasEscapes() const {
    if (isCurrentTokenType(TokenKind::Name) ||
        isCurrentTokenType(TokenKind::PrivateName)) {
      TokenPos pos = currentToken().pos;
      return (pos.end - pos.begin) != currentToken().name()->length();
    }

    MOZ_ASSERT(TokenKindIsPossibleIdentifierName(currentToken().type));
    return false;
  }

  /** Whether |TokenKindIsAssignment| holds for the current token's type. */
  bool isCurrentTokenAssignment() const {
    return TokenKindIsAssignment(currentToken().type);
  }

  // Flag methods.
  /** Whether end of file has been hit. */
  bool isEOF() const { return flags.isEOF; }
  /** Whether an octal escape or 0-prefixed octal literal has been seen. */
  bool sawDeprecatedOctal() const { return flags.sawDeprecatedOctal; }
  /** Whether a syntax error has been hit. */
  bool hadError() const { return flags.hadError; }
  /** Reset the deprecated-octal flag to false. */
  void clearSawDeprecatedOctal() { flags.sawDeprecatedOctal = false; }

  /** Whether an invalid template escape is recorded (type is not |None|). */
  bool hasInvalidTemplateEscape() const {
    return invalidTemplateEscapeType != InvalidEscapeType::None;
  }
  /**
   * Forget any recorded invalid template escape by resetting its type to
   * |None|.  The stored offset is left as is; it is meaningless while the
   * type is |None|.
   */
  void clearInvalidTemplateEscape() {
    invalidTemplateEscapeType = InvalidEscapeType::None;
  }

 private:
  // This is private because it should only be called by the tokenizer while
  // tokenizing, not by, for example, BytecodeEmitter.
  //
  // Returns false when no StrictModeGetter was supplied; otherwise returns
  // whatever the getter reports.
  bool strictMode() const {
    return strictModeGetter_ && strictModeGetter_->strictMode();
  }

  /**
   * Record an invalid escape encountered in a template literal.
   *
   * Only the first one is kept: once a type other than |None| is stored,
   * later calls leave the stored offset and type alone.  |type| must not be
   * |None|.
   */
  void setInvalidTemplateEscape(uint32_t offset, InvalidEscapeType type) {
    MOZ_ASSERT(type != InvalidEscapeType::None);
    if (invalidTemplateEscapeType == InvalidEscapeType::None) {
      invalidTemplateEscapeOffset = offset;
      invalidTemplateEscapeType = type;
    }
  }

 public:
  // Call this immediately after parsing an OrExpression to allow scanning the
  // next token with SlashIsRegExp without asserting (even though we just
  // peeked at it in SlashIsDiv mode).
  //
  // It's OK to disable the assertion because the places where this is called
  // have peeked at the next token in SlashIsDiv mode, and checked that it is
  // *not* a Div token.
  //
  // To see why it is necessary to disable the assertion, consider these two
  // programs:
  //
  //     x = arg => q       // per spec, this is all one statement, and the
  //     /a/g;              // slashes are division operators
  //
  //     x = arg => {}      // per spec, ASI at the end of this line
  //     /a/g;              // and that's a regexp literal
  //
  // The first program shows why orExpr() has to use SlashIsDiv mode when
  // peeking ahead for the next operator after parsing `q`. The second program
  // shows why matchOrInsertSemicolon() must use SlashIsRegExp mode when
  // scanning ahead for a semicolon.
  //
  // The whole body is DEBUG-only, so in non-DEBUG builds this function does
  // nothing.
  void allowGettingNextTokenWithSlashIsRegExp() {
#ifdef DEBUG
    // Check the precondition: Caller already peeked ahead at the next token,
    // in SlashIsDiv mode, and it is *not* a Div token.
    MOZ_ASSERT(hasLookahead());
    const Token& next = nextToken();
    MOZ_ASSERT(next.modifier == SlashIsDiv);
    MOZ_ASSERT(next.type != TokenKind::Div);
    // Re-tag the buffered token so that a later get in SlashIsRegExp mode
    // sees a matching modifier.
    tokens[nextCursor()].modifier = SlashIsRegExp;
#endif
  }

#ifdef DEBUG
  /** DEBUG-only: whether no tokens in |tokens| are available to be gotten. */
  inline bool debugHasNoLookahead() const { return lookahead == 0; }
#endif

  // True iff |displayURL_| is non-null.
  bool hasDisplayURL() const {
    const bool present = displayURL_ != nullptr;
    return present;
  }

  // Non-owning pointer to the characters held by |displayURL_| (whatever
  // |get()| yields when nothing is held; check hasDisplayURL() first).
  char16_t* displayURL() {
    char16_t* url = displayURL_.get();
    return url;
  }

  // True iff |sourceMapURL_| is non-null.
  bool hasSourceMapURL() const {
    const bool present = sourceMapURL_ != nullptr;
    return present;
  }

  // Non-owning pointer to the characters held by |sourceMapURL_| (whatever
  // |get()| yields when nothing is held; check hasSourceMapURL() first).
  char16_t* sourceMapURL() {
    char16_t* url = sourceMapURL_.get();
    return url;
  }

  // The JSContext stored in |cx|.
  JSContext* context() const {
    JSContext* const context = cx;
    return context;
  }

  using LineToken = SourceCoords::LineToken;

  // Forward to |srcCoords| to obtain the LineToken for absolute |offset|.
  LineToken lineToken(uint32_t offset) const {
    const LineToken token = srcCoords.lineToken(offset);
    return token;
  }

  // Forward to |srcCoords| to obtain the line number for |lineToken|.
  uint32_t lineNumber(LineToken lineToken) const {
    const uint32_t number = srcCoords.lineNumber(lineToken);
    return number;
  }

  // Forward to |srcCoords| to obtain the line-start value for |lineToken|.
  uint32_t lineStart(LineToken lineToken) const {
    const uint32_t start = srcCoords.lineStart(lineToken);
    return start;
  }

  /**
   * Fill in |err|.
   *
   * If the token stream doesn't have location info for this error, use the
   * caller's location (including line/column number) and return false.  (No
   * line of context is set.)
   *
   * Otherwise fill in everything in |err| except 1) line/column numbers and
   * 2) line-of-context-related fields and return true.  The caller *must*
   * fill in the line/column number; filling the line of context is optional.
   */
  bool fillExceptingContext(ErrorMetadata* err, uint32_t offset);

  // Reset the per-line |isDirtyLine| flag; meant to be called at end of line.
  MOZ_ALWAYS_INLINE void updateFlagsForEOL() {
    flags.isDirtyLine = false;
  }

 private:
  /**
   * Compute the "partial" column number in Unicode code points of the absolute
   * |offset| within source text on the line of |lineToken| (which must have
   * been computed from |offset|).
   *
   * A partial column number on a line that isn't the first line is just the
   * actual column number.  But a partial column number on the first line is the
   * column number *ignoring the initial line/column of the script*.  For
   * example, consider this HTML with line/column number keys:
   *
   *                 1         2            3
   *       0123456789012345678901234   567890
   *     ------------------------------------
   *   1 | <html>
   *   2 | <head>
   *   3 |   <script>var x = 3;  x &lt; 4;
   *   4 | const y = 7;</script>
   *   5 | </head>
   *   6 | <body></body>
   *   7 | </html>
   *
   * The script would be compiled specifying initial (line, column) of (3, 10)
   * using |JS::ReadOnlyCompileOptions::{lineno,column}|.  And the column
   * reported by |computeColumn| for the "v" of |var| would be 10.  But the
   * partial column number of the "v" in |var|, that this function returns,
   * would be 0.  On the other hand, the column reported by |computeColumn| and
   * the partial column number returned by this function for the "c" in |const|
   * would both be 0, because it's not in the first line of source text.
   *
   * The partial column is with respect *only* to the JavaScript source text as
   * SpiderMonkey sees it.  In the example, the "&lt;" is converted to "<" by
   * the browser before SpiderMonkey would see it.  So the partial column of the
   * "4" in the inequality would be 16, not 19.
   *
   * Code points are not all equal length, so counting requires *some* kind of
   * linear-time counting from the start of the line.  This function attempts
   * various tricks to reduce this cost.  If these optimizations succeed,
   * repeated calls to this function on a line will pay a one-time cost linear
   * in the length of the line, then each call pays a separate constant-time
   * cost.  If the optimizations do not succeed, this function works in time
   * linear in the length of the line.
   *
   * It's unusual for a function in *this* class to be |Unit|-templated, but
   * while this operation manages |Unit|-agnostic fields in this class and in
   * |srcCoords|, it must *perform* |Unit|-sensitive computations to fill them.
   * And this is the best place to do that.
   */
  template <typename Unit>
  uint32_t computePartialColumn(const LineToken lineToken,
                                const uint32_t offset,
                                const SourceUnits<Unit>& sourceUnits) const;

  /**
   * Update line/column information for the start of a new line at
   * |lineStartOffset|.
   */
  MOZ_MUST_USE MOZ_ALWAYS_INLINE bool internalUpdateLineInfoForEOL(
      uint32_t lineStartOffset);

 public:
  // The token sitting at |nextCursor()| in the token buffer.  Only valid while
  // at least one token of lookahead exists (asserted).
  const Token& nextToken() const {
    MOZ_ASSERT(hasLookahead());
    const auto slot = nextCursor();
    return tokens[slot];
  }

  // True iff the |lookahead| count is positive.
  bool hasLookahead() const {
    const bool any = lookahead > 0;
    return any;
  }

  // Step |cursor_| forward by one slot, wrapping via |ntokensMask|.
  void advanceCursor() {
    const auto next = (cursor_ + 1) & ntokensMask;
    cursor_ = next;
  }

  // Step |cursor_| back by one slot, wrapping via |ntokensMask|.
  void retractCursor() {
    const auto previous = (cursor_ - 1) & ntokensMask;
    cursor_ = previous;
  }

  // Advance the cursor and hand back the token slot it now designates.  The
  // slot's memory is marked undefined (for memory checkers) before returning,
  // so the caller is expected to fill it in.
  Token* allocateToken() {
    advanceCursor();

    Token* slot = &tokens[cursor()];
    MOZ_MAKE_MEM_UNDEFINED(slot, sizeof(Token));
    return slot;
  }

  // Return the most recently scanned token to the stream: the cursor moves
  // back one slot and that token is counted as lookahead again.  The
  // lookahead count must be below |maxLookahead| (asserted).
  void ungetToken() {
    MOZ_ASSERT(lookahead < maxLookahead);
    retractCursor();
    ++lookahead;
  }

 public:
  void adoptState(TokenStreamAnyChars& other) {
    // Directive-supplied URLs that |other| holds replace whatever this stream
    // recorded earlier; URLs |other| lacks leave ours alone.  (No
    // specification says the last-in-source-order directive wins, sadly, but
    // that is how the ordinary case behaves, so do the same here.)
    if (other.displayURL_) {
      displayURL_ = std::move(other.displayURL_);
    }
    if (other.sourceMapURL_) {
      sourceMapURL_ = std::move(other.sourceMapURL_);
    }
  }

  // Compute error metadata for an error at no offset.
  void computeErrorMetadataNoOffset(ErrorMetadata* err);

  // ErrorReporter API Helpers

  // Provide minimal set of error reporting API given we cannot use
  // ErrorReportMixin here. "report" prefix is added to avoid conflict with
  // ErrorReportMixin methods in TokenStream class.
  //
  // Both report the error identified by |errorNumber| without associating it
  // with a source offset.  The first takes the message arguments as C
  // varargs; the second takes them already packaged as a |va_list*|.
  // NOTE(review): presumably the variadic form forwards to the VA form --
  // confirm against the out-of-line definitions.
  void reportErrorNoOffset(unsigned errorNumber, ...);
  void reportErrorNoOffsetVA(unsigned errorNumber, va_list* args);

  // The compile options stored in |options_|.
  const JS::ReadOnlyCompileOptions& options() const {
    return options_;
  }

  // The filename pointer stored in |filename_|.
  const char* getFilename() const {
    const char* const name = filename_;
    return name;
  }
};

/** The numeric value of a UTF-16 code unit: the unit itself. */
constexpr char16_t CodeUnitValue(char16_t unit) {
  return unit;
}

/** The numeric value of a UTF-8 code unit, as an unsigned byte. */
constexpr uint8_t CodeUnitValue(mozilla::Utf8Unit unit) {
  const uint8_t value = unit.toUint8();
  return value;
}

template <typename Unit>
class TokenStreamCharsBase;

// Deleted catch-all: a call whose argument type is not exactly one of the
// non-template overloads below selects this template and fails to compile,
// rather than silently converting the argument.
template <typename T>
inline bool IsLineTerminator(T) = delete;

/**
 * True iff |codePoint| is one of the four line terminators: LF, CR,
 * U+2028 LINE SEPARATOR, or U+2029 PARAGRAPH SEPARATOR.
 */
inline bool IsLineTerminator(char32_t codePoint) {
  // ASCII terminators first.
  if (codePoint == '\n' || codePoint == '\r') {
    return true;
  }

  // Then the two Unicode separators.
  return codePoint == unicode::LINE_SEPARATOR ||
         codePoint == unicode::PARA_SEPARATOR;
}

/**
 * UTF-16 code unit flavor.  Every LineTerminator fits in a single char16_t,
 * so widening the unit and deferring to the char32_t overload is exact.
 */
inline bool IsLineTerminator(char16_t unit) {
  const char32_t widened = unit;
  return IsLineTerminator(widened);
}

template <typename Unit>
struct SourceUnitTraits;

/** Encoding facts for UTF-16 source text. */
template <>
struct SourceUnitTraits<char16_t> {
 public:
  /** The most code units any one code point occupies (a surrogate pair). */
  static constexpr uint8_t maxUnitsLength = 2;

  /** Code units needed for |codePoint|: two from NonBMPMin up, else one. */
  static constexpr size_t lengthInUnits(char32_t codePoint) {
    return codePoint >= unicode::NonBMPMin ? 2 : 1;
  }
};

/** Encoding facts for UTF-8 source text. */
template <>
struct SourceUnitTraits<mozilla::Utf8Unit> {
 public:
  /** The most code units any one code point occupies. */
  static constexpr uint8_t maxUnitsLength = 4;

  /**
   * Code units needed for |codePoint|, by range: below 0x80 one, below 0x800
   * two, below 0x10000 three, otherwise four.
   */
  static constexpr size_t lengthInUnits(char32_t codePoint) {
    if (codePoint < 0x80) {
      return 1;
    }
    if (codePoint < 0x800) {
      return 2;
    }
    if (codePoint < 0x10000) {
      return 3;
    }
    return 4;
  }
};

/**
 * The outcome of looking ahead in source text for the next validly-encoded
 * code point.
 *
 * Either no valid code point was found, and then |isNone()|; or one was
 * found, and then |!isNone()|, |codePoint()| is its value, and
 * |lengthInUnits()| is how many code units encode it.
 *
 * Conceptually this is |Maybe<struct { char32_t v; uint8_t len; }>|, with a
 * zero length standing in for "nothing".
 */
template <typename Unit>
class PeekedCodePoint final {
  using SourceUnitTraits = frontend::SourceUnitTraits<Unit>;

  char32_t codePoint_ = 0;
  uint8_t lengthInUnits_ = 0;

  // The empty state is only reachable through |none()|.
  PeekedCodePoint() = default;

 public:
  /**
   * Build a peeked code point from its value and its length in code units.
   *
   * The length could be derived from the value for both UTF-8 and JS's
   * flavor of UTF-16, but callers typically know it already from decoding.
   * Taking both spares a recomputation and permits a consistency assertion.
   */
  PeekedCodePoint(char32_t codePoint, uint8_t lengthInUnits)
      : codePoint_(codePoint), lengthInUnits_(lengthInUnits) {
    MOZ_ASSERT(codePoint <= unicode::NonBMPMax);
    MOZ_ASSERT(lengthInUnits != 0, "bad code point length");
    MOZ_ASSERT(lengthInUnits == SourceUnitTraits::lengthInUnits(codePoint));
  }

  /** Build a PeekedCodePoint that stands for "no valid code point". */
  static PeekedCodePoint none() { return PeekedCodePoint(); }

  /** Whether this stands for "no valid code point". */
  bool isNone() const { return lengthInUnits_ == 0; }

  /** The code point's value.  Requires |!isNone()|. */
  char32_t codePoint() const {
    MOZ_ASSERT(!isNone());
    return codePoint_;
  }

  /** The code point's length in code units.  Requires |!isNone()|. */
  uint8_t lengthInUnits() const {
    MOZ_ASSERT(!isNone());
    return lengthInUnits_;
  }
};

/**
 * Peek the code point that starts at |ptr| in UTF-16 text ending at |end|.
 *
 * Returns |none()| only when |ptr| is at or past |end|.  A lead surrogate
 * followed by a trail surrogate decodes to one two-unit code point; any other
 * unit -- including a lone surrogate -- is returned as a one-unit code point
 * of the same value.
 */
inline PeekedCodePoint<char16_t> PeekCodePoint(const char16_t* const ptr,
                                               const char16_t* const end) {
  using Peeked = PeekedCodePoint<char16_t>;

  if (MOZ_UNLIKELY(ptr >= end)) {
    return Peeked::none();
  }

  const char16_t lead = *ptr;

  // Common case: not the start of a surrogate pair.
  if (MOZ_LIKELY(!unicode::IsLeadSurrogate(lead))) {
    return Peeked(lead, 1);
  }

  // Lead surrogate with nothing, or with a non-trail unit, after it.
  const char16_t* const following = ptr + 1;
  if (MOZ_UNLIKELY(following >= end ||
                   !unicode::IsTrailSurrogate(*following))) {
    return Peeked(lead, 1);
  }

  return Peeked(unicode::UTF16Decode(lead, *following), 2);
}

/**
 * Peek the code point that starts at |ptr| in UTF-8 text ending at |end|.
 *
 * Returns |none()| when |ptr| is at or past |end|, or when
 * |mozilla::DecodeOneUtf8CodePoint| cannot decode a code point there.
 */
inline PeekedCodePoint<mozilla::Utf8Unit> PeekCodePoint(
    const mozilla::Utf8Unit* const ptr, const mozilla::Utf8Unit* const end) {
  using Peeked = PeekedCodePoint<mozilla::Utf8Unit>;

  if (MOZ_UNLIKELY(ptr >= end)) {
    return Peeked::none();
  }

  // ASCII fast path: one unit, no decoding needed.
  const mozilla::Utf8Unit lead = *ptr;
  if (mozilla::IsAscii(lead)) {
    return Peeked(lead.toUint8(), 1);
  }

  // The decoder is handed a cursor just past the lead unit; the length below
  // is measured from |ptr| up to wherever it leaves that cursor.
  const mozilla::Utf8Unit* afterLead = ptr + 1;
  const mozilla::Maybe<char32_t> decoded =
      mozilla::DecodeOneUtf8CodePoint(lead, &afterLead, end);
  if (decoded.isNothing()) {
    return Peeked::none();
  }

  const auto len =
      mozilla::AssertedCast<uint8_t>(mozilla::PointerRangeSize(ptr, afterLead));
  MOZ_ASSERT(len <= 4);

  return Peeked(decoded.value(), len);
}

/**
 * True iff |unit| is '\n' or '\r'.
 *
 * BEWARE: the Unicode line/paragraph separators take more than one UTF-8
 * code unit, so this is exact for a single Utf8Unit but not for UTF-8 text
 * as a whole.  Callers must themselves handle a |unit| that begins a
 * multi-unit Unicode LineTerminator!
 */
inline bool IsSingleUnitLineTerminator(mozilla::Utf8Unit unit) {
  const mozilla::Utf8Unit lineFeed('\n');
  const mozilla::Utf8Unit carriageReturn('\r');
  return unit == lineFeed || unit == carriageReturn;
}

// This is the low-level interface to the JS source code buffer.  It just gets
// raw Unicode code units -- 16-bit char16_t units of source text that are not
// (always) full code points, or 8-bit units of UTF-8 source text.
// TokenStream functions are layered on top and do some extra stuff like
// converting all EOL sequences to '\n', tracking the line number, and setting
// |flags.isEOF|.  (The "raw" in "raw Unicode code units" refers to the lack of
// EOL sequence normalization.)
//
// units[0..length-1] often represents a substring of some larger source,
// where we have only the substring in memory. The |startOffset| argument
// indicates the offset within this larger string at which our string
// begins, the offset of |units[0]|.
template <typename Unit>
class SourceUnits {
 private:
  /** Base of buffer. */
  const Unit* base_;

  /** Offset of base_[0]. */
  uint32_t startOffset_;

  /** Limit for quick bounds check. */
  const Unit* limit_;

  /** Next char to get. */
  const Unit* ptr;

 public:
  SourceUnits(const Unit* units, size_t length, size_t startOffset)
      : base_(units),
        startOffset_(startOffset),
        limit_(units + length),
        ptr(units) {}

  /** True iff the next code unit to get is the first one in the buffer. */
  bool atStart() const {
    MOZ_ASSERT(!isPoisoned(), "shouldn't be using if poisoned");
    return ptr == base_;
  }

  /** True iff no code units remain to be gotten. */
  bool atEnd() const {
    MOZ_ASSERT(!isPoisoned(), "shouldn't be using if poisoned");
    MOZ_ASSERT(ptr <= limit_, "shouldn't have overrun");
    return ptr >= limit_;
  }

  /** Number of code units from the current position to the limit. */
  size_t remaining() const {
    MOZ_ASSERT(!isPoisoned(),
               "can't get a count of remaining code units if poisoned");
    return mozilla::PointerRangeSize(ptr, limit_);
  }

  /** Absolute offset of the first code unit in the buffer. */
  size_t startOffset() const { return startOffset_; }

  /** Absolute offset of the next code unit to get. */
  size_t offset() const {
    return startOffset_ + mozilla::PointerRangeSize(base_, ptr);
  }

  /**
   * Pointer to the code unit at absolute |offset|, which must lie between
   * |startOffset()| and the absolute offset of the limit, inclusive.
   */
  const Unit* codeUnitPtrAt(size_t offset) const {
    MOZ_ASSERT(!isPoisoned(), "shouldn't be using if poisoned");
    MOZ_ASSERT(startOffset_ <= offset);
    MOZ_ASSERT(offset - startOffset_ <=
               mozilla::PointerRangeSize(base_, limit_));
    return base_ + (offset - startOffset_);
  }

  /** Pointer to the next code unit to get. */
  const Unit* current() const { return ptr; }

  /** Pointer one past the last code unit in the buffer. */
  const Unit* limit() const { return limit_; }

  /** The code unit just before the current position; must not be at start. */
  Unit previousCodeUnit() {
    MOZ_ASSERT(!isPoisoned(), "can't get previous code unit if poisoned");
    MOZ_ASSERT(!atStart(), "must have a previous code unit to get");
    return *(ptr - 1);
  }

  /** Get the next code unit and advance past it.  Performs no bounds check. */
  Unit getCodeUnit() {
    return *ptr++;  // this will nullptr-crash if poisoned
  }

  /** Read the next code unit without advancing.  Performs no bounds check. */
  Unit peekCodeUnit() const {
    return *ptr;  // this will nullptr-crash if poisoned
  }

  /**
   * Determine the next code point in source text.  The code point is not
   * normalized: '\r', '\n', '\u2028', and '\u2029' are returned literally.
   * If there is no next code point because |atEnd()|, or if an encoding
   * error is encountered, return a |PeekedCodePoint| that |isNone()|.
   *
   * This function does not report errors: code that attempts to get the next
   * code point must report any error.
   *
   * If a next code point is found, it may be consumed by passing it to
   * |consumeKnownCodePoint|.
   */
  PeekedCodePoint<Unit> peekCodePoint() const {
    return PeekCodePoint(ptr, limit_);
  }

 private:
#ifdef DEBUG
  void assertNextCodePoint(const PeekedCodePoint<Unit>& peeked);
#endif

 public:
  /**
   * Consume a peeked code point that |!isNone()|.
   *
   * This call DOES NOT UPDATE LINE-STATUS.  You may need to call
   * |updateLineInfoForEOL()| and |updateFlagsForEOL()| if this consumes a
   * LineTerminator.  Note that if this consumes '\r', you also must consume
   * an optional '\n' (i.e. a full LineTerminatorSequence) before doing so.
   */
  void consumeKnownCodePoint(const PeekedCodePoint<Unit>& peeked) {
    MOZ_ASSERT(!peeked.isNone());
    MOZ_ASSERT(peeked.lengthInUnits() <= remaining());

#ifdef DEBUG
    assertNextCodePoint(peeked);
#endif

    ptr += peeked.lengthInUnits();
  }

  /**
   * Match |n| hexadecimal digits and store their value in |*out|.
   *
   * On success the digits are consumed and true is returned.  On failure
   * (fewer than |n| units remain, or a non-hex-digit is seen) nothing is
   * consumed, |*out| is untouched, and false is returned.
   */
  bool matchHexDigits(uint8_t n, char16_t* out) {
    MOZ_ASSERT(!isPoisoned(), "shouldn't peek into poisoned SourceUnits");
    MOZ_ASSERT(n <= 4, "hexdigit value can't overflow char16_t");
    if (n > remaining()) {
      return false;
    }

    char16_t v = 0;
    for (uint8_t i = 0; i < n; i++) {
      auto unit = CodeUnitValue(ptr[i]);
      if (!mozilla::IsAsciiHexDigit(unit)) {
        return false;
      }

      v = (v << 4) | mozilla::AsciiAlphanumericToNumber(unit);
    }

    *out = v;
    ptr += n;
    return true;
  }

  /**
   * Match the next |length| code units against |chars|.  On success the units
   * are consumed and true is returned; on any mismatch, or if fewer than
   * |length| units remain, the position is left where it started and false is
   * returned.
   */
  bool matchCodeUnits(const char* chars, uint8_t length) {
    MOZ_ASSERT(!isPoisoned(), "shouldn't match into poisoned SourceUnits");
    if (length > remaining()) {
      return false;
    }

    const Unit* start = ptr;
    const Unit* end = ptr + length;
    while (ptr < end) {
      if (*ptr++ != Unit(*chars++)) {
        // Roll back everything consumed so far.
        ptr = start;
        return false;
      }
    }

    return true;
  }

  /** Advance past |n| code units, which must all exist. */
  void skipCodeUnits(uint32_t n) {
    MOZ_ASSERT(!isPoisoned(), "shouldn't use poisoned SourceUnits");
    MOZ_ASSERT(n <= remaining(), "shouldn't skip beyond end of SourceUnits");
    ptr += n;
  }

  /** Move back over |n| code units, without passing the buffer's start. */
  void unskipCodeUnits(uint32_t n) {
    MOZ_ASSERT(!isPoisoned(), "shouldn't use poisoned SourceUnits");
    MOZ_ASSERT(n <= mozilla::PointerRangeSize(base_, ptr),
               "shouldn't unskip beyond start of SourceUnits");
    ptr -= n;
  }

 private:
  friend class TokenStreamCharsBase<Unit>;

  /** If not at end and the next code unit is |c|, consume it; report which. */
  bool internalMatchCodeUnit(Unit c) {
    MOZ_ASSERT(!isPoisoned(), "shouldn't use poisoned SourceUnits");
    if (MOZ_LIKELY(!atEnd()) && *ptr == c) {
      ptr++;
      return true;
    }
    return false;
  }

 public:
  /** Consume the next code unit, which the caller knows to be |c|. */
  void consumeKnownCodeUnit(Unit c) {
    MOZ_ASSERT(!isPoisoned(), "shouldn't use poisoned SourceUnits");
    MOZ_ASSERT(*ptr == c, "consuming the wrong code unit");
    ptr++;
  }

  /**
   * Unget the '\r' (CR) that precedes a '\n' (LF), when ungetting a line
   * terminator that's a full "\r\n" sequence.  If the prior code unit isn't
   * '\r' -- or there is no prior code unit, because the '\n' is the very
   * first unit in the buffer -- do nothing.
   */
  void ungetOptionalCRBeforeLF() {
    MOZ_ASSERT(!isPoisoned(),
               "shouldn't unget a '\\r' from poisoned SourceUnits");
    MOZ_ASSERT(*ptr == Unit('\n'),
               "function should only be called when a '\\n' was just "
               "ungotten, and any '\\r' preceding it must also be "
               "ungotten");
    // Only look behind when something is there: reading |ptr[-1]| while at
    // the start of the buffer would read before |base_|.
    if (!atStart() && *(ptr - 1) == Unit('\r')) {
      ptr--;
    }
  }

  /** Unget U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR. */
  inline void ungetLineOrParagraphSeparator();

  /** Move back over one code unit; must not be at start. */
  void ungetCodeUnit() {
    MOZ_ASSERT(!isPoisoned(), "can't unget from poisoned units");
    MOZ_ASSERT(!atStart(), "can't unget if currently at start");
    ptr--;
  }

  /**
   * Pointer to the next code unit to get.  Asserts the units aren't poisoned
   * unless |allowPoisoned| is passed.
   */
  const Unit* addressOfNextCodeUnit(bool allowPoisoned = false) const {
    MOZ_ASSERT_IF(!allowPoisoned, !isPoisoned());
    return ptr;
  }

  // Use this with caution!
  //
  // Repositions the next-code-unit pointer to |a| with no range checking;
  // |a| may be null only when |allowPoisoned| is passed.
  void setAddressOfNextCodeUnit(const Unit* a, bool allowPoisoned = false) {
    MOZ_ASSERT_IF(!allowPoisoned, a);
    ptr = a;
  }

  // Poison the SourceUnits so they can't be accessed again.
  void poisonInDebug() {
#ifdef DEBUG
    ptr = nullptr;
#endif
  }

 private:
  bool isPoisoned() const {
#ifdef DEBUG
    // |ptr| can be null for unpoisoned SourceUnits if this was initialized with
    // |units == nullptr| and |length == 0|.  In that case, for lack of any
    // better options, consider this to not be poisoned.
    return ptr == nullptr && ptr != limit_;
#else
    return false;
#endif
  }

 public:
  /**
   * Consume the rest of a single-line comment (but not the EOL/EOF that
   * terminates it).
   *
   * If an encoding error is encountered -- possible only for UTF-8 because
   * JavaScript's conception of UTF-16 encompasses any sequence of 16-bit
   * code units -- valid code points prior to the encoding error are consumed
   * and subsequent invalid code units are not consumed.  For example, given
   * these UTF-8 code units:
   *
   *   'B'   'A'  'D'  ':'   <bad code unit sequence>
   *   0x42  0x41 0x44 0x3A  0xD0 0x00 ...
   *
   * the first four code units are consumed, but 0xD0 and 0x00 are not
   * consumed because 0xD0 encodes a two-byte lead unit but 0x00 is not a
   * valid trailing code unit.
   *
   * It is expected that the caller will report such an encoding error when
   * it attempts to consume the next code point.
   */
  void consumeRestOfSingleLineComment();

  /**
   * The maximum radius of code around the location of an error that should
   * be included in a syntax error message -- this many code units to either
   * side.  The resulting window of data is then accordingly trimmed so that
   * the window contains only validly-encoded data.
   *
   * Because this number is the same for both UTF-8 and UTF-16, windows in
   * UTF-8 may contain fewer code points than windows in UTF-16.  As we only
   * use this for error messages, we don't particularly care.
   */
  static constexpr size_t WindowRadius = ErrorMetadata::lineOfContextRadius;

  /**
   * From absolute offset |offset|, search backward to find an absolute
   * offset within source text, no further than |WindowRadius| code units
   * away from |offset|, such that all code points from that offset to
   * |offset| are valid, non-LineTerminator code points.
   */
  size_t findWindowStart(size_t offset) const;

  /**
   * From absolute offset |offset|, find an absolute offset within source
   * text, no further than |WindowRadius| code units away from |offset|, such
   * that all code units from |offset| to that offset are valid,
   * non-LineTerminator code points.
   */
  size_t findWindowEnd(size_t offset) const;

  /**
   * Given a |window| of |encodingSpecificWindowLength| units encoding valid
   * Unicode text, with index |encodingSpecificTokenOffset| indicating a
   * particular code point boundary in |window|, compute the corresponding
   * token offset and length if |window| were encoded in UTF-16.  For
   * example:
   *
   *   // U+03C0 GREEK SMALL LETTER PI is encoded as 0xCF 0x80.
   *   const Utf8Unit* encodedWindow =
   *     reinterpret_cast<const Utf8Unit*>(u8"ππππ = @ FAIL");
   *   size_t encodedTokenOffset = 11; // 2 * 4 + ' = '.length
   *   size_t encodedWindowLength = 17; // 2 * 4 + ' = @ FAIL'.length
   *   size_t utf16Offset, utf16Length;
   *   computeWindowOffsetAndLength(encodedWindow,
   *                                encodedTokenOffset, &utf16Offset,
   *                                encodedWindowLength, &utf16Length);
   *   MOZ_ASSERT(utf16Offset == 7);
   *   MOZ_ASSERT(utf16Length == 13);
   *
   * This function asserts if called for UTF-16: the sole caller can avoid
   * computing UTF-16 offsets when they're definitely the same as the encoded
   * offsets.
   */
  inline void computeWindowOffsetAndLength(const Unit* encodeWindow,
                                           size_t encodingSpecificTokenOffset,
                                           size_t* utf16TokenOffset,
                                           size_t encodingSpecificWindowLength,
                                           size_t* utf16WindowLength);
};

/**
 * UTF-16: U+2028 and U+2029 are each a single code unit, so ungetting one is
 * ungetting one unit.  Debug builds first check that the previous unit really
 * is one of the two separators.
 */
template <>
inline void SourceUnits<char16_t>::ungetLineOrParagraphSeparator() {
#ifdef DEBUG
  char16_t prev = previousCodeUnit();
  MOZ_ASSERT(prev == unicode::LINE_SEPARATOR ||
             prev == unicode::PARA_SEPARATOR);
#endif

  ungetCodeUnit();
}

/**
 * UTF-8: move back three code units.  Debug builds then check that the units
 * now ahead are 0xE2 0x80 followed by 0xA8 or 0xA9, the encodings of U+2028
 * and U+2029.
 */
template <>
inline void SourceUnits<mozilla::Utf8Unit>::ungetLineOrParagraphSeparator() {
  unskipCodeUnits(3);

  MOZ_ASSERT(ptr[0].toUint8() == 0xE2);
  MOZ_ASSERT(ptr[1].toUint8() == 0x80);

#ifdef DEBUG
  uint8_t last = ptr[2].toUint8();
  MOZ_ASSERT(last == 0xA8 || last == 0xA9);
#endif
}

class TokenStreamCharsShared {
  // The buffer always holds char16_t, never Unit.  That simplification
  // hopefully removes any need for a UTF-8 regular expression parser and
  // keeps |copyCharBufferTo| markedly simpler.
  using CharBuffer = Vector<char16_t, 32>;

 protected:
  /**
   * Scratch buffer for the code points of an identifier or string that can't
   * be processed straight from the original source text (e.g. because the
   * text contains escapes).
   */
  CharBuffer charBuffer;

  /** Information for parsing with a lifetime longer than the parser itself. */
  CompilationInfo* compilationInfo;

 protected:
  explicit TokenStreamCharsShared(JSContext* cx,
                                  CompilationInfo* compilationInfo)
      : charBuffer(cx), compilationInfo(compilationInfo) {}

  MOZ_MUST_USE bool appendCodePointToCharBuffer(uint32_t codePoint);

  MOZ_MUST_USE bool copyCharBufferTo(
      JSContext* cx, UniquePtr<char16_t[], JS::FreePolicy>* destination);

  /**
   * Whether a code unit value is, by itself, a complete ASCII code point.
   * (Later code may still not use the exact value, if it notices that |unit|
   * belongs to a LineTerminatorSequence.)
   */
  static constexpr MOZ_ALWAYS_INLINE MOZ_MUST_USE bool isAsciiCodePoint(
      int32_t unit) {
    const auto asCodePoint = static_cast<char32_t>(unit);
    return mozilla::IsAscii(asCodePoint);
  }

  /**
   * Atomize the current contents of |charBuffer|, then empty the buffer.
   * Returns null on failure, in which case the buffer is left as it was.
   */
  JSAtom* drainCharBufferIntoAtom() {
    const char16_t* chars = charBuffer.begin();
    const size_t length = charBuffer.length();

    JSAtom* atom = AtomizeChars(this->compilationInfo->cx, chars, length);
    if (!atom) {
      return nullptr;
    }

    // Add to parser atoms table.
#ifdef JS_PARSER_ATOMS
    auto maybeId = this->compilationInfo->parserAtoms.internChar16(
        this->compilationInfo->cx, chars, length);
    if (maybeId.isErr()) {
      return nullptr;
    }
#endif  // JS_PARSER_ATOMS

    charBuffer.clear();
    return atom;
  }

 protected:
  void adoptState(TokenStreamCharsShared& other) {
    // |other|'s buffer may hold the text of a token that was gotten and then
    // ungotten.  Take it over, so that the token's final get from this stream
    // behaves as desired.
    charBuffer = std::move(other.charBuffer);
  }

 public:
  /** Direct, mutable access to the scratch buffer. */
  CharBuffer& getCharBuffer() { return charBuffer; }
};

/** View a span of UTF-8 code units as a span of |char| of the same size. */
inline mozilla::Span<const char> ToCharSpan(
    mozilla::Span<const mozilla::Utf8Unit> codeUnits) {
  static_assert(alignof(char) == alignof(mozilla::Utf8Unit),
                "must have equal alignment to reinterpret_cast<>");
  static_assert(sizeof(char) == sizeof(mozilla::Utf8Unit),
                "must have equal size to reinterpret_cast<>");

  // Two independent reasons make this cast safe.
  //
  // One: C++11 [basic.lval]p10 permits accessing any object's memory through
  // |char|.
  //
  // Two: Utf8Unit *contains* a |char|, so reading that memory as |char| is,
  // again per C++11 [basic.lval]p10, accessing it according to the object's
  // dynamic type: essentially trivially safe.
  const char* const chars = reinterpret_cast<const char*>(codeUnits.data());
  return mozilla::MakeSpan(chars, codeUnits.size());
}

template <typename Unit>
class TokenStreamCharsBase : public TokenStreamCharsShared {
 protected:
  using SourceUnits = frontend::SourceUnits<Unit>;

  /** Code units in the source code being tokenized. */
  SourceUnits sourceUnits;

  // End of fields.

 protected:
  TokenStreamCharsBase(JSContext* cx, CompilationInfo* compilationInfo,
                       const Unit* units, size_t length, size_t startOffset);

  /**
   * Convert a non-EOF code unit returned by |getCodeUnit()| or
   * |peekCodeUnit()| to a Unit code unit.
   */
  inline Unit toUnit(int32_t codeUnitValue);

  void ungetCodeUnit(int32_t c) {
    if (c == EOF) {
      return;
    }

    sourceUnits.ungetCodeUnit();
  }

  MOZ_ALWAYS_INLINE JSAtom* atomizeSourceChars(mozilla::Span<const Unit> units);

  /**
   * Try to match a non-LineTerminator ASCII code point.  Return true iff it
   * was matched.
   */
  bool matchCodeUnit(char expect) {
    MOZ_ASSERT(mozilla::IsAscii(expect));
    MOZ_ASSERT(expect != '\r');
    MOZ_ASSERT(expect != '\n');
    return this->sourceUnits.internalMatchCodeUnit(Unit(expect));
  }

  /**
   * Try to match an ASCII LineTerminator code point.  Return true iff it was
   * matched.
   */
  bool matchLineTerminator(char expect) {
    MOZ_ASSERT(expect == '\r' || expect == '\n');
    return this->sourceUnits.internalMatchCodeUnit(Unit(expect));
  }

  template <typename T>
  bool matchCodeUnit(T) = delete;
  template <typename T>
  bool matchLineTerminator(T) = delete;

  int32_t peekCodeUnit() {
    return MOZ_LIKELY(!sourceUnits.atEnd())
               ? CodeUnitValue(sourceUnits.peekCodeUnit())
               : EOF;
  }

  /** Consume a known, non-EOF code unit. */
  inline void consumeKnownCodeUnit(int32_t unit);

  // Forbid accidental calls to consumeKnownCodeUnit *not* with the single
  // unit-or-EOF type.  Unit should use SourceUnits::consumeKnownCodeUnit;
  // CodeUnitValue() results should go through toUnit(), or better yet just
  // use the original Unit.
  template <typename T>
  inline void consumeKnownCodeUnit(T) = delete;

  /**
   * Accumulate the provided range of already-validated text (valid UTF-8, or
   * anything if Unit is char16_t because JS allows lone surrogates) into
   * |charBuffer|.  Normalize '\r', '\n', and "\r\n" into '\n'.
   */
  MOZ_MUST_USE bool fillCharBufferFromSourceNormalizingAsciiLineBreaks(
      const Unit* cur, const Unit* end);

  /**
   * Add a null-terminated line of context to error information, for the line
   * in |sourceUnits| that contains |offset|.  Also record the window's
   * length and the offset of the error in the window.  (Don't bother adding
   * a line of context if it would be empty.)
   *
   * The window will contain no LineTerminators of any kind, and it will not
   * extend more than |SourceUnits::WindowRadius| to either side of |offset|,
   * nor into the previous or next lines.
   *
   * This function is quite internal, and you probably should be calling one
   * of its existing callers instead.
   */
  MOZ_MUST_USE bool addLineOfContext(ErrorMetadata* err, uint32_t offset);
};

/** UTF-16: a non-EOF code unit value narrows (with an asserted cast). */
template <>
inline char16_t TokenStreamCharsBase<char16_t>::toUnit(int32_t codeUnitValue) {
  MOZ_ASSERT(codeUnitValue != EOF, "EOF is not a Unit");
  const auto unit = mozilla::AssertedCast<char16_t>(codeUnitValue);
  return unit;
}

/**
 * UTF-8: a non-EOF code unit value narrows (with an asserted cast) to a byte,
 * which is then wrapped as a Utf8Unit.
 */
template <>
inline mozilla::Utf8Unit TokenStreamCharsBase<mozilla::Utf8Unit>::toUnit(
    int32_t value) {
  MOZ_ASSERT(value != EOF, "EOF is not a Unit");
  const auto byte = mozilla::AssertedCast<unsigned char>(value);
  return mozilla::Utf8Unit(byte);
}

/**
 * Convert the non-EOF code unit value back to a Unit and consume it from
 * |sourceUnits|.
 */
template <typename Unit>
inline void TokenStreamCharsBase<Unit>::consumeKnownCodeUnit(int32_t unit) {
  const Unit known = toUnit(unit);
  sourceUnits.consumeKnownCodeUnit(known);
}

/**
 * Atomize a span of UTF-16 source text.  When JS_PARSER_ATOMS is defined the
 * same characters are also interned in the parser atoms table.  Returns null
 * if either step fails.
 */
template <>
MOZ_ALWAYS_INLINE JSAtom* TokenStreamCharsBase<char16_t>::atomizeSourceChars(
    mozilla::Span<const char16_t> units) {
  const char16_t* const chars = units.data();
  const size_t length = units.size();

  JSAtom* atom = AtomizeChars(this->compilationInfo->cx, chars, length);
  if (!atom) {
    return nullptr;
  }

#ifdef JS_PARSER_ATOMS
  auto maybeId = this->compilationInfo->parserAtoms.internChar16(
      this->compilationInfo->cx, chars, length);
  if (maybeId.isErr()) {
    return nullptr;
  }
#endif  // JS_PARSER_ATOMS

  return atom;
}

/**
 * Atomize a span of UTF-8 source text.  When JS_PARSER_ATOMS is defined the
 * same units are also interned in the parser atoms table.  Returns null if
 * either step fails.
 */
template <>
MOZ_ALWAYS_INLINE JSAtom*
TokenStreamCharsBase<mozilla::Utf8Unit>::atomizeSourceChars(
    mozilla::Span<const mozilla::Utf8Unit> units) {
  // The atomizer wants plain |char|, so view the units that way.
  const auto chars = ToCharSpan(units);

  JSAtom* atom =
      AtomizeUTF8Chars(this->compilationInfo->cx, chars.data(), chars.size());
  if (!atom) {
    return nullptr;
  }

#ifdef JS_PARSER_ATOMS
  auto maybeId = this->compilationInfo->parserAtoms.internUtf8(
      this->compilationInfo->cx, units.data(), units.size());
  if (maybeId.isErr()) {
    return nullptr;
  }
#endif  // JS_PARSER_ATOMS

  return atom;
}

template <typename Unit>
class SpecializedTokenStreamCharsBase;

template <>
class SpecializedTokenStreamCharsBase<char16_t>
    : public TokenStreamCharsBase<char16_t> {
  using CharsBase = TokenStreamCharsBase<char16_t>;

 protected:
  using TokenStreamCharsShared::isAsciiCodePoint;
  // Deliberately don't |using| |sourceUnits| because of bug 1472569.  :-(

  using typename CharsBase::SourceUnits;

 protected:
  // These APIs are only usable by UTF-16-specific code.

  /**
   * With |lead| already consumed, consume whatever else belongs to the code
   * point it begins, and return that code point.  This cannot fail: in JS a
   * lone surrogate encodes a "code point" of the same value.
   */
  char32_t infallibleGetNonAsciiCodePointDontNormalize(char16_t lead) {
    MOZ_ASSERT(!isAsciiCodePoint(lead));
    MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == lead);

    // Single-unit code points, lone trailing surrogates included.
    if (MOZ_LIKELY(!unicode::IsLeadSurrogate(lead))) {
      return lead;
    }

    // A lead surrogate with no trailing surrogate after it stands alone.
    if (MOZ_UNLIKELY(
            this->sourceUnits.atEnd() ||
            !unicode::IsTrailSurrogate(this->sourceUnits.peekCodeUnit()))) {
      return lead;
    }

    // A proper surrogate pair: consume the trail unit and decode both.
    return unicode::UTF16Decode(lead, this->sourceUnits.getCodeUnit());
  }

 protected:
  // These APIs are in both SpecializedTokenStreamCharsBase specializations
  // and so are usable in subclasses no matter what Unit is.

  using CharsBase::CharsBase;
};

template <>
class SpecializedTokenStreamCharsBase<mozilla::Utf8Unit>
    : public TokenStreamCharsBase<mozilla::Utf8Unit> {
  using CharsBase = TokenStreamCharsBase<mozilla::Utf8Unit>;

 protected:
  // |sourceUnits| is intentionally not named in a |using| here: see bug
  // 1472569.  :-(

 protected:
  // Everything in this section is meant for UTF-8-specific code only.

  using typename CharsBase::SourceUnits;

  /**
   * An iterator-like facade over |SourceUnits|.  Each supported operator is
   * forwarded to a |SourceUnits| member function (|getCodeUnit()|,
   * |unskipCodeUnits()|, |previousCodeUnit()|, |remaining()|), so using the
   * iterator acts on the wrapped |SourceUnits| itself.
   *
   * Intended to be used together with the |SourceUnitsEnd| sentinel.
   */
  class SourceUnitsIterator {
    SourceUnits& sourceUnits_;
#ifdef DEBUG
    // Populated only in the copy that post-increment returns: a pointer at
    // the next code unit of source text at the moment post-increment was
    // called.  Dereferencing that copy empties it again.
    mutable mozilla::Maybe<const mozilla::Utf8Unit*>
        currentBeforePostIncrement_;
#endif

    // Assert that this iterator isn't a post-increment result still awaiting
    // its single permitted dereference.
    void assertNotPendingDereference() const {
      MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
                 "the only valid operation on a post-incremented "
                 "iterator is dereferencing a single time");
    }

   public:
    explicit SourceUnitsIterator(SourceUnits& sourceUnits)
        : sourceUnits_(sourceUnits) {}

    mozilla::Utf8Unit operator*() const {
      // Dereferencing an iterator conventionally yields the *next* value.
      // This iterator, however, is only dereferenced within |*iter++|, and
      // by then |operator++(int)| has already called
      // |sourceUnits_.getCodeUnit()|.  |sourceUnits_| is thus one unit past
      // the wanted one, which makes the previous unit the right answer.
      MOZ_ASSERT(currentBeforePostIncrement_.value() + 1 ==
                 sourceUnits_.current());
#ifdef DEBUG
      currentBeforePostIncrement_.reset();
#endif
      return sourceUnits_.previousCodeUnit();
    }

    SourceUnitsIterator operator++(int) {
      assertNotPendingDereference();

      SourceUnitsIterator beforeIncrement(*this);
#ifdef DEBUG
      beforeIncrement.currentBeforePostIncrement_.emplace(
          sourceUnits_.current());
#endif

      // Advance the shared |SourceUnits|; the unit itself is fetched later
      // by dereferencing the returned copy.
      sourceUnits_.getCodeUnit();
      return beforeIncrement;
    }

    void operator-=(size_t n) {
      assertNotPendingDereference();
      sourceUnits_.unskipCodeUnits(n);
    }

    mozilla::Utf8Unit operator[](ptrdiff_t index) {
      assertNotPendingDereference();
      MOZ_ASSERT(index == -1,
                 "must only be called to verify the value of the "
                 "previous code unit");
      return sourceUnits_.previousCodeUnit();
    }

    size_t remaining() const {
      assertNotPendingDereference();
      return sourceUnits_.remaining();
    }
  };

  /** Sentinel type standing for the end of the |SourceUnits| data. */
  class SourceUnitsEnd {};

  friend inline size_t operator-(const SourceUnitsEnd& aEnd,
                                 const SourceUnitsIterator& aIter);

 protected:
  // Everything in this section exists in every
  // SpecializedTokenStreamCharsBase specialization, so subclasses may use it
  // whatever Unit is.

  using CharsBase::CharsBase;
};

/**
 * "Distance" from |aIter| to the end sentinel, defined as whatever
 * |aIter.remaining()| returns.  |aEnd| is an empty tag object and is not
 * consulted.
 */
inline size_t operator-(const SpecializedTokenStreamCharsBase<
                            mozilla::Utf8Unit>::SourceUnitsEnd& aEnd,
                        const SpecializedTokenStreamCharsBase<
                            mozilla::Utf8Unit>::SourceUnitsIterator& aIter) {
  const size_t unitsLeft = aIter.remaining();
  return unitsLeft;
}

/**
 * Value type holding the offset at which a Token starts.  The offset is
 * computed once, at construction, and is read back through |offset()|.
 */
class TokenStart {
 public:
  /**
   * Record as the start offset the current offset of |sourceUnits| shifted by
   * |adjust|.  A negative |adjust| moves backwards: -1 designates the code
   * unit immediately before |sourceUnits|'s current offset.
   *
   * |SourceUnits| may be any type providing a const |offset()| member.
   */
  template <class SourceUnits>
  TokenStart(const SourceUnits& sourceUnits, ptrdiff_t adjust)
      : startOffset_(sourceUnits.offset() + adjust) {}

  TokenStart(const TokenStart&) = default;

  /** The start offset computed by the constructor. */
  uint32_t offset() const { return startOffset_; }

 private:
  uint32_t startOffset_;
};

template <typename Unit, class AnyCharsAccess>
class GeneralTokenStreamChars : public SpecializedTokenStreamCharsBase<Unit> {
  using CharsBase = TokenStreamCharsBase<Unit>;
  using SpecializedCharsBase = SpecializedTokenStreamCharsBase<Unit>;

  using LineToken = TokenStreamAnyChars::LineToken;

 private:
  Token* newTokenInternal(TokenKind kind, TokenStart start, TokenKind* out);

  /**
   * Create the Token covering the source range from |start| to the current
   * offset, give it kind |kind|, and store that kind in |*out|.  Creation is
   * delegated to |newTokenInternal|.
   *
   * |modifier| is recorded on the token in DEBUG builds only.
   */
  Token* newToken(TokenKind kind, TokenStart start,
                  TokenStreamShared::Modifier modifier, TokenKind* out) {
    Token* const created = newTokenInternal(kind, start, out);

#ifdef DEBUG
    // Remember which modifier produced this token.  Should the token be
    // ungotten via ungetToken() and then gotten (or peeked, etc.) again, the
    // two gets can be asserted to have used compatible modifiers.
    created->modifier = modifier;
#endif

    return created;
  }

  uint32_t matchUnicodeEscape(uint32_t* codePoint);
  uint32_t matchExtendedUnicodeEscape(uint32_t* codePoint);

 protected:
  using CharsBase::addLineOfContext;
  using CharsBase::fillCharBufferFromSourceNormalizingAsciiLineBreaks;
  using CharsBase::matchCodeUnit;
  using CharsBase::matchLineTerminator;
  using TokenStreamCharsShared::drainCharBufferIntoAtom;
  using TokenStreamCharsShared::isAsciiCodePoint;
  // Deliberately don't |using CharsBase::sourceUnits| because of bug 1472569.
  // :-(
  using CharsBase::toUnit;

  using typename CharsBase::SourceUnits;

 protected:
  using SpecializedCharsBase::SpecializedCharsBase;

  // Return the TokenStreamAnyChars that goes with this object.  How to reach
  // it from |this| is decided by the |AnyCharsAccess| template parameter,
  // through its |anyChars| function.
  TokenStreamAnyChars& anyCharsAccess() {
    return AnyCharsAccess::anyChars(this);
  }

  // Const overload: the same lookup, yielding a const reference.
  const TokenStreamAnyChars& anyCharsAccess() const {
    return AnyCharsAccess::anyChars(this);
  }

  using TokenStreamSpecific =
      frontend::TokenStreamSpecific<Unit, AnyCharsAccess>;

  /**
   * Downcast |this| to the |TokenStreamSpecific| instantiation sharing this
   * class's |Unit| and |AnyCharsAccess|.  The inheritance relationship the
   * cast relies on is verified at compile time.
   */
  TokenStreamSpecific* asSpecific() {
    static_assert(
        std::is_base_of_v<GeneralTokenStreamChars, TokenStreamSpecific>,
        "static_cast below presumes an inheritance relationship");

    auto* const specific = static_cast<TokenStreamSpecific*>(this);
    return specific;
  }

 protected:
  /**
   * Compute the column number in Unicode code points of the absolute |offset|
   * within source text on the line corresponding to |lineToken|.
   *
   * |offset| must be a code point boundary, preceded only by validly-encoded
   * source units.  (It doesn't have to be *followed* by valid source units.)
   */
  uint32_t computeColumn(LineToken lineToken, uint32_t offset) const;
  void computeLineAndColumn(uint32_t offset, uint32_t* line,
                            uint32_t* column) const;

  /**
   * Populate every part of |err| other than its line-of-context information,
   * for an error located at |offset|.
   *
   * The return value tells the caller whether a line of context can be
   * computed from the token stream: true if so, false if not.
   */
  MOZ_MUST_USE bool fillExceptingContext(ErrorMetadata* err, uint32_t offset) {
    // Let TokenStreamAnyChars do its part first; if it reports that no
    // context is available, line and column are left untouched here.
    if (!anyCharsAccess().fillExceptingContext(err, offset)) {
      return false;
    }

    computeLineAndColumn(offset, &err->lineNumber, &err->columnNumber);
    return true;
  }

  /**
   * Create a token of kind |kind| starting at |start|, storing the kind in
   * |*out|.  All arguments are forwarded unchanged to |newToken|; nothing
   * further is set on the resulting Token, whose pointer is discarded.
   */
  void newSimpleToken(TokenKind kind, TokenStart start,
                      TokenStreamShared::Modifier modifier, TokenKind* out) {
    newToken(kind, start, modifier, out);
  }

  /**
   * Create a TokenKind::Number token starting at |start|, storing the kind in
   * |*out|, and attach the numeric value |dval| together with |decimalPoint|
   * to it.
   */
  void newNumberToken(double dval, DecimalPoint decimalPoint, TokenStart start,
                      TokenStreamShared::Modifier modifier, TokenKind* out) {
    newToken(TokenKind::Number, start, modifier, out)
        ->setNumber(dval, decimalPoint);
  }

  /**
   * Create a TokenKind::BigInt token starting at |start|, storing the kind in
   * |*out|.  No value is attached to the Token by this function.
   */
  void newBigIntToken(TokenStart start, TokenStreamShared::Modifier modifier,
                      TokenKind* out) {
    newToken(TokenKind::BigInt, start, modifier, out);
  }

  /**
   * Create a token of kind |kind| starting at |start|, storing the kind in
   * |*out|, and attach |atom| to it.
   *
   * |kind| must be String, TemplateHead, or NoSubsTemplate.
   */
  void newAtomToken(TokenKind kind, JSAtom* atom, TokenStart start,
                    TokenStreamShared::Modifier modifier, TokenKind* out) {
    MOZ_ASSERT(kind == TokenKind::String || kind == TokenKind::TemplateHead ||
               kind == TokenKind::NoSubsTemplate);

    newToken(kind, start, modifier, out)->setAtom(atom);
  }

  /**
   * Create a TokenKind::Name token starting at |start|, storing the kind in
   * |*out|, and attach |name| to it.
   */
  void newNameToken(PropertyName* name, TokenStart start,
                    TokenStreamShared::Modifier modifier, TokenKind* out) {
    newToken(TokenKind::Name, start, modifier, out)->setName(name);
  }

  /**
   * Create a TokenKind::PrivateName token starting at |start|, storing the
   * kind in |*out|, and attach |name| to it.
   */
  void newPrivateNameToken(PropertyName* name, TokenStart start,
                           TokenStreamShared::Modifier modifier,
                           TokenKind* out) {
    newToken(TokenKind::PrivateName, start, modifier, out)->setName(name);
  }

  /**
   * Create a TokenKind::RegExp token starting at |start|, storing the kind in
   * |*out|, and attach the flags |reflags| to it.
   *
   * No modifier parameter is taken: the token is always created with the
   * |SlashIsRegExp| modifier.
   */
  void newRegExpToken(JS::RegExpFlags reflags, TokenStart start,
                      TokenKind* out) {
    newToken(TokenKind::RegExp, start, TokenStreamShared::SlashIsRegExp, out)
        ->setRegExpFlags(reflags);
  }

  MOZ_COLD bool badToken();

  /**
   * Get the next code unit -- the next numeric sub-unit of source text,
   * possibly smaller than a full code point -- without updating line/column
   * counters or consuming LineTerminatorSequences.
   *
   * Because of these limitations, only use this if (a) the resulting code
   * unit is guaranteed to be ungotten (by ungetCodeUnit()) if it's an EOL,
   * and (b) the line-related state (lineno, linebase) is not used before
   * it's ungotten.
   */
  int32_t getCodeUnit() {
    if (MOZ_LIKELY(!this->sourceUnits.atEnd())) {
      return CodeUnitValue(this->sourceUnits.getCodeUnit());
    }

    anyCharsAccess().flags.isEOF = true;
    return EOF;
  }

  /**
   * Put back |c| by delegating to |CharsBase::ungetCodeUnit|.
   *
   * |c| may be EOF only when the stream's |isEOF| flag is already set (this
   * is asserted) -- the state |getCodeUnit()| leaves behind when it returns
   * EOF.
   */
  void ungetCodeUnit(int32_t c) {
    MOZ_ASSERT_IF(c == EOF, anyCharsAccess().flags.isEOF);

    CharsBase::ungetCodeUnit(c);
  }

  /**
   * |lead| is an ASCII code unit/point that was just consumed.  Complete it
   * into either a full code point or a LineTerminatorSequence (reported as
   * '\n') and store the result in |*codePoint|.
   *
   * Returns true on success.  Returns false on failure, in which case
   * |*codePoint| holds no meaningful value.
   *
   * Line/column info is updated whenever a LineTerminatorSequence was
   * consumed.  The current |sourceUnits| offset may change.
   */
  MOZ_MUST_USE bool getFullAsciiCodePoint(int32_t lead, int32_t* codePoint) {
    MOZ_ASSERT(isAsciiCodePoint(lead),
               "non-ASCII code units must be handled separately");
    MOZ_ASSERT(toUnit(lead) == this->sourceUnits.previousCodeUnit(),
               "getFullAsciiCodePoint called incorrectly");

    // Common case: |lead| doesn't begin a line break and is the answer.
    const bool startsLineBreak = lead == '\r' || lead == '\n';
    if (MOZ_LIKELY(!startsLineBreak)) {
      *codePoint = lead;
      return true;
    }

    // After '\r', give |matchLineTerminator| the chance to match a following
    // '\n' as part of the same LineTerminatorSequence.
    if (lead == '\r') {
      matchLineTerminator('\n');
    }

    *codePoint = '\n';
    if (updateLineInfoForEOL()) {
      return true;
    }

    // Failure: make sure nobody relies on the value stored above.
#ifdef DEBUG
    *codePoint = EOF;  // sentinel value to hopefully cause errors
#endif
    MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
    return false;
  }

  /**
   * Pass the current |sourceUnits| offset to
   * |TokenStreamAnyChars::internalUpdateLineInfoForEOL| and return its
   * result.  |getFullAsciiCodePoint| calls this after handling a line break
   * and treats a false result as failure.
   */
  MOZ_MUST_USE MOZ_ALWAYS_INLINE bool updateLineInfoForEOL() {
    return anyCharsAccess().internalUpdateLineInfoForEOL(
        this->sourceUnits.offset());
  }

  uint32_t matchUnicodeEscapeIdStart(uint32_t* codePoint);
  bool matchUnicodeEscapeIdent(uint32_t* codePoint);
  bool matchIdentifierStart();

  /**
   * Given an |err| that is filled in apart from its line of context, add a
   * line of context for |offset| when that is possible.
   *
   * Returns true when no context could be added because the error is on a
   * different line; otherwise returns whatever |addLineOfContext| returns.
   *
   * This is a very internal helper that exists only to keep its callers
   * readable.  Almost certainly one of those callers is what you want.
   */
  MOZ_MUST_USE bool internalComputeLineOfContext(ErrorMetadata* err,
                                                 uint32_t offset) {
    // Line-start information is available for the current line only.  An
    // error on the current line can therefore be given context...
    if (err->lineNumber == anyCharsAccess().lineno) {
      return addLineOfContext(err, offset);
    }

    // ...whereas an error on any other line can't easily be.  (So an error
    // within a multi-line token, e.g. an unterminated multiline string
    // literal, ends up without context.)  That isn't a failure.
    return true;
  }

 public:
  /**
   * Consume any hashbang comment at the start of a Script or Module, if one is
   * present.  Stops consuming just before any terminating LineTerminator or
   * before an encoding error is encountered.
   */
  void consumeOptionalHashbangComment();

  /**
   * Atomize the raw source text of the current token, which must be a
   * TemplateHead or NoSubsTemplate, excluding its delimiters and with '\r'
   * and "\r\n" normalized to '\n'.
   *
   * Returns nullptr if filling the character buffer fails; otherwise returns
   * the result of |drainCharBufferIntoAtom()|.
   */
  JSAtom* getRawTemplateStringAtom() {
    const Token& token = anyCharsAccess().currentToken();
    const bool isTemplateHead = token.type == TokenKind::TemplateHead;

    MOZ_ASSERT(isTemplateHead || token.type == TokenKind::NoSubsTemplate);

    // Number of delimiter code units closing the token:
    //   TemplateHead is of the form     |`...${|   or   |}...${|   => 2
    //   NoSubsTemplate is of the form   |`...`|    or   |}...`|    => 1
    const int closingLength = isTemplateHead ? 2 : 1;

    // The opening delimiter is a single code unit in every form.
    const Unit* rawBegin = this->sourceUnits.codeUnitPtrAt(token.pos.begin + 1);
    const Unit* rawEnd =
        this->sourceUnits.codeUnitPtrAt(token.pos.end - closingLength);

    // |charBuffer| is expected to be empty already; clear it regardless, to
    // be defensive.
    MOZ_ASSERT(this->charBuffer.length() == 0);
    this->charBuffer.clear();

    // Only '\r' and "\r\n" are normalized (to '\n') in template literals;
    // Unicode separators need no special treatment.
    // https://tc39.github.io/ecma262/#sec-static-semantics-tv-and-trv
    const bool filled =
        fillCharBufferFromSourceNormalizingAsciiLineBreaks(rawBegin, rawEnd);
    if (!filled) {
      return nullptr;
    }

    return drainCharBufferIntoAtom();
  }
};

template <typename Unit, class AnyCharsAccess>
class TokenStreamChars;

template <class AnyCharsAccess>
class TokenStreamChars<char16_t, AnyCharsAccess>
    : public GeneralTokenStreamChars<char16_t, AnyCharsAccess> {
  using CharsBase = TokenStreamCharsBase<char16_t>;
  using SpecializedCharsBase = SpecializedTokenStreamCharsBase<char16_t>;
  using GeneralCharsBase = GeneralTokenStreamChars<char16_t, AnyCharsAccess>;
  using Self = TokenStreamChars<char16_t, AnyCharsAccess>;

  using GeneralCharsBase::asSpecific;

  using typename GeneralCharsBase::TokenStreamSpecific;

 protected:
  using CharsBase::matchLineTerminator;
  using GeneralCharsBase::anyCharsAccess;
  using GeneralCharsBase::getCodeUnit;
  using SpecializedCharsBase::infallibleGetNonAsciiCodePointDontNormalize;
  using TokenStreamCharsShared::isAsciiCodePoint;
  // Deliberately don't |using| |sourceUnits| because of bug 1472569.  :-(
  using GeneralCharsBase::ungetCodeUnit;
  using GeneralCharsBase::updateLineInfoForEOL;

 protected:
  using GeneralCharsBase::GeneralCharsBase;

  /**
   * |lead| is a non-ASCII code unit that was just consumed.  Consume whatever
   * else belongs to its code point and store the complete code point in
   * |*codePoint|.  No line/column update happens, and line breaks are stored
   * as-is, without normalization.
   *
   * Always returns true: 16-bit JS has no encoding errors.  Being defined
   * inline here lets the compiler see that as well.
   */
  MOZ_MUST_USE bool getNonAsciiCodePointDontNormalize(char16_t lead,
                                                      char32_t* codePoint) {
    const char32_t decoded = infallibleGetNonAsciiCodePointDontNormalize(lead);
    *codePoint = decoded;
    return true;
  }

  /**
   * Given a just-consumed non-ASCII code unit |lead| (which may also be a
   * full code point, for UTF-16), consume a full code point or
   * LineTerminatorSequence (normalizing it to '\n') and store it in
   * |*codePoint|.  Return true on success, otherwise return false and leave
   * |*codePoint| undefined on failure.
   *
   * If a LineTerminatorSequence was consumed, also update line/column info.
   *
   * This may change the current |sourceUnits| offset.
   */
  MOZ_MUST_USE bool getNonAsciiCodePoint(int32_t lead, int32_t* codePoint);
};

template <class AnyCharsAccess>
class TokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess>
    : public GeneralTokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess> {
  using CharsBase = TokenStreamCharsBase<mozilla::Utf8Unit>;
  using SpecializedCharsBase =
      SpecializedTokenStreamCharsBase<mozilla::Utf8Unit>;
  using GeneralCharsBase =
      GeneralTokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess>;
  using Self = TokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess>;

  using typename SpecializedCharsBase::SourceUnitsEnd;
  using typename SpecializedCharsBase::SourceUnitsIterator;

 protected:
  using GeneralCharsBase::anyCharsAccess;
  using GeneralCharsBase::computeLineAndColumn;
  using GeneralCharsBase::fillExceptingContext;
  using GeneralCharsBase::internalComputeLineOfContext;
  using TokenStreamCharsShared::isAsciiCodePoint;
  // Deliberately don't |using| |sourceUnits| because of bug 1472569.  :-(
  using GeneralCharsBase::updateLineInfoForEOL;

 private:
  /**
   * Return the uppercase hexadecimal digit ('0'-'9', 'A'-'F') for |nibble|,
   * which must be less than 16.
   */
  static char toHexChar(uint8_t nibble) {
    MOZ_ASSERT(nibble < 16);

    static const char upperHexDigits[] = "0123456789ABCDEF";
    return upperHexDigits[nibble];
  }

  /**
   * Write the four characters "0xHH" -- with HH being |n| as two uppercase
   * hexadecimal digits -- to |str[0..3]|.  No terminating NUL is written.
   */
  static void byteToString(uint8_t n, char* str) {
    const uint8_t highNibble = n >> 4;
    const uint8_t lowNibble = n & 0xF;

    str[0] = '0';
    str[1] = 'x';
    str[2] = toHexChar(highNibble);
    str[3] = toHexChar(lowNibble);
  }

  /**
   * Like |byteToString|, but additionally NUL-terminates the result: writes
   * |str[0..3]| via |byteToString| and then '\0' to |str[4]|.  |str| must
   * therefore have room for at least five chars.
   */
  static void byteToTerminatedString(uint8_t n, char* str) {
    byteToString(n, str);
    str[4] = '\0';
  }

  /**
   * Report a UTF-8 encoding-related error for a code point starting AT THE
   * CURRENT OFFSET.
   *
   * |relevantUnits| indicates how many code units from the current offset
   * are potentially relevant to the reported error, such that they may be
   * included in the error message.  For example, if at the current offset we
   * have
   *
   *   0b1111'1111 ...
   *
   * a code unit never allowed in UTF-8, then |relevantUnits| might be 1
   * because only that unit is relevant.  Or if we have
   *
   *   0b1111'0111 0b1011'0101 0b0000'0000 ...
   *
   * where the first two code units are a valid prefix to a four-unit code
   * point but the third unit *isn't* a valid trailing code unit, then
   * |relevantUnits| might be 3.
   */
  MOZ_COLD void internalEncodingError(uint8_t relevantUnits,
                                      unsigned errorNumber, ...);

  // Don't use |internalEncodingError|!  Use one of the elaborated functions
  // that calls it, below -- all of which should be used to indicate an error
  // in a code point starting AT THE CURRENT OFFSET as with
  // |internalEncodingError|.

  /** Report an error for an invalid lead code unit |lead|. */
  MOZ_COLD void badLeadUnit(mozilla::Utf8Unit lead);

  /**
   * Report an error when there aren't enough code units remaining to
   * constitute a full code point after |lead|: only |remaining| code units
   * were available for a code point starting with |lead|, when at least
   * |required| code units were required.
   */
  MOZ_COLD void notEnoughUnits(mozilla::Utf8Unit lead, uint8_t remaining,
                               uint8_t required);

  /**
   * Report an error for a bad trailing UTF-8 code unit, where the bad
   * trailing unit was the last of |unitsObserved| units examined from the
   * current offset.
   */
  MOZ_COLD void badTrailingUnit(uint8_t unitsObserved);

  // Helper used for both |badCodePoint| and |notShortestForm| for code units
  // that have all the requisite high bits set/unset in a manner that *could*
  // encode a valid code point, but the remaining bits encoding its actual
  // value do not define a permitted value.
  MOZ_COLD void badStructurallyValidCodePoint(uint32_t codePoint,
                                              uint8_t codePointLength,
                                              const char* reason);

  /**
   * Report an error for UTF-8 whose encoded value |codePoint| is either a
   * UTF-16 surrogate or a number beyond the Unicode range.
   * |codePointLength| is passed through to |badStructurallyValidCodePoint|
   * along with the reason.
   */
  MOZ_COLD void badCodePoint(uint32_t codePoint, uint8_t codePointLength) {
    const bool isSurrogate = unicode::IsSurrogate(codePoint);
    MOZ_ASSERT(isSurrogate || codePoint > unicode::NonBMPMax);

    const char* const reason = isSurrogate
                                   ? "it's a UTF-16 surrogate"
                                   : "the maximum code point is U+10FFFF";
    badStructurallyValidCodePoint(codePoint, codePointLength, reason);
  }

  /**
   * Report an error for UTF-8 that encodes |codePoint| in other than its
   * shortest form.  |codePoint| must not be a surrogate and must not exceed
   * |unicode::NonBMPMax|.
   */
  MOZ_COLD void notShortestForm(uint32_t codePoint, uint8_t codePointLength) {
    MOZ_ASSERT(!unicode::IsSurrogate(codePoint));
    MOZ_ASSERT(codePoint <= unicode::NonBMPMax);

    const char* const reason = "it wasn't encoded in shortest possible form";
    badStructurallyValidCodePoint(codePoint, codePointLength, reason);
  }

 protected:
  using GeneralCharsBase::GeneralCharsBase;

  /**
   * Given the non-ASCII |lead| code unit just consumed, consume the rest of
   * a non-ASCII code point.  The code point is not normalized: on success
   * |*codePoint| may be U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR.
   *
   * Report an error if an invalid code point is encountered.
   */
  MOZ_MUST_USE bool getNonAsciiCodePointDontNormalize(mozilla::Utf8Unit lead,
                                                      char32_t* codePoint);

  /**
   * Given a just-consumed non-ASCII code unit |lead|, consume a full code
   * point or LineTerminatorSequence (normalizing it to '\n') and store it in
   * |*codePoint|.  Return true on success, otherwise return false and leave
   * |*codePoint| undefined on failure.
   *
   * If a LineTerminatorSequence was consumed, also update line/column info.
   *
   * This function will change the current |sourceUnits| offset.
   */
  MOZ_MUST_USE bool getNonAsciiCodePoint(int32_t lead, int32_t* codePoint);
};

// TokenStream is the lexical scanner for JavaScript source text.
//
// It takes a buffer of Unit code units (either char16_t encoding UTF-16 or
// mozilla::Utf8Unit encoding UTF-8) and linearly scans it into |Token|s.
//
// Internally the class uses a four element circular buffer |tokens| of
// |Token|s. As an index for |tokens|, the member |cursor_| points to the
// current token. Calls to getToken() increase |cursor_| by one and return the
// new current token. If a TokenStream was just created, the current token is
// uninitialized. It's therefore important that one of the first four member
// functions listed below is called first. The circular buffer lets us go back
// up to two tokens from the last scanned token. Internally, the relative
// number of backward steps that were taken (via ungetToken()) after the last
// token was scanned is stored in |lookahead|.
//
// The following table lists in which situations it is safe to call each listed
// function. No checks are made by the functions in non-debug builds.
//
// Function Name     | Precondition; changes to |lookahead|
// ------------------+---------------------------------------------------------
// getToken          | none; if |lookahead > 0| then |lookahead--|
// peekToken         | none; if |lookahead == 0| then |lookahead == 1|
// peekTokenSameLine | none; if |lookahead == 0| then |lookahead == 1|
// matchToken        | none; if |lookahead > 0| and the match succeeds then
//                   |       |lookahead--|
// consumeKnownToken | none; if |lookahead > 0| then |lookahead--|
// ungetToken        | 0 <= |lookahead| <= |maxLookahead - 1|; |lookahead++|
//
// The behavior of the token scanning process (see getTokenInternal()) can be
// modified by calling one of the first four above listed member functions with
// an optional argument of type Modifier.  However, the modifier will be
// ignored unless |lookahead == 0| holds.  Due to constraints of the grammar,
// this turns out not to be a problem in practice.  See the
// mozilla.dev.tech.js-engine.internals thread entitled 'Bug in the scanner?'
// for more details:
// https://groups.google.com/forum/?fromgroups=#!topic/mozilla.dev.tech.js-engine.internals/2JLH5jRcr7E
//
// The method seek() allows rescanning from a previously visited location of
// the buffer, initially computed by constructing a Position local variable.
//
template <typename Unit, class AnyCharsAccess>
class MOZ_STACK_CLASS TokenStreamSpecific
    : public TokenStreamChars<Unit, AnyCharsAccess>,
      public TokenStreamShared,
      public ErrorReporter {
 public:
  using CharsBase = TokenStreamCharsBase<Unit>;
  using SpecializedCharsBase = SpecializedTokenStreamCharsBase<Unit>;
  using GeneralCharsBase = GeneralTokenStreamChars<Unit, AnyCharsAccess>;
  using SpecializedChars = TokenStreamChars<Unit, AnyCharsAccess>;

  using Position = TokenStreamPosition<Unit>;

  // Anything inherited through a base class whose type depends upon this
  // class's template parameters can only be accessed through a dependent
  // name: prefixed with |this|, by explicit qualification, and so on.  (This
  // is so that references to inherited fields are statically distinguishable
  // from references to names outside of the class.)  This is tedious and
  // onerous.
  //
  // As an alternative, we directly add every one of these functions to this
  // class, using explicit qualification to address the dependent-name
  // problem.  |this| or other qualification is no longer necessary -- at
  // cost of this ever-changing laundry list of |using|s.  So it goes.
 public:
  using GeneralCharsBase::anyCharsAccess;
  using GeneralCharsBase::computeLineAndColumn;
  using TokenStreamCharsShared::adoptState;

 private:
  using typename CharsBase::SourceUnits;

 private:
  using CharsBase::atomizeSourceChars;
  using GeneralCharsBase::badToken;
  using TokenStreamCharsShared::appendCodePointToCharBuffer;
  // Deliberately don't |using| |charBuffer| because of bug 1472569.  :-(
  using CharsBase::consumeKnownCodeUnit;
  using CharsBase::fillCharBufferFromSourceNormalizingAsciiLineBreaks;
  using CharsBase::matchCodeUnit;
  using CharsBase::matchLineTerminator;
  using CharsBase::peekCodeUnit;
  using GeneralCharsBase::computeColumn;
  using GeneralCharsBase::fillExceptingContext;
  using GeneralCharsBase::getCodeUnit;
  using GeneralCharsBase::getFullAsciiCodePoint;
  using GeneralCharsBase::internalComputeLineOfContext;
  using GeneralCharsBase::matchUnicodeEscapeIdent;
  using GeneralCharsBase::matchUnicodeEscapeIdStart;
  using GeneralCharsBase::newAtomToken;
  using GeneralCharsBase::newBigIntToken;
  using GeneralCharsBase::newNameToken;
  using GeneralCharsBase::newNumberToken;
  using GeneralCharsBase::newPrivateNameToken;
  using GeneralCharsBase::newRegExpToken;
  using GeneralCharsBase::newSimpleToken;
  using SpecializedChars::getNonAsciiCodePoint;
  using SpecializedChars::getNonAsciiCodePointDontNormalize;
  using TokenStreamCharsShared::copyCharBufferTo;
  using TokenStreamCharsShared::drainCharBufferIntoAtom;
  using TokenStreamCharsShared::isAsciiCodePoint;
  // Deliberately don't |using| |sourceUnits| because of bug 1472569.  :-(
  using CharsBase::toUnit;
  using GeneralCharsBase::ungetCodeUnit;
  using GeneralCharsBase::updateLineInfoForEOL;

  template <typename CharU>
  friend class TokenStreamPosition;

 public:
  TokenStreamSpecific(JSContext* cx, CompilationInfo* compilationInfo,
                      const JS::ReadOnlyCompileOptions& options,
                      const Unit* units, size_t length);

  /**
   * Get the next code point, converting LineTerminatorSequences to '\n' and
   * updating internal line-counter state if needed.  Return true on success
   * and store the code point in |*cp|.  Return false and leave |*cp|
   * undefined on failure.
   */
  MOZ_MUST_USE bool getCodePoint(int32_t* cp);

  // Return true if no invalid escape in a template has been recorded.
  // Otherwise report the recorded one, at its recorded offset, and return
  // false.
  bool checkForInvalidTemplateEscapeError() {
    auto& anyChars = anyCharsAccess();
    if (anyChars.invalidTemplateEscapeType != InvalidEscapeType::None) {
      reportInvalidEscapeError(anyChars.invalidTemplateEscapeOffset,
                               anyChars.invalidTemplateEscapeType);
      return false;
    }

    return true;
  }

 public:
  // Implement ErrorReporter.

  // Store in |*line| and |*column| the position of |offset|, as computed by
  // |computeLineAndColumn|.
  void lineAndColumnAt(size_t offset, uint32_t* line,
                       uint32_t* column) const final {
    computeLineAndColumn(offset, line, column);
  }

  // Store in |*line| and |*column| the position at which the current token
  // begins.
  void currentLineAndColumn(uint32_t* line, uint32_t* column) const final {
    const auto tokenBegin = anyCharsAccess().currentToken().pos.begin;
    computeLineAndColumn(tokenBegin, line, column);
  }

  // Forward to |srcCoords.isOnThisLine| on the associated TokenStreamAnyChars
  // and return its result.
  // NOTE(review): presumably |*onThisLine| receives whether |offset| lies on
  // line |lineNum| and the return value signals success -- confirm against
  // the |srcCoords| implementation.
  bool isOnThisLine(size_t offset, uint32_t lineNum,
                    bool* onThisLine) const final {
    return anyCharsAccess().srcCoords.isOnThisLine(offset, lineNum, onThisLine);
  }

  uint32_t lineAt(size_t offset) const final {
    const auto& anyChars = anyCharsAccess();
    auto lineToken = anyChars.lineToken(offset);
    return anyChars.lineNumber(lineToken);
  }

  // Return the column of |offset|, computed by |computeColumn| relative to
  // the line token that |offset| maps to.
  uint32_t columnAt(size_t offset) const final {
    const auto lineToken = anyCharsAccess().lineToken(offset);
    return computeColumn(lineToken, offset);
  }

  bool hasTokenizationStarted() const final;

  // Return the filename reported by the associated TokenStreamAnyChars.
  const char* getFilename() const final {
    return anyCharsAccess().getFilename();
  }

 private:
  // Implement ErrorReportMixin.

  // Return the |cx| member of the associated TokenStreamAnyChars.
  JSContext* getContext() const override { return anyCharsAccess().cx; }

  // Return the strict-mode state reported by the associated
  // TokenStreamAnyChars.
  MOZ_MUST_USE bool strictMode() const override {
    return anyCharsAccess().strictMode();
  }

 public:
  // Implement ErrorReportMixin.

  // Return the compile options held by the associated TokenStreamAnyChars.
  const JS::ReadOnlyCompileOptions& options() const final {
    return anyCharsAccess().options();
  }

  MOZ_MUST_USE bool computeErrorMetadata(
      ErrorMetadata* err, const ErrorOffset& errorOffset) override;

 private:
  // Report, at |offset|, the error corresponding to the invalid escape
  // |type|.  |type| must not be InvalidEscapeType::None.
  void reportInvalidEscapeError(uint32_t offset, InvalidEscapeType type) {
    switch (type) {
      case InvalidEscapeType::None:
        // Callers only get here once an invalid escape has been recorded.
        MOZ_ASSERT_UNREACHABLE("unexpected InvalidEscapeType");
        break;

      case InvalidEscapeType::Hexadecimal:
        errorAt(offset, JSMSG_MALFORMED_ESCAPE, "hexadecimal");
        break;

      case InvalidEscapeType::Unicode:
        errorAt(offset, JSMSG_MALFORMED_ESCAPE, "Unicode");
        break;

      case InvalidEscapeType::UnicodeOverflow:
        errorAt(offset, JSMSG_UNICODE_OVERFLOW, "escape sequence");
        break;

      case InvalidEscapeType::Octal:
        errorAt(offset, JSMSG_DEPRECATED_OCTAL);
        break;
    }
  }

  MOZ_MUST_USE bool putIdentInCharBuffer(const Unit* identStart);

  using IsIntegerUnit = bool (*)(int32_t);
  MOZ_MUST_USE MOZ_ALWAYS_INLINE bool matchInteger(IsIntegerUnit isIntegerUnit,
                                                   int32_t* nextUnit);
  MOZ_MUST_USE MOZ_ALWAYS_INLINE bool matchIntegerAfterFirstDigit(
      IsIntegerUnit isIntegerUnit, int32_t* nextUnit);

  /**
   * Tokenize a decimal number that begins at |numStart| into the provided
   * token.
   *
   * |unit| must be one of these values:
   *
   *   1. The first decimal digit in the integral part of a decimal number
   *      not starting with '0' or '.', e.g. '1' for "17", '3' for "3.14", or
   *      '8' for "8.675309e6".
   *
   *   In this case, the next |getCodeUnit()| must return the code unit after
   *   |unit| in the overall number.
   *
   *   2. The '.' in a "."/"0."-prefixed decimal number or the 'e'/'E' in a
   *      "0e"/"0E"-prefixed decimal number, e.g. ".17", "0.42", or "0.1e3".
   *
   *   In this case, the next |getCodeUnit()| must return the code unit
   *   *after* the first decimal digit *after* the '.'.  So the next code
   *   unit would be '7' in ".17", '2' in "0.42", 'e' in "0.4e+8", or '/' in
   *   "0.5/2" (three separate tokens).
   *
   *   3. The code unit after the '0' where "0" is the entire number token.
   *
   *   In this case, the next |getCodeUnit()| would return the code unit
   *   after |unit|, but this function will never perform such a call.
   *
   *   4. (Non-strict mode code only)  The first '8' or '9' in a "noctal"
   *      number that begins with a '0' but contains a non-octal digit in its
   *      integer part so is interpreted as decimal, e.g. '9' in "09.28" or
   *      '8' in "0386" or '9' in "09+7" (three separate tokens).
   *
   *   In this case, the next |getCodeUnit()| returns the code unit after
   *   |unit|: '.', '6', or '+' in the examples above.
   *
   * This interface is super-hairy and horribly stateful.  Unfortunately, its
   * hair merely reflects the intricacy of ECMAScript numeric literal syntax.
   * And incredibly, it *improves* on the goto-based horror that predated it.
   */
  MOZ_MUST_USE bool decimalNumber(int32_t unit, TokenStart start,
                                  const Unit* numStart, Modifier modifier,
                                  TokenKind* out);

  /** Tokenize a regular expression literal beginning at |start|. */
  MOZ_MUST_USE bool regexpLiteral(TokenStart start, TokenKind* out);

  /**
   * Slurp characters between |start| and sourceUnits.current() into
   * charBuffer, to later parse into a bigint.
   */
  MOZ_MUST_USE bool bigIntLiteral(TokenStart start, Modifier modifier,
                                  TokenKind* out);

 public:
  // Fetch the next token and make it the current one.  Returns false if the
  // token stream encountered an error; otherwise returns true and writes the
  // new token's kind to |*ttp|.
  MOZ_MUST_USE bool getToken(TokenKind* ttp, Modifier modifier = SlashIsDiv) {
    TokenStreamAnyChars& anyChars = anyCharsAccess();

    // Nothing is buffered in lookahead: tokenize fresh source text.
    if (anyChars.lookahead == 0) {
      return getTokenInternal(ttp, modifier);
    }

    // A token pushed back by mismatching lookahead is pending: consume it
    // again instead of re-tokenizing.
    MOZ_ASSERT(!anyChars.flags.hadError);
    anyChars.lookahead--;
    anyChars.advanceCursor();

    const Token& reused = anyChars.currentToken();
    MOZ_ASSERT(reused.type != TokenKind::Eol);
    verifyConsistentModifier(modifier, reused);
    *ttp = reused.type;
    return true;
  }

  // Report the kind of the next token in |*ttp| without consuming it.
  // Returns false if tokenizing the next token fails.
  MOZ_MUST_USE bool peekToken(TokenKind* ttp, Modifier modifier = SlashIsDiv) {
    TokenStreamAnyChars& anyChars = anyCharsAccess();

    // A token is already pending in lookahead: just report its kind.
    if (anyChars.lookahead > 0) {
      MOZ_ASSERT(!anyChars.flags.hadError);
      const Token& pending = anyChars.nextToken();
      verifyConsistentModifier(modifier, pending);
      *ttp = pending.type;
      return true;
    }

    // Otherwise scan one token, then push it straight back so that it stays
    // pending for the next getToken()/peekToken().
    const bool scanned = getTokenInternal(ttp, modifier);
    if (!scanned) {
      return false;
    }
    anyChars.ungetToken();
    return true;
  }

  // Store the source position of the next token in |*posp| without consuming
  // the token.  Returns false if the next token has to be tokenized and that
  // fails.
  MOZ_MUST_USE bool peekTokenPos(TokenPos* posp,
                                 Modifier modifier = SlashIsDiv) {
    TokenStreamAnyChars& anyChars = anyCharsAccess();

    if (anyChars.lookahead != 0) {
      // The next token is already buffered; only check that it was scanned
      // with a modifier consistent with the one requested now.
      MOZ_ASSERT(!anyChars.flags.hadError);
      verifyConsistentModifier(modifier, anyChars.nextToken());
    } else {
      // Scan one token and immediately push it back into lookahead.  Its kind
      // is not needed here.
      TokenKind ignoredKind;
      if (!getTokenInternal(&ignoredKind, modifier)) {
        return false;
      }
      anyChars.ungetToken();
      MOZ_ASSERT(anyChars.hasLookahead());
    }

    *posp = anyChars.nextToken().pos;
    return true;
  }

  // Store the start offset (|TokenPos::begin|) of the next token in |*offset|
  // without consuming the token.  Returns false, leaving |*offset| untouched,
  // if peekTokenPos() fails.
  MOZ_MUST_USE bool peekOffset(uint32_t* offset,
                               Modifier modifier = SlashIsDiv) {
    TokenPos nextPos;
    if (peekTokenPos(&nextPos, modifier)) {
      *offset = nextPos.begin;
      return true;
    }
    return false;
  }

  // Like peekToken(), with one difference: when an EOL lies between the end
  // of the current token and the start of the next token, this returns true
  // and stores Eol in |*ttp|.  No Eol token is actually created in that case
  // -- only the Eol TokenKind is reported -- so currentToken() shouldn't be
  // consulted.  (This is the only place Eol is produced.)
  MOZ_ALWAYS_INLINE MOZ_MUST_USE bool peekTokenSameLine(
      TokenKind* ttp, Modifier modifier = SlashIsDiv) {
    TokenStreamAnyChars& anyChars = anyCharsAccess();
    const Token& current = anyChars.currentToken();

    // Fast path.  When lookahead != 0 at least one token has been scanned
    // ahead, and |lineno| is the line on which the furthest-scanned token
    // ends.  If the current token ends on that same line, that is a stronger
    // condition than the one we are looking for, and Eol need not be
    // returned.
    if (anyChars.lookahead != 0) {
      bool endsOnScannedLine;
      if (!anyChars.srcCoords.isOnThisLine(current.pos.end, anyChars.lineno,
                                           &endsOnScannedLine)) {
        error(JSMSG_OUT_OF_MEMORY);
        return false;
      }

      if (endsOnScannedLine) {
        MOZ_ASSERT(!anyChars.flags.hadError);
        const Token& pending = anyChars.nextToken();
        verifyConsistentModifier(modifier, pending);
        *ttp = pending.type;
        return true;
      }
    }

    // Slow path.  The check above misses two cases in which Eol must not be
    // returned:
    // - The next token starts on the same line, but is a multi-line token.
    // - The next token starts on the same line, but lookahead==2 and there
    //   is a newline between the next token and the one after that.
    // The test below is somewhat more expensive but gets these cases (and all
    // others) right.
    TokenKind scannedKind;
    if (!getToken(&scannedKind, modifier)) {
      return false;
    }

    // Grab the just-consumed token *before* pushing it back into lookahead.
    const Token& next = anyChars.currentToken();
    anyChars.ungetToken();

    // Careful: |next| now refers to an initialized-but-not-allocated Token!
    // This is safe because token data is only read, never modified, below.

    auto currentEndLine = anyChars.lineToken(current.pos.end);
    auto nextBeginLine = anyChars.lineToken(next.pos.begin);

    if (currentEndLine.isSameLine(nextBeginLine)) {
      *ttp = next.type;
    } else {
      *ttp = TokenKind::Eol;
    }
    return true;
  }

  // Consume the next token only if its kind is |tt|.  On success (return
  // value true), |*matchedp| says whether the token was consumed; when it
  // wasn't, the token is pushed back so it is still pending.  Returns false
  // if getting the token fails.
  MOZ_MUST_USE bool matchToken(bool* matchedp, TokenKind tt,
                               Modifier modifier = SlashIsDiv) {
    TokenKind actual;
    if (!getToken(&actual, modifier)) {
      return false;
    }

    const bool isMatch = actual == tt;
    if (!isMatch) {
      // Wrong kind: return the token to lookahead.
      anyCharsAccess().ungetToken();
    }
    *matchedp = isMatch;
    return true;
  }

  // Consume a token whose kind is already known to be |tt|.  The token must
  // already be in lookahead (asserted), so matching it is expected both to
  // succeed and to report a match.
  void consumeKnownToken(TokenKind tt, Modifier modifier = SlashIsDiv) {
    MOZ_ASSERT(anyCharsAccess().hasLookahead());

    bool consumed;
    MOZ_ALWAYS_TRUE(matchToken(&consumed, tt, modifier));
    MOZ_ALWAYS_TRUE(consumed);
  }

  // Peek at the next token and report in |*endsExpr| whether its kind is
  // flagged in the |isExprEnding| table.  Returns false if peeking fails.
  MOZ_MUST_USE bool nextTokenEndsExpr(bool* endsExpr) {
    TokenKind upcoming;
    if (!peekToken(&upcoming)) {
      return false;
    }

    TokenStreamAnyChars& anyChars = anyCharsAccess();
    const bool ends = anyChars.isExprEnding[static_cast<size_t>(upcoming)];
    *endsExpr = ends;

    if (ends) {
      // When the next token ends an overall Expression, that Expression gets
      // parsed without Parser::orExpr() ever being invoked.  We still need
      // that function's DEBUG-only side effect of marking this token as safe
      // to get with SlashIsRegExp, so perform it manually here.
      anyChars.allowGettingNextTokenWithSlashIsRegExp();
    }
    return true;
  }

  MOZ_MUST_USE bool advance(size_t position);

  void seekTo(const Position& pos);
  MOZ_MUST_USE bool seekTo(const Position& pos,
                           const TokenStreamAnyChars& other);

  // Seek back to |pos|.  Asserts that |pos.buf| does not lie beyond the
  // address of the next code unit to be read, i.e. that this really is a
  // backward (or no-op) move.
  void rewind(const Position& pos) {
    MOZ_ASSERT(this->sourceUnits.addressOfNextCodeUnit() >= pos.buf,
               "should be rewinding here");
    seekTo(pos);
  }

  // Seek back to |pos| via the two-argument seekTo(), passing |other| along,
  // and return that call's result.  Asserts that |pos.buf| does not lie
  // beyond the address of the next code unit to be read.
  MOZ_MUST_USE bool rewind(const Position& pos,
                           const TokenStreamAnyChars& other) {
    MOZ_ASSERT(this->sourceUnits.addressOfNextCodeUnit() >= pos.buf,
               "should be rewinding here");
    const bool sought = seekTo(pos, other);
    return sought;
  }

  // Seek ahead to |pos|.  Asserts that |pos.buf| is not before the address of
  // the next code unit to be read, i.e. that this really is a forward (or
  // no-op) move.
  void fastForward(const Position& pos) {
    MOZ_ASSERT(pos.buf >= this->sourceUnits.addressOfNextCodeUnit(),
               "should be moving forward here");
    seekTo(pos);
  }

  // Seek ahead to |pos| via the two-argument seekTo(), passing |other| along,
  // and return that call's result.  Asserts that |pos.buf| is not before the
  // address of the next code unit to be read.
  MOZ_MUST_USE bool fastForward(const Position& pos,
                                const TokenStreamAnyChars& other) {
    MOZ_ASSERT(pos.buf >= this->sourceUnits.addressOfNextCodeUnit(),
               "should be moving forward here");
    const bool sought = seekTo(pos, other);
    return sought;
  }

  // Return the pointer that |sourceUnits.codeUnitPtrAt()| yields for
  // |offset|.
  const Unit* codeUnitPtrAt(size_t offset) const {
    const Unit* unitPtr = this->sourceUnits.codeUnitPtrAt(offset);
    return unitPtr;
  }

  // Return the limit pointer reported by |sourceUnits.limit()|.
  const Unit* rawLimit() const {
    const Unit* limitPtr = this->sourceUnits.limit();
    return limitPtr;
  }

  MOZ_MUST_USE bool identifierName(TokenStart start, const Unit* identStart,
                                   IdentifierEscapes escaping,
                                   Modifier modifier, NameVisibility visibility,
                                   TokenKind* out);

  MOZ_MUST_USE bool matchIdentifierStart(IdentifierEscapes* sawEscape);

  MOZ_MUST_USE bool getTokenInternal(TokenKind* const ttp,
                                     const Modifier modifier);

  MOZ_MUST_USE bool getStringOrTemplateToken(char untilChar, Modifier modifier,
                                             TokenKind* out);

  // Tokenize a TemplateMiddle or TemplateTail -- one of the string-like parts
  // of a template string -- once the leading `RightCurly` has already been
  // consumed.  (Per the spec the `}` is the first character of the
  // TemplateMiddle/TemplateTail, but treating it as its own token is much
  // easier to implement in both TokenStream and the parser.)
  //
  // Like `getToken()`, this consumes a token and sets the current token.  No
  // Modifier parameter is needed because neither a division operator nor a
  // RegExp literal can be encountered here.
  //
  // On success, `*ttp` is one of:
  //
  //   * `TokenKind::TemplateHead`, if a TemplateMiddle token was read;
  //   * `TokenKind::NoSubsTemplate`, if a TemplateTail token was read.
  //
  // That may look odd: the spec has four template token types but only two
  // are used here.  `TemplateHead` doubles for TemplateMiddle because both
  // end with `...${`, and `NoSubsTemplate` doubles for TemplateTail because
  // both contain the end of the template, including the closing quote mark.
  // Neither the parser nor the tokenizer treats them differently.
  MOZ_MUST_USE bool getTemplateToken(TokenKind* ttp) {
    MOZ_ASSERT(anyCharsAccess().currentToken().type == TokenKind::RightCurly);

    // A template part runs until the closing backtick.
    const char templateDelimiter = '`';
    return getStringOrTemplateToken(templateDelimiter, SlashIsInvalid, ttp);
  }

  MOZ_MUST_USE bool getDirectives(bool isMultiline, bool shouldWarnDeprecated);
  MOZ_MUST_USE bool getDirective(
      bool isMultiline, bool shouldWarnDeprecated, const char* directive,
      uint8_t directiveLength, const char* errorMsgPragma,
      UniquePtr<char16_t[], JS::FreePolicy>* destination);
  MOZ_MUST_USE bool getDisplayURL(bool isMultiline, bool shouldWarnDeprecated);
  MOZ_MUST_USE bool getSourceMappingURL(bool isMultiline,
                                        bool shouldWarnDeprecated);
};

// It's preferable to define this in TokenStream.cpp, but its template-ness
// means we'd then have to *instantiate* this constructor for all possible
// (Unit, AnyCharsAccess) pairs -- and that gets super-messy as AnyCharsAccess
// *itself* is templated.  This symbol really isn't that huge compared to some
// defined inline in TokenStreamSpecific, so just rely on the linker commoning
// stuff up.
//
// Snapshot the state of |tokenStream|: the address of the next code unit, the
// stream flags, line bookkeeping (|lineno|, |linebase|, |prevLinebase|), the
// current token, and every token currently held in lookahead.
//
// |keepAtoms| is not read by this constructor.
// NOTE(review): presumably it exists so callers must hold an AutoKeepAtoms
// while a position is live -- confirm against AutoKeepAtoms' documentation.
template <typename Unit>
template <class AnyCharsAccess>
inline TokenStreamPosition<Unit>::TokenStreamPosition(
    AutoKeepAtoms& keepAtoms,
    TokenStreamSpecific<Unit, AnyCharsAccess>& tokenStream)
    : currentToken(tokenStream.anyCharsAccess().currentToken()) {
  TokenStreamAnyChars& anyChars = tokenStream.anyCharsAccess();

  // The stream may be positioned at poisoned source units here, so ask for
  // the raw address without tripping the poison check.
  buf =
      tokenStream.sourceUnits.addressOfNextCodeUnit(/* allowPoisoned = */ true);
  flags = anyChars.flags;
  lineno = anyChars.lineno;
  linebase = anyChars.linebase;
  prevLinebase = anyChars.prevLinebase;
  lookahead = anyChars.lookahead;

  // |currentToken| was already copied from |anyChars.currentToken()| by the
  // member initializer above; copying it a second time here would be
  // redundant.

  // Copy the pending lookahead tokens, nearest first.
  for (unsigned i = 0; i < anyChars.lookahead; i++) {
    lookaheadTokens[i] = anyChars.tokens[anyChars.aheadCursor(1 + i)];
  }
}

// Access policy passed as the |AnyCharsAccess| template argument of
// TokenStreamSpecific by TokenStream.  Given a pointer to the
// TokenStreamSpecific part of a token stream object, |anyChars| returns the
// TokenStreamAnyChars part of that same object.
//
// Only declarations appear here: the definitions need the complete
// TokenStream type, so they follow the TokenStream class definition.
class TokenStreamAnyCharsAccess {
 public:
  // Mutable access to the TokenStreamAnyChars associated with |tss|.
  template <class TokenStreamSpecific>
  static inline TokenStreamAnyChars& anyChars(TokenStreamSpecific* tss);

  // Read-only access to the TokenStreamAnyChars associated with |tss|.
  template <class TokenStreamSpecific>
  static inline const TokenStreamAnyChars& anyChars(
      const TokenStreamSpecific* tss);
};

// A complete token stream over char16_t source text.  It combines the
// character-type-independent state (TokenStreamAnyChars) and the
// char16_t-specific tokenizer (TokenStreamSpecific) as two base classes of a
// single object; TokenStreamAnyCharsAccess relies on that layout to get from
// one base to the other.
class MOZ_STACK_CLASS TokenStream
    : public TokenStreamAnyChars,
      public TokenStreamSpecific<char16_t, TokenStreamAnyCharsAccess> {
  using Unit = char16_t;

 public:
  // Forward |cx|, |options| and |smg| to the TokenStreamAnyChars base, and
  // |cx|, |compilationInfo|, |options| and the source text (|units|,
  // |length|) to the TokenStreamSpecific base.
  TokenStream(JSContext* cx, CompilationInfo* compilationInfo,
              const JS::ReadOnlyCompileOptions& options, const Unit* units,
              size_t length, StrictModeGetter* smg)
      : TokenStreamAnyChars(cx, options, smg),
        TokenStreamSpecific<Unit, TokenStreamAnyCharsAccess>(
            cx, compilationInfo, options, units, length) {}
};

// A TokenStream constructed over no source text at all: it passes a null
// CompilationInfo, a null source pointer with length 0, and a null
// StrictModeGetter to the TokenStream constructor.
class MOZ_STACK_CLASS DummyTokenStream final : public TokenStream {
 public:
  DummyTokenStream(JSContext* cx, const JS::ReadOnlyCompileOptions& options)
      : TokenStream(cx, nullptr, options, nullptr, 0, nullptr) {}
};

// Mutable overload.  |tss| is taken to be the TokenStreamSpecific base of a
// TokenStream: cast down to the full TokenStream object, then across to its
// TokenStreamAnyChars base.
template <class TokenStreamSpecific>
/* static */ inline TokenStreamAnyChars& TokenStreamAnyCharsAccess::anyChars(
    TokenStreamSpecific* tss) {
  TokenStream* const stream = static_cast<TokenStream*>(tss);
  return static_cast<TokenStreamAnyChars&>(*stream);
}

// Const overload.  |tss| is taken to be the TokenStreamSpecific base of a
// TokenStream: cast down to the full TokenStream object, then across to its
// TokenStreamAnyChars base.
template <class TokenStreamSpecific>
/* static */ inline const TokenStreamAnyChars&
TokenStreamAnyCharsAccess::anyChars(const TokenStreamSpecific* tss) {
  const TokenStream* const stream = static_cast<const TokenStream*>(tss);
  return static_cast<const TokenStreamAnyChars&>(*stream);
}

extern const char* TokenKindToDesc(TokenKind tt);

}  // namespace frontend
}  // namespace js

extern JS_FRIEND_API int js_fgets(char* buf, int size, FILE* file);

#ifdef DEBUG
extern const char* TokenKindToString(js::frontend::TokenKind tt);
#endif

#endif /* frontend_TokenStream_h */