CodeSnip Old Code (SVN)

A repository for source code snippets, designed with Pascal in mind.

[r3413]: / branches / 3.x / Src / Hiliter.UPasLexer.pas Maximize Restore History
975 lines (903 with data), 33.3 kB

{
 * Hiliter.UPasLexer.pas
 *
 * Defines a class that analyses and tokenises Pascal source code.
 *
 * $Rev$
 * $Date$
 *
 * ***** BEGIN LICENSE BLOCK *****
 *
 * Version: MPL 1.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with the
 * License. You may obtain a copy of the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
 * the specific language governing rights and limitations under the License.
 *
 * The Original Code is Hiliter.UPasLexer.pas, formerly UHilitePasLexer.pas
 *
 * The Initial Developer of the Original Code is Peter Johnson
 * (http://www.delphidabbler.com/).
 *
 * Portions created by the Initial Developer are Copyright (C) 2005-2011 Peter
 * Johnson. All Rights Reserved.
 *
 * Contributor(s)
 *   NONE
 *
 * ***** END LICENSE BLOCK *****
}


unit Hiliter.UPasLexer;


interface


uses
  // Delphi
  SysUtils, Classes,
  // Project
  UTextStreamReader;


type

  {
  THilitePasToken:
    Tokens describing the different components of Pascal source code returned by
    the lexical analyser. Each value classifies the text of one token read by
    THilitePasLexer.NextToken.
  }
  THilitePasToken = (
    tkKeyword,        // Pascal keyword
    tkComment,        // comment including opening and closing symbols
    tkCompilerDir,    // compiler directive including enclosing comment symbols
    tkDirective,      // Pascal directive (Delphi 7)
    tkIdentifier,     // identifier: identifier that is not keyword or directive
    tkString,         // string literal including quotes
    tkChar,           // literal character "#" [Hex | Whole number ]
    tkNumber,         // integral whole number
    tkFloat,          // floating point number (may use 'E' notation)
    tkHex,            // hex number: "$" + { 0..9 | A..F }+
    tkSymbol,         // symbol (single or double character eg '=' and ':=')
    tkWhitespace,     // white space (spaces, tabs etc, excluding CR and LF)
    tkEOL,            // end of line (usually CRLF but CR and LF on own valid)
    tkEOF,            // end of file
    tkError           // error condition: shouldn't occur in valid Pascal code
  );

  {
  THilitePasLexer:
    Class that analyses and tokenises Pascal code. Call NextToken repeatedly:
    each call reads one token from the input, returns its kind and makes its
    text available through the TokenStr property.
  }
  THilitePasLexer = class(TObject)
  strict private
    fTokenStr: string;            // Text of last token read from input
    fToken: THilitePasToken;      // Kind of last token read from input
    fCommentState: record         // Records state of comment being processed
      InComment: Boolean;           // whether currently processing comment
      CommentType: THilitePasToken; // indicates comment or compiler directive
      CommentCloser: string;        // closing comment symbol ( *), } or EOL )
    end;
    fReader: TTextStreamReader;   // Object that reads characters from input
    procedure UpdateTokenStr; overload;
      {Appends current character in input to token string. Ignores EOF.
      }
    procedure UpdateTokenStr(const Ch: Char); overload;
      {Appends a character to token string. Ignores EOF.
        @param Ch [in] Character to append.
      }
    function ParseChar: THilitePasToken;
      {Analyses a literal character (made from # followed by number) from input
      and stores in token string.
        @return Token indicating literal char (tkChar), or tkError if the
          character following # is neither '$' nor a digit.
      }
    function ParseCommentFromStart: THilitePasToken;
      {Begins parsing of a new comment or compiler directive.
        @return Token telling whether this is a comment or compiler directive
          (tkComment, tkCompilerDir).
      }
    function ParseCommentInterior: THilitePasToken;
      {Analyses body of comment after start or after resuming processing multi-
      line comments.
        @return Token telling whether this is a comment or compiler directive
          (tkComment, tkCompilerDir).
      }
    function ParseEOL: THilitePasToken;
      {Analyses end of line from input and stores in token string.
        @return End of line token (tkEOL).
      }
    function ParseHex: THilitePasToken;
      {Analyses a hexadecimal integer from input and stores in token string.
        @return Token indicating hexadecimal value (tkHex), or tkError if the
          digits are not followed by a separator.
      }
    function ParseIdent: THilitePasToken;
      {Analyses an alphanumeric identifier from input and stores in token
      string. Checks if identifier is keyword or directive.
        @return Token representing identifier: tkKeyword, tkDirective or
          tkIdentifier.
      }
    function ParseNumber: THilitePasToken;
      {Analyses a number from input and stores in token string. Number can be
      integer or real.
        @return Appropriate token for number (tkNumber or tkFloat), or tkError
          if an exponent is not followed by digits.
      }
    function ParseString: THilitePasToken;
      {Analyses a string literal from input and stores in token string.
        @return String token (tkString).
      }
    function ParseSymbol: THilitePasToken;
      {Determines whether the current symbol character on input represents a
      symbol or introduces some other syntactic entity (i.e. comment, string,
      character literal or a hex number). Analyses the input accordingly and
      stores the whole token in the token string.
        @return Token describing entity parsed.
      }
    function ParseUnknown: THilitePasToken;
      {Analyses an unrecognised entity from input and adds it to token string.
        @return Error token (tkError).
      }
    function ParseWhiteSpace: THilitePasToken;
      {Analyses a sequence of white space from input and appends each white
      space character read, unchanged, to token string.
        @return White space token (tkWhitespace).
      }
    function ParseWholeNumber: THilitePasToken;
      {Analyses a whole number from input and appends to token string.
        @return Whole number token (tkNumber).
      }
  public
    constructor Create(const Stm: TStream);
      {Constructor. Sets up object to analyse code on a stream.
        @param Stm [in] Stream containing Pascal source.
      }
    destructor Destroy; override;
      {Destructor. Tears down object.
      }
    function NextToken: THilitePasToken;
      {Gets and analyses next pascal token from input and stores details in
      token string.
        @return Token identifier for type of token read.
      }
    property TokenStr: string read fTokenStr;
      {Text that makes up the token last read from input}
    property Token: THilitePasToken read fToken;
      {Kind of token last read from input}
  end;


implementation


uses
  // Delphi
  Generics.Collections, Character,
  // Project
  UComparers, UConsts, UUtils;


const

  // Character constants
  cDecimalPoint     = '.';    // separates whole and fractional parts of floats
  cCompilerDirChar  = '$';    // follows comment opener in compiler directives
  cStringDelim      = '''';   // single quote that delimits string literals
  cCloseParen       = ')';    // second character of the '.)' symbol
  cEOL = TTextStreamReader.EOL;   // end of line marker supplied by the reader
  cEOF = TTextStreamReader.EOF;   // end of file marker supplied by the reader

  // String tables
  cDoubleSyms: array[0..9] of string = (         // list of valid double symbols
    '(*', '(.', '*)', '.)', '..', '//', ':=', '<=', '>=', '<>'
  );
  cCommentOpeners: array[0..2] of string = (       // symbols that open comments
    '{', '(*', '//'
  );
  cCommentClosers: array[0..2] of string = (      // symbols that close comments
    // item at given index matches openers at same index
    // ('//' comments are closed by the end of line marker)
    '}', '*)', cEOL
  );
  cCompilerDirOpeners: array[0..1] of string = (    // comment symbols that open
    '{', '(*'                                             // compiler directives
  );
  // Keywords are looked up ignoring case. Note that 'inline' appears both here
  // and in the directives table: ParseIdent tests for keywords first.
  cKeywords: array[0..68] of string = (     // table of keywords per Delphi 2006
    'and', 'array', 'as', 'asm',
    'begin',
    'case', 'class', 'const', 'constructor',
    'destructor', 'dispinterface', 'div', 'do', 'downto',
    'else', 'end', 'except', 'exports',
    'file', 'final', 'finalization', 'finally', 'for', 'function',
    'goto',
    'if', 'implementation', 'in', 'inherited',
    'initialization', 'inline', 'interface', 'is',
    'label', 'library',
    'mod',
    'nil', 'not',
    'object', 'of', 'or', 'out',
    'packed', 'procedure', 'program', 'property',
    'raise', 'record', 'repeat', 'resourcestring',
    'sealed', 'set', 'shl', 'shr', 'static', 'string',
    'then', 'threadvar', 'to', 'try', 'type',
    'unit', 'unsafe', 'until', 'uses',
    'var',
    'while', 'with',
    'xor'
  );

  // Table of directives. Entries are looked up ignoring case and the table is
  // sorted at run time, so the alphabetical order here is for readability only.
  cDirectives: array[0..48] of string = (                 // table of directives
    'absolute',     // used in variable declaration
    'abstract',     // method directive
    'assembler',    // flags routine as containing assembler
    'assembly',     // retained for backward compatibility with earlier table
    'at',           // only occurs in raise statement
    'automated',    // used in class declarations
    'cdecl',        // calling convention
    'contains',     // package clause
    'default',      // used in property declarations
    'deprecated',   // portability directive
    'dispid',       // used in automated properties
    'dynamic',      // method directive
    'experimental', // portability directive
    'export',       // calling convention (ignored)
    'external',     // routine directive
    'far',          // calling convention (ignored)
    'forward',      // routine directive
    'implements',   // used in property declarations
    'index',        // used in property declarations and re DLLs
    'inline',       // flags a routine as inlinable
    'local',        // routine directive
    'message',      // method directive
    'name',         // used re DLLs
    'near',         // calling convention (ignored)
    'nodefault',    // used in property declarations
    'on',           // used in exception handlers
    'overload',     // method / routine directive
    'override',     // method directive
    'package',      // introduces a package
    'pascal',       // calling convention
    'platform',     // portability directive
    'private',      // used in class declarations
    'protected',    // used in class declarations
    'public',       // used in class declarations
    'published',    // used in class declarations
    'read',         // used in property declarations
    'readonly',     // property directive in dispinterfaces
    'register',     // calling convention
    'reintroduce',  // method directive
    'requires',     // package clause
    'resident',     // directive used in exports clauses (ignored)
    'safecall',     // calling convention
    'stdcall',      // calling convention
    'stored',       // used in property declarations
    'strict',       // visibility modifier (strict private / strict protected)
    'varargs',      // method / routine directive
    'virtual',      // method directive
    'write',        // used in property declarations
    'writeonly'     // property directive in dispinterfaces
  );

  // Maps symbols onto likely tokens or error if token shouldn't occur (eg
  // close comments).
  // Symbols mapped to tkHex, tkChar, tkString or tkComment only introduce the
  // token: ParseSymbol continues parsing with the matching ParseXXX method.
  // Symbols mapped to tkSymbol or tkError are complete tokens in themselves.
  cSymToTokenMap: array[0..32] of record
    Symbol: string;           // symbol strings
    Token: THilitePasToken;   // related token
  end = (
    ( Symbol: '$';  Token: tkHex;     ),
    ( Symbol: '#';  Token: tkChar;    ),
    ( Symbol: '&';  Token: tkSymbol;  ),
    ( Symbol: ''''; Token: tkString;  ),
    ( Symbol: '(';  Token: tkSymbol;  ),
    ( Symbol: ')';  Token: tkSymbol;  ),
    ( Symbol: '*';  Token: tkSymbol;  ),
    ( Symbol: '+';  Token: tkSymbol;  ),
    ( Symbol: ',';  Token: tkSymbol;  ),
    ( Symbol: '-';  Token: tkSymbol;  ),
    ( Symbol: '.';  Token: tkSymbol;  ),
    ( Symbol: '/';  Token: tkSymbol;  ),
    ( Symbol: ':';  Token: tkSymbol;  ),
    ( Symbol: ';';  Token: tkSymbol;  ),
    ( Symbol: '<';  Token: tkSymbol;  ),
    ( Symbol: '=';  Token: tkSymbol;  ),
    ( Symbol: '>';  Token: tkSymbol;  ),
    ( Symbol: '@';  Token: tkSymbol;  ),
    ( Symbol: '[';  Token: tkSymbol;  ),
    ( Symbol: ']';  Token: tkSymbol;  ),
    ( Symbol: '^';  Token: tkSymbol;  ),
    ( Symbol: '{';  Token: tkComment; ),
    ( Symbol: '}';  Token: tkError;   ),
    ( Symbol: '(*'; Token: tkComment; ),
    ( Symbol: '*)'; Token: tkError;   ),
    ( Symbol: '(.'; Token: tkSymbol;  ),
    ( Symbol: '.)'; Token: tkSymbol;  ),
    ( Symbol: '..'; Token: tkSymbol;  ),
    ( Symbol: '//'; Token: tkComment; ),
    ( Symbol: ':='; Token: tkSymbol;  ),
    ( Symbol: '<='; Token: tkSymbol;  ),
    ( Symbol: '>='; Token: tkSymbol;  ),
    ( Symbol: '<>'; Token: tkSymbol;  )
  );


type
  // Class that maps symbols to tokens
  TSymbolMap = TDictionary<string,THilitePasToken>;

var
  // Private objects used to store and search lists of symbols and keywords.
  // Each starts as nil, is created on first use by the routine that needs it
  // and is freed in this unit's finalization section.
  pvtKeywords: TStringList = nil;   // keywords list
  pvtDirectives: TStringList = nil; // directives list
  pvtDoubleSyms: TStringList = nil; // list of double symbols
  pvtSymMap: TSymbolMap = nil;      // map of symbols to tokens


{ Helper routines }

function IsValidIdentBodyChar(const C: Char): Boolean; inline;
  {Tests whether a character may appear in a Delphi identifier at any position
  after the first, i.e. whether it is an underscore, a letter or a digit.
    @param C [in] Character to examine.
    @return True if C may appear in the body of an identifier, False if not.
  }
begin
  Result := (C = '_') or TCharacter.IsLetterOrDigit(C);
end;

function IsValidIdentStartChar(const C: Char): Boolean; inline;
  {Tests whether a character may begin a Delphi identifier, i.e. whether it is
  an underscore or a letter.
    @param C [in] Character to examine.
    @return True if C may start an identifier, False if not.
  }
begin
  Result := (C = '_') or TCharacter.IsLetter(C);
end;

function IsWhiteSpaceChar(const C: Char): Boolean; inline;
  {Tests whether a character counts as white space within a line. Carriage
  return, line feed and the end of file marker are never treated as white
  space, whatever TCharacter.IsWhiteSpace reports for them.
    @param C [in] Character to examine.
    @return True if C is in-line white space, False if not.
  }
begin
  Result := not CharInSet(C, [CR, LF, cEOF]) and TCharacter.IsWhiteSpace(C);
end;

function IsSymbolChar(const C: Char): Boolean; inline;
  {Tests whether a character is one of the characters that can form, or
  begin, a Pascal symbol.
    @param C [in] Character to examine.
    @return True if C is a symbol character, False if not.
  }
begin
  case C of
    '#', '$', '&', '''',
    '('..'/',     // ( ) * + , - . /
    ':'..'>',     // : ; < = >
    '@', '[', ']', '^', '{', '}':
      Result := True;
    else
      Result := False;
  end;
end;

function IsExponentChar(const C: Char): Boolean; inline;
  {Tests whether a character introduces the exponent of a number written in
  "E" notation.
    @param C [in] Character to examine.
    @return True if C is 'E' or 'e', False if not.
  }
begin
  Result := (C = 'E') or (C = 'e');
end;

function IsUnaryPlusMinusChar(const C: Char): Boolean; inline;
  {Tests whether a character is a sign character, i.e. plus or minus.
    @param C [in] Character to examine.
    @return True if C is '+' or '-', False if not.
  }
begin
  Result := (C = '+') or (C = '-');
end;

function IsSeparatorChar(const C: Char): Boolean; inline;
  {Tests whether a character separates tokens: the end of line marker, any
  symbol character and any in-line white space character all qualify.
    @param C [in] Character to examine.
    @return True if C is a separator, False if not.
  }
begin
  Result := (C = cEOL) or IsSymbolChar(C) or IsWhiteSpaceChar(C);
end;

function IndexInTable(const Str: string; const Table: array of string): Integer;
  {Finds the position of the first entry of a table that matches a string.
    @param Str [in] String to look for. Comparison ignores case.
    @param Table [in] Table of strings to be searched.
    @return Zero based index of first matching entry or -1 if there is none.
  }
var
  EntryIdx: Integer;  // index of table entry being compared
begin
  // Note: callers rely on the returned index being zero based
  for EntryIdx := Low(Table) to High(Table) do
    if AnsiSameText(Table[EntryIdx], Str) then
      Exit(EntryIdx);
  Result := -1;
end;

procedure InitStringList(out Strings: TStringList;
  const Table: array of string);
  {Creates a string list, fills it with the entries of a table and then makes
  it a sorted list.
    @param Strings [out] Receives the newly created string list. Caller is
      responsible for freeing it.
    @param Table [in] Table of strings to be copied into the string list.
  }
var
  Entry: string;  // each entry of the table in turn
begin
  Strings := TStringList.Create;
  for Entry in Table do
    Strings.Add(Entry);
  // Entries are added before sorting is switched on, so list is sorted once
  Strings.Sorted := True;
  Strings.CaseSensitive := False;
end;

procedure InitSymbolMap(out Map: TSymbolMap);
  {Creates the object that maps symbols to tokens and loads it with every
  entry of the cSymToTokenMap table.
    @param Map [out] Receives the newly created map. Caller is responsible for
      freeing it.
  }
var
  EntryIdx: Integer;  // index of each entry in cSymToTokenMap
begin
  // None of the symbols in the table contains a letter, so whether or not the
  // comparer takes account of case makes no difference to lookups
  Map := TSymbolMap.Create(TSameStringEqualityComparer.Create);
  for EntryIdx := Low(cSymToTokenMap) to High(cSymToTokenMap) do
    Map.Add(
      cSymToTokenMap[EntryIdx].Symbol, cSymToTokenMap[EntryIdx].Token
    );
end;

function IsDoubleSym(const Symbol: string): Boolean;
  {Tests whether some text is one of the recognised two character symbols.
  The lookup list is built from cDoubleSyms the first time it is needed.
    @param Symbol [in] Text to be tested.
    @return True if Symbol is a recognised two character symbol.
  }
begin
  if pvtDoubleSyms = nil then
    InitStringList(pvtDoubleSyms, cDoubleSyms);
  Result := pvtDoubleSyms.IndexOf(Symbol) <> -1;
end;

function IsDirective(const Ident: string): Boolean;
  {Tests whether an identifier is listed in the table of directives. The
  lookup list is built from cDirectives the first time it is needed.
    @param Ident [in] Identifier to be tested.
    @return True if Ident is a directive, False if not.
  }
begin
  if pvtDirectives = nil then
    InitStringList(pvtDirectives, cDirectives);
  Result := pvtDirectives.IndexOf(Ident) <> -1;
end;

function IsKeyword(const Ident: string): Boolean;
  {Tests whether an identifier is listed in the table of keywords. The lookup
  list is built from cKeywords the first time it is needed.
    @param Ident [in] Identifier to be tested.
    @return True if Ident is a keyword, False if not.
  }
begin
  if pvtKeywords = nil then
    InitStringList(pvtKeywords, cKeywords);
  Result := pvtKeywords.IndexOf(Ident) <> -1;
end;

function SymbolToToken(const Symbol: string): THilitePasToken;
  {Looks up the token that a symbol is likely to represent or introduce. The
  lookup map is built from cSymToTokenMap the first time it is needed.
    @param Symbol [in] Symbol to be looked up.
    @return Token associated with the symbol or tkError if the symbol is not
      in the map.
  }
begin
  if pvtSymMap = nil then
    InitSymbolMap(pvtSymMap);
  // Single lookup: any value stored in Result by a failed search is replaced
  if not pvtSymMap.TryGetValue(Symbol, Result) then
    Result := tkError;
end;

function MatchingCommentCloser(const CommentOpener: string): string;
  {Finds the symbol that closes a comment begun with a given opening symbol.
    @param CommentOpener [in] Symbol that opened the comment. Must be one of
      the entries of cCommentOpeners.
    @return Symbol that closes the comment.
  }
var
  OpenerIdx: Integer; // position of opener, and so of closer, in their tables
begin
  // Note: the opener and closer tables are zero based and parallel, so the
  // index of the opener in its table is also the index of its closer
  OpenerIdx := IndexInTable(CommentOpener, cCommentOpeners);
  Assert(OpenerIdx >= 0, 'MatchingCommentCloser: invalid comment opener');
  Result := cCommentClosers[OpenerIdx];
end;

function IsCompilerDirOpener(const Str: string): Boolean;
  {Tests whether some text is a comment opening symbol that may also begin a
  compiler directive, i.e. whether it is listed in cCompilerDirOpeners.
    @param Str [in] Text to be tested.
    @return True if Str can open a compiler directive, False if not.
  }
begin
  Result := IndexInTable(Str, cCompilerDirOpeners) <> -1;
end;


{ THilitePasLexer }

constructor THilitePasLexer.Create(const Stm: TStream);
  {Object constructor. Creates the reader object through which the lexer gets
  its characters.
    @param Stm [in] Stream containing the Pascal source code to be analysed.
  }
begin
  inherited Create;
  // All input is read through this object, which is freed in the destructor
  fReader := TTextStreamReader.Create(Stm);
end;

destructor THilitePasLexer.Destroy;
  {Object destructor. Frees the reader object created by the constructor.
  }
begin
  fReader.Free;
  inherited Destroy;
end;

function THilitePasLexer.NextToken: THilitePasToken;
  {Gets and analyses next Pascal token from input and stores details in token
  string. The kind of token is also recorded for the Token property.
    @return Token identifying type of token read. tkEOF is returned once the
      end of the input has been reached.
  }
begin
  // Reset token string
  fTokenStr := '';
  // Decide on method used to parse the token
  if not fCommentState.InComment then
  begin
    // We are not in a multi-line comment: process normally
    if IsWhiteSpaceChar(fReader.Ch) then
      Result := ParseWhiteSpace
    else if IsValidIdentStartChar(fReader.Ch) then
      Result := ParseIdent
    else if IsSymbolChar(fReader.Ch) then
      Result := ParseSymbol
    else if TCharacter.IsDigit(fReader.Ch) then
      Result := ParseNumber
    else if fReader.Ch = cEOL then
      Result := ParseEOL
    else if fReader.Ch = cEOF then
      Result := tkEOF
    else
      Result := ParseUnknown;
  end
  else
  begin
    // We're in a multiline comment: char is from inside comment, EOL or EOF
    if fReader.Ch = cEOL then
      Result := ParseEOL
    else if fReader.Ch = cEOF then
    begin
      // Input ended straight after an end of line inside an unclosed comment.
      // There is no comment text left, so close the comment and report EOF
      // rather than returning a comment token with an empty token string
      fCommentState.InComment := False;
      Result := tkEOF;
    end
    else
      Result := ParseCommentInterior;
  end;
  // Record the token
  fToken := Result;
end;

function THilitePasLexer.ParseChar: THilitePasToken;
  {Analyses a character literal, which is written as '#' followed by either a
  whole number or a hex number, and stores it in the token string.
    @return tkChar if the character after '#' begins a hex or whole number,
      tkError otherwise.
  }
begin
  // On entry token string already holds '#' and the current character is the
  // one following it
  if SymbolToToken(fReader.Ch) = tkHex then
  begin
    // Hex form: record the hex symbol, step past it and read the hex digits.
    // Result of ParseHex is not used: token is a character literal regardless
    UpdateTokenStr;
    fReader.NextChar;
    ParseHex;
    Exit(tkChar);
  end;
  // Not hex, so only a whole number is acceptable
  if not TCharacter.IsDigit(fReader.Ch) then
    Exit(tkError);
  ParseWholeNumber;
  Result := tkChar;
end;

function THilitePasLexer.ParseCommentFromStart: THilitePasToken;
  {Starts parsing a comment or compiler directive whose opening symbol has
  just been read. Records the details of the comment then parses its body.
    @return Token telling whether this is a comment or compiler directive
      (tkComment, tkCompilerDir).
  }
begin
  // On entry token string holds the comment opening symbol and the current
  // character is the one that follows the symbol

  fCommentState.InComment := True;
  fCommentState.CommentCloser := MatchingCommentCloser(fTokenStr);

  // This is a compiler directive only if the opener is one that is valid for
  // directives ('{' or '(*') and it is immediately followed by '$'. Anything
  // else is a plain comment
  fCommentState.CommentType := tkComment;
  if IsCompilerDirOpener(fTokenStr) and (fReader.Ch = cCompilerDirChar) then
    fCommentState.CommentType := tkCompilerDir;

  // Now deal with the text of the comment
  Result := ParseCommentInterior;
end;

function THilitePasLexer.ParseCommentInterior: THilitePasToken;
  {Analyses body of comment after start or after resuming processing multi-line
  comments. Comment text is appended to the token string until the comment's
  closing symbol, an end of line or the end of file is reached. The comment
  state is updated to show whether the comment is still open.
    @return Token telling whether this is a comment or compiler directive
      (tkComment, tkCompilerDir).
  }
var
  Done: Boolean;  // flag true when we have finished comment
begin
  // Preconditions: comment state must have been set up before this is called
  Assert(fCommentState.InComment,
    ClassName + '.ParseCommentInterior: called when not in comment');
  Assert(fCommentState.CommentType in [tkComment, tkCompilerDir],
    ClassName + '.ParseCommentInterior: invalid comment type');
  Assert(Length(fCommentState.CommentCloser) > 0,
    ClassName + '.ParseCommentInterior: invalid comment closer');

  // Token returned is always the type recorded when the comment was opened
  Result := fCommentState.CommentType;

  // Loop thru all comment, looking for closing comment symbol
  // NOTE(review): every path through the loop body ends with the NextChar call
  // at the bottom of the loop, so each PutBackChar below is presumably undone
  // by that call, leaving the put back character current - confirm against
  // TTextStreamReader
  Done := False;
  while (fReader.Ch <> cEOF) and not Done do
  begin
    if fReader.Ch = fCommentState.CommentCloser[1] then
    begin
      // We have encountered 1st char of a comment "closer"
      if Length(fCommentState.CommentCloser) = 1 then
      begin
        // Our closer is a single char: comment is closed
        Done := True;
        fCommentState.InComment := False;
        if fCommentState.CommentCloser[1] = cEOL then
          // closer is EOL: put it back to be read later
          fReader.PutBackChar
        else
          // closer not EOL: add it to token string
          UpdateTokenStr;
      end
      else
      begin
        // Our possible closer has two chars
        // Record first char in token string
        UpdateTokenStr;
        // Peek ahead at next char
        fReader.NextChar;
        if fReader.Ch = fCommentState.CommentCloser[2] then
        begin
          // This is the expected closer: comment is closed
          Done := True;
          fCommentState.InComment := False;
          UpdateTokenStr;
        end
        else
          // False alarm: put back the char we peeked at
          fReader.PutBackChar;
      end;
    end
    else
    begin
      // Ordinary comment text
      if fReader.Ch = cEOL then
      begin
        // EOL: put it back and stop parsing
        // the comment stays open: we will continue processing after EOL handled
        Done := True;
        fReader.PutBackChar;
      end
      else
        // Not EOL: add char to token string
        UpdateTokenStr;
    end;
    fReader.NextChar;
  end;
  // If at EOF ensure that comment is closed
  if fReader.Ch = cEOF then
    fCommentState.InComment := False;
end;

function THilitePasLexer.ParseEOL: THilitePasToken;
  {Deals with an end of line in the input: appends the end of line marker to
  the token string and moves on to the next character.
    @return End of line token (tkEOL).
  }
begin
  Result := tkEOL;
  UpdateTokenStr(cEOL);
  fReader.NextChar;
end;

function THilitePasLexer.ParseHex: THilitePasToken;
  {Analyses a hexadecimal integer from input and stores in token string.
    @return Token indicating hexadecimal value (tkHex) if the hex digits are
      followed by a separator or by the end of the input, otherwise tkError.
  }
begin
  // Called with the '$' already in token string (fTokenStr is '$' when called
  // from ParseSymbol or '#$' when called from ParseChar) and with fReader.Ch
  // being the char after '$'
  // Build string of hex digits
  while IsHexDigit(fReader.Ch) do
  begin
    UpdateTokenStr;
    fReader.NextChar;
  end;
  // Check that we ended in a valid way: error if not
  // The end of the input is as valid a terminator as any separator: without
  // this test a hex number at the very end of the source is flagged as an error
  if IsSeparatorChar(fReader.Ch) or (fReader.Ch = cEOF) then
    Result := tkHex
  else
    Result := tkError;
end;

function THilitePasLexer.ParseIdent: THilitePasToken;
  {Reads an identifier from input into the token string then classifies it as
  a keyword, a directive or a plain identifier.
    @return Token representing identifier: tkKeyword, tkDirective or
      tkIdentifier.
  }
begin
  Assert(IsValidIdentStartChar(fReader.Ch),
    ClassName + '.ParseIdent: identifier starting character expected');
  // Gather up every character that can be part of an identifier
  while IsValidIdentBodyChar(fReader.Ch) do
  begin
    UpdateTokenStr;
    fReader.NextChar;
  end;
  // Plain identifier unless it is in the keyword table or, failing that, in
  // the directive table. Keywords take precedence over directives
  Result := tkIdentifier;
  if IsKeyword(fTokenStr) then
    Result := tkKeyword
  else if IsDirective(fTokenStr) then
    Result := tkDirective;
end;

function THilitePasLexer.ParseNumber: THilitePasToken;
  {Reads a number from input into the token string. The number may be a whole
  number or a floating point number, with or without an exponent.
    @return tkNumber for a whole number, tkFloat for a floating point number
      or tkError if an exponent is not followed by at least one digit.
  }
var
  PointCh: Char;  // holds decimal point while we look at the char after it
begin
  Assert(TCharacter.IsDigit(fReader.Ch),
    ClassName + '.ParseNumber: digit expected');
  // Every number begins with a run of digits. After reading them the current
  // char is the one straight after the digits
  ParseWholeNumber;
  // It's a whole number until we find evidence to the contrary
  Result := tkNumber;

  if fReader.Ch = cDecimalPoint then
  begin
    // A '.' after the digits usually means a float, but it could instead be
    // the start of the '..' or '.)' symbols. Remember the point and look at
    // the character after it to find out which
    PointCh := fReader.Ch;
    fReader.NextChar;
    if (fReader.Ch = cDecimalPoint) or (fReader.Ch = cCloseParen) then
    begin
      // It's a two character symbol, not a decimal point: restore the char we
      // looked at so that the '.' is current again and return the whole number
      fReader.PutBackChar;
      Exit;
    end;
    // Genuine decimal point: add it to the token string
    UpdateTokenStr(PointCh);
    // Fractional digits are optional (2. is a valid Delphi float)
    if TCharacter.IsDigit(fReader.Ch) then
      ParseWholeNumber;
    Result := tkFloat;
  end;

  // No exponent means there is nothing more to read
  if not IsExponentChar(fReader.Ch) then
    Exit;

  // We have the 'e' or 'E' of "scientific" notation. It may follow the whole
  // number, the decimal point or the fractional digits: 2e4, 2.e3, 2.0e4 and
  // 2.0e-4 are all valid. Add it to the token string
  UpdateTokenStr;
  fReader.NextChar;
  // Exponent may be signed
  if IsUnaryPlusMinusChar(fReader.Ch) then
  begin
    UpdateTokenStr;
    fReader.NextChar;
  end;
  // Digits of the exponent are compulsory
  if not TCharacter.IsDigit(fReader.Ch) then
    Exit(tkError);
  ParseWholeNumber;
  Result := tkFloat;
end;

function THilitePasLexer.ParseString: THilitePasToken;
  {Analyses a string literal from input and stores in token string. Parsing
  stops after the closing quote or, if the literal is not terminated, at the
  end of the current line or the end of the input. Since Pascal string
  literals can't span lines, an unterminated literal is never allowed to
  swallow the end of line and the lines that follow it.
    @return String token (tkString).
  }
var
  Done: Boolean;  // flag true when done parsing string
begin
  // Note: token string already contains opening quote - current char is first
  // character of the string after the quote
  Done := False;
  // Loop thru characters until end of string found. An EOL also ends the
  // loop: it is left as the current character so that it is returned as a
  // separate end of line token by the next call to NextToken
  while (fReader.Ch <> cEOF) and (fReader.Ch <> cEOL) and not Done do
  begin
    UpdateTokenStr;
    if fReader.Ch = cStringDelim then
    begin
      // Could be closing quote or pair of quotes used to embed quote in string
      // we need to read ahead to check this
      fReader.NextChar;
      if fReader.Ch = cStringDelim then
        // this is a pair of quotes ('') => embeds quote in string => not done
        UpdateTokenStr
      else
      begin
        // not a pair of quotes => string completed
        Done := True;
        // put back char we read ahead: the NextChar call below then makes it
        // the current character again
        fReader.PutBackChar;
      end;
    end;
    fReader.NextChar;
  end;
  Result := tkString;
end;

function THilitePasLexer.ParseSymbol: THilitePasToken;
  {Reads a one or two character symbol from input then decides whether it is
  a token in its own right or whether it introduces a comment, string,
  character literal or hex number. In the latter cases the rest of the entity
  is parsed. The text of the whole token is stored in the token string.
    @return Token describing entity parsed.
  }
var
  SymbolKind: THilitePasToken;  // kind of token the symbol stands for
begin
  Assert(IsSymbolChar(fReader.Ch), ClassName + '.ParseSymbol: symbol expected');
  // Record first (maybe only) character of symbol and move on
  UpdateTokenStr;
  fReader.NextChar;
  // If the next character is a symbol too and the pair form a valid two
  // character symbol then that character is part of this token
  if IsSymbolChar(fReader.Ch) and IsDoubleSym(fTokenStr + fReader.Ch) then
  begin
    UpdateTokenStr;
    fReader.NextChar;
  end;
  // Token string now contains the complete symbol: find what it represents
  SymbolKind := SymbolToToken(fTokenStr);
  // Some symbols only introduce a token: parse remainder of any such token
  case SymbolKind of
    tkComment:  Result := ParseCommentFromStart;
    tkString:   Result := ParseString;
    tkChar:     Result := ParseChar;
    tkHex:      Result := ParseHex;
    else
      // symbol is the entire token
      Result := SymbolKind;
  end;
end;

function THilitePasLexer.ParseUnknown: THilitePasToken;
  {Deals with a character that can't begin any recognised token: appends the
  character to the token string and moves on to the next character.
    @return Error token (tkError).
  }
begin
  UpdateTokenStr;
  fReader.NextChar;
  Result := tkError;
end;

function THilitePasLexer.ParseWhiteSpace: THilitePasToken;
  {Reads a run of white space characters from input. Each character read is
  appended, unchanged, to the token string.
    @return White space token (tkWhitespace).
  }
begin
  Assert(IsWhiteSpaceChar(fReader.Ch),
    ClassName + '.ParseWhiteSpace: current char not white space');
  // Gather white space until some other kind of character is found
  while IsWhiteSpaceChar(fReader.Ch) do
  begin
    UpdateTokenStr;
    fReader.NextChar;
  end;
  Result := tkWhiteSpace;
end;

function THilitePasLexer.ParseWholeNumber: THilitePasToken;
  {Reads a run of digits from input and appends them to the token string. On
  exit the current character is the first one after the digits.
    @return Whole number token (tkNumber).
  }
begin
  Assert(TCharacter.IsDigit(fReader.Ch),
    ClassName + '.ParseWholeNumber: current char not a digit');
  // Gather digits until some other kind of character is found
  while TCharacter.IsDigit(fReader.Ch) do
  begin
    UpdateTokenStr;
    fReader.NextChar;
  end;
  Result := tkNumber;
end;

procedure THilitePasLexer.UpdateTokenStr;
  {Adds the reader's current character to the end of the token string. Does
  nothing if that character is the end of file marker.
  }
begin
  // Overloaded version that takes a character does all the work
  UpdateTokenStr(fReader.Ch);
end;

procedure THilitePasLexer.UpdateTokenStr(const Ch: Char);
  {Adds a character to the end of the token string. Does nothing if the
  character is the end of file marker.
    @param Ch [in] Character to be added.
  }
begin
  if Ch = cEOF then
    Exit;   // end of file marker is never part of a token
  fTokenStr := fTokenStr + Ch;
end;


initialization


finalization

// Release the private lookup objects. Each one is created only when first
// needed, so any of them may still be nil here: calling Free on a nil
// reference is safe.
pvtKeywords.Free;
pvtDirectives.Free;
pvtDoubleSyms.Free;
pvtSymMap.Free;

end.