Study of Lexical Analyzer
Study of Lexical Analyzer
Lexical Analyzer :
A program or function which performs lexical analysis is called a lexical analyzer, lexer or
scanner. Lexical analysis is the process of converting a sequence of characters into a sequence of
tokens. A lexer often exists as a single function which is called by a parser or another function.
The specification of a programming language will often include a set of rules which defines the
lexer. These rules are usually called regular expressions and they define the set of possible
character sequences that are used to form tokens or lexemes. Whitespace (i.e. characters that are
ignored) is also defined in the regular expressions. Tokens are sequences of characters with a
collective meaning. There are usually only a small number of tokens for a programming
language: constants (integer, double, char, string, etc.), operators (arithmetic, relational, logical),
punctuation, and reserved words.
#include<stdio.h>
#include<conio.h>
#include<ctype.h>
#include<string.h>
#include<stdlib.h>
/* Sizes and sentinel values used by the lexer. */
#define SIZE 128    /* capacity of buffer[] */
#define NONE (-1)   /* initial value of tokenval */
#define EOS '\0'    /* end-of-string marker */

/* Token codes. Every code is distinct (ASSIGN previously duplicated NUM). */
#define NUM 256
#define KEYWORD 257
#define PAREN 258
#define ID 259
#define ASSIGN 260
#define REL_OP 261
#define DONE 262

#define MAX 999     /* capacity of lexemes[] and symtable[] */

char lexemes[MAX];  /* storage for the text of every symbol-table entry */
char buffer[SIZE];
int lastchar = -1;  /* index of the last used position in lexemes[] */
int lastentry = 0;  /* index of the last used slot in symtable[] */
int tokenval = NONE;
int lineno = 1;

/* One symbol-table record: a pointer to the lexeme text and its token code. */
struct entry {
    char *lexptr;
    int token;
};

/* The symbol table indexed by look_up() and insert(). */
struct entry symtable[MAX];

/* Reserved words, terminated by a {0, 0} sentinel. */
struct entry keywords[] = {
    {"if", KEYWORD},
    {"else", KEYWORD},
    {"for", KEYWORD},
    {"int", KEYWORD},
    {"float", KEYWORD},
    {"double", KEYWORD},
    {"char", KEYWORD},
    {"struct", KEYWORD},
    {"return", KEYWORD},
    {0, 0}
};
Void Error_Message(char *m)
{
fprintf(stderr,”line %d:%s\n”,lineno,m);
exit(1);
}
/*
 * Search the symbol table for the lexeme s.
 * Scans symtable[] from lastentry down to index 1.
 * Returns the index of the matching entry, or 0 when s is not present
 * (index 0 is never examined, so 0 is free to mean "not found").
 */
int look_up(char s[])
{
    int k;

    for (k = lastentry; k > 0; k = k - 1) {
        if (strcmp(symtable[k].lexptr, s) == 0) {
            return k;   /* was `return 0`, which hid every successful match */
        }
    }
    return 0;
}
/*
 * Add the lexeme s with token code tok to the symbol table.
 * The text of s is copied into lexemes[] right after the last stored
 * lexeme, and the new symtable[] entry points at that copy.
 * Returns the index of the new entry. Calls Error_Message() when either
 * symtable[] or lexemes[] has no room left.
 */
int insert(char s[], int tok)
{
    int len;

    len = strlen(s);
    if (lastentry + 1 >= MAX)
        Error_Message("Symbol Table is Full");
    /* room is needed for len characters plus the terminating '\0' */
    if (lastchar + len + 1 >= MAX)
        Error_Message("Lexemes Array is Full");
    lastentry = lastentry + 1;
    symtable[lastentry].token = tok;
    symtable[lastentry].lexptr = &lexemes[lastchar + 1];
    /* skip past the whole lexeme and its terminator, not just one byte,
     * so the next insertion does not overwrite this one */
    lastchar = lastchar + len + 1;
    strcpy(symtable[lastentry].lexptr, s);
    return lastentry;
}
void Initialize()
{
struct entry *ptr;
for(ptr=keyword:ptr->token;ptr++)
insert(ptr->lexptr.ptr->token);
}
/*
 * NOTE(review): this listing appears to be truncated. It opens as lexer(),
 * but after the whitespace-skipping loop it continues with code that prints
 * a description of `lookahead`, which is not declared in this function.
 * The typographic quotes and the names If, Printf and prient are not valid
 * C and must be corrected before this compiles.
 */
int lexer()
{
int t;
int val,i=0; /* neither val nor i is used in the visible part */
while(1){
t=getchar();
if(t==’’||t==’\t’); /* empty statement: the character is simply skipped */
else if(t==’\n’)
lineno=lineno+1; /* count input lines */
} /* NOTE(review): no break or return inside the loop as shown */
If(lookahead==’+’ || lookahead==’-’ || lookahead==’*’ || lookahead==’/’)
Printf(“\n operator”);
If(lookahead==PAREN)
printf(“\n Parenthesis”);
if(lookahead==ID)
{
printf(“\n Identifier:”);
/* print the lexeme stored in the symbol-table slot selected by tokenval */
printf(“%s”,symtable[tokenval].lexptr);
}
if(lookahead=KEYWORD) /* NOTE(review): `=` assigns; `==` was probably intended */
prient (“\n Keyword”);
if(lookahead=ASSIGN) /* NOTE(review): `=` assigns; `==` was probably intended */
prient (“\nAssignment operator”);
if(lookahead=REL_OP) /* NOTE(review): `=` assigns; `==` was probably intended */
prient (“\nRelational operator”);
lookahead=lexer(); /* fetch the next token */
}
}
#include<stdio.h>
#include<conio.h>
#include<ctype.h>
#include<string.h>
#include<stdlib.h>
/* Sizes and sentinel values used by the lexer. */
#define SIZE 128    /* capacity of buffer[] */
#define NONE (-1)   /* initial value of tokenval */
#define EOS '\0'    /* end-of-string marker */

/* Token codes. Every code is distinct (ASSIGN previously duplicated NUM). */
#define NUM 256
#define KEYWORD 257
#define PAREN 258
#define ID 259
#define ASSIGN 260
#define REL_OP 261
#define DONE 262

#define MAX 999     /* capacity of lexemes[] and symtable[] */

char lexemes[MAX];  /* storage for the text of every symbol-table entry */
char buffer[SIZE];
int lastchar = -1;  /* index of the last used position in lexemes[] */
int lastentry = 0;  /* index of the last used slot in symtable[] */
int tokenval = NONE;
int lineno = 1;

/* One symbol-table record: a pointer to the lexeme text and its token code. */
struct entry {
    char *lexptr;
    int token;
};

/* The symbol table indexed by look_up() and insert(). */
struct entry symtable[MAX];

/* Reserved words, terminated by a {0, 0} sentinel. */
struct entry keywords[] = {
    {"if", KEYWORD},
    {"else", KEYWORD},
    {"for", KEYWORD},
    {"int", KEYWORD},
    {"float", KEYWORD},
    {"double", KEYWORD},
    {"char", KEYWORD},
    {"struct", KEYWORD},
    {"return", KEYWORD},
    {0, 0}
};
Void Error_Message(char *m)
{
fprient(stderr,”line %d : %s \n”,lineno,m);
exit(1);
}
/*
 * Search the symbol table for the lexeme s.
 * Scans symtable[] from lastentry down to index 1.
 * Returns the index of the matching entry, or 0 when s is not present
 * (index 0 is never examined, so 0 is free to mean "not found").
 */
int look_up(char s[])
{
    int k;

    for (k = lastentry; k > 0; k = k - 1) {
        if (strcmp(symtable[k].lexptr, s) == 0) {
            return k;   /* was `return 0`, which hid every successful match */
        }
    }
    return 0;
}
/*
 * NOTE(review): truncated listing. Only the header and first declaration of
 * insert() are present; the `else` branch that follows has no matching `if`
 * and returns `t`, which is not declared here, so it presumably belongs to
 * the end of another function (likely lexer()) -- confirm against the
 * original listing.
 */
int insert(char s[],int tok)
{
int len;
else
{
tokenval=NONE; /* no attribute value accompanies the token being returned */
return t;
}
}
}
void Match(int t)
{
if(lookahead ==t)
lookahead=lexer();
else
Error_Message (“Syntex error”);
}
}
void F()
{
Void E()
Switch(lookahead)
{
case ‘(‘:
Match(‘(‘);
E();
Match(‘)’);
break;
case NUM:
display(NUM,tokenval);
Match(NUM);
break;
case ID:
display(ID,tokenval);
Match(ID); break;
default:
Error_Message(“Syntex Error”);
}
}
/*
 * Entry point of the parser demo: clears the screen, loads the keyword
 * table with Initialize(), prompts for an expression and runs parser().
 * Returns 0 to the host environment.
 */
int main(void)
{
    clrscr();
    Initialize();
    printf("\n Enter the expression");
    printf("And place ; at the end");
    parser();
    return 0;
}
#include<stdio.h>
#include<conio.h>
#include<string.h>
#include<stdlib.h>
/* Fixed-size character stack; st is the single file-level instance. */
struct stack
{
    char s[30];     /* stack storage */
    int top;        /* index into s[] used as the stack top */
} st;
/*
 * Entry point: reads one whitespace-delimited expression from standard
 * input and passes it to input_to_code().
 * Returns 0 on success, 1 when no input could be read.
 */
int main(void)
{
    char input[30];
    void input_to_code(char infix[30]);

    clrscr();
    printf("\n Enter an input in the form of expression");
    /* %29s leaves room for the terminating '\0' in the 30-byte buffer */
    if (scanf("%29s", input) != 1) {
        return 1;
    }
    input_to_code(input);
    getch();
    return 0;
}
/*
 * Converts the expression in `input` using the file-level stack `st`,
 * collecting output characters in polish[].
 * NOTE(review): the listing stops before the function's closing brace, so
 * what happens to polish[] after the final loop is not visible here.
 * NOTE(review): the typographic quotes and the name `If` are not valid C.
 */
void input_to_code(char input[30])
{
st.top=-1;
st.s[st.top]=’$’; /* NOTE(review): top is -1 here, so this writes st.s[-1], outside the array */
char polish[30];
int i,j;
char ch;
/* helpers declared locally; their definitions are not in this listing */
int instack(char ch);
int incoming (char ch);
void push(char item);
char pop();
j=0;
strrev(input); /* the input string is reversed in place before scanning */
for(i=0;input[i]!=’\0’;i++)
{
ch=input[i];
/* pop to the output while the stack-top value from instack() exceeds incoming(ch) */
while(instack(st.s[st.top])>incoming(ch))
{
polish[j]=pop();
j++;
}
/* differing values: push ch; equal values: drop the stack top instead */
If(instack(st.s[st.top])!=incoming(ch))
push(ch);
else
pop();
}
/* move what is left on the stack to the output until '$' is popped */
while((ch=pop())!=’$’)
{
polish[j]=ch;
j++;
}
#include<stdio.h>
#include<conio.h>
#include<stdlib.h>
/* Boolean flag values. */
#define TRUE 1
#define FALSE 0

/* One cell of a singly linked list: an int payload and a link to the next cell. */
typedef struct Heap node;

struct Heap
{
    int data;       /* value stored in this cell */
    node *next;     /* following cell */
};

/* Declared here, defined later in the listing; returns a node pointer. */
node *create();
/*
 * NOTE(review): this listing is two routines fused together by a gap in the
 * text. It opens as main() (declarations and the start of a menu) and then
 * continues with a loop that builds a linked list and returns `head`, which
 * presumably is the body of create() -- confirm against the original.
 * New, temp and flag are used below but not declared in the visible text.
 * The typographic quotes and the names scan and While are not valid C.
 */
void main()
{
int choice ,val;
char ans;
node *head;
/* local prototypes for routines defined elsewhere */
void display(node *);
node *search(node *,int);
node *intserch(node *);
void dele(node **);
head=NULL;
do
{
clrscr();
printf(“\n Program to Perform various operation on Heap using dynamic memory
management”);
printf(“\n1.Create”);
printf(“\n2.Display”);
/* NOTE(review): the rest of the menu is missing; list-building code follows */
do
{
printf(“\n Enter the element:”);
scan(“%d”,&val);
New=get_node();
if(New==NULL)
printf(“\nMemory is not allocated”);
New->data=val; /* NOTE(review): still executed when New is NULL */
if(flag==TRUE)
{
/* first node: it becomes the head of the list */
head=New;
temp=head;
flag=FALSE;
}
else
{
/* later nodes: append after the current tail and advance the tail */
temp->next=New;
temp=New;
}
printf(“\nDo you want to enter more element?(y/n)”);
ans=getche();
}
While(ans==’y’)
{
printf(“\n The List is createde”);
getch();
clrscr();
return head;
}
/*
 * Allocate one list node.
 * Returns a pointer to a new node whose next link is NULL, or NULL when
 * the allocation fails; the caller must check the result. The data field
 * is left uninitialised.
 */
node *get_node()
{
    node *temp;

    temp = malloc(sizeof *temp);
    if (temp != NULL) {
        /* only touch the node when malloc actually returned memory */
        temp->next = NULL;
    }
    return temp;
}
void display(node *head)
{
node *temp;
___________________________________________________________________
%{
/* Nonzero while the scanner is inside a C block comment. Most rules below
 * test it so that text inside a comment is not reported. */
int COMMENT = 0;
/* Number of block comments closed so far. */
int cnt = 0;
%}
identifier [a-zA-Z][a-zA-Z0-9]*
%%
#.* {printf("\n%s is a PREPROCESSOR DIRECTIVE",yytext);}
int |
float |
char |
double |
while |
for |
do |
if |
break |
continue |
void |
switch |
case |
long |
struct |
const |
typedef |
return |
else |
goto {printf("\n\t%s is a KEYWORD",yytext);}
"/*" {COMMENT=1;}
"*/" {COMMENT=0;
cnt++;
}
{identifier}\( {if(!COMMENT) printf("\n\t%s FUNCTIONALCALL",yytext);}
\{ {if(!COMMENT) printf("\n BLOCK BEGINS");}
\} {if(!COMMENT) printf("\n BLOCK ENDS");}
{identifier}(\[[0-9]*\])? {if(!COMMENT) printf("\n\t%s is a identifier",yytext);}
\"[^"\n]*\" {if(!COMMENT) printf("\n\t%s is a string",yytext);}
[0-9]+ {if(!COMMENT) printf("\n\t%s is a number",yytext);}
\)(\;)? {if(!COMMENT) printf("\n\t");ECHO;printf("\n");}
\( ECHO;
= {if(!COMMENT) printf("\n\t%s is a assignment operator",yytext);}
\+ |
\- {if(!COMMENT) printf("\n\t%s is aoperator",yytext);}
\<=|
|>=|
\<|
==|
______________________________________________________________________________
______________________________________________________________________________
A compiler is a program which reads a program written in one language (a High Level
Language) and translates it into an equivalent target program (in a Low Level Language).
A compiler is a computer program (or set of programs) that transforms source code written in
a programming language (the source language) into another computer language (the target
language, often having a binary form known as object code). The most common reason for
wanting to transform source code is to create an executable program.
The name "compiler" is primarily used for programs that translate source code from a high-level
programming language to a lower level language (e.g., assembly language or machine code). If
the compiled program can only run on a computer whose CPU or operating system is different
from the one on which the compiler runs, the compiler is known as a cross-compiler. A program
that translates from a low level language to a higher level one is a decompiler. A program that
translates between high-level languages is usually called a language translator, source to source
translator, or language converter. A language rewriter is usually a program that translates the
form of expressions without a change of language.
A compiler is likely to perform many or all of the following operations: lexical analysis,
preprocessing, parsing, semantic analysis (Syntax-directed translation), code generation,
and code optimization.
Program faults caused by incorrect compiler behavior can be very difficult to track down and
work around and compiler implementors invest a lot of time ensuring the correctness of their
software.
The term compiler-compiler is sometimes used to refer to a parser generator, a tool often used to
help create the lexer and parser.
______________________________________________________________________________
In computer programming, a one-pass compiler is a compiler that passes through the source code
of each compilation unit only once. In other words, a one-pass compiler does not "look back" at
code it previously processed. Some programming languages have been designed specifically to
be compiled with one-pass compilers, and include special constructs to allow one-pass
compilation. A one-pass compiler is faster and has limited scope of passes. One-pass compilers
are sometimes called narrow compilers. Not every programming language can be compiled in a
single pass: for example, Pascal can be implemented with a single-pass compiler, whereas
languages like Java require a multi-pass compiler.
A multi-pass compiler is a type of compiler that processes the source code or abstract syntax tree
of a program several times. This is in contrast to a one-pass compiler, which traverses the
program only once. Each pass takes the result of the previous pass as the input, and creates an
intermediate output. In this way, the code is improved pass by pass, until the final pass emits the
final code. Multi-pass compilers are sometimes called wide compilers, referring to the greater
scope of the passes: they can see the entire program being compiled, instead of just a small
portion of it. The wider scope thus available to these compilers allows better code generation (e.g.
smaller code size, faster code) compared to the output of one-pass compilers, at the cost of higher
compile time and memory consumption. Multi-pass compilers are slower to run, but the code they
generate is much more efficient.
3. Cross compiler :
A cross compiler is a compiler capable of creating executable code for a platform other than the
one on which the compiler is run. Cross compiler tools are used to generate executables for
embedded system or multiple platforms. It is used to compile for a platform upon which it is not
feasible to do the compiling, like microcontrollers that don't support an operating system. It has
become more common to use this tool for paravirtualization, where a system may have one or
more platforms in use. The utilization of a cross compiler is common when there is a need to
make use of multiple platforms in order to handle computing functions. This will include
embedded systems where each
______________________________________________________________________________
1. Optimizing compiler :
Compiler optimization is the process of tuning the output of a compiler to minimize or maximize
some attribute of an executable computer program. The most common requirement is to
minimize the time taken to execute a program; a less common one is to minimize the amount of
memory occupied. The growth of portable computers has created a market for minimizing the
power consumed by a program. Compiler optimization is generally implemented using a
sequence of optimizing transformations, algorithms which take a program and transform it to
produce an output program that uses fewer resources.
A compiler takes a program in a source language, creates some internal representation while
checking the syntax of the program, performs semantic checks, and finally generates something
that can be executed to produce the intended effect of the program. The obvious candidate for
object technology in a compiler is the symbol table: a mapping from user-defined names to their
properties as expressed in the program. It turns out however, that compiler implementation
benefits from object technology in many more areas.
______________________________________________________________________________
1. Analysis Phase
2. Synthesis phase
c) Semantic analysis - input is parse tree and the output is expanded version of parse tree
d) Intermediate Code generation - Here all the errors are checked and an
intermediate code is produced.
e) Code Optimization - the intermediate code is optimized here to get the target program
f) Code Generation - this is the final step & here the target program code is generated.
______________________________________________________________________________
Compiler is a program that translates a computer program written on one computer language to
another computer language. A "compiler" is primarily used for programs that translate source
code from a high level language to a lower level language (e.g., assembly language or machine
language). A program that translates from a low level language to a higher level one is a
decompiler. A compiler for a relatively simple language written by one person might be a single,
monolithic piece of software. When the source language is large and complex, and high quality
output is required, the design may be split into a number of relatively independent phases, or
passes. Having separate phases means development can be parceled up into small parts and
given to different people. It also becomes much easier to replace a single phase by an improved
one, or to insert new phases later.
An interpreter is a program that translates an instruction into machine language and executes it
before proceeding to the next instruction. It is a high-level programming language translator that
translates and runs the program at the same time. It translates one program statement into
machine language, executes it, and then proceeds to the next statement. This differs from regular
executable programs that are presented to the computer as binary-coded instructions. Interpreted
programs remain in the source language the programmer wrote in, which is human readable text.
Interpreted programs run slower than their compiled counterparts. Whereas the compiler
translates the entire program before it is run, interpreters translate a line at a time while the
program is being run. However, it is very convenient to write an interpreted program, since a
single line of code can be tested interactively.
______________________________________________________________________________
An error recovery method is powerful to the extent that it accurately diagnoses and reports all
syntactic errors without reporting errors that are not actually present. A successful recovery, then,
has two components:
An “accurate” diagnosis is one that results in a recovery action that effects the “correction” that
a knowledgeable human reader would choose. This notion of accuracy agrees with our intuition
but cannot be precisely defined. In some instances, of course, the nature of the error is
ambiguous, but at the very least, the diagnosis and corresponding recovery should not result in an
excessive deletion of tokens or spurious or missed error detections. The development of a
minimum-distance corrector [1] is not the purpose here, although in practice a minimum-distance
correction should almost always be chosen.
The “practicality” requirement imposes certain constraints: Substantial space or time overhead, in
terms of the parsing framework or enhancements of the grammar, should not be incurred. Thus
the time and space costs of parsing a correct program should not appreciably increase. It is
further required that in practice the average time cost of a recovery should not vary with program
length. Also, this cost should be small enough to allow for incorporation of the method
in a production compiler.
Our method is language independent, but it does allow for tuning with respect to particular
languages and implementations through the setting of language specific parameters. Some of
these provide the means for heuristically controlling recovery actions for certain common or
troublesome errors; others improve recoveries for errors involving absent or distorted scope
information. The method does not depend on the presence of these parameters, and an
implementation may ignore them completely.
Most of the literature on syntactic error recovery confines its empirical studies to Pascal
programs. But, owing to Ada’s higher syntactic complexity, syntax errors tend to pose more of a
difficulty in Ada than Pascal programs. In that we have applied the method with success to both
Ada and Pascal, there is some empirical evidence for our claim that the method is essentially
language independent.
A significant result is that our general LR and LL versions perform equally well on all PTESTS
examples. The method thus shows itself to be equally applicable to LR and LL parsing. This
result suggests that there is not much to choose between LR and LL as far as the quality of error
recovery is concerned.
______________________________________________________________________________
such as:
    x := y op z
where x, y and z are variables, constants or temporary variables generated by the compiler. op
represents any operator, e.g. an arithmetic operator. Expressions containing more than one
fundamental operation, such as:
    p := x + y * z
are not representable in three-address code as a single instruction. Instead, they are decomposed
into an equivalent series of instructions, such as
    t1 := y * z
    p := x + t1
The term three-address code is still used even if some instructions use more or fewer than two
operands. The key features of three-address code are that every instruction implements exactly
one fundamental operation, and that the source and destination may refer to any available
register. A refinement of three-address code is static single assignment form (SSA).
Decaf TAC Instructions
The convention followed in the examples below is that t1, t2, and so on refer to variables (either
declared variables or temporaries) and L1, L2, etc. are used for labels.
Labels mark the target for a goto/branch and are used to identify function/method
definitions and vtables.
Assignment:
t2 = t1;
t1 = "abcdefg";
t1 = 8;
t3 = _L0;
(rvalue can be variable, string/int constant, or label)
Function/method calls:
LCall L1;
t1 = LCall L1;
ACall t1;
t0 = ACall t1;
(LCall a function label known at compile-time, ACall a computed function address, most likely
from vtable. Each has two forms
______________________________________________________________________________
_____________________________________________________________________________