Design and implementation of C subset lexical analyzer based on Flex

Preparation

To write a lexical analyzer for a subset of C, we first need to understand what that subset contains.
Using the C basic syntax tutorial (Rookie course) as a reference, here is a summary:
Numbers
We need to consider decimal, hexadecimal, octal and binary integers, positive and negative numbers, floating-point numbers, scientific notation, and numeric suffixes such as u, l and f (note that floating-point numbers cannot take the u/U suffix). That should be everything.
Keyword:
auto ,break,case,char,const,continue,default,do,double,else,enum,extern,float,for,goto,if,int,long ,register,return,short,signed,sizeof,static,struct,switch,typedef,unsigned,union,void ,volatile,while.
Identifier:
A C identifier is the name used to identify a variable, function, or any other user-defined item. An identifier begins with a letter (A-Z or a-z) or an underscore, followed by zero or more letters, underscores, and digits (0-9).
Operator:
Arithmetic operators: +, -, *, /, %, ++, --
Relational operators: ==, !=, >, <, >=, <=
Logical operators: &&, ||, !
Bitwise operators: &, |, ^, ~, << and >>
Assignment operators: =, +=, -=, *=, /=, %=, <<=, >>=, &=, ^=, |=
I don't think flex alone can implement miscellaneous operators.
Punctuation
Three kinds of brackets: {} [] (), plus : , ; . -> " '
Comments
Two kinds of comments: // and /* */
That should be everything.

Implementation

Script:

#!/bin/bash
# Build the lexer from its flex specification, run it interactively, then
# remove the build products.
set -e  # abort at the first failing step instead of reporting a bogus success

# Generate the C scanner source from the flex specification.
flex -o C-lexical-analyzer.yy.c C-lexical-analyzer.l
echo "flex Compile completed"
# Compile the generated scanner and link it against the flex library (-lfl).
gcc -o C-lexical-analyzer C-lexical-analyzer.yy.c -lfl
echo "gcc Compile completed"
# The generated C source is not needed once the binary exists.
rm C-lexical-analyzer.yy.c
echo "Executing:(Ctrl+D Can end input)"
./C-lexical-analyzer
# Clean up the binary after the interactive session ends.
rm C-lexical-analyzer

flex Code:

/*
 * file:   C-lexical-analyzer.l
 * author: jin1ming
 * system: manjaro
 *
 * Flex specification of a lexical analyzer for a subset of C. Every rule
 * action prints the category of the matched token followed by its text.
 */
/* Ask flex to keep the current input line number in yylineno. */
%option yylineno
%{
/* printf() is used by every rule action. */
#include<stdio.h>
/* Line counter; read by the fallback rule's message and set in main(). */
extern int yylineno;
%}

/*Number definition*/
/*Scientific notation: a decimal with an optional fraction and an optional exponent (so it also covers plain integers and plain floats)*/
science {decimal}(\.[0-9]+)?([Ee][-+]?[0-9]+)?
/*Decimal integer with an optional leading sign*/
decimal ([-+])?(0|[1-9][0-9]*)
/*Hexadecimal integer: 0x or 0X prefix*/
hexadecimal 0[xX][a-fA-F0-9]+
/*Binary integer: 0b or 0B prefix*/
binary 0[bB][01]+
/*Octal integer: leading 0 followed by octal digits*/
octal 0[0-7]+
/*Any numeric literal followed by an optional suffix: u and l in either order, or f*/
number ({hexadecimal}|{binary}|{science}|{octal})(([uU]?[Ll]?)|([Ll]?[Uu]?)|([fF]?))
/*The suffix group above also lets u/U follow a floating-point value, which is not allowed, so such literals are caught by a dedicated pattern*/
/*Malformed-number patterns*/
/*Float with a u/U suffix. The exponent group must start with e or E; if that letter were optional, input such as 1.5-3u would be swallowed as a single token*/
floatexcption {decimal}\.[0-9]+([Ee][-+]?[0-9]+)?[Uu]
/*Any run of two or more characters that starts with a digit and continues with digits, letters or dots*/
excption [0-9][0-9a-zA-Z\.]+

/*Keywords: one named definition per reserved word, each written as a quoted literal*/
AUTO        "auto"
BREAK       "break"
CASE        "case"
CHAR        "char"
CONST       "const"
CONTINUE    "continue"
DEFAULT     "default"
DO          "do"
DOUBLE      "double"
ELSE        "else"
ENUM        "enum"
EXTERN      "extern"
FLOAT       "float"
FOR         "for"
GOTO        "goto"
IF          "if"
INT         "int"
LONG        "long"
REGISTER    "register"
RETURN      "return"
SHORT       "short"
SIGNED      "signed"
SIZEOF      "sizeof"
STATIC      "static"
STRUCT      "struct"
SWITCH      "switch"
TYPEDEF     "typedef"
UNSIGNED    "unsigned"
UNION       "union"
VOID        "void"
VOLATILE    "volatile"
WHILE       "while"

/*Identifier: a letter or underscore followed by any number of letters, digits or underscores*/
identifier [a-z_A-Z][a-z_A-Z0-9]*

/*Other characters*/
/*Comment: a line comment running to the end of the line, or a block comment that stops at the first closing delimiter. A greedy (.|\n)* body would run on to the last slash in the input*/
comment (\/\/.*)|(\/\*([^*]|\*+[^*/])*\*+\/)
/*Runs of blanks, tabs, newlines and other whitespace*/
whitespace [ \t\n\r\f\v]+
/*Any single character other than a newline; used by the last rule as the catch-all*/
errno .
 
 /*Operators and punctuation, each written as a quoted literal*/
 /*Arithmetic operators*/
ADD             "+"
SUB             "-"
MUL             "*"
QUO             "/"
REM             "%"
INC             "++"
DEC             "--"
 /*Assignment operators*/
ASSIGN          "="
ADD_ASSIGN      "+="
SUB_ASSIGN      "-="
MUL_ASSIGN      "*="
QUO_ASSIGN      "/="
REM_ASSIGN      "%="
AND_ASSIGN      "&="
OR_ASSIGN       "|="
XOR_ASSIGN      "^="
SHL_ASSIGN      "<<="
SHR_ASSIGN      ">>="
 /*NOTE(review): C has no ~= operator; the definition is kept so behavior is unchanged*/
AND_NOT_ASSIGN  "~="
 /*Bitwise operators*/
AND             "&"
OR              "|"
XOR             "^"
SHL             "<<"
SHR             ">>"
AND_NOT         "~"
 /*Logical operators*/
LAND            "&&"
LOR             "||"
NOT             "!"
 /*Relational operators*/
EQL             "=="
LSS             "<"
GTR             ">"
NEQ             "!="
LEQ             "<="
GEQ             ">="
 /*Punctuation*/
LPAREN          "("
LBRACK          "["
LBRACE          "{"
COMMA           ","
PERIOD          "."
RPAREN          ")"
RBRACK          "]"
RBRACE          "}"
SEMICOLON       ";"
COLON           ":"
POT             "->"
DQUA            "\""
SQUA            "'"
%%
 
 /*Keywords: every pattern shares the action on the last line (a lone | means "same action as the next rule")*/
{AUTO}      |
{BREAK}     |
{CASE}      |
{CHAR}      |
{CONST}     |
{CONTINUE}  |
{DEFAULT}   |
{DO}        |
{DOUBLE}    |
{ELSE}      |
{ENUM}      |
{EXTERN}    |
{FLOAT}     |
{FOR}       |
{GOTO}      |
{IF}        |
{INT}       |
{LONG}      |
{REGISTER}  |
{RETURN}    |
{SHORT}     |
{SIGNED}    |
{SIZEOF}    |
{STATIC}    |
{STRUCT}    |
{SWITCH}    |
{TYPEDEF}   |
{UNSIGNED}  |
{UNION}     |
{VOID}      |
{VOLATILE}  |
{WHILE}     {printf("Key Word:  %s\n",yytext);}

 /*Floats carrying a u/U suffix; listed before {number} so this rule wins when both match the same length*/
{floatexcption} {printf("Float Execption: %s\n",yytext);}
 /*Well-formed numeric literals*/
{number} {printf("Number:  %s\n",yytext);}
 /*Any longer digit-led run is reported as a malformed number*/
{excption} {printf("Number Execption:  %s\n",yytext);}

 /*Whitespace is discarded; a comment is reported without echoing its text*/
{whitespace} {}
{comment} {printf("This is a comment.\n");}

 /*Operators: every pattern shares the action on the last line; when several patterns match, the longest match is taken, so ++ is reported as one token rather than two + tokens*/
 /*Arithmetic operators*/
{ADD}             |
{SUB}             |
{MUL}             |
{QUO}             |
{REM}             |
{INC}             |
{DEC}             |
 /*Logical operators*/
{LAND}            |
{LOR}             |
{NOT}             |
 /*Assignment operators*/
{ASSIGN}          |
{ADD_ASSIGN}      |
{SUB_ASSIGN}      |
{MUL_ASSIGN}      |
{QUO_ASSIGN}      |
{REM_ASSIGN}      |
{AND_ASSIGN}      |
{OR_ASSIGN}       |
{XOR_ASSIGN}      |
{SHL_ASSIGN}      |
{SHR_ASSIGN}      |
{AND_NOT_ASSIGN}  |
 /*Bitwise operators*/
{AND}             |
{OR}              |
{XOR}             |
{SHL}             |
{SHR}             |
{AND_NOT}         |
 /*Relational operators*/
{EQL}             |
{LSS}             |
{GTR}             |
{NEQ}             |
{LEQ}             |
{GEQ}             {printf("Operator:  %s\n",yytext);}
 /*Punctuation: every pattern shares the action on the last line of the group*/
{LPAREN}     |
{LBRACK}     |
{LBRACE}     |
{COMMA}      |
{PERIOD}     |
{RPAREN}     |
{RBRACK}     |
{RBRACE}     |
{SEMICOLON}  |
{COLON}      |
{POT}        |
{DQUA}       |
{SQUA}       {printf("Punctuation:  %s\n",yytext);}


 /*Identifiers come after the keyword rules, so a word that is exactly a keyword is reported as a keyword*/
{identifier} {printf("ID:  %s\n",yytext);}
 /*Catch-all: any character no earlier rule accepted is reported together with its line number*/
{errno} {printf("On line %d,mystery character:  %s\n",yylineno,yytext);}
%%
/*
 * Program entry point: sets the line counter to 1, runs the scanner once
 * with a single yylex() call, and exits with status 0.
 */
int main(int argc,char **argv)
{
        /* The command-line arguments are accepted but never used. */
        (void)argc;
        (void)argv;

        yylineno = 1;   /* line numbers reported by the scanner start at 1 */
        yylex();        /* rule actions print each token as it is matched */

        return 0;
}
/*
 * End-of-input hook that the flex scanner calls when it runs out of input.
 * Returning 1 reports that there is no further input, so yylex() stops.
 * The name must be spelled exactly "yywrap" for the scanner to use it.
 */
int yywrap(void){
        return 1;
}

Test effect:

$ ./start.sh   # compile script
 Compile completed, please execute C-lexical-analyzer manually
[jin1ming@ML C-Lex]$ ./C-lexical-analyzer 
78.987e76f
Number:  78.987e76f
@
On line 2,mystery character:  @
adsda
ID:  adsda
srg090
ID:  srg090
_12
ID:  _12
!@#$#
Operator:  !
121qwqer
Number Execption:  121qwqer
1.2.
Number Execption:  1.2.
1.2uf
Number Execption:  1.2uf
==
Operator:  ==
-    
Operator:  -
=
Operator:  =
^
Operator:  ^
!
Operator:  !
>>
Operator:  >>
,.!
Punctuation:  ,
Punctuation:  .
Operator:  !

Next article: Design and implementation of C subset parser based on Bison

Posted by ShootingBlanks on Thu, 21 Nov 2019 11:37:28 -0800