Preparation
In order to write a lexical analyzer of C subsets, we should first understand what C subsets are.
Reference: C basic syntax (Rookie course). To summarize first:
number
We need to consider decimal, hexadecimal, octal, and binary integers, positive and negative numbers, decimal fractions, scientific notation, and numeric suffixes such as u, l, and f (note that floating-point numbers cannot take the u/U suffix). That should be everything.
Keyword:
auto, break, case, char, const, continue, default, do, double, else, enum, extern, float, for, goto, if, int, long, register, return, short, signed, sizeof, static, struct, switch, typedef, unsigned, union, void, volatile, while.
Identifier:
A C identifier is the name used to identify a variable, function, or any other user-defined item. An identifier begins with a letter (A-Z or a-z) or an underscore, followed by zero or more letters, underscores, and digits (0-9).
Operator:
Arithmetic operators: +, -, *, /, %, ++, --
Relational operators: ==, !=, >, <, >=, <=
Logical operators: &&, ||, !
Bitwise operators: &, |, ^, ~, << and >>
Assignment operators: =, +=, -=, *=, /=, %=, <<=, >>=, &=, ^=, |=
I don't think flex alone can implement miscellaneous operators.
Punctuation
Three kinds of brackets: {} [] (), plus : , ; . -> " '
Comments
Two kinds of comments: // and /* */
That should be everything.
Implementation
Script:
#!/bin/bash
# Build the flex scanner, run it interactively on stdin, then clean up.
#
# The commands must each be on their own line: anything that follows
# "#!/bin/bash" on the first line is part of the interpreter line, not a
# command to run.

# Abort at the first failing step so that a failed flex/gcc run is not
# followed by a misleading "Compile completed" message.
set -e

# Generate the C scanner from the flex specification.
flex -o C-lexical-analyzer.yy.c C-lexical-analyzer.l
echo "flex Compile completed"

# Compile the generated scanner and link against the flex runtime library.
gcc -o C-lexical-analyzer C-lexical-analyzer.yy.c -lfl
echo "gcc Compile completed"

# The generated C file is no longer needed once the binary exists.
rm C-lexical-analyzer.yy.c

echo "Executing:(Ctrl+D Can end input)"
./C-lexical-analyzer

# Remove the binary after the interactive session ends.
rm C-lexical-analyzer
flex Code:
/*
 * file:   C-lexical-analyzer.l
 * author: jin1ming
 * system: manjaro
 *
 * Lexical analyzer for a subset of C.  Reads standard input and prints one
 * line per recognized token (keyword, number, operator, punctuation,
 * identifier) or a diagnostic for malformed numbers and unknown characters.
 *
 * flex picks the longest match; when two rules match the same length the
 * rule listed first wins.  The rule order below relies on that: keywords
 * come before identifiers, and the catch-all rule comes last.
 */
%option yylineno

%{
#include <stdio.h>
extern int yylineno;
%}

/* ---- Numbers ---- */
/* Decimal integer or fraction with an optional exponent. */
science         {decimal}(\.[0-9]+)?([Ee][-+]?[0-9]+)?
/* Decimal integer with an optional sign. */
decimal         ([-+])?(0|[1-9][0-9]*)
/* Hexadecimal integer. */
hexadecimal     0[xX][a-fA-F0-9]+
/* Binary integer. */
binary          0[bB][01]+
/* Octal integer. */
octal           0[0-7]+
/* Any numeric literal followed by an optional u/l/f style suffix. */
number          ({hexadecimal}|{binary}|{science}|{octal})(([uU]?[Ll]?)|([Ll]?[Uu]?)|([fF]?))
/* A floating-point literal must not carry a u/U suffix; this pattern
   catches that case so it can be reported separately. */
floatexcption   {decimal}\.[0-9]+([Ee]?[-+]?[0-9]+)?[Uu]
/* Anything else that starts with a digit is a malformed number. */
excption        [0-9][0-9a-zA-Z\.]+

/* ---- Keywords ---- */
AUTO            auto
BREAK           break
CASE            case
CHAR            char
CONST           const
CONTINUE        continue
DEFAULT         default
DO              do
DOUBLE          double
ELSE            else
ENUM            enum
EXTERN          extern
FLOAT           float
FOR             for
GOTO            goto
IF              if
INT             int
LONG            long
REGISTER        register
RETURN          return
SHORT           short
SIGNED          signed
SIZEOF          sizeof
STATIC          static
STRUCT          struct
SWITCH          switch
TYPEDEF         typedef
UNSIGNED        unsigned
UNION           union
VOID            void
VOLATILE        volatile
WHILE           while

/* ---- Identifiers ---- */
identifier      [a-z_A-Z][a-z_A-Z0-9]*

/* ---- Comments, whitespace, catch-all ---- */
/* A line comment runs to the end of the line.  A block comment ends at the
   FIRST closing delimiter: the body is made of non-star characters, or runs
   of stars that are not followed by a slash.  A greedy "any character"
   body would instead run on to the last slash in the whole input. */
comment         ("//".*)|("/*"([^*]|\*+[^*/])*\*+"/")
whitespace      [ \t\n\r\f\v]+
/* Any single character not matched by an earlier rule (newline excluded). */
errno           .

/* ---- Operators ---- */
/* Operators are written as quoted strings so that no character in them
   (for example a leading "<") can be read as flex syntax. */
/* Arithmetic operators */
ADD             "+"
SUB             "-"
MUL             "*"
QUO             "/"
REM             "%"
INC             "++"
DEC             "--"
/* Assignment operators */
ASSIGN          "="
ADD_ASSIGN      "+="
SUB_ASSIGN      "-="
MUL_ASSIGN      "*="
QUO_ASSIGN      "/="
REM_ASSIGN      "%="
AND_ASSIGN      "&="
OR_ASSIGN       "|="
XOR_ASSIGN      "^="
SHL_ASSIGN      "<<="
SHR_ASSIGN      ">>="
/* NOTE(review): "~=" is not a standard C operator; it is kept so the token
   set stays the same as before. */
AND_NOT_ASSIGN  "~="
/* Bitwise operators */
AND             "&"
OR              "|"
XOR             "^"
SHL             "<<"
SHR             ">>"
AND_NOT         "~"
/* Logical operators */
LAND            "&&"
LOR             "||"
NOT             "!"
/* Relational operators */
EQL             "=="
LSS             "<"
GTR             ">"
NEQ             "!="
LEQ             "<="
GEQ             ">="

/* ---- Punctuation ---- */
LPAREN          "("
LBRACK          "["
LBRACE          "{"
COMMA           ","
PERIOD          "."
RPAREN          ")"
RBRACK          "]"
RBRACE          "}"
SEMICOLON       ";"
COLON           ":"
POT             "->"
DQUA            \"
SQUA            \'

%%

    /* Comments in this section are indented so that flex does not try to
       read them as patterns.  A rule whose action is "|" shares the action
       of the next rule. */

    /* Keywords: listed before {identifier} so they win the tie. */
{AUTO}              |
{BREAK}             |
{CASE}              |
{CHAR}              |
{CONST}             |
{CONTINUE}          |
{DEFAULT}           |
{DO}                |
{DOUBLE}            |
{ELSE}              |
{ENUM}              |
{EXTERN}            |
{FLOAT}             |
{FOR}               |
{GOTO}              |
{IF}                |
{INT}               |
{LONG}              |
{REGISTER}          |
{RETURN}            |
{SHORT}             |
{SIGNED}            |
{SIZEOF}            |
{STATIC}            |
{STRUCT}            |
{SWITCH}            |
{TYPEDEF}           |
{UNSIGNED}          |
{UNION}             |
{VOID}              |
{VOLATILE}          |
{WHILE}             { printf("Key Word: %s\n", yytext); }

    /* Float with a u/U suffix: reported before the generic number rule. */
{floatexcption}     { printf("Float Execption: %s\n", yytext); }

    /* Well-formed numeric literal. */
{number}            { printf("Number: %s\n", yytext); }

    /* Malformed numeric literal. */
{excption}          { printf("Number Execption: %s\n", yytext); }

    /* Whitespace is skipped silently; comments are reported. */
{whitespace}        { }
{comment}           { printf("This is a commit.\n"); }

    /* Operators: arithmetic, logical, assignment, bitwise, relational. */
{ADD}               |
{SUB}               |
{MUL}               |
{QUO}               |
{REM}               |
{INC}               |
{DEC}               |
{LAND}              |
{LOR}               |
{NOT}               |
{ASSIGN}            |
{ADD_ASSIGN}        |
{SUB_ASSIGN}        |
{MUL_ASSIGN}        |
{QUO_ASSIGN}        |
{REM_ASSIGN}        |
{AND_ASSIGN}        |
{OR_ASSIGN}         |
{XOR_ASSIGN}        |
{SHL_ASSIGN}        |
{SHR_ASSIGN}        |
{AND_NOT_ASSIGN}    |
{AND}               |
{OR}                |
{XOR}               |
{SHL}               |
{SHR}               |
{AND_NOT}           |
{EQL}               |
{LSS}               |
{GTR}               |
{NEQ}               |
{LEQ}               |
{GEQ}               { printf("Operator: %s\n", yytext); }

    /* Punctuation. */
{LPAREN}            |
{LBRACK}            |
{LBRACE}            |
{COMMA}             |
{PERIOD}            |
{RPAREN}            |
{RBRACK}            |
{RBRACE}            |
{SEMICOLON}         |
{COLON}             |
{POT}               |
{DQUA}              |
{SQUA}              { printf("Punctuation: %s\n", yytext); }

    /* Identifiers. */
{identifier}        { printf("ID: %s\n", yytext); }

    /* Catch-all: must stay last. */
{errno}             { printf("On line %d,mystery character: %s\n", yylineno, yytext); }

%%

/*
 * Entry point: scan standard input until end of file, printing tokens as
 * they are recognized.  Command-line arguments are not used.
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    yylineno = 1;
    yylex();
    return 0;
}

/*
 * Called by the scanner at end of input; returning 1 tells it there is no
 * further input to process.  The name must be spelled exactly "yywrap" for
 * the scanner to call it.
 */
int yywrap(void)
{
    return 1;
}
Test effect:
$ ./start.sh   # compile script
Compile completed, please execute C-lexical-analyzer manually
[jin1ming@ML C-Lex]$ ./C-lexical-analyzer 78.987e76f Number: 78.987e76f @ On line 2,mystery character: @ adsda ID: adsda srg090 ID: srg090 _12 ID: _12 !@#$# Operator: ! 121qwqer Number Execption: 121qwqer 1.2. Number Execption: 1.2. 1.2uf Number Execption: 1.2uf == Operator: == - Operator: - = Operator: = ^ Operator: ^ ! Operator: ! >> Operator: >> ,.! Punctuation: , Punctuation: . Operator: !
Next article: Design and implementation of C subset parser based on Bison