懒人在思考?WAF研究中...

tiny-c 是 pandolia 大神在 GitHub 上的开源项目,目标是实现一个小型的 C 语言,属于语言开发类项目,对学习语义分析非常友好。tiny-c 的词法分析,就是将 tiny-c 源码的文本流转换成 token 流,let's go ~

  • 单字元运算符

/* '-' is listed first so it is a literal; written as "*-/" it forms a range that also matches '.' */
OPERATOR ([-+*/%=,;!<>(){}])

  • 双字元运算符和关键字

<= >= == != && ||
void int while if else return break continue print readint

  • 整数常量、字元串常量和标识符(变数名、函数名)

/* \042 is the octal escape for the double quote character */
INTEGER ([0-9]+)
UNTERM_STRING (\042[^\042\n]*)
STRING (\042[^\042\n]*\042)
IDENTIFIER ([_a-zA-Z][_a-zA-Z0-9]*)

[^\n]:正则表达式中 ^ 是"非"的意思,即匹配除换行符以外的任意字符——不要让我看见换行!

编写scanner.l文件:告诉你如何将文本流变成token流,也就是切割一个个token。

%{
/* C prologue: flex copies everything between %{ and %} verbatim into lex.yy.c. */

/* token.h and the driver code below call printf/puts, so pull in stdio
 * explicitly instead of relying on whatever the generated scanner includes. */
#include <stdio.h>
#include "token.h"

/* Line number reported by lex_error; starts at 1 and is incremented by the
 * newline rule in the rules section. */
int cur_line_num = 1;

/* Prints the header of the token table. */
void init_scanner(void);

/* Prints a lexical error message `msg` for source line `line`. */
void lex_error(char* msg, int line);
%}

/* Definitions, note: \042 is the octal escape for the double quote character */
INTEGER             ([0-9]+)
/* A string that reaches end of line without a closing quote. */
UNTERM_STRING       (\042[^\042\n]*)
/* A complete string constant; it may not span lines. */
STRING              (\042[^\042\n]*\042)
IDENTIFIER          ([_a-zA-Z][_a-zA-Z0-9]*)
/* '-' is listed first so it is a literal; written as "*-/" it forms a range that also matches '.' */
OPERATOR            ([-+*/%=,;!<>(){}])
/* Both comment styles run to the end of the current line. */
SINGLE_COMMENT1     ("//"[^\n]*)
SINGLE_COMMENT2     ("#"[^\n]*)

%%

[\n]                { cur_line_num++; /* keep the line counter used by lex_error in sync */ }
[ \t\r\a]+          { /* ignore all spaces */ }
{SINGLE_COMMENT1}   { /* skip for single line comment */ }
{SINGLE_COMMENT2}   { /* skip for single line comment */ }

{OPERATOR}          { return yytext[0]; /* single-character operators use their own character code as the token */ }

"<="                { return T_Le; /* flex prefers the longest match, so this beats {OPERATOR} on "<=" */ }
">="                { return T_Ge; }
"=="                { return T_Eq; }
"!="                { return T_Ne; }
"&&"                { return T_And; }
"||"                { return T_Or; }
"void"              { return T_Void; /* keywords precede {IDENTIFIER}: on equal length the earlier rule wins */ }
"int"               { return T_Int; }
"while"             { return T_While; }
"if"                { return T_If; }
"else"              { return T_Else; }
"return"            { return T_Return; }
"break"             { return T_Break; }
"continue"          { return T_Continue; }
"print"             { return T_Print; }
"readint"           { return T_ReadInt; }

{INTEGER}           { return T_IntConstant; }
{STRING}            { return T_StringConstant; }
{IDENTIFIER}        { return T_Identifier; }

<<EOF>>             { return 0; /* 0 ends the token loop in main */ }

{UNTERM_STRING}     { lex_error("Unterminated string constant", cur_line_num); /* no token is returned; scanning continues */ }
.                   { lex_error("Unrecognized character", cur_line_num); }

%%

/*
 * Driver for the scanner: prints the table header, then pulls tokens from
 * yylex() until it returns 0, printing each token's type followed by the
 * text that was matched (yytext).
 */
int main(int argc, char* argv[]) {
    int tok;

    init_scanner();
    for (tok = yylex(); tok != 0; tok = yylex()) {
        print_token(tok);   /* left column: token type */
        puts(yytext);       /* right column: matched text, newline-terminated */
    }
    return 0;
}

/*
 * Print the two-line header of the token table: the column titles
 * (the first padded to 20 characters) and a separator rule.
 */
void init_scanner(void) {
    printf("%-20s%s\n", "TOKEN-TYPE", "TOKEN-VALUE");
    printf("-------------------------------------------------\n");
}

/*
 * Report a lexical error on standard output.
 *
 * msg:  description of the error.
 * line: source line number the error was detected on.
 *
 * The message is surrounded by blank lines so it stands out from the
 * token table; the function returns normally so scanning can continue.
 */
void lex_error(char* msg, int line) {
    printf("\nError at line %-3d: %s\n\n", line, msg);
}

/*
 * End-of-input hook. Always returns 1; per the flex manual a non-zero
 * result means there is no further input to scan.
 */
int yywrap(void) {
    enum { NO_MORE_INPUT = 1 };

    return NO_MORE_INPUT;
}

token.h:主要是为了列印token,大于256的token会查找枚举类型表enum,小于256的token会直接列印,这里只会列印单字元运算符。

#ifndef TOKEN_H
#define TOKEN_H

/*
 * Token codes for everything that is not a single-character operator.
 * Numbering starts at 256; print_token treats any code below 256 as a
 * plain character. The order must match the name table in print_token.
 */
typedef enum {
    T_Le = 256,         /* "<=" */
    T_Ge,               /* ">=" */
    T_Eq,               /* "==" */
    T_Ne,               /* "!=" */
    T_And,              /* "&&" */
    T_Or,               /* "||" */
    T_IntConstant,      /* integer literal */
    T_StringConstant,   /* string literal */
    T_Identifier,       /* variable or function name */
    T_Void,
    T_Int,
    T_While,
    T_If,
    T_Else,
    T_Return,
    T_Break,
    T_Continue,
    T_Print,
    T_ReadInt
} TokenType;

/*
 * Print a token's type, left-justified in a 20-column field.
 *
 * token: a code below 256 is printed as the character itself; a code from
 *        256 upward is looked up in the name table. Codes beyond the end
 *        of the table print "T_Unknown" instead of reading past the array.
 */
static void print_token(int token) {
    /* Indexed by (token - 256); order must mirror the TokenType enum. */
    static const char *const token_strs[] = {
        "T_Le", "T_Ge", "T_Eq", "T_Ne", "T_And", "T_Or", "T_IntConstant",
        "T_StringConstant", "T_Identifier", "T_Void", "T_Int", "T_While",
        "T_If", "T_Else", "T_Return", "T_Break", "T_Continue", "T_Print",
        "T_ReadInt"
    };
    const int first_named = 256;
    const int named_count = (int)(sizeof token_strs / sizeof token_strs[0]);

    if (token < first_named) {
        printf("%-20c", token);
    } else if (token - first_named < named_count) {
        printf("%-20s", token_strs[token - first_named]);
    } else {
        printf("%-20s", "T_Unknown");
    }
}

#endif

makefile:

# Default target: build the scanner executable. `out` is not a real file.
.PHONY: out
out: scanner

# Compile the flex-generated C file ($< is lex.yy.c); token.h is listed so
# that editing the header triggers a rebuild. Recipe lines must start with a tab.
scanner: lex.yy.c token.h
	gcc -o $@ $<

# Generate lex.yy.c from the flex specification.
lex.yy.c: scanner.l
	flex $<

运行结果:右边的tiny-c文本变成了左边的token流 ~

参考文献:自己动手写编译器(pandolia.net/tinyc/)

推荐阅读:

相关文章