懶人在思考?WAF研究中...

tiny-c 是 pandolia 大神在 GitHub 上的開源項目,目標是實現一個小型的 C 語言,屬於語言開發類項目,對學習語義分析非常友好。tiny-c 的詞法分析,就是將 tiny-c 源碼的文本流轉換成 token 流,Let's go ~

  • 單字元運算符

OPERATOR ([+*-/%=,;!<>(){}])

  • 雙字元運算符和關鍵字

<= >= == != && ||
void int while if else return break continue print readint

  • 整數常量、字元串常量和標識符(變數名、函數名)

INTEGER ([0-9]+)
UNTERM_STRING (\042[^\042\n]*)
STRING (\042[^\042\n]*\042)
IDENTIFIER ([_a-zA-Z][_a-zA-Z0-9]*)

[^\n]:在正則表達式中,方括號內開頭的 ^ 表示「非」,所以 [^\n] 匹配除換行符以外的任意字元——不要讓我看見換行!

編寫scanner.l文件:告訴你如何將文本流變成token流,也就是切割一個個token。

%{
/* Prologue: this section is copied into the generated scanner source. */
#include "token.h"
/* Current input line, starting at 1; incremented by the newline rule and
   passed to lex_error() for error reports. */
int cur_line_num = 1;
/* Prints the column header of the token listing. */
void init_scanner();
/* Prints an error message together with the line number it refers to. */
void lex_error(char* msg, int line);
%}

/* Definitions. Note: \042 is the octal escape for the double-quote character. */
INTEGER             ([0-9]+)
/* A quote followed by anything up to end of line, with no closing quote. */
UNTERM_STRING       (\042[^\042\n]*)
/* A complete string constant; it may not span lines or contain a quote. */
STRING              (\042[^\042\n]*\042)
IDENTIFIER          ([_a-zA-Z][_a-zA-Z0-9]*)
/* '-' is listed first so it is a literal; placed between '*' and '/' it
   would form a character range that also matches '.'. */
OPERATOR            ([-+*/%=,;!<>(){}])
/* Single-line comments run from "//" or "#" to the end of the line. */
SINGLE_COMMENT1     ("//"[^\n]*)
SINGLE_COMMENT2     ("#"[^\n]*)

%%

[\n]                { /* count lines for error reports */ cur_line_num++; }
[ \t\r\a]+          { /* ignore all spaces */ }
{SINGLE_COMMENT1}   { /* skip "//" single-line comment */ }
{SINGLE_COMMENT2}   { /* skip "#" single-line comment */ }

{OPERATOR}          { /* single-character token: its own character code */ return yytext[0]; }

"<="                { /* the longer match wins over {OPERATOR} */ return T_Le; }
">="                { return T_Ge; }
"=="                { return T_Eq; }
"!="                { return T_Ne; }
"&&"                { return T_And; }
"||"                { return T_Or; }
"void"              { /* keywords precede {IDENTIFIER} so they win equal-length ties */ return T_Void; }
"int"               { return T_Int; }
"while"             { return T_While; }
"if"                { return T_If; }
"else"              { return T_Else; }
"return"            { return T_Return; }
"break"             { return T_Break; }
"continue"          { return T_Continue; }
"print"             { return T_Print; }
"readint"           { return T_ReadInt; }

{INTEGER}           { return T_IntConstant; }
{STRING}            { return T_StringConstant; }
{IDENTIFIER}        { return T_Identifier; }

<<EOF>>             { /* 0 ends the token loop in main() */ return 0; }

{UNTERM_STRING}     { lex_error("Unterminated string constant", cur_line_num); }
.                   { lex_error("Unrecognized character", cur_line_num); }

%%

/*
 * Entry point: prints the listing header, then repeatedly asks yylex() for
 * the next token until it returns 0. For every token, the token type is
 * printed first, followed by the matched text on the same line.
 */
int main(int argc, char* argv[]) {
    int token;

    init_scanner();
    for (;;) {
        token = yylex();
        if (token == 0) {
            break;
        }
        print_token(token);
        puts(yytext);
    }
    return 0;
}

/*
 * Prints the two-column header ("TOKEN-TYPE", "TOKEN-VALUE") of the token
 * listing, followed by a separator line.
 */
void init_scanner() {
    /* The 20-column left-justified field lines up with print_token's output.
       Line breaks are written as \n escapes: a raw line break inside a string
       literal does not compile. */
    printf("%-20s%s\n", "TOKEN-TYPE", "TOKEN-VALUE");
    printf("-------------------------------------------------\n");
}

/*
 * Reports a lexical error on standard output.
 *
 * msg:  description of the error.
 * line: line number the error was found on.
 *
 * The message is surrounded by blank lines so it stands out from the token
 * listing. Line breaks are written as \n escapes: a raw line break inside a
 * string literal does not compile.
 */
void lex_error(char* msg, int line) {
    printf("\nError at line %-3d: %s\n\n", line, msg);
}

/*
 * End-of-input hook used by the flex-generated scanner. Always returns 1
 * (non-zero), which tells the scanner there is no further input to process.
 */
int yywrap(void) {
    enum { NO_MORE_INPUT = 1 };

    return NO_MORE_INPUT;
}

token.h:主要是為了列印 token。值大於等於 256 的 token 會在字串表中查找與枚舉類型 enum 對應的名稱後列印;小於 256 的 token 會直接作為字元列印,這裡只可能是單字元運算符。

#ifndef TOKEN_H
#define TOKEN_H

/*
 * Codes for the named tokens: multi-character operators, constants,
 * identifiers and keywords. Numbering starts at 256; single-character
 * operators are returned by the scanner as their own character code instead.
 * The order here must stay in sync with the name table in print_token().
 */
typedef enum {
    /* multi-character operators */
    T_Le = 256,
    T_Ge,
    T_Eq,
    T_Ne,
    T_And,
    T_Or,
    /* constants and identifiers */
    T_IntConstant,
    T_StringConstant,
    T_Identifier,
    /* keywords */
    T_Void,
    T_Int,
    T_While,
    T_If,
    T_Else,
    T_Return,
    T_Break,
    T_Continue,
    T_Print,
    T_ReadInt
} TokenType;

/*
 * Prints the type of a token, left-justified in a 20-column field.
 *
 * token: a token code. Values below 256 are printed as a single character;
 *        values from 256 upward are looked up in the name table. A value
 *        past the end of the table prints "<unknown>" instead of reading
 *        outside the array.
 */
static void print_token(int token) {
    /* Names indexed by (token - 256); the order must match the TokenType enum. */
    static const char *const token_strs[] = {
        "T_Le", "T_Ge", "T_Eq", "T_Ne", "T_And", "T_Or", "T_IntConstant",
        "T_StringConstant", "T_Identifier", "T_Void", "T_Int", "T_While",
        "T_If", "T_Else", "T_Return", "T_Break", "T_Continue", "T_Print",
        "T_ReadInt"
    };
    const int first_named = 256;
    const int named_count = (int)(sizeof token_strs / sizeof token_strs[0]);

    if (token < first_named) {
        printf("%-20c", token);
    } else if (token - first_named < named_count) {
        printf("%-20s", token_strs[token - first_named]);
    } else {
        printf("%-20s", "<unknown>");
    }
}

#endif

makefile:

# Default target: build the scanner executable.
.PHONY: out
out: scanner

# token.h is listed so that editing it triggers a rebuild; only the first
# prerequisite ($<, i.e. lex.yy.c) is passed to the compiler.
# Recipe lines must begin with a TAB character.
scanner: lex.yy.c token.h
	gcc -o $@ $<

# Generate the scanner source from the flex specification.
lex.yy.c: scanner.l
	flex $<

運行結果:右邊的tiny-c文本變成了左邊的token流 ~

參考文獻:自己動手寫編譯器(pandolia.net/tinyc/)

推薦閱讀:

相关文章