设计并实现一个包含预处理功能的词法分析程序,加深对编译中词法分析过程的理解。
源程序中可能包含有对程序执行无意义的符号,要求将其剔除。
首先编制一个源程序的输入过程,从键盘、文件或文本框输入若干行语句,依次存入输入缓冲区(字符型数据);然后编制一个预处理子程序,去掉输入串中的回车符、换行符和跳格符等编辑性文字;把多个空白符合并为一个;去掉注释。
输入:所给文法的源程序字符串。
输出:二元组(syn,token或sum)构成的序列。其中,
syn为单词种别码。
Token为存放的单词自身字符串。
Sum为整型常量。
具体实现时,可以将单词的二元组用结构进行处理。
1)关键字
main if then while do static int double struct break else long switch case typedef char return const float short continue for void default sizeof do
所有的关键字都是小写。
2)运算符和界符
+ - * / : := < <> <= > >= = ; ( ) #
3)其他标记ID和NUM
通过以下正规式定义其他标记:
ID→letter(letter|digit)*
NUM→digit digit*
letter→a|…|z|A|…|Z
digit→0|…|9…
4)空格由空白、制表符和换行符组成
空格一般用来分隔ID、NUM、专用符号和关键字,词法分析阶段通常被忽略。
题目提供的种别码有部分错误,实际要求是可自行定义,这里提供一份参考:
keyword = {'main':1,'if':2,'then':3,'while':4,'do':5,'static':6,'int':7,'double':8,'struct':9,'break':10,'else':11,'long':12,'switch':13,'case':14,'typedef':15,'char':16,'return':17,'const':18,'float':19,'short':20,'continue':21,'for':22,'void':23,'ID':25,'NUM':26,'default':39,'sizeof':24,'stdio.h':40,'include':44,'scanf':48,'printf':49}
operator = {'+':27,'-':28,'*':29,'/':30,':':31,':=':32, '<':33,'<>':34,'<=':35,'>':36,'>=':37,'=':38,';':41,'(':42,')':43,'#':0,'{':46,'}':47}
算法的基本任务是从字符串表示的源程序中识别出具有独立意义的单词符号,其基本思想是根据扫描到的单词符号的第一个字符的种类,拼出相应的单词符号。
代码:
import re
def pre(file):data = file.read()out =[]data = re.sub(re.compile('/\*{1,2}[\s\S]*?\*/'),"",data)data = re.sub(re.compile('//[\s\S]*?\n'), "", data)data = data.split("\n")for i in data:i = i.strip(' ').replace('\t', '')i = ' '.join(str(i).split())if(i!=''):out.append(i)return out
keyword = {'main':1,'if':2,'then':3,'while':4,'do':5,'static':6,'int':7,'double':8,'struct':9,'break':10,'else':11,'long':12,'switch':13,'case':14,'typedef':15,'char':16,'return':17,'const':18,'float':19,'short':20,'continue':21,'for':22,'void':23,'ID':25,'NUM':26,'default':39,'sizeof':24,'stdio.h':40,'include':44,'scanf':48,'printf':49}
operator = {'+':27,'-':28,'*':29,'/':30,':':31,':=':32, '<':33,'<>':34,'<=':35,'>':36,'>=':37,'=':38,';':41,'(':42,')':43,'#':0,'{':46,'}':47}
with open('test.txt', 'r') as file:data = pre(file)#print(data)for i in range(len(data)):pattern1 = re.compile('[a-zA-Z.0-9]+')line = re.findall(pattern1,data[i])for j in line:if j in keyword:print(j+' -> '+str(keyword[j]))elif str(j).isdigit():print("'"+str(j)+"' -> 26")else:j = str(j).strip('.')print("'"+j+"' -> 25")line2 = re.sub(pattern1," ",data[i])line2=line2.split(" ")for j in line2:if j in operator:print(j+' -> '+str(operator[j]))
测试文本:
#include
struct student
{int id;long int counts;/*asfagsaf*//* data */
};
student stu[2000000];
int main(){for(long int i=0;i<2000000;i++){stu[i].id=i;stu[i].counts=0;}long int n,m;int a;scanf("%d",&n);for(long int i=0;iscanf("%ld",&a);stu[a].counts++;}scanf("%ld",&m);for(long int i=0;iscanf("%d",&a);if(stu[a].counts==0){printf("NO\n");}else{printf("YES\n");}}return 0;
}
输出:
include -> 44
stdio.h -> 40
# -> 0
< -> 33
> -> 36
struct -> 9
'student' -> 25
{ -> 46
int -> 7
'id' -> 25
; -> 41
long -> 12
int -> 7
'counts' -> 25
; -> 41
'student' -> 25
'stu' -> 25
'2000000' -> 26
int -> 7
main -> 1
for -> 22
long -> 12
int -> 7
'i' -> 25
'0' -> 26
'i' -> 25
'2000000' -> 26
'i' -> 25
( -> 42
= -> 38
; -> 41
< -> 33
; -> 41
'stu' -> 25
'i' -> 25
'id' -> 25
'i' -> 25
= -> 38
; -> 41
'stu' -> 25
'i' -> 25
'counts' -> 25
'0' -> 26
= -> 38
; -> 41
} -> 47
long -> 12
int -> 7
'n' -> 25
'm' -> 25
; -> 41
int -> 7
'a' -> 25
; -> 41
scanf -> 48
'd' -> 25
'n' -> 25
for -> 22
long -> 12
int -> 7
'i' -> 25
'0' -> 26
'i' -> 25
'n' -> 25
'i' -> 25
( -> 42
= -> 38
; -> 41
< -> 33
; -> 41
scanf -> 48
'ld' -> 25
'a' -> 25
'stu' -> 25
'a' -> 25
'counts' -> 25
} -> 47
scanf -> 48
'ld' -> 25
'm' -> 25
for -> 22
long -> 12
int -> 7
'i' -> 25
'0' -> 26
'i' -> 25
'm' -> 25
'i' -> 25
( -> 42
= -> 38
; -> 41
< -> 33
; -> 41
scanf -> 48
'd' -> 25
'a' -> 25
if -> 2
'stu' -> 25
'a' -> 25
'counts' -> 25
'0' -> 26
( -> 42
printf -> 49
'NO' -> 25
'n' -> 25
} -> 47
else -> 11
{ -> 46
printf -> 49
'YES' -> 25
'n' -> 25
} -> 47
} -> 47
return -> 17
'0' -> 26
; -> 41
} -> 47
注意要点:
[a-zA-Z.0-9]+
,这里会出现一个小bug,算是逻辑漏洞,就是面向比如stu[i].counts=0;
这样的语句,并没有根据.
把语句前后分开,而是把.
合并到了.counts
里。解决方法是在输出的时候把.
去掉。/\*{1,2}[\s\S]*?\*/
,目的是去除多行注释,//[\s\S]*?\n
目的是去除单行注释。