diff --git a/.gitignore b/.gitignore index 378eac25d311703f3f2cd456d8036da525cd0366..716b26db670af398feb13d713951f1c22747eadc 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,4 @@ build +Documentations/1-parser/*.pdf +compile_commands.json +.cache diff --git a/Documentations/1-parser/Basics.md b/Documentations/1-parser/Basics.md index 21a87e21b5cf695a4851691401e403b0d3e5e63e..8100f1fb790026fdb16fcb7d5900b271964b538c 100644 --- a/Documentations/1-parser/Basics.md +++ b/Documentations/1-parser/Basics.md @@ -7,19 +7,19 @@ `Cminus`是C语言的一个子集,该语言的语法在《编译原理与实践》第九章附录中有详细的介绍。而`Cminus-f`则是在`Cminus`上追加了浮点操作。 1. 关键字 - + ```c else if int return void while float ``` 2. 专用符号 - + ```c + - * / < <= > >= == != = ; , ( ) [ ] { } /* */ ``` 3. 标识符ID和整数NUM,通过下列正则表达式定义: - + ```c letter = a|...|z|A|...|Z digit = 0|...|9 @@ -29,7 +29,7 @@ ``` 4. 注释用`/*...*/`表示,可以超过一行。注释不能嵌套。 - + ```c /*...*/ ``` @@ -41,40 +41,40 @@ 我们将 Cminus-f 的所有规则分为五类。 1. 字面量、关键字、运算符与标识符 - - `type-specifier` - - `relop` - - `addop` - - `mulop` + - `type-specifier` + - `relop` + - `addop` + - `mulop` 2. 声明 - - `declaration-list` - - `declaration` - - `var-declaration` - - `fun-declaration` - - `local-declarations` + - `declaration-list` + - `declaration` + - `var-declaration` + - `fun-declaration` + - `local-declarations` 3. 语句 - - `compound-stmt` - - `statement-list` - - `statement` - - `expression-stmt` - - `iteration-stmt` - - `selection-stmt` - - `return-stmt` + - `compound-stmt` + - `statement-list` + - `statement` + - `expression-stmt` + - `iteration-stmt` + - `selection-stmt` + - `return-stmt` 4. 表达式 - - `expression` - - `simple-expression` - - `var` - - `additive-expression` - - `term` - - `factor` - - `integer` - - `float` - - `call` + - `expression` + - `simple-expression` + - `var` + - `additive-expression` + - `term` + - `factor` + - `integer` + - `float` + - `call` 5. 
其他 - - `params` - - `param-list` - - `param` - - `args` - - `arg-list` + - `params` + - `param-list` + - `param` + - `args` + - `arg-list` 起始符号是 `program`。文法中用到的 token 均以下划线和粗体标出。 @@ -127,8 +127,6 @@ # >>>>>>>>>>>>>>>>>>token stream>>>>>>>>>>>>>>>>>>>>>>>>>>>> ``` - - 我们以一个简单的单词数量统计的程序`wc.l`为详细介绍下`Flex`的功能和用法(请仔细看程序中的注释内容): ```c @@ -223,7 +221,7 @@ void yyerror(const char *s); reimu : marisa { /* 这里写与该规则对应的处理代码 */ puts("rule1"); } | REIMU { /* 这里写与该规则对应的处理代码 */ puts("rule2"); } ; /* 规则最后不要忘了用分号结束哦~ */ - + /* 这种写法表示 ε —— 空输入 */ marisa : { puts("Hello!"); } @@ -255,13 +253,13 @@ int main(void) ``` 另外有一些值得注意的点: + 1. Bison 传统上将 token 用大写单词表示,将 symbol 用小写字母表示。 2. Bison 能且只能生成解析器源代码(一个 `.c` 文件),并且入口是 `yyparse`,所以为了让程序能跑起来,你需要手动提供 `main` 函数。 3. Bison 不能检测你的 action code 是否正确——它只能检测文法的部分错误,其他代码都是原样粘贴到 `.c` 文件中。 4. Bison 需要你提供一个 `yylex` 来获取下一个 token。 5. Bison 需要你提供一个 `yyerror` 来提供合适的报错机制。 - 另外,上面这个 `.y` 是可以工作的——尽管它只能接受两个字符串。把上面这段代码保存为 `reimu.y`,执行如下命令来构建这个程序: ```shell @@ -419,7 +417,7 @@ $ ./calc 下面详细讲解上面新出现的各种构造。 * `YYSTYPE`: 在 bison 解析过程中,每个 symbol 最终都对应到一个语义值上。或者说,在 parse tree 上,每个节点都对应一个语义值,这个值的类型是 `YYSTYPE`。`YYSTYPE` 的具体内容是由 `%union` 构造指出的。上面的例子中, - + ```c %union { char op; @@ -439,7 +437,7 @@ $ ./calc 使用 `union` 是因为不同节点可能需要不同类型的语义值。比如,上面的例子中,我们希望 `ADDOP` 的值是 `char` 类型,而 `NUMBER` 应该是 `double` 类型的。 * `$$` 和 `$1`, `$2`, `$3`, ...:现在我们来看如何从已有的值推出当前节点归约后应有的值。以加法为例: - + ```c term : term ADDOP factor { @@ -451,9 +449,9 @@ $ ./calc ``` 其实很好理解。当前节点使用 `$$` 代表,而已解析的节点则是从左到右依次编号,称作 `$1`, `$2`, `$3`... 
- -* `%type <>` 和 `%token <>`:注意,我们上面没有写 `$1.num` 或者 `$2.op` ,那么 bison 是怎么知道应该用 `union` 的哪部分值的呢?其秘诀就在文件一开始的 `%type` 和 `%token` 上: +* `%type <>` 和 `%token <>`:注意,我们上面没有写 `$1.num` 或者 `$2.op` ,那么 bison 是怎么知道应该用 `union` 的哪部分值的呢?其秘诀就在文件一开始的 `%type` 和 `%token` 上: + 例如,`term` 应该使用 `num` 部分,那么我们就写 ```c diff --git a/Documentations/1-parser/Flex-matching.md b/Documentations/1-parser/Flex-matching.md index 39970be8a3dbeeb0bd3a5ba82b69f7d48ae665c6..cc9169ab23adbe71f18f6167d0c0c3e5ddebb2b1 100644 --- a/Documentations/1-parser/Flex-matching.md +++ b/Documentations/1-parser/Flex-matching.md @@ -13,16 +13,10 @@ Note: if there is any discrepancy, please refer to `The flex Manual`. ************************** - - When the generated scanner is run, it analyzes its input looking for strings which match any of its patterns. If it finds more than one match, it takes the one matching the most text (for trailing context rules, this includes the length of the trailing part, even though it will then be returned to the input). If it finds two or more matches of the same length, the rule listed first in the `flex` input file is chosen. - - Once the match is determined, the text corresponding to the match (called the "token") is made available in the global character pointer `yytext`, and its length in the global integer `yyleng`. The "action" corresponding to the matched pattern is then executed, and then the remaining input is scanned for another match. - - If no match is found, then the "default rule" is executed: the next character in the input is considered matched and copied to the standard output. Thus, the simplest valid `flex` input is: ```c @@ -31,12 +25,8 @@ If no match is found, then the "default rule" is executed: the next character in which generates a scanner that simply copies its input (one character at a time) to its output. - - Note that `yytext` can be defined in two different ways: either as a character _pointer_ or as a character _array_. 
You can control which definition `flex` uses by including one of the special directives `%pointer` or `%array` in the first (definitions) section of your flex input. The default is `%pointer`, unless you use the `-l` lex compatibility option, in which case `yytext` will be an array. The advantage of using `%pointer` is substantially faster scanning and no buffer overflow when matching very large tokens (unless you run out of dynamic memory). The disadvantage is that you are restricted in how your actions can modify `yytext`, and calls to the `unput()` function destroys the present contents of `yytext`, which can be a considerable porting headache when moving between different `lex` versions. - - The advantage of `%array` is that you can then modify `yytext` to your heart‘s content, and calls to `unput()` do not destroy `yytext`. Furthermore, existing `lex` programs sometimes access `yytext` externally using declarations of the form: ```c @@ -45,10 +35,6 @@ The advantage of `%array` is that you can then modify `yytext` to your heart‘s This definition is erroneous when used with `%pointer`, but correct for `%array`. - - The `%array` declaration defines `yytext` to be an array of `YYLMAX` characters, which defaults to a fairly large value. You can change the size by simply #define'ing `YYLMAX` to a different value in the first section of your `flex` input. As mentioned above, with `%pointer` yytext grows dynamically to accommodate large tokens. While this means your `%pointer` scanner can accommodate very large tokens (such as matching entire blocks of comments), bear in mind that each time the scanner must resize `yytext` it also must rescan the entire token from the beginning, so matching such tokens can prove slow. `yytext` presently does _not_ dynamically grow if a call to `unput()` results in too much text being pushed back; instead, a run-time error results. 
- - Also note that you cannot use `%array` with C++ scanner classes diff --git a/Documentations/1-parser/Flex-regular-expressions.md b/Documentations/1-parser/Flex-regular-expressions.md index 113afac51237fbddc5e866b794ca403e5d9ec9be..4ded9130196464aea8242a6c30838d32fed44851 100644 --- a/Documentations/1-parser/Flex-regular-expressions.md +++ b/Documentations/1-parser/Flex-regular-expressions.md @@ -16,189 +16,181 @@ Note: if there is any discrepancy, please refer to `The flex Manual`. The patterns in the input are written using an extended set of regular expressions. These are: * `x` - + match the character `x` * `.` - + any character (byte) except newline * `[xyz]` - + a "character class"; in this case, the pattern matches either an `x`, a `y`, or a `z` * `[abj-oZ]` - + a "character class" with a range in it; matches an `a`, a `b`, any letter from `j` through `o`, or a `Z` * `[^A-Z]` - + a "negated character class", i.e., any character but those in the class. In this case, any character EXCEPT an uppercase letter. * `[^A-Z\n]` - + any character EXCEPT an uppercase letter or a newline * `[a-z]{-}[aeiou]` - + the lowercase consonants * `r*` - + zero or more r`s, where r is any regular expression * `r+` - + one or more r`s * `r?` - + zero or one r`s (that is, "an optional r") * `r{2,5}` - + anywhere from two to five `r` * `r{2,}` - + two or more `r` * `r{4}` - + exactly 4 `r` - * `"[xyz]\"foo"` - + the literal string: `[xyz]"foo` * `\X` - + if X is `a`, `b`, `f`, `n`, `r`, `t`, or `v`, then the ANSI-C interpretation of `\x`. Otherwise, a literal `X` (used to escape operators such as `*`) - * `\0` - + a NUL character (ASCII code 0) * `\123` - + the character with octal value 123 * `\x2a` - + the character with hexadecimal value 2a * `(r)` - + match an `r`; parentheses are used to override precedence (see below) * `(?r-s:pattern)` - + apply option `r` and omit option `s` while interpreting pattern. Options may be zero or more of the characters `i`, `s`, or `x`. 
- + `i` means case-insensitive. `-i` means case-sensitive. - + `s` alters the meaning of the `.` syntax to match any single byte whatsoever. `-s` alters the meaning of `.` to match any byte except `\n`. - + `x` ignores comments and whitespace in patterns. Whitespace is ignored unless it is backslash-escaped, contained within `""`s, or appears inside a character class. - + The following are all valid: - - * `(?:foo)` same as `(foo)` - - * `(?i:ab7)` same as `([aA][bB]7)` - - * `(?-i:ab)` same as `(ab)` - - * `(?s:.) ` same as `[\x00-\xFF]` - - * `(?-s:.)` same as `[^\n]` - - * `(?ix-s: a . b)` same as `([Aa][^\n][bB])` - - * `(?x:a b)` same as `("ab")` - - * `(?x:a\ b)` same as `("a b")` - - * `(?x:a" "b) ` same as` ("a b")` - - * `(?x:a[ ]b) ` same as `("a b")` - - * ```shell - (?x:a - /* comment */ - b - c) - ``` - - same as `(abc)` + + * `(?:foo)` same as `(foo)` + + * `(?i:ab7)` same as `([aA][bB]7)` + + * `(?-i:ab)` same as `(ab)` + + * `(?s:.) ` same as `[\x00-\xFF]` + + * `(?-s:.)` same as `[^\n]` + + * `(?ix-s: a . b)` same as `([Aa][^\n][bB])` + + * `(?x:a b)` same as `("ab")` + + * `(?x:a\ b)` same as `("a b")` + + * `(?x:a" "b) ` same as` ("a b")` + + * `(?x:a[ ]b) ` same as `("a b")` + + * ```shell + (?x:a + /* comment */ + b + c) + ``` + + same as `(abc)` * `(?# comment )` - + omit everything within `()`. The first `)` character encountered ends the pattern. It is not possible to for the comment to contain a `)` character. The comment may span lines. - * `rs` - + the regular expression `r` followed by the regular expression `s`; called "concatenation" - * `r|s` - + either an `r` or an `s` * `r/s` - + an `r` but only if it is followed by an `s`. The text matched by `s` is included when determining whether this rule is the longest match, but is then returned to the input before the action is executed. So the action only sees the text matched by `r`. This type of pattern is called "trailing context". 
(There are some combinations of `r/s` that flex cannot match correctly.) - * `^r` - + an `r`, but only at the beginning of a line (i.e., when just starting to scan, or right after a newline has been scanned). - * `r$` - + an `r`, but only at the end of a line (i.e., just before a newline). Equivalent to `r/\n`. - + Note that `flex`'s notion of "newline" is exactly whatever the C compiler used to compile `flex` interprets `\n` as; in particular, on some DOS systems you must either filter out `\r`s in the input yourself, or explicitly use `r/\r\n` for `r$`. * `<s>r` - + an `r`, but only in start condition `s`. * `<s1,s2,s3>r` - + same, but in any of start conditions `s1`, `s2`, or `s3`. * `<*>r` - + an `r` in any start condition, even an exclusive one. * `<<EOF>>` - + an end-of-file. * `<s1,s2><<EOF>>` - + an end-of-file when in start condition `s1` or `s2` - - Note that inside of a character class, all regular expression operators lose their special meaning except escape (`\`) and the character class operators, `-`, `]]`, and, at the beginning of the class, `^`. The regular expressions listed above are grouped according to precedence, from highest precedence at the top to lowest at the bottom. Those grouped together have equal precedence (see special note on the precedence of the repeat operator, `{}`, under the documentation for the `--posix` POSIX compliance option). For example, -​ `foo|bar*` is the same as `(foo)|(ba(r*))` +​ `foo|bar*` is the same as `(foo)|(ba(r*))` Since the `*` operator has higher precedence than concatenation, and concatenation higher than alternation (`|`). This pattern therefore matches _either_ the string `foo` _or_ the string `ba` followed by zero-or-more `r`'s.
To match `foo` or zero-or-more repetitions of the string `bar`, use: -​ ` foo|(bar)*` +​ ` foo|(bar)*` And to match a sequence of zero or more repetitions of `foo` and`bar`: -​ `(foo|bar)*` +​ `(foo|bar)*` In addition to characters and ranges of characters, character classes can also contain "character class expressions". These are expressions enclosed inside `[:` and `:]` delimiters (which themselves must appear between the `[` and `]` of the character class. Other elements may occur inside the character class, too). The valid expressions are: @@ -219,91 +211,61 @@ For example, the following character classes are all equivalent: * `[[:alpha:][0-9]]` * `[a-zA-Z0-9]` - - A word of caution. Character classes are expanded immediately when seen in the `flex` input. This means the character classes are sensitive to the locale in which `flex` is executed, and the resulting scanner will not be sensitive to the runtime locale. This may or may not be desirable. * If your scanner is case-insensitive (the `-i` flag), then - + `[:upper:]` and `[:lower:]` are equivalent to `[:alpha:]`. - - * Character classes with ranges, such as `[a-Z]`, should be used with caution in a case-insensitive scanner if the range spans upper or lowercase characters. Flex does not know if you want to fold all upper and lowercase characters together, or if you want the literal numeric range specified (with no case folding). When in doubt, flex will assume that you meant the literal numeric range, and will issue a warning. The exception to this rule is a character range such as `[a-z]` or `[S-W]` where it is obvious that you want case-folding to occur. 
Here are some examples with the `-i` flag enabled: - - | Range | Result | Literal Range | Alternate Range | - | ------- | --------- | ------------------ | ------------------- | - | `[a-t]` | ok | `[a-tA-T]` | | - | `[A-T]` | ok | `[a-tA-T]` | | - | `[A-t]` | ambiguous | `[A-Z\[\\\]_'a-t]` | `[a-tA-T]` | - | `[_-{]` | ambiguous | `[_'a-z{]` | `[_'a-zA-Z{]` | - | `[@-C]` | ambiguous | `[@ABC]` | `[@A-Z\[\\\]_'abc]` | - - + + | Range | Result | Literal Range | Alternate Range | + | ------- | --------- | ------------------ | ------------------- | + | `[a-t]` | ok | `[a-tA-T]` | | + | `[A-T]` | ok | `[a-tA-T]` | | + | `[A-t]` | ambiguous | `[A-Z\[\\\]_'a-t]` | `[a-tA-T]` | + | `[_-{]` | ambiguous | `[_'a-z{]` | `[_'a-zA-Z{]` | + | `[@-C]` | ambiguous | `[@ABC]` | `[@A-Z\[\\\]_'abc]` | * A negated character class such as the example `[^A-Z]` above _will_ match a newline unless `\n` (or an equivalent escape sequence) is one of the characters explicitly present in the negated character class (e.g., `[^A-Z\n]`). This is unlike how many other regular expression tools treat negated character classes, but unfortunately the inconsistency is historically entrenched. Matching newlines means that a pattern like `[^"]*` can match the entire input unless there`s another quote in the input. - + Flex allows negation of character class expressions by prepending `^` to the POSIX character class name. - + `[:^alnum:]` `[:^alpha:]` `[:^blank:]` - + `[:^cntrl:]` `[:^digit:]` `[:^graph:]` - + `[:^lower:]` `[:^print:]` `[:^punct:]` - + `[:^space:]` `[:^upper:]` `[:^xdigit:]` - + Flex will issue a warning if the expressions `[:^upper:]` and`[:^lower:]` appear in a case-insensitive scanner, since their meaning is unclear. The current behavior is to skip them entirely, but this may change without notice in future revisions of flex. - - * The `{-}` operator computes the difference of two character classes. 
For example, `[a-c]{-}[b-z]` represents all the characters in the class `[a-c]` that are not in the class `[b-z]` (which in this case, is just the single character `a`). The `{-}` operator is left associative, so `[abc]{-}[b]{-}[c]` is the same as `[a]`. Be careful not to accidentally create an empty set, which will never match. - - - * The `{+}` operator computes the union of two character classes. For example, `[a-z]{+}[0-9]` is the same as `[a-z0-9]`. This operator is useful when preceded by the result of a difference operation, as in, `[[:alpha:]]{-}[[:lower:]]{+}[q]`, which is equivalent to `[A-Zq]` in the "C" locale. - - * A rule can have at most one instance of trailing context (the `/` operator or the `$` operator). The start condition, `^`, and `<<EOF>>` patterns can only occur at the beginning of a pattern, and, as well as with `/` and `$`, cannot be grouped inside parentheses. A `^` which does not occur at the beginning of a rule or a `$` which does not occur at the end of a rule loses its special properties and is treated as a normal character. - - * The following are invalid: - + `foo/bar$` - + `<sc1>foo<sc2>bar` - - Note that the first of these can be written `foo/bar\n`. - + Note that the first of these can be written `foo/bar\n`. * The following will result in `$` or `^` being treated as a normal character: - + `foo|(bar$)` - + `foo|^bar` - + If the desired meaning is a `foo` or a `bar`-followed-by-a-newline, the following could be used (the special `|` action is explained below): - + ```shell foo | bar$ /* action goes here */ ``` - + A similar trick will work for matching a `foo` or a `bar`-at-the-beginning-of-a-line.
- - - - - - - - - - - - - diff --git a/Documentations/1-parser/Parser-FurtherReadings.md b/Documentations/1-parser/Parser-FurtherReadings.md index 63af1c256caed296d44c72507dda8b098dbf6c6d..fbf677b54c633f25abfec93cc1b7dc7afdc92923 100644 --- a/Documentations/1-parser/Parser-FurtherReadings.md +++ b/Documentations/1-parser/Parser-FurtherReadings.md @@ -59,6 +59,7 @@ identifier("abc123") ==> (Some("abc123"), "") 这里返回的是字符串 "abc1 2. `or(p,q)`: 表示首先尝试 p,如果成功则返回结果,否则接着尝试 q,否则失败。 那么就可以定义 + ``` factor = or( seq(number,identifier).map { Expr.Mul(Expr.Const(#1), Expr.Val(#2)) }, @@ -158,13 +159,13 @@ var var = 1; // 旧版报错 考虑如下的例子 -| a₁ | a₂ | a₃ | -| ----- | ----- | ----- | -| a | ab | bba | +| a₁ | a₂ | a₃ | +| --- | --- | --- | +| a | ab | bba | -| b₁ | b₂ | b₃ | -| ----- | ----- | ----- | -| baa | aa | bb | +| b₁ | b₂ | b₃ | +| --- | --- | --- | +| baa | aa | bb | 对这组输入来说,这个问题是有解的,因为 a₃a₂a₃a₁ = b₃b₂b₃b₁。 @@ -172,11 +173,9 @@ var var = 1; // 旧版报错 **然而**,这个问题是不可能机械求解的!不可能写出一个程序来判定这个问题。事实上,不可判定问题无处不在,[莱斯定理](https://en.wikipedia.org/wiki/Rice%27s_theorem)告诉我们,任何non-trivial程序的属性都是不可判定的。 - ## 在线解析 学到这里,虽说大家已经可以写 parser 了,但是这在工程实践上却还不够。比如说,IDE 为了提供准确的实时报错、自动补全、代码缩进,都需要在用户编辑代码时立即提供语法树。仅仅利用 lab2 这种简单的离线解析器是完全不能满足使用的。在编辑代码时,大部分时间代码都是语法甚至词法不正确的,必须考虑到各种错误情形,并保证不会搞乱代码。此外,在提供自动缩进时,后方的错误不应该影响到前方代码的缩进。还有一个问题是,离线解析需要从头构建语法树,代价较高。受到这种“在线解析”需求的启发,涌现了不少很有实用价值的工作,比如: 1. [tree-sitter](https://github.com/tree-sitter/tree-sitter): incremental parser 框架,总是在内存中维护完整的语法树。 2. 
[Auto-indentation with incomplete information](https://arxiv.org/ftp/arxiv/papers/2006/2006.03103.pdf): 基于 Operator precedence parser 的用于代码缩进的框架,支持局部前向解析。尽管并不维护完整的语法树,但由于每次解析量很少,所以速度足够快。 - diff --git a/Documentations/1-parser/README.md b/Documentations/1-parser/README.md index eb37b32b125a3fc3aaff2ded0a4bbe00f726850f..191b4c7ec602a68c62352a9f893c0867ad79142a 100644 --- a/Documentations/1-parser/README.md +++ b/Documentations/1-parser/README.md @@ -47,6 +47,7 @@ Token Text Line Column (Start,End) 具体的需识别token请参考[基础知识](./Basics.md)。 提示: + 1. 在编写本部分前,需要首先修改 `.y` 文件。具体怎么做请参见[基础知识](./Basics.md)。 2. 在进入实验下一部分前,你可以使用我们提供的 `lexer` 程序进行调试。参见本文档 3.2 节。 3. token编号是自动生成的,`make` 后,可在 `build/syntax_analyzer.h` 中找到。每次修改token后,都应该重新 `make` 后再进行对照。 @@ -61,7 +62,7 @@ Token Text Line Column (Start,End) ```c int main(void) { - return 0; + return 0; } ``` @@ -155,7 +156,7 @@ int main(void) { 项目构建使用 `cmake` 进行。 * 编译 - + ```sh $ cd 2022fall-Compiler_CMinus $ mkdir build @@ -163,23 +164,24 @@ int main(void) { $ cmake .. $ make ``` - + 如果构建成功,会在该目录下看到 `lexer` 和 `parser` 两个可执行文件。 - * `lexer`用于词法分析,产生token stream;对于词法分析结果,我们不做考察 - * `parser`用于语法分析,产生语法树。 + + * `lexer`用于词法分析,产生token stream;对于词法分析结果,我们不做考察 + * `parser`用于语法分析,产生语法树。 * 运行 - + ```sh $ cd 2022fall-Compiler_CMinus # 词法测试 $ ./build/lexer ./tests/parser/normal/local-decl.cminus - Token Text Line Column (Start,End) - 280 int 0 (0,3) - 284 main 0 (4,8) - 272 ( 0 (8,9) - 282 void 0 (9,13) - 273 ) 0 (13,14) + Token Text Line Column (Start,End) + 280 int 0 (0,3) + 284 main 0 (4,8) + 272 ( 0 (8,9) + 282 void 0 (9,13) + 273 ) 0 (13,14) ... 
# 语法测试 $ ./build/parser ./tests/parser/normal/local-decl.cminus @@ -191,7 +193,7 @@ int main(void) { * 验证 可以使用 `diff` 与标准输出进行比较。 - + ```sh $ cd 2022fall-Compiler_CMinus $ export PATH="$(realpath ./build):$PATH" @@ -201,9 +203,9 @@ int main(void) { $ diff output.easy/expr.cminus syntree_easy_std/expr.syntax_tree [输出为空,代表没有区别,该测试通过] ``` - + 我们提供了 `test_syntax.sh` 脚本进行快速批量测试。该脚本的第一个参数是 `easy` `normal` `hard` 等难度,并且有第二个可选参数,用于进行批量 `diff` 比较。脚本运行后会产生名如 `syntree_easy` 的文件夹。 - + ```sh $ ./test_syntax.sh easy [info] Analyzing FAIL_id.cminus @@ -221,59 +223,63 @@ int main(void) { * 提交要求 本实验的提交要求分为两部分:实验部分的文件和报告,git提交的规范性。 - - * 实验部分 - * 需要完善 `src/parser/lexical_analyzer.l` 和 `src/parser/syntax_analyzer.y`。 - * 需要在 `Reports/1-parser/README.md` 中撰写实验报告。 - * 实验报告内容包括 - * 实验要求、实验难点、实验设计、实验结果验证、实验反馈 - - * git 提交规范 - * 不破坏目录结构(实验报告所需图片放在目录中) - * 不上传临时文件(凡是可以自动生成的都不要上传,如 `build` 目录、测试时自动生成的输出、`.DS_Store` 等) - * git log 言之有物 + + * 实验部分 + + * 需要完善 `src/parser/lexical_analyzer.l` 和 `src/parser/syntax_analyzer.y`。 + * 需要在 `Reports/1-parser/README.md` 中撰写实验报告。 + * 实验报告内容包括 + * 实验要求、实验难点、实验设计、实验结果验证、实验反馈 + + * git 提交规范 + + * 不破坏目录结构(实验报告所需图片放在目录中) + * 不上传临时文件(凡是可以自动生成的都不要上传,如 `build` 目录、测试时自动生成的输出、`.DS_Store` 等) + * git log 言之有物 * 提交方式: - * 代码提交:本次实验需要在希冀课程平台上发布的作业[Lab1-代码提交](http://cscourse.ustc.edu.cn/assignment/index.jsp?courseID=17&assignID=54)提交自己仓库的 gitlab 链接(注:由于平台限制,请提交http协议格式的仓库链接。例:学号为 PB011001 的同学,Lab1 的实验仓库地址为`http://202.38.79.174/PB011001/2022fall-compiler_cminus.git`),我们会收集最后一次提交的评测分数,作为最终代码得分。 + * 代码提交:本次实验需要在希冀课程平台上发布的作业[Lab1-代码提交](http://cscourse.ustc.edu.cn/assignment/index.jsp?courseID=17&assignID=54)提交自己仓库的 gitlab 链接(注:由于平台限制,请提交http协议格式的仓库链接。例:学号为 PB011001 的同学,Lab1 的实验仓库地址为`http://202.38.79.174/PB011001/2022fall-compiler_cminus.git`),我们会收集最后一次提交的评测分数,作为最终代码得分。 - * 实验评测 - * 除已提供的 easy, normal, hard 数据集之外,平台会使用额外的隐藏测试用例进行测试。 - - * 报告提交:将 `Reports/1-parser/README.md` 导出成 pdf 文件单独提交到[Lab1-报告提交](http://cscourse.ustc.edu.cn/assignment/index.jsp?courseID=17&assignID=54)。 
+ * 实验评测 + + * 除已提供的 easy, normal, hard 数据集之外,平台会使用额外的隐藏测试用例进行测试。 + + * 报告提交:将 `Reports/1-parser/README.md` 导出成 pdf 文件单独提交到[Lab1-报告提交](http://cscourse.ustc.edu.cn/assignment/index.jsp?courseID=17&assignID=54)。 - * 提交异常:如果遇到在平台上提交异常的问题,请通过邮件联系助教,助教将收取截止日期之前,学生在 gitlab 仓库最近一次 commit 内容进行评测。 + * 提交异常:如果遇到在平台上提交异常的问题,请通过邮件联系助教,助教将收取截止日期之前,学生在 gitlab 仓库最近一次 commit 内容进行评测。 * 迟交规定 - - * Soft Deadline:2022-09-30 23:59:59 (UTC+8) - - * Hard Deadline:2022-10-07 23:59:59 (UTC+8) - - * 补交请邮件提醒 TA: - - * 邮箱:`zhenghy22@mail.ustc.edu.cn` 抄送 `chen16614@mail.ustc.edu.cn` - * 邮件主题:lab1迟交-学号 - * 内容:迟交原因、最后版本commitID、迟交时间 - - * 迟交分数 - - * x 为相对 Soft Deadline 迟交天数,grade 满分 100 - - ``` - final_grade = grade, x = 0 - final_grade = grade * (0.9)^x, 0 < x <= 7 - final_grade = 0, x > 7 - ``` + + * Soft Deadline:2022-09-30 23:59:59 (UTC+8) + + * Hard Deadline:2022-10-07 23:59:59 (UTC+8) + + * 补交请邮件提醒 TA: + + * 邮箱:`zhenghy22@mail.ustc.edu.cn` 抄送 `chen16614@mail.ustc.edu.cn` + * 邮件主题:lab1迟交-学号 + * 内容:迟交原因、最后版本commitID、迟交时间 + + * 迟交分数 + + * x 为相对 Soft Deadline 迟交天数,grade 满分 100 + + ``` + final_grade = grade, x = 0 + final_grade = grade * (0.9)^x, 0 < x <= 7 + final_grade = 0, x > 7 + ``` * 评分标准: 实验一最终分数组成如下: - * 平台测试得分:(70分) - * 实验报告得分:(30分) - 注:禁止执行恶意代码,违者0分处理。 -* 关于抄袭和雷同 + * 平台测试得分:(70分) + * 实验报告得分:(30分) + 注:禁止执行恶意代码,违者0分处理。 +* 关于抄袭和雷同 + 经过助教和老师判定属于作业抄袭或雷同情况,所有参与方一律零分,不接受任何解释和反驳。 如有任何问题,欢迎在论坛提意见进行批判指正。
diff --git a/src/parser/lexical_analyzer.l b/src/parser/lexical_analyzer.l
index 88ecdfb8a50e3539524934ef121ad2715a4a53dd..1bb15aa25fd525fbd21f343e2c3f0cc25cacb0b4 100644
--- a/src/parser/lexical_analyzer.l
+++ b/src/parser/lexical_analyzer.l
@@ -12,19 +12,67 @@ int pos_start;
 int pos_end;
 void pass_node(char *text){
     yylval.node = new_syntax_tree_node(text);
 }
 /*****************声明和选项设置 end*****************/
 %}
+letter   [a-zA-Z]
+digit    [0-9]
+ID       {letter}+
+INTEGER  {digit}+
+    /* No blanks around '|': unquoted whitespace terminates a flex pattern. */
+FLOAT    ({digit}+\.|{digit}*\.{digit}+)
+
+/*
+
+%token _IF _ELSE _WHILE _RETURN _INT _FLOAT _VOID
+%token _ASSIGN _RELOP _ADD_OP _MUL_OP
+%token _L_SQUARE _R_SQUARE _L_PARE _R_PARE _L_BRACKET _R_BRACKET
+%token _SEMI _COMMA _ID _INTEGER _FLOATPOINT
+
+*/
 %%
- /* to do for students */
- /* two cases for you, pass_node will send flex's token to bison */
-\+ {pos_start = pos_end; pos_end += 1; pass_node(yytext); return ADD;}
-. { pos_start = pos_end; pos_end++; return ERROR; }
+    /* Keywords precede {ID}: for equal-length matches flex picks the rule listed first. */
+if        {pos_start = pos_end; pos_end += 2; pass_node("if"); return _IF;}
+else      {pos_start = pos_end; pos_end += 4; pass_node("else"); return _ELSE;}
+while     {pos_start = pos_end; pos_end += 5; pass_node("while"); return _WHILE;}
+return    {pos_start = pos_end; pos_end += 6; pass_node("return"); return _RETURN;}
+int       {pos_start = pos_end; pos_end += 3; pass_node("int"); return _INT;}
+float     {pos_start = pos_end; pos_end += 5; pass_node("float"); return _FLOAT;}
+void      {pos_start = pos_end; pos_end += 4; pass_node("void"); return _VOID;}
+
+{ID}      {pos_start = pos_end; pos_end += yyleng; pass_node(yytext); return _ID;}
+{INTEGER} {pos_start = pos_end; pos_end += yyleng; pass_node(yytext); return _INTEGER;}
+{FLOAT}   {pos_start = pos_end; pos_end += yyleng; pass_node(yytext); return _FLOATPOINT;}
+
+\=        {pos_start = pos_end; pos_end += 1; pass_node("="); return _ASSIGN;}
+"<="|">="|"<"|">"|"=="|"!=" {pos_start = pos_end; pos_end += yyleng; pass_node(yytext); return _RELOP;}
+"+"|"-"   {pos_start = pos_end; pos_end += 1; pass_node(yytext); return _ADD_OP;}
+"*"|"/"   {pos_start = pos_end; pos_end += 1; pass_node(yytext); return _MUL_OP;}
+
+\[|\]     {pos_start = pos_end; pos_end += 1; pass_node(yytext); return yytext[0] == '[' ? _L_SQUARE : _R_SQUARE;}
+\(|\)     {pos_start = pos_end; pos_end += 1; pass_node(yytext); return yytext[0] == '(' ? _L_PARE : _R_PARE;}
+\{|\}     {pos_start = pos_end; pos_end += 1; pass_node(yytext); return yytext[0] == '{' ? _L_BRACKET : _R_BRACKET;}
+
+","|";"   {pos_start = pos_end; pos_end += 1; pass_node(yytext); return yytext[0] == ',' ? _COMMA : _SEMI;}
+
+"/*"([^*]|\*+[^*/])*\*+"/" {
+    /* Cminus comments may span lines; they yield no token, only position updates. */
+    int i;
+    for (i = 0; i < yyleng; i++) {
+        if (yytext[i] == '\n') { lines++; pos_end = 0; }
+        else { pos_end++; }
+    }
+}
+
- /****请在此补全所有flex的模式与动作 end******/
+" "|\t    { pos_end++; }
+\r\n|\n|\r { lines++; pos_end = 0;}
+    /* Catch-all: without it flex's default rule would echo unmatched input to stdout. */
+.         { pos_start = pos_end; pos_end++; return ERROR; }
+    /****请在此补全所有flex的模式与动作 end******/
 %%
diff --git a/src/parser/syntax_analyzer.y b/src/parser/syntax_analyzer.y
index 655016ea0c418e78023fd600fe8a87aafa247c07..1beaaf564a117ab70c7f196bcf783570a29ba979 100644
--- a/src/parser/syntax_analyzer.y
+++ b/src/parser/syntax_analyzer.y
@@ -14,7 +14,7 @@ extern FILE * yyin;
 // external variables from lexical_analyzer module
 extern int lines;
-extern char * yytext;
+extern char *yytext;
 extern int pos_end;
 extern int pos_start;
@@ -31,24 +31,176 @@ syntax_tree_node *node(const char *node_name, int children_num, ...);
 /* TODO: Complete this definition.
    Hint: See pass_node(), node(), and syntax_tree.h.
          Use forward declaring. */
-%union {}
+%union {
+    syntax_tree_node *node; /* value of every symbol; NOTE(review): the generated header needs syntax_tree_node declared before it is included - confirm */
+}
 /* TODO: Your tokens here. */
+/*
+alias:
+- SPEC: SPECIFIER
+- DEC: DECLARATION
+- COM: COMPOUND
+- STMT: STATEMENT
+- EXPR: EXPRESSION
+- ITER: ITERATION
+- SELC: SELECTION
+- RET: RETURN
+- Symbols starting with '_' are the terminals returned by the lexer
+*/
 %token <node> ERROR
+%type <node> TYPE_SPEC RELOP ADDOP MULOP
+%type <node> DEC_LIST DEC VAR_DEC FUN_DEC LOCAL_DEC
+%type <node> COM_STMT STMT_LIST STMT EXPR_STMT ITER_STMT SELC_STMT RET_STMT
+%type <node> EXPR SIMPLE_EXPR VAR ADD_EXPR TERM FACTOR INTEGER FLOAT CALL
+%type <node> PARAM PARAMS PARAM_LIST ARGS ARG_LIST
+/* These are for flex to return
+NOTE: Though combining _LE _LT _GT _GE _EQ _NEQ to _RELOP makes the program simpler,
+      it may not satisfy the subsequent requirements.
+*/
-%token <node> ADD
+%token <node> _IF _ELSE _WHILE _RETURN _INT _FLOAT _VOID
+%token <node> _ASSIGN _RELOP _ADD_OP _MUL_OP
+%token <node> _L_SQUARE _R_SQUARE _L_PARE _R_PARE _L_BRACKET _R_BRACKET
+%token <node> _SEMI _COMMA _ID _INTEGER _FLOATPOINT
+
 %type <node> program
 %start program
-%%
 /* TODO: Your rules here. */
+%%
+/* Cminus-f grammar: each action wraps its children in a node named after the grammar symbol. */
+program: DEC_LIST {$$ = node("program", 1, $1); gt->root = $$;}
+    ;
+
+DEC_LIST: DEC_LIST DEC {$$ = node("declaration-list", 2, $1, $2); }
+    | DEC {$$ = node("declaration-list", 1, $1);}
+    ;
+
+DEC: VAR_DEC {$$ = node("declaration", 1, $1); }
+    | FUN_DEC {$$ = node("declaration", 1, $1); }
+    ;
-/* Example:
-program: declaration-list {$$ = node( "program", 1, $1); gt->root = $$;}
+VAR_DEC: TYPE_SPEC _ID _SEMI {$$ = node("var-declaration", 3, $1, $2, $3); }
+    | TYPE_SPEC _ID _L_SQUARE _INTEGER _R_SQUARE _SEMI {$$ = node("var-declaration", 6, $1, $2, $3, $4, $5, $6); }
 ;
+
+TYPE_SPEC: _INT {$$ = node("type-specifier", 1, $1); }
+    | _FLOAT {$$ = node("type-specifier", 1, $1); }
+    | _VOID {$$ = node("type-specifier", 1, $1); }
+    ;
+
+FUN_DEC: TYPE_SPEC _ID _L_PARE PARAMS _R_PARE COM_STMT {$$ = node("fun-declaration", 6, $1, $2, $3, $4, $5, $6); }
+    ;
+
+PARAMS: PARAM_LIST {$$ = node("params", 1, $1); }
+    | _VOID {$$ = node("params", 1, $1); }
+    ;
+
+PARAM_LIST: PARAM_LIST _COMMA PARAM {$$ = node("param-list", 3, $1, $2, $3); }
+    | PARAM {$$ = node("param-list", 1, $1); }
+    ;
+
+
+PARAM: TYPE_SPEC _ID {$$ = node("param", 2, $1, $2); }
+    | TYPE_SPEC _ID _L_SQUARE _R_SQUARE {$$ = node("param", 4, $1, $2, $3, $4);}
+    ;
+
+COM_STMT: _L_BRACKET LOCAL_DEC STMT_LIST _R_BRACKET {$$ = node("compound-stmt", 4, $1, $2, $3, $4);}
+    ;
+/* An alternative with an empty right-hand side is the epsilon production. */
+LOCAL_DEC: LOCAL_DEC VAR_DEC {$$ = node("local-declarations", 2, $1, $2);}
+    | {$$ = node("local-declarations", 0);}
+    ;
+
+STMT_LIST: STMT_LIST STMT {$$ = node("statement-list", 2, $1, $2);}
+    | {$$ = node("statement-list", 0);}
+    ;
+
+STMT: EXPR_STMT {$$ = node("statement", 1, $1);}
+    | COM_STMT {$$ = node("statement", 1, $1);}
+    | SELC_STMT {$$ = node("statement", 1, $1);}
+    | ITER_STMT {$$ = node("statement", 1, $1);}
+    | RET_STMT {$$ = node("statement", 1, $1);}
+    ;
+
+EXPR_STMT: EXPR _SEMI {$$ = node("expression-stmt", 2, $1, $2);}
+    | _SEMI {$$ = node("expression-stmt", 1, $1);}
+    ;
+/* Dangling else: bison's default shift on the resulting conflict binds an else to the nearest if. */
+SELC_STMT: _IF _L_PARE EXPR _R_PARE STMT {$$ = node("selection-stmt", 5, $1, $2, $3, $4, $5);}
+    | _IF _L_PARE EXPR _R_PARE STMT _ELSE STMT {$$ = node("selection-stmt", 7, $1, $2, $3, $4, $5, $6, $7);}
+    ;
+
+ITER_STMT: _WHILE _L_PARE EXPR _R_PARE STMT {$$ = node("iteration-stmt", 5, $1, $2, $3, $4, $5);}
+    ;
+
+RET_STMT: _RETURN _SEMI {$$ = node("return-stmt", 2, $1, $2);}
+    | _RETURN EXPR _SEMI {$$ = node("return-stmt", 3, $1, $2, $3);}
+    ;
+
+EXPR: VAR _ASSIGN EXPR {$$ = node("expression", 3, $1, $2, $3);}
+    | SIMPLE_EXPR {$$ = node("expression", 1, $1);}
+    ;
+
+VAR: _ID {$$ = node("var", 1, $1);}
+    | _ID _L_SQUARE EXPR _R_SQUARE {$$ = node("var", 4, $1, $2, $3, $4);}
+    ;
+
+SIMPLE_EXPR: ADD_EXPR RELOP ADD_EXPR {$$ = node("simple-expression", 3, $1, $2, $3);}
+    | ADD_EXPR {$$ = node("simple-expression", 1, $1);}
+    ;
+
+RELOP: _RELOP {$$ = node("relop", 1, $1);}
+    ;
+/*
+RELOP: _LE {$$ = node("relop", 1, $1);}
+    | _LT {$$ = node("relop", 1, $1);}
+    | _GT {$$ = node("relop", 1, $1);}
+    | _GE {$$ = node("relop", 1, $1);}
+    | _EQ {$$ = node("relop", 1, $1);}
+    | _NEQ {$$ = node("relop", 1, $1);}
+    ;
 */
-program : ;
+ADD_EXPR: ADD_EXPR ADDOP TERM {$$ = node("additive-expression", 3, $1, $2, $3);}
+    | TERM {$$ = node("additive-expression", 1, $1);}
+    ;
+
+ADDOP: _ADD_OP {$$ = node("addop", 1, $1);}
+    ;
+
+TERM: TERM MULOP FACTOR {$$ = node("term", 3, $1, $2, $3);}
+    | FACTOR {$$ = node("term", 1, $1);}
+    ;
+
+MULOP: _MUL_OP {$$ = node("mulop", 1, $1);}
+    ;
+
+FACTOR: _L_PARE EXPR _R_PARE {$$ = node("factor", 3, $1, $2, $3);}
+    | VAR {$$ = node("factor", 1, $1);}
+    | CALL {$$ = node("factor", 1, $1);}
+    | INTEGER {$$ = node("factor", 1, $1);}
+    | FLOAT {$$ = node("factor", 1, $1);}
+    ;
+
+INTEGER: _INTEGER {$$ = node("integer", 1, $1);}
+    ;
+
+FLOAT: _FLOATPOINT {$$ = node("float", 1, $1);}
+    ;
+
+CALL: _ID _L_PARE ARGS _R_PARE {$$ = node("call", 4, $1, $2, $3, $4);}
+    ;
+
+ARGS: ARG_LIST {$$ = node("args", 1, $1);}
+    | {$$ = node("args", 0);}
+    ;
+
+ARG_LIST: ARG_LIST _COMMA EXPR {$$ = node("arg-list", 3, $1, $2, $3);}
+    | EXPR {$$ = node("arg-list", 1, $1);}
+    ;
+
 %%
@@ -75,7 +227,7 @@ syntax_tree *parse(const char *input_path)
         yyin = stdin;
     }
-    lines = pos_start = pos_end = 1;
+    lines = pos_start = pos_end = 1;
     gt = new_syntax_tree();
     yyrestart(yyin);
     yyparse();