parrotcode: lexical analysis for Parrot Intermediate Representation | |
Contents | Compilers |
pirlexer.c - lexical analysis for Parrot Intermediate Representation
chars or ints?
The dictionary contains *all* keywords, directives, flags and other (descriptions of) tokens that are recognized by the lexer.
global goto
if n_operators
int null
num pmc
string unless
The following are PIR directives.
.arg .const .constant .end
.endnamespace .endm .get_results .globalconst
.HLL .HLL_map .include .invocant .lex
.loadlib .local .macro .meth_call .namespace
.nci_call .param .begin_call .begin_return .begin_yield
.call .end_call .end_return .end_yield .pragma
.result .return .sub .yield
The following are flags for subroutines:
:anon :immediate :init :lex :load :main
:method :multi :outer :postcomp :vtable :named
The following are flags for parameters/arguments.
:opt_flag
:optional
:slurpy
:flat
:unique_reg
The following are string encoding specifiers:
ascii:
binary:
iso-8859-1:
unicode:
Structure that represents a file. Its layout is shown below. First, it contains the filename of the file that is represented by this buffer. Then, the buffer is an array that holds the complete file contents. This is done for efficiency (instead of reading character by character from disk). The field curchar acts like a cursor that points to the current character. The field filesize contains the size of the file counted in bytes, line keeps track of the current line number, and linepos counts the number of characters since the last newline character. The field lastchar stores the previous character (that is, the character before the one pointed to by curchar). This field is used to decide whether the previous character was a newline. If so, then curchar is at the start of a line (needed for Heredoc delimiters).
The field prevbuffer points to another file_buffer; if the current file was included through a .include directive, then prevbuffer points to the file_buffer that represents the including file. An example:
$ cat main.pir
.include "util.pir"
.sub main
# ...
.end
$ cat util.pir
.sub foo
# ...
.end
In this case, when parsing the file main.pir, prevbuffer is NULL, because this file was not included. Then, when the file util.pir is included, a new file_buffer is created for that file, and prevbuffer is set to the file_buffer representing main.pir.
The file_buffer structure is shown below:
typedef struct file_buffer {
char *filename; -- the name of this file
char *buffer; -- buffer holding contents of this file
char *curchar; -- pointer to the current char.
unsigned filesize; -- size of this file in bytes
unsigned long line; -- line number
unsigned short linepos; -- position on the current line
char lastchar; -- the previous character that was read.
struct file_buffer *prevbuffer; -- pointer to 'including' file if any
} file_buffer;
Structure representing the lexer. It holds a pointer to the current file being read, a buffer holding the current token, and a pointer to add characters to the token buffer.
typedef struct lexer_state {
struct file_buffer *curfile; -- pointer to the current file
char *token_chars; -- characters of the current token
char *charptr; -- used for adding/removing token chars
} lexer_state;
char const *find_keyword(token t)
char *const get_current_token(lexer_state const *s)
char *const get_current_file(struct lexer_state *s)
long get_current_line(struct lexer_state *s)
unsigned short get_current_linepos(struct lexer_state *s)
long get_current_filepos(struct lexer_state *s)
void print_error_context(struct lexer_state *s)
static void buffer_char(lexer_state *lexer, char c)
static char read_char(file_buffer *buf)
static void unread_char(file_buffer *buf)
static void print_buffer(lexer_state *lexer)
static void clear_buffer(lexer_state *lexer)
static file_buffer *read_file(char const *filename)
static void destroy_buffer(file_buffer *buf)
static void do_include_file(lexer_state *lexer, char const *filename)
static int is_start_of_line(file_buffer *buf)
static token check_dictionary(lexer_state *lexer, char const *dictionary[])
static void switch_buffer(lexer_state *lexer)
static int read_digits(lexer_state *lexer)
static void update_line(lexer_state *lexer)
static token read_string(lexer_state *lexer, char delimiter)
token read_heredoc(lexer_state *lexer, char *heredoc_label)
token read_macro(lexer_state *lexer)
lexer_state *new_lexer(char const *filename)
void destroy_lexer(lexer_state *lexer)
void open_include_file(lexer_state *lexer)
void close_include_file(NOTNULL(lexer_state *lexer))
token next_token(lexer_state *lexer)
Comments start with the pound sign ('#') and continue up to the end of the line.
POD comments are not yet supported.
Any whitespace in the specification is merely for readability. Significant whitespace is indicated explicitly.
PASM-REG -> PASM-PREG | PASM-SREG | PASM-NREG | PASM-IREG
PASM-PREG -> 'P' DIGIT+
PASM-SREG -> 'S' DIGIT+
PASM-NREG -> 'N' DIGIT+
PASM-IREG -> 'I' DIGIT+
IDENT -> [a-zA-Z_][a-zA-Z_0-9]*
LABEL -> IDENT ':'
INVOCANT-IDENT -> IDENT '.'
PARROT-OP -> IDENT
MACRO-IDENT -> '.' IDENT
MACRO-LABEL -> '$' IDENT ':'
PIR-REGISTER -> '$' PASM-REG
HEREDOC-IDENT -> '<<' STRING-CONSTANT
STRING-CONSTANT -> ' <characters> ' | " <characters> "
INT-CONSTANT -> [-] DIGIT+ | 0 [xX] DIGIT+ | 0 [bB] DIGIT+
NUM-CONSTANT -> [-] DIGIT+ '.' DIGIT*
DIGIT -> [0-9]
( ) [ ] , ;
Due to PIR's simplicity, there are no different levels of precedence for operators.
- ! ~
** * % / // + - >> >>> << ~ ~~ & && | || .
**= *= %= /= //= += -= .= >>= >>>= <<= &= |= ~=
< > == <= >= !=
=> ..
|