# pycoolc/lexer.py

import sys
sys.path.append(".")
import string

import nfa  # local module, found via the sys.path tweak above
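# NOTE: judging by the patterns below, the regex dialect that nfa.compile()
# accepts uses '+' for alternation (union), '*' for Kleene star and parentheses
# for grouping - so "a+b" means "a or b", not "one or more a".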
digit = '+'.join("0123456789")
letter = '+'.join(string.ascii_letters) + "+_"
upper = '+'.join(string.ascii_uppercase)
lower = '+'.join(string.ascii_lowercase)
whitespace_no_newline = " +\f+\r+\t+\v"
whitespace = "(" + whitespace_no_newline + "+\n)*"
any_char = digit + "+" + letter + "+" + whitespace
any_string = "("+any_char+")*"
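# e.g. digit expands to the pattern "0+1+2+3+4+5+6+7+8+9", i.e. "any one
# decimal digit" under the alternation reading of '+'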
integer = nfa.compile("-?(" + digit + ")(" + digit + ")*")
integer.type = "integer"
identifier = nfa.compile("(" + letter + ")(" + letter + "+" + digit + ")*")  # a letter followed by any number of letters or digits
identifier.type = "identifier"
# renamed from "string" so it no longer shadows the string module imported above
string_literal = nfa.compile("(\"(" + any_string + ")\")+('(" + any_string + ")')")  # double- or single-quoted
string_literal.type = "string"
comment = nfa.compile("--(" + digit + "+" + letter + "+" + whitespace_no_newline + ")*\n")  # Untested
comment.type = "comment"
keyword = nfa.compile("+".join(["class", "else", "false", "fi", "if", "in", "inherits", "isvoid", "let", "loop", "pool", "then", "while", "case", "esac", "new", "of", "not", "true"]))
keyword.type = "keyword"
assign = nfa.compile("<-")
assign.type = "assign"
relop = nfa.compile("+".join(["<", "<=", ">", ">=", "=", "<>", "!="]))
relop.type = "relop"
semicolon = nfa.compile(";")
semicolon.type = "semicolon"
colon = nfa.compile(":")
colon.type = "colon"
comma = nfa.compile(",")
comma.type = "comma"
whitespace_nfa = nfa.compile(whitespace)
whitespace_nfa.type = "whitespace_nfa"
parens = nfa.either(nfa.build_from_char("("), nfa.build_from_char(")"))
parens.type = "parens"
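# '+' and '*' are metacharacters in this dialect, so presumably they can't
# appear literally inside nfa.compile(); they are built with
# nfa.build_from_char() and glued onto the rest with nfa.either() instead.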
mathbinop = nfa.either(nfa.either(nfa.compile("-+/+%+^+|+&"), nfa.build_from_char("+")), nfa.build_from_char("*"))
mathbinop.type = "mathbinop"
mathunop = nfa.compile("~")
mathunop.type = "mathunop"
brace = nfa.compile("{+}")
brace.type = "brace"
bracket = nfa.compile("[+]")
bracket.type = "bracket"
unop = nfa.compile("!")
unop.type = "unop"
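
# a small COOL-flavoured snippet exercising keywords, assignment, relops,
# arithmetic, a comment and a string literal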
test_data = """
if x = y then
x <- 10;
else
x <- x - (y * -1); -- comment test
print("string literal test");
fi
"""
#print(nfa.match(keyword, "if"))
#print(nfa.match(nfa.compile("if+and"), "if"))
#print(nfa.match(integer, "10"))
#nfa.pmap(nfa.compile("if+and"))
#sys.exit(0)

class token:
    def __init__(self):
        self.matched_string = ""
        self.type = False  # replaced by the matching regex's type tag in lex()

# returns a list of tokens in the order they appeared in the input string
def lex(data):
    # import subprocess
    # process = subprocess.Popen(["gpp", "+c", "--", "\\n"], stdin = subprocess.PIPE, stdout = subprocess.PIPE)
    # data = process.communicate(input=data.encode("utf-8"))[0].decode("utf-8")

    # whichever of these is the first to match a substring of the text is used to
    # create the token; priority beats match length, so e.g. the keyword "if"
    # wins over a longer identifier like "ifx" that merely starts with it
    priority_order = [whitespace_nfa, comment, integer, parens, bracket, brace, mathbinop, mathunop, unop, semicolon, colon, comma, keyword, assign, relop, string_literal, identifier]

    done = []
    data_ptr = 0

    while data_ptr < len(data):  # loop until we've read the whole input string
        one_matched = False

        # start by trying to match the whole rest of the input string, chopping
        # one character off the end until no characters are left; if none of
        # those substrings matched, move on to the next regex in priority order
        for regex in priority_order:  # starting with the highest-priority regex
            data_end = len(data)  # MAXIMUM MUNCH - literally the largest lookahead possible
            this_regex_matched = False

            while data_end - data_ptr > 0:
                considering = data[data_ptr:data_end]

                if nfa.match(regex, considering):  # this regex matched the substring
                    data_ptr += len(considering)  # the next token starts right after what we just matched
                    t = token()  # construct a token and add it to the result list
                    t.matched_string = considering
                    t.type = regex.type
                    done.append(t)

                    one_matched = True  # don't die
                    this_regex_matched = True
                    break
                data_end -= 1

            if this_regex_matched:
                break

        if not one_matched:
            # if we encounter something no regex can match, just die
            print("Nothing matched '" + data[data_ptr] + "' at position " + str(data_ptr) + ", bailing out")
            return []
    return done
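
# Note: the maximal-munch loop above retries every shortened substring for every
# regex, so lexing is at least quadratic in the input length - fine for a toy
# lexer, but a production one would scan left-to-right in a single pass.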

# debug: lex the sample program and print every non-whitespace token
if __name__ == "__main__":
    for tkn in lex(test_data):
        if tkn.type != "whitespace_nfa":
            print("token was '" + tkn.matched_string.replace("\n", "\\n") + "' of type " + tkn.type)