pycoolc/lexer.py
2017-03-24 16:47:40 -04:00

83 lines
2.9 KiB
Python

import sys
sys.path.append(".")
import nfa, string
digit = '+'.join("0123456789")
letter = '+'.join(string.ascii_letters) + "+_"
upper = '+'.join(string.ascii_uppercase)
lower = '+'.join(string.ascii_lowercase)
whitespace = " +\n+\f+\r+\t+\v"
any_char = digit + "+" + letter + "+" + whitespace
any_string = "("+any_char+")*"
integer = nfa.compile("(" + digit + ")*") # may not be negative because my "-?" at the beginning broke everything
integer.type = "integer"
identifier = nfa.compile("(" + letter + ")(" + letter + "+" + digit + ")*") # letter followed by any number of letters or digits
identifier.type = "identifier"
string = nfa.compile("(\"(" + any_string + ")\")+('("+any_string+")')")
string.type = "string"
comment = nfa.compile("--(" + any_char + ")*")
comment.type = "comment"
keyword = nfa.compile("+".join(["class", "else", "false", "fi", "if", "in", "inherits", "isvoid", "let", "loop", "pool", "then", "while", "case", "esac", "new", "of", "not", "true"]))
keyword.type = "keyword"
assign = nfa.compile("<-")
assign.type = "assign"
relop = nfa.compile("+".join(["<", "<=", ">", ">=", "=", "<>"]))
relop.type = "relop"
semicolon = nfa.compile(";")
semicolon.type = "semicolon"
whitespace_nfa = nfa.compile(whitespace)
whitespace_nfa.type = "whitespace_nfa"
parens = nfa.either(nfa.build_from_char("("), nfa.build_from_char(")"))
parens.type = "parens"
test_data = """
if x = y then
x <- 10;
else
x <- 20;
print("string literal test");
fi
"""
#print(nfa.match(keyword, "if"))
#print(nfa.match(nfa.compile("if+and"), "if"))
#print(nfa.match(integer, "10"))
#nfa.pmap(nfa.compile("if+and"))
#sys.exit(0)
class token():
def __init__(self):
self.matched_string = ""
self.type = False
def lex(data):
priority_order = [whitespace_nfa, comment, parens, semicolon, keyword, assign, relop, integer, string, identifier]
done = []
data_ptr = 0
while data_ptr < len(data):
one_matched = False
for regex in priority_order:
data_end = len(data)
while data_end - data_ptr > 0:
considering = data[data_ptr:data_end]
#print("Considering " + considering.replace("\n", "\\n"))
#print("matching '" + considering + "' against regex '" + regex.orig + "'")
if nfa.match(regex, considering):
#print("Matched " + considering)
data_ptr += len(considering)
t = token()
t.matched_string = considering
t.type = regex.type
done.append(t)
this_regex_matched = True
one_matched = True
break
data_end -= 1
if not one_matched:
print("Nothing matched '" + considering + "', bailing out")
return []
return done
for tkn in lex(test_data):
if tkn.type != "whitespace_nfa": print("token was '" + tkn.matched_string + "' of type " + tkn.type)