83 lines
2.9 KiB
Python
83 lines
2.9 KiB
Python
import sys
|
|
sys.path.append(".")
|
|
import nfa, string
|
|
# Character classes, written as alternations in the pattern syntax this file
# passes to nfa.compile ("a+b+c" means "a or b or c").
digit = "+".join(string.digits)

# "letter" deliberately includes the underscore so identifiers may contain it.
letter = "+".join(string.ascii_letters + "_")
upper = "+".join(string.ascii_uppercase)
lower = "+".join(string.ascii_lowercase)

# Space, newline, form feed, carriage return, tab and vertical tab.
whitespace = "+".join(" \n\f\r\t\v")

# Every character allowed inside string literals and comments.  Punctuation
# is not part of this set.
any_char = "+".join([digit, letter, whitespace])

# Zero or more characters from any_char.
any_string = "(" + any_char + ")*"
|
|
|
|
# Token recognisers.  Each one is an NFA compiled from a pattern in which "+"
# is alternation, "*" is repetition and juxtaposition is concatenation.
# Every recogniser carries a `.type` name that lex() copies onto its tokens.

# One or more digits; an empty match is never a valid integer.  Negative
# literals are not supported: an optional leading "-" broke the pattern.
integer = nfa.compile("(" + digit + ")(" + digit + ")*")
integer.type = "integer"

# A letter (or underscore) followed by any number of letters or digits.
identifier = nfa.compile("(" + letter + ")(" + letter + "+" + digit + ")*")
identifier.type = "identifier"

# A run of any_char wrapped in double quotes or in single quotes.
# NOTE: this rebinds the name `string`, shadowing the stdlib module imported
# at the top of the file; the module must not be used below this line.
string = nfa.compile("(\"(" + any_string + ")\")+('(" + any_string + ")')")
string.type = "string"

# "--" up to the end of the line.  The body excludes "\n" (unlike any_char)
# so a comment cannot swallow the lines that follow it.
_comment_char = digit + "+" + letter + "+" + " +\f+\r+\t+\v"
comment = nfa.compile("--(" + _comment_char + ")*")
comment.type = "comment"

keyword = nfa.compile("+".join([
    "class", "else", "false", "fi", "if", "in", "inherits", "isvoid", "let",
    "loop", "pool", "then", "while", "case", "esac", "new", "of", "not",
    "true",
]))
keyword.type = "keyword"

assign = nfa.compile("<-")
assign.type = "assign"

relop = nfa.compile("+".join(["<", "<=", ">", ">=", "=", "<>"]))
relop.type = "relop"

semicolon = nfa.compile(";")
semicolon.type = "semicolon"

# Matches exactly one whitespace character.
whitespace_nfa = nfa.compile(whitespace)
whitespace_nfa.type = "whitespace_nfa"

# Built from single characters rather than a pattern string -- presumably
# because "(" and ")" are grouping metacharacters in the pattern syntax.
parens = nfa.either(nfa.build_from_char("("), nfa.build_from_char(")"))
parens.type = "parens"
|
|
|
|
|
|
# Sample program that is lexed when this file runs.  It must only contain
# characters some recogniser accepts; anything else makes lex() bail out and
# return an empty list.
test_data = """
if x = y then
x <- 10;
else
x <- 20;
print("string literal test");
fi
"""
|
|
|
|
#print(nfa.match(keyword, "if"))
|
|
#print(nfa.match(nfa.compile("if+and"), "if"))
|
|
#print(nfa.match(integer, "10"))
|
|
#nfa.pmap(nfa.compile("if+and"))
|
|
#sys.exit(0)
|
|
|
|
class token():
    """A lexeme: the matched text plus the name of its token category.

    Both attributes are plain and may be reassigned after construction.
    """

    def __init__(self, matched_string="", token_type=False):
        """Create a token.

        matched_string -- the exact slice of input the token covers.
        token_type     -- name of the token category (e.g. "keyword");
                          False means "not classified yet".
        """
        self.matched_string = matched_string
        self.type = token_type

    def __repr__(self):
        return "token(%r, %r)" % (self.matched_string, self.type)
|
|
|
|
def lex(data):
    """Split the string *data* into a list of token objects.

    At every position each recogniser is tried and the longest match wins
    ("maximal munch"), so "index" is one identifier rather than the keyword
    "in" followed by "dex".  When several recognisers match the same longest
    prefix, the one listed first in the priority order wins, so "if" is a
    keyword and not an identifier.

    Whitespace and comments are returned as tokens too; callers filter them
    by their `type`.

    Returns the tokens in input order.  If no recogniser matches at some
    position, a diagnostic is printed and an empty list is returned.
    """
    priority_order = [whitespace_nfa, comment, parens, semicolon, keyword,
                      assign, relop, integer, string, identifier]
    done = []
    data_len = len(data)
    data_ptr = 0
    while data_ptr < data_len:
        best_regex = None
        best_end = data_ptr
        for regex in priority_order:
            # Scan from the longest candidate down, so the first hit is this
            # recogniser's longest match.  Only prefixes strictly longer than
            # the current best are tried: a tie goes to the earlier
            # (higher-priority) recogniser.
            for data_end in range(data_len, best_end, -1):
                if nfa.match(regex, data[data_ptr:data_end]):
                    best_regex = regex
                    best_end = data_end
                    break
        if best_regex is None:
            print("Nothing matched '" + data[data_ptr] + "', bailing out")
            return []
        t = token()
        t.matched_string = data[data_ptr:best_end]
        t.type = best_regex.type
        done.append(t)
        data_ptr = best_end
    return done
|
|
# Demo driver: lex the sample program and report every token except the
# whitespace ones.
for tkn in lex(test_data):
    if tkn.type == "whitespace_nfa":
        continue
    print("token was '" + tkn.matched_string + "' of type " + tkn.type)
|