Lexer works
This commit is contained in:
parent
852c3a30b0
commit
b389d28a0d
68
lexer.py
68
lexer.py
|
@ -9,14 +9,74 @@ whitespace = " +\n+\f+\r+\t+\v"
|
||||||
any_char = digit + "+" + letter + "+" + whitespace
|
any_char = digit + "+" + letter + "+" + whitespace
|
||||||
any_string = "("+any_char+")*"
|
any_string = "("+any_char+")*"
|
||||||
|
|
||||||
integer = nfa.compile("-?(" + digit + ")*") # may or may not be negative
|
integer = nfa.compile("(" + digit + ")*") # may not be negative because my "-?" at the beginning broke everything
|
||||||
|
integer.type = "integer"
|
||||||
identifier = nfa.compile("(" + letter + ")(" + letter + "+" + digit + ")*") # letter followed by any number of letters or digits
|
identifier = nfa.compile("(" + letter + ")(" + letter + "+" + digit + ")*") # letter followed by any number of letters or digits
|
||||||
|
identifier.type = "identifier"
|
||||||
string = nfa.compile("(\"(" + any_string + ")\")+('("+any_string+")')")
|
string = nfa.compile("(\"(" + any_string + ")\")+('("+any_string+")')")
|
||||||
|
string.type = "string"
|
||||||
comment = nfa.compile("--(" + any_char + ")*")
|
comment = nfa.compile("--(" + any_char + ")*")
|
||||||
|
comment.type = "comment"
|
||||||
keyword = nfa.compile("+".join(["class", "else", "false", "fi", "if", "in", "inherits", "isvoid", "let", "loop", "pool", "then", "while", "case", "esac", "new", "of", "not", "true"]))
|
keyword = nfa.compile("+".join(["class", "else", "false", "fi", "if", "in", "inherits", "isvoid", "let", "loop", "pool", "then", "while", "case", "esac", "new", "of", "not", "true"]))
|
||||||
|
keyword.type = "keyword"
|
||||||
assign = nfa.compile("<-")
|
assign = nfa.compile("<-")
|
||||||
|
assign.type = "assign"
|
||||||
relop = nfa.compile("+".join(["<", "<=", ">", ">=", "=", "<>"]))
|
relop = nfa.compile("+".join(["<", "<=", ">", ">=", "=", "<>"]))
|
||||||
|
relop.type = "relop"
|
||||||
|
semicolon = nfa.compile(";")
|
||||||
|
semicolon.type = "semicolon"
|
||||||
|
whitespace_nfa = nfa.compile(whitespace)
|
||||||
|
whitespace_nfa.type = "whitespace_nfa"
|
||||||
|
parens = nfa.either(nfa.build_from_char("("), nfa.build_from_char(")"))
|
||||||
|
parens.type = "parens"
|
||||||
|
|
||||||
# print("Integer's regex was " + integer.orig)
|
|
||||||
# print("string's regex was " + string.orig)
|
test_data = """
|
||||||
#nfa.pmap(identifier)
|
if x = y then
|
||||||
|
x <- 10;
|
||||||
|
else
|
||||||
|
x <- 20;
|
||||||
|
print("string literal test");
|
||||||
|
fi
|
||||||
|
"""
|
||||||
|
|
||||||
|
#print(nfa.match(keyword, "if"))
|
||||||
|
#print(nfa.match(nfa.compile("if+and"), "if"))
|
||||||
|
#print(nfa.match(integer, "10"))
|
||||||
|
#nfa.pmap(nfa.compile("if+and"))
|
||||||
|
#sys.exit(0)
|
||||||
|
|
||||||
|
class token():
    """One lexeme recognised in the input text.

    Attributes are plain and public:

    matched_string -- the exact text the token covers ("" until set).
    type           -- the name of the token class that matched, e.g.
                      "keyword" or "integer"; False until one is assigned.
    """

    def __init__(self, matched_string="", token_type=False):
        # Both arguments are optional so the existing ``token()`` followed
        # by attribute assignment keeps working unchanged.
        self.matched_string = matched_string
        self.type = token_type

    def __repr__(self):
        # Readable form for debugging instead of the default object address.
        return "token(matched_string={!r}, type={!r})".format(
            self.matched_string, self.type
        )
|
||||||
|
|
||||||
|
def lex(data):
    """Split ``data`` into a list of ``token`` objects, in input order.

    At each position every token class in ``priority_order`` is tried and
    the longest match wins (maximal munch).  When several classes match the
    same longest text, the one listed first in ``priority_order`` wins, so
    "if" is a keyword while "iffy" is a single identifier rather than the
    keyword "if" followed by the identifier "fy".

    Whitespace is returned as tokens of type "whitespace_nfa"; callers that
    do not want them must filter them out.

    Returns the list of tokens, or an empty list (after printing a message
    naming the offending character) if no token class matches at some
    position.
    """
    # Earlier entries win ties between equally long matches.
    priority_order = [whitespace_nfa, comment, parens, semicolon, keyword, assign, relop, integer, string, identifier]
    done = []
    data_ptr = 0
    data_len = len(data)
    while data_ptr < data_len:
        best_end = data_ptr   # end index of the longest match found so far
        best_regex = None
        for regex in priority_order:
            # Try candidate lexemes from longest to shortest.  Only lengths
            # strictly longer than the current best can displace it, which
            # is also what gives earlier regexes priority on ties.
            data_end = data_len
            while data_end > best_end:
                if nfa.match(regex, data[data_ptr:data_end]):
                    best_end = data_end
                    best_regex = regex
                    break
                data_end -= 1
        if best_regex is None:
            # Not even the single character at data_ptr matched anything.
            print("Nothing matched '" + data[data_ptr:data_ptr + 1] + "', bailing out")
            return []
        t = token()
        t.matched_string = data[data_ptr:best_end]
        t.type = best_regex.type
        done.append(t)
        data_ptr = best_end
    return done
|
||||||
|
# Lex the sample program and report every token except whitespace.
for tkn in lex(test_data):
    if tkn.type == "whitespace_nfa":
        continue
    print("token was '" + tkn.matched_string + "' of type " + tkn.type)
|
||||||
|
|
6
nfa.py
6
nfa.py
|
@ -4,6 +4,7 @@ class field(): # a "field" has a list of nodes and a starting node, that's it
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.nodes = set()
|
self.nodes = set()
|
||||||
self.start = False
|
self.start = False
|
||||||
|
self.orig = "not compiled"
|
||||||
# a node is either terminal (accepting) or not, and it has a list of possible moves.
|
# a node is either terminal (accepting) or not, and it has a list of possible moves.
|
||||||
# the moves are usually indexed by a character, so my_nfa.moves['a'] will return another nfa
|
# the moves are usually indexed by a character, so my_nfa.moves['a'] will return another nfa
|
||||||
# ε is special because there can be more than one ε moves, and they don't consume a character of input
|
# ε is special because there can be more than one ε moves, and they don't consume a character of input
|
||||||
|
@ -119,6 +120,7 @@ def match(f, inp):
|
||||||
if len(states) == 0: # if there are no states, we can't possibly end up at a terminal state so just stop reading
|
if len(states) == 0: # if there are no states, we can't possibly end up at a terminal state so just stop reading
|
||||||
return False
|
return False
|
||||||
# now we've consumed all the input. If any of the states we are in are accepting states, it matched, otherwise return false
|
# now we've consumed all the input. If any of the states we are in are accepting states, it matched, otherwise return false
|
||||||
|
states = epsilon_closure(states) # expand into epsilon connected states
|
||||||
for state in states:
|
for state in states:
|
||||||
if state.terminal:
|
if state.terminal:
|
||||||
return True
|
return True
|
||||||
|
@ -173,7 +175,9 @@ def compile(regex):
|
||||||
elif regex[i] == "*": # if we find a *, iterate the last thing on the stack, which might have been a subregex (and that's ok)
|
elif regex[i] == "*": # if we find a *, iterate the last thing on the stack, which might have been a subregex (and that's ok)
|
||||||
to_concat[-1] = iterate(to_concat[-1])
|
to_concat[-1] = iterate(to_concat[-1])
|
||||||
elif regex[i] == "+": # kind of a hack and gives + the highest possible operator precedence
|
elif regex[i] == "+": # kind of a hack and gives + the highest possible operator precedence
|
||||||
return either(list_to_field(to_concat), compile(regex[i+1:]))
|
ret = either(list_to_field(to_concat), compile(regex[i+1:]))
|
||||||
|
ret.orig = regex
|
||||||
|
return ret
|
||||||
elif regex[i] == "?": # COMPLETELY UNTESTED
|
elif regex[i] == "?": # COMPLETELY UNTESTED
|
||||||
to_concat.append(zero_or_more(to_concat[-1]))
|
to_concat.append(zero_or_more(to_concat[-1]))
|
||||||
else: # if we just found a regular character, add it to the stuff to concatenate
|
else: # if we just found a regular character, add it to the stuff to concatenate
|
||||||
|
|
|
@ -0,0 +1,157 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||||
|
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
|
||||||
|
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
||||||
|
<!-- Generated by graphviz version 2.38.0 (20140413.2041)
|
||||||
|
-->
|
||||||
|
<!-- Title: test Pages: 1 -->
|
||||||
|
<svg width="165pt" height="737pt"
|
||||||
|
viewBox="0.00 0.00 165.00 737.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
|
||||||
|
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 733)">
|
||||||
|
<title>test</title>
|
||||||
|
<polygon fill="white" stroke="none" points="-4,4 -4,-733 161,-733 161,4 -4,4"/>
|
||||||
|
<!-- 742 -->
|
||||||
|
<g id="node1" class="node"><title>742</title>
|
||||||
|
<ellipse fill="none" stroke="black" cx="28" cy="-624" rx="27" ry="18"/>
|
||||||
|
<text text-anchor="middle" x="28" y="-620.3" font-family="Times,serif" font-size="14.00">742</text>
|
||||||
|
</g>
|
||||||
|
<!-- 812 -->
|
||||||
|
<g id="node2" class="node"><title>812</title>
|
||||||
|
<ellipse fill="none" stroke="black" cx="27" cy="-516" rx="27" ry="18"/>
|
||||||
|
<text text-anchor="middle" x="27" y="-512.3" font-family="Times,serif" font-size="14.00">812</text>
|
||||||
|
</g>
|
||||||
|
<!-- 742->812 -->
|
||||||
|
<g id="edge1" class="edge"><title>742->812</title>
|
||||||
|
<path fill="none" stroke="black" d="M27.8393,-605.969C27.6828,-589.378 27.4423,-563.883 27.2588,-544.431"/>
|
||||||
|
<polygon fill="black" stroke="black" points="30.7579,-544.307 27.1636,-534.341 23.7582,-544.373 30.7579,-544.307"/>
|
||||||
|
<text text-anchor="middle" x="31.5" y="-566.3" font-family="Times,serif" font-size="14.00">a</text>
|
||||||
|
</g>
|
||||||
|
<!-- 240 -->
|
||||||
|
<g id="node3" class="node"><title>240</title>
|
||||||
|
<ellipse fill="none" stroke="black" cx="27" cy="-408" rx="27" ry="18"/>
|
||||||
|
<text text-anchor="middle" x="27" y="-404.3" font-family="Times,serif" font-size="14.00">240</text>
|
||||||
|
</g>
|
||||||
|
<!-- 812->240 -->
|
||||||
|
<g id="edge2" class="edge"><title>812->240</title>
|
||||||
|
<path fill="none" stroke="black" d="M24.6569,-497.944C23.9798,-492.249 23.3339,-485.862 23,-480 22.0902,-464.026 22.0902,-459.974 23,-444 23.1461,-441.435 23.3519,-438.77 23.5939,-436.107"/>
|
||||||
|
<polygon fill="black" stroke="black" points="27.0856,-436.369 24.6569,-426.056 20.1244,-435.633 27.0856,-436.369"/>
|
||||||
|
<text text-anchor="middle" x="43" y="-458.3" font-family="Times,serif" font-size="14.00">epsilon</text>
|
||||||
|
</g>
|
||||||
|
<!-- 635 -->
|
||||||
|
<g id="node4" class="node"><title>635</title>
|
||||||
|
<ellipse fill="none" stroke="black" cx="27" cy="-300" rx="27" ry="18"/>
|
||||||
|
<text text-anchor="middle" x="27" y="-296.3" font-family="Times,serif" font-size="14.00">635</text>
|
||||||
|
</g>
|
||||||
|
<!-- 240->635 -->
|
||||||
|
<g id="edge3" class="edge"><title>240->635</title>
|
||||||
|
<path fill="none" stroke="black" d="M27,-389.969C27,-373.378 27,-347.883 27,-328.431"/>
|
||||||
|
<polygon fill="black" stroke="black" points="30.5001,-328.341 27,-318.341 23.5001,-328.341 30.5001,-328.341"/>
|
||||||
|
<text text-anchor="middle" x="30.5" y="-350.3" font-family="Times,serif" font-size="14.00">n</text>
|
||||||
|
</g>
|
||||||
|
<!-- 957 -->
|
||||||
|
<g id="node7" class="node"><title>957</title>
|
||||||
|
<ellipse fill="none" stroke="black" cx="35" cy="-192" rx="27" ry="18"/>
|
||||||
|
<text text-anchor="middle" x="35" y="-188.3" font-family="Times,serif" font-size="14.00">957</text>
|
||||||
|
</g>
|
||||||
|
<!-- 635->957 -->
|
||||||
|
<g id="edge5" class="edge"><title>635->957</title>
|
||||||
|
<path fill="none" stroke="black" d="M23.9446,-281.958C21.8521,-267.503 19.9555,-246.267 23,-228 23.4851,-225.089 24.1825,-222.096 25.0004,-219.144"/>
|
||||||
|
<polygon fill="black" stroke="black" points="28.3903,-220.035 28.1119,-209.444 21.7248,-217.897 28.3903,-220.035"/>
|
||||||
|
<text text-anchor="middle" x="43" y="-242.3" font-family="Times,serif" font-size="14.00">epsilon</text>
|
||||||
|
</g>
|
||||||
|
<!-- 163 -->
|
||||||
|
<g id="node5" class="node"><title>163</title>
|
||||||
|
<ellipse fill="none" stroke="black" cx="99" cy="-462" rx="27" ry="18"/>
|
||||||
|
<text text-anchor="middle" x="99" y="-458.3" font-family="Times,serif" font-size="14.00">163</text>
|
||||||
|
</g>
|
||||||
|
<!-- 401 -->
|
||||||
|
<g id="node6" class="node"><title>401</title>
|
||||||
|
<ellipse fill="none" stroke="black" cx="99" cy="-354" rx="27" ry="18"/>
|
||||||
|
<text text-anchor="middle" x="99" y="-350.3" font-family="Times,serif" font-size="14.00">401</text>
|
||||||
|
</g>
|
||||||
|
<!-- 163->401 -->
|
||||||
|
<g id="edge4" class="edge"><title>163->401</title>
|
||||||
|
<path fill="none" stroke="black" d="M99,-443.969C99,-427.378 99,-401.883 99,-382.431"/>
|
||||||
|
<polygon fill="black" stroke="black" points="102.5,-382.341 99,-372.341 95.5001,-382.341 102.5,-382.341"/>
|
||||||
|
<text text-anchor="middle" x="119" y="-404.3" font-family="Times,serif" font-size="14.00">epsilon</text>
|
||||||
|
</g>
|
||||||
|
<!-- 202 -->
|
||||||
|
<g id="node8" class="node"><title>202</title>
|
||||||
|
<ellipse fill="none" stroke="black" cx="99" cy="-246" rx="27" ry="18"/>
|
||||||
|
<text text-anchor="middle" x="99" y="-242.3" font-family="Times,serif" font-size="14.00">202</text>
|
||||||
|
</g>
|
||||||
|
<!-- 401->202 -->
|
||||||
|
<g id="edge6" class="edge"><title>401->202</title>
|
||||||
|
<path fill="none" stroke="black" d="M99,-335.969C99,-319.378 99,-293.883 99,-274.431"/>
|
||||||
|
<polygon fill="black" stroke="black" points="102.5,-274.341 99,-264.341 95.5001,-274.341 102.5,-274.341"/>
|
||||||
|
<text text-anchor="middle" x="101.5" y="-296.3" font-family="Times,serif" font-size="14.00">f</text>
|
||||||
|
</g>
|
||||||
|
<!-- 591 -->
|
||||||
|
<g id="node12" class="node"><title>591</title>
|
||||||
|
<ellipse fill="none" stroke="black" cx="38" cy="-105" rx="27" ry="18"/>
|
||||||
|
<text text-anchor="middle" x="38" y="-101.3" font-family="Times,serif" font-size="14.00">591</text>
|
||||||
|
</g>
|
||||||
|
<!-- 957->591 -->
|
||||||
|
<g id="edge11" class="edge"><title>957->591</title>
|
||||||
|
<path fill="none" stroke="black" d="M35.6071,-173.799C36.0178,-162.163 36.5689,-146.548 37.0387,-133.237"/>
|
||||||
|
<polygon fill="black" stroke="black" points="40.5388,-133.293 37.3938,-123.175 33.5432,-133.046 40.5388,-133.293"/>
|
||||||
|
<text text-anchor="middle" x="39.5" y="-144.8" font-family="Times,serif" font-size="14.00">d</text>
|
||||||
|
</g>
|
||||||
|
<!-- 344 -->
|
||||||
|
<g id="node9" class="node"><title>344</title>
|
||||||
|
<ellipse fill="none" stroke="black" cx="66" cy="-18" rx="27" ry="18"/>
|
||||||
|
<text text-anchor="middle" x="66" y="-14.3" font-family="Times,serif" font-size="14.00">344</text>
|
||||||
|
</g>
|
||||||
|
<!-- 202->344 -->
|
||||||
|
<g id="edge7" class="edge"><title>202->344</title>
|
||||||
|
<path fill="none" stroke="black" d="M98.8986,-227.867C98.3494,-194.105 95.3561,-116.797 80,-54 79.2621,-50.9823 78.3165,-47.8804 77.2655,-44.8292"/>
|
||||||
|
<polygon fill="black" stroke="black" points="80.5104,-43.5152 73.6749,-35.4203 73.9704,-46.011 80.5104,-43.5152"/>
|
||||||
|
<text text-anchor="middle" x="115" y="-144.8" font-family="Times,serif" font-size="14.00">epsilon</text>
|
||||||
|
</g>
|
||||||
|
<!-- 344->344 -->
|
||||||
|
<g id="edge13" class="edge"><title>344->344</title>
|
||||||
|
<path fill="none" stroke="black" d="M90.5325,-25.752C101.508,-26.4902 111,-23.9062 111,-18 111,-14.0317 106.715,-11.5632 100.574,-10.5944"/>
|
||||||
|
<polygon fill="black" stroke="black" points="100.647,-7.09491 90.5325,-10.248 100.406,-14.0908 100.647,-7.09491"/>
|
||||||
|
<text text-anchor="middle" x="134" y="-14.3" font-family="Times,serif" font-size="14.00">terminal</text>
|
||||||
|
</g>
|
||||||
|
<!-- 535 -->
|
||||||
|
<g id="node10" class="node"><title>535</title>
|
||||||
|
<ellipse fill="none" stroke="black" cx="97" cy="-570" rx="27" ry="18"/>
|
||||||
|
<text text-anchor="middle" x="97" y="-566.3" font-family="Times,serif" font-size="14.00">535</text>
|
||||||
|
</g>
|
||||||
|
<!-- 535->163 -->
|
||||||
|
<g id="edge8" class="edge"><title>535->163</title>
|
||||||
|
<path fill="none" stroke="black" d="M97.3213,-551.969C97.6344,-535.378 98.1154,-509.883 98.4824,-490.431"/>
|
||||||
|
<polygon fill="black" stroke="black" points="101.983,-490.405 98.6728,-480.341 94.9847,-490.273 101.983,-490.405"/>
|
||||||
|
<text text-anchor="middle" x="100" y="-512.3" font-family="Times,serif" font-size="14.00">i</text>
|
||||||
|
</g>
|
||||||
|
<!-- 72 -->
|
||||||
|
<g id="node11" class="node"><title>72</title>
|
||||||
|
<ellipse fill="none" stroke="black" cx="64" cy="-711" rx="27" ry="18"/>
|
||||||
|
<text text-anchor="middle" x="64" y="-707.3" font-family="Times,serif" font-size="14.00">72</text>
|
||||||
|
</g>
|
||||||
|
<!-- 72->742 -->
|
||||||
|
<g id="edge9" class="edge"><title>72->742</title>
|
||||||
|
<path fill="none" stroke="black" d="M48.9733,-695.577C43.72,-689.693 38.3127,-682.509 35,-675 31.9218,-668.023 30.1274,-660.014 29.0997,-652.479"/>
|
||||||
|
<polygon fill="black" stroke="black" points="32.561,-651.913 28.1034,-642.301 25.5943,-652.595 32.561,-651.913"/>
|
||||||
|
<text text-anchor="middle" x="55" y="-663.8" font-family="Times,serif" font-size="14.00">epsilon</text>
|
||||||
|
</g>
|
||||||
|
<!-- 72->535 -->
|
||||||
|
<g id="edge10" class="edge"><title>72->535</title>
|
||||||
|
<path fill="none" stroke="black" d="M69.5886,-693.36C71.4334,-687.582 73.4165,-681.038 75,-675 81.8087,-649.039 87.9814,-619.011 92.0434,-597.915"/>
|
||||||
|
<polygon fill="black" stroke="black" points="95.4896,-598.527 93.9144,-588.05 88.6122,-597.223 95.4896,-598.527"/>
|
||||||
|
<text text-anchor="middle" x="98" y="-663.8" font-family="Times,serif" font-size="14.00">epsilon</text>
|
||||||
|
</g>
|
||||||
|
<!-- 72->72 -->
|
||||||
|
<g id="edge14" class="edge"><title>72->72</title>
|
||||||
|
<path fill="none" stroke="black" d="M88.5325,-718.752C99.5078,-719.49 109,-716.906 109,-711 109,-707.032 104.715,-704.563 98.5743,-703.594"/>
|
||||||
|
<polygon fill="black" stroke="black" points="98.6472,-700.095 88.5325,-703.248 98.4059,-707.091 98.6472,-700.095"/>
|
||||||
|
<text text-anchor="middle" x="121" y="-707.3" font-family="Times,serif" font-size="14.00">start</text>
|
||||||
|
</g>
|
||||||
|
<!-- 591->344 -->
|
||||||
|
<g id="edge12" class="edge"><title>591->344</title>
|
||||||
|
<path fill="none" stroke="black" d="M36.2231,-86.8641C35.7974,-76.9855 36.2974,-64.4855 40,-54 41.4744,-49.8245 43.6327,-45.7297 46.0853,-41.8967"/>
|
||||||
|
<polygon fill="black" stroke="black" points="49.1262,-43.6649 52.1588,-33.5134 43.4575,-39.5581 49.1262,-43.6649"/>
|
||||||
|
<text text-anchor="middle" x="60" y="-57.8" font-family="Times,serif" font-size="14.00">epsilon</text>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
</svg>
|
After Width: | Height: | Size: 9.3 KiB |
Loading…
Reference in New Issue