Lexer works

This commit is contained in:
Niles Rogoff 2017-03-24 16:47:40 -04:00
parent 852c3a30b0
commit b389d28a0d
No known key found for this signature in database
GPG Key ID: B78B908F23430F80
3 changed files with 226 additions and 5 deletions

View File

@ -9,14 +9,74 @@ whitespace = " +\n+\f+\r+\t+\v"
any_char = digit + "+" + letter + "+" + whitespace any_char = digit + "+" + letter + "+" + whitespace
any_string = "("+any_char+")*" any_string = "("+any_char+")*"
integer = nfa.compile("-?(" + digit + ")*") # may or may not be negative integer = nfa.compile("(" + digit + ")*") # may not be negative because my "-?" at the beginning broke everything
integer.type = "integer"
identifier = nfa.compile("(" + letter + ")(" + letter + "+" + digit + ")*") # letter followed by any number of letters or digits identifier = nfa.compile("(" + letter + ")(" + letter + "+" + digit + ")*") # letter followed by any number of letters or digits
identifier.type = "identifier"
string = nfa.compile("(\"(" + any_string + ")\")+('("+any_string+")')") string = nfa.compile("(\"(" + any_string + ")\")+('("+any_string+")')")
string.type = "string"
comment = nfa.compile("--(" + any_char + ")*") comment = nfa.compile("--(" + any_char + ")*")
comment.type = "comment"
keyword = nfa.compile("+".join(["class", "else", "false", "fi", "if", "in", "inherits", "isvoid", "let", "loop", "pool", "then", "while", "case", "esac", "new", "of", "not", "true"])) keyword = nfa.compile("+".join(["class", "else", "false", "fi", "if", "in", "inherits", "isvoid", "let", "loop", "pool", "then", "while", "case", "esac", "new", "of", "not", "true"]))
keyword.type = "keyword"
assign = nfa.compile("<-") assign = nfa.compile("<-")
assign.type = "assign"
relop = nfa.compile("+".join(["<", "<=", ">", ">=", "=", "<>"])) relop = nfa.compile("+".join(["<", "<=", ">", ">=", "=", "<>"]))
relop.type = "relop"
semicolon = nfa.compile(";")
semicolon.type = "semicolon"
whitespace_nfa = nfa.compile(whitespace)
whitespace_nfa.type = "whitespace_nfa"
parens = nfa.either(nfa.build_from_char("("), nfa.build_from_char(")"))
parens.type = "parens"
# print("Integer's regex was " + integer.orig)
# print("string's regex was " + string.orig) test_data = """
#nfa.pmap(identifier) if x = y then
x <- 10;
else
x <- 20;
print("string literal test");
fi
"""
#print(nfa.match(keyword, "if"))
#print(nfa.match(nfa.compile("if+and"), "if"))
#print(nfa.match(integer, "10"))
#nfa.pmap(nfa.compile("if+and"))
#sys.exit(0)
class token():
    """A single lexeme produced by the lexer.

    Attributes:
        matched_string: the exact slice of input text covered by this token.
        type: name of the token category, or ``False`` while it is unset.
    """

    def __init__(self, matched_string="", token_type=False):
        """Create a token.

        Both arguments are optional so that ``token()`` keeps producing an
        empty, untyped token exactly as before.

        Args:
            matched_string: text covered by the token (default ``""``).
            token_type: category name stored in ``self.type`` (default ``False``).
        """
        self.matched_string = matched_string
        self.type = token_type

    def __repr__(self):
        """Return a debugging representation showing the text and category."""
        return "token({!r}, {!r})".format(self.matched_string, self.type)
def lex(data):
    """Split ``data`` into a list of tokens.

    At every input position the regexes are tried in priority order. For each
    regex the longest slice starting at the current position is tested first
    and shortened one character at a time until ``nfa.match`` accepts it. The
    first regex that accepts any slice produces the token, and scanning then
    restarts from the highest-priority regex at the new position.

    Args:
        data: the source text to tokenize.

    Returns:
        A list of ``token`` objects in input order (whitespace tokens
        included). If no regex matches at some position, a message is printed
        and an empty list is returned.
    """
    priority_order = [whitespace_nfa, comment, parens, semicolon, keyword, assign, relop, integer, string, identifier]
    done = []
    data_ptr = 0
    while data_ptr < len(data):
        one_matched = False
        for regex in priority_order:
            # Longest candidate first; never test the empty slice.
            for data_end in range(len(data), data_ptr, -1):
                considering = data[data_ptr:data_end]
                if nfa.match(regex, considering):
                    t = token()
                    t.matched_string = considering
                    t.type = regex.type
                    done.append(t)
                    data_ptr = data_end
                    one_matched = True
                    break
            if one_matched:
                # Restart from the top of priority_order for the next token.
                # Without this, lower-priority regexes would get first pick at
                # the new position (e.g. a keyword following "<-" would be
                # tagged as an identifier).
                break
        if not one_matched:
            # `considering` is the single character at data_ptr at this point.
            print("Nothing matched '" + considering + "', bailing out")
            return []
    return done
# Demo driver: tokenize the sample program and report every token except
# whitespace.
for tkn in lex(test_data):
    if tkn.type == "whitespace_nfa":
        continue
    print("token was '" + tkn.matched_string + "' of type " + tkn.type)

6
nfa.py
View File

@ -4,6 +4,7 @@ class field(): # a "field" has a list of nodes and a starting node, that's it
def __init__(self): def __init__(self):
self.nodes = set() self.nodes = set()
self.start = False self.start = False
self.orig = "not compiled"
# a node is either terminal (accepting) or not, and it has a list of possible moves. # a node is either terminal (accepting) or not, and it has a list of possible moves.
# the moves are usually indexed by a character, so my_nfa.moves['a'] will return another nfa # the moves are usually indexed by a character, so my_nfa.moves['a'] will return another nfa
# ε is special because there can be more than one ε moves, and they don't consume a character of input # ε is special because there can be more than one ε moves, and they don't consume a character of input
@ -119,6 +120,7 @@ def match(f, inp):
if len(states) == 0: # if there are no states, we can't possibly end up at a terminal state so just stop reading if len(states) == 0: # if there are no states, we can't possibly end up at a terminal state so just stop reading
return False return False
# now we've consumed all the input. If any of the states we are in are accepting states, it matched, otherwise return false # now we've consumed all the input. If any of the states we are in are accepting states, it matched, otherwise return false
states = epsilon_closure(states) # expand into epsilon connected states
for state in states: for state in states:
if state.terminal: if state.terminal:
return True return True
@ -173,7 +175,9 @@ def compile(regex):
elif regex[i] == "*": # if we find a *, iterate the last thing on the stack, which might have been a subregex (and that's ok) elif regex[i] == "*": # if we find a *, iterate the last thing on the stack, which might have been a subregex (and that's ok)
to_concat[-1] = iterate(to_concat[-1]) to_concat[-1] = iterate(to_concat[-1])
elif regex[i] == "+": # kind of a hack and gives + the highest possible operator precedence elif regex[i] == "+": # kind of a hack and gives + the highest possible operator precedence
return either(list_to_field(to_concat), compile(regex[i+1:])) ret = either(list_to_field(to_concat), compile(regex[i+1:]))
ret.orig = regex
return ret
elif regex[i] == "?": # COMPLETELY UNTESTED elif regex[i] == "?": # COMPLETELY UNTESTED
to_concat.append(zero_or_more(to_concat[-1])) to_concat.append(zero_or_more(to_concat[-1]))
else: # if we just found a regular character, add it to the stuff to concatenate else: # if we just found a regular character, add it to the stuff to concatenate

157
temp.svg Normal file
View File

@ -0,0 +1,157 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<!-- Generated by graphviz version 2.38.0 (20140413.2041)
-->
<!-- Title: test Pages: 1 -->
<svg width="165pt" height="737pt"
viewBox="0.00 0.00 165.00 737.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 733)">
<title>test</title>
<polygon fill="white" stroke="none" points="-4,4 -4,-733 161,-733 161,4 -4,4"/>
<!-- 742 -->
<g id="node1" class="node"><title>742</title>
<ellipse fill="none" stroke="black" cx="28" cy="-624" rx="27" ry="18"/>
<text text-anchor="middle" x="28" y="-620.3" font-family="Times,serif" font-size="14.00">742</text>
</g>
<!-- 812 -->
<g id="node2" class="node"><title>812</title>
<ellipse fill="none" stroke="black" cx="27" cy="-516" rx="27" ry="18"/>
<text text-anchor="middle" x="27" y="-512.3" font-family="Times,serif" font-size="14.00">812</text>
</g>
<!-- 742&#45;&gt;812 -->
<g id="edge1" class="edge"><title>742&#45;&gt;812</title>
<path fill="none" stroke="black" d="M27.8393,-605.969C27.6828,-589.378 27.4423,-563.883 27.2588,-544.431"/>
<polygon fill="black" stroke="black" points="30.7579,-544.307 27.1636,-534.341 23.7582,-544.373 30.7579,-544.307"/>
<text text-anchor="middle" x="31.5" y="-566.3" font-family="Times,serif" font-size="14.00">a</text>
</g>
<!-- 240 -->
<g id="node3" class="node"><title>240</title>
<ellipse fill="none" stroke="black" cx="27" cy="-408" rx="27" ry="18"/>
<text text-anchor="middle" x="27" y="-404.3" font-family="Times,serif" font-size="14.00">240</text>
</g>
<!-- 812&#45;&gt;240 -->
<g id="edge2" class="edge"><title>812&#45;&gt;240</title>
<path fill="none" stroke="black" d="M24.6569,-497.944C23.9798,-492.249 23.3339,-485.862 23,-480 22.0902,-464.026 22.0902,-459.974 23,-444 23.1461,-441.435 23.3519,-438.77 23.5939,-436.107"/>
<polygon fill="black" stroke="black" points="27.0856,-436.369 24.6569,-426.056 20.1244,-435.633 27.0856,-436.369"/>
<text text-anchor="middle" x="43" y="-458.3" font-family="Times,serif" font-size="14.00">epsilon</text>
</g>
<!-- 635 -->
<g id="node4" class="node"><title>635</title>
<ellipse fill="none" stroke="black" cx="27" cy="-300" rx="27" ry="18"/>
<text text-anchor="middle" x="27" y="-296.3" font-family="Times,serif" font-size="14.00">635</text>
</g>
<!-- 240&#45;&gt;635 -->
<g id="edge3" class="edge"><title>240&#45;&gt;635</title>
<path fill="none" stroke="black" d="M27,-389.969C27,-373.378 27,-347.883 27,-328.431"/>
<polygon fill="black" stroke="black" points="30.5001,-328.341 27,-318.341 23.5001,-328.341 30.5001,-328.341"/>
<text text-anchor="middle" x="30.5" y="-350.3" font-family="Times,serif" font-size="14.00">n</text>
</g>
<!-- 957 -->
<g id="node7" class="node"><title>957</title>
<ellipse fill="none" stroke="black" cx="35" cy="-192" rx="27" ry="18"/>
<text text-anchor="middle" x="35" y="-188.3" font-family="Times,serif" font-size="14.00">957</text>
</g>
<!-- 635&#45;&gt;957 -->
<g id="edge5" class="edge"><title>635&#45;&gt;957</title>
<path fill="none" stroke="black" d="M23.9446,-281.958C21.8521,-267.503 19.9555,-246.267 23,-228 23.4851,-225.089 24.1825,-222.096 25.0004,-219.144"/>
<polygon fill="black" stroke="black" points="28.3903,-220.035 28.1119,-209.444 21.7248,-217.897 28.3903,-220.035"/>
<text text-anchor="middle" x="43" y="-242.3" font-family="Times,serif" font-size="14.00">epsilon</text>
</g>
<!-- 163 -->
<g id="node5" class="node"><title>163</title>
<ellipse fill="none" stroke="black" cx="99" cy="-462" rx="27" ry="18"/>
<text text-anchor="middle" x="99" y="-458.3" font-family="Times,serif" font-size="14.00">163</text>
</g>
<!-- 401 -->
<g id="node6" class="node"><title>401</title>
<ellipse fill="none" stroke="black" cx="99" cy="-354" rx="27" ry="18"/>
<text text-anchor="middle" x="99" y="-350.3" font-family="Times,serif" font-size="14.00">401</text>
</g>
<!-- 163&#45;&gt;401 -->
<g id="edge4" class="edge"><title>163&#45;&gt;401</title>
<path fill="none" stroke="black" d="M99,-443.969C99,-427.378 99,-401.883 99,-382.431"/>
<polygon fill="black" stroke="black" points="102.5,-382.341 99,-372.341 95.5001,-382.341 102.5,-382.341"/>
<text text-anchor="middle" x="119" y="-404.3" font-family="Times,serif" font-size="14.00">epsilon</text>
</g>
<!-- 202 -->
<g id="node8" class="node"><title>202</title>
<ellipse fill="none" stroke="black" cx="99" cy="-246" rx="27" ry="18"/>
<text text-anchor="middle" x="99" y="-242.3" font-family="Times,serif" font-size="14.00">202</text>
</g>
<!-- 401&#45;&gt;202 -->
<g id="edge6" class="edge"><title>401&#45;&gt;202</title>
<path fill="none" stroke="black" d="M99,-335.969C99,-319.378 99,-293.883 99,-274.431"/>
<polygon fill="black" stroke="black" points="102.5,-274.341 99,-264.341 95.5001,-274.341 102.5,-274.341"/>
<text text-anchor="middle" x="101.5" y="-296.3" font-family="Times,serif" font-size="14.00">f</text>
</g>
<!-- 591 -->
<g id="node12" class="node"><title>591</title>
<ellipse fill="none" stroke="black" cx="38" cy="-105" rx="27" ry="18"/>
<text text-anchor="middle" x="38" y="-101.3" font-family="Times,serif" font-size="14.00">591</text>
</g>
<!-- 957&#45;&gt;591 -->
<g id="edge11" class="edge"><title>957&#45;&gt;591</title>
<path fill="none" stroke="black" d="M35.6071,-173.799C36.0178,-162.163 36.5689,-146.548 37.0387,-133.237"/>
<polygon fill="black" stroke="black" points="40.5388,-133.293 37.3938,-123.175 33.5432,-133.046 40.5388,-133.293"/>
<text text-anchor="middle" x="39.5" y="-144.8" font-family="Times,serif" font-size="14.00">d</text>
</g>
<!-- 344 -->
<g id="node9" class="node"><title>344</title>
<ellipse fill="none" stroke="black" cx="66" cy="-18" rx="27" ry="18"/>
<text text-anchor="middle" x="66" y="-14.3" font-family="Times,serif" font-size="14.00">344</text>
</g>
<!-- 202&#45;&gt;344 -->
<g id="edge7" class="edge"><title>202&#45;&gt;344</title>
<path fill="none" stroke="black" d="M98.8986,-227.867C98.3494,-194.105 95.3561,-116.797 80,-54 79.2621,-50.9823 78.3165,-47.8804 77.2655,-44.8292"/>
<polygon fill="black" stroke="black" points="80.5104,-43.5152 73.6749,-35.4203 73.9704,-46.011 80.5104,-43.5152"/>
<text text-anchor="middle" x="115" y="-144.8" font-family="Times,serif" font-size="14.00">epsilon</text>
</g>
<!-- 344&#45;&gt;344 -->
<g id="edge13" class="edge"><title>344&#45;&gt;344</title>
<path fill="none" stroke="black" d="M90.5325,-25.752C101.508,-26.4902 111,-23.9062 111,-18 111,-14.0317 106.715,-11.5632 100.574,-10.5944"/>
<polygon fill="black" stroke="black" points="100.647,-7.09491 90.5325,-10.248 100.406,-14.0908 100.647,-7.09491"/>
<text text-anchor="middle" x="134" y="-14.3" font-family="Times,serif" font-size="14.00">terminal</text>
</g>
<!-- 535 -->
<g id="node10" class="node"><title>535</title>
<ellipse fill="none" stroke="black" cx="97" cy="-570" rx="27" ry="18"/>
<text text-anchor="middle" x="97" y="-566.3" font-family="Times,serif" font-size="14.00">535</text>
</g>
<!-- 535&#45;&gt;163 -->
<g id="edge8" class="edge"><title>535&#45;&gt;163</title>
<path fill="none" stroke="black" d="M97.3213,-551.969C97.6344,-535.378 98.1154,-509.883 98.4824,-490.431"/>
<polygon fill="black" stroke="black" points="101.983,-490.405 98.6728,-480.341 94.9847,-490.273 101.983,-490.405"/>
<text text-anchor="middle" x="100" y="-512.3" font-family="Times,serif" font-size="14.00">i</text>
</g>
<!-- 72 -->
<g id="node11" class="node"><title>72</title>
<ellipse fill="none" stroke="black" cx="64" cy="-711" rx="27" ry="18"/>
<text text-anchor="middle" x="64" y="-707.3" font-family="Times,serif" font-size="14.00">72</text>
</g>
<!-- 72&#45;&gt;742 -->
<g id="edge9" class="edge"><title>72&#45;&gt;742</title>
<path fill="none" stroke="black" d="M48.9733,-695.577C43.72,-689.693 38.3127,-682.509 35,-675 31.9218,-668.023 30.1274,-660.014 29.0997,-652.479"/>
<polygon fill="black" stroke="black" points="32.561,-651.913 28.1034,-642.301 25.5943,-652.595 32.561,-651.913"/>
<text text-anchor="middle" x="55" y="-663.8" font-family="Times,serif" font-size="14.00">epsilon</text>
</g>
<!-- 72&#45;&gt;535 -->
<g id="edge10" class="edge"><title>72&#45;&gt;535</title>
<path fill="none" stroke="black" d="M69.5886,-693.36C71.4334,-687.582 73.4165,-681.038 75,-675 81.8087,-649.039 87.9814,-619.011 92.0434,-597.915"/>
<polygon fill="black" stroke="black" points="95.4896,-598.527 93.9144,-588.05 88.6122,-597.223 95.4896,-598.527"/>
<text text-anchor="middle" x="98" y="-663.8" font-family="Times,serif" font-size="14.00">epsilon</text>
</g>
<!-- 72&#45;&gt;72 -->
<g id="edge14" class="edge"><title>72&#45;&gt;72</title>
<path fill="none" stroke="black" d="M88.5325,-718.752C99.5078,-719.49 109,-716.906 109,-711 109,-707.032 104.715,-704.563 98.5743,-703.594"/>
<polygon fill="black" stroke="black" points="98.6472,-700.095 88.5325,-703.248 98.4059,-707.091 98.6472,-700.095"/>
<text text-anchor="middle" x="121" y="-707.3" font-family="Times,serif" font-size="14.00">start</text>
</g>
<!-- 591&#45;&gt;344 -->
<g id="edge12" class="edge"><title>591&#45;&gt;344</title>
<path fill="none" stroke="black" d="M36.2231,-86.8641C35.7974,-76.9855 36.2974,-64.4855 40,-54 41.4744,-49.8245 43.6327,-45.7297 46.0853,-41.8967"/>
<polygon fill="black" stroke="black" points="49.1262,-43.6649 52.1588,-33.5134 43.4575,-39.5581 49.1262,-43.6649"/>
<text text-anchor="middle" x="60" y="-57.8" font-family="Times,serif" font-size="14.00">epsilon</text>
</g>
</g>
</svg>

After

Width:  |  Height:  |  Size: 9.3 KiB