From 837b302d0f57a10f9f423d0aeaea244f37adf231 Mon Sep 17 00:00:00 2001
From: Niles Rogoff
Date: Sat, 25 Mar 2017 14:55:36 -0400
Subject: [PATCH] Parser doesn't use global variables

---
 nfa.py    | 10 +++++--
 parser.py | 86 ++++++++++++++++++++++++++++++++-----------------------
 2 files changed, 58 insertions(+), 38 deletions(-)

diff --git a/nfa.py b/nfa.py
index b7b58a7..d20b9c0 100644
--- a/nfa.py
+++ b/nfa.py
@@ -126,7 +126,7 @@ def match(f, inp):
             return True
     return False
 
-
+# takes a list of fields and returns a new field with all of the list's fields concatenated together
 def list_to_field(l):
     if len(l) == 0: # this base case shouldn't be hit unless you have an empty regex or start your regex with a + or something
         # all it does is make a field with one terminal node in it
@@ -141,6 +141,11 @@ def list_to_field(l):
     for k in l[1:]:
         final = concatenate(final, k)
     return final
+
+# used for ?
+# start -> (passed field) --ε--> (node) <- that one is terminal
+#      \_______________ε____/
+# adds an ε move from the passed field's start node to the new end node, and also adds one from each of the passed field's terminal nodes
 def zero_or_one(f):
     f2 = field()
     f2.nodes = f.nodes
@@ -154,6 +159,7 @@ def zero_or_one(f):
         f2.nodes.add(n)
         f2.start.moves['ε'].add(n)
     return f2
+# takes a regex like "ab(cd*)+f" and makes an nfa field out of it
 def compile(regex):
     to_concat = [] # empty list of things to concatenate
     inparens = False # parenthesis parsing stuff
@@ -178,7 +184,7 @@ def compile(regex):
             ret = either(list_to_field(to_concat), compile(regex[i+1:]))
             ret.orig = regex
             return ret
-        elif regex[i] == "?": # COMPLETELY UNTESTED
+        elif regex[i] == "?":
             to_concat[-1] = zero_or_one(to_concat[-1])
         else: # if we just found a regular character, add it to the stuff to concatenate
             to_concat.append(build_from_char(regex[i]))
diff --git a/parser.py b/parser.py
index d4ccf24..651adf2 100644
--- a/parser.py
+++ b/parser.py
@@ -1,17 +1,54 @@
 import sys
 sys.path.append(".")
 import lexer
-def term(t_type, literal = False):
-    global tokens_ptr, tokens
-    this_token = tokens[tokens_ptr]
-    tokens_ptr += 1
-    print("attempting to match '" + str(literal) + "' ("+t_type+") to " + this_token.matched_string + " at position " + str(tokens_ptr - 1))
-    if t_type != this_token.type:
-        return False
-    if not literal:
-        return True
-    return literal == this_token.matched_string
+# I know I really shouldn't be using a class for this, but tokens and tokens_ptr used to be global variables and I didn't feel like rewriting everything to pass them around, so now they're technically instance variables
+
+class parser():
+    def __init__(self, grammar):
+        self.grammar = grammar
+    def parse(self, inp):
+        self.tokens = lexer.lex(inp)
+        self.tokens = [t for t in self.tokens if not t.type == "whitespace_nfa"]
+        self.tokens_ptr = 0
+        return self.match_nterm(self.grammar["start"])
+
+    def term(self, t_type, literal = False):
+        this_token = self.tokens[self.tokens_ptr]
+        self.tokens_ptr += 1
+        print("attempting to match '" + str(literal) + "' ("+t_type+") to " + this_token.matched_string + " at position " + str(self.tokens_ptr - 1))
+        if t_type != this_token.type:
+            return False
+        if not literal:
+            return True
+        return literal == this_token.matched_string
+
+    def match_syms(self, syms):
+        # return term(a) and term(b) and term(c)
+        for sym in syms:
+            if not self.match_sym(sym):
+                return False
+        return True
+
+    def match_sym(self, sym):
+        if sym[0] == "terminal":
+            return self.term(*(sym[1]))
+        return self.match_nterm(sym[1])
+
+    def match_nterm(self, nterm):
+        save = self.tokens_ptr
+        for f in self.grammar[nterm]:
+            self.tokens_ptr = save
+            if self.match_syms(f):
+                return True
+        return False
+
+# Our productions for this context-free grammar
+# E -> T + E
+#    | T
+# T -> int * T
+#    | int
+#    | ( E )
 
 grammar = {
     "e": [
         [["nonterminal", "t"], ["terminal", ["mathbinop", "+"]], ["nonterminal", "e"]],
@@ -22,31 +59,8 @@ grammar = {
         [["terminal", ["integer"]]],
         [["terminal", ["parens", "("]], ["nonterminal", "e"], ["terminal", ["parens", ")"]]],
     ],
-    "order": ["e", "t"]
+    "start": "e",
 }
 
-def match_syms(syms):
-    # return term(a) and term(b) and term(c)
-    for sym in syms:
-        if not match_sym(sym):
-            return False
-    return True
-
-def match_sym(sym):
-    if sym[0] == "terminal":
-        return term(*(sym[1]))
-    return match_nterm(sym[1])
-
-def match_nterm(nterm):
-    global tokens_ptr
-    save = tokens_ptr
-    for f in grammar[nterm]:
-        tokens_ptr = save
-        if match_syms(f):
-            return True
-    return False
-
-tokens = lexer.lex("(10+1)*3")
-tokens_ptr = 0
-print(match_nterm("e"))
-print(tokens_ptr)
+p = parser(grammar)
+print(p.parse("(10 + (99 * 44))*3 + 1231"))
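
Now that ? is handled in compile() (the patch drops the old COMPLETELY UNTESTED marker), a quick way to exercise zero_or_one is to compile a regex containing ? and match strings with and without the optional character. A minimal sketch, assuming nfa.py is importable from the repository root; the expected results are what the ε-bypass construction should give, not verified output:

    import nfa

    f = nfa.compile("colou?r")       # ? makes the preceding u optional via an ε bypass
    print(nfa.match(f, "color"))     # expected True
    print(nfa.match(f, "colour"))    # expected True
    print(nfa.match(f, "colouur"))   # expected False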
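
On the parser side, the only thing parse() and term() assume about lexer.lex() is that it returns token objects exposing .type and .matched_string, with types like integer, mathbinop, parens and whitespace_nfa (the names used in the grammar and in the whitespace filter). The real lexer.py is not part of this patch; for poking at the parser class in isolation, a rough regex-based stand-in could look like the sketch below. The tok class and this lex() are hypothetical, not the repository's lexer:

    # lexer_stub.py -- hypothetical stand-in, NOT the repository's NFA-based lexer.py
    import re

    class tok:
        # mirrors the two attributes parser.term() and parser.parse() read
        def __init__(self, type, matched_string):
            self.type = type
            self.matched_string = matched_string

    def lex(inp):
        # token type names taken from the grammar and the whitespace filter in parse()
        spec = [("integer", r"\d+"), ("mathbinop", r"[-+*]"),
                ("parens", r"[()]"), ("whitespace_nfa", r"\s+")]
        out = []
        i = 0
        while i < len(inp):
            for name, pattern in spec:
                m = re.match(pattern, inp[i:])
                if m:
                    out.append(tok(name, m.group(0)))
                    i += m.end()
                    break
            else:
                raise ValueError("no token matches at: " + inp[i:])
        return out

If the real lexer is unavailable, saving this as a module importable as lexer should be enough to let the parse call at the bottom of the patch run.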
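
The grammar itself is plain Python data: each nonterminal maps to a list of productions, each production is a list of ["terminal", [token_type, optional_literal]] or ["nonterminal", name] symbols, and match_nterm tries the productions in order. Extending the language is therefore a data edit rather than a code edit. A sketch, assuming the lexer also emits "-" as a mathbinop token (only "+" and "*" appear in the patch); it could be appended at the bottom of parser.py or run in a REPL after importing it:

    # hypothetical production E -> T - E, tried after T + E but before the bare T alternative
    grammar["e"].insert(1, [
        ["nonterminal", "t"],
        ["terminal", ["mathbinop", "-"]],
        ["nonterminal", "e"],
    ])

    p = parser(grammar)
    print(p.parse("(10 - 2) * 5"))

As with the example in the patch, parse() reports whether some prefix of the token stream derives from the start symbol; it does not require the whole input to be consumed.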