pycoolc/nfa.py

import random # only used for nfa id, which is only used by pmap
epsilon_literal = 'ε'
class field(): # a "field" has a list of nodes and a starting node, that's it
    def __init__(self):
        self.nodes = set()
        self.start = False
        self.orig = "not compiled"
# a node is either terminal (accepting) or not, and it has a list of possible moves. 
# the moves are usually indexed by a character, so my_nfa.moves['a'] will return another nfa
# ε is special because there can be more than one ε moves, and they don't consume a character of input
# so my_nfa.moves['ε'] will actually return a set of nodes
class nfa():
    def __init__(self):
        self.terminal = False
        self.moves = {'ε': set()}
        self.id = random.randrange(1000)
# makes a super simple field with a start node and a terminal node. The start node will move to the end node but only if it's fed the character we were passed.
# it puts them in a field and sets the start node of the field to the start node
def build_from_char(c):
    base = nfa()
    end = nfa()
    end.terminal = True
    base.moves[c] = end
    f = field()
    f.start = base
    f.nodes = set([base, end])
    return f
# this takes a field and allows that field to be repeated zero or more times.
#                _______________ε_______________
#               /                               \ 
#              /                                 v
# start -> (node) --ε--> (passed field) --ε--> (node) <- that one is terminal
#              ^             /
#               \_____ε_____/
# 
# 
def iterate(f):
    new_f = field()
    outer_start = nfa()
    outer_end = nfa()
    for node in f.nodes:
        if node.terminal:
            node.moves['ε'].add(outer_end)
            node.moves['ε'].add(outer_start)
            node.terminal = False
    for k in f.nodes:
        new_f.nodes.add(k)
    new_f.nodes.add(outer_start)
    new_f.nodes.add(outer_end)
    outer_end.terminal = True
    outer_start.moves['ε'] = set([f.start, outer_end])
    new_f.start = outer_start
    return new_f
# This allows either token in the language
#                .--ε--> (a) --ε--.
#               /                  \
# start -> (node)                 (end) <- that one is terminal
#               \                  /
#                `--ε--> (b) --ε--`
#  
def either(a, b):
    f = field()
    f.nodes = a.nodes
    for k in b.nodes:
        f.nodes.add(k)
    new_start = nfa()
    new_start.moves['ε'].add(a.start)
    new_start.moves['ε'].add(b.start)
    new_end = nfa()
    new_end.terminal = True
    for node in (a.nodes | b.nodes):
        if node.terminal:
            node.terminal = False
            node.moves['ε'].add(new_end)
    f.start = new_start
    f.nodes.add(new_start)
    f.nodes.add(new_end)
    return f
# returns a field with a then b
# start -> (a) --ε--> (b) <-- that one has terminal nodes
def concatenate(a, b):
    f = field()
    f.start = a.start
    for node in a.nodes:
        f.nodes.add(node)
        if node.terminal:
            node.terminal = False
            node.moves['ε'].add(b.start)
    for node in b.nodes:
        f.nodes.add(node)
    return f
# takes a set of nodes, returns a set of those nodes and all nodes connected to those nodes by epsilon moves
# uses an ignore list for its recursive calls so it won't loop infinitely
def epsilon_closure(s, ignore=False):
    new_s = set(s) # we can't modify the set we're iterating over or the program will explode, so we return a new set instead of modifying s
    if not ignore:
        ignore = new_s
    for state in s:
        for possible_move in state.moves['ε']: # for all possible ε moves in all the moves we're expanding over, recurse
            if possible_move in ignore:
                continue
            for m in epsilon_closure(set([possible_move]),(set([possible_move]) | ignore)):
                new_s.add(m)
    return new_s

# determines if a compiled regular expression matches a string
def match(f, inp):
    states = set([f.start])
    idx = 0
    while idx < len(inp):
        states = epsilon_closure(states) # expand into epsilon connected states
        new_states = set() # the states we will be in after we consume the input
        c = inp[idx] # the character we're taking in right now
        for state in states:
            for key, move in state.moves.items():
                if key == c:
                    new_states.add(move) # add this move to new_states if we could get there from one of our old states by consuming that exact character
        states = new_states; # prepare to do it all over again
        idx += 1
        if len(states) == 0: # if there are no states, we can't possibly end up at a terminal state so just stop reading
            return False
    # now we've consumed all the input. If any of the states we are in are accepting states, it matched, otherwise return false
    states = epsilon_closure(states) # expand into epsilon connected states
    for state in states:
        if state.terminal:
            return True
    return False

# takes a list of fields and returns a new field with all of the lists fields concatenated together
def list_to_field(l):
    if len(l) == 0: # this base case shouldn't be hit unless you have an empty regex or start your regex with a + or something
        # all it does is make a field with one terminal node in it
        f = field()
        n = nfa()
        n.terminal = True
        f.start = n
        f.nodes = set([n])
        return f
    # just maps concatenate over all the things in the list and returns the giant field at the end
    final = l[0]
    for k in l[1:]:
        final = concatenate(final, k)
    return final

# used for ?
# start -> (passed field) --ε--> (node) <- that one is terminal
#           \_______________ε____/
# adds an ε move to the passed field's start node to the new end node, as well as adding it to each of passed field's terminal nodes
def zero_or_one(f):
    f2 = field()
    f2.nodes = f.nodes
    f2.start = f.start
    n = nfa()
    n.terminal = True
    for k in f.nodes:
        if k.terminal:
            k.terminal = False
            k.moves['ε'].add(n)
    f2.nodes.add(n)
    f2.start.moves['ε'].add(n)
    return f2
# takes a regex like "ab(cd*)+f" and makes an nfa field out of it
def compile(regex):
    to_concat = [] # empty list of things to concatenate
    inparens = False # parenthesis parsing stuff
    for i in range(len(regex)):
        if inparens: # if we're in parenthesis
            if regex[i] == ")": # and we find a close parenthesis
                if inparens == 1: # and we're now at the same level we started at
                    to_concat.append(compile(subregex)) # compile the stuff that was in parenthesis and add it to the list
                    inparens = False # and we're no longer in parenthesis
                else:
                    inparens -= 1 # otherwise decrease parens level
            if regex[i] == "(": # increase parens level if we find a (
                inparens += 1
            subregex += regex[i] # add the character to the string we will compile when we reach the end of the highest set of parenthesis
        elif regex[i] == "(": # if we weren't in parens and we find a "(", enter parenthesis
            inparens = 1
            subregex = ""
            continue
        elif regex[i] == "*": # if we find a *, iterate the last thing on the stack, which might have been a subregex (and that's ok)
            to_concat[-1] = iterate(to_concat[-1])
        elif regex[i] == "+": # kind of a hack and gives + the highest possible operator precedence
            ret = either(list_to_field(to_concat), compile(regex[i+1:]))
            ret.orig = regex
            return ret
        elif regex[i] == "?":
            to_concat[-1] = zero_or_one(to_concat[-1])
        else: # if we just found a regular character, add it to the stuff to concatenate
            to_concat.append(build_from_char(regex[i]))
    ret = list_to_field(to_concat)
    ret.orig = regex
    return ret

def addr(node): # this used to be a hack that would print the memory address of the node starting with a _ so graphviz didn't split it at the end of the number
    return str(node.id)
def pmap(f): # prints out the passed field in a way that dot can compile to an svg
    print("digraph test {")
    for node in f.nodes:
        for char, move in node.moves.items():
            if char == 'ε':
                for m in move:
                    print(addr(node) + " -> " + addr(m) + " [label=epsilon]")
            else:
                print(addr(node) + " -> " + addr(move) + " [label="+char+"]")
        if node.terminal:
            print(addr(node) + " -> " + addr(node) + " [label=terminal]")
    print(addr(f.start) + " -> " + addr(f.start) + " [label=start]")
    print("}")


# DEBUG

# BAD CODE - WILL BREAK EVERYTHING
# a = build_from_char('a')
# b = build_from_char('b')
# full = concatenate(a, b)
# full2 = concatenate(iterate(a), b)
# The reason you can't do this is because concatenate is a destructive operation (unsets terminal and adds moves to previously terminal nodes on a)
# so iterate(a) in the definition of full2 will set all terminal nodes of a to the new terminal node of iterate, but there are no terminal nodes of a so you get a mess. It will actually continue being connected to b as part of full
# End bad code

# this works
# full = concatenate(build_from_char('a'), build_from_char('b'))
# full2 = concatenate(iterate(build_from_char('a')), build_from_char('b'))

# strings = ["aa", "ab", "ba", "bb", "aba", "aab", "aaab", "aaba", "b"]
# for s in strings:
    # print("String " + s + " " + ("matches" if match(full, s) else "does not match") + " pattern " + "ab")
# print()
# for s in strings:
    # print("String " + s + " " + ("matches" if match(full2, s) else "does not match") + " pattern " + "a*b")

#print(match(full2, "aab"))
#pmap(full2)
#pmap(zero_or_one(build_from_char("a")))

#pmap(compile("ab(c1+2d(e*f)d)*e"))
#pmap(either(build_from_char('a'), build_from_char('b')))
# x = compile("(1+0)*1")
# pmap(x)
# for s in ["101", "111110", "11001", "1", "0"]:
    # print(match(x, s))
# x = compile("a+b+c")
# pmap(x)
Add a ton of comments 2017-03-22 14:47:18 +01:00			`import random # only used for nfa id, which is only used by pmap`
initial commit 2017-03-22 13:53:06 +01:00			`epsilon_literal = 'ε'`
Add a ton of comments 2017-03-22 14:47:18 +01:00			`class field(): # a "field" has a list of nodes and a starting node, that's it`
initial commit 2017-03-22 13:53:06 +01:00			`def __init__(self):`
			`self.nodes = set()`
			`self.start = False`
Lexer works 2017-03-24 21:47:40 +01:00			`self.orig = "not compiled"`
Add a ton of comments 2017-03-22 14:47:18 +01:00			`# a node is either terminal (accepting) or not, and it has a list of possible moves.`
			`# the moves are usually indexed by a character, so my_nfa.moves['a'] will return another nfa`
			`# ε is special because there can be more than one ε moves, and they don't consume a character of input`
			`# so my_nfa.moves['ε'] will actually return a set of nodes`
initial commit 2017-03-22 13:53:06 +01:00			`class nfa():`
			`def __init__(self):`
			`self.terminal = False`
			`self.moves = {'ε': set()}`
			`self.id = random.randrange(1000)`
Add a ton of comments 2017-03-22 14:47:18 +01:00			`# makes a super simple field with a start node and a terminal node. The start node will move to the end node but only if it's fed the character we were passed.`
			`# it puts them in a field and sets the start node of the field to the start node`
initial commit 2017-03-22 13:53:06 +01:00			`def build_from_char(c):`
			`base = nfa()`
			`end = nfa()`
			`end.terminal = True`
			`base.moves[c] = end`
			`f = field()`
			`f.start = base`
			`f.nodes = set([base, end])`
			`return f`
Add a ton of comments 2017-03-22 14:47:18 +01:00			`# this takes a field and allows that field to be repeated zero or more times.`
			`# _______________ε_______________`
			`# / \`
			`# / v`
			`# start -> (node) --ε--> (passed field) --ε--> (node) <- that one is terminal`
			`# ^ /`
			`# \_____ε_____/`
			`#`
			`#`
initial commit 2017-03-22 13:53:06 +01:00			`def iterate(f):`
			`new_f = field()`
			`outer_start = nfa()`
			`outer_end = nfa()`
			`for node in f.nodes:`
			`if node.terminal:`
			`node.moves['ε'].add(outer_end)`
			`node.moves['ε'].add(outer_start)`
			`node.terminal = False`
			`for k in f.nodes:`
			`new_f.nodes.add(k)`
			`new_f.nodes.add(outer_start)`
			`new_f.nodes.add(outer_end)`
			`outer_end.terminal = True`
			`outer_start.moves['ε'] = set([f.start, outer_end])`
			`new_f.start = outer_start`
			`return new_f`
Add a ton of comments 2017-03-22 14:47:18 +01:00			`# This allows either token in the language`
			`# .--ε--> (a) --ε--.`
			`# / \`
			`# start -> (node) (end) <- that one is terminal`
			`# \ /`
			# `--ε--> (b) --ε--`
			`#`
initial commit 2017-03-22 13:53:06 +01:00			`def either(a, b):`
			`f = field()`
			`f.nodes = a.nodes`
			`for k in b.nodes:`
			`f.nodes.add(k)`
			`new_start = nfa()`
			`new_start.moves['ε'].add(a.start)`
			`new_start.moves['ε'].add(b.start)`
			`new_end = nfa()`
			`new_end.terminal = True`
add graphs 2017-03-22 14:49:56 +01:00			`for node in (a.nodes \| b.nodes):`
initial commit 2017-03-22 13:53:06 +01:00			`if node.terminal:`
			`node.terminal = False`
			`node.moves['ε'].add(new_end)`
			`f.start = new_start`
			`f.nodes.add(new_start)`
			`f.nodes.add(new_end)`
			`return f`
Add a ton of comments 2017-03-22 14:47:18 +01:00			`# returns a field with a then b`
			`# start -> (a) --ε--> (b) <-- that one has terminal nodes`
initial commit 2017-03-22 13:53:06 +01:00			`def concatenate(a, b):`
			`f = field()`
			`f.start = a.start`
			`for node in a.nodes:`
			`f.nodes.add(node)`
			`if node.terminal:`
			`node.terminal = False`
			`node.moves['ε'].add(b.start)`
			`for node in b.nodes:`
			`f.nodes.add(node)`
			`return f`
Add a ton of comments 2017-03-22 14:47:18 +01:00			`# takes a set of nodes, returns a set of those nodes and all nodes connected to those nodes by epsilon moves`
			`# uses an ignore list for its recursive calls so it won't loop infinitely`
use correct notation 2017-03-22 17:38:26 +01:00			`def epsilon_closure(s, ignore=False):`
Add a ton of comments 2017-03-22 14:47:18 +01:00			`new_s = set(s) # we can't modify the set we're iterating over or the program will explode, so we return a new set instead of modifying s`
			`if not ignore:`
			`ignore = new_s`
initial commit 2017-03-22 13:53:06 +01:00			`for state in s:`
Add a ton of comments 2017-03-22 14:47:18 +01:00			`for possible_move in state.moves['ε']: # for all possible ε moves in all the moves we're expanding over, recurse`
initial commit 2017-03-22 13:53:06 +01:00			`if possible_move in ignore:`
			`continue`
use correct notation 2017-03-22 17:38:26 +01:00			`for m in epsilon_closure(set([possible_move]),(set([possible_move]) \| ignore)):`
initial commit 2017-03-22 13:53:06 +01:00			`new_s.add(m)`
			`return new_s`

Add a ton of comments 2017-03-22 14:47:18 +01:00			`# determines if a compiled regular expression matches a string`
cleaned up 2017-03-22 14:04:22 +01:00			`def match(f, inp):`
initial commit 2017-03-22 13:53:06 +01:00			`states = set([f.start])`
			`idx = 0`
			`while idx < len(inp):`
use correct notation 2017-03-22 17:38:26 +01:00			`states = epsilon_closure(states) # expand into epsilon connected states`
Add a ton of comments 2017-03-22 14:47:18 +01:00			`new_states = set() # the states we will be in after we consume the input`
			`c = inp[idx] # the character we're taking in right now`
initial commit 2017-03-22 13:53:06 +01:00			`for state in states:`
			`for key, move in state.moves.items():`
			`if key == c:`
Add a ton of comments 2017-03-22 14:47:18 +01:00			`new_states.add(move) # add this move to new_states if we could get there from one of our old states by consuming that exact character`
			`states = new_states; # prepare to do it all over again`
initial commit 2017-03-22 13:53:06 +01:00			`idx += 1`
Add a ton of comments 2017-03-22 14:47:18 +01:00			`if len(states) == 0: # if there are no states, we can't possibly end up at a terminal state so just stop reading`
			`return False`
			`# now we've consumed all the input. If any of the states we are in are accepting states, it matched, otherwise return false`
Lexer works 2017-03-24 21:47:40 +01:00			`states = epsilon_closure(states) # expand into epsilon connected states`
initial commit 2017-03-22 13:53:06 +01:00			`for state in states:`
			`if state.terminal:`
			`return True`
			`return False`

Parser doesn't use global variables 2017-03-25 19:55:36 +01:00			`# takes a list of fields and returns a new field with all of the lists fields concatenated together`
initial commit 2017-03-22 13:53:06 +01:00			`def list_to_field(l):`
Add a ton of comments 2017-03-22 14:47:18 +01:00			`if len(l) == 0: # this base case shouldn't be hit unless you have an empty regex or start your regex with a + or something`
			`# all it does is make a field with one terminal node in it`
initial commit 2017-03-22 13:53:06 +01:00			`f = field()`
			`n = nfa()`
			`n.terminal = True`
			`f.start = n`
			`f.nodes = set([n])`
			`return f`
Add a ton of comments 2017-03-22 14:47:18 +01:00			`# just maps concatenate over all the things in the list and returns the giant field at the end`
initial commit 2017-03-22 13:53:06 +01:00			`final = l[0]`
			`for k in l[1:]:`
			`final = concatenate(final, k)`
			`return final`
Parser doesn't use global variables 2017-03-25 19:55:36 +01:00
			`# used for ?`
			`# start -> (passed field) --ε--> (node) <- that one is terminal`
			`# \_______________ε____/`
			`# adds an ε move to the passed field's start node to the new end node, as well as adding it to each of passed field's terminal nodes`
Fixed integers, fixed ? operator 2017-03-25 00:25:47 +01:00			`def zero_or_one(f):`
updated lexer definitions 2017-03-22 20:21:15 +01:00			`f2 = field()`
			`f2.nodes = f.nodes`
			`f2.start = f.start`
			`n = nfa()`
			`n.terminal = True`
			`for k in f.nodes:`
			`if k.terminal:`
			`k.terminal = False`
			`k.moves['ε'].add(n)`
			`f2.nodes.add(n)`
			`f2.start.moves['ε'].add(n)`
			`return f2`
Parser doesn't use global variables 2017-03-25 19:55:36 +01:00			`# takes a regex like "ab(cd*)+f" and makes an nfa field out of it`
Add a ton of comments 2017-03-22 14:47:18 +01:00			`def compile(regex):`
			`to_concat = [] # empty list of things to concatenate`
			`inparens = False # parenthesis parsing stuff`
initial commit 2017-03-22 13:53:06 +01:00			`for i in range(len(regex)):`
Add a ton of comments 2017-03-22 14:47:18 +01:00			`if inparens: # if we're in parenthesis`
			`if regex[i] == ")": # and we find a close parenthesis`
			`if inparens == 1: # and we're now at the same level we started at`
			`to_concat.append(compile(subregex)) # compile the stuff that was in parenthesis and add it to the list`
			`inparens = False # and we're no longer in parenthesis`
initial commit 2017-03-22 13:53:06 +01:00			`else:`
Add a ton of comments 2017-03-22 14:47:18 +01:00			`inparens -= 1 # otherwise decrease parens level`
			`if regex[i] == "(": # increase parens level if we find a (`
initial commit 2017-03-22 13:53:06 +01:00			`inparens += 1`
Add a ton of comments 2017-03-22 14:47:18 +01:00			`subregex += regex[i] # add the character to the string we will compile when we reach the end of the highest set of parenthesis`
			`elif regex[i] == "(": # if we weren't in parens and we find a "(", enter parenthesis`
initial commit 2017-03-22 13:53:06 +01:00			`inparens = 1`
			`subregex = ""`
			`continue`
Add a ton of comments 2017-03-22 14:47:18 +01:00			`elif regex[i] == "": # if we find a , iterate the last thing on the stack, which might have been a subregex (and that's ok)`
initial commit 2017-03-22 13:53:06 +01:00			`to_concat[-1] = iterate(to_concat[-1])`
Add a ton of comments 2017-03-22 14:47:18 +01:00			`elif regex[i] == "+": # kind of a hack and gives + the highest possible operator precedence`
Lexer works 2017-03-24 21:47:40 +01:00			`ret = either(list_to_field(to_concat), compile(regex[i+1:]))`
			`ret.orig = regex`
			`return ret`
Parser doesn't use global variables 2017-03-25 19:55:36 +01:00			`elif regex[i] == "?":`
Fixed integers, fixed ? operator 2017-03-25 00:25:47 +01:00			`to_concat[-1] = zero_or_one(to_concat[-1])`
Add a ton of comments 2017-03-22 14:47:18 +01:00			`else: # if we just found a regular character, add it to the stuff to concatenate`
initial commit 2017-03-22 13:53:06 +01:00			`to_concat.append(build_from_char(regex[i]))`
updated lexer definitions 2017-03-22 20:21:15 +01:00			`ret = list_to_field(to_concat)`
			`ret.orig = regex`
			`return ret`
cleaned up 2017-03-22 14:04:22 +01:00
Add a ton of comments 2017-03-22 14:47:18 +01:00			`def addr(node): # this used to be a hack that would print the memory address of the node starting with a _ so graphviz didn't split it at the end of the number`
cleaned up 2017-03-22 14:04:22 +01:00			`return str(node.id)`
Add a ton of comments 2017-03-22 14:47:18 +01:00			`def pmap(f): # prints out the passed field in a way that dot can compile to an svg`
cleaned up 2017-03-22 14:04:22 +01:00			`print("digraph test {")`
			`for node in f.nodes:`
			`for char, move in node.moves.items():`
			`if char == 'ε':`
			`for m in move:`
			`print(addr(node) + " -> " + addr(m) + " [label=epsilon]")`
			`else:`
			`print(addr(node) + " -> " + addr(move) + " [label="+char+"]")`
			`if node.terminal:`
			`print(addr(node) + " -> " + addr(node) + " [label=terminal]")`
			`print(addr(f.start) + " -> " + addr(f.start) + " [label=start]")`
			`print("}")`


			`# DEBUG`

Add a ton of comments 2017-03-22 14:47:18 +01:00			`# BAD CODE - WILL BREAK EVERYTHING`
			`# a = build_from_char('a')`
			`# b = build_from_char('b')`
			`# full = concatenate(a, b)`
			`# full2 = concatenate(iterate(a), b)`
			`# The reason you can't do this is because concatenate is a destructive operation (unsets terminal and adds moves to previously terminal nodes on a)`
			`# so iterate(a) in the definition of full2 will set all terminal nodes of a to the new terminal node of iterate, but there are no terminal nodes of a so you get a mess. It will actually continue being connected to b as part of full`
			`# End bad code`

			`# this works`
cleaned up 2017-03-22 14:04:22 +01:00			`# full = concatenate(build_from_char('a'), build_from_char('b'))`
			`# full2 = concatenate(iterate(build_from_char('a')), build_from_char('b'))`

			`# strings = ["aa", "ab", "ba", "bb", "aba", "aab", "aaab", "aaba", "b"]`
			`# for s in strings:`
			`# print("String " + s + " " + ("matches" if match(full, s) else "does not match") + " pattern " + "ab")`
			`# print()`
			`# for s in strings:`
			`# print("String " + s + " " + ("matches" if match(full2, s) else "does not match") + " pattern " + "a*b")`

			`#print(match(full2, "aab"))`
			`#pmap(full2)`
Fixed integers, fixed ? operator 2017-03-25 00:25:47 +01:00			`#pmap(zero_or_one(build_from_char("a")))`
cleaned up 2017-03-22 14:04:22 +01:00
Add a ton of comments 2017-03-22 14:47:18 +01:00			`#pmap(compile("ab(c1+2d(ef)d)e"))`
initial commit 2017-03-22 13:53:06 +01:00			`#pmap(either(build_from_char('a'), build_from_char('b')))`
Add a ton of comments 2017-03-22 14:47:18 +01:00			`# x = compile("(1+0)*1")`
add graphs 2017-03-22 14:49:56 +01:00			`# pmap(x)`
initial commit 2017-03-22 13:53:06 +01:00			`# for s in ["101", "111110", "11001", "1", "0"]:`
cleaned up 2017-03-22 14:04:22 +01:00			`# print(match(x, s))`
Add a ton of comments 2017-03-22 14:47:18 +01:00			`# x = compile("a+b+c")`
cleaned up 2017-03-22 14:04:22 +01:00			`# pmap(x)`