2017-03-22 14:47:18 +01:00
import random # only used for nfa id, which is only used by pmap
2017-03-22 13:53:06 +01:00
epsilon_literal = ' ε '
2017-03-22 14:47:18 +01:00
class field ( ) : # a "field" has a list of nodes and a starting node, that's it
2017-03-22 13:53:06 +01:00
def __init__ ( self ) :
self . nodes = set ( )
self . start = False
2017-03-24 21:47:40 +01:00
self . orig = " not compiled "
2017-03-22 14:47:18 +01:00
# a node is either terminal (accepting) or not, and it has a list of possible moves.
# the moves are usually indexed by a character, so my_nfa.moves['a'] will return another nfa
# ε is special because there can be more than one ε moves, and they don't consume a character of input
# so my_nfa.moves['ε'] will actually return a set of nodes
2017-03-22 13:53:06 +01:00
class nfa ( ) :
def __init__ ( self ) :
self . terminal = False
self . moves = { ' ε ' : set ( ) }
self . id = random . randrange ( 1000 )
2017-03-22 14:47:18 +01:00
# makes a super simple field with a start node and a terminal node. The start node will move to the end node but only if it's fed the character we were passed.
# it puts them in a field and sets the start node of the field to the start node
2017-03-22 13:53:06 +01:00
def build_from_char ( c ) :
base = nfa ( )
end = nfa ( )
end . terminal = True
base . moves [ c ] = end
f = field ( )
f . start = base
f . nodes = set ( [ base , end ] )
return f
2017-03-22 14:47:18 +01:00
# this takes a field and allows that field to be repeated zero or more times.
# _______________ε_______________
# / \
# / v
# start -> (node) --ε--> (passed field) --ε--> (node) <- that one is terminal
# ^ /
# \_____ε_____/
#
#
2017-03-22 13:53:06 +01:00
def iterate ( f ) :
new_f = field ( )
outer_start = nfa ( )
outer_end = nfa ( )
for node in f . nodes :
if node . terminal :
node . moves [ ' ε ' ] . add ( outer_end )
node . moves [ ' ε ' ] . add ( outer_start )
node . terminal = False
for k in f . nodes :
new_f . nodes . add ( k )
new_f . nodes . add ( outer_start )
new_f . nodes . add ( outer_end )
outer_end . terminal = True
outer_start . moves [ ' ε ' ] = set ( [ f . start , outer_end ] )
new_f . start = outer_start
return new_f
2017-03-22 14:47:18 +01:00
# This allows either token in the language
# .--ε--> (a) --ε--.
# / \
# start -> (node) (end) <- that one is terminal
# \ /
# `--ε--> (b) --ε--`
#
2017-03-22 13:53:06 +01:00
def either ( a , b ) :
f = field ( )
f . nodes = a . nodes
for k in b . nodes :
f . nodes . add ( k )
new_start = nfa ( )
new_start . moves [ ' ε ' ] . add ( a . start )
new_start . moves [ ' ε ' ] . add ( b . start )
new_end = nfa ( )
new_end . terminal = True
2017-03-22 14:49:56 +01:00
for node in ( a . nodes | b . nodes ) :
2017-03-22 13:53:06 +01:00
if node . terminal :
node . terminal = False
node . moves [ ' ε ' ] . add ( new_end )
f . start = new_start
f . nodes . add ( new_start )
f . nodes . add ( new_end )
return f
2017-03-22 14:47:18 +01:00
# returns a field with a then b
# start -> (a) --ε--> (b) <-- that one has terminal nodes
2017-03-22 13:53:06 +01:00
def concatenate ( a , b ) :
f = field ( )
f . start = a . start
for node in a . nodes :
f . nodes . add ( node )
if node . terminal :
node . terminal = False
node . moves [ ' ε ' ] . add ( b . start )
for node in b . nodes :
f . nodes . add ( node )
return f
2017-03-22 14:47:18 +01:00
# takes a set of nodes, returns a set of those nodes and all nodes connected to those nodes by epsilon moves
# uses an ignore list for its recursive calls so it won't loop infinitely
2017-03-22 17:38:26 +01:00
def epsilon_closure ( s , ignore = False ) :
2017-03-22 14:47:18 +01:00
new_s = set ( s ) # we can't modify the set we're iterating over or the program will explode, so we return a new set instead of modifying s
if not ignore :
ignore = new_s
2017-03-22 13:53:06 +01:00
for state in s :
2017-03-22 14:47:18 +01:00
for possible_move in state . moves [ ' ε ' ] : # for all possible ε moves in all the moves we're expanding over, recurse
2017-03-22 13:53:06 +01:00
if possible_move in ignore :
continue
2017-03-22 17:38:26 +01:00
for m in epsilon_closure ( set ( [ possible_move ] ) , ( set ( [ possible_move ] ) | ignore ) ) :
2017-03-22 13:53:06 +01:00
new_s . add ( m )
return new_s
2017-03-22 14:47:18 +01:00
# determines if a compiled regular expression matches a string
2017-03-22 14:04:22 +01:00
def match ( f , inp ) :
2017-03-22 13:53:06 +01:00
states = set ( [ f . start ] )
idx = 0
while idx < len ( inp ) :
2017-03-22 17:38:26 +01:00
states = epsilon_closure ( states ) # expand into epsilon connected states
2017-03-22 14:47:18 +01:00
new_states = set ( ) # the states we will be in after we consume the input
c = inp [ idx ] # the character we're taking in right now
2017-03-22 13:53:06 +01:00
for state in states :
for key , move in state . moves . items ( ) :
if key == c :
2017-03-22 14:47:18 +01:00
new_states . add ( move ) # add this move to new_states if we could get there from one of our old states by consuming that exact character
states = new_states ; # prepare to do it all over again
2017-03-22 13:53:06 +01:00
idx + = 1
2017-03-22 14:47:18 +01:00
if len ( states ) == 0 : # if there are no states, we can't possibly end up at a terminal state so just stop reading
return False
# now we've consumed all the input. If any of the states we are in are accepting states, it matched, otherwise return false
2017-03-24 21:47:40 +01:00
states = epsilon_closure ( states ) # expand into epsilon connected states
2017-03-22 13:53:06 +01:00
for state in states :
if state . terminal :
return True
return False
2017-03-25 19:55:36 +01:00
# takes a list of fields and returns a new field with all of the lists fields concatenated together
2017-03-22 13:53:06 +01:00
def list_to_field ( l ) :
2017-03-22 14:47:18 +01:00
if len ( l ) == 0 : # this base case shouldn't be hit unless you have an empty regex or start your regex with a + or something
# all it does is make a field with one terminal node in it
2017-03-22 13:53:06 +01:00
f = field ( )
n = nfa ( )
n . terminal = True
f . start = n
f . nodes = set ( [ n ] )
return f
2017-03-22 14:47:18 +01:00
# just maps concatenate over all the things in the list and returns the giant field at the end
2017-03-22 13:53:06 +01:00
final = l [ 0 ]
for k in l [ 1 : ] :
final = concatenate ( final , k )
return final
2017-03-25 19:55:36 +01:00
# used for ?
# start -> (passed field) --ε--> (node) <- that one is terminal
# \_______________ε____/
# adds an ε move to the passed field's start node to the new end node, as well as adding it to each of passed field's terminal nodes
2017-03-25 00:25:47 +01:00
def zero_or_one ( f ) :
2017-03-22 20:21:15 +01:00
f2 = field ( )
f2 . nodes = f . nodes
f2 . start = f . start
n = nfa ( )
n . terminal = True
for k in f . nodes :
if k . terminal :
k . terminal = False
k . moves [ ' ε ' ] . add ( n )
f2 . nodes . add ( n )
f2 . start . moves [ ' ε ' ] . add ( n )
return f2
2017-03-25 19:55:36 +01:00
# takes a regex like "ab(cd*)+f" and makes an nfa field out of it
2017-03-22 14:47:18 +01:00
def compile ( regex ) :
to_concat = [ ] # empty list of things to concatenate
inparens = False # parenthesis parsing stuff
2017-03-22 13:53:06 +01:00
for i in range ( len ( regex ) ) :
2017-03-22 14:47:18 +01:00
if inparens : # if we're in parenthesis
if regex [ i ] == " ) " : # and we find a close parenthesis
if inparens == 1 : # and we're now at the same level we started at
to_concat . append ( compile ( subregex ) ) # compile the stuff that was in parenthesis and add it to the list
inparens = False # and we're no longer in parenthesis
2017-03-22 13:53:06 +01:00
else :
2017-03-22 14:47:18 +01:00
inparens - = 1 # otherwise decrease parens level
if regex [ i ] == " ( " : # increase parens level if we find a (
2017-03-22 13:53:06 +01:00
inparens + = 1
2017-03-22 14:47:18 +01:00
subregex + = regex [ i ] # add the character to the string we will compile when we reach the end of the highest set of parenthesis
elif regex [ i ] == " ( " : # if we weren't in parens and we find a "(", enter parenthesis
2017-03-22 13:53:06 +01:00
inparens = 1
subregex = " "
continue
2017-03-22 14:47:18 +01:00
elif regex [ i ] == " * " : # if we find a *, iterate the last thing on the stack, which might have been a subregex (and that's ok)
2017-03-22 13:53:06 +01:00
to_concat [ - 1 ] = iterate ( to_concat [ - 1 ] )
2017-03-22 14:47:18 +01:00
elif regex [ i ] == " + " : # kind of a hack and gives + the highest possible operator precedence
2017-03-24 21:47:40 +01:00
ret = either ( list_to_field ( to_concat ) , compile ( regex [ i + 1 : ] ) )
ret . orig = regex
return ret
2017-03-25 19:55:36 +01:00
elif regex [ i ] == " ? " :
2017-03-25 00:25:47 +01:00
to_concat [ - 1 ] = zero_or_one ( to_concat [ - 1 ] )
2017-03-22 14:47:18 +01:00
else : # if we just found a regular character, add it to the stuff to concatenate
2017-03-22 13:53:06 +01:00
to_concat . append ( build_from_char ( regex [ i ] ) )
2017-03-22 20:21:15 +01:00
ret = list_to_field ( to_concat )
ret . orig = regex
return ret
2017-03-22 14:04:22 +01:00
2017-03-22 14:47:18 +01:00
def addr ( node ) : # this used to be a hack that would print the memory address of the node starting with a _ so graphviz didn't split it at the end of the number
2017-03-22 14:04:22 +01:00
return str ( node . id )
2017-03-22 14:47:18 +01:00
def pmap ( f ) : # prints out the passed field in a way that dot can compile to an svg
2017-03-22 14:04:22 +01:00
print ( " digraph test { " )
for node in f . nodes :
for char , move in node . moves . items ( ) :
if char == ' ε ' :
for m in move :
print ( addr ( node ) + " -> " + addr ( m ) + " [label=epsilon] " )
else :
print ( addr ( node ) + " -> " + addr ( move ) + " [label= " + char + " ] " )
if node . terminal :
print ( addr ( node ) + " -> " + addr ( node ) + " [label=terminal] " )
print ( addr ( f . start ) + " -> " + addr ( f . start ) + " [label=start] " )
print ( " } " )
# DEBUG
2017-03-22 14:47:18 +01:00
# BAD CODE - WILL BREAK EVERYTHING
# a = build_from_char('a')
# b = build_from_char('b')
# full = concatenate(a, b)
# full2 = concatenate(iterate(a), b)
# The reason you can't do this is because concatenate is a destructive operation (unsets terminal and adds moves to previously terminal nodes on a)
# so iterate(a) in the definition of full2 will set all terminal nodes of a to the new terminal node of iterate, but there are no terminal nodes of a so you get a mess. It will actually continue being connected to b as part of full
# End bad code
# this works
2017-03-22 14:04:22 +01:00
# full = concatenate(build_from_char('a'), build_from_char('b'))
# full2 = concatenate(iterate(build_from_char('a')), build_from_char('b'))
# strings = ["aa", "ab", "ba", "bb", "aba", "aab", "aaab", "aaba", "b"]
# for s in strings:
# print("String " + s + " " + ("matches" if match(full, s) else "does not match") + " pattern " + "ab")
# print()
# for s in strings:
# print("String " + s + " " + ("matches" if match(full2, s) else "does not match") + " pattern " + "a*b")
#print(match(full2, "aab"))
#pmap(full2)
2017-03-25 00:25:47 +01:00
#pmap(zero_or_one(build_from_char("a")))
2017-03-22 14:04:22 +01:00
2017-03-22 14:47:18 +01:00
#pmap(compile("ab(c1+2d(e*f)d)*e"))
2017-03-22 13:53:06 +01:00
#pmap(either(build_from_char('a'), build_from_char('b')))
2017-03-22 14:47:18 +01:00
# x = compile("(1+0)*1")
2017-03-22 14:49:56 +01:00
# pmap(x)
2017-03-22 13:53:06 +01:00
# for s in ["101", "111110", "11001", "1", "0"]:
2017-03-22 14:04:22 +01:00
# print(match(x, s))
2017-03-22 14:47:18 +01:00
# x = compile("a+b+c")
2017-03-22 14:04:22 +01:00
# pmap(x)