# pycoolc/lexer.py

import sys
sys.path.append(".")
import string

import nfa  # local module, found via the sys.path tweak above
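# NOTE: judging by the patterns below, the regex dialect that nfa.compile()
# accepts uses '+' for alternation (union), '*' for Kleene star and parentheses
# for grouping - so "a+b" means "a or b", not "one or more a".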
digit = '+'.join("0123456789")
letter = '+'.join(string.ascii_letters) + "+_"
upper = '+'.join(string.ascii_uppercase)
lower = '+'.join(string.ascii_lowercase)
whitespace_no_newline = " +\f+\r+\t+\v"
whitespace = "(" + whitespace_no_newline + "+\n)*"
any_char = digit + "+" + letter + "+" + whitespace
any_string = "("+any_char+")*"
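# e.g. digit expands to the pattern "0+1+2+3+4+5+6+7+8+9", i.e. "any one
# decimal digit" under the alternation reading of '+'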
integer = nfa.compile("-?(" + digit + ")(" + digit + ")*")
integer.type = "integer"
identifier = nfa.compile("(" + letter + ")(" + letter + "+" + digit + ")*")  # a letter followed by any number of letters or digits
identifier.type = "identifier"
# renamed from "string" so it no longer shadows the string module imported above
string_literal = nfa.compile("(\"(" + any_string + ")\")+('(" + any_string + ")')")  # double- or single-quoted
string_literal.type = "string"
comment = nfa.compile("--(" + digit + "+" + letter + "+" + whitespace_no_newline + ")*\n")  # Untested
comment.type = "comment"
keyword = nfa.compile("+".join(["class", "else", "false", "fi", "if", "in", "inherits", "isvoid", "let", "loop", "pool", "then", "while", "case", "esac", "new", "of", "not", "true"]))
keyword.type = "keyword"
assign = nfa.compile("<-")
assign.type = "assign"
relop = nfa.compile("+".join(["<", "<=", ">", ">=", "=", "<>", "!="]))
relop.type = "relop"
semicolon = nfa.compile(";")
semicolon.type = "semicolon"
colon = nfa.compile(":")
colon.type = "colon"
comma = nfa.compile(",")
comma.type = "comma"
whitespace_nfa = nfa.compile(whitespace)
whitespace_nfa.type = "whitespace_nfa"
parens = nfa.either(nfa.build_from_char("("), nfa.build_from_char(")"))
parens.type = "parens"
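# '+' and '*' are metacharacters in this dialect, so presumably they can't
# appear literally inside nfa.compile(); they are built with
# nfa.build_from_char() and glued onto the rest with nfa.either() instead.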
mathbinop = nfa.either(nfa.either(nfa.compile("-+/+%+^+|+&"), nfa.build_from_char("+")), nfa.build_from_char("*"))
mathbinop.type = "mathbinop"
mathunop = nfa.compile("~")
mathunop.type = "mathunop"
brace = nfa.compile("{+}")
brace.type = "brace"
bracket = nfa.compile("[+]")
bracket.type = "bracket"
unop = nfa.compile("!")
unop.type = "unop"
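
# a small COOL-flavoured snippet exercising keywords, assignment, relops,
# arithmetic, a comment and a string literal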
test_data = """
if x = y then
x <- 10;
else
x <- x - (y * -1); -- comment test
print("string literal test");
fi
"""
#print(nfa.match(keyword, "if"))
#print(nfa.match(nfa.compile("if+and"), "if"))
#print(nfa.match(integer, "10"))
#nfa.pmap(nfa.compile("if+and"))
#sys.exit(0)

class token:
    def __init__(self):
        self.matched_string = ""
        self.type = False  # replaced by the matching regex's type tag in lex()

# returns a list of tokens in the order they appeared in the input string
def lex(data):
    # import subprocess
    # process = subprocess.Popen(["gpp", "+c", "--", "\\n"], stdin = subprocess.PIPE, stdout = subprocess.PIPE)
    # data = process.communicate(input=data.encode("utf-8"))[0].decode("utf-8")

    # whichever of these is the first to match a substring of the text is used to
    # create the token; priority beats match length, so e.g. the keyword "if"
    # wins over a longer identifier like "ifx" that merely starts with it
    priority_order = [whitespace_nfa, comment, integer, parens, bracket, brace, mathbinop, mathunop, unop, semicolon, colon, comma, keyword, assign, relop, string_literal, identifier]

    done = []
    data_ptr = 0

    while data_ptr < len(data):  # loop until we've read the whole input string
        one_matched = False

        # start by trying to match the whole rest of the input string, chopping
        # one character off the end until no characters are left; if none of
        # those substrings matched, move on to the next regex in priority order
        for regex in priority_order:  # starting with the highest-priority regex
            data_end = len(data)  # MAXIMUM MUNCH - literally the largest lookahead possible
            this_regex_matched = False

            while data_end - data_ptr > 0:
                considering = data[data_ptr:data_end]

                if nfa.match(regex, considering):  # this regex matched the substring
                    data_ptr += len(considering)  # the next token starts right after what we just matched
                    t = token()  # construct a token and add it to the result list
                    t.matched_string = considering
                    t.type = regex.type
                    done.append(t)

                    one_matched = True  # don't die
                    this_regex_matched = True
                    break
                data_end -= 1

            if this_regex_matched:
                break

        if not one_matched:
            # if we encounter something no regex can match, just die
            print("Nothing matched '" + data[data_ptr] + "' at position " + str(data_ptr) + ", bailing out")
            return []
    return done
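
# Note: the maximal-munch loop above retries every shortened substring for every
# regex, so lexing is at least quadratic in the input length - fine for a toy
# lexer, but a production one would scan left-to-right in a single pass.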

# debug: lex the sample program and print every non-whitespace token
if __name__ == "__main__":
    for tkn in lex(test_data):
        if tkn.type != "whitespace_nfa":
            print("token was '" + tkn.matched_string.replace("\n", "\\n") + "' of type " + tkn.type)