How to make a language (using Python)
asciidude
Posted on February 17, 2022
After completing this tutorial, you should be able to make a file that looks like this:
set age = 18 + 5;
output "You are " + age;
endproc 0;
This should do the following:
- Store the "age" variable in the memory, with the value of 23
- Output "You are 23"
- End the process on exit code 0
So, to start off, we should first make a main file. This shouldn't take too long:
# import the parser and lexer
import lexer as l
import parser as p
with open('main.lopa', 'r') as source_file:
    #################
    #     LEXER     #
    #################
    # Split the program on whitespace, then put a single ' ' entry back after
    # every word so the lexer can rebuild multi-word string literals.
    contents = []
    for word in source_file.read().split():
        contents.extend((word, ' '))
    contents = contents[:-1]  # drop the separator that follows the last word

    lexer = l.Lexer(contents)
    tokens = lexer.tokenize()  # turn the words into [type, literal] tokens

    ################
    #    PARSER    #
    ################
    parser = p.Parser(tokens)
    parser.parse()  # transpile the tokens produced by the lexer
    # Write the transpiled program; this is the file the user will run.
    parser.generateFile('main.py')
Once you have completed that, it is time to move on to the lexer!
import re # import regex
class Lexer:
    """Turn a list of source words into ``[type, literal]`` tokens.

    ``source`` is the list of whitespace-separated words of the program with a
    single ``' '`` entry between consecutive words. Those separator entries
    produce no token on their own; they exist so that a string literal that
    spans several words can be stitched back together.
    """

    def __init__(self, source):
        self.source = source

    @staticmethod
    def _strip_terminator(text):
        """Return *text* without one trailing ``;`` (if it has one)."""
        return text[:-1] if text.endswith(';') else text

    @classmethod
    def _is_closed(cls, literal):
        """Tell whether *literal* (which starts with ``"``) has its closing quote.

        A statement terminator directly after the quote (``"..."``;) is
        ignored, and a lone ``"`` only opens a string, it does not close it.
        """
        body = cls._strip_terminator(literal)
        return len(body) >= 2 and body.endswith('"')

    def tokenize(self):
        """Return the list of tokens found in ``self.source``.

        Every token is a two-item list ``[type, literal]`` whose type is one
        of ``VARIABLE_DECLARATOR``, ``IDENTIFIER``, ``STRING``, ``NUMBER``,
        ``OPERATOR`` or ``END``. A ``;`` at the end of a word is removed from
        the literal and emitted as a separate ``END`` token. Words matching
        no rule (for example the ``' '`` separators) produce no token.

        To extend the lexer, add another branch that recognises a word and
        appends its ``[type, literal]`` pair to ``tokens``.

        Raises:
            ValueError: if a string literal is opened but never closed.
        """
        tokens = []  # returned once every word has been looked at
        index = 0    # position of the word currently being examined

        while index < len(self.source):
            word = self.source[index]

            if not word:
                # Nothing to classify; also keeps the indexing below safe.
                index += 1
                continue

            if word == 'set':
                tokens.append(['VARIABLE_DECLARATOR', word])
            elif re.match('[a-zA-Z]', word):
                tokens.append(['IDENTIFIER', self._strip_terminator(word)])
            elif word.startswith('"'):
                # Keep consuming words (including the ' ' separators) until
                # the accumulated text ends with a closing quote, optionally
                # followed by the statement terminator.
                literal = word
                while not self._is_closed(literal):
                    index += 1
                    if index >= len(self.source):
                        raise ValueError(
                            'unterminated string literal: ' + literal
                        )
                    word = self.source[index]
                    literal += word
                tokens.append(['STRING', self._strip_terminator(literal)])
            elif re.match('-?[0-9]', word):
                tokens.append(['NUMBER', self._strip_terminator(word)])
            elif word in '=/*-+{}()':
                # Substring test: single operator characters match, and so
                # does an adjacent run taken from that string such as "()".
                tokens.append(['OPERATOR', word])

            # `word` is the last word consumed above, so a string that ends
            # in `";` still terminates its statement.
            if word.endswith(';'):
                tokens.append(['END', ';'])

            # Advance once per iteration, whichever branch was taken.
            index += 1

        return tokens
After all of that, it's time to parse!
import lexer as l
class Parser:
    """Transpile a list of ``[type, literal]`` tokens into Python source.

    The generated code is accumulated in ``self.transpiled`` and can be
    written to disk with :meth:`generateFile`. On a parse error a message
    prefixed with ``ERR ->`` is printed and the process exits with status -1.
    """

    # Token types that may appear inside a value expression.
    _VALUE_TYPES = ('STRING', 'NUMBER', 'IDENTIFIER', 'OPERATOR')

    def __init__(self, tokens):
        self.tokens = tokens
        self.index = 0
        # Everything that is transpiled is appended to this string; a
        # watermark could be put here as the initial value.
        self.transpiled = ''

    def parse(self):
        """Walk ``self.tokens`` and transpile every statement that is found.

        A statement starts at a ``set``, ``output`` or ``endproc`` token. The
        matching handler receives the tokens from that position onwards and
        moves ``self.index`` to the statement's END token; the increment at
        the bottom of the loop then steps past it. Tokens that do not start a
        known statement are skipped.
        """
        while self.index < len(self.tokens):
            current = self.tokens[self.index]
            t_type = current[0]   # token type
            t_value = current[1]  # token literal

            if t_type == 'VARIABLE_DECLARATOR' and t_value == 'set':
                self.parse_variable_declaration(self.tokens[self.index:])
            elif t_type == 'IDENTIFIER' and t_value == 'output':
                self.parse_output_statement(self.tokens[self.index:])
            elif t_type == 'IDENTIFIER' and t_value == 'endproc':
                self.parse_endproc(self.tokens[self.index:])

            self.index += 1

    def generateFile(self, output):
        """Write the transpiled program to the file named *output*."""
        with open(output, 'w') as f:
            f.write(self.transpiled)

    # ------------------------------------------------------------------ #
    # helpers
    # ------------------------------------------------------------------ #

    @staticmethod
    def _statement(stream):
        """Return the tokens of *stream* that come before the first END token.

        When *stream* holds no END token the whole stream is returned.
        """
        statement = []
        for token in stream:
            if token[0] == 'END':
                break
            statement.append(token)
        return statement

    @staticmethod
    def _fail(message):
        """Print *message* as a parse error and exit with status -1."""
        # Function-scope import so this class does not depend on the
        # module's import block; sys.exit replaces the interactive-only
        # `quit` helper and raises the same SystemExit(-1).
        import sys

        print(f'ERR -> {message}')
        sys.exit(-1)

    # ------------------------------------------------------------------ #
    # statement handlers
    # It is a good idea to move these into separate files as the language
    # grows, so this file stays readable.
    # ------------------------------------------------------------------ #

    def parse_variable_declaration(self, stream):
        """Transpile ``set <name> = <value...>;`` into ``<name> = <value>``.

        *stream* starts at the ``set`` token. The token after it must be an
        IDENTIFIER (the name), the next one must be the ``=`` OPERATOR, and
        every following token up to END must be a STRING, NUMBER, IDENTIFIER
        or OPERATOR; their literals are concatenated to form the value.
        A declaration that lacks its name, ``=`` or value is rejected rather
        than transpiled into invalid Python.
        """
        statement = self._statement(stream)
        name = ''
        operator = ''
        value_parts = []

        for position, token in enumerate(statement):
            t_type = token[0]
            t_value = token[1]

            if position == 1:
                if t_type != 'IDENTIFIER':
                    self._fail(
                        f"Failed to parse, invalid variable name '{t_value}'"
                    )
                name = t_value
            elif position == 2:
                # Only `=` assigns; any other operator here would produce a
                # bare expression instead of a declaration.
                if t_type != 'OPERATOR' or t_value != '=':
                    self._fail(
                        'Failed to parse, assignment operator is missing or '
                        'invalid on the declaration of a variable'
                    )
                operator = t_value
            elif position > 2:
                if t_type not in self._VALUE_TYPES:
                    self._fail(
                        f'Failed to parse, invalid assignment value, {t_value}'
                    )
                value_parts.append(t_value)

        # The statement may end before all three parts were seen.
        if not name:
            self._fail('Failed to parse, missing variable name')
        if not operator:
            self._fail(
                'Failed to parse, assignment operator is missing or '
                'invalid on the declaration of a variable'
            )
        if not value_parts:
            self._fail('Failed to parse, missing assignment value')

        value = ''.join(value_parts)
        self.transpiled += f'{name} {operator} {value}\n'
        self.index += len(statement)

    def parse_output_statement(self, stream):
        """Transpile ``output <value...>;`` into ``print(<value>)``.

        *stream* starts at the ``output`` token. Every following token up to
        END must be a STRING, NUMBER, IDENTIFIER or OPERATOR; their literals
        are concatenated to form the printed expression.
        """
        statement = self._statement(stream)
        value_parts = []

        for token in statement[1:]:
            if token[0] not in self._VALUE_TYPES:
                self._fail(
                    f'Failed to parse, invalid output value, {token[1]}'
                )
            value_parts.append(token[1])

        value = ''.join(value_parts)
        self.transpiled += f'print({value})\n'
        self.index += len(statement)

    def parse_endproc(self, stream):
        """Transpile ``endproc [<code>];`` into ``quit(<code>)``.

        *stream* starts at the ``endproc`` token. The exit code defaults to 0
        and, when given, must be a NUMBER without a decimal point. Tokens
        after the exit code are ignored.
        """
        statement = self._statement(stream)
        code = 0

        if len(statement) > 1:
            t_type = statement[1][0]
            t_value = statement[1][1]
            if t_type == 'NUMBER':
                if '.' in t_value:
                    self._fail(
                        'Failed to parse, cannot quit program on a decimal '
                        'number'
                    )
                code = t_value
            elif t_type is not None:
                self._fail(
                    f"Failed to parse, cannot parse value {t_value} on "
                    f"'endproc'"
                )

        self.transpiled += f'quit({code})\n'
        self.index += len(statement)
💖 💪 🙅 🚩
asciidude
Posted on February 17, 2022
Join Our Newsletter. No Spam, Only the good stuff.
Sign up to receive the latest update from our blog.