tlang/source/tlang/compiler/lexer/lexer.d

919 lines
28 KiB
D

module compiler.lexer.core;
import std.container.slist;
import gogga;
import std.conv : to;
import std.string : cmp;
import std.ascii : isDigit;
import misc.exceptions : TError;
public enum LexerError
{
EXHAUSTED_CHARACTERS,
OTHER
}
public final class LexerException : TError
{
public const Lexer offendingInstance;
public const LexerError errType;
this(Lexer offendingInstance, LexerError errType = LexerError.OTHER, string msg = "")
{
string positionString = "("~to!(string)(offendingInstance.line)~", "~to!(string)(offendingInstance.column)~")";
super("LexerException("~to!(string)(errType)~")"~(msg.length ? ": "~msg : "")~" at "~positionString);
this.offendingInstance = offendingInstance;
this.errType = errType;
}
this(Lexer offendingInstance, string msg)
{
this(offendingInstance, LexerError.OTHER, msg);
}
}
/* TODO: Add Token type (which matches column and position too) */
public final class Token
{
/* The token */
private string token;
/* Line number information */
private ulong line, column;
this(string token, ulong line, ulong column)
{
this.token = token;
this.line = line;
this.column = column;
}
override bool opEquals(Object other)
{
return cmp(token, (cast(Token)other).getToken()) == 0;
}
override string toString()
{
/* TODO (Column number): Don't adjust here, do it maybe in the lexer itself */
return token~" at ("~to!(string)(line)~", "~to!(string)(column-token.length)~")";
}
public string getToken()
{
return token;
}
}
public final class Lexer
{
/**
* Lexer state data
*/
private string sourceCode; /* The source to be lexed */
private ulong line = 1; /* Current line */
private ulong column = 1;
private Token[] currentTokens; /* Current token set */
private string currentToken; /* Current token */
private ulong position; /* Current character position */
private char currentChar; /* Current character */
private bool stringMode; /* Whether we are in a string "we are here" or not */
private bool floatMode; /* Whether or not we are building a floating point constant */
/* The tokens */
private Token[] tokens;
this(string sourceCode)
{
this.sourceCode = sourceCode;
}
private bool isForward()
{
return position+1 < sourceCode.length;
}
public bool isBackward()
{
return position-1 < sourceCode.length;
}
/**
* Used for tokenising a2.b2
*
* When the `.` is encountered
* and there are some characters
* behind it this checks if we can
* append a further dot to it
*/
private bool isBuildUpValidIdent()
{
import compiler.symbols.check;
return isPathIdentifier(currentToken) || isIdentifier(currentToken);
}
/**
* Returns true if we have a token being built
* false otherwise
*/
private bool hasToken()
{
return currentToken.length != 0;
}
/* Perform the lexing process */
/* TODO: Use return value */
public void performLex()
{
while(position < sourceCode.length)
{
// gprintln("SrcCodeLen: "~to!(string)(sourceCode.length));
// gprintln("Position: "~to!(string)(position));
currentChar = sourceCode[position];
if(floatMode == true)
{
if(isDigit(currentChar))
{
/* tack on and move to next iteration */
currentToken~=currentChar;
position++;
column++;
continue;
}
/* TODO; handle closer case and error case */
else
{
/* TODO: Throw erropr here */
if(isSpliter(currentChar))
{
floatMode = false;
currentTokens ~= new Token(currentToken, line, column);
currentToken = "";
/* We just flush and catch splitter in next round, hence below is commented out */
// column++;
// position++;
}
else
{
throw new LexerException(this, "Floating point '"~currentToken~"' cannot be followed by a '"~currentChar~"'");
}
}
}
else if(currentChar == ' ' && !stringMode)
{
/* TODO: Check if current token is fulled, then flush */
if(currentToken.length != 0)
{
currentTokens ~= new Token(currentToken, line, column);
currentToken = "";
}
column++;
position++;
}
else if(isSpliter(currentChar) && !stringMode)
{
/* The splitter token to finally insert */
string splitterToken;
gprintln("Build up: "~currentToken);
gprintln("Current char: "~currentChar);
/* Check for case of `==` (where we are on the first `=` sign) */
if(currentChar == '=' && isForward() && sourceCode[position+1] == '=')
{
/* Flush any current token (if exists) */
if(currentToken.length)
{
currentTokens ~= new Token(currentToken, line, column);
currentToken = "";
}
// Create the `==` token
currentTokens ~= new Token("==", line, column);
// Skip over the current `=` and the next `=`
position+=2;
column+=2;
continue;
}
/* FIXME: Add floating point support here */
/* TODO: IF buildUp is all numerical and we have dot go into float mode */
/* TODO: Error checking will need to be added */
if(isNumericalStr(currentToken) && currentChar == '.')
{
/* Tack on the dot */
currentToken~=".";
/* Enable floating point mode and go to next iteration*/
floatMode = true;
gprintln("Float mode just got enabled: Current build up: \""~currentToken~"\"");
column++;
position++;
continue;
}
/**
* Here we check if we have a `.` and that the characters
* preceding us were all godd for an identifier
*/
import misc.utils;
if(currentChar == '.' && hasToken() && isBuildUpValidIdent())
{
gprintln("Bruh");
/**
* Now we check that we have a character infront of us
* and that it is a letter
*
* TODO: Add _ check too as that is a valid identifier start
*/
if(isForward() && isCharacterAlpha(sourceCode[position+1]))
{
position++;
column+=1;
currentToken ~= '.';
continue;
}
else
{
throw new LexerException(this, "Expected a letter to follow the .");
}
}
/* Check if we need to do combinators (e.g. for ||, &&) */
/* TODO: Second operand in condition out of bounds */
else if(currentChar == '|' && (position+1) != sourceCode.length && sourceCode[position+1] == '|')
{
splitterToken = "||";
column += 2;
position += 2;
}
else if(currentChar == '&' && (position+1) != sourceCode.length && sourceCode[position+1] == '&')
{
splitterToken = "&&";
column += 2;
position += 2;
}
else if (currentChar == '\n') /* TODO: Unrelated!!!!!, but we shouldn't allow this bahevaipur in string mode */
{
line++;
column = 1;
position++;
}
else
{
splitterToken = ""~currentChar;
column++;
position++;
}
/* Flush the current token (if one exists) */
if(currentToken.length)
{
currentTokens ~= new Token(currentToken, line, column);
currentToken = "";
}
/* Add the splitter token (only if it isn't empty) */
if(splitterToken.length)
{
currentTokens ~= new Token(splitterToken, line, column);
}
}
else if(currentChar == '"')
{
/* If we are not in string mode */
if(!stringMode)
{
/* Add the opening " to the token */
currentToken ~= '"';
/* Enable string mode */
stringMode = true;
}
/* If we are in string mode */
else
{
/* Add the closing " to the token */
currentToken ~= '"';
/* Flush the token */
currentTokens ~= new Token(currentToken, line, column);
currentToken = "";
/* Get out of string mode */
stringMode = false;
}
column++;
position++;
}
else if(currentChar == '\\')
{
/* You can only use these in strings */
if(stringMode)
{
/* Check if we have a next character */
if(position+1 != sourceCode.length && isValidEscape_String(sourceCode[position+1]))
{
/* Add to the string */
currentToken ~= "\\"~sourceCode[position+1];
column += 2;
position += 2;
}
/* If we don't have a next character then raise error */
else
{
throw new LexerException(this, "Unfinished escape sequence");
}
}
else
{
throw new LexerException(this, "Escape sequences can only be used within strings");
}
}
/* Character literal support */
else if(!stringMode && currentChar == '\'')
{
currentToken ~= "'";
/* Character literal must be next */
if(position+1 != sourceCode.length)
{
/* TODO: Escape support for \' */
/* Get the character */
currentToken ~= ""~sourceCode[position+1];
column++;
position++;
/* Closing ' must be next */
if(position+1 != sourceCode.length && sourceCode[position+1] == '\'')
{
/* Generate and add the token */
currentToken ~= "'";
currentTokens ~= new Token(currentToken, line, column);
/* Flush the token */
currentToken = "";
column += 2;
position += 2;
}
else
{
throw new LexerException(this, "Was expecting closing ' when finishing character literal");
}
}
else
{
throw new LexerException(this, LexerError.EXHAUSTED_CHARACTERS, "EOSC reached when trying to get character literal");
}
}
/**
* If we are building up a number
*
* TODO: Build up token right at the end (#DuplicateCode)
*/
else if(isBuildUpNumerical())
{
gprintln("jfdjkhfdjkhfsdkj");
/* fetch the encoder segment */
char[] encoderSegment = numbericalEncoderSegmentFetch();
gprintln("isBuildUpNumerical(): Enter");
/**
* If we don't have any encoders
*/
if(encoderSegment.length == 0)
{
/* We can add a signage encoder */
if(isNumericalEncoder_Signage(currentChar))
{
gprintln("Hello");
/* Check if the next character is a size (it MUST be) */
if(isForward() && isNumericalEncoder_Size(sourceCode[position+1]))
{
currentToken ~= currentChar;
column++;
position++;
}
else
{
throw new LexerException(this, "You MUST specify a size encoder after a signagae encoder");
}
}
/* We can add a size encoder */
else if(isNumericalEncoder_Size(currentChar))
{
currentToken ~= currentChar;
column++;
position++;
}
/* We can add more numbers */
else if(isDigit(currentChar))
{
currentToken ~= currentChar;
column++;
position++;
}
/* Splitter (TODO) */
else if(isSpliter(currentChar))
{
/* Add the numerical literal as a new token */
currentTokens ~= new Token(currentToken, line, column);
/* Add the splitter token if not a newline */
if(currentChar != '\n')
{
currentTokens ~= new Token(""~currentChar, line, column);
}
/* Flush the token */
currentToken = "";
/* TODO: Check these */
column += 2;
position += 2;
}
/* Anything else is invalid */
else
{
throw new LexerException(this, "Not valid TODO");
}
}
/**
* If we have one encoder
*/
else if((encoderSegment.length == 1))
{
/* Check what the encoder is */
/**
* If we had a signage then we must have a size after it
*/
if(isNumericalEncoder_Signage(encoderSegment[0]))
{
/**
* Size encoder must then follow
*/
if(isNumericalEncoder_Size(currentChar))
{
currentToken ~= currentChar;
column++;
position++;
/* Add the numerical literal as a new token */
currentTokens ~= new Token(currentToken, line, column);
/* Flush the token */
currentToken = "";
}
/**
* Anything else is invalid
*/
else
{
throw new LexerException(this, "A size-encoder must follow a signage encoder");
}
}
else
{
throw new LexerException(this, "Cannot have another encoder after a size encoder");
}
}
/* It is impossible to reach this as flushing means we cannot add more */
else
{
assert(false);
}
}
/* Any other case, keep building the curent token */
else
{
currentToken ~= currentChar;
column++;
position++;
}
}
/* If there was a token made at the end then flush it */
if(currentToken.length)
{
currentTokens ~= new Token(currentToken, line, column);
}
tokens = currentTokens;
}
private char[] numbericalEncoderSegmentFetch()
{
char[] numberPart;
ulong stopped;
for(ulong i = 0; i < currentToken.length; i++)
{
char character = currentToken[i];
if(isDigit(character))
{
numberPart~=character;
}
else
{
stopped = i;
break;
}
}
char[] remaining = cast(char[])currentToken[stopped..currentToken.length];
return remaining;
}
/**
* Returns true if the current build up is entirely
* numerical
*
* FIXME: THis, probably by its own will pick up `UL`
* as a number, or even just ``
*/
private bool isBuildUpNumerical()
{
import std.ascii : isDigit;
char[] numberPart;
ulong stopped;
for(ulong i = 0; i < currentToken.length; i++)
{
char character = currentToken[i];
if(isDigit(character))
{
numberPart~=character;
}
else
{
stopped = i;
break;
}
}
/**
* We need SOME numerical stuff
*/
if(stopped == 0)
{
return false;
}
char[] remaining = cast(char[])currentToken[stopped..currentToken.length];
char lstEncoder;
for(ulong i = 0; i < remaining.length; i++)
{
char character = remaining[i];
if(!isNumericalEncoder(character))
{
return false;
}
}
return true;
}
/**
* Given a string return true if all characters
* are digits, false otherwise and false if
* the string is empty
*/
private static bool isNumericalStr(string input)
{
/**
* If the given input is empty then return false
*/
if(input.length == 0)
{
return false;
}
/**
* If there are any characters in the string then
* check if all are digits
*/
for(ulong i = 0; i < input.length; i++)
{
char character = input[i];
if(!isDigit(character))
{
return false;
}
}
return true;
}
/* Return the tokens */
public Token[] getTokens()
{
return tokens;
}
private bool isSpliter(char character)
{
return character == ';' || character == ',' || character == '(' ||
character == ')' || character == '[' || character == ']' ||
character == '+' || character == '-' || character == '/' ||
character == '%' || character == '*' || character == '&' ||
character == '{' || character == '}' || character == '=' ||
character == '|' || character == '^' || character == '!' ||
character == '\n' || character == '~' || character =='.' ||
character == ':'; //|| isNumericalEncoder(character);
}
private bool isNumericalEncoder(char character)
{
return isNumericalEncoder_Size(character) ||
isNumericalEncoder_Signage(character);
}
private bool isNumericalEncoder_Size(char character)
{
return character == 'B' || character == 'W' ||
character == 'I' || character == 'L';
}
private bool isNumericalEncoder_Signage(char character)
{
return character == 'S' || character == 'U';
}
/* Supported escapes \" */
public bool isValidEscape_String(char character)
{
return true; /* TODO: Implement me */
}
}
/* Test input: `hello "world";` */
unittest
{
import std.algorithm.comparison;
string sourceCode = "hello \"world\";";
Lexer currentLexer = new Lexer(sourceCode);
currentLexer.performLex();
gprintln("Collected "~to!(string)(currentLexer.getTokens()));
assert(currentLexer.getTokens() == [new Token("hello", 0, 0), new Token("\"world\"", 0, 0), new Token(";", 0, 0)]);
}
/* Test input: `hello "world"|| ` */
unittest
{
import std.algorithm.comparison;
string sourceCode = "hello \"world\"|| ";
Lexer currentLexer = new Lexer(sourceCode);
currentLexer.performLex();
gprintln("Collected "~to!(string)(currentLexer.getTokens()));
assert(currentLexer.getTokens() == [new Token("hello", 0, 0), new Token("\"world\"", 0, 0), new Token("||", 0, 0)]);
}
/* Test input: `hello "world"||` */
unittest
{
import std.algorithm.comparison;
string sourceCode = "hello \"world\"||";
Lexer currentLexer = new Lexer(sourceCode);
currentLexer.performLex();
gprintln("Collected "~to!(string)(currentLexer.getTokens()));
assert(currentLexer.getTokens() == [new Token("hello", 0, 0), new Token("\"world\"", 0, 0), new Token("||", 0, 0)]);
}
/* Test input: `hello "world"|` */
unittest
{
import std.algorithm.comparison;
string sourceCode = "hello \"world\";|";
Lexer currentLexer = new Lexer(sourceCode);
currentLexer.performLex();
gprintln("Collected "~to!(string)(currentLexer.getTokens()));
assert(currentLexer.getTokens() == [new Token("hello", 0, 0), new Token("\"world\"", 0, 0), new Token(";", 0, 0), new Token("|", 0, 0)]);
}
/* Test input: ` hello` */
unittest
{
import std.algorithm.comparison;
string sourceCode = " hello";
Lexer currentLexer = new Lexer(sourceCode);
currentLexer.performLex();
gprintln("Collected "~to!(string)(currentLexer.getTokens()));
assert(currentLexer.getTokens() == [new Token("hello", 0, 0)]);
}
/* Test input: `hello;` */
unittest
{
import std.algorithm.comparison;
string sourceCode = " hello;";
Lexer currentLexer = new Lexer(sourceCode);
currentLexer.performLex();
gprintln("Collected "~to!(string)(currentLexer.getTokens()));
assert(currentLexer.getTokens() == [new Token("hello", 0, 0), new Token(";", 0, 0)]);
}
/* Test input: `hello "world\""` */
unittest
{
import std.algorithm.comparison;
string sourceCode = "hello \"world\\\"\"";
Lexer currentLexer = new Lexer(sourceCode);
currentLexer.performLex();
gprintln("Collected "~to!(string)(currentLexer.getTokens()));
assert(currentLexer.getTokens() == [new Token("hello", 0, 0), new Token("\"world\\\"\"", 0, 0)]);
}
/* Test input: `'c'` */
unittest
{
import std.algorithm.comparison;
string sourceCode = "'c'";
Lexer currentLexer = new Lexer(sourceCode);
currentLexer.performLex();
gprintln("Collected "~to!(string)(currentLexer.getTokens()));
assert(currentLexer.getTokens() == [new Token("'c'", 0, 0)]);
}
/* Test input: `2121\n2121` */
unittest
{
import std.algorithm.comparison;
string sourceCode = "2121\n2121";
Lexer currentLexer = new Lexer(sourceCode);
currentLexer.performLex();
gprintln("Collected "~to!(string)(currentLexer.getTokens()));
assert(currentLexer.getTokens() == [new Token("2121", 0, 0), new Token("2121", 0, 0)]);
}
/**
* Test `=`` and `==` handling
*/
unittest
{
import std.algorithm.comparison;
string sourceCode = " =\n";
Lexer currentLexer = new Lexer(sourceCode);
currentLexer.performLex();
gprintln("Collected "~to!(string)(currentLexer.getTokens()));
assert(currentLexer.getTokens() == [new Token("=", 0, 0)]);
import std.algorithm.comparison;
sourceCode = " = ==\n";
currentLexer = new Lexer(sourceCode);
currentLexer.performLex();
gprintln("Collected "~to!(string)(currentLexer.getTokens()));
assert(currentLexer.getTokens() == [new Token("=", 0, 0), new Token("==", 0, 0)]);
import std.algorithm.comparison;
sourceCode = " ==\n";
currentLexer = new Lexer(sourceCode);
currentLexer.performLex();
gprintln("Collected "~to!(string)(currentLexer.getTokens()));
assert(currentLexer.getTokens() == [new Token("==", 0, 0)]);
import std.algorithm.comparison;
sourceCode = " = =\n";
currentLexer = new Lexer(sourceCode);
currentLexer.performLex();
gprintln("Collected "~to!(string)(currentLexer.getTokens()));
assert(currentLexer.getTokens() == [new Token("=", 0, 0), new Token("=", 0, 0)]);
import std.algorithm.comparison;
sourceCode = " ==, = ==\n";
currentLexer = new Lexer(sourceCode);
currentLexer.performLex();
gprintln("Collected "~to!(string)(currentLexer.getTokens()));
assert(currentLexer.getTokens() == [new Token("==", 0, 0), new Token(",", 0, 0), new Token("=", 0, 0), new Token("==", 0, 0)]);
// Test flushing of previous token
import std.algorithm.comparison;
sourceCode = "i==i=\n";
currentLexer = new Lexer(sourceCode);
currentLexer.performLex();
gprintln("Collected "~to!(string)(currentLexer.getTokens()));
assert(currentLexer.getTokens() == [new Token("i", 0, 0), new Token("==", 0, 0), new Token("i", 0, 0), new Token("=", 0, 0)]);
}
/**
* Test: Literal value encoding
*
* Tests validity
*/
unittest
{
import std.algorithm.comparison;
string sourceCode;
Lexer currentLexer;
/* 21L (valid) */
sourceCode = "21L";
currentLexer = new Lexer(sourceCode);
currentLexer.performLex();
gprintln("Collected "~to!(string)(currentLexer.getTokens()));
assert(currentLexer.getTokens() == [new Token("21L", 0, 0)]);
/* 21UL (valid) */
sourceCode = "21UL";
currentLexer = new Lexer(sourceCode);
currentLexer.performLex();
gprintln("Collected "~to!(string)(currentLexer.getTokens()));
assert(currentLexer.getTokens() == [new Token("21UL", 0, 0)]);
// /* 21U (invalid) */
// sourceCode = "21U ";
// currentLexer = new Lexer(sourceCode);
// // gprintln(currentLexer.performLex());
// bool status = currentLexer.performLex();
// gprintln("Collected "~to!(string)(currentLexer.getTokens()));
// assert(!status);
// /* 21UL (valid) */
// sourceCode = "21UL";
// currentLexer = new Lexer(sourceCode);
// currentLexer.performLex();
// gprintln("Collected "~to!(string)(currentLexer.getTokens()));
// assert(currentLexer.getTokens() == [new Token("21UL", 0, 0)]);
}
/* Test input: `1.5` */
unittest
{
import std.algorithm.comparison;
string sourceCode = "1.5";
Lexer currentLexer = new Lexer(sourceCode);
currentLexer.performLex();
gprintln("Collected "~to!(string)(currentLexer.getTokens()));
assert(currentLexer.getTokens() == [new Token("1.5", 0, 0)]);
}
/**
* Test correct handling of dot-operator for
* non-floating point cases
*
* Input: `new A().l.p.p;`
*/
unittest
{
import std.algorithm.comparison;
string sourceCode = "new A().l.p.p;";
Lexer currentLexer = new Lexer(sourceCode);
currentLexer.performLex();
gprintln("Collected "~to!(string)(currentLexer.getTokens()));
assert(currentLexer.getTokens() == [
new Token("new", 0, 0),
new Token("A", 0, 0),
new Token("(", 0, 0),
new Token(")", 0, 0),
new Token(".", 0, 0),
new Token("l.p.p", 0, 0),
new Token(";", 0, 0)
]);
}