diff --git a/source/tlang/compiler/lexer/core/lexer.d b/source/tlang/compiler/lexer/core/lexer.d index 0b0af86..258fa83 100644 --- a/source/tlang/compiler/lexer/core/lexer.d +++ b/source/tlang/compiler/lexer/core/lexer.d @@ -4,6 +4,7 @@ module tlang.compiler.lexer.core.lexer; import tlang.compiler.lexer.core.tokens : Token; +import std.ascii : isDigit, isAlpha, isWhite; /** * Defines the interface a lexer must provide @@ -73,4 +74,163 @@ public interface LexerInterface * Returns: a `Token[]` containing all tokens */ public Token[] getTokens(); +} + +/** + * Human-readable names assigned + * to commonly used character + * constants + */ +public enum LexerSymbols : char +{ + L_PAREN = '(', + R_PAREN = ')', + SEMI_COLON = ';', + COMMA = ',', + L_BRACK = '[' , + R_BRACK = ']' , + PLUS = '+' , + MINUS = '-' , + FORWARD_SLASH = '/' , + PERCENT = '%' , + STAR = '*' , + AMPERSAND = '&' , + L_BRACE = '{' , + R_BRACE = '}' , + EQUALS = '=' , + SHEFFER_STROKE = '|' , + CARET = '^' , + EXCLAMATION = '!' , + TILDE = '~' , + DOT = '.' , + COLON = ':', + SPACE = ' ', + TAB = '\t', + NEWLINE = '\n', + DOUBLE_QUOTE = '"', + SINGLE_QUOTE = '\'' , + BACKSLASH = '\\' , + UNDERSCORE = '_' , + LESS_THAN = '<' , + BIGGER_THAN = '>' , + + ESC_NOTHING = '0' , + ESC_CARRIAGE_RETURN = 'r' , + ESC_TAB = 't' , + ESC_NEWLINE = 'n' , + ESC_BELL= 'a' , + + ENC_BYTE = 'B' , + ENC_INT = 'I' , + ENC_LONG = 'L' , + ENC_WORD = 'W' , + ENC_UNSIGNED = 'U' , + ENC_SIGNED = 'S' , +} + +/** + * Alias to `LexerSymbols` + */ +public alias LS = LexerSymbols; + +/** + * Checks if the provided character is an operator + * + * Params: + * c = the character to check + * Returns: `true` if it is a character, `false` + * otherwise + */ +public bool isOperator(char c) +{ + return c == LS.PLUS || c == LS.TILDE || c == LS.MINUS || + c == LS.STAR || c == LS.FORWARD_SLASH || c == LS.AMPERSAND || + c == LS.CARET || c == LS.EXCLAMATION || c == LS.SHEFFER_STROKE || + c == LS.LESS_THAN || c == LS.BIGGER_THAN; +} + +/** + * Checks if the provided character is a splitter + * + * Params: + * c = the character to check + * Returns: `true` if it is a splitter, `false` + * otherwise + */ +public bool isSplitter(char c) +{ + return c == LS.SEMI_COLON || c == LS.COMMA || c == LS.L_PAREN || + c == LS.R_PAREN || c == LS.L_BRACK || c == LS.R_BRACK || + c == LS.PERCENT || c == LS.L_BRACE || c == LS.R_BRACE || + c == LS.EQUALS || c == LS.DOT || c == LS.COLON || + isOperator(c) || isWhite(c); +} + +/** + * Checks if the provided character is a + * numerical size encoder + * + * Params: + * character = the character to check + * Returns: `true` if so, `false` otheriwse + */ +public bool isNumericalEncoder_Size(char character) +{ + return character == LS.ENC_BYTE || character == LS.ENC_WORD || + character == LS.ENC_INT || character == LS.ENC_LONG; +} + +/** + * Checks if the provided character is a + * numerical signage encoder + * + * Params: + * character = the character to check + * Returns: `true` if so, `false` otherwise + */ +public bool isNumericalEncoder_Signage(char character) +{ + return character == LS.ENC_SIGNED || character == LS.ENC_UNSIGNED; +} + +/** + * Checks if the provided character is + * either a numerical size encoder + * or signage encoder + * + * Params: + * character = the character to check + * Returns: `true` if so, `false` otherwise + */ +public bool isNumericalEncoder(char character) +{ + return isNumericalEncoder_Size(character) || + isNumericalEncoder_Signage(character); +} + +/** + * Checks if the given character is a valid + * escape character (something which would + * have followed a `\`) + * + * Params: + * character = the character to check + * Returns: `true` if so, `false` otherwise + */ +public bool isValidEscape_String(char character) +{ + return character == LS.BACKSLASH || character == LS.DOUBLE_QUOTE || character == LS.SINGLE_QUOTE || + character == LS.ESC_NOTHING || character == LS.ESC_NEWLINE || character == LS.ESC_CARRIAGE_RETURN || + character == LS.TAB || character == LS.ESC_BELL; +} + +/** + * Given a character return whether it is valid entry + * for preceding a '.'. + * + * Returns: `true` if so, otherwise `false` + */ +public bool isValidDotPrecede(char character) +{ + return character == LS.R_PAREN || character == LS.R_BRACK; // || isAlpha(character) || isDigit(character); } \ No newline at end of file diff --git a/source/tlang/compiler/lexer/kinds/arr.d b/source/tlang/compiler/lexer/kinds/arr.d new file mode 100644 index 0000000..719d7f3 --- /dev/null +++ b/source/tlang/compiler/lexer/kinds/arr.d @@ -0,0 +1,124 @@ +module tlang.compiler.lexer.kinds.arr; + +import tlang.compiler.lexer.core; + +/** + * An array-based tokenizer which takes a + * provided array of `Token[]`. useful + * for testing parser-only related things + * with concrete tokens + */ +public final class ArrLexer : LexerInterface +{ + /** + * The concrete token source + */ + private Token[] tokens; + + /** + * Position in the `tokens` array + */ + private ulong tokenPtr = 0; + + /** + * Constructs a new `ArrLexer` (dummy lexer) with + * the tokens already in concrete form in the + * provided array. + * + * Params: + * tokens = the `Token[]` + */ + this(Token[] tokens) + { + this.tokens = tokens; + } + + /** + * Returns the token at the current cursor + * position + * + * Returns: the `Token` + */ + public Token getCurrentToken() + { + return tokens[tokenPtr]; + } + + /** + * Moves the cursor one token forward + */ + public void nextToken() + { + tokenPtr++; + } + + /** + * Moves the cursor one token backwards + */ + public void previousToken() + { + tokenPtr--; + } + + /** + * Sets the position of the cursor + * + * Params: + * cursor = the new position + */ + public void setCursor(ulong cursor) + { + this.tokenPtr = cursor; + } + + /** + * Retrieves the cursor's current position + * + * Returns: the position + */ + public ulong getCursor() + { + return this.tokenPtr; + } + + /** + * Checks whether more tokens are available + * of not + * + * Returns: true if more tokens are available, false otherwise + */ + public bool hasTokens() + { + return tokenPtr < tokens.length; + } + + /** + * Get the line position of the lexer in the source text + * + * Returns: the position + */ + public ulong getLine() + { + return 0; // TODO: anything meaningful? + } + + /** + * Get the column position of the lexer in the source text + * + * Returns: the position + */ + public ulong getColumn() + { + return 0; // TODO: anything meaningful? + } + + /** + * Exhaustively provide a list of all tokens + * + * Returns: a `Token[]` containing all tokens + */ + public Token[] getTokens() + { + return tokens; + } +} \ No newline at end of file diff --git a/source/tlang/compiler/lexer/kinds/basic.d b/source/tlang/compiler/lexer/kinds/basic.d index ed48fd7..3d0db44 100644 --- a/source/tlang/compiler/lexer/kinds/basic.d +++ b/source/tlang/compiler/lexer/kinds/basic.d @@ -4,11 +4,14 @@ module tlang.compiler.lexer.kinds.basic; import std.container.slist; +import std.string : replace; import gogga; import std.conv : to; -import std.ascii : isDigit; +import std.ascii : isDigit, isAlpha, isWhite; import tlang.compiler.lexer.core; +enum EMPTY = ""; + /** * Represents a basic lexer which performs the whole tokenization * process in one short via a call to `performLex()`, only after @@ -54,7 +57,7 @@ public final class BasicLexer : LexerInterface */ public override void previousToken() { - tokenPtr--; + tokenPtr--; } /** @@ -119,7 +122,6 @@ public final class BasicLexer : LexerInterface return tokens; } - /** * Lexer state data */ @@ -130,458 +132,243 @@ public final class BasicLexer : LexerInterface private string currentToken; /* Current token */ private ulong position; /* Current character position */ private char currentChar; /* Current character */ - private bool stringMode; /* Whether we are in a string "we are here" or not */ - private bool floatMode; /* Whether or not we are building a floating point constant */ - /* The tokens */ private Token[] tokens; + /** + * Constructs a new lexer with the given + * source code of which is should tokenize + * + * Params: + * sourceCode = the source text + */ this(string sourceCode) { this.sourceCode = sourceCode; } + /** + * Checks whether or not we could shift our + * source text pointer forward if it would + * be within the boundries of the source text + * or not + * + * Returns: `true` if within the boundries, + * `false` otherwise + */ private bool isForward() { - return position+1 < sourceCode.length; + return position + 1 < sourceCode.length; } - public bool isBackward() + /** + * Checks whether or not we could shift our + * source text pointer backwards and it it + * would be within the boundries of the source + * text or not + * + * Returns: `true` if within the boundries, + * `false` otherwise + */ + private bool isBackward() { - return position-1 < sourceCode.length; - } - - - - /** - * Used for tokenising a2.b2 - * - * When the `.` is encountered - * and there are some characters - * behind it this checks if we can - * append a further dot to it - */ - private bool isBuildUpValidIdent() - { - import tlang.compiler.symbols.check; - return isPathIdentifier(currentToken) || isIdentifier(currentToken); + return position - 1 < sourceCode.length; } /** * Returns true if we have a token being built * false otherwise + * + * Returns: `true` if we have a token built-up, + * `false` otherwise */ private bool hasToken() { return currentToken.length != 0; } - /* Perform the lexing process */ - /* TODO: Use return value */ + /** + * Performs the lexing process + * + * Throws: + * LexerException on error tokenizing + */ public void performLex() { - while(position < sourceCode.length) + currentChar = sourceCode[position]; + while (position < sourceCode.length) { // gprintln("SrcCodeLen: "~to!(string)(sourceCode.length)); // gprintln("Position: "~to!(string)(position)); - currentChar = sourceCode[position]; - if(floatMode == true) + // // currentChar = sourceCode[position]; + // gprintln("Current Char\"" ~ currentChar ~ "\""); + // gprintln("Current Token\"" ~ currentToken ~ "\""); + // gprintln("Match alpha check" ~ to!(bool)(currentChar == LS.UNDERSCORE || isAlpha(currentChar))); + + if (isSplitter(currentChar)) { - if(isDigit(currentChar)) - { - /* tack on and move to next iteration */ - currentToken~=currentChar; - position++; - column++; - continue; - } - /* TODO; handle closer case and error case */ - else - { - /* TODO: Throw erropr here */ - if(isSpliter(currentChar)) - { - floatMode = false; - currentTokens ~= new Token(currentToken, line, column); - currentToken = ""; - /* We just flush and catch splitter in next round, hence below is commented out */ - // column++; - // position++; + if (currentToken.length != 0) + { + flush(); + } + if (isWhite(currentChar) ) { + if (improvedAdvance()) { + continue; + } else { + break; } - else - { - throw new LexerException(this, "Floating point '"~currentToken~"' cannot be followed by a '"~currentChar~"'"); - } - } - } - else if(currentChar == ' ' && !stringMode) - { - /* TODO: Check if current token is fulled, then flush */ - if(currentToken.length != 0) - { - currentTokens ~= new Token(currentToken, line, column); - currentToken = ""; - } - - column++; - position++; - } - else if(isSpliter(currentChar) && !stringMode) - { - /* The splitter token to finally insert */ + } /* The splitter token to finally insert */ string splitterToken; - gprintln("Build up: "~currentToken); - gprintln("Current char: "~currentChar); - - /* Check for case of `==` (where we are on the first `=` sign) */ - if(currentChar == '=' && isForward() && sourceCode[position+1] == '=') - { - /* Flush any current token (if exists) */ - if(currentToken.length) - { - currentTokens ~= new Token(currentToken, line, column); - currentToken = ""; + // gprintln("Build up: " ~ currentToken); + // gprintln("Current char, splitter: " ~ currentChar); + if (currentChar == LS.FORWARD_SLASH && isForward() && (sourceCode[position+1] == LS.FORWARD_SLASH || sourceCode[position+1] == LS.STAR)) { + if (!doComment()) { + break; } - - // Create the `==` token - currentTokens ~= new Token("==", line, column); - - // Skip over the current `=` and the next `=` - position+=2; - - column+=2; - - continue; } - /* FIXME: Add floating point support here */ - /* TODO: IF buildUp is all numerical and we have dot go into float mode */ - /* TODO: Error checking will need to be added */ - if(isNumericalStr(currentToken) && currentChar == '.') + /* Check for case of `==` or `=<` or `=>` (where we are on the first `=` sign) */ + if (currentChar == LS.EQUALS && isForward() && (sourceCode[position + 1] == LS.EQUALS || sourceCode[position + 1] == LS.LESS_THAN || sourceCode[position + 1] == LS.BIGGER_THAN)) { - /* Tack on the dot */ - currentToken~="."; - - /* Enable floating point mode and go to next iteration*/ - floatMode = true; - gprintln("Float mode just got enabled: Current build up: \""~currentToken~"\""); - column++; - position++; + buildAdvance(); + buildAdvance(); + flush(); continue; } + /* Check for case of `<=` or `>=` */ + if ((currentChar == LS.LESS_THAN || currentChar == LS.BIGGER_THAN) && isForward() && (sourceCode[position + 1] == LS.EQUALS || sourceCode[position + 1] == LS.LESS_THAN || sourceCode[position + 1] == LS.BIGGER_THAN)) + { + buildAdvance(); + buildAdvance(); + flush(); + continue; + } /** * Here we check if we have a `.` and that the characters - * preceding us were all godd for an identifier + * preceding us were all good for an identifier */ import misc.utils; - - if(currentChar == '.' && hasToken() && isBuildUpValidIdent()) + + if (currentChar == LS.DOT) { - gprintln("Bruh"); - /** - * Now we check that we have a character infront of us - * and that it is a letter - * - * TODO: Add _ check too as that is a valid identifier start - */ - if(isForward() && isCharacterAlpha(sourceCode[position+1])) + if (isBackward() && isWhite(sourceCode[position - 1])) { - position++; - column+=1; - - currentToken ~= '.'; - - continue; + throw new LexerException(this, "Character '.' is not allowed to follow a whitespace."); + } + if (isForward() && isWhite(sourceCode[position + 1])) + { + throw new LexerException(this, "Character '.' is not allowed to precede a whitespace."); + } + else if (!hasToken() && (isBackward() && !isValidDotPrecede( + sourceCode[position - 1]))) + { + throw new LexerException(this, "Character '.' should be preceded by valid identifier or numerical."); } else { - throw new LexerException(this, "Expected a letter to follow the ."); + splitterToken = EMPTY ~ currentChar; + improvedAdvance(); } - - } - /* Check if we need to do combinators (e.g. for ||, &&) */ - /* TODO: Second operand in condition out of bounds */ - else if(currentChar == '|' && (position+1) != sourceCode.length && sourceCode[position+1] == '|') - { - splitterToken = "||"; - column += 2; - position += 2; - } - else if(currentChar == '&' && (position+1) != sourceCode.length && sourceCode[position+1] == '&') + }else if (currentChar == LS.AMPERSAND && (position + 1) != sourceCode.length && sourceCode[position + 1] == LS.AMPERSAND) { splitterToken = "&&"; - column += 2; - position += 2; - } - else if (currentChar == '\n') /* TODO: Unrelated!!!!!, but we shouldn't allow this bahevaipur in string mode */ + improvedAdvance(2, false); + } + /* Check if we need to do combinators (e.g. for ||, &&) */ + /* TODO: Second operand in condition out of bounds */ + else if (currentChar == LS.SHEFFER_STROKE && isForward() && sourceCode[position + 1] == LS.SHEFFER_STROKE) { - line++; - column = 1; - - position++; + splitterToken = "||"; + improvedAdvance(2, false); + } else if (currentChar == LS.EXCLAMATION && isForward() && sourceCode[position + 1] == LS.EQUALS) + { + splitterToken = "!="; + improvedAdvance(2, false); + }else if (currentChar == LS.SHEFFER_STROKE) { + splitterToken = "|"; + improvedAdvance(1, false); + } else if (currentChar == LS.AMPERSAND) { + splitterToken = "&"; + improvedAdvance(1, false); + } else if (currentChar == LS.CARET) { + splitterToken = "^"; + improvedAdvance(1, false); + } else if (currentChar == LS.LESS_THAN) { + splitterToken = [LS.LESS_THAN]; + improvedAdvance(1, false); + } else if (currentChar == LS.BIGGER_THAN) { + splitterToken = [LS.BIGGER_THAN]; + improvedAdvance(1, false); + } + else if (isWhite(currentChar)) { + if (!improvedAdvance()) { + break; + } } else { - splitterToken = ""~currentChar; - column++; - position++; + splitterToken = EMPTY ~ currentChar; + improvedAdvance(); } - /* Flush the current token (if one exists) */ - if(currentToken.length) + if (currentToken.length) { - currentTokens ~= new Token(currentToken, line, column); - currentToken = ""; + flush(); } - + /* Add the splitter token (only if it isn't empty) */ - if(splitterToken.length) + if (splitterToken.length) { currentTokens ~= new Token(splitterToken, line, column); } } - else if(currentChar == '"') - { - /* If we are not in string mode */ - if(!stringMode) - { - /* Add the opening " to the token */ - currentToken ~= '"'; - - /* Enable string mode */ - stringMode = true; - } - /* If we are in string mode */ - else - { - /* Add the closing " to the token */ - currentToken ~= '"'; - - /* Flush the token */ - currentTokens ~= new Token(currentToken, line, column); - currentToken = ""; - - /* Get out of string mode */ - stringMode = false; - } - - column++; - position++; - } - else if(currentChar == '\\') - { - /* You can only use these in strings */ - if(stringMode) - { - /* Check if we have a next character */ - if(position+1 != sourceCode.length && isValidEscape_String(sourceCode[position+1])) - { - /* Add to the string */ - currentToken ~= "\\"~sourceCode[position+1]; - - column += 2; - position += 2; - } - /* If we don't have a next character then raise error */ - else - { - throw new LexerException(this, "Unfinished escape sequence"); - } - } - else - { - throw new LexerException(this, "Escape sequences can only be used within strings"); + //else if (currentChar == LS.UNDERSCORE || ((!isSplitter(currentChar) && !isDigit(currentChar)) && currentChar != LS.DOUBLE_QUOTE && currentChar != LS.SINGLE_QUOTE && currentChar != LS.BACKSLASH)) { + else if (currentChar == LS.UNDERSCORE || isAlpha(currentChar)) { + gprintln("path ident String"); + if (!doIdentOrPath()) { + break; + } else { + continue; } } - /* Character literal support */ - else if(!stringMode && currentChar == '\'') + else if (currentChar == LS.DOUBLE_QUOTE) { - currentToken ~= "'"; - - /* Character literal must be next */ - if(position+1 != sourceCode.length) - { - /* TODO: Escape support for \' */ - - /* Get the character */ - currentToken ~= ""~sourceCode[position+1]; - column++; - position++; - - - /* Closing ' must be next */ - if(position+1 != sourceCode.length && sourceCode[position+1] == '\'') - { - /* Generate and add the token */ - currentToken ~= "'"; - currentTokens ~= new Token(currentToken, line, column); - - /* Flush the token */ - currentToken = ""; - - column += 2; - position += 2; - } - else - { - throw new LexerException(this, "Was expecting closing ' when finishing character literal"); - } - } - else - { - throw new LexerException(this, LexerError.EXHAUSTED_CHARACTERS, "EOSC reached when trying to get character literal"); + if (!doString()) { + break; } } - /** - * If we are building up a number - * - * TODO: Build up token right at the end (#DuplicateCode) - */ - else if(isBuildUpNumerical()) + else if (currentChar == LS.SINGLE_QUOTE) { - gprintln("jfdjkhfdjkhfsdkj"); - /* fetch the encoder segment */ - char[] encoderSegment = numbericalEncoderSegmentFetch(); - - gprintln("isBuildUpNumerical(): Enter"); - - /** - * If we don't have any encoders - */ - if(encoderSegment.length == 0) - { - /* We can add a signage encoder */ - if(isNumericalEncoder_Signage(currentChar)) - { - gprintln("Hello"); - - /* Check if the next character is a size (it MUST be) */ - if(isForward() && isNumericalEncoder_Size(sourceCode[position+1])) - { - currentToken ~= currentChar; - column++; - position++; - - - } - else - { - throw new LexerException(this, "You MUST specify a size encoder after a signagae encoder"); - } - - - - - } - /* We can add a size encoder */ - else if(isNumericalEncoder_Size(currentChar)) - { - currentToken ~= currentChar; - column++; - position++; - } - /* We can add more numbers */ - else if(isDigit(currentChar)) - { - currentToken ~= currentChar; - column++; - position++; - } - /* Splitter (TODO) */ - else if(isSpliter(currentChar)) - { - /* Add the numerical literal as a new token */ - currentTokens ~= new Token(currentToken, line, column); - - /* Add the splitter token if not a newline */ - if(currentChar != '\n') - { - currentTokens ~= new Token(""~currentChar, line, column); - } - - - /* Flush the token */ - currentToken = ""; - - /* TODO: Check these */ - column += 2; - position += 2; - } - /* Anything else is invalid */ - else - { - throw new LexerException(this, "Not valid TODO"); - } + if (!doChar()) { + break; } - /** - * If we have one encoder - */ - else if((encoderSegment.length == 1)) - { - /* Check what the encoder is */ - - /** - * If we had a signage then we must have a size after it - */ - if(isNumericalEncoder_Signage(encoderSegment[0])) - { - /** - * Size encoder must then follow - */ - if(isNumericalEncoder_Size(currentChar)) - { - currentToken ~= currentChar; - column++; - position++; - - /* Add the numerical literal as a new token */ - currentTokens ~= new Token(currentToken, line, column); - - /* Flush the token */ - currentToken = ""; - - } - /** - * Anything else is invalid - */ - else - { - throw new LexerException(this, "A size-encoder must follow a signage encoder"); - } - } - else - { - throw new LexerException(this, "Cannot have another encoder after a size encoder"); - } - } - /* It is impossible to reach this as flushing means we cannot add more */ - else - { - assert(false); - } - - } - /* Any other case, keep building the curent token */ - else + else if (isDigit(currentChar)){ + if (!doNumber()) { + break; + } + currentToken = currentToken.replace("_", ""); + } + else if (currentChar == LS.BACKSLASH) { - currentToken ~= currentChar; - column++; - position++; + throw new LexerException(this, "Escape sequences can only be used within strings"); + } else { + throw new LexerException(this, "Unsupported Character in this position"); + //gprintln("Fuck " ~ " me col" ~ to!(string)(column)); } } /* If there was a token made at the end then flush it */ - if(currentToken.length) + if (currentToken.length) { currentTokens ~= new Token(currentToken, line, column); } @@ -589,311 +376,745 @@ public final class BasicLexer : LexerInterface tokens = currentTokens; } - private char[] numbericalEncoderSegmentFetch() - { - char[] numberPart; - ulong stopped; - for(ulong i = 0; i < currentToken.length; i++) - { - char character = currentToken[i]; - - if(isDigit(character)) - { - numberPart~=character; - } - else - { - stopped = i; - break; - } - } - - char[] remaining = cast(char[])currentToken[stopped..currentToken.length]; - - return remaining; - } - - /** - * Returns true if the current build up is entirely - * numerical - * - * FIXME: THis, probably by its own will pick up `UL` - * as a number, or even just `` - */ - private bool isBuildUpNumerical() - { - import std.ascii : isDigit; - - - char[] numberPart; - ulong stopped; - for(ulong i = 0; i < currentToken.length; i++) - { - char character = currentToken[i]; - - if(isDigit(character)) - { - numberPart~=character; - } - else - { - stopped = i; - break; - } - } - - /** - * We need SOME numerical stuff - */ - if(stopped == 0) - { + /** + * Processes an ident with or without a dot-path + * + * Returns: `true` if characters left in buffer, else `false` + */ + private bool doIdentOrPath () { + if (!buildAdvance()) { + flush(); return false; } - char[] remaining = cast(char[])currentToken[stopped..currentToken.length]; - - char lstEncoder; - - for(ulong i = 0; i < remaining.length; i++) - { - char character = remaining[i]; - - if(!isNumericalEncoder(character)) - { - return false; + while (true) { + if (currentChar == LS.DOT) { + if (isForward() && (isSplitter(sourceCode[position + 1]) || isDigit(sourceCode[position + 1]))) { + throw new LexerException(this, "Invalid character in identifier build up."); + } else { + if (!buildAdvance()) { + throw new LexerException(this, "Invalid character in identifier build up."); + //return false; + } + } + } else if (isSplitter(currentChar)) { + flush(); + return true; + } else if (!(isAlpha(currentChar) || isDigit(currentChar) || currentChar == LS.UNDERSCORE)) { + throw new LexerException(this, "Invalid character in identifier build up."); + } else { + if (!buildAdvance()) { + return false; + } } } - - return true; - - - } - /** - * Given a string return true if all characters - * are digits, false otherwise and false if - * the string is empty - */ - private static bool isNumericalStr(string input) + /** + * Tokenizes a character + * + * Returns: `true` if characters left in buffer, else `false` + */ + private bool doChar() { - /** - * If the given input is empty then return false - */ - if(input.length == 0) + if(!buildAdvance()) + { + throw new LexerException(this, "Expected character, but got EOF"); + } + /* Character literal must be next */ + bool valid; + if(currentChar == LS.BACKSLASH) + { + valid = doEscapeCode(); + } + else + { + valid = buildAdvance(); + } + if(!valid) + { + throw new LexerException(this, "Expected ''', but got EOF"); + } + + if(currentChar != LS.SINGLE_QUOTE) + { + throw new LexerException(this, "Expected ''', but got EOF"); + } + if(!buildAdvance()) + { + flush(); + return false; + } + flush(); + return true; + } + + /** + * Tokenizes a string + * + * Returns: `true` if characters left in buffer, else `false` + */ + private bool doString() + { + if(!buildAdvance()) + { + throw new LexerException(this, "Expected closing \", but got EOF"); + } + + while (true) { + if (currentChar == LS.DOUBLE_QUOTE) { + if (!buildAdvance) { + flush(); + return false; + } + return true; + } else if (currentChar == LS.BACKSLASH) { + if (!doEscapeCode()) { + throw new LexerException(this, "Expected closing \", but got EOF"); + } + } else if (currentChar == LS.NEWLINE) { + throw new LexerException(this, "Expected closing \", but got NEWLINE"); + } else { + if (!buildAdvance()) { + throw new LexerException(this, "Expected closing \", but got EOF"); + } + } + } + } + + /** + * Lex a comment, start by consuming the '/' and setting a flag for + * multi-line based on the next character and consume. + * + * Enters a loop that looks for the end of the comment and if not + * builds up the comment. + * + * Returns: `true` if characters left in buffer, else `false` + */ + private bool doComment() { + buildAdvance(); + // if (!buildAdvance()) { + // flush(); + // return false; + // } + bool multiLine = currentChar == LS.STAR; + if (!buildAdvance()) { + if (multiLine) { + throw new LexerException(this, "Expected closing Comment, but got EOF"); + } else { + flush(); + return false; + } + } + while (true) { + if (!multiLine && currentChar == LS.NEWLINE) { + flush(); + return advanceLine(); + } + if (multiLine && currentChar == LS.STAR && isForward() && sourceCode[position+1] == LS.FORWARD_SLASH) { + buildAdvance(); + if (!buildAdvance()) { + flush(); + return false; + } else { + return true; + } + } else { + if (!buildAdvance()) { + if (multiLine) + { + throw new LexerException(this, "Expected closing Comment, but got EOF"); + } + else + { + flush(); + return false; + } + } + } + } + } + + /** + * Lex an escape code. If valid one id found, add it to the token, else throw Exception + * + * Returns: `true` if characters left in buffer, else `false` + */ + private bool doEscapeCode() { + if (!buildAdvance()) { + return false; + } + // currentToken ~= LS.BACKSLASH; + if (isValidEscape_String(currentChar)) { + return buildAdvance(); + } else { + throw new LexerException(this, "Invalid escape code"); + } + // flush(); + } + + + /** + * Lex a number, this method lexes a plain number, float or numerically encoded. + * The Float and numerically encoded numbers are deferred to other methods. + * + * Returns: `true` if characters left in buffer, else `false` + */ + private bool doNumber() { + while (true) { + if (isDigit(currentChar) || currentChar == LS.UNDERSCORE) { + if(!buildAdvance()) { + currentToken = currentToken.replace("_", ""); + flush(); + return false; + } + } else if (currentChar == LS.DOT) { + return doFloat(); + } else if (isNumericalEncoder(currentChar)) { + return doEncoder(); + } else { + return true; + } + } + } + + /** + * Lex a numerical encoder, looks for Signage followed by Size, or if there is + * no signage, just the size. + * + * Returns: `true` if characters left in buffer, else `false` + */ + private bool doEncoder() { + if (isNumericalEncoder_Signage(currentChar)) { + if (!buildAdvance() || !isNumericalEncoder_Size(currentChar)) { + throw new LexerException(this, "Expected size indicator B,I,L,W but got EOF"); + } + } + if (isNumericalEncoder_Size(currentChar)) { + if (!buildAdvance()) { + flush(); + return false; + } else { + if (!isSplitter(currentChar)) { + throw new LexerException(this, "Expected splitter but got \"" ~ currentChar ~ "\"."); + } + } + } + flush(); + return true; + } + + /** + * Lex a floating point, the initial part of the number is lexed by the `doNumber()` + * method. Here we consume the '.' and consume digits until a splitter is reached. + * + * Returns: `true` if characters left in buffer, else `false` + */ + private bool doFloat() { + if (!buildAdvance()) { + throw new LexerException(this, "Floating point expected digit, got EOF."); + //return false; + } + size_t count = 0; + bool valid = false; + while (true) { + + if (isDigit(currentChar) || (count > 0 && currentChar == LS.UNDERSCORE)) + { + /* tack on and move to next iteration */ + valid = true; + if (!buildAdvance()) { + currentToken = currentToken.replace("_", ""); + flush(); + return false; + } + count++; + continue; + } + else + { + /* TODO: Throw erropr here */ + if (isSplitter(currentChar) && valid) + { + currentToken = currentToken.replace("_", ""); + flush(); + return true; + } + else + { + throw new LexerException(this, "Floating point '" ~ currentToken ~ "' cannot be followed by a '" ~ currentChar ~ "'"); + } + } + } + } + + /** + * Flush the current token to the token buffer. + */ + private void flush() + { + currentTokens ~= new Token(currentToken, line, column); + currentToken = EMPTY; + } + + /** + * Consume the current char into the current token + * + * Returns: `true` if characters left in buffer, else `false` + */ + private bool buildAdvance() + { + currentToken ~= currentChar; + return improvedAdvance(); + } + + /** + * Advances the source code pointer + * + * Params: + * inc = advancement counter, default 1 + * shouldFlush = whether or not to flush, default is `false` + * Returns: `true` if characters left in buffer, else `false` + */ + private bool improvedAdvance(int inc = 1, bool shouldFlush = false) + { + if (currentChar == LS.NEWLINE) + { + shouldFlush && flush(); + line++; + column = 1; + position++; + } + else + { + column += inc; + position += inc; + } + + if (position >= sourceCode.length) { return false; } - - /** - * If there are any characters in the string then - * check if all are digits - */ - for(ulong i = 0; i < input.length; i++) - { - char character = input[i]; - - if(!isDigit(character)) - { - return false; - } - } - + currentChar = sourceCode[position]; return true; } - private bool isSpliter(char character) + /** + * Advance the position, line and current token, reset the column to 1. + * + * Returns: `true` if characters left in buffer, else `false` + */ + private bool advanceLine() { - return character == ';' || character == ',' || character == '(' || - character == ')' || character == '[' || character == ']' || - character == '+' || character == '-' || character == '/' || - character == '%' || character == '*' || character == '&' || - character == '{' || character == '}' || character == '=' || - character == '|' || character == '^' || character == '!' || - character == '\n' || character == '~' || character =='.' || - character == ':'; //|| isNumericalEncoder(character); - } - - private bool isNumericalEncoder(char character) - { - return isNumericalEncoder_Size(character) || - isNumericalEncoder_Signage(character); - } - - private bool isNumericalEncoder_Size(char character) - { - return character == 'B' || character == 'W' || - character == 'I' || character == 'L'; - } - - private bool isNumericalEncoder_Signage(char character) - { - return character == 'S' || character == 'U'; - } - - /* Supported escapes \" */ - public bool isValidEscape_String(char character) - { - return true; /* TODO: Implement me */ + column = 1; + line++; + position++; + if (position >= sourceCode.length) + { + return false; + } + currentChar = sourceCode[position]; + return true; } } -/* Test input: `hello "world";` */ +version(unittest) +{ + /** + * Does a print out of some text just to show you + * where you are from within the caller + * + * Params: + * __LINE__ = line number (auto-filled) + * __MODULE__ = module name (auto-filled) + * __FUNCTION__ = function name (auto-filled) + */ + private void shout(int i = __LINE__, string mod = __MODULE__, string func = __FUNCTION__) + { + gprintln("Unittest at "~to!(string)(i)~" in "~func~" (within module "~mod~")"); + } +} + +/** + * Test input: `hello "world";` + */ unittest { + shout(); import std.algorithm.comparison; + string sourceCode = "hello \"world\";"; BasicLexer currentLexer = new BasicLexer(sourceCode); currentLexer.performLex(); - gprintln("Collected "~to!(string)(currentLexer.getTokens())); - assert(currentLexer.getTokens() == [new Token("hello", 0, 0), new Token("\"world\"", 0, 0), new Token(";", 0, 0)]); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [ + new Token("hello", 0, 0), new Token("\"world\"", 0, 0), + new Token(";", 0, 0) + ]); } -/* Test input: `hello "world"|| ` */ +/** + * Test input: `hello \n "world";` + */ unittest { + shout(); import std.algorithm.comparison; + + string sourceCode = "hello \n \"world\";"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + currentLexer.performLex(); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [ + new Token("hello", 0, 0), new Token("\"world\"", 0, 0), + new Token(";", 0, 0) + ]); +} + +/** + * Test input: `hello "wo\nrld";` + */ +unittest +{ + shout(); + import std.algorithm.comparison; + + string sourceCode = "hello \"wo\nrld\";"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + try { + currentLexer.performLex(); + } catch (LexerException) { + assert(true); + + } +} + +/** + * Test input: `hello "world"|| ` + */ +unittest +{ + shout(); + import std.algorithm.comparison; + string sourceCode = "hello \"world\"|| "; BasicLexer currentLexer = new BasicLexer(sourceCode); currentLexer.performLex(); - gprintln("Collected "~to!(string)(currentLexer.getTokens())); - assert(currentLexer.getTokens() == [new Token("hello", 0, 0), new Token("\"world\"", 0, 0), new Token("||", 0, 0)]); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [ + new Token("hello", 0, 0), new Token("\"world\"", 0, 0), + new Token("||", 0, 0) + ]); } -/* Test input: `hello "world"||` */ +/** + * Test input: `hello "world"&& ` + */ unittest { + shout(); import std.algorithm.comparison; - string sourceCode = "hello \"world\"||"; + + string sourceCode = "hello \"world\"&& "; BasicLexer currentLexer = new BasicLexer(sourceCode); currentLexer.performLex(); - gprintln("Collected "~to!(string)(currentLexer.getTokens())); - assert(currentLexer.getTokens() == [new Token("hello", 0, 0), new Token("\"world\"", 0, 0), new Token("||", 0, 0)]); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [ + new Token("hello", 0, 0), new Token("\"world\"", 0, 0), + new Token("&&", 0, 0) + ]); } -/* Test input: `hello "world"|` */ +/** + * Test input: `hello "wooorld"||` + */ unittest { + shout(); import std.algorithm.comparison; + + string sourceCode = "hello \"wooorld\"||"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + currentLexer.performLex(); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [ + new Token("hello", 0, 0), new Token("\"wooorld\"", 0, 0), + new Token("||", 0, 0) + ]); +} + +/** + * Test input: `hello "world"|` + */ +unittest +{ + shout(); + import std.algorithm.comparison; + string sourceCode = "hello \"world\";|"; BasicLexer currentLexer = new BasicLexer(sourceCode); currentLexer.performLex(); - gprintln("Collected "~to!(string)(currentLexer.getTokens())); - assert(currentLexer.getTokens() == [new Token("hello", 0, 0), new Token("\"world\"", 0, 0), new Token(";", 0, 0), new Token("|", 0, 0)]); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [ + new Token("hello", 0, 0), new Token("\"world\"", 0, 0), + new Token(";", 0, 0), new Token("|", 0, 0) + ]); } -/* Test input: ` hello` */ +/** + * Test input: ` hello` + */ unittest { + shout(); import std.algorithm.comparison; + string sourceCode = " hello"; BasicLexer currentLexer = new BasicLexer(sourceCode); currentLexer.performLex(); - gprintln("Collected "~to!(string)(currentLexer.getTokens())); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); assert(currentLexer.getTokens() == [new Token("hello", 0, 0)]); } -/* Test input: `hello;` */ +/** + * Test input: `//trist` + */ unittest { + shout(); import std.algorithm.comparison; + + string sourceCode = "//trist"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + currentLexer.performLex(); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [new Token("//trist", 0, 0)]); +} + +/** + * Test input: `/*trist\*\/` + */ +unittest +{ + shout(); + import std.algorithm.comparison; + + string sourceCode = "/*trist*/"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + currentLexer.performLex(); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [new Token("/*trist*/", 0, 0)]); +} + +/** + * Test input: `/*t\nr\ni\ns\nt\*\/` + */ +unittest +{ + shout(); + import std.algorithm.comparison; + + string sourceCode = "/*t\nr\ni\ns\nt*/"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + currentLexer.performLex(); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [new Token("/*t\nr\ni\ns\nt*/", 0, 0)]); +} + +/** + * Test input: `/*t\nr\ni\ns\nt\*\/ ` + */ +unittest +{ + shout(); + import std.algorithm.comparison; + + string sourceCode = "/*t\nr\ni\ns\nt*/ "; + BasicLexer currentLexer = new BasicLexer(sourceCode); + currentLexer.performLex(); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [new Token("/*t\nr\ni\ns\nt*/", 0, 0)]); +} + +/** + * Test input: `//trist \n hello` + */ +unittest +{ + shout(); + import std.algorithm.comparison; + + string sourceCode = "//trist \n hello"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + currentLexer.performLex(); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [ + new Token("//trist ", 0, 0), + new Token("hello", 0, 0), + ]); +} + +/** + * Test input: `hello;` + */ +unittest +{ + shout(); + import std.algorithm.comparison; + string sourceCode = " hello;"; BasicLexer currentLexer = new BasicLexer(sourceCode); currentLexer.performLex(); - gprintln("Collected "~to!(string)(currentLexer.getTokens())); - assert(currentLexer.getTokens() == [new Token("hello", 0, 0), new Token(";", 0, 0)]); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [ + new Token("hello", 0, 0), new Token(";", 0, 0) + ]); } -/* Test input: `hello "world\""` */ +/** + * Test input: `5+5` + */ unittest { + shout(); import std.algorithm.comparison; + + string sourceCode = "5+5"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + currentLexer.performLex(); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [ + new Token("5", 0, 0), + new Token("+", 0, 0), + new Token("5", 0, 0), + ]); +} + +/** + * Test input: `hello "world\""` + */ +unittest +{ + shout(); + import std.algorithm.comparison; + string sourceCode = "hello \"world\\\"\""; BasicLexer currentLexer = new BasicLexer(sourceCode); currentLexer.performLex(); - gprintln("Collected "~to!(string)(currentLexer.getTokens())); - assert(currentLexer.getTokens() == [new Token("hello", 0, 0), new Token("\"world\\\"\"", 0, 0)]); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [ + new Token("hello", 0, 0), new Token("\"world\\\"\"", 0, 0) + ]); } -/* Test input: `'c'` */ +/** + * Test input: `'c'` + */ unittest { + shout(); import std.algorithm.comparison; + string sourceCode = "'c'"; BasicLexer currentLexer = new BasicLexer(sourceCode); currentLexer.performLex(); - gprintln("Collected "~to!(string)(currentLexer.getTokens())); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); assert(currentLexer.getTokens() == [new Token("'c'", 0, 0)]); } -/* Test input: `2121\n2121` */ +/** + * Test input: `2121\n2121` + */ unittest { + shout(); import std.algorithm.comparison; + string sourceCode = "2121\n2121"; BasicLexer currentLexer = new BasicLexer(sourceCode); currentLexer.performLex(); - gprintln("Collected "~to!(string)(currentLexer.getTokens())); - assert(currentLexer.getTokens() == [new Token("2121", 0, 0), new Token("2121", 0, 0)]); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [ + new Token("2121", 0, 0), new Token("2121", 0, 0) + ]); } /** -* Test `=`` and `==` handling -*/ + * Test `=`` and `==` handling + */ unittest { + shout(); import std.algorithm.comparison; + string sourceCode = " =\n"; BasicLexer currentLexer = new BasicLexer(sourceCode); currentLexer.performLex(); - gprintln("Collected "~to!(string)(currentLexer.getTokens())); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); assert(currentLexer.getTokens() == [new Token("=", 0, 0)]); import std.algorithm.comparison; + sourceCode = " = ==\n"; currentLexer = new BasicLexer(sourceCode); currentLexer.performLex(); - gprintln("Collected "~to!(string)(currentLexer.getTokens())); - assert(currentLexer.getTokens() == [new Token("=", 0, 0), new Token("==", 0, 0)]); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [ + new Token("=", 0, 0), new Token("==", 0, 0) + ]); import std.algorithm.comparison; + sourceCode = " ==\n"; currentLexer = new BasicLexer(sourceCode); currentLexer.performLex(); - gprintln("Collected "~to!(string)(currentLexer.getTokens())); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); assert(currentLexer.getTokens() == [new Token("==", 0, 0)]); import std.algorithm.comparison; + sourceCode = " = =\n"; currentLexer = new BasicLexer(sourceCode); currentLexer.performLex(); - gprintln("Collected "~to!(string)(currentLexer.getTokens())); - assert(currentLexer.getTokens() == [new Token("=", 0, 0), new Token("=", 0, 0)]); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [ + new Token("=", 0, 0), new Token("=", 0, 0) + ]); import std.algorithm.comparison; + sourceCode = " ==, = ==\n"; currentLexer = new BasicLexer(sourceCode); currentLexer.performLex(); - gprintln("Collected "~to!(string)(currentLexer.getTokens())); - assert(currentLexer.getTokens() == [new Token("==", 0, 0), new Token(",", 0, 0), new Token("=", 0, 0), new Token("==", 0, 0)]); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [ + new Token("==", 0, 0), new Token(",", 0, 0), new Token("=", 0, 0), + new Token("==", 0, 0) + ]); // Test flushing of previous token import std.algorithm.comparison; + sourceCode = "i==i=\n"; currentLexer = new BasicLexer(sourceCode); currentLexer.performLex(); - gprintln("Collected "~to!(string)(currentLexer.getTokens())); - assert(currentLexer.getTokens() == [new Token("i", 0, 0), new Token("==", 0, 0), new Token("i", 0, 0), new Token("=", 0, 0)]); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [ + new Token("i", 0, 0), new Token("==", 0, 0), new Token("i", 0, 0), + new Token("=", 0, 0) + ]); } /** -* Test: Literal value encoding -* -* Tests validity -*/ + * Test: Literal value encoding + * + * Tests validity + */ unittest { + shout(); import std.algorithm.comparison; + string sourceCode; BasicLexer currentLexer; @@ -901,66 +1122,843 @@ unittest sourceCode = "21L"; currentLexer = new BasicLexer(sourceCode); currentLexer.performLex(); - gprintln("Collected "~to!(string)(currentLexer.getTokens())); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); assert(currentLexer.getTokens() == [new Token("21L", 0, 0)]); /* 21UL (valid) */ sourceCode = "21UL"; currentLexer = new BasicLexer(sourceCode); currentLexer.performLex(); - gprintln("Collected "~to!(string)(currentLexer.getTokens())); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); assert(currentLexer.getTokens() == [new Token("21UL", 0, 0)]); - // /* 21U (invalid) */ - // sourceCode = "21U "; - // currentLexer = new Lexer(sourceCode); - // // gprintln(currentLexer.performLex()); - // bool status = currentLexer.performLex(); - // gprintln("Collected "~to!(string)(currentLexer.getTokens())); - // assert(!status); + /* 21U (invalid) */ + sourceCode = "21U "; + currentLexer = new BasicLexer(sourceCode); + // gprintln(currentLexer.performLex()); + try { + currentLexer.performLex(); + assert(false); + } catch (LexerException) { + assert(true); + } + /* 21ULa (invalid) */ + sourceCode = "21ULa"; + currentLexer = new BasicLexer(sourceCode); + // gprintln(currentLexer.performLex()); + try { + currentLexer.performLex(); + assert(false); + } catch (LexerException) { + assert(true); + } - // /* 21UL (valid) */ - // sourceCode = "21UL"; - // currentLexer = new Lexer(sourceCode); - // currentLexer.performLex(); - // gprintln("Collected "~to!(string)(currentLexer.getTokens())); - // assert(currentLexer.getTokens() == [new Token("21UL", 0, 0)]); + /* 21UL (valid) */ + sourceCode = "21SI"; + currentLexer = new BasicLexer(sourceCode); + currentLexer.performLex(); + gprintln("Collected "~to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [new Token("21SI", 0, 0)]); - + /* 21UL; (valid) */ + sourceCode = "21SI;"; + currentLexer = new BasicLexer(sourceCode); + currentLexer.performLex(); + gprintln("Collected "~to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [ + new Token("21SI", 0, 0), + new Token(";", 0, 0) + ]); } -/* Test input: `1.5` */ +/** + * Test input: `1.5` + */ unittest { + shout(); import std.algorithm.comparison; + string sourceCode = "1.5"; BasicLexer currentLexer = new BasicLexer(sourceCode); currentLexer.performLex(); - gprintln("Collected "~to!(string)(currentLexer.getTokens())); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); assert(currentLexer.getTokens() == [new Token("1.5", 0, 0)]); } /** -* Test correct handling of dot-operator for -* non-floating point cases -* -* Input: `new A().l.p.p;` -*/ + * Test correct handling of dot-operator for + * non-floating point cases + * + * Input: `new A().l.p.p;` + */ unittest { + shout(); import std.algorithm.comparison; + string sourceCode = "new A().l.p.p;"; BasicLexer currentLexer = new BasicLexer(sourceCode); currentLexer.performLex(); - gprintln("Collected "~to!(string)(currentLexer.getTokens())); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); assert(currentLexer.getTokens() == [ - new Token("new", 0, 0), - new Token("A", 0, 0), - new Token("(", 0, 0), - new Token(")", 0, 0), - new Token(".", 0, 0), - new Token("l.p.p", 0, 0), - new Token(";", 0, 0) - ]); + new Token("new", 0, 0), + new Token("A", 0, 0), + new Token("(", 0, 0), + new Token(")", 0, 0), + new Token(".", 0, 0), + new Token("l.p.p", 0, 0), + new Token(";", 0, 0) + ]); +} + +/** + * Tab testing + */ +unittest +{ + shout(); + /** + * Test tab dropping in front of a float. + * Test calssification: Valid + * Test input: `\t1.5` + */ + gprintln("Tab Unit Test"); + import std.algorithm.comparison; + + string sourceCode = "\t1.5"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + currentLexer.performLex(); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [new Token("1.5", 0, 0)]); + + /** + * Test tab dropping before '.' of float. + * Catch fail for verification. + * Test calssification: Invalid + * Test input: `1\t.5` + */ + import std.algorithm.comparison; + + bool didFail = false; + sourceCode = "1\t.5"; + currentLexer = new BasicLexer(sourceCode); + try + { + currentLexer.performLex(); + } + catch (LexerException e) + { + didFail = true; + } + assert(didFail); + + /** + * Testing Float EOF after '.'. + * Test calssification: Invalid + * Test input: `1.` + */ + sourceCode = "1."; + currentLexer = new BasicLexer(sourceCode); + try + { + currentLexer.performLex(); + assert(false); + } + catch (LexerException e) + { + } + + /** + * Testing illegal backslash. + * Test calssification: Invalid + * Test input: `1.` + */ + sourceCode = "hello \\ "; + currentLexer = new BasicLexer(sourceCode); + try + { + currentLexer.performLex(); + assert(false); + } + catch (LexerException e) + { + } + + /** + * Test tab dropping after '.' of float. + * Catch fail for verification. + * Test calssification: Invalid + * Test input: `1.\t5` + */ + import std.algorithm.comparison; + + didFail = false; + sourceCode = "1.\t5"; + currentLexer = new BasicLexer(sourceCode); + try + { + currentLexer.performLex(); + } + catch (LexerException e) + { + didFail = true; + } + assert(didFail); + + /** + * Test tab dropping for an empty token array. + * Test calssification: Valid + * Test input: `\t\t\t\t\t` + */ + gprintln("Tab Unit Test"); + import std.algorithm.comparison; + + sourceCode = "\t\t\t\t\t"; + currentLexer = new BasicLexer(sourceCode); + currentLexer.performLex(); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens().length == 0); +} + +/** + * Test correct handling of dot-operator for + * non-floating point cases where whitespace has been inserted before and after. + * Test Classification: Invalid + * + * Input: `new A() .l.p.p;` + */ +unittest +{ + shout(); + import std.algorithm.comparison; + + bool didFail = false; + string sourceCode = "new A(). l.p.p;"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + try + { + currentLexer.performLex(); + } + catch (LexerException) + { + didFail = true; + } + assert(didFail); +} + +/** + * Test correct handling of dot-operator for + * non-floating point cases where whitespace has been inserted before and after. + * Test Classification: Invalid + * + * Input: `new A() . l.p.p;` + */ +unittest +{ + shout(); + import std.algorithm.comparison; + + bool didFail = false; + string sourceCode = "new A() . l.p.p;"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + try + { + currentLexer.performLex(); + } + catch (LexerException) + { + didFail = true; + } + assert(didFail); +} + +unittest +{ + shout(); + + /** + * Test dot for fail on dot operator with no buildup and invalid lead + * Catch fail for verification. + * Test calssification: Invalid + * Test input: `1.5.5` + */ + import std.algorithm.comparison; + + bool didFail = false; + string sourceCode = "1.5.5"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + try + { + currentLexer.performLex(); + } + catch (LexerException e) + { + didFail = true; + } + assert(didFail); + + /** + * Test for fail on space following dot operator. + * Test Classification: Invalid + * Input: `1. a` + */ + didFail = false; + sourceCode = "1. a"; + currentLexer = new BasicLexer(sourceCode); + try + { + currentLexer.performLex(); + } + catch (LexerException e) + { + didFail = true; + } + assert(didFail); + + /** + * Test for correct lex space following paren + * Test Classification: Valid + * Input: `).x` + */ + sourceCode = ").x"; + currentLexer = new BasicLexer(sourceCode); + currentLexer.performLex(); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [ + new Token(")", 0, 0), + new Token(".", 0, 0), + new Token("x", 0, 0), + ]); + /** + * Test for fail on space preceding dot operator. + * Test Classification: Invalid + * Input: `1 .a` + */ + didFail = false; + sourceCode = "1 .a"; + currentLexer = new BasicLexer(sourceCode); + try + { + currentLexer.performLex(); + } + catch (LexerException e) + { + didFail = true; + } + assert(didFail); +} + +/** + * Test newlines + * Test Classification: Valid + * Input: `\n\n\n\n` + */ +unittest +{ + shout(); + import std.algorithm.comparison; + + string sourceCode = "\n\n\n\n"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + currentLexer.performLex(); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens().length == 0); +} + +/** + * Test for character escape codes + * + * Input: `'\\'` + */ +unittest +{ + shout(); + import std.algorithm.comparison; + + string sourceCode = "'\\\\'"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + currentLexer.performLex(); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [ + new Token("'\\\\'", 0, 0), + ]); +} + +/** + * Test for character escape codes + * + * Input: `'\a'` + */ +unittest +{ + shout(); + import std.algorithm.comparison; + + string sourceCode = "'\\a'"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + currentLexer.performLex(); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [ + new Token("'\\a'", 0, 0), + ]); +} + +/** + * Test for invalid escape sequence + * Input: `'\f'` + */ +unittest +{ + shout(); + + bool didFail = false; + string sourceCode = "\\f"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + try + { + currentLexer.performLex(); + } + catch (LexerException e) + { + didFail = true; + } + assert(didFail); +} + +/** + * Test for invalid char in ident + * Input: `hello$k` + */ +unittest +{ + shout(); + + bool didFail = false; + string sourceCode = "hello$k"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + try + { + currentLexer.performLex(); + } + catch (LexerException e) + { + didFail = true; + } + assert(didFail); +} + +/** + * Test for invalid char in ident + * Input: `$` + */ +unittest +{ + shout(); + + bool didFail = false; + string sourceCode = "$"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + try + { + currentLexer.performLex(); + } + catch (LexerException e) + { + didFail = true; + } + assert(didFail); +} + +/** + * Testing Underscores in numbers + * + * Input: `1_ 1_2 1_2.3 1_2.3_ 1__2 1__2.3 1__.23__` + */ +unittest +{ + shout(); + import std.algorithm.comparison; + + string sourceCode = "1_ 1_2 1_2.3 1_2.3_ 1__2 1__2.3 1__.23__"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + currentLexer.performLex(); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [ + new Token("1", 0, 0), + new Token("12", 0, 0), + new Token("12.3", 0, 0), + new Token("12.3", 0, 0), + new Token("12", 0, 0), + new Token("12.3", 0, 0), + new Token("1.23", 0, 0), + ]); +} + +/** + * Testing Comparison in numbers + * + * Input: `<= >= ==` + */ +unittest +{ + shout(); + import std.algorithm.comparison; + + string sourceCode = "<= >= =< => == != < > ^"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + currentLexer.performLex(); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [ + new Token("<=", 0, 0), + new Token(">=", 0, 0), + new Token("=<", 0, 0), + new Token("=>", 0, 0), + new Token("==", 0, 0), + new Token("!=", 0, 0), + new Token("<", 0, 0), + new Token(">", 0, 0), + new Token("^", 0, 0), + ]); +} + +/** + * Testing Chars + * + * Input: `'a'` + */ +unittest +{ + shout(); + import std.algorithm.comparison; + + string sourceCode = "'a'"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + currentLexer.performLex(); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [ + new Token("'a'", 0, 0), + ]); +} + +/** + * Test for invalid ident + * Input: `hello. ` + */ +unittest +{ + shout(); + + bool didFail = false; + string sourceCode = "hello. "; + BasicLexer currentLexer = new BasicLexer(sourceCode); + try + { + currentLexer.performLex(); + } + catch (LexerException e) + { + didFail = true; + } + assert(didFail); +} + +/** + * Test for invalid ident + * Input: `hello.` + */ +unittest +{ + shout(); + + bool didFail = false; + string sourceCode = "hello."; + BasicLexer currentLexer = new BasicLexer(sourceCode); + try + { + currentLexer.performLex(); + } + catch (LexerException e) + { + didFail = true; + } + assert(didFail); +} + +/** + * Testing Chars + * Input: `'` + */ +unittest +{ + shout(); + + bool didFail = false; + string sourceCode = "'"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + try + { + currentLexer.performLex(); + } + catch (LexerException e) + { + didFail = true; + } + assert(didFail); +} + +/** + * Testing Chars + * Input: `'a` + */ +unittest +{ + shout(); + + bool didFail = false; + string sourceCode = "'a"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + try + { + currentLexer.performLex(); + } + catch (LexerException e) + { + didFail = true; + } + assert(didFail); +} + +/** + * Testing Chars + * Input: `'aa` + */ +unittest +{ + shout(); + + bool didFail = false; + string sourceCode = "'aa"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + try + { + currentLexer.performLex(); + } + catch (LexerException e) + { + didFail = true; + } + assert(didFail); +} + +/** + * Testing String EOF + * Input: `"a` + */ +unittest +{ + shout(); + + bool didFail = false; + string sourceCode = "\"a"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + try + { + currentLexer.performLex(); + } + catch (LexerException e) + { + didFail = true; + } + assert(didFail); +} + +/** + * Testing String EOF + * Input: `"a` + */ +unittest +{ + shout(); + + bool didFail = false; + string sourceCode = "\""; + BasicLexer currentLexer = new BasicLexer(sourceCode); + try + { + currentLexer.performLex(); + } + catch (LexerException e) + { + didFail = true; + } + assert(didFail); +} + +/** + * Testing String EOF + * Input: `"\` + */ +unittest +{ + shout(); + + bool didFail = false; + string sourceCode = "\"\\"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + try + { + currentLexer.performLex(); + } + catch (LexerException e) + { + didFail = true; + } + assert(didFail); +} + +/** + * Testing Comment EOF + * Input: `/*` + */ +unittest +{ + shout(); + + bool didFail = false; + string sourceCode = "/*"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + try + { + currentLexer.performLex(); + } + catch (LexerException e) + { + didFail = true; + } + assert(didFail); +} + +/** + * Testing Comment EOF + * Input: `/* ` + */ +unittest +{ + shout(); + + bool didFail = false; + string sourceCode = "/* "; + BasicLexer currentLexer = new BasicLexer(sourceCode); + try + { + currentLexer.performLex(); + } + catch (LexerException e) + { + didFail = true; + } + assert(didFail); +} + +/** +* Testing Line comment EOF +* +* Input: `//` +*/ +unittest +{ + shout(); + import std.algorithm.comparison; + + string sourceCode = "//"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + currentLexer.performLex(); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [ + new Token("//", 0, 0) + ]); +} + +/** + * Testing invalid Escape Code + * Input: `\p` + */ +unittest +{ + shout(); + + bool didFail = false; + string sourceCode = "\"\\p"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + try + { + currentLexer.performLex(); + } + catch (LexerException e) + { + didFail = true; + } + assert(didFail); +} + +/** + * Testing invalid Escape Code + * Input: `\p` + */ +unittest +{ + shout(); + + bool didFail = false; + string sourceCode = "\\p"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + try + { + currentLexer.performLex(); + } + catch (LexerException e) + { + didFail = true; + } + assert(didFail); +} + +/** + * Testing comment + * + * Input: `'a' ` + */ +unittest +{ + shout(); + import std.algorithm.comparison; + + string sourceCode = "'a' "; + BasicLexer currentLexer = new BasicLexer(sourceCode); + currentLexer.performLex(); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [ + new Token("'a'", 0, 0) + ]); +} + +/** + * Testing comment + * + * Input: `// \n` + */ +unittest +{ + shout(); + import std.algorithm.comparison; + + string sourceCode = "// \n"; + BasicLexer currentLexer = new BasicLexer(sourceCode); + currentLexer.performLex(); + gprintln("Collected " ~ to!(string)(currentLexer.getTokens())); + assert(currentLexer.getTokens() == [ + new Token("// ", 0, 0) + ]); } \ No newline at end of file diff --git a/source/tlang/compiler/parsing/core.d b/source/tlang/compiler/parsing/core.d index 72b8e25..11c6972 100644 --- a/source/tlang/compiler/parsing/core.d +++ b/source/tlang/compiler/parsing/core.d @@ -2008,6 +2008,124 @@ public final class Parser return statement; } + import std.container.slist : SList; + private SList!(Token) commentStack; + private void pushComment(Token commentToken) + { + // Sanity check + assert(getSymbolType(commentToken) == SymbolType.SINGLE_LINE_COMMENT || + getSymbolType(commentToken) == SymbolType.MULTI_LINE_COMMENT + ); + + // Push it onto top of stack + commentStack.insertFront(commentToken); + } + //TODO: Add a popToken() (also think if we want a stack-based mechanism) + private bool hasCommentsOnStack() + { + return getCommentCount() != 0; + } + + private ulong getCommentCount() + { + import std.range : walkLength; + return walkLength(commentStack[]); + } + + private void parseComment() + { + gprintln("parseComment(): Enter", DebugType.WARNING); + + Token curCommentToken = lexer.getCurrentToken(); + + pushComment(curCommentToken); + + // TODO: Do something here like placing it on some kind of stack + gprintln("Comment is: '"~curCommentToken.getToken()~"'"); + lexer.nextToken(); // Move off comment + + gprintln("parseComment(): Leave", DebugType.WARNING); + } + + /** + * Tests the handling of comments + */ + unittest + { + import tlang.compiler.lexer.kinds.arr : ArrLexer; + + string sourceCode = `module myCommentModule; + // Hello`; + + LexerInterface currentLexer = new BasicLexer(sourceCode); + (cast(BasicLexer)currentLexer).performLex(); + + Parser parser = new Parser(currentLexer); + + try + { + Module modulle = parser.parse(); + + assert(parser.hasCommentsOnStack()); + assert(parser.getCommentCount() == 1); + } + catch(TError e) + { + assert(false); + } + + sourceCode = `module myCommntedModule; + /*Hello */ + + /* Hello*/`; + + currentLexer = new BasicLexer(sourceCode); + (cast(BasicLexer)currentLexer).performLex(); + parser = new Parser(currentLexer); + + try + { + Module modulle = parser.parse(); + + assert(parser.hasCommentsOnStack()); + assert(parser.getCommentCount() == 2); + } + catch(TError e) + { + assert(false); + } + + sourceCode = `module myCommentedModule; + + void function() + { + /*Hello */ + /* Hello */ + // Hello + //Hello + } + `; + + currentLexer = new BasicLexer(sourceCode); + (cast(BasicLexer)currentLexer).performLex(); + parser = new Parser(currentLexer); + + try + { + Module modulle = parser.parse(); + + assert(parser.hasCommentsOnStack()); + assert(parser.getCommentCount() == 4); + } + catch(TError e) + { + assert(false); + } + } + + // TODO: We need to add `parseComment()` + // support here (see issue #84) + // TODO: This ic currently dead code and ought to be used/implemented private Statement parseStatement(SymbolType terminatingSymbol = SymbolType.SEMICOLON) { gprintln("parseStatement(): Enter", DebugType.WARNING); @@ -2080,6 +2198,12 @@ public final class Parser { statement = parseDerefAssignment(); } + /* If it is a kind-of comment */ + else if(symbol == SymbolType.SINGLE_LINE_COMMENT || symbol == SymbolType.MULTI_LINE_COMMENT) + { + gprintln("COMMENTS NOT YET PROPERLY SUPOORTED", DebugType.ERROR); + parseComment(); + } /* Error out */ else { @@ -2303,6 +2427,12 @@ public final class Parser modulle.addStatement(externStatement); } + /* If it is a kind-of comment */ + else if(symbol == SymbolType.SINGLE_LINE_COMMENT || symbol == SymbolType.MULTI_LINE_COMMENT) + { + gprintln("COMMENTS NOT YET PROPERLY SUPOORTED", DebugType.ERROR); + parseComment(); + } else { expect("parse(): Unknown '" ~ tok.getToken() ~ "'"); diff --git a/source/tlang/compiler/symbols/check.d b/source/tlang/compiler/symbols/check.d index 9cd8c84..121c923 100644 --- a/source/tlang/compiler/symbols/check.d +++ b/source/tlang/compiler/symbols/check.d @@ -290,6 +290,16 @@ public enum SymbolType */ GENERIC_TYPE_DECLARE, + /** + * Multi-line comment (frwd-slash-star) + */ + MULTI_LINE_COMMENT, + + /** + * Singleiline comment (frwd-slash-slash) + */ + SINGLE_LINE_COMMENT, + /** * Unknown symbol */ @@ -780,6 +790,16 @@ public SymbolType getSymbolType(Token tokenIn) { return SymbolType.STAR; } + /* Multi-line comment (fwrd-slash-star) check */ + else if(token[0] == '/' && token.length >= 2 && token[1]=='*') + { + return SymbolType.MULTI_LINE_COMMENT; + } + /* Single-line comment (fwrd-slash-slash) check */ + else if(token[0] == '/' && token.length >= 2 && token[1]=='/') + { + return SymbolType.SINGLE_LINE_COMMENT; + } /* Divide `/` operator check */ else if(token[0] == '/') {