tlang/source/tlang/compiler/symbols/check.d

1008 lines
20 KiB
D

/**
* Token-to-symbol mappings (and vice-versa),
* facilities for testing which class a token
* belongs to (operators, accessors, etc.) and
* detection of the different kinds of identifiers
*/
module tlang.compiler.symbols.check;
import tlang.compiler.lexer.core.tokens : Token;
import std.conv : to;
import std.string : isNumeric, cmp;
import std.algorithm.searching : canFind;
import tlang.misc.utils;
import tlang.misc.logging;
/**
* All symbol types that a token can be
* classified as (see `getSymbolType(Token)`)
*/
public enum SymbolType
{
/**
* Default symbol (TODO: idk why this exists)
*
* Being the first member, this is the `.init`
* value of `SymbolType`
*/
LE_SYMBOL,
/**
* Any sort of identifier
*
* Must start with a letter or an
* underscore, can contain numbers
* and may contain periods.
*
* It may also contain underscores.
*/
IDENT_TYPE,
/**
* Any sort of number, this can
* be `8` or `8.5`
*/
NUMBER_LITERAL,
/**
* A character constant like `'a'`
*/
CHARACTER_LITERAL,
/**
* A string constant like `"FELLA"`
*/
STRING_LITERAL,
/**
* Semicolon `;`
*/
SEMICOLON,
/**
* Left smooth brace $(LPAREN)
*/
LBRACE,
/**
* Right smooth brace $(RPAREN)
*/
RBRACE,
/**
* Assignment symbol `=`
*/
ASSIGN,
/**
* Comma `,`
*/
COMMA,
/**
* Left curly brace `{`
*/
OCURLY,
/**
* Right curly brace `}`
*/
CCURLY,
/**
* Module keyword `module`
*/
MODULE,
/**
* New keyword `new`
*/
NEW,
/**
* If keyword `if`
*/
IF,
/**
* Else keyword `else`
*/
ELSE,
/**
* Discard keyword `discard`
*/
DISCARD,
/**
* While keyword `while`
*/
WHILE,
/**
* Class keyword `class`
*/
CLASS,
/**
* Inheritance operator `:`
*/
INHERIT_OPP,
/**
* Tilde `~`
*/
TILDE,
/**
* For keyword `for`
*/
FOR,
/**
* Super keyword `super`
*/
SUPER,
/**
* This keyword `this`
*/
THIS,
/**
* Switch keyword `switch`
*/
SWITCH,
/**
* Return keyword `return`
*/
RETURN,
/**
* Public keyword `public`
*/
PUBLIC,
/**
* Private keyword `private`
*/
PRIVATE,
/**
* Protected keyword `protected`
*/
PROTECTED,
/**
* Static keyword `static`
*/
STATIC,
/**
* Case keyword `case`
*/
CASE,
/**
* Goto keyword `goto`
*/
GOTO,
/**
* Do keyword `do`
*/
DO,
/**
* Dot operator `.`
*/
DOT,
/**
* Delete keyword `delete`
*/
DELETE,
/**
* Struct keyword `struct`
*/
STRUCT,
/**
* Subtraction operator `-`
*/
SUB,
/**
* Addition operator `+`
*/
ADD,
/**
* Division operator `/`
*/
DIVIDE,
/**
* Star operator `*`
*/
STAR,
/**
* Ampersand (reference) operator `&`
*/
AMPERSAND,
/**
* Equality operator `==`
*/
EQUALS,
/**
* Greater than operator `>`
*/
GREATER_THAN,
/**
* Smaller than operator `<`
*/
SMALLER_THAN,
/**
* Greater than or equals to operator `>=`
*/
GREATER_THAN_OR_EQUALS,
/**
* Smaller than or equals to operator `<=`
*/
SMALLER_THAN_OR_EQUALS,
/**
* Opening bracket `[`
*/
OBRACKET,
/**
* Closing bracket `]`
*/
CBRACKET,
/**
* Cast keyword `cast`
*/
CAST,
/**
* Extern keyword `extern`
*/
EXTERN,
/**
* Extern-function keyword `efunc`
*/
EXTERN_EFUNC,
/**
* Extern-variable keyword `evar`
*/
EXTERN_EVAR,
/**
* Generic keyword `generic`
*/
GENERIC_TYPE_DECLARE,
/**
* Import keyword `import`
*/
IMPORT,
/**
* Multi-line comment (forward-slash-star)
*/
MULTI_LINE_COMMENT,
/**
* Single-line comment (forward-slash-slash)
*/
SINGLE_LINE_COMMENT,
/**
* Unknown symbol, used when a token
* matches none of the other types
*/
UNKNOWN
}
/* TODO: Later build classes specific to symbol */
/* TODO: Check if below is even used */
/**
 * Tests whether the given token string names
 * one of the built-in types (`byte`, `ubyte`,
 * `short`, `ushort`, `int`, `uint`, `long`,
 * `ulong` or `void`)
 *
 * Params:
 *   tokenStr = the string to check
 * Returns: `true` if it is exactly one of the
 * built-in type names, `false` otherwise
 */
public bool isType(string tokenStr)
{
    switch(tokenStr)
    {
        /* Signed and unsigned integral types */
        case "byte", "ubyte", "short", "ushort",
             "int", "uint", "long", "ulong":
            return true;
        /* The "no value" type */
        case "void":
            return true;
        default:
            return false;
    }
}
/**
 * Tests whether the given token string is a path
 * identifier: a name with dots in between its
 * parts, like `a.b`, that is not a floating point
 * literal such as `7.5`. Underscores `_` are
 * allowed anywhere, digits anywhere but in the
 * first position.
 *
 * Params:
 *   token = the token string to check
 * Returns: `true` if it is a path identifier
 * (at least one dot, none of them trailing),
 * `false` otherwise
 */
public bool isPathIdentifier(string token)
{
    /* Set once any dot has been seen */
    bool sawDot = false;

    foreach(size_t position, char current; token)
    {
        if(position == 0)
        {
            /* Leading character: letter or underscore only */
            if(!(isCharacterAlpha(current) || current == '_'))
            {
                return false;
            }
        }
        else if(current == '.')
        {
            sawDot = true;
        }
        else if(!(isCharacterAlpha(current) || current == '_' || isCharacterNumber(current)))
        {
            /* Anything but a letter, underscore or digit is illegal */
            return false;
        }
    }

    /* A path may not end on a dot */
    if(token.length > 0 && token[$ - 1] == '.')
    {
        return false;
    }

    return sawDot;
}
/**
 * Checks if the given token string is an identifier,
 * which means it can contain letters, numbers and
 * underscores `_` but MUST start with a letter or
 * an underscore. Dots are not allowed.
 *
 * An empty string is not an identifier.
 *
 * Params:
 *   token = the token string to check
 * Returns: `true` if an identifier, `false`
 * otherwise
 */
public bool isIdentifier(string token)
{
    /* Without a first character there is nothing that could start the name */
    if(token.length == 0)
    {
        return false;
    }

    /* The first character may only be a letter or an underscore */
    if(!(isCharacterAlpha(token[0]) || token[0] == '_'))
    {
        return false;
    }

    /* Every following character may additionally be a digit */
    foreach(char character; token[1 .. $])
    {
        if(!(isCharacterAlpha(character) || character == '_' || isCharacterNumber(character)))
        {
            return false;
        }
    }

    return true;
}
/**
 * Tests whether the given `Token` is an accessor,
 * that is one of `public`, `private` or `protected`
 *
 * Params:
 *   token = the `Token` to check
 * Returns: `true` if so, `false` otherwise
 */
public bool isAccessor(Token token)
{
    /* Classify once and compare the result */
    immutable SymbolType kind = getSymbolType(token);

    switch(kind)
    {
        case SymbolType.PUBLIC, SymbolType.PRIVATE, SymbolType.PROTECTED:
            return true;
        default:
            return false;
    }
}
/**
 * Tests whether the given `Token` is a modifier;
 * `static` is the only one recognised
 *
 * Params:
 *   token = the `Token` to check
 * Returns: `true` if so, `false` otherwise
 */
public bool isModifier(Token token)
{
    immutable SymbolType kind = getSymbolType(token);
    return kind == SymbolType.STATIC;
}
/**
 * Tests whether the given `Token` is a plain
 * identifier, one without any dots/periods
 *
 * Params:
 *   tokenIn = the `Token` to test
 * Returns: `true` if so, `false` otherwise
 */
public bool isIdentifier_NoDot(Token tokenIn)
{
    /* It must classify as an identifier AND be free of dots */
    return getSymbolType(tokenIn) == SymbolType.IDENT_TYPE
        && isIdentifier(tokenIn.getToken());
}
/**
 * Tests whether the given `Token` is an identifier
 * that is allowed to contain `.`/periods - a
 * so-called path identifier. A plain identifier
 * without any dots is accepted as well.
 *
 * Params:
 *   tokenIn = the `Token` to test
 * Returns: `true` if so, `false` otherwise
 */
public bool isIdentifier_Dot(Token tokenIn)
{
    /* Any other type of symbol is rejected right away */
    if(getSymbolType(tokenIn) != SymbolType.IDENT_TYPE)
    {
        return false;
    }

    string text = tokenIn.getToken();
    return isPathIdentifier(text) || isIdentifier(text);
}
/**
 * Checks if the given token string is
 * a numeric literal. It has support
 * for an optional size specifier at the
 * END of the token: `UL`, `UI`, `L` or `I`.
 *
 * The specifier is only recognised as a
 * suffix, so something like `1L2` is not
 * mistaken for `1L` followed by junk that
 * gets cut off. The remaining text must
 * contain at least one decimal digit, as
 * `std.string.isNumeric` on its own also
 * accepts the words `nan` and `inf`, which
 * would turn identifiers of that name into
 * number literals.
 *
 * Params:
 *   token = the string token to check
 * Returns: `true` if it is a numeric literal,
 * `false` otherwise
 */
private bool isNumericLiteral(string token)
{
    /* Suffix test by slicing, safe for tokens shorter than the suffix */
    static bool endsIn(string text, string suffix)
    {
        return text.length >= suffix.length
            && text[$ - suffix.length .. $] == suffix;
    }

    /* Strip the size specifier, two-character forms first */
    string numberPart = token;
    if(endsIn(token, "UL") || endsIn(token, "UI"))
    {
        numberPart = token[0 .. $ - 2];
    }
    else if(endsIn(token, "L") || endsIn(token, "I"))
    {
        numberPart = token[0 .. $ - 1];
    }

    /* Every real number has a digit, `nan`/`inf` (and an empty rest) do not */
    bool hasDigit = false;
    foreach(char character; numberPart)
    {
        if(character >= '0' && character <= '9')
        {
            hasDigit = true;
            break;
        }
    }

    // TODO: Check what the lexer is actually able to produce here,
    // ... `isNumeric` follows D's rules for numbers, not T's
    return hasDigit && isNumeric(numberPart);
}
/**
 * Maps a given `Token` to its `SymbolType` such
 * that you can determine the type of symbol it
 * is.
 *
 * The checks run in this order: character literal,
 * string literal, number literal, keywords,
 * identifiers and finally operators/punctuation.
 * Two-character operators (`==`, `>=`, `<=`) and
 * comment openers are told apart from their
 * one-character counterparts (`=`, `>`, `<`, `/`)
 * before the latter are reported.
 *
 * Params:
 *   tokenIn = the `Token` to check
 * Returns: the `SymbolType` of this token, if
 * unrecognizable (which includes an empty token)
 * then `SymbolType.UNKNOWN` is returned
 */
public SymbolType getSymbolType(Token tokenIn)
{
    string token = tokenIn.getToken();

    /* Nothing to classify, and nothing that may be indexed below */
    if(token.length == 0)
    {
        return SymbolType.UNKNOWN;
    }

    immutable char first = token[0];

    /* Character literal check */
    if(first == '\'')
    {
        /* TODO: Add escape sequence support */
        /* The length check guards against tokens like `'` or `'a` */
        if(token.length >= 3 && token[2] == '\'')
        {
            return SymbolType.CHARACTER_LITERAL;
        }

        /* Starts like a character literal but isn't one */
        return SymbolType.UNKNOWN;
    }

    /* String literal check */
    if(first == '\"' && token[$ - 1] == '\"')
    {
        return SymbolType.STRING_LITERAL;
    }

    /* Number literal check */
    if(isNumericLiteral(token))
    {
        return SymbolType.NUMBER_LITERAL;
    }

    /* Keyword check */
    switch(token)
    {
        case "struct":    return SymbolType.STRUCT;
        case "if":        return SymbolType.IF;
        case "else":      return SymbolType.ELSE;
        case "while":     return SymbolType.WHILE;
        case "class":     return SymbolType.CLASS;
        case "static":    return SymbolType.STATIC;
        case "private":   return SymbolType.PRIVATE;
        case "public":    return SymbolType.PUBLIC;
        case "protected": return SymbolType.PROTECTED;
        case "return":    return SymbolType.RETURN;
        case "switch":    return SymbolType.SWITCH;
        case "this":      return SymbolType.THIS;
        case "super":     return SymbolType.SUPER;
        case "for":       return SymbolType.FOR;
        case "case":      return SymbolType.CASE;
        case "goto":      return SymbolType.GOTO;
        case "do":        return SymbolType.DO;
        case "delete":    return SymbolType.DELETE;
        case "efunc":     return SymbolType.EXTERN_EFUNC;
        case "evar":      return SymbolType.EXTERN_EVAR;
        case "extern":    return SymbolType.EXTERN;
        case "module":    return SymbolType.MODULE;
        case "new":       return SymbolType.NEW;
        case "cast":      return SymbolType.CAST;
        case "discard":   return SymbolType.DISCARD;
        case "generic":   return SymbolType.GENERIC_TYPE_DECLARE;
        case "import":    return SymbolType.IMPORT;
        default:
            break;
    }

    /* An identifier/type (of some sorts) - further inspection in parser is needed */
    if(isPathIdentifier(token) || isIdentifier(token))
    {
        return SymbolType.IDENT_TYPE;
    }

    /* Operators and punctuation, selected by their first character */
    switch(first)
    {
        case ';': return SymbolType.SEMICOLON;
        /* Equality `==` must be ruled out before assignment `=` */
        case '=': return token == "==" ? SymbolType.EQUALS : SymbolType.ASSIGN;
        case '(': return SymbolType.LBRACE;
        case ')': return SymbolType.RBRACE;
        case '{': return SymbolType.OCURLY;
        case '}': return SymbolType.CCURLY;
        case '[': return SymbolType.OBRACKET;
        case ']': return SymbolType.CBRACKET;
        case ',': return SymbolType.COMMA;
        case ':': return SymbolType.INHERIT_OPP;
        case '~': return SymbolType.TILDE;
        case '.': return SymbolType.DOT;
        case '+': return SymbolType.ADD;
        case '-': return SymbolType.SUB;
        case '*': return SymbolType.STAR;
        case '/':
            /* Comment openers take precedence over the divide operator */
            if(token.length >= 2 && token[1] == '*')
            {
                return SymbolType.MULTI_LINE_COMMENT;
            }
            if(token.length >= 2 && token[1] == '/')
            {
                return SymbolType.SINGLE_LINE_COMMENT;
            }
            return SymbolType.DIVIDE;
        case '&': return SymbolType.AMPERSAND;
        /* `>=` must be ruled out before `>`, else it can never be reported */
        case '>':
            return token == ">=" ? SymbolType.GREATER_THAN_OR_EQUALS
                                 : SymbolType.GREATER_THAN;
        /* `<=` must be ruled out before `<`, else it can never be reported */
        case '<':
            return token == "<=" ? SymbolType.SMALLER_THAN_OR_EQUALS
                                 : SymbolType.SMALLER_THAN;
        default:
            break;
    }

    return SymbolType.UNKNOWN;
}
/**
 * Tests whether the given token is a
 * mathematical operator, which is decided
 * by its first character being one of
 * `+`, `-`, `*` or `/`
 *
 * Params:
 *   token = the `Token` to test
 * Returns: `true` if it is a mathematical
 * operator, `false` otherwise
 */
public bool isMathOp(Token token)
{
    /* Only the leading character takes part in the decision */
    switch(token.getToken()[0])
    {
        case '+', '-', '*', '/':
            return true;
        default:
            return false;
    }
}
/**
 * Tests whether the given token is a binary
 * operator, meaning one which would be
 * infixed/flanked by two operands (one to
 * the left and one to the right)
 *
 * Params:
 *   token = the `Token` to test
 * Returns: `true` if it is a binary
 * operator, `false` otherwise
 */
public bool isBinaryOp(Token token)
{
    string text = token.getToken();

    switch(text[0])
    {
        /*
         * Any token starting with one of these counts, which
         * already takes care of `&&`, `||`, `>=` and `<=`
         */
        case '&', '|', '^', '~', '<', '>':
            return true;
        /* Equality is the only operator matched as a whole */
        default:
            return text == "==";
    }
}
/**
 * Maps a `SymbolType` back to the text it
 * stands for
 *
 * For example <code>SymbolType.ADD</code> returns +
 *
 * Only a subset of the symbols has a
 * back-mapping; for any other symbol an
 * error is logged and the program is
 * halted by `assert(false)`.
 *
 * Params:
 *   symbolIn = The symbol to lookup against
 * Returns: The corresponding text
 *
 */
public string getCharacter(SymbolType symbolIn)
{
    switch(symbolIn)
    {
        /* Arithmetic operators */
        case SymbolType.ADD:    return "+";
        case SymbolType.STAR:   return "*";
        case SymbolType.SUB:    return "-";
        case SymbolType.DIVIDE: return "/";

        /* Curly braces */
        case SymbolType.OCURLY: return "{";
        case SymbolType.CCURLY: return "}";

        /* Comparison operators */
        case SymbolType.EQUALS:                 return "==";
        case SymbolType.SMALLER_THAN:           return "<";
        case SymbolType.SMALLER_THAN_OR_EQUALS: return "<=";
        case SymbolType.GREATER_THAN:           return ">";
        case SymbolType.GREATER_THAN_OR_EQUALS: return ">=";

        /* Remaining symbols */
        case SymbolType.AMPERSAND: return "&";
        case SymbolType.SEMICOLON: return ";";

        /* Everything else has no textual counterpart here */
        default:
            ERROR("getCharacter: No back-mapping for "~to!(string)(symbolIn));
            assert(false);
    }
}
/* Test: A single quoted character is a character literal */
unittest
{
    Token charToken = new Token("'c'", 0, 0);
    assert(getSymbolType(charToken) == SymbolType.CHARACTER_LITERAL);
}
/* Test: Double quoted text is a string literal */
unittest
{
    Token stringToken = new Token("\"hello\"", 0, 0);
    assert(getSymbolType(stringToken) == SymbolType.STRING_LITERAL);
}
/* Test: Number literals (and something that only starts like one) */
unittest
{
    /* Digits only */
    assert(getSymbolType(new Token("2121", 0, 0)) == SymbolType.NUMBER_LITERAL);

    /* Digits followed by a letter */
    assert(getSymbolType(new Token("2121a", 0, 0)) != SymbolType.NUMBER_LITERAL);
}
/* Test: Identifier tests */
unittest
{
    /* A leading underscore is fine */
    assert(getSymbolType(new Token("_yolo2", 0, 0)) == SymbolType.IDENT_TYPE);

    /* A leading digit is not */
    assert(getSymbolType(new Token("2_2ff", 0, 0)) != SymbolType.IDENT_TYPE);
}
/* Test: Telling plain identifiers and path identifiers apart */
unittest
{
    /* A dotted name is a path identifier and nothing else */
    assert(isPathIdentifier("hello.e.e"));
    assert(!isIdentifier("hello.e.e"));

    /* A name without dots is a plain identifier and nothing else */
    assert(isIdentifier("hello"));
    assert(!isPathIdentifier("hello"));

    /* TODO: Add support for the below in lexer */
    assert(isPathIdentifier("hello._a.e"));
    assert(isPathIdentifier("hello._2._e"));
}