Got the parser working. Now I'm switching from the toy language to CG syntax. The lexer is... done??? Maybe.

This commit is contained in:
Dustin Swan 2026-01-31 16:07:44 -07:00
parent f74d374555
commit 71237d0307
No known key found for this signature in database
GPG key ID: 30D46587E2100467
3 changed files with 312 additions and 132 deletions

View file

@ -1,135 +1,161 @@
/**
 * A lexical token, modelled as a union of object types discriminated by the
 * string literal in `kind`.
 *
 * The `number`, `int`, `float`, `string`, `ident` and `type-ident` variants
 * carry a `value` payload; every other variant consists of the tag alone.
 *
 * NOTE(review): `equals` and `greater-than` each appear twice in this union.
 * Duplicate members are harmless to the type checker, but they look like
 * leftovers from an edit — confirm and deduplicate.
 */
export type Token =
| { kind: 'let' }
| { kind: 'in' }
| { kind: 'number', value: number }
// Literals
| { kind: 'int', value: number }
| { kind: 'float', value: number }
| { kind: 'string', value: string }
| { kind: 'ident', value: string }
| { kind: 'type-ident', value: string }
| { kind: 'equals' }
// Brackets
| { kind: 'open-paren' }
| { kind: 'close-paren' }
| { kind: 'open-brace' }
| { kind: 'close-brace' }
| { kind: 'open-bracket' }
| { kind: 'close-bracket' }
// Symbols
| { kind: 'equals' }
| { kind: 'colon' }
| { kind: 'backslash' }
| { kind: 'pipe' }
| { kind: 'greater-than' }
| { kind: 'comma' }
| { kind: 'arrow' }
| { kind: 'if' }
| { kind: 'then' }
| { kind: 'else' }
| { kind: 'true' }
| { kind: 'false' }
| { kind: 'ampersand' }
| { kind: 'underscore' }
| { kind: 'dot' }
| { kind: 'at' }
// Arithmetic
| { kind: 'plus' }
| { kind: 'minus' }
| { kind: 'star' }
| { kind: 'slash' }
| { kind: 'less-than' }
| { kind: 'greater-than' }
| { kind: 'double-equals' }
| { kind: 'eof' }
export function tokenize(source: string): Token[] {
const tokens = [];
const tokens: Token[] = [];
let i = 0;
while (i < source.length) {
const char = source[i];
// skip whitespace
// Whitespace
if (/\s/.test(char)) {
i++;
continue;
}
// Multi-char: numbers
if (/[0-9]/.test(char)) {
let num = '';
while (i < source.length && /[0-9]/.test(source[i])) {
num += source[i];
if (char === '#') {
while (i < source.length && source[i] !== '\n') {
i++;
}
tokens.push({ kind: 'number', value: parseInt(num) });
continue;
}
// Multi-char: equals
if (char === '=') {
const nextChar = source[i + 1];
// Numbers
if (/[0-9]/.test(char)) { // have to start with a digit (?)
let num = '';
let hasDot = false;
if (nextChar === '=') {
tokens.push({ kind: 'double-equals' });
while (i < source.length && /[0-9.]/.test(source[i])) {
if (source[i] === '.') {
if (hasDot) break;
hasDot = true;
}
num += source[i];
i++;
continue;
} else if (nextChar === '>') {
tokens.push({ kind: 'arrow' });
i++;
continue;
} else {
tokens.push({ kind: 'equals' });
i++;
continue;
}
tokens.push(hasDot
? { kind: 'float', value: parseFloat(num) }
: { kind: 'int', value: parseInt(num) });
continue;
}
// Multi-char: strings
if (/[A-Za-z]/.test(char)) {
// Idents
if (/[A-Za-z_]/.test(char)) {
let str = '';
while (i < source.length && /[A-Za-z]/.test(source[i])) {
while (i < source.length && /[A-Za-z0-9_!-]/.test(source[i])) {
str += source[i];
i++;
}
if (str === 'let') {
tokens.push({ kind: 'let' });
} else if (str === 'in') {
tokens.push({ kind: 'in' });
} else if (str === 'if') {
tokens.push({ kind: 'if' });
} else if (str === 'then') {
tokens.push({ kind: 'then' });
} else if (str === 'else') {
tokens.push({ kind: 'else' });
} else if (str === 'true') {
tokens.push({ kind: 'true' });
} else if (str === 'false') {
tokens.push({ kind: 'false' });
} else {
tokens.push({ kind: 'ident', value: str });
}
const isType = /[A-Z]/.test(str[0]);
tokens.push(isType
? { kind: 'type-ident', value: str }
: { kind: 'ident', value: str });
continue;
}
// TODO: floats
// Strings
if (char === '"') {
i++;
let str = '';
while (i < source.length && source[i] !== '"') {
if (source[i] === '\\') {
i++;
if (i >= source.length) {
throw new Error('Unterminated string');
}
switch(source[i]) {
case 'n': str += '\n'; break;
case '"': str += '"'; break;
case '\\': str += '\\'; break;
case 't': str += '\t'; break;
}
} else {
str += source[i];
}
i++;
}
if (i >= source.length) {
throw new Error('Unterminated string');
}
tokens.push({ kind: 'string', value: str });
i++;
continue;
}
switch (char) {
case ',':
tokens.push({ kind: 'comma' });
break;
case '+':
tokens.push({ kind: 'plus' });
break;
case '-':
tokens.push({ kind: 'minus' });
break;
case '*':
tokens.push({ kind: 'star' });
break;
case '/':
tokens.push({ kind: 'slash' });
break;
case '(':
tokens.push({ kind: 'open-paren' });
break;
case ')':
tokens.push({ kind: 'close-paren' });
break;
case '<':
tokens.push({ kind: 'less-than' });
break;
case '>':
tokens.push({ kind: 'greater-than' });
break;
// Brackets
case '(': tokens.push({ kind: 'open-paren' }); break;
case ')': tokens.push({ kind: 'close-paren' }); break;
case '{': tokens.push({ kind: 'open-brace' }); break;
case '}': tokens.push({ kind: 'close-brace' }); break;
case '[': tokens.push({ kind: 'open-bracket' }); break;
case ']': tokens.push({ kind: 'close-bracket' }); break;
// Symbols
case '=': tokens.push({ kind: 'equals' }); break;
case ':': tokens.push({ kind: 'colon' }); break;
case '\\': tokens.push({ kind: 'backslash' }); break;
case '|': tokens.push({ kind: 'pipe' }); break;
// case '<': tokens.push({ kind: 'less-than' }); break;
case '>': tokens.push({ kind: 'greater-than' }); break;
case ',': tokens.push({ kind: 'comma' }); break;
case '&': tokens.push({ kind: 'ampersand' }); break;
case '_': tokens.push({ kind: 'underscore' }); break;
case '.': tokens.push({ kind: 'dot' }); break;
case '@': tokens.push({ kind: 'at' }); break;
// Arithmetic
case '+': tokens.push({ kind: 'plus' }); break;
case '-': tokens.push({ kind: 'minus' }); break;
case '*': tokens.push({ kind: 'star' }); break;
case '/': tokens.push({ kind: 'slash' }); break;
}
i++;