Last active
January 29, 2025 15:54
-
-
Save japajoe/d5701ae5cd7c422fc173ee960f2c87e3 to your computer and use it in GitHub Desktop.
Simple tokenizer for C
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| using System.Collections.Generic; | |
| using System.Text; | |
/// <summary>
/// A small hand-written lexer that splits C source text into a flat list of tokens.
/// </summary>
/// <remarks>
/// Known limitations: operators are always emitted one character at a time
/// (<c>==</c> becomes two <c>=</c> tokens), character literals are not recognized,
/// and numbers are plain decimal digits with at most one decimal point
/// (no hex, exponent or suffix support).
/// An instance keeps its cursor in fields, so a single instance must not be used
/// to tokenize from several threads at once.
/// </remarks>
public class CTokenizer
{
    /// <summary>Category assigned to each token produced by <see cref="Tokenize"/>.</summary>
    public enum TokenType
    {
        Number,
        Identifier,
        Keyword,
        String,
        Operator,
        Comma,
        Semicolon,
        Colon,
        Period,
        Pound,
        SquareBracketOpen,
        SquareBracketClose,
        ParenthesisOpen,
        ParenthesisClose,
        CurlyBraceOpen,
        CurlyBraceClose,
        Comment,
        EndOfFile,
        Unknown
    }

    /// <summary>An immutable token: its category, its exact source text and where it starts.</summary>
    public class Token
    {
        /// <summary>Category of the token.</summary>
        public TokenType Type { get; }

        /// <summary>Source text of the token, including quotes and comment delimiters.</summary>
        public string Value { get; }

        /// <summary>Zero-based character index in the input at which the token starts.</summary>
        public int Position { get; }

        public Token(TokenType type, string value, int position)
        {
            Type = type;
            Value = value;
            Position = position;
        }

        public override string ToString()
        {
            return $"{Position} {Type} {Value}";
        }
    }

    // Characters that are emitted as single-character Operator tokens.
    private const string OperatorChars = "+-*/%=<>!&|^~?";

    private static readonly HashSet<string> Keywords = new HashSet<string>
    {
        "auto", "break", "case", "char", "const",
        "continue", "default", "do", "double", "else",
        "enum", "extern", "float", "for", "goto", "if",
        "inline", "int", "long", "register", "restrict",
        "return", "short", "signed", "sizeof", "static",
        "struct", "switch", "typedef", "union", "unsigned",
        "void", "volatile", "while",
    };

    private string _input;
    private int _position;

    public CTokenizer()
    {
        _input = string.Empty;
        _position = 0;
    }

    // Character under the cursor, or '\0' once the cursor has passed the end of the input.
    private char CurrentChar => _position < _input.Length ? _input[_position] : '\0';

    private bool IsEndOfFile => _position >= _input.Length;

    private void Advance()
    {
        _position++;
    }

    // Character immediately after the cursor, or '\0' past the end of the input.
    private char Peek()
    {
        return PeekAt(1);
    }

    // Character 'offset' positions after the cursor (offset >= 0), or '\0' past the end.
    private char PeekAt(int offset)
    {
        int index = _position + offset;
        return index < _input.Length ? _input[index] : '\0';
    }

    /// <summary>
    /// Splits <paramref name="input"/> into tokens.
    /// </summary>
    /// <param name="input">The C source text to tokenize.</param>
    /// <returns>
    /// The tokens in source order. Whitespace is skipped, comments are kept as
    /// <see cref="TokenType.Comment"/> tokens, unrecognized characters become
    /// <see cref="TokenType.Unknown"/> tokens, and the list always ends with a single
    /// <see cref="TokenType.EndOfFile"/> token.
    /// </returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="input"/> is null.</exception>
    public List<Token> Tokenize(string input)
    {
        if (input == null)
        {
            throw new System.ArgumentNullException(nameof(input));
        }

        _input = input;
        _position = 0;
        var tokens = new List<Token>();

        while (!IsEndOfFile)
        {
            char c = CurrentChar;
            TokenType punctuation;

            if (char.IsWhiteSpace(c))
            {
                Advance();
            }
            else if (c == '/' && Peek() == '/')
            {
                tokens.Add(TokenizeLineComment());
            }
            else if (c == '/' && Peek() == '*')
            {
                tokens.Add(TokenizeBlockComment());
            }
            else if (c == '-' && StartsNumber(1) && !PreviousTokenEndsOperand(tokens))
            {
                // A sign is part of the literal only when a number really follows and the
                // minus cannot be a binary operator ("n-1" is subtraction, "= -1" is a literal).
                tokens.Add(TokenizeNumber());
            }
            else if (StartsNumber(0))
            {
                tokens.Add(TokenizeNumber());
            }
            else if (char.IsLetter(c) || c == '_')
            {
                tokens.Add(TokenizeIdentifier());
            }
            else if (c == '"')
            {
                tokens.Add(TokenizeString());
            }
            else if (OperatorChars.IndexOf(c) >= 0)
            {
                tokens.Add(TokenizeSingle(TokenType.Operator));
            }
            else if (TryGetPunctuationType(c, out punctuation))
            {
                tokens.Add(TokenizeSingle(punctuation));
            }
            else
            {
                tokens.Add(TokenizeSingle(TokenType.Unknown));
            }
        }

        tokens.Add(new Token(TokenType.EndOfFile, "", _position));
        return tokens;
    }

    // True when the text 'offset' characters after the cursor begins an unsigned number:
    // either a digit, or a '.' immediately followed by a digit.
    private bool StartsNumber(int offset)
    {
        char first = PeekAt(offset);
        return char.IsDigit(first) || (first == '.' && char.IsDigit(PeekAt(offset + 1)));
    }

    // True when the most recent non-comment token can end an operand, which means a
    // following '-' has to be read as a binary operator rather than as a sign.
    private static bool PreviousTokenEndsOperand(List<Token> tokens)
    {
        for (int i = tokens.Count - 1; i >= 0; i--)
        {
            switch (tokens[i].Type)
            {
                case TokenType.Comment:
                    continue; // comments are transparent; keep looking further back
                case TokenType.Number:
                case TokenType.Identifier:
                case TokenType.String:
                case TokenType.ParenthesisClose:
                case TokenType.SquareBracketClose:
                    return true;
                default:
                    return false;
            }
        }

        return false;
    }

    // Maps a punctuation character to its dedicated token type; false if 'c' is not punctuation.
    private static bool TryGetPunctuationType(char c, out TokenType type)
    {
        switch (c)
        {
            case '[': type = TokenType.SquareBracketOpen; return true;
            case ']': type = TokenType.SquareBracketClose; return true;
            case '(': type = TokenType.ParenthesisOpen; return true;
            case ')': type = TokenType.ParenthesisClose; return true;
            case '{': type = TokenType.CurlyBraceOpen; return true;
            case '}': type = TokenType.CurlyBraceClose; return true;
            case ',': type = TokenType.Comma; return true;
            case ';': type = TokenType.Semicolon; return true;
            case ':': type = TokenType.Colon; return true;
            case '.': type = TokenType.Period; return true;
            case '#': type = TokenType.Pound; return true;
            default: type = TokenType.Unknown; return false;
        }
    }

    // Emits the character under the cursor as a one-character token of the given type.
    private Token TokenizeSingle(TokenType type)
    {
        var token = new Token(type, CurrentChar.ToString(), _position);
        Advance();
        return token;
    }

    // Reads a "//" comment up to, but not including, the terminating newline.
    private Token TokenizeLineComment()
    {
        int start = _position;
        Advance(); // first '/'
        Advance(); // second '/'
        while (!IsEndOfFile && CurrentChar != '\n')
        {
            Advance();
        }

        return new Token(TokenType.Comment, _input.Substring(start, _position - start), start);
    }

    // Reads a "/* ... */" comment including both delimiters. An unterminated comment
    // extends to the end of the input.
    private Token TokenizeBlockComment()
    {
        int start = _position;
        Advance(); // '/'
        Advance(); // '*'
        while (!IsEndOfFile && !(CurrentChar == '*' && Peek() == '/'))
        {
            Advance();
        }

        if (!IsEndOfFile)
        {
            Advance(); // '*'
            Advance(); // '/'
        }

        return new Token(TokenType.Comment, _input.Substring(start, _position - start), start);
    }

    // Reads an optionally negative decimal number with at most one decimal point.
    // The caller decides whether a leading '-' belongs to the literal.
    private Token TokenizeNumber()
    {
        int start = _position;
        if (CurrentChar == '-')
        {
            Advance();
        }

        bool hasDecimal = false;
        while (!IsEndOfFile && (char.IsDigit(CurrentChar) || CurrentChar == '.'))
        {
            if (CurrentChar == '.')
            {
                if (hasDecimal)
                {
                    break; // a second '.' starts the next token
                }

                hasDecimal = true;
            }

            Advance();
        }

        return new Token(TokenType.Number, _input.Substring(start, _position - start), start);
    }

    // Reads an identifier and classifies it as Keyword when it is in the keyword table.
    private Token TokenizeIdentifier()
    {
        int start = _position;
        while (!IsEndOfFile && (char.IsLetterOrDigit(CurrentChar) || CurrentChar == '_'))
        {
            Advance();
        }

        string value = _input.Substring(start, _position - start);
        TokenType type = Keywords.Contains(value) ? TokenType.Keyword : TokenType.Identifier;
        return new Token(type, value, start);
    }

    // Reads a double-quoted string literal including its quotes. Backslash escapes are
    // consumed as pairs so that \" does not end the literal. An unterminated literal
    // extends to the end of the input instead of running the cursor past it.
    private Token TokenizeString()
    {
        int start = _position;
        Advance(); // opening quote
        while (!IsEndOfFile && CurrentChar != '"')
        {
            if (CurrentChar == '\\')
            {
                Advance(); // the backslash; the escaped character is consumed below
                if (IsEndOfFile)
                {
                    break;
                }
            }

            Advance();
        }

        if (!IsEndOfFile)
        {
            Advance(); // closing quote
        }

        return new Token(TokenType.String, _input.Substring(start, _position - start), start);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment