Skip to content

Instantly share code, notes, and snippets.

@japajoe
Last active January 29, 2025 15:54
Show Gist options
  • Save japajoe/d5701ae5cd7c422fc173ee960f2c87e3 to your computer and use it in GitHub Desktop.
Save japajoe/d5701ae5cd7c422fc173ee960f2c87e3 to your computer and use it in GitHub Desktop.
Simple tokenizer for C
using System.Collections.Generic;
using System.Text;
/// <summary>
/// A small hand-written lexer that splits C source text into a flat list of tokens.
/// </summary>
/// <remarks>
/// The input and the read cursor are stored in instance fields while
/// <see cref="Tokenize"/> runs, so a single instance must not be used to tokenize
/// two inputs at the same time. Operators are always emitted one character at a
/// time (for example <c>==</c> becomes two <c>=</c> tokens).
/// </remarks>
public class CTokenizer
{
    /// <summary>Classification assigned to each <see cref="Token"/>.</summary>
    public enum TokenType
    {
        Number,
        Identifier,
        Keyword,
        String,
        Operator,
        Comma,
        Semicolon,
        Colon,
        Period,
        Pound,
        SquareBracketOpen,
        SquareBracketClose,
        ParenthesisOpen,
        ParenthesisClose,
        CurlyBraceOpen,
        CurlyBraceClose,
        Comment,
        EndOfFile,
        Unknown
    }

    /// <summary>An immutable lexical unit: its kind, its exact source text and where it starts.</summary>
    public class Token
    {
        /// <summary>Kind of token.</summary>
        public TokenType Type { get; }

        /// <summary>Source text of the token (strings and comments keep their delimiters).</summary>
        public string Value { get; }

        /// <summary>Zero-based character index of the token's first character in the input.</summary>
        public int Position { get; }

        public Token(TokenType type, string value, int position)
        {
            Type = type;
            Value = value;
            Position = position;
        }

        public override string ToString()
        {
            return $"{Position} {Type} {Value}";
        }
    }

    // Characters emitted as single-character Operator tokens.
    private const string OperatorChars = "+-*/%=<>!&|^~?";

    private static readonly HashSet<string> Keywords = new HashSet<string>
    {
        "auto", "break", "case", "char", "const",
        "continue", "default", "do", "double", "else",
        "enum", "extern", "float", "for", "goto", "if",
        "inline", "int", "long", "register", "restrict",
        "return", "short", "signed", "sizeof", "static",
        "struct", "switch", "typedef", "union", "unsigned",
        "void", "volatile", "while",
    };

    private string _input;
    private int _position;

    public CTokenizer()
    {
        _input = string.Empty;
        _position = 0;
    }

    private char CurrentChar => _position < _input.Length ? _input[_position] : '\0';

    private bool IsEndOfFile => _position >= _input.Length;

    private void Advance()
    {
        _position++;
    }

    /// <summary>Returns the character <paramref name="offset"/> places after the cursor, or '\0' past the end.</summary>
    private char PeekAt(int offset)
    {
        int index = _position + offset;
        return index < _input.Length ? _input[index] : '\0';
    }

    /// <summary>Returns the character right after the cursor, or '\0' past the end.</summary>
    private char Peek()
    {
        return PeekAt(1);
    }

    /// <summary>
    /// Splits <paramref name="input"/> into tokens.
    /// </summary>
    /// <param name="input">C source text to scan.</param>
    /// <returns>
    /// The tokens in source order, always terminated by one
    /// <see cref="TokenType.EndOfFile"/> token whose position is the input length.
    /// Whitespace is skipped; unrecognised characters become
    /// <see cref="TokenType.Unknown"/> tokens.
    /// </returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="input"/> is null.</exception>
    public List<Token> Tokenize(string input)
    {
        if (input == null)
        {
            throw new System.ArgumentNullException(nameof(input));
        }

        _input = input;
        _position = 0;
        var tokens = new List<Token>();

        while (!IsEndOfFile)
        {
            char c = CurrentChar;
            TokenType punctuation;

            if (char.IsWhiteSpace(c))
            {
                Advance();
            }
            else if (c == '/' && Peek() == '/')
            {
                tokens.Add(TokenizeComment());
            }
            else if (c == '/' && Peek() == '*')
            {
                tokens.Add(TokenizeBlockComment());
            }
            else if (c == '-' && StartsNumber(1) && !PreviousTokenIsOperand(tokens))
            {
                // A minus is folded into the literal only when a number really
                // follows and the minus cannot be a binary operator ("x-1").
                tokens.Add(TokenizeNumber());
            }
            else if (StartsNumber(0))
            {
                tokens.Add(TokenizeNumber());
            }
            else if (char.IsLetter(c) || c == '_')
            {
                tokens.Add(TokenizeIdentifier());
            }
            else if (c == '"')
            {
                tokens.Add(TokenizeString());
            }
            else if (OperatorChars.IndexOf(c) >= 0)
            {
                tokens.Add(TokenizeSingle(TokenType.Operator));
            }
            else if (TryGetPunctuationType(c, out punctuation))
            {
                tokens.Add(TokenizeSingle(punctuation));
            }
            else
            {
                tokens.Add(TokenizeSingle(TokenType.Unknown));
            }
        }

        tokens.Add(new Token(TokenType.EndOfFile, "", _position));
        return tokens;
    }

    /// <summary>
    /// True when the text <paramref name="offset"/> characters after the cursor begins
    /// an unsigned numeric literal: a digit, or a '.' immediately followed by a digit.
    /// </summary>
    private bool StartsNumber(int offset)
    {
        char first = PeekAt(offset);
        return char.IsDigit(first) || (first == '.' && char.IsDigit(PeekAt(offset + 1)));
    }

    /// <summary>
    /// True when the most recent non-comment token can end an expression operand,
    /// which means a following '-' is a binary minus rather than a sign.
    /// </summary>
    private static bool PreviousTokenIsOperand(List<Token> tokens)
    {
        for (int i = tokens.Count - 1; i >= 0; i--)
        {
            switch (tokens[i].Type)
            {
                case TokenType.Comment:
                    continue; // comments are transparent; keep looking further back
                case TokenType.Number:
                case TokenType.Identifier:
                case TokenType.String:
                case TokenType.ParenthesisClose:
                case TokenType.SquareBracketClose:
                    return true;
                default:
                    return false;
            }
        }

        return false;
    }

    /// <summary>Maps a punctuation character to its token type; returns false for anything else.</summary>
    private static bool TryGetPunctuationType(char c, out TokenType type)
    {
        switch (c)
        {
            case '[': type = TokenType.SquareBracketOpen; return true;
            case ']': type = TokenType.SquareBracketClose; return true;
            case '(': type = TokenType.ParenthesisOpen; return true;
            case ')': type = TokenType.ParenthesisClose; return true;
            case '{': type = TokenType.CurlyBraceOpen; return true;
            case '}': type = TokenType.CurlyBraceClose; return true;
            case ',': type = TokenType.Comma; return true;
            case ';': type = TokenType.Semicolon; return true;
            case ':': type = TokenType.Colon; return true;
            case '.': type = TokenType.Period; return true;
            case '#': type = TokenType.Pound; return true;
            default: type = TokenType.Unknown; return false;
        }
    }

    /// <summary>Builds a token of <paramref name="type"/> from the text between <paramref name="start"/> and the cursor.</summary>
    private Token Slice(TokenType type, int start)
    {
        return new Token(type, _input.Substring(start, _position - start), start);
    }

    /// <summary>Consumes exactly one character and returns it as a token of <paramref name="type"/>.</summary>
    private Token TokenizeSingle(TokenType type)
    {
        int start = _position;
        Advance();
        return Slice(type, start);
    }

    /// <summary>Consumes a <c>//</c> comment up to, but not including, the line break.</summary>
    private Token TokenizeComment()
    {
        int start = _position;
        Advance(); // first '/'
        Advance(); // second '/'
        while (!IsEndOfFile && CurrentChar != '\n')
        {
            Advance();
        }

        return Slice(TokenType.Comment, start);
    }

    /// <summary>
    /// Consumes a <c>/* ... */</c> comment including both delimiters. An unterminated
    /// comment runs to the end of the input.
    /// </summary>
    private Token TokenizeBlockComment()
    {
        int start = _position;
        Advance(); // '/'
        Advance(); // '*'
        while (!IsEndOfFile && !(CurrentChar == '*' && Peek() == '/'))
        {
            Advance();
        }

        if (!IsEndOfFile)
        {
            Advance(); // '*'
            Advance(); // '/'
        }

        return Slice(TokenType.Comment, start);
    }

    /// <summary>
    /// Consumes an optional leading '-', then digits with at most one decimal point.
    /// A second '.' ends the token and is left for the next scan.
    /// </summary>
    private Token TokenizeNumber()
    {
        int start = _position;
        if (CurrentChar == '-')
        {
            Advance();
        }

        bool hasDecimal = false;
        while (!IsEndOfFile && (char.IsDigit(CurrentChar) || CurrentChar == '.'))
        {
            if (CurrentChar == '.')
            {
                if (hasDecimal)
                {
                    break; // only one decimal point per literal
                }

                hasDecimal = true;
            }

            Advance();
        }

        return Slice(TokenType.Number, start);
    }

    /// <summary>Consumes letters, digits and underscores; the result is a Keyword if it is in the keyword set.</summary>
    private Token TokenizeIdentifier()
    {
        int start = _position;
        while (!IsEndOfFile && (char.IsLetterOrDigit(CurrentChar) || CurrentChar == '_'))
        {
            Advance();
        }

        string value = _input.Substring(start, _position - start);
        TokenType type = Keywords.Contains(value) ? TokenType.Keyword : TokenType.Identifier;
        return new Token(type, value, start);
    }

    /// <summary>
    /// Consumes a double-quoted string literal, quotes included. A backslash escapes
    /// the next character, so <c>\"</c> does not end the literal. An unterminated
    /// literal runs to the end of the input instead of reading past it.
    /// </summary>
    private Token TokenizeString()
    {
        int start = _position;
        Advance(); // opening quote
        while (!IsEndOfFile && CurrentChar != '"')
        {
            if (CurrentChar == '\\' && _position + 1 < _input.Length)
            {
                Advance(); // step over the backslash; the escaped character is consumed below
            }

            Advance();
        }

        if (!IsEndOfFile)
        {
            Advance(); // closing quote (absent when the literal is unterminated)
        }

        return Slice(TokenType.String, start);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment