//! Lexical analyzer (tokenizer) for Cypher query language
//!
//! Hand-rolled lexer for WASM compatibility - no external dependencies.
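//!
//! # Example
//!
//! A minimal usage sketch; the `cypher_lexer` import path is illustrative
//! and depends on where this module sits in the crate:
//!
//! ```ignore
//! use cypher_lexer::{tokenize, TokenKind};
//!
//! let tokens = tokenize("MATCH (n) RETURN n").unwrap();
//! assert_eq!(tokens[0].kind, TokenKind::Match);
//! ```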

use serde::{Deserialize, Serialize};
use std::fmt;
use std::iter::Peekable;
use std::str::Chars;
/// Token with kind and location information
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Token {
    pub kind: TokenKind,
    pub lexeme: String,
    pub position: Position,
}

/// Source position for error reporting
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct Position {
    pub line: usize,
    pub column: usize,
    pub offset: usize,
}

/// Token kinds
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum TokenKind {
    // Keywords
    Match,
    OptionalMatch,
    Where,
    Return,
    Create,
    Merge,
    Delete,
    DetachDelete,
    Set,
    Remove,
    With,
    OrderBy,
    Limit,
    Skip,
    Distinct,
    As,
    Asc,
    Desc,
    Case,
    When,
    Then,
    Else,
    End,
    And,
    Or,
    Xor,
    Not,
    In,
    Is,
    Null,
    True,
    False,
    OnCreate,
    OnMatch,

    // Identifiers and literals
    Identifier(String),
    Integer(i64),
    Float(f64),
    String(String),

    // Operators
    Plus,
    Minus,
    Star,
    Slash,
    Percent,
    Caret,
    Equal,
    NotEqual,
    LessThan,
    LessThanOrEqual,
    GreaterThan,
    GreaterThanOrEqual,
    Arrow,     // ->
    LeftArrow, // <-
    Dash,      // -

    // Delimiters
    LeftParen,
    RightParen,
    LeftBracket,
    RightBracket,
    LeftBrace,
    RightBrace,
    Comma,
    Dot,
    Colon,
    Semicolon,
    Pipe,

    // Special
    DotDot, // ..
    Eof,
}

impl fmt::Display for TokenKind {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            TokenKind::Identifier(s) => write!(f, "identifier '{}'", s),
            TokenKind::Integer(n) => write!(f, "integer {}", n),
            TokenKind::Float(n) => write!(f, "float {}", n),
            TokenKind::String(s) => write!(f, "string \"{}\"", s),
            _ => write!(f, "{:?}", self),
        }
    }
}

/// Lexer error
#[derive(Debug, Clone)]
pub struct LexerError {
    pub message: String,
    pub position: Position,
}

impl fmt::Display for LexerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "Lexer error at {}:{}: {}",
            self.position.line, self.position.column, self.message
        )
    }
}

impl std::error::Error for LexerError {}

/// Hand-rolled Cypher lexer
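///
/// Tokens can also be pulled one at a time rather than via [`tokenize`];
/// a minimal streaming sketch (error handling elided):
///
/// ```ignore
/// let mut lexer = Lexer::new("MATCH (n) RETURN n");
/// loop {
///     let token = lexer.next_token().unwrap();
///     let done = token.kind == TokenKind::Eof;
///     // ... hand `token` to the parser here ...
///     if done {
///         break;
///     }
/// }
/// ```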
pub struct Lexer<'a> {
    input: &'a str,
    chars: Peekable<Chars<'a>>,
    position: Position,
    current_offset: usize,
}

impl<'a> Lexer<'a> {
    pub fn new(input: &'a str) -> Self {
        Self {
            input,
            chars: input.chars().peekable(),
            position: Position {
                line: 1,
                column: 1,
                offset: 0,
            },
            current_offset: 0,
        }
    }

    fn peek(&mut self) -> Option<char> {
        self.chars.peek().copied()
    }

    /// Consume one character, keeping `position` and `current_offset` in sync.
    fn advance(&mut self) -> Option<char> {
        let ch = self.chars.next()?;
        self.current_offset += ch.len_utf8();
        if ch == '\n' {
            self.position.line += 1;
            self.position.column = 1;
        } else {
            self.position.column += 1;
        }
        self.position.offset = self.current_offset;
        Some(ch)
    }

    fn skip_whitespace(&mut self) {
        while let Some(ch) = self.peek() {
            if ch.is_whitespace() {
                self.advance();
            } else if ch == '/' && self.lookahead(1) == Some('/') {
                // Skip line comments
                while let Some(c) = self.peek() {
                    if c == '\n' {
                        break;
                    }
                    self.advance();
                }
            } else {
                break;
            }
        }
    }

    /// Peek `n` characters ahead without consuming; `lookahead(0)` == `peek()`.
    fn lookahead(&self, n: usize) -> Option<char> {
        self.input[self.current_offset..].chars().nth(n)
    }

    fn make_token(&self, kind: TokenKind, lexeme: &str, start_pos: Position) -> Token {
        Token {
            kind,
            lexeme: lexeme.to_string(),
            position: start_pos,
        }
    }

    fn scan_string(&mut self, quote: char) -> Result<Token, LexerError> {
        let start = self.position;
        self.advance(); // consume opening quote
        let mut value = String::new();

        while let Some(ch) = self.peek() {
            if ch == quote {
                self.advance(); // consume closing quote
                // Note: the token's lexeme holds the unescaped value,
                // not the raw source slice.
                return Ok(self.make_token(TokenKind::String(value.clone()), &value, start));
            } else if ch == '\\' {
                self.advance();
                match self.peek() {
                    Some('n') => {
                        value.push('\n');
                        self.advance();
                    }
                    Some('t') => {
                        value.push('\t');
                        self.advance();
                    }
                    Some('r') => {
                        value.push('\r');
                        self.advance();
                    }
                    Some('\\') => {
                        value.push('\\');
                        self.advance();
                    }
                    Some(c) if c == quote => {
                        value.push(c);
                        self.advance();
                    }
                    // Unknown escape: keep the backslash and let the next
                    // character be processed normally.
                    _ => value.push('\\'),
                }
            } else {
                value.push(ch);
                self.advance();
            }
        }

        Err(LexerError {
            message: "Unterminated string".to_string(),
            position: start,
        })
    }

    fn scan_number(&mut self) -> Token {
        let start = self.position;
        let start_offset = self.current_offset;
        let mut is_float = false;

        while let Some(ch) = self.peek() {
            if ch.is_ascii_digit() {
                self.advance();
            } else {
                break;
            }
        }

        // Fractional part: only when '.' is followed by a digit, so range
        // syntax like `1..3` still lexes as Integer, DotDot, Integer.
        if self.peek() == Some('.')
            && self
                .lookahead(1)
                .map(|c| c.is_ascii_digit())
                .unwrap_or(false)
        {
            is_float = true;
            self.advance(); // consume '.'
            while let Some(ch) = self.peek() {
                if ch.is_ascii_digit() {
                    self.advance();
                } else {
                    break;
                }
            }
        }

        // Exponent part, checked after the fraction so `3.14e2` lexes as a
        // single float. Only treat 'e'/'E' as an exponent marker when a
        // digit (optionally signed) follows, so `2eggs` stays Integer(2)
        // followed by an identifier.
        if matches!(self.peek(), Some('e') | Some('E')) {
            let sign_len = if matches!(self.lookahead(1), Some('+') | Some('-')) {
                1
            } else {
                0
            };
            if self
                .lookahead(1 + sign_len)
                .map(|c| c.is_ascii_digit())
                .unwrap_or(false)
            {
                is_float = true;
                self.advance(); // consume 'e'/'E'
                if matches!(self.peek(), Some('+') | Some('-')) {
                    self.advance();
                }
                while let Some(ch) = self.peek() {
                    if ch.is_ascii_digit() {
                        self.advance();
                    } else {
                        break;
                    }
                }
            }
        }

        let lexeme = &self.input[start_offset..self.current_offset];
        if is_float {
            let value: f64 = lexeme.parse().unwrap_or(0.0);
            self.make_token(TokenKind::Float(value), lexeme, start)
        } else {
            let value: i64 = lexeme.parse().unwrap_or(0);
            self.make_token(TokenKind::Integer(value), lexeme, start)
        }
    }

    fn scan_identifier(&mut self) -> Token {
        let start = self.position;
        let start_offset = self.current_offset;

        // The dispatcher guarantees the first character is a letter, '_',
        // or '$' (parameter sigil), so consume it unconditionally; a bare
        // '$' would otherwise never be consumed and the lexer would stall.
        self.advance();
        while let Some(ch) = self.peek() {
            if ch.is_ascii_alphanumeric() || ch == '_' {
                self.advance();
            } else {
                break;
            }
        }

        let lexeme = &self.input[start_offset..self.current_offset];
        let kind = match lexeme.to_uppercase().as_str() {
            "MATCH" => TokenKind::Match,
            "OPTIONAL" if self.peek_keyword("MATCH") => {
                self.skip_whitespace();
                self.scan_keyword("MATCH");
                TokenKind::OptionalMatch
            }
            "WHERE" => TokenKind::Where,
            "RETURN" => TokenKind::Return,
            "CREATE" => TokenKind::Create,
            "MERGE" => TokenKind::Merge,
            "DELETE" => TokenKind::Delete,
            "DETACH" if self.peek_keyword("DELETE") => {
                self.skip_whitespace();
                self.scan_keyword("DELETE");
                TokenKind::DetachDelete
            }
            "SET" => TokenKind::Set,
            "REMOVE" => TokenKind::Remove,
            "WITH" => TokenKind::With,
            "ORDER" if self.peek_keyword("BY") => {
                self.skip_whitespace();
                self.scan_keyword("BY");
                TokenKind::OrderBy
            }
            "LIMIT" => TokenKind::Limit,
            "SKIP" => TokenKind::Skip,
            "DISTINCT" => TokenKind::Distinct,
            "AS" => TokenKind::As,
            "ASC" => TokenKind::Asc,
            "DESC" => TokenKind::Desc,
            "CASE" => TokenKind::Case,
            "WHEN" => TokenKind::When,
            "THEN" => TokenKind::Then,
            "ELSE" => TokenKind::Else,
            "END" => TokenKind::End,
            "AND" => TokenKind::And,
            "OR" => TokenKind::Or,
            "XOR" => TokenKind::Xor,
            "NOT" => TokenKind::Not,
            "IN" => TokenKind::In,
            "IS" => TokenKind::Is,
            "NULL" => TokenKind::Null,
            "TRUE" => TokenKind::True,
            "FALSE" => TokenKind::False,
            "ON" if self.peek_keyword("CREATE") => {
                self.skip_whitespace();
                self.scan_keyword("CREATE");
                TokenKind::OnCreate
            }
            "ON" if self.peek_keyword("MATCH") => {
                self.skip_whitespace();
                self.scan_keyword("MATCH");
                TokenKind::OnMatch
            }
            _ => TokenKind::Identifier(lexeme.to_string()),
        };

        self.make_token(kind, lexeme, start)
    }

    /// Check whether the next word (after whitespace) is `keyword`; on a
    /// match the intervening whitespace stays consumed so the caller can
    /// follow up with `scan_keyword`, otherwise all state is restored.
    fn peek_keyword(&mut self, keyword: &str) -> bool {
        let saved_offset = self.current_offset;
        let saved_position = self.position;
        self.skip_whitespace();
        let remaining = &self.input[self.current_offset..];
        let matches = remaining.to_uppercase().starts_with(keyword)
            && remaining
                .chars()
                .nth(keyword.len())
                .map(|c| !c.is_ascii_alphanumeric() && c != '_')
                .unwrap_or(true);
        // Fully restore lexer state (including line/column, which the
        // whitespace skip above may have advanced) if the keyword does
        // not follow.
        if !matches {
            self.current_offset = saved_offset;
            self.position = saved_position;
            self.chars = self.input[saved_offset..].chars().peekable();
        }
        matches
    }

    /// Consume `keyword.len()` characters (keywords are ASCII, so bytes and
    /// chars coincide). Call only after `peek_keyword` returned true.
    fn scan_keyword(&mut self, keyword: &str) {
        for _ in 0..keyword.len() {
            self.advance();
        }
    }

    pub fn next_token(&mut self) -> Result<Token, LexerError> {
        self.skip_whitespace();

        let start = self.position;

        match self.peek() {
            None => Ok(self.make_token(TokenKind::Eof, "", start)),
            Some(ch) => {
                match ch {
                    // Strings
                    '"' | '\'' => self.scan_string(ch),

                    // Numbers
                    '0'..='9' => Ok(self.scan_number()),

                    // Identifiers
                    'a'..='z' | 'A'..='Z' | '_' | '$' => Ok(self.scan_identifier()),

                    // Backtick-quoted identifiers
                    '`' => {
                        self.advance();
                        let id_start = self.current_offset;
                        while let Some(c) = self.peek() {
                            if c == '`' {
                                break;
                            }
                            self.advance();
                        }
                        if self.peek().is_none() {
                            // Ran off the end without a closing backtick
                            return Err(LexerError {
                                message: "Unterminated backtick identifier".to_string(),
                                position: start,
                            });
                        }
                        let id = self.input[id_start..self.current_offset].to_string();
                        self.advance(); // consume closing backtick
                        Ok(self.make_token(TokenKind::Identifier(id.clone()), &id, start))
                    }

                    // Two-character operators
                    '<' => {
                        self.advance();
                        match self.peek() {
                            Some('=') => {
                                self.advance();
                                Ok(self.make_token(TokenKind::LessThanOrEqual, "<=", start))
                            }
                            Some('>') => {
                                self.advance();
                                Ok(self.make_token(TokenKind::NotEqual, "<>", start))
                            }
                            Some('-') => {
                                self.advance();
                                Ok(self.make_token(TokenKind::LeftArrow, "<-", start))
                            }
                            _ => Ok(self.make_token(TokenKind::LessThan, "<", start)),
                        }
                    }
                    '>' => {
                        self.advance();
                        if self.peek() == Some('=') {
                            self.advance();
                            Ok(self.make_token(TokenKind::GreaterThanOrEqual, ">=", start))
                        } else {
                            Ok(self.make_token(TokenKind::GreaterThan, ">", start))
                        }
                    }
                    '-' => {
                        self.advance();
                        if self.peek() == Some('>') {
                            self.advance();
                            Ok(self.make_token(TokenKind::Arrow, "->", start))
                        } else {
                            Ok(self.make_token(TokenKind::Dash, "-", start))
                        }
                    }
                    '.' => {
                        self.advance();
                        if self.peek() == Some('.') {
                            self.advance();
                            Ok(self.make_token(TokenKind::DotDot, "..", start))
                        } else {
                            Ok(self.make_token(TokenKind::Dot, ".", start))
                        }
                    }
                    '=' => {
                        self.advance();
                        Ok(self.make_token(TokenKind::Equal, "=", start))
                    }

                    // Single-character tokens
                    '(' => {
                        self.advance();
                        Ok(self.make_token(TokenKind::LeftParen, "(", start))
                    }
                    ')' => {
                        self.advance();
                        Ok(self.make_token(TokenKind::RightParen, ")", start))
                    }
                    '[' => {
                        self.advance();
                        Ok(self.make_token(TokenKind::LeftBracket, "[", start))
                    }
                    ']' => {
                        self.advance();
                        Ok(self.make_token(TokenKind::RightBracket, "]", start))
                    }
                    '{' => {
                        self.advance();
                        Ok(self.make_token(TokenKind::LeftBrace, "{", start))
                    }
                    '}' => {
                        self.advance();
                        Ok(self.make_token(TokenKind::RightBrace, "}", start))
                    }
                    ',' => {
                        self.advance();
                        Ok(self.make_token(TokenKind::Comma, ",", start))
                    }
                    ':' => {
                        self.advance();
                        Ok(self.make_token(TokenKind::Colon, ":", start))
                    }
                    ';' => {
                        self.advance();
                        Ok(self.make_token(TokenKind::Semicolon, ";", start))
                    }
                    '|' => {
                        self.advance();
                        Ok(self.make_token(TokenKind::Pipe, "|", start))
                    }
                    '+' => {
                        self.advance();
                        Ok(self.make_token(TokenKind::Plus, "+", start))
                    }
                    '*' => {
                        self.advance();
                        Ok(self.make_token(TokenKind::Star, "*", start))
                    }
                    '/' => {
                        self.advance();
                        Ok(self.make_token(TokenKind::Slash, "/", start))
                    }
                    '%' => {
                        self.advance();
                        Ok(self.make_token(TokenKind::Percent, "%", start))
                    }
                    '^' => {
                        self.advance();
                        Ok(self.make_token(TokenKind::Caret, "^", start))
                    }

                    _ => Err(LexerError {
                        message: format!("Unexpected character: '{}'", ch),
                        position: start,
                    }),
                }
            }
        }
    }
}

/// Tokenize a Cypher query string into a token vector ending with `Eof`.
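///
/// A short sketch of the resulting token stream:
///
/// ```ignore
/// let tokens = tokenize("RETURN 1").unwrap();
/// assert_eq!(tokens[0].kind, TokenKind::Return);
/// assert_eq!(tokens[1].kind, TokenKind::Integer(1));
/// assert_eq!(tokens[2].kind, TokenKind::Eof);
/// ```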
pub fn tokenize(input: &str) -> Result<Vec<Token>, LexerError> {
    let mut lexer = Lexer::new(input);
    let mut tokens = Vec::new();

    loop {
        let token = lexer.next_token()?;
        let is_eof = token.kind == TokenKind::Eof;
        tokens.push(token);
        if is_eof {
            break;
        }
    }

    Ok(tokens)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simple_tokens() {
        let tokens = tokenize("MATCH (n) RETURN n").unwrap();
        assert_eq!(tokens[0].kind, TokenKind::Match);
        assert_eq!(tokens[1].kind, TokenKind::LeftParen);
    }
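
    // Added coverage sketch: two-word keywords ("OPTIONAL MATCH", "ORDER BY",
    // "DETACH DELETE", "ON CREATE", "ON MATCH") collapse into single tokens;
    // the lexer does not care whether the query is grammatically valid.
    #[test]
    fn test_multiword_keywords() {
        let tokens =
            tokenize("OPTIONAL MATCH ORDER BY DETACH DELETE ON CREATE ON MATCH").unwrap();
        let kinds: Vec<_> = tokens.iter().map(|t| t.kind.clone()).collect();
        assert_eq!(
            kinds,
            vec![
                TokenKind::OptionalMatch,
                TokenKind::OrderBy,
                TokenKind::DetachDelete,
                TokenKind::OnCreate,
                TokenKind::OnMatch,
                TokenKind::Eof,
            ]
        );
    }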

    #[test]
    fn test_string() {
        let tokens = tokenize("'hello world'").unwrap();
        assert_eq!(tokens[0].kind, TokenKind::String("hello world".to_string()));
    }
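
    // Added coverage sketch: escape sequences are decoded into the string
    // value, and a missing closing quote surfaces as a LexerError.
    #[test]
    fn test_string_escapes() {
        let tokens = tokenize(r#""line\nbreak \"quoted\"""#).unwrap();
        assert_eq!(
            tokens[0].kind,
            TokenKind::String("line\nbreak \"quoted\"".to_string())
        );
    }

    #[test]
    fn test_unterminated_string() {
        assert!(tokenize("'oops").is_err());
    }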

    #[test]
    fn test_number() {
        let tokens = tokenize("42 3.14").unwrap();
        assert_eq!(tokens[0].kind, TokenKind::Integer(42));
        assert_eq!(tokens[1].kind, TokenKind::Float(3.14));
    }
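
    // Added coverage sketch: exponent forms lex as floats (including after a
    // fractional part), while `..` after an integer is left for DotDot.
    #[test]
    fn test_scientific_notation() {
        let tokens = tokenize("1e3 2.5e-2").unwrap();
        assert_eq!(tokens[0].kind, TokenKind::Float(1e3));
        assert_eq!(tokens[1].kind, TokenKind::Float(2.5e-2));
    }

    #[test]
    fn test_integer_range() {
        let tokens = tokenize("1..3").unwrap();
        assert_eq!(tokens[0].kind, TokenKind::Integer(1));
        assert_eq!(tokens[1].kind, TokenKind::DotDot);
        assert_eq!(tokens[2].kind, TokenKind::Integer(3));
    }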

    #[test]
    fn test_relationship() {
        let tokens = tokenize("(a)-[:KNOWS]->(b)").unwrap();
        assert!(tokens.iter().any(|t| t.kind == TokenKind::Arrow));
    }
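
    // Added coverage sketch: backtick quoting admits spaces in identifiers,
    // and `//` line comments are skipped entirely.
    #[test]
    fn test_backtick_identifier() {
        let tokens = tokenize("`my var`").unwrap();
        assert_eq!(tokens[0].kind, TokenKind::Identifier("my var".to_string()));
    }

    #[test]
    fn test_line_comment() {
        let tokens = tokenize("RETURN 1 // trailing comment\n").unwrap();
        assert_eq!(tokens.len(), 3); // Return, Integer, Eof
    }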
}