From 1307e3126867a05bc99d71e2bc2d25110c62f6a3 Mon Sep 17 00:00:00 2001 From: John Lorentzson Date: Thu, 22 May 2025 12:17:56 +0200 Subject: [PATCH] Better handling of operators, source, and comments in tokenizer ...among other minor changes to accomodate the parser. --- wip-duuqnd/user-side-compiler/tokenizer.lisp | 90 +++++++++++++++----- 1 file changed, 68 insertions(+), 22 deletions(-) diff --git a/wip-duuqnd/user-side-compiler/tokenizer.lisp b/wip-duuqnd/user-side-compiler/tokenizer.lisp index 3bce133..8eddf3f 100644 --- a/wip-duuqnd/user-side-compiler/tokenizer.lisp +++ b/wip-duuqnd/user-side-compiler/tokenizer.lisp @@ -15,13 +15,16 @@ the processing of a new token.") "Characters that are ALWAYS single-character tokens and will end their own reading immediately. Should be a subset of *SPECIAL-TOKEN-CHARS*.") +(defparameter *line-comment-chars* + '(#\# #\;)) + (defclass token () ((%source :accessor source :initarg :source :initform nil))) ;; Tokens containing user data (defclass token-name (token) - ((%name :accessor name :initarg :text))) + ((%name :accessor name :initarg :name))) (defmethod print-object ((object token-name) stream) (print-unreadable-object (object stream :type t :identity t) @@ -31,6 +34,15 @@ reading immediately. Should be a subset of *SPECIAL-TOKEN-CHARS*.") (defclass token-number (token) ((%value :accessor value :initarg :value))) +(defclass token-comment (token) + ((%text :accessor text :initarg :text))) + +(defmethod comment-p (obj) + nil) + +(defmethod comment-p ((obj token-comment)) + t) + ;; Special syntax tokens (defmacro define-atomic-token (name) @@ -77,7 +89,7 @@ reading immediately. Should be a subset of *SPECIAL-TOKEN-CHARS*.") ;;; Keyword tokens -(defclass token-keyword (token) +(defclass token-keyword (token-name) ((%name :accessor name :initarg :name))) (defparameter *syntax-keywords* @@ -98,12 +110,21 @@ reading immediately. Should be a subset of *SPECIAL-TOKEN-CHARS*.") ((member text *syntax-keywords* :test #'equalp) (make-instance 'token-keyword :source source :name (intern (string-upcase text) (find-package '#:keyword)))) - (t (make-instance 'token-name :source source :text text)))))) + ((member (aref text 0) *line-comment-chars*) + (let ((start (position-if-not #'whitespacep (subseq text 1)))) + (make-instance 'token-comment + :source source + :text (if (null start) "" + (subseq text start))))) + (t (make-instance 'token-name :source source :name text)))))) (defun tokenize (stream &optional source) (let ((token-text-buffer (make-array 32 :element-type 'character :fill-pointer 0 :adjustable t)) (tokens '()) (new-token-p t) + (in-comment-p nil) + (line 1) + (column 0) (token-source (cons source 0))) (labels ((next-token () @@ -113,26 +134,51 @@ reading immediately. Should be a subset of *SPECIAL-TOKEN-CHARS*.") token-source (cons source (file-position stream))))) (loop :for char := (read-char stream) :for next := (peek-char nil stream nil :eof) - :until (eql :eof next) + :do (incf column) :do - (cond ((and (not new-token-p) (whitespacep char)) - (next-token)) - ((char= char #\Newline) - ;; this will be different later - (unless (zerop (length token-text-buffer)) - (next-token)) - (vector-push #\Newline token-text-buffer) - (next-token)) - ((not (whitespacep char)) - (vector-push char token-text-buffer) - (setf new-token-p nil) - ;; Check if we should end here based on what's next - (when (member next *special-token-chars*) - (next-token)) - ;; Or if we *are* a special one-off character (that's a different set) - (when (member char *single-token-chars*) - (next-token)))) - :finally (next-token))) + (cond + ;; Break for next token at whitespace + ((and (not new-token-p) (whitespacep char) (not in-comment-p)) + (next-token)) + ;; Process and add a newline + ((char= char #\Newline) + (setf in-comment-p nil + line (1+ line) + column 0) + (unless (zerop (length token-text-buffer)) + (next-token)) + (vector-push #\Newline token-text-buffer) + (next-token)) + ;; Starting a comment + ((and (member char *line-comment-chars* :test #'char=) (not in-comment-p)) + (unless (zerop (length token-text-buffer)) + (next-token)) + (vector-push char token-text-buffer) + (setf in-comment-p t)) + ;; Comment character (next-token'd by newline) + (in-comment-p + (vector-push char token-text-buffer)) + ;; Non-whitespace non-comment characters + ((and (not (whitespacep char)) (not in-comment-p)) + (vector-push char token-text-buffer) + (when new-token-p + (setf token-source (cons source (cons line column))) + (setf new-token-p nil)) + (cond + ;; End if the next is a non-alphanumeric token + ((and (not (member char *special-token-chars*)) + (member next *special-token-chars*)) + (next-token)) + ;; End if the next is an alphanumeric token + ((and (member char *special-token-chars*) + (not (member next *special-token-chars*))) + (next-token)) + ;; Or if we *are* a special one-off character (that's a different set) + ((member char *single-token-chars*) + (next-token)) + ((eql :eof next) + (next-token))))) + :until (eql :eof next))) (nreverse tokens))) ;;; Jigs