Better handling of operators, source, and comments in tokenizer

...among other minor changes to accomodate the parser.
This commit is contained in:
John Lorentzson 2025-05-22 12:17:56 +02:00
parent 8acbf2caec
commit 1307e31268

View file

@ -15,13 +15,16 @@ the processing of a new token.")
"Characters that are ALWAYS single-character tokens and will end their own "Characters that are ALWAYS single-character tokens and will end their own
reading immediately. Should be a subset of *SPECIAL-TOKEN-CHARS*.") reading immediately. Should be a subset of *SPECIAL-TOKEN-CHARS*.")
(defparameter *line-comment-chars*
'(#\# #\;))
(defclass token () (defclass token ()
((%source :accessor source :initarg :source :initform nil))) ((%source :accessor source :initarg :source :initform nil)))
;; Tokens containing user data ;; Tokens containing user data
(defclass token-name (token) (defclass token-name (token)
((%name :accessor name :initarg :text))) ((%name :accessor name :initarg :name)))
(defmethod print-object ((object token-name) stream) (defmethod print-object ((object token-name) stream)
(print-unreadable-object (object stream :type t :identity t) (print-unreadable-object (object stream :type t :identity t)
@ -31,6 +34,15 @@ reading immediately. Should be a subset of *SPECIAL-TOKEN-CHARS*.")
(defclass token-number (token) (defclass token-number (token)
((%value :accessor value :initarg :value))) ((%value :accessor value :initarg :value)))
(defclass token-comment (token)
((%text :accessor text :initarg :text)))
(defmethod comment-p (obj)
nil)
(defmethod comment-p ((obj token-comment))
t)
;; Special syntax tokens ;; Special syntax tokens
(defmacro define-atomic-token (name) (defmacro define-atomic-token (name)
@ -77,7 +89,7 @@ reading immediately. Should be a subset of *SPECIAL-TOKEN-CHARS*.")
;;; Keyword tokens ;;; Keyword tokens
(defclass token-keyword (token) (defclass token-keyword (token-name)
((%name :accessor name :initarg :name))) ((%name :accessor name :initarg :name)))
(defparameter *syntax-keywords* (defparameter *syntax-keywords*
@ -98,12 +110,21 @@ reading immediately. Should be a subset of *SPECIAL-TOKEN-CHARS*.")
((member text *syntax-keywords* :test #'equalp) ((member text *syntax-keywords* :test #'equalp)
(make-instance 'token-keyword :source source (make-instance 'token-keyword :source source
:name (intern (string-upcase text) (find-package '#:keyword)))) :name (intern (string-upcase text) (find-package '#:keyword))))
(t (make-instance 'token-name :source source :text text)))))) ((member (aref text 0) *line-comment-chars*)
(let ((start (position-if-not #'whitespacep (subseq text 1))))
(make-instance 'token-comment
:source source
:text (if (null start) ""
(subseq text start)))))
(t (make-instance 'token-name :source source :name text))))))
(defun tokenize (stream &optional source) (defun tokenize (stream &optional source)
(let ((token-text-buffer (make-array 32 :element-type 'character :fill-pointer 0 :adjustable t)) (let ((token-text-buffer (make-array 32 :element-type 'character :fill-pointer 0 :adjustable t))
(tokens '()) (tokens '())
(new-token-p t) (new-token-p t)
(in-comment-p nil)
(line 1)
(column 0)
(token-source (cons source 0))) (token-source (cons source 0)))
(labels (labels
((next-token () ((next-token ()
@ -113,26 +134,51 @@ reading immediately. Should be a subset of *SPECIAL-TOKEN-CHARS*.")
token-source (cons source (file-position stream))))) token-source (cons source (file-position stream)))))
(loop :for char := (read-char stream) (loop :for char := (read-char stream)
:for next := (peek-char nil stream nil :eof) :for next := (peek-char nil stream nil :eof)
:until (eql :eof next) :do (incf column)
:do :do
(cond ((and (not new-token-p) (whitespacep char)) (cond
(next-token)) ;; Break for next token at whitespace
((char= char #\Newline) ((and (not new-token-p) (whitespacep char) (not in-comment-p))
;; this will be different later (next-token))
(unless (zerop (length token-text-buffer)) ;; Process and add a newline
(next-token)) ((char= char #\Newline)
(vector-push #\Newline token-text-buffer) (setf in-comment-p nil
(next-token)) line (1+ line)
((not (whitespacep char)) column 0)
(vector-push char token-text-buffer) (unless (zerop (length token-text-buffer))
(setf new-token-p nil) (next-token))
;; Check if we should end here based on what's next (vector-push #\Newline token-text-buffer)
(when (member next *special-token-chars*) (next-token))
(next-token)) ;; Starting a comment
;; Or if we *are* a special one-off character (that's a different set) ((and (member char *line-comment-chars* :test #'char=) (not in-comment-p))
(when (member char *single-token-chars*) (unless (zerop (length token-text-buffer))
(next-token)))) (next-token))
:finally (next-token))) (vector-push char token-text-buffer)
(setf in-comment-p t))
;; Comment character (next-token'd by newline)
(in-comment-p
(vector-push char token-text-buffer))
;; Non-whitespace non-comment characters
((and (not (whitespacep char)) (not in-comment-p))
(vector-push char token-text-buffer)
(when new-token-p
(setf token-source (cons source (cons line column)))
(setf new-token-p nil))
(cond
;; End if the next is a non-alphanumeric token
((and (not (member char *special-token-chars*))
(member next *special-token-chars*))
(next-token))
;; End if the next is an alphanumeric token
((and (member char *special-token-chars*)
(not (member next *special-token-chars*)))
(next-token))
;; Or if we *are* a special one-off character (that's a different set)
((member char *single-token-chars*)
(next-token))
((eql :eof next)
(next-token)))))
:until (eql :eof next)))
(nreverse tokens))) (nreverse tokens)))
;;; Jigs ;;; Jigs