Better handling of operators, source, and comments in tokenizer

...among other minor changes to accomodate the parser.
This commit is contained in:
John Lorentzson 2025-05-22 12:17:56 +02:00
parent 8acbf2caec
commit 1307e31268

View file

@ -15,13 +15,16 @@ the processing of a new token.")
"Characters that are ALWAYS single-character tokens and will end their own
reading immediately. Should be a subset of *SPECIAL-TOKEN-CHARS*.")
(defparameter *line-comment-chars*
'(#\# #\;))
(defclass token ()
((%source :accessor source :initarg :source :initform nil)))
;; Tokens containing user data
(defclass token-name (token)
((%name :accessor name :initarg :text)))
((%name :accessor name :initarg :name)))
(defmethod print-object ((object token-name) stream)
(print-unreadable-object (object stream :type t :identity t)
@ -31,6 +34,15 @@ reading immediately. Should be a subset of *SPECIAL-TOKEN-CHARS*.")
(defclass token-number (token)
((%value :accessor value :initarg :value)))
(defclass token-comment (token)
((%text :accessor text :initarg :text)))
(defmethod comment-p (obj)
nil)
(defmethod comment-p ((obj token-comment))
t)
;; Special syntax tokens
(defmacro define-atomic-token (name)
@ -77,7 +89,7 @@ reading immediately. Should be a subset of *SPECIAL-TOKEN-CHARS*.")
;;; Keyword tokens
(defclass token-keyword (token)
(defclass token-keyword (token-name)
((%name :accessor name :initarg :name)))
(defparameter *syntax-keywords*
@ -98,12 +110,21 @@ reading immediately. Should be a subset of *SPECIAL-TOKEN-CHARS*.")
((member text *syntax-keywords* :test #'equalp)
(make-instance 'token-keyword :source source
:name (intern (string-upcase text) (find-package '#:keyword))))
(t (make-instance 'token-name :source source :text text))))))
((member (aref text 0) *line-comment-chars*)
(let ((start (position-if-not #'whitespacep (subseq text 1))))
(make-instance 'token-comment
:source source
:text (if (null start) ""
(subseq text start)))))
(t (make-instance 'token-name :source source :name text))))))
(defun tokenize (stream &optional source)
(let ((token-text-buffer (make-array 32 :element-type 'character :fill-pointer 0 :adjustable t))
(tokens '())
(new-token-p t)
(in-comment-p nil)
(line 1)
(column 0)
(token-source (cons source 0)))
(labels
((next-token ()
@ -113,26 +134,51 @@ reading immediately. Should be a subset of *SPECIAL-TOKEN-CHARS*.")
token-source (cons source (file-position stream)))))
(loop :for char := (read-char stream)
:for next := (peek-char nil stream nil :eof)
:until (eql :eof next)
:do (incf column)
:do
(cond ((and (not new-token-p) (whitespacep char))
(next-token))
((char= char #\Newline)
;; this will be different later
(unless (zerop (length token-text-buffer))
(next-token))
(vector-push #\Newline token-text-buffer)
(next-token))
((not (whitespacep char))
(vector-push char token-text-buffer)
(setf new-token-p nil)
;; Check if we should end here based on what's next
(when (member next *special-token-chars*)
(next-token))
;; Or if we *are* a special one-off character (that's a different set)
(when (member char *single-token-chars*)
(next-token))))
:finally (next-token)))
(cond
;; Break for next token at whitespace
((and (not new-token-p) (whitespacep char) (not in-comment-p))
(next-token))
;; Process and add a newline
((char= char #\Newline)
(setf in-comment-p nil
line (1+ line)
column 0)
(unless (zerop (length token-text-buffer))
(next-token))
(vector-push #\Newline token-text-buffer)
(next-token))
;; Starting a comment
((and (member char *line-comment-chars* :test #'char=) (not in-comment-p))
(unless (zerop (length token-text-buffer))
(next-token))
(vector-push char token-text-buffer)
(setf in-comment-p t))
;; Comment character (next-token'd by newline)
(in-comment-p
(vector-push char token-text-buffer))
;; Non-whitespace non-comment characters
((and (not (whitespacep char)) (not in-comment-p))
(vector-push char token-text-buffer)
(when new-token-p
(setf token-source (cons source (cons line column)))
(setf new-token-p nil))
(cond
;; End if the next is a non-alphanumeric token
((and (not (member char *special-token-chars*))
(member next *special-token-chars*))
(next-token))
;; End if the next is an alphanumeric token
((and (member char *special-token-chars*)
(not (member next *special-token-chars*)))
(next-token))
;; Or if we *are* a special one-off character (that's a different set)
((member char *single-token-chars*)
(next-token))
((eql :eof next)
(next-token)))))
:until (eql :eof next)))
(nreverse tokens)))
;;; Jigs