Better handling of operators, source, and comments in tokenizer
...among other minor changes to accomodate the parser.
This commit is contained in:
parent
8acbf2caec
commit
1307e31268
1 changed files with 68 additions and 22 deletions
|
@ -15,13 +15,16 @@ the processing of a new token.")
|
|||
"Characters that are ALWAYS single-character tokens and will end their own
|
||||
reading immediately. Should be a subset of *SPECIAL-TOKEN-CHARS*.")
|
||||
|
||||
(defparameter *line-comment-chars*
|
||||
'(#\# #\;))
|
||||
|
||||
(defclass token ()
|
||||
((%source :accessor source :initarg :source :initform nil)))
|
||||
|
||||
;; Tokens containing user data
|
||||
|
||||
(defclass token-name (token)
|
||||
((%name :accessor name :initarg :text)))
|
||||
((%name :accessor name :initarg :name)))
|
||||
|
||||
(defmethod print-object ((object token-name) stream)
|
||||
(print-unreadable-object (object stream :type t :identity t)
|
||||
|
@ -31,6 +34,15 @@ reading immediately. Should be a subset of *SPECIAL-TOKEN-CHARS*.")
|
|||
(defclass token-number (token)
|
||||
((%value :accessor value :initarg :value)))
|
||||
|
||||
(defclass token-comment (token)
|
||||
((%text :accessor text :initarg :text)))
|
||||
|
||||
(defmethod comment-p (obj)
|
||||
nil)
|
||||
|
||||
(defmethod comment-p ((obj token-comment))
|
||||
t)
|
||||
|
||||
;; Special syntax tokens
|
||||
|
||||
(defmacro define-atomic-token (name)
|
||||
|
@ -77,7 +89,7 @@ reading immediately. Should be a subset of *SPECIAL-TOKEN-CHARS*.")
|
|||
|
||||
;;; Keyword tokens
|
||||
|
||||
(defclass token-keyword (token)
|
||||
(defclass token-keyword (token-name)
|
||||
((%name :accessor name :initarg :name)))
|
||||
|
||||
(defparameter *syntax-keywords*
|
||||
|
@ -98,12 +110,21 @@ reading immediately. Should be a subset of *SPECIAL-TOKEN-CHARS*.")
|
|||
((member text *syntax-keywords* :test #'equalp)
|
||||
(make-instance 'token-keyword :source source
|
||||
:name (intern (string-upcase text) (find-package '#:keyword))))
|
||||
(t (make-instance 'token-name :source source :text text))))))
|
||||
((member (aref text 0) *line-comment-chars*)
|
||||
(let ((start (position-if-not #'whitespacep (subseq text 1))))
|
||||
(make-instance 'token-comment
|
||||
:source source
|
||||
:text (if (null start) ""
|
||||
(subseq text start)))))
|
||||
(t (make-instance 'token-name :source source :name text))))))
|
||||
|
||||
(defun tokenize (stream &optional source)
|
||||
(let ((token-text-buffer (make-array 32 :element-type 'character :fill-pointer 0 :adjustable t))
|
||||
(tokens '())
|
||||
(new-token-p t)
|
||||
(in-comment-p nil)
|
||||
(line 1)
|
||||
(column 0)
|
||||
(token-source (cons source 0)))
|
||||
(labels
|
||||
((next-token ()
|
||||
|
@ -113,26 +134,51 @@ reading immediately. Should be a subset of *SPECIAL-TOKEN-CHARS*.")
|
|||
token-source (cons source (file-position stream)))))
|
||||
(loop :for char := (read-char stream)
|
||||
:for next := (peek-char nil stream nil :eof)
|
||||
:until (eql :eof next)
|
||||
:do (incf column)
|
||||
:do
|
||||
(cond ((and (not new-token-p) (whitespacep char))
|
||||
(next-token))
|
||||
((char= char #\Newline)
|
||||
;; this will be different later
|
||||
(unless (zerop (length token-text-buffer))
|
||||
(next-token))
|
||||
(vector-push #\Newline token-text-buffer)
|
||||
(next-token))
|
||||
((not (whitespacep char))
|
||||
(vector-push char token-text-buffer)
|
||||
(setf new-token-p nil)
|
||||
;; Check if we should end here based on what's next
|
||||
(when (member next *special-token-chars*)
|
||||
(next-token))
|
||||
;; Or if we *are* a special one-off character (that's a different set)
|
||||
(when (member char *single-token-chars*)
|
||||
(next-token))))
|
||||
:finally (next-token)))
|
||||
(cond
|
||||
;; Break for next token at whitespace
|
||||
((and (not new-token-p) (whitespacep char) (not in-comment-p))
|
||||
(next-token))
|
||||
;; Process and add a newline
|
||||
((char= char #\Newline)
|
||||
(setf in-comment-p nil
|
||||
line (1+ line)
|
||||
column 0)
|
||||
(unless (zerop (length token-text-buffer))
|
||||
(next-token))
|
||||
(vector-push #\Newline token-text-buffer)
|
||||
(next-token))
|
||||
;; Starting a comment
|
||||
((and (member char *line-comment-chars* :test #'char=) (not in-comment-p))
|
||||
(unless (zerop (length token-text-buffer))
|
||||
(next-token))
|
||||
(vector-push char token-text-buffer)
|
||||
(setf in-comment-p t))
|
||||
;; Comment character (next-token'd by newline)
|
||||
(in-comment-p
|
||||
(vector-push char token-text-buffer))
|
||||
;; Non-whitespace non-comment characters
|
||||
((and (not (whitespacep char)) (not in-comment-p))
|
||||
(vector-push char token-text-buffer)
|
||||
(when new-token-p
|
||||
(setf token-source (cons source (cons line column)))
|
||||
(setf new-token-p nil))
|
||||
(cond
|
||||
;; End if the next is a non-alphanumeric token
|
||||
((and (not (member char *special-token-chars*))
|
||||
(member next *special-token-chars*))
|
||||
(next-token))
|
||||
;; End if the next is an alphanumeric token
|
||||
((and (member char *special-token-chars*)
|
||||
(not (member next *special-token-chars*)))
|
||||
(next-token))
|
||||
;; Or if we *are* a special one-off character (that's a different set)
|
||||
((member char *single-token-chars*)
|
||||
(next-token))
|
||||
((eql :eof next)
|
||||
(next-token)))))
|
||||
:until (eql :eof next)))
|
||||
(nreverse tokens)))
|
||||
|
||||
;;; Jigs
|
||||
|
|
Loading…
Add table
Reference in a new issue