Add primitive tokenizer for USC

This commit is contained in:
John Lorentzson 2025-05-16 12:31:03 +02:00
parent b9bda5ad68
commit 3211f6d441
2 changed files with 163 additions and 0 deletions

View file

@ -0,0 +1,162 @@
(in-package #:user-side-compiler)
(defparameter *special-token-chars*
'(#\+ #\-
#\< #\>
#\( #\)
#\, #\.
#\= #\!)
"Characters that when encountered will finish up textual tokens and begin
the processing of a new token.")
(defparameter *single-token-chars*
'(#\, #\.
#\( #\))
"Characters that are ALWAYS single-character tokens and will end their own
reading immediately. Should be a subset of *SPECIAL-TOKEN-CHARS*.")
(defclass token ()
((%source :accessor source :initarg :source)))
;; Tokens containing user data
(defclass token-name (token)
((%text :accessor text :initarg :text)))
(defclass token-number (token)
((%value :accessor value :initarg :value)))
;; Special syntax tokens
(defclass token-end-of-statement (token) ())
(defclass token-comma (token) ())
(defclass token-plus (token) ())
(defclass token-open-paren (token) ())
(defclass token-close-paren (token) ())
(defclass token-less-than (token) ())
(defclass token-greater-than (token) ())
;;; String->class-name mappings for operator tokens
(defparameter *operator-token-classes*
`((,(string #\Newline) token-end-of-statement)
(";" token-end-of-statement)
("," token-comma)
("+" token-plus)
("(" token-open-paren)
(")" token-close-paren)
("<" token-less-than)
(">" token-greater-than)))
;;; Keyword tokens
(defclass token-keyword (token)
((%name :accessor name :initarg :name)))
(defparameter *syntax-keywords*
'("end" "for" "do" "times" "if" "then" "else"))
(defun whitespacep (char)
(declare (type character char)
(optimize (speed 3)))
(not (null (member char '(#\Space #\Tab)))))
(defun text-to-token (text &optional source)
(let ((operator-token (cadr (find text *operator-token-classes* :test #'equalp :key #'car))))
(if (not (null operator-token))
(make-instance operator-token :source source)
(cond ((every #'digit-char-p text)
(make-instance 'token-number :source source
:value (parse-integer text)))
((member text *syntax-keywords* :test #'equalp)
(make-instance 'token-keyword :source source
:name (intern (string-upcase text) (find-package '#:keyword))))
(t (make-instance 'token-name :source source :text text))))))
(defun tokenize (stream &optional source)
(let ((token-text-buffer (make-array 32 :element-type 'character :fill-pointer 0 :adjustable t))
(tokens '())
(new-token-p t)
(token-source (cons source 0)))
(labels
((next-token ()
(push (text-to-token (copy-seq token-text-buffer) token-source) tokens)
(setf (fill-pointer token-text-buffer) 0
new-token-p t
token-source (cons source (file-position stream)))))
(loop :for char := (read-char stream)
:for next := (peek-char nil stream nil :eof)
:until (eql :eof next)
:do
(cond ((and (not new-token-p) (whitespacep char))
(next-token))
((char= char #\Newline)
;; this will be different later
(unless (zerop (length token-text-buffer))
(next-token))
(vector-push #\Newline token-text-buffer)
(next-token))
((not (whitespacep char))
(vector-push char token-text-buffer)
(setf new-token-p nil)
;; Check if we should end here based on what's next
(when (member next *special-token-chars*)
(next-token))
;; Or if we *are* a special one-off character (that's a different set)
(when (member char *single-token-chars*)
(next-token))))
:finally (next-token)))
(nreverse tokens)))
;;; Jigs
(defparameter *tokens-no-space-before*
'(token-comma
token-open-paren
token-close-paren
token-end-of-statement)
"Token classes which do not allow a space before being reprinted.")
(defparameter *tokens-no-space-after*
'(token-end-of-statement
token-open-paren)
"Token classes which do not add a space after being reprinted.")
(defparameter *reprint-indent-size* 4
"The amount of spaces that an indentation level adds to its line.")
(defun reprint-from-tokens (tokens)
(let ((indent-size 4))
(loop :with indent := 0
:with line-empty-p := t
:for token :in tokens
:for tokens-left := tokens :then (cdr tokens-left)
:for next-token := (second tokens-left)
;; Pre-token style actions
:do (when (and (typep token 'token-keyword)
(member (name token) '(:end :else)))
(decf indent indent-size))
;; Printing action
:do (when line-empty-p
(loop :repeat indent :do (write-char #\Space))
;;(format t "(~D)" indent)
(setf line-empty-p nil))
:do (let ((token-string
(cond ((typep token 'token-keyword)
(format nil "~A" (name token)))
((typep token 'token-name)
(text token))
((typep token 'token-number)
(format nil "~D" (value token)))
(t (car (find (type-of token) *operator-token-classes* :key #'cadr))))))
(format t "~A~A" token-string
(if (or (typep token `(or ,@*tokens-no-space-after*))
(typep next-token `(or ,@*tokens-no-space-before*)))
""
" "))
(when (typep token 'token-end-of-statement)
(setf line-empty-p t)))
;; Post-token style actions
:do (when (and (typep token 'token-keyword)
(member (name token) '(:times :then :else)))
(incf indent indent-size)))))

View file

@ -4,6 +4,7 @@
:serial t :serial t
:components :components
((:file "package") ((:file "package")
(:file "tokenizer")
(:file "label") (:file "label")
(:file "high-level") (:file "high-level")
(:file "instruction"))) (:file "instruction")))