diff --git a/wip-duuqnd/user-side-compiler/tokenizer.lisp b/wip-duuqnd/user-side-compiler/tokenizer.lisp new file mode 100644 index 0000000..21c6a03 --- /dev/null +++ b/wip-duuqnd/user-side-compiler/tokenizer.lisp @@ -0,0 +1,162 @@ +(in-package #:user-side-compiler) + +(defparameter *special-token-chars* + '(#\+ #\- + #\< #\> + #\( #\) + #\, #\. + #\= #\!) + "Characters that when encountered will finish up textual tokens and begin +the processing of a new token.") + +(defparameter *single-token-chars* + '(#\, #\. + #\( #\)) + "Characters that are ALWAYS single-character tokens and will end their own +reading immediately. Should be a subset of *SPECIAL-TOKEN-CHARS*.") + +(defclass token () + ((%source :accessor source :initarg :source))) + +;; Tokens containing user data + +(defclass token-name (token) + ((%text :accessor text :initarg :text))) + +(defclass token-number (token) + ((%value :accessor value :initarg :value))) + +;; Special syntax tokens + +(defclass token-end-of-statement (token) ()) +(defclass token-comma (token) ()) +(defclass token-plus (token) ()) +(defclass token-open-paren (token) ()) +(defclass token-close-paren (token) ()) +(defclass token-less-than (token) ()) +(defclass token-greater-than (token) ()) + +;;; String->class-name mappings for operator tokens + +(defparameter *operator-token-classes* + `((,(string #\Newline) token-end-of-statement) + (";" token-end-of-statement) + ("," token-comma) + ("+" token-plus) + ("(" token-open-paren) + (")" token-close-paren) + ("<" token-less-than) + (">" token-greater-than))) + +;;; Keyword tokens + +(defclass token-keyword (token) + ((%name :accessor name :initarg :name))) + +(defparameter *syntax-keywords* + '("end" "for" "do" "times" "if" "then" "else")) + +(defun whitespacep (char) + (declare (type character char) + (optimize (speed 3))) + (not (null (member char '(#\Space #\Tab))))) + +(defun text-to-token (text &optional source) + (let ((operator-token (cadr (find text *operator-token-classes* :test #'equalp :key #'car)))) + (if (not (null operator-token)) + (make-instance operator-token :source source) + (cond ((every #'digit-char-p text) + (make-instance 'token-number :source source + :value (parse-integer text))) + ((member text *syntax-keywords* :test #'equalp) + (make-instance 'token-keyword :source source + :name (intern (string-upcase text) (find-package '#:keyword)))) + (t (make-instance 'token-name :source source :text text)))))) + +(defun tokenize (stream &optional source) + (let ((token-text-buffer (make-array 32 :element-type 'character :fill-pointer 0 :adjustable t)) + (tokens '()) + (new-token-p t) + (token-source (cons source 0))) + (labels + ((next-token () + (push (text-to-token (copy-seq token-text-buffer) token-source) tokens) + (setf (fill-pointer token-text-buffer) 0 + new-token-p t + token-source (cons source (file-position stream))))) + (loop :for char := (read-char stream) + :for next := (peek-char nil stream nil :eof) + :until (eql :eof next) + :do + (cond ((and (not new-token-p) (whitespacep char)) + (next-token)) + ((char= char #\Newline) + ;; this will be different later + (unless (zerop (length token-text-buffer)) + (next-token)) + (vector-push #\Newline token-text-buffer) + (next-token)) + ((not (whitespacep char)) + (vector-push char token-text-buffer) + (setf new-token-p nil) + ;; Check if we should end here based on what's next + (when (member next *special-token-chars*) + (next-token)) + ;; Or if we *are* a special one-off character (that's a different set) + (when (member char *single-token-chars*) + (next-token)))) + :finally (next-token))) + (nreverse tokens))) + +;;; Jigs + +(defparameter *tokens-no-space-before* + '(token-comma + token-open-paren + token-close-paren + token-end-of-statement) + "Token classes which do not allow a space before being reprinted.") + +(defparameter *tokens-no-space-after* + '(token-end-of-statement + token-open-paren) + "Token classes which do not add a space after being reprinted.") + +(defparameter *reprint-indent-size* 4 + "The amount of spaces that an indentation level adds to its line.") + +(defun reprint-from-tokens (tokens) + (let ((indent-size 4)) + (loop :with indent := 0 + :with line-empty-p := t + :for token :in tokens + :for tokens-left := tokens :then (cdr tokens-left) + :for next-token := (second tokens-left) + ;; Pre-token style actions + :do (when (and (typep token 'token-keyword) + (member (name token) '(:end :else))) + (decf indent indent-size)) + ;; Printing action + :do (when line-empty-p + (loop :repeat indent :do (write-char #\Space)) + ;;(format t "(~D)" indent) + (setf line-empty-p nil)) + :do (let ((token-string + (cond ((typep token 'token-keyword) + (format nil "~A" (name token))) + ((typep token 'token-name) + (text token)) + ((typep token 'token-number) + (format nil "~D" (value token))) + (t (car (find (type-of token) *operator-token-classes* :key #'cadr)))))) + (format t "~A~A" token-string + (if (or (typep token `(or ,@*tokens-no-space-after*)) + (typep next-token `(or ,@*tokens-no-space-before*))) + "" + " ")) + (when (typep token 'token-end-of-statement) + (setf line-empty-p t))) + ;; Post-token style actions + :do (when (and (typep token 'token-keyword) + (member (name token) '(:times :then :else))) + (incf indent indent-size))))) diff --git a/wip-duuqnd/user-side-compiler/user-side-compiler.asd b/wip-duuqnd/user-side-compiler/user-side-compiler.asd index 26b18d6..246fd0a 100644 --- a/wip-duuqnd/user-side-compiler/user-side-compiler.asd +++ b/wip-duuqnd/user-side-compiler/user-side-compiler.asd @@ -4,6 +4,7 @@ :serial t :components ((:file "package") + (:file "tokenizer") (:file "label") (:file "high-level") (:file "instruction")))