From 3211f6d4410cc65f24571282219f740fc40247d6 Mon Sep 17 00:00:00 2001
From: John Lorentzson <duuqnd@stacken.kth.se>
Date: Fri, 16 May 2025 12:31:03 +0200
Subject: [PATCH] Add primitive tokenizer for USC

---
 wip-duuqnd/user-side-compiler/tokenizer.lisp  | 162 ++++++++++++++++++
 .../user-side-compiler/user-side-compiler.asd |   1 +
 2 files changed, 163 insertions(+)
 create mode 100644 wip-duuqnd/user-side-compiler/tokenizer.lisp

diff --git a/wip-duuqnd/user-side-compiler/tokenizer.lisp b/wip-duuqnd/user-side-compiler/tokenizer.lisp
new file mode 100644
index 0000000..21c6a03
--- /dev/null
+++ b/wip-duuqnd/user-side-compiler/tokenizer.lisp
@@ -0,0 +1,162 @@
+(in-package #:user-side-compiler)
+
+(defparameter *special-token-chars*
+  '(#\+ #\-
+    #\< #\>
+    #\( #\)
+    #\, #\.
+    #\= #\!)
+  "Characters that when encountered will finish up textual tokens and begin
+the processing of a new token.")
+
+(defparameter *single-token-chars*
+  '(#\, #\.
+    #\( #\))
+  "Characters that are ALWAYS single-character tokens and will end their own
+reading immediately. Should be a subset of *SPECIAL-TOKEN-CHARS*.")
+
+(defclass token ()
+  ((%source :accessor source :initarg :source)))
+
+;; Tokens containing user data
+
+(defclass token-name (token)
+  ((%text :accessor text :initarg :text)))
+
+(defclass token-number (token)
+  ((%value :accessor value :initarg :value)))
+
+;; Special syntax tokens
+
+(defclass token-end-of-statement (token) ())
+(defclass token-comma (token) ())
+(defclass token-plus (token) ())
+(defclass token-open-paren (token) ())
+(defclass token-close-paren (token) ())
+(defclass token-less-than (token) ())
+(defclass token-greater-than (token) ())
+
+;;; String->class-name mappings for operator tokens
+
+(defparameter *operator-token-classes*
+  `((,(string #\Newline) token-end-of-statement)
+    (";" token-end-of-statement)
+    ("," token-comma)
+    ("+" token-plus)
+    ("(" token-open-paren)
+    (")" token-close-paren)
+    ("<" token-less-than)
+    (">" token-greater-than)))
+
+;;; Keyword tokens
+
+(defclass token-keyword (token)
+  ((%name :accessor name :initarg :name)))
+
+(defparameter *syntax-keywords*
+  '("end" "for" "do" "times" "if" "then" "else"))
+
+(defun whitespacep (char)
+  (declare (type character char)
+           (optimize (speed 3)))
+  (not (null (member char '(#\Space #\Tab)))))
+
+(defun text-to-token (text &optional source)
+  (let ((operator-token (cadr (find text *operator-token-classes* :test #'equalp :key #'car))))
+    (if (not (null operator-token))
+        (make-instance operator-token :source source)
+        (cond ((every #'digit-char-p text)
+               (make-instance 'token-number :source source
+                                            :value (parse-integer text)))
+              ((member text *syntax-keywords* :test #'equalp)
+               (make-instance 'token-keyword :source source
+                              :name (intern (string-upcase text) (find-package '#:keyword))))
+              (t (make-instance 'token-name :source source :text text))))))
+
+(defun tokenize (stream &optional source)
+  (let ((token-text-buffer (make-array 32 :element-type 'character :fill-pointer 0 :adjustable t))
+        (tokens '())
+        (new-token-p t)
+        (token-source (cons source 0)))
+    (labels
+        ((next-token ()
+           (push (text-to-token (copy-seq token-text-buffer) token-source) tokens)
+           (setf (fill-pointer token-text-buffer) 0
+                 new-token-p t
+                 token-source (cons source (file-position stream)))))
+      (loop :for char := (read-char stream)
+            :for next := (peek-char nil stream nil :eof)
+            :until (eql :eof next)
+            :do
+               (cond ((and (not new-token-p) (whitespacep char))
+                      (next-token))
+                     ((char= char #\Newline)
+                      ;; this will be different later
+                      (unless (zerop (length token-text-buffer))
+                        (next-token))
+                      (vector-push #\Newline token-text-buffer)
+                      (next-token))
+                     ((not (whitespacep char))
+                      (vector-push char token-text-buffer)
+                      (setf new-token-p nil)
+                      ;; Check if we should end here based on what's next
+                      (when (member next *special-token-chars*)
+                        (next-token))
+                      ;; Or if we *are* a special one-off character (that's a different set)
+                      (when (member char *single-token-chars*)
+                        (next-token))))
+            :finally (next-token)))
+    (nreverse tokens)))
+
+;;; Jigs
+
+(defparameter *tokens-no-space-before*
+  '(token-comma
+    token-open-paren
+    token-close-paren
+    token-end-of-statement)
+  "Token classes which do not allow a space before being reprinted.")
+
+(defparameter *tokens-no-space-after*
+  '(token-end-of-statement
+    token-open-paren)
+  "Token classes which do not add a space after being reprinted.")
+
+(defparameter *reprint-indent-size* 4
+  "The amount of spaces that an indentation level adds to its line.")
+
+(defun reprint-from-tokens (tokens)
+  (let ((indent-size 4))
+    (loop :with indent := 0
+          :with line-empty-p := t
+          :for token :in tokens
+          :for tokens-left := tokens :then (cdr tokens-left)
+          :for next-token := (second tokens-left)
+          ;; Pre-token style actions
+          :do (when (and (typep token 'token-keyword)
+                         (member (name token) '(:end :else)))
+                (decf indent indent-size))
+              ;; Printing action
+          :do (when line-empty-p
+                (loop :repeat indent :do (write-char #\Space))
+                ;;(format t "(~D)" indent)
+                (setf line-empty-p nil))
+          :do (let ((token-string
+                      (cond ((typep token 'token-keyword)
+                             (format nil "~A" (name token)))
+                            ((typep token 'token-name)
+                             (text token))
+                            ((typep token 'token-number)
+                             (format nil "~D" (value token)))
+                            (t (car (find (type-of token) *operator-token-classes* :key #'cadr))))))
+                (format t "~A~A" token-string
+                        (if (or (typep token `(or ,@*tokens-no-space-after*))
+                                (typep next-token `(or ,@*tokens-no-space-before*)))
+                            ""
+                            " "))
+                (when (typep token 'token-end-of-statement)
+                  (setf line-empty-p t)))
+              ;; Post-token style actions
+          :do (when (and (typep token 'token-keyword)
+                         (member (name token) '(:times :then :else)))
+                (incf indent indent-size)))))
diff --git a/wip-duuqnd/user-side-compiler/user-side-compiler.asd b/wip-duuqnd/user-side-compiler/user-side-compiler.asd
index 26b18d6..246fd0a 100644
--- a/wip-duuqnd/user-side-compiler/user-side-compiler.asd
+++ b/wip-duuqnd/user-side-compiler/user-side-compiler.asd
@@ -4,6 +4,7 @@
   :serial t
   :components
   ((:file "package")
+   (:file "tokenizer")
    (:file "label")
    (:file "high-level")
    (:file "instruction")))