Add primitive tokenizer for USC
This commit is contained in:
parent
b9bda5ad68
commit
3211f6d441
2 changed files with 163 additions and 0 deletions
162
wip-duuqnd/user-side-compiler/tokenizer.lisp
Normal file
162
wip-duuqnd/user-side-compiler/tokenizer.lisp
Normal file
|
@ -0,0 +1,162 @@
|
|||
(in-package #:user-side-compiler)
|
||||
|
||||
(defparameter *special-token-chars*
|
||||
'(#\+ #\-
|
||||
#\< #\>
|
||||
#\( #\)
|
||||
#\, #\.
|
||||
#\= #\!)
|
||||
"Characters that when encountered will finish up textual tokens and begin
|
||||
the processing of a new token.")
|
||||
|
||||
(defparameter *single-token-chars*
|
||||
'(#\, #\.
|
||||
#\( #\))
|
||||
"Characters that are ALWAYS single-character tokens and will end their own
|
||||
reading immediately. Should be a subset of *SPECIAL-TOKEN-CHARS*.")
|
||||
|
||||
(defclass token ()
|
||||
((%source :accessor source :initarg :source)))
|
||||
|
||||
;; Tokens containing user data
|
||||
|
||||
(defclass token-name (token)
|
||||
((%text :accessor text :initarg :text)))
|
||||
|
||||
(defclass token-number (token)
|
||||
((%value :accessor value :initarg :value)))
|
||||
|
||||
;; Special syntax tokens
|
||||
|
||||
(defclass token-end-of-statement (token) ())
|
||||
(defclass token-comma (token) ())
|
||||
(defclass token-plus (token) ())
|
||||
(defclass token-open-paren (token) ())
|
||||
(defclass token-close-paren (token) ())
|
||||
(defclass token-less-than (token) ())
|
||||
(defclass token-greater-than (token) ())
|
||||
|
||||
;;; String->class-name mappings for operator tokens
|
||||
|
||||
(defparameter *operator-token-classes*
|
||||
`((,(string #\Newline) token-end-of-statement)
|
||||
(";" token-end-of-statement)
|
||||
("," token-comma)
|
||||
("+" token-plus)
|
||||
("(" token-open-paren)
|
||||
(")" token-close-paren)
|
||||
("<" token-less-than)
|
||||
(">" token-greater-than)))
|
||||
|
||||
;;; Keyword tokens
|
||||
|
||||
(defclass token-keyword (token)
|
||||
((%name :accessor name :initarg :name)))
|
||||
|
||||
(defparameter *syntax-keywords*
|
||||
'("end" "for" "do" "times" "if" "then" "else"))
|
||||
|
||||
(defun whitespacep (char)
|
||||
(declare (type character char)
|
||||
(optimize (speed 3)))
|
||||
(not (null (member char '(#\Space #\Tab)))))
|
||||
|
||||
(defun text-to-token (text &optional source)
|
||||
(let ((operator-token (cadr (find text *operator-token-classes* :test #'equalp :key #'car))))
|
||||
(if (not (null operator-token))
|
||||
(make-instance operator-token :source source)
|
||||
(cond ((every #'digit-char-p text)
|
||||
(make-instance 'token-number :source source
|
||||
:value (parse-integer text)))
|
||||
((member text *syntax-keywords* :test #'equalp)
|
||||
(make-instance 'token-keyword :source source
|
||||
:name (intern (string-upcase text) (find-package '#:keyword))))
|
||||
(t (make-instance 'token-name :source source :text text))))))
|
||||
|
||||
(defun tokenize (stream &optional source)
|
||||
(let ((token-text-buffer (make-array 32 :element-type 'character :fill-pointer 0 :adjustable t))
|
||||
(tokens '())
|
||||
(new-token-p t)
|
||||
(token-source (cons source 0)))
|
||||
(labels
|
||||
((next-token ()
|
||||
(push (text-to-token (copy-seq token-text-buffer) token-source) tokens)
|
||||
(setf (fill-pointer token-text-buffer) 0
|
||||
new-token-p t
|
||||
token-source (cons source (file-position stream)))))
|
||||
(loop :for char := (read-char stream)
|
||||
:for next := (peek-char nil stream nil :eof)
|
||||
:until (eql :eof next)
|
||||
:do
|
||||
(cond ((and (not new-token-p) (whitespacep char))
|
||||
(next-token))
|
||||
((char= char #\Newline)
|
||||
;; this will be different later
|
||||
(unless (zerop (length token-text-buffer))
|
||||
(next-token))
|
||||
(vector-push #\Newline token-text-buffer)
|
||||
(next-token))
|
||||
((not (whitespacep char))
|
||||
(vector-push char token-text-buffer)
|
||||
(setf new-token-p nil)
|
||||
;; Check if we should end here based on what's next
|
||||
(when (member next *special-token-chars*)
|
||||
(next-token))
|
||||
;; Or if we *are* a special one-off character (that's a different set)
|
||||
(when (member char *single-token-chars*)
|
||||
(next-token))))
|
||||
:finally (next-token)))
|
||||
(nreverse tokens)))
|
||||
|
||||
;;; Jigs
|
||||
|
||||
(defparameter *tokens-no-space-before*
|
||||
'(token-comma
|
||||
token-open-paren
|
||||
token-close-paren
|
||||
token-end-of-statement)
|
||||
"Token classes which do not allow a space before being reprinted.")
|
||||
|
||||
(defparameter *tokens-no-space-after*
|
||||
'(token-end-of-statement
|
||||
token-open-paren)
|
||||
"Token classes which do not add a space after being reprinted.")
|
||||
|
||||
(defparameter *reprint-indent-size* 4
|
||||
"The amount of spaces that an indentation level adds to its line.")
|
||||
|
||||
(defun reprint-from-tokens (tokens)
|
||||
(let ((indent-size 4))
|
||||
(loop :with indent := 0
|
||||
:with line-empty-p := t
|
||||
:for token :in tokens
|
||||
:for tokens-left := tokens :then (cdr tokens-left)
|
||||
:for next-token := (second tokens-left)
|
||||
;; Pre-token style actions
|
||||
:do (when (and (typep token 'token-keyword)
|
||||
(member (name token) '(:end :else)))
|
||||
(decf indent indent-size))
|
||||
;; Printing action
|
||||
:do (when line-empty-p
|
||||
(loop :repeat indent :do (write-char #\Space))
|
||||
;;(format t "(~D)" indent)
|
||||
(setf line-empty-p nil))
|
||||
:do (let ((token-string
|
||||
(cond ((typep token 'token-keyword)
|
||||
(format nil "~A" (name token)))
|
||||
((typep token 'token-name)
|
||||
(text token))
|
||||
((typep token 'token-number)
|
||||
(format nil "~D" (value token)))
|
||||
(t (car (find (type-of token) *operator-token-classes* :key #'cadr))))))
|
||||
(format t "~A~A" token-string
|
||||
(if (or (typep token `(or ,@*tokens-no-space-after*))
|
||||
(typep next-token `(or ,@*tokens-no-space-before*)))
|
||||
""
|
||||
" "))
|
||||
(when (typep token 'token-end-of-statement)
|
||||
(setf line-empty-p t)))
|
||||
;; Post-token style actions
|
||||
:do (when (and (typep token 'token-keyword)
|
||||
(member (name token) '(:times :then :else)))
|
||||
(incf indent indent-size)))))
|
|
@ -4,6 +4,7 @@
|
|||
:serial t
|
||||
:components
|
||||
((:file "package")
|
||||
(:file "tokenizer")
|
||||
(:file "label")
|
||||
(:file "high-level")
|
||||
(:file "instruction")))
|
||||
|
|
Loading…
Add table
Reference in a new issue