;;; ------------------------------------------- ;;; General utilities ;; Convert string to list (defun string-to-list (s) (assert (stringp s) (s) "~s should be a string but is not") (coerce s 'list)) ;; Produce a list of all numbers from i to j (defun make-range (i j) (if (> i j) nil (cons i (make-range (1+ i) j)))) ;; Produce a list of all characters from a to b (defun char-range (a b) (map 'list #'code-char (make-range (char-code a) (char-code b)))) ;;; ------------------------------------------- ;;; Functions to act on a simple regular expression ;; Define a list of regular expressions (defstruct rexpr-list (state-count 1) (transitions nil) (finals nil)) ;; Allocate a new state (defun get-new-state (rlist) (let ((state (rexpr-list-state-count rlist))) (setf (rexpr-list-state-count rlist) (+ 1 state)) state)) ;; Add a transition to the table (defun add-transition (rlist transition) (push transition (rexpr-list-transitions rlist))) ;;; ------------------------------------------- ;;; Tools to generate an NFA from a rexpr ;; Convert a single character to an NFA (defun rexpr-char (rlist start a) (let ((state (get-new-state rlist))) (add-transition rlist `(,start ,state (,a))) state)) ;; Convert an epsilon production to an NFA (defun rexpr-eps (rlist start) (let ((state (get-new-state rlist))) (add-transition rlist `(,start ,state)) state)) ;; Convert a range to an NFA (defun rexpr-range (rlist start a b) (let ((state (get-new-state rlist))) (add-transition rlist `(,start ,state ,(char-range a b))) state)) ;; Convert an OR to an NFA (defun rexpr-or (rlist start rexprs) (let ((state (get-new-state rlist))) (dolist (rexpr rexprs) (add-transition rlist `(,(rexpr-sub rlist start rexpr) ,state))) state)) ;; Convert concatenation to an NFA (defun rexpr-cat (rlist start rexprs) (dolist (rexpr rexprs) (setf start (rexpr-sub rlist start rexpr))) start) ;; Convert * to an NFA (defun rexpr-* (rlist start rexpr) (let* ((star-start (get-new-state rlist)) (star-end (rexpr-sub rlist star-start rexpr))) (add-transition rlist `(,start ,star-start)) (add-transition rlist `(,star-end ,star-start)) star-start)) ;; Convert + to an NFA (defun rexpr-+ (rlist start rexpr) (rexpr-sub rlist start `(cat ,rexpr (* ,rexpr)))) ;; Convert ? to an NFA (defun rexpr-? (rlist start rexpr) (rexpr-sub rlist start `(or (eps) ,rexpr))) ;; Convert a digit range to an NFA (defun rexpr-digit (rlist start) (rexpr-sub rlist start `(range #\0 #\9))) ;; Convert a letter to an NFA (defun rexpr-letter (rlist start) (rexpr-sub rlist start `(or (range #\a #\z) (range #\A #\Z)))) ;; Convert a string to an NFA (defun rexpr-string (rlist start s) (let ((l (string-to-list s))) (rexpr-sub rlist start `(cat ,@(mapcar #'(lambda (c) `(char ,c)) l))))) ;; Handle a regular expression or subexpression (defun rexpr-sub (rlist start rexpr) (case (car rexpr) (char (rexpr-char rlist start (second rexpr))) (eps (rexpr-eps rlist start)) (range (rexpr-range rlist start (second rexpr) (third rexpr))) (or (rexpr-or rlist start (cdr rexpr))) (cat (rexpr-cat rlist start (cdr rexpr))) (* (rexpr-* rlist start (second rexpr))) (+ (rexpr-+ rlist start (second rexpr))) (? (rexpr-? rlist start (second rexpr))) (digit (rexpr-digit rlist start)) (letter (rexpr-letter rlist start)) (string (rexpr-string rlist start (second rexpr))) (otherwise (error "Unknown rexpr command")))) ;; Produce a regular expression (defun rexpr-process (rlist name rexpr) (push (cons (rexpr-sub rlist 0 rexpr) name) (rexpr-list-finals rlist))) ;;; ------------------------------------------- ;;; Tools to generate a DFA from an NFA ;; Find the states reachable from s through c (defun nfa-edge (rlist s c) (let ((reachable nil)) (dolist (edge (rexpr-list-transitions rlist)) (let ((source (first edge)) (dest (second edge)) (label (third edge))) (if (and (eq source s) (or (and (eq c 'epsilon) (null label)) (member c label))) (progn (pushnew dest reachable))))) reachable)) ;; Find the states reachable from S through c (defun nfa-edges (rlist S c) (let ((reachable nil)) (dolist (s S) (setf reachable (union reachable (nfa-edge rlist s c)))) reachable)) ;; Find the epsilon closure of a set of states S (defun eps-closure (rlist S) (let ((reachable (nfa-edges rlist S 'epsilon))) (if (subsetp reachable S) S (eps-closure rlist (union reachable S))))) ;; Find the set of states which can be reached through S on c (defun dfa-edge (rlist S c) (eps-closure rlist (nfa-edges rlist S c))) ;; Get the entire alphabet (defun rlist-alphabet (rlist) (let ((alphabet nil) (edges (rexpr-list-transitions rlist))) (dolist (edge edges) (setf alphabet (union alphabet (third edge)))) alphabet)) ;; Decide if two sets are equal (defun set-equal (S1 S2) (and (subsetp S1 S2) (subsetp S2 S1))) ;; Get the highest-priority final state (defun choose-final (finals state-set) (let ((best nil) (state-set (if (listp state-set) state-set (list state-set)))) (dolist (state state-set) (let ((state-final (assoc state finals))) (if (or (not best) (and state-final (< (car state-final) (car best)))) (setf best state-final)))) (cdr best))) ;; Convert an NFA to a DFA (defun nfa-to-dfa (rlist) (let ((dfa-rlist (make-rexpr-list)) (alphabet (rlist-alphabet rlist)) (nfa-finals (rexpr-list-finals rlist)) (dfa-sets nil) (queue nil)) (push `(0 . ,(eps-closure rlist '(0))) dfa-sets) (push 0 queue) (until (null queue) (let* ((dfa-state (pop queue)) (S (cdr (assoc dfa-state dfa-sets)))) (dolist (c alphabet) (let* ((Snew (dfa-edge rlist S c)) (state (car (rassoc Snew dfa-sets :test #'set-equal)))) (if Snew (progn (if (not state) (progn (setf state (get-new-state dfa-rlist)) (push `(,state . ,Snew) dfa-sets) (push state queue) (let ((final-state (choose-final nfa-finals Snew))) (if final-state (push (cons state final-state) (rexpr-list-finals dfa-rlist)))))) (add-transition dfa-rlist `(,dfa-state ,state (,c))))))))) (pack-dfa-rlist dfa-rlist))) ;;; ------------------------------------------- ;;; Tools to post-process a DFA (packing the transitions list) ;; Order predicate for pairs (defun pair-order-pred (x y) (or (< (first x) (first y)) (and (= (first x) (first y)) (< (second x) (second y))))) ;; Put the transition list into sorted order as a prelude for packing (defun sort-dfa-rlist (rlist) (setf (rexpr-list-transitions rlist) (sort (rexpr-list-transitions rlist) #'pair-order-pred))) ;; Check whether two transitions can be merged (defun same-transition (t1 t2) (and (eq (first t1) (first t2)) (eq (second t1) (second t2)))) ;; Pack the list (defun pack-dfa-rlist (rlist) (sort-dfa-rlist rlist) (let ((old-list (rexpr-list-transitions rlist)) (new-list nil)) (push (pop old-list) new-list) (dolist (tr old-list) (if (same-transition tr (car new-list)) (setf (third (car new-list)) (union (third tr) (third (car new-list)))) (push tr new-list))) (setf (rexpr-list-transitions rlist) new-list)) rlist) ;;; ------------------------------------------- ;;; Tools to compress a DFA ;; Mark (p,q) as inequivalent (defun mark-inequiv (inequiv p q) (setf (aref inequiv p q) t) (setf (aref inequiv q p) t)) ;; Mark inequivalent pairs of final states (defun mark-inequiv-among-finals (rlist inequiv) (let ((finals (rexpr-list-finals rlist))) (dolist (p finals) (dolist (q finals) (if (not (eq (cdr p) (cdr q))) (mark-inequiv inequiv (car p) (car q))))))) ;; Mark non-final states as inequivalent to final states (defun mark-inequiv-to-finals (rlist inequiv) (let ((nstates (rexpr-list-state-count rlist)) (finals (mapcar #'car (rexpr-list-finals rlist)))) (dolist (p finals) (dotimes (q nstates) (if (not (member q finals)) (mark-inequiv inequiv p q)))))) ;; Start the iteration by marking inequivalence to finals (defun mark-finals-inequiv (rlist inequiv) (mark-inequiv-among-finals rlist inequiv) (mark-inequiv-to-finals rlist inequiv)) ;; Search for new inequivalences (defun mark-inequiv-once (rlist inequiv) (let ((rules (rexpr-list-transitions rlist)) (newfound nil)) (dolist (rule1 rules) (dolist (rule2 rules) (let ((x (first rule1)) (y (first rule2)) (xp (second rule1)) (yp (second rule2)) (xc (third rule1)) (yc (third rule2))) (if (and (aref inequiv xp yp) (intersection xc yc)) (progn (setf newfound (or newfound (not (aref inequiv x y)))) (mark-inequiv inequiv x y)))))) newfound)) ;; Iteratively seek out all inequivalences (defun mark-inequiv-all (rlist inequiv) (mark-finals-inequiv rlist inequiv) (while (mark-inequiv-once rlist inequiv)) nil) ;; Build a table of inequivalences (defun build-inequiv-table (rlist) (let* ((nstates (rexpr-list-state-count rlist)) (inequiv (make-array `(,nstates ,nstates) :initial-element nil))) (mark-inequiv-all rlist inequiv) inequiv)) ;; Turn a table of inequivalences into a list of equivalence classes (defun find-equiv-helper (rlist inequiv) (let* ((nstates (rexpr-list-state-count rlist)) (processed (make-array `(,nstates) :initial-element nil)) (equiv nil)) (dotimes (p nstates) (if (not (aref processed p)) (let ((p-equiv nil)) (dotimes (q nstates) (if (and (not (aref inequiv p q)) (>= q p)) (progn (push q p-equiv) (setf (aref processed q) t)))) (push p-equiv equiv)))) (nreverse equiv))) ;; Find equivalence classes (defun find-equiv (rlist) (find-equiv-helper rlist (build-inequiv-table rlist))) ;; Map the old rule list to the compressed rule list (defun map-compressed-rules (map old-rlist new-rlist) (dolist (rule (rexpr-list-transitions old-rlist)) (push `(,(aref map (first rule)) ,(aref map (second rule)) ,(third rule)) (rexpr-list-transitions new-rlist)))) ;; Construct a compressed DFA (defun compress-dfa (rlist) (let* ((old-nstates (rexpr-list-state-count rlist)) (new-rlist (make-rexpr-list)) (equiv (find-equiv rlist)) (map (make-array `(,old-nstates) :initial-element nil))) (setf (rexpr-list-state-count new-rlist) 0) (dolist (state-class equiv) (let ((state-id (get-new-state new-rlist))) (dolist (state state-class) (setf (aref map state) state-id)) (let ((final-name (choose-final (rexpr-list-finals rlist) state-class))) (if final-name (push (cons state-id final-name) (rexpr-list-finals new-rlist)))))) (map-compressed-rules map rlist new-rlist) (pack-dfa-rlist new-rlist))) ;;; ------------------------------------------- ;;; Parse a string (the simple inefficient way) (defstruct lex-machine (state 0) (pos 0) (last-start-pos 0) (last-final-state nil) (last-final-pos nil) (tokens nil)) ;; Find the final id for this state (or nil if it's not final) (defun lex-final-id (rlist state) (let* ((finals (rexpr-list-finals rlist)) (final (assoc state finals))) (if final (cdr final)))) ;; Find the next state (defun lex-next-state (rlist l c) (let ((rules (rexpr-list-transitions rlist)) (state (lex-machine-state l)) (next-state nil)) (dolist (rule rules) (if (and (eq state (first rule)) (member c (third rule))) (setf next-state (second rule)))) next-state)) ;; Record if we're in a final state (defun lex-update-final (rlist l) (let* ((state (lex-machine-state l)) (pos (lex-machine-pos l)) (final (lex-final-id rlist state))) (if final (progn (setf (lex-machine-last-final-state l) final) (setf (lex-machine-last-final-pos l) pos))) final)) ;; Move states (defun lex-move-state (rlist l c) (let ((next-state (lex-next-state rlist l c))) (if next-state (progn (setf (lex-machine-state l) next-state) (lex-update-final rlist l)) (lex-accept-token l)))) ;; Accept a token (defun lex-accept-token (l) (let ((final (lex-machine-last-final-state l)) (p1 (lex-machine-last-start-pos l)) (p2 (lex-machine-last-final-pos l))) (if final (progn (push `(,final ,p1 ,p2) (lex-machine-tokens l)) (setf (lex-machine-pos l) p2) (setf (lex-machine-state l) 0) (setf (lex-machine-last-start-pos l) p2) (setf (lex-machine-last-final-pos l) nil) (setf (lex-machine-last-final-state l) nil)) (error "Lex error in string")))) ;; Tokenize a string into (id start end) tuples (defun lex-string-1 (rlist s) (let ((lex (make-lex-machine)) (len (length s))) (while (< (lex-machine-pos lex) len) (let ((c (aref s (lex-machine-pos lex)))) (incf (lex-machine-pos lex)) (lex-move-state rlist lex c))) (lex-accept-token lex) (nreverse (lex-machine-tokens lex)))) ;; Unpack the token lists produced by lex-string-1 (defun lexed-string-unpack (tok-list s) (when tok-list (let* ((tok-spec (car tok-list)) (tok-rest (cdr tok-list)) (tok-type (first tok-spec)) (tok-str (subseq s (second tok-spec) (third tok-spec))) (tok-val (cond ((eq tok-type :ignore) nil) ((functionp tok-type) (funcall tok-type tok-str)) (t tok-type)))) (if tok-val (cons tok-val (lexed-string-unpack tok-rest s)) (lexed-string-unpack tok-rest s))))) (defun lex-string (rlist s) (lexed-string-unpack (lex-string-1 rlist s) s)) ;;; ------------------------------------------- ;;; Macro to construct and compact a lexer (defmacro deflexer (name &rest rules) `(setf ,name (let ((rlist (make-rexpr-list))) ,@(map 'list #'(lambda (rule) (let ((lhs (first rule)) (rhs (second rule))) (if (listp lhs) `(rexpr-process rlist #'(lambda (token) ,lhs) ',rhs) `(rexpr-process rlist ',lhs ',rhs)))) rules) (compress-dfa (nfa-to-dfa rlist))))) ;;; ------------------------------------------- ;;; Test case #| (deflexer test-rlist ;; Real number ((cons 'real (string->num token)) (or (cat (* (digit)) (char #\.) (+ (digit))) (cat (+ (digit)) (char #\.) (* (digit))))) ;; Integer ((cons 'int (string->int token)) (+ (digit))) ;; Keywords (if (string "if")) (then (string "then")) (else (string "else")) ;; Identifier ((cons 'id token) (cat (letter) (* (or (letter) (digit))))) ;; Whitespace (:ignore (char #\ ))) ;; (pprint test-rlist) |#