git.fiddlerwoaroof.com
Raw Blame History
;;; File: parser/lexer    Author: John

;;; token data structure: a list with the token type in the
;;; car and other information in the rest of the list.  Symbols
;;; designate the token type.

;;; Reserved tokens use the name as the type and have no args.
;;; Reserved tokens:
;;;  case class data default deriving else hiding if import in infix
;;;  infixl infixr instance interface let module of renaming then to
;;;  type where .. :: => = @ \ | ~ <- -> `
;;; Other tokens:
;;;  (file string)
;;;  (newline line indent-column)
;;;  (conid string)
;;;  (varid string)
;;;  (consym string)
;;;  (varsym string)
;;;  (comment string) ;;; not used at the moment
;;;  (integer integer)
;;;  (float integer fraction exponent) 
;;;  (string string)
;;;  (eof)


;;; *** All of the stuff for lexing character and string literals is
;;; *** broken because it assumes that the host Lisp uses the ASCII
;;; *** encoding for characters and supports at least 255 characters.
;;; *** I have marked the specific places in the code where these
;;; *** assumptions are made, but fixing the problem will probably
;;; *** require more drastic changes anyway -- such as using integers
;;; *** instead of characters and vectors of integers instead of characters
;;; *** throughout the compiler.

(define *max-char* 255)  ; highest char-code allowed.

;;; This defines the long names of the control chars.  Note that some of
;;; this duplicates the table above & the reader.

(define *control-char-names* '(
  ("NUL" . 0) ("SOH" . 1) ("STX" . 2) ("ETX" . 3)
  ("EOT" . 4) ("ENQ" . 5) ("ACK" . 6) ("BEL" . 7)
  ("BS" . 8) ("HT" . 9) ("LF" . 10) ("VT" . 11)
  ("FF" . 12) ("CR" . 13) ("SO" . 14) ("SI" . 15)
  ("DLE" . 16) ("DC1" . 17) ("DC2" . 18) ("DC3" . 19)
  ("DC4" . 20) ("NAK" . 21) ("SYN" . 22) ("ETB" . 23)
  ("CAN" . 24) ("EM" . 25) ("SUB" . 26) ("ESC" . 27)
  ("FS" . 28) ("GS" . 29) ("RS" . 30) ("US" . 31)
  ("SP" . 32) ("DEL" . 127)))

;;; This defines the short names for a few control chars.  This
;;; is keyed off the previous table

(define *short-control-char-names* '(
   (#\a . "BEL")    (#\b . "BS")    (#\f . "FF")    (#\n . "LF")
   (#\r . "CR") (#\t . "HT") (#\v . "VT")))

;;; This is used in the ^X construct.  Assume that ^X = code for ^A + X-A
;;; *** This is an invalid assumption.

(define *control-A* 1)

;;; This function is the interface between the lexer and the rest
;;; of the system.  Note that the `file' reported in error messages
;;; must be bound in an outer context.


;;; *** I think this function should be binding these variables and not
;;; *** just assigning them.

(define (lex-port port literate?)
  (setf *lex-literate?* literate?)
  (setf *current-line* 1)
  (setf *current-col* 0)
  (setf *on-new-line?* '#t)
  (setf *save-col?* '#f)
  (setf *port* port)
  (setf *tokens* '())
  (setf *char* (read-char *port*))
  (setf *peek-char* (read-char *port*))
  (when (eof-object? *char*)
	(setf *char* '#\space))
  (when (eof-object? *peek-char*)
	(setf *peek-char* '#\space))
  (setf *at-eof/p?* '#f)
  (setf *at-eof?* '#f)
  (when *lex-literate?*
     (process-literate-comments '#t))
  (parse-till-eof)
  (nreverse *tokens*))

(define (parse-till-eof)
  (cond (*at-eof?*
	 (emit-token 'eof)
	 '())
	(else
	 (lex-one-token)
	 (parse-till-eof))))

;;; There is an assumption that the scanner never peeks beyond a newline.
;;; In literate mode, this may reveal the wrong thing.

(define (advance-char)
  (if (and *lex-literate?* (eqv? *char* #\newline))
      (process-literate-comments '#f)
      (advance-char-1)))

(define (advance-char-1)
  (cond ((eqv? *char* #\newline)
	 (setf *on-new-line?* '#t)
	 (incf (the fixnum *current-line*))
	 (setf *current-col* 0))
	((eqv? *char* #\tab)
	 (incf (the fixnum *current-col*) (- 8 (modulo *current-col* 8))))
	(else
	 (incf (the fixnum *current-col*))))
  (setf *char* *peek-char*)
  (setf *at-eof?* *at-eof/p?*)
  (setf *peek-char* (read-char *port*))
  (when (eof-object? *peek-char*)
     (setf *at-eof/p?* '#t)
     (setf *peek-char* '#\space))
  *char*)

(define (peek-char-2)
  (let ((ch (peek-char *port*)))
    (if (eof-object? ch)
	'#\space
	ch)))

(define (lex-one-token)
 (setf *start-line* *current-line*) ; capture the loc at the start of the token
 (setf *start-col* *current-col*)
 (unless *at-eof?*
  (char-case *char*
    (whitechar
     (advance-char)
     (lex-one-token))
    (#\- (char-case *peek-char*
	    (#\- (lex-comment))
	    (#\> (advance-char)
		 (advance-char)
		 (emit-token '\-\>))
	    (#\} (signal-missing-begin-comment)
		 (advance-char)
		 (advance-char)
		 (lex-one-token))
	    (else
	     (lex-varsym))))
    (#\{ (cond ((char=? *peek-char* '#\-)
		(advance-char)
		(advance-char)
		(cond ((char=? *char* '#\#)
		       (advance-char)
		       (emit-token 'begin-annotation))
		      (else
		       (lex-ncomment)
		       (lex-one-token))))
	       (else
		(advance-char)
		(emit-token '\{ ))))
    (small (lex-varid))
    (large (lex-conid))
    (#\( (advance-char)
	 (emit-token '\())
    (#\: (lex-consym))
    (#\` (advance-char)
	 (emit-token '\`))
    ((symbol presymbol) (lex-varsym))
    (digit (lex-numeric))
    (#\' (lex-char))
    (#\" (lex-string))
    (#\) (advance-char)
	 (emit-token '\)))
    (#\, (advance-char)
	 (emit-token '\,))
    (#\; (advance-char)
	 (emit-token '\;))
    (#\[ (advance-char)
	 (emit-token '\[))
    (#\] (advance-char)
	 (emit-token '\]))
    (#\_ (advance-char)
	 (emit-token '\_))
    (#\} (advance-char)
	 (emit-token '\}))
    (else
     (signal-invalid-character *char*)
     (advance-char)
     (lex-one-token)))))

(define (signal-missing-begin-comment)
  (lexer-error 'missing-begin-comment
	       "`-}' appears outside of a nested comment."))

(define (signal-invalid-character ch)
  (lexer-error 'invalid-character 
	       "Invalid character `~a' appears in source program." ch))

(define (advance-past-white)
  (unless *at-eof?*
    (char-case *char*
      (whitechar
        (advance-char)
	(advance-past-white))
      (else
       '()))))

(define (process-literate-comments at-start?)
  (unless at-start? (advance-char-1))
  (let ((l (classify-line)))
    (cond ((or *at-eof?* (eq? l 'program))
	   '())
	  ((eq? l 'blank)
	   (skip-literate-comment '#t))
	  (else
	   (when (not at-start?)
		 (lexer-error 'blank-line-needed
		    "Literate comments must be preceeded by a blank line"))
	   (skip-literate-comment '#f)))))

(define (skip-literate-comment prev-blank)
  (skip-past-line)
  (let ((l (classify-line)))
    (cond (*at-eof?*
	   '())
	  ((eq? l 'comment)
	   (skip-literate-comment '#f))
	  ((eq? l 'blank)
	   (skip-literate-comment '#t))
	  (else
	   (when (not prev-blank)
	     (lexer-error 'blank-line-needed
		  "Literate comments must be followed by a blank line"))))))
  
(define (classify-line)
  (if *at-eof?*
      'blank
      (char-case *char*
       (#\>
	(advance-char-1)
	'program)
       (#\newline 'blank)
       (whitechar
	(classify-line-1))
       (else 'comment))))

(define (classify-line-1)
  (advance-char-1)
  (char-case *char*
    (#\newline 'blank)
    (whitechar (classify-line-1))
    (else 'comment)))

(define (skip-past-line)
  (when (not *at-eof?*)
    (char-case *char*
      (#\newline
       (advance-char-1))
      (else
       (advance-char-1)
       (skip-past-line)))))
	  
(define (lex-comment)  ;; a -- style comment
  (advance-char)
  (cond (*at-eof?* (lexer-eof-in-comment *current-line*))
	((char=? *char* #\newline)
	 (lex-one-token))
	(else
	 (lex-comment))))

(define (lexer-eof-in-comment start-line)
  (signal-eof-in-comment start-line)
  (lex-one-token))  ; will return the eof token

(define (signal-eof-in-comment start-line)
  (lexer-error 'eof-in-comment
	       "End of file in comment starting at line ~A." start-line))

;;; Here *char* and *peek-char* are the first two chars on a line.

(define (scan-symbol)
  (scan-list-of (symbol #\:)))

(define (scan-var-con)
  (scan-list-of (large small digit #\' #\_)))

(define (lex-ncomment)
  (lex-ncomment-1 *current-line*))

(define (lex-ncomment-1 start-line)
 (if *at-eof?*
  (lexer-eof-in-comment start-line)
  (char-case *char*
    (#\- (cond ((char=? *peek-char* #\})
		(advance-char)
		(advance-char))
	       (else
		(advance-char)
		(lex-ncomment-1 start-line))))
    (#\{ (cond ((char=? *peek-char* #\-)
		(advance-char)
		(advance-char)
		(lex-ncomment)
		(lex-ncomment-1 start-line))
	       (else
		(advance-char)
		(lex-ncomment-1 start-line))))
    (else
     (advance-char)
     (lex-ncomment-1 start-line)))))

(define (lex-varid)
  (let ((sym (scan-var-con)))
    (parse-reserved sym varid
       "case" "class"
       "data" "default" "deriving"
       "else"
       "hiding"
       "if" "import" "in" "infix" "infixl" "infixr" "instance" "interface"
       "let"
       "module"
       "of"
       "renaming"
       "then" "to" "type"
       "where")))

(define (lex-conid)
  (let ((sym (scan-var-con)))
    (emit-token/string 'conid sym)))

(define (lex-consym)
  (let ((sym (scan-symbol)))
    (cond ((string=/list? (cdr sym) ":")
	   (emit-token '\:\:))
	  (else
	   (emit-token/string 'consym sym)))))

(define (lex-varsym)
  (let ((sym (scan-symbol)))
    (cond ((and (string=/list? sym "<") (char=? *char* #\-))
	   (advance-char)
	   (emit-token '\<\-))
	  ((and (string=/list? sym "#")
		(char=? *char* #\-)
		(char=? *peek-char* #\}))
	   (advance-char)
	   (advance-char)
	   (emit-token 'end-annotation))
	  (else
	   (parse-reserved sym varsym
	      ".."
	      "=>" "="
	      "@"
	      "\\"
	      "|"
	      "~")))))

(define (lex-integer radix)
  (lex-integer-1 radix 0))

(define (lex-integer-1 radix psum)
  (declare (type fixnum radix)
	   (type integer psum))
  (let ((d  (char->digit *char* radix)))
    (if d
	(begin
	  (advance-char)
	  (lex-integer-1 radix (+ (* psum radix) (the fixnum d))))
	psum)))

(define (lex-fraction int-part denominator)
  (declare (type integer int-part denominator))
  (let ((d  (char->digit *char* 10)))
    (if d
	(begin
	  (advance-char)
	  (lex-fraction
	    (+ (* int-part 10) (the fixnum d)) (* denominator 10)))
	(values int-part denominator))))

(define (lex-numeric)
  (let ((int-part (lex-integer 10)))
    (if (and (char=? *char* #\.)
	     (char->digit *peek-char* 10))
	(lex-float int-part)
	(emit-token 'integer int-part))))

(define (lex-float int-part)
  (advance-char)
  (multiple-value-bind (numerator denominator) (lex-fraction int-part 1)
    (let ((no-exponent
	   (lambda () (emit-token 'float numerator denominator 0))))
      (char-case *char*
	(exponent
	  (char-case *peek-char*
	    (digit
	     (advance-char)
	     (lex-float/exp numerator denominator 1))
	    ((#\+ #\-)
	     (cond ((char->digit (peek-char-2) 10)
		    (let ((sign (if (char=? *peek-char* '#\+) 1 -1)))
		      (advance-char)
		      (advance-char)
		    (lex-float/exp numerator denominator sign)))
		 (else
		  (funcall no-exponent))))
	  (else
	   (funcall no-exponent))))
       (else
	(emit-token 'float numerator denominator 0))))))

(define (lex-float/exp numerator denominator sign)
  (let ((exponent (lex-integer 10)))
    (emit-token 'float numerator denominator (* sign exponent))))

(define (lex-char)
  (advance-char)
  (let ((c
    (char-case *char*
      (#\' (signal-null-character)
	   '#\?)
      (#\\ (lex-escaped-char '#f))
      ((#\space graphic)
       (let ((ch *char*))
	 (advance-char)
	 ch))
      (else
       (signal-bad-character-constant *char*)
       (advance-char)
       `#\?))))
    (cond ((char=? *char* '#\')
	   (advance-char)
	   (emit-token 'char c))
	  (else
	   (signal-missing-char-quote)
	   (skip-to-quote-or-eol)))))

(define (signal-null-character)
  (lexer-error 'null-character
	       "Null character '' is illegal - use '\\'' for a quote."))

(define (signal-bad-character-constant ch)
  (lexer-error 'bad-character-constant
	       "The character `~a' may not appear in a character literal." ch))

(define (signal-missing-char-quote)
  (lexer-error 'missing-char-quote
	       "Character constant has more than one character."))
  

(define (skip-to-quote-or-eol)
  (if *at-eof?*
      (lex-one-token)
      (char-case *char*
	 (#\' (advance-char)
	      (lex-one-token))
	 (#\newline (advance-char)
		    (lex-one-token))
	 (else
	  (advance-char)
	  (skip-to-quote-or-eol)))))

(define (lex-string)
  (advance-char)
  (emit-token 'string (list->string (gather-string-chars))))

(define (gather-string-chars)
  (char-case *char*
    (#\\
      (let ((ch (lex-escaped-char '#t)))
	(if (eq? ch 'null)
	    (gather-string-chars)
	    (cons ch (gather-string-chars)))))
    (#\"
      (advance-char)
      '())
    ((graphic #\space)
     (let ((ch *char*))
       (advance-char)
       (cons ch (gather-string-chars))))
    (#\newline
     (signal-missing-string-quote)
     '())
    (else
     (signal-bad-string-constant *char*)
     (advance-char)
     (gather-string-chars))))

(define (signal-missing-string-quote)
  (lexer-error 'missing-string-quote
	       "String continued over end of line."))

(define (signal-bad-string-constant ch)
  (lexer-error 'bad-string-constant
	       "The character `~a' may not appear in a string literal." ch))


(define (convert-stupid-control-character-names)
  (let ((c1 *char*)
	(c2 *peek-char*))
    (advance-char)
    (advance-char)
    (let ((s2 (string c1 c2))
	  (s3 (string c1 c2 *char*)))
      (let ((srch3 (assoc s3 *control-char-names*)))
	(cond (srch3
	       (advance-char)
	       (integer->char (cdr srch3)))
	      (else
	       (let ((srch2 (assoc s2 *control-char-names*)))
		 (cond (srch2
			(integer->char (cdr srch2)))
		       (else
			(signal-bad-control-char s3)
			`#\?)))))))))

(define (signal-bad-control-char name)
  (lexer-error 'invalid-control-char
	       "`~a' is not a recognized control character name." name))


(define (lex-escaped-char in-string?)
  (advance-char)
  (char-case *char*
    ((#\a #\b #\f #\n #\r #\t #\v)
     (let* ((ccode (cdr (assoc *char* *short-control-char-names*)))
	    (ccode1 (cdr (assoc ccode *control-char-names*))))
       (advance-char)
       (integer->char ccode1)))
    ((#\\ #\' #\")
     (let ((ch *char*))
       (advance-char)
       ch))
    (#\&
     (advance-char)
     (cond (in-string? 'null)
	   (else
	    (signal-bad-&-escape)
	    '#\?)))
    (#\^
     ;; *** This code is problematic because it assumes
     ;; *** (1) that you can do the arithmetic on the character codes
     ;; *** (2) that the resulting integer can actually be coerced to
     ;; ***     the right character object in the host Lisp.
     (advance-char)
     (char-case *char*
       ((large #\@ #\[ #\\ #\] #\^ #\_)
	(let ((code (+ (- (char->integer *char*)
			  (char->integer '#\A))
		       *control-A*)))
	  (advance-char)
	  (integer->char code)))
       (else
	(signal-bad-^-escape *char*)
	'#\?)))
    (large
     (convert-stupid-control-character-names))
    (digit
     (convert-num-to-char (lex-integer 10)))
    (#\o
     (advance-char)
     (cond ((char->digit *char* 8)
	    (convert-num-to-char (lex-integer 8)))
	   (else
	    (signal-missing-octal-digits)
	    '#\?)))
    (#\x
     (advance-char)
     (cond ((char->digit *char* 16)
	    (convert-num-to-char (lex-integer 16)))
	   (else
	    (signal-missing-hex-digits)
	    `#\?)))
    (whitechar
     (cond (in-string?
	    (lex-gap))
	   (else
	    (signal-bad-gap)
	    `#\?)))
    (else
     (signal-bad-escape *char*)
     `#\?)))

(define (signal-bad-&-escape)
  (lexer-error 'bad-&-escape
	       "The escape `\\&' is not allowed inside a character literal."))

(define (signal-bad-^-escape ch)
  (lexer-error 'bad-^-escape
	       "The escape `\\^~a' is not recognized." ch))

(define (signal-missing-octal-digits)
  (lexer-error 'missing-octal-digits
	       "No digits provided for `\\o' escape."))

(define (signal-missing-hex-digits)
  (lexer-error 'missing-hex-digits
	       "No digits provided for `\\x' escape."))

(define (signal-bad-gap)
  (lexer-error 'invalid-gap
	       "Gaps are not allowed inside character literals."))

(define (signal-bad-escape ch)
  (lexer-error 'bad-escape
	       "The escape `\\~a' is not recognized." ch))



;;; *** This code is problematic because it assumes that integers
;;; *** between 0 and 255 map on to characters with the corresponding
;;; *** ASCII encoding in the host Lisp, and that the host Lisp actually
;;; *** supports 255 characters.

(define (convert-num-to-char num)
  (cond ((and (>= num 0) (>= *max-char* num))
	 (integer->char num))
	(else
	 (signal-char-out-of-range num)
	 '#\?)))

(define (signal-char-out-of-range num)
  (lexer-error 'char-out-of-range
	       "There is no character corresponding to code ~s." num))


(define (lex-gap)
  (cond (*at-eof?*
	 (signal-eof-in-gap)
	 'null)
	(else
	 (char-case *char*
	   (whitechar
	    (advance-char)
	    (lex-gap))
	   (#\\
	    (advance-char)
	    'null)
	   (else
	    (signal-missing-gap)
	    'null)))))
  
      
(define (signal-eof-in-gap)
  (lexer-error 'eof-in-gap
	       "End of file encountered inside gap."))

(define (signal-missing-gap)
  (lexer-error 'missing-gap
	       "Missing gap delimiter, or junk inside gap."))