git.fiddlerwoaroof.com
Raw Blame History
;; -*- mode: common-lisp; package: net.post-office -*-
;;
;; See the file LICENSE for the full license governing this code.
;;

(defpackage :net.post-office
  (:use #:lisp #:excl)
  (:export
   #:parse-mime-structure
   #:mime-dequote
   #:with-part-stream
   
   ;; accessors
   #:mime-part-headers-size
   #:mime-part-body-size
   #:mime-part-lines
   #:mime-part-position
   #:mime-part-body-position
   #:mime-part-message

   ;; class name
   #:mime-part-parsed
   
   ))

(in-package :net.post-office)

(eval-when (compile)
  (declaim (optimize (speed 3))))

(eval-when (compile load eval)
  (require :streamp))

;;; MIME structure parser.
;;; Ref: RFC2045/2046

(defclass mime-part-parsed (mime-part)
  (
   (headers-size ;; in bytes. Includes the bytes for the blank line
    :accessor mime-part-headers-size :initform nil)
   (body-size ;; in bytes.
    :accessor mime-part-body-size :initform nil)
   (lines ;; line count of body (for non-multipart types)
    :accessor mime-part-lines :initform nil)
   (position ;; file position of start of headers
    :accessor mime-part-position :initform nil)
   (body-position ;; file position of start of body
    :accessor mime-part-body-position :initform nil)
   (message ;; for message/rfc822 encapsulated message.  
    ;; This will be a mime-part
    :accessor mime-part-message :initform nil)))

(defmacro get-header (name headers)
  `(cdr (assoc ,name ,headers :test #'equalp)))

(defvar *mime-read-line-unread*)

(defun parse-mime-structure (stream &key mbox)
  (let ((*mime-read-line-unread* nil))
    (multiple-value-bind (part stop-reason newpos)
	(parse-mime-structure-1 stream nil nil 0 mbox :outer t)
      (when (and part mbox (not (eq stop-reason :eof)))
	;;(format t "advancing to next mbox boundary~%")
	(multiple-value-bind (x y z newpos2)
	    (read-until-boundary stream nil newpos t)
	  (declare (ignore x y z))
	  (setf stop-reason :eof)
	  (setf newpos newpos2)))
      (values part stop-reason newpos))))

;; Returns values:
;; 1) The part (or nil if EOF while reading readers)
;; 2) The stop reason (:eof, :close-boundary, nil (meaning regular boundary))
;; 3) The new position

;: mime-parse-message-rfc822, parse-mime-structure, mime-parse-multipart
;: 
(defun parse-mime-structure-1 (stream boundary digest pos mbox &key outer)
  (let ((part (make-instance 'mime-part-parsed)))
    (setf (mime-part-position part) pos)
    (setf (mime-part-boundary part) boundary)
    (multiple-value-bind (headers bytes)
	(parse-headers stream mbox)
      (if (and (null headers) outer)
	  (return-from parse-mime-structure-1))
      (setf (mime-part-headers-size part) bytes)
      (incf pos bytes)
      (setf (mime-part-body-position part) pos)
      (setf (mime-part-headers part) headers)
      
      (let ((content-type (get-header "content-type" headers)))
	(setf (mime-part-id part) (get-header "Content-Id" headers))
	(setf (mime-part-description part) 
	  (get-header "Content-description" headers))
	(setf (mime-part-encoding part) 
	  (or (get-header "Content-transfer-encoding" headers)
	      "7bit"))
	
	(multiple-value-bind (type subtype params)
	    (parse-content-type content-type)
	  
	  (if* (null type)
	     then (if* digest
		     then (setf (mime-part-type part) "message")
			  (setf (mime-part-subtype part) "rfc822")
			  (setf (mime-part-parameters part) 
			    '(("charset" . "us-ascii")))
			  (mime-parse-message-rfc822 part stream boundary pos
						     mbox)
		     else (setup-text-plain-part part stream boundary pos 
						 mbox))
	     else (setf (mime-part-type part) type)
		  (setf (mime-part-subtype part) subtype)
		  (setf (mime-part-parameters part) params)
		  
		  (cond 
		   ((equalp type "multipart")
		    (mime-parse-multipart part stream boundary pos 
					  mbox))
		   ((message-rfc822-p type subtype)
		    (mime-parse-message-rfc822 part stream boundary pos 
					       mbox))
		   (t
		    (mime-parse-non-multipart part stream boundary pos 
					      mbox)))))))))

;: skip-whitespace, parse-header-line, parse-headers
;: 
(defmacro whitespace-char-p (char)
  (let ((c (gensym)))
    `(let ((,c ,char))
       (or (char= ,c #\space) (char= ,c #\tab) (char= ,c #\newline)))))

;; OK if 'string' is nil.
;; Might return nil
;; called by parse-mime-structure-1
;: parse-mime-structure-1
;: 
(defun parse-content-type (string)
  (block nil
    (if (null string)
	(return))
    (let ((max (length string))
	  pos type subtype)
      (multiple-value-setq (type pos)
	(mime-get-token string 0 max))
      (if (string= type "")
	  (return))
      
      (setf pos (skip-whitespace string pos max))
      
      (if (or (>= pos max) (char/= (schar string pos) #\/))
	  (return)) ;; bogus input
      
      (multiple-value-setq (subtype pos)
	(mime-get-token string (1+ pos) max))
      
      (if (string= subtype "")
	  (return)) ;; bogus input
      
      (values type subtype (parse-parameters string pos max)))))





;; called by parse-content-type.
;: parse-content-type
;: 
(defun parse-parameters (string pos max)
  (let (char pairs param value)
    (while (< pos max)
      (setf pos (skip-whitespace string pos max))
      (setf char (schar string pos))
      
      (if (char/= char #\;)
	  (return))
      
      (multiple-value-setq (param pos)
	(mime-get-token string (1+ pos) max))
      (setf pos (skip-whitespace string pos max))
      (if (or (>= pos max) (char/= (schar string pos) #\=))
	  (return))
      (multiple-value-setq (value pos)
	(mime-get-parameter-value string (1+ pos) max))

      (push (cons param value) pairs))
    (values (nreverse pairs) pos)))
  

(defconstant *mime-tspecials*
    '(#\( #\) #\< #\> #\@ 
      #\, #\; #\: #\\ #\" 
      #\/ #\[ #\] #\? #\=))

;: parse-content-type, parse-parameters, mime-get-parameter-value
;: mime-get-token, blank-line-p, parse-header-line
;: 
(defun skip-whitespace (string pos max)
  (declare (optimize (speed 3) (safety 0))
	   (fixnum pos max))
  (while (< pos max)
    (if (not (whitespace-char-p (schar string pos)))
	(return))
    (incf pos))
  pos)

;: parse-parameters
;: 
(defun mime-get-parameter-value (string pos max)
  (setf pos (skip-whitespace string pos max))
  (if* (>= pos max)
     then (values "" pos)
     else (if (char= (schar string pos) #\")
	      (mime-get-quoted-string string pos max)
	    (mime-get-token string pos max))))

;: parse-content-type, parse-parameters, mime-get-parameter-value
;: 
(defun mime-get-token (string pos max)
  (setf pos (skip-whitespace string pos max))
  (let ((startpos pos)
	char)
    (while (< pos max)
      (setf char (schar string pos))
      (if (or (char= #\space char) (member char *mime-tspecials*))
	  (return))
      (incf pos))
    (values (subseq string startpos pos) pos)))

;; Doesn't attempt to dequote
;: mime-get-parameter-value
;: 
(defun mime-get-quoted-string (string pos max)
  (let ((res (make-string (- max pos)))
	(outpos 0)
	char inquote inbackslash)
    (while (< pos max)
      (setf char (schar string pos))
      
      (when (and (char= char #\") (not inbackslash))
	(if* inquote
	   then	(setf (schar res outpos) char)
		(incf outpos)
		(incf pos)
		(return))
	(setf inquote t))
      
      (if* inbackslash
	 then (setf inbackslash nil)
	 else (if (char= char #\\)
		  (setf inbackslash t)))
      
      (setf (schar res outpos) char)
      (incf outpos)
      (incf pos))
    
    (values (subseq res 0 outpos) pos)))

;; mime-parse-multipart
;: 
(defun mime-dequote (string)
  (block nil
    (if (or (string= string "") (char/= (schar string 0) #\"))
	(return string))
    
    (let* ((max (length string))
	   (pos 1)
	   (res (make-string max))
	   (outpos 0)
	   char inbackslash)
      
      (while (< pos max)
	(setf char (schar string pos))
	
	(if (and (char= char #\") (not inbackslash))
	    (return))
	
	(if* (and (not inbackslash) (char= char #\\))
	   then	(setf inbackslash t)
		(incf pos)
	   else	(setf (schar res outpos) char)
		(incf outpos)
		(incf pos)
		(setf inbackslash nil)))
      
      (subseq res 0 outpos))))

;: parse-mime-structure-1
;: 
(defun setup-text-plain-part (part stream boundary pos mbox)
  (setf (mime-part-type part) "text")
  (setf (mime-part-subtype part) "plain")
  (setf (mime-part-parameters part) '(("charset" . "us-ascii")))
  (mime-parse-non-multipart part stream boundary pos mbox))

;: setup-text-plain-part, parse-mime-structure-1
;: 
(defun mime-parse-non-multipart (part stream boundary pos mbox)
  (let ((startpos pos))
    (multiple-value-bind (size lines eof pos)
	(read-until-boundary stream boundary pos mbox)
      
      (setf (mime-part-lines part) lines)
      (setf (mime-part-body-position part) startpos)
      (setf (mime-part-body-size part) size)
      
      (values part eof pos))))

;: parse-mime-structure-1
;: 
(defun mime-parse-message-rfc822 (part stream boundary pos mbox)
  (let ((startpos pos))
    (multiple-value-bind (message eof pos)
	(parse-mime-structure-1 stream boundary nil pos mbox)
      
      (setf (mime-part-message part) message)
      
      (setf (mime-part-body-position part) startpos)
      (setf (mime-part-body-size part) (- pos startpos))
      
      (values part eof pos))))
  

;: parse-mime-structure-1
;: 
(defun mime-parse-multipart (part stream parent-boundary pos mbox)
  (let* ((params (mime-part-parameters part))
	 (boundary (cdr (assoc "boundary" params :test #'equalp)))
	 (startpos pos)
	 parts eof newpart)
    
    (setf (mime-part-boundary part) parent-boundary)
    
    ;; If boundary isn't specified.. try to compensate by using our
    ;; parent's boundary.
    (if* (null boundary)
       then (setf boundary parent-boundary)
       else (setf boundary (mime-dequote boundary)))

    ;; Locate the first boundary.
    (multiple-value-bind (ignore1 ignore2 ignore3 newpos)
	(read-until-boundary stream boundary pos mbox)
      (declare (ignore ignore1 ignore2 ignore3))
      (setf pos newpos))
    
    (until eof
      (multiple-value-setq (newpart eof pos)
	(parse-mime-structure-1 stream boundary
				(equalp (mime-part-subtype part) "digest")
				pos mbox))
      (push newpart parts))
    
    (setf (mime-part-parts part) (nreverse parts))
    
    (setf (mime-part-body-size part) (- pos startpos))
    
    ;; Discard everything that follows until we reach the parent-boundary.
    (multiple-value-bind (ignore1 ignore2 eof pos)
	(read-until-boundary stream parent-boundary pos mbox)
      (declare (ignore ignore1 ignore2))
      (values part eof pos))))


;; support

(defconstant *whitespace* '(#\space #\tab #\return #\newline))


;: parse-headers
;: 
(defun parse-header-line (line len)
  (declare (optimize (speed 3) (safety 0)))
  (let ((pos 0)
	colonpos
	spacepos)
    (declare (fixnum len pos spacepos))
    
    (while (< pos len)
      (let ((char (schar line pos))) 
	(when (char= char #\:)
	  (setf colonpos pos)
	  (return))
	
	(if (and (null spacepos) (whitespace-char-p char))
	    (setf spacepos pos)))
      
      (incf pos))
 
    (if (null colonpos) ;; bogus header line
	(return-from parse-header-line))
    
    (if (null spacepos)
	(setf spacepos colonpos))
    
    (if (= 0 spacepos) ;; bogus header line (no name)
	(return-from parse-header-line))
    
    (values (subseq line 0 spacepos)
	    (subseq line (skip-whitespace line (1+ colonpos) len) len))))

;; Returns offset of end of line in buffer.  Or nil if EOF
;; Second value is the number of characters read (including EOL chars)

;: parse-headers, read-until-boundary, collect-message-data-from-mbox
;: 
(defun mime-read-line (stream buffer)
  (declare (optimize (speed 3) (safety 0)))
  
  (if* *mime-read-line-unread*
     then (let* ((line (car *mime-read-line-unread*))
		 (bytes (cdr *mime-read-line-unread*))
		 (len (length line)))
	    (declare (simple-string line))
	    (setf *mime-read-line-unread* nil)
	    (dotimes (n len)
	      (setf (schar buffer n) (schar line n)))
	    (values len bytes))
     else (let ((pos 0)
		(end (length buffer))
		(count 0)
		char)
	    (declare (fixnum pos end count))
    
	    (while (and (< pos end) (setf char (read-char stream nil nil)))
	      (incf count)
	      (if (char= char #\newline)
		  (return))
	      (setf (schar buffer pos) char)
	      (incf pos))
    
	    (if* (= count 0)
	       then nil ;; EOF
	       else ;; Check for CR/LF combo
		    (if (and (> pos 0) 
			     (char= (schar buffer (1- pos)) #\return))
			(decf pos))
		    
		    (values pos count)))))
	    
(defun mime-unread-line (line end bytes)
  ;; This should never happen
  (if *mime-read-line-unread*
      (error "Unread buffer is full."))
  (setf *mime-read-line-unread* 
    (cons (subseq line 0 end) bytes)))

(eval-when (compile)
  (defconstant *parse-headers-line-len* 1024))

;; Returns:
;; 1) headers alist
;; 2) # of characters composing the header and terminator.
;:
;: parse-mime-structure-1
;: 
(defun parse-headers (stream mbox)
  (declare (optimize (speed 3) (safety 0)))
  (let ((count 0)
	(line (make-array #.*parse-headers-line-len* :element-type 'character))
	headers lastcons current incomplete lastincomplete)
    (declare (fixnum count)
	     (dynamic-extent line))

    (loop
      (multiple-value-bind (end bytes)
	  (mime-read-line stream line)
	(declare (fixnum end bytes))

	(if (null end)  ;; EOF
	    (return))
	
	(setf incomplete (= end #.*parse-headers-line-len*))
	
	(if (and mbox (not lastincomplete) (my-prefixp "From " line end))
	    (return))
	
	(incf count bytes)

	(cond
	 (lastincomplete ;; rest of a long line
	  (setf (car lastcons)
	    (concatenate 'string (car lastcons) (subseq line 0 end))))
	 
	 ((zerop end) ;; blank line
	  (return))
	 
	 ((whitespace-char-p (schar line 0)) ;; Continuation line
	  (when (null current) ;; Malformed header line
	    (decf count bytes) 
	    (mime-unread-line line end bytes)
	    (return)) 
	  
	  (let ((newcons (cons (subseq line 0 end) nil)))
	    (setf (cdr lastcons) newcons)
	    (setf lastcons newcons)))

	 (t ;; Fresh header line
	  (multiple-value-bind (name value)
	      (parse-header-line line end)
	    (when (null name)
	      ;; Malformed header line.  Unread it (so that it
	      ;; will be treated as part of the body) and
	      ;; consider the headers terminated.
	      (decf count bytes)
	      (mime-unread-line line end bytes)
	      (return))
	    
	    (setf lastcons (cons value nil))
	    (setf current (cons name lastcons))
	    (push current headers))))
	 
	(setf lastincomplete incomplete)))

    ;; Finalize strings.
    (dolist (header headers)
      (setf (cdr header) (coalesce-header header)))
    
    (values (nreverse headers) count)))

;: parse-headers
;: 
(defun coalesce-header (header)
  (declare (optimize (speed 3) (safety 0)))
  (let ((stringlist (cdr header)))
    (if* (= (length stringlist) 1)
       then (first stringlist)
       else (let ((len 0))
	      (declare (fixnum len))
	      (dolist (string stringlist)
		(incf len (1+ (the fixnum (length string)))))
	      (decf len)
	      (let ((res (make-string len))
		    (pos 0)
		    (first t))
		(declare (fixnum pos))
		(dolist (string stringlist)
		  (if* first
		     then (setf first nil)
		     else (setf (schar res pos) #\newline)
			  (incf pos))
		  (dotimes (n (length string))
		    (declare (fixnum n))
		    (setf (schar res pos) (schar string n))
		    (incf pos)))
		res)))))

;; Returns: (1) size of part 
;;          (2) number of lines read
;;          (3) stop reason (:eof, :close-boundary, or nil (meaning regular
;;                                                          boundary)
;;          (4) new stream position (post boundary read)
;: mime-parse-multipart, mime-parse-non-multipart
;: 
(defun read-until-boundary (stream boundary pos mbox)
  (declare (optimize (speed 3) (safety 0))
	   (fixnum pos))
  (if* (and (null boundary) (null mbox))
     then 
	  (multiple-value-bind (lines count)
	      (count-lines-to-eof stream)
	    (declare (fixnum count))
	    (values count lines :eof (+ pos count)))
     else 
	  (let ((line (make-array 16000 :element-type 'character))
		(size 0)
		(lines 0)
		(stop-reason :eof)
		delimiter close-delimiter)
	    (declare (dynamic-extent line)
		     (fixnum size lines))
	    
	    (when boundary
	      (setf delimiter (concatenate 'string "--" boundary))
	      (setf close-delimiter (concatenate 'string delimiter "--")))
	    
	    (loop
	      (multiple-value-bind (end bytes)
		  (mime-read-line stream line)
		(declare (fixnum end bytes))
		
		(if (or (null end)
			(and mbox (my-prefixp "From " line end)))
		    (return))
		
		(incf pos bytes)
		
		(when (and delimiter (my-prefixp delimiter line end))
		  (if* (my-prefixp close-delimiter line end)
		     then (setf stop-reason :close-boundary)
		     else (setf stop-reason nil))
		  (return))
		
		(incf size bytes)
		(incf lines))) 
	    
	    (values size lines stop-reason pos))))

;; Returns:
;; 1) number of lines
;; 2) number of bytes read
;: read-until-boundary
;: 
(defun count-lines-to-eof (stream)
  (declare (optimize (speed 3) (safety 0)))
  (let ((buffer (make-array 65536 :element-type '(unsigned-byte 8)))
	(lines 0)
	(pos 0)
	(lastbyte -1)
	(count 0)
	end)
    (declare (dynamic-extent buffer)
	     (fixnum lines pos end lastbyte count))
    ;; count 10's
    ;; XXX: The count will be off if the file has CR/LF convention and
    ;; there are bare LFs.  
    (loop
      (setf end (read-vector buffer stream))
      (incf count end)
      
      (if (= end 0)
	  (return))
      
      (while (< pos end)
	(if (= (aref buffer pos) 10)
	    (incf lines))
	(incf pos))
      
      (setf lastbyte (aref buffer (1- pos))))
    
    ;; Count last partial line.
    (if (and (> lastbyte 0) (/= lastbyte 10))
	(incf lines))
    
    (values lines count)))

(defun my-prefixp (prefix string &optional end)
  (declare (optimize (speed 3) (safety 0)))
  (let ((lenprefix (length prefix))
	(end (or end (length string))))
    (declare (fixnum lenprefix end))
    (when (>= end lenprefix)
      (dotimes (n lenprefix)
	(declare (fixnum n))
	(if (char/= (schar prefix n) (schar string n))
	    (return-from my-prefixp)))
      t)))

;;; misc

(defun stream-to-stream-copy (outstream instream count)
  (declare (optimize (speed 3))
	   (fixnum count))
  (let ((buf (make-array 4096 :element-type '(unsigned-byte 8))))
    (declare (dynamic-extent buf))
    (while (> count 0)
      (let ((got (read-sequence buf instream :end (min count 4096))))
	(declare (fixnum got))
	(if (zerop got)
	    (error "Unexpected EOF while reading from ~a" instream))
	(write-sequence buf outstream :end got)
	(decf count got)))))

;; 'instream' must be positioned appropriately by the caller beforehand.
(defmacro with-part-stream ((sym part instream &key (header t)) &body body)
  (let ((p (gensym))
	(stream (gensym))
	(count (gensym)))
    `(let* ((,p ,part)
	    (,stream ,instream)
	    (,count (mime-part-body-size ,p)))
       (if ,header
	   (incf ,count (mime-part-headers-size ,p)))
       (excl:with-function-input-stream 
	   (,sym #'stream-to-stream-copy ,stream ,count)
	 ,@body))))
	 


;;; testing

#|
(defun test-parse-mime (file &key (pretty t))
  (with-open-file (f file)
    (let ((res     (parse-mime-structure f)))
      (if pretty
	  (pprint-rfc822-part res)
	res))))

(defun pprint-rfc822-part (thing &optional (prefix ""))
  (if (null thing)
      (error "Strange. something called pprint-rfc822-part with nil"))
  (let ((type (mime-part-type thing))
	(subtype (mime-part-subtype thing)))
    (format t "~AHEADER ([RFC-2822] header of the message)~%" prefix)
    (format t "~ATEXT   ([RFC-2822] text body of the message) ~A/~A~%" prefix type subtype)
    
    (if* (message-rfc822-p type subtype)
       then ;; XXX .. what should the new prefix be? 
	    (pprint-rfc822-part (mime-part-message thing) prefix)
     elseif (equalp type "multipart")
       then (pprint-multipart thing prefix))))

(defun pprint-multipart (thing prefix)
  (let ((id 1))
    (dolist (part (mime-part-parts thing))
      (let ((type (mime-part-type part))
	    (subtype (mime-part-subtype part)))
	(format t "~a~a      ~a/~a~%" prefix id type subtype)
	(if* (message-rfc822-p type subtype)
	   then (pprint-rfc822-part (mime-part-message part) 
				    (format nil "~a~a." prefix id))
	 elseif (equalp type "multipart")
	   then (pprint-multipart part (format nil "~a~a." prefix id)))
	(incf id)))))
|#