git.fiddlerwoaroof.com
mime-parse.cl
9d3f79bb
 ;; $Id: mime-parse.cl,v 1.3 2007/03/14 18:44:59 duane Exp $
f6cd6a72
 
 (defpackage :net.post-office
   (:use #:lisp #:excl)
   (:export
    #:parse-mime-structure
    #:mime-dequote
c781d4b2
    #:with-part-stream
f6cd6a72
    
    ;; accessors
    #:mime-part-headers-size
    #:mime-part-body-size
    #:mime-part-lines
    #:mime-part-position
    #:mime-part-body-position
    #:mime-part-message
 
    ;; class name
    #:mime-part-parsed
    
    ))
 
 (in-package :net.post-office)
 
 (eval-when (compile)
   (declaim (optimize (speed 3))))
 
c781d4b2
 (eval-when (compile load eval)
   (require :streamp))
 
f6cd6a72
 ;;; MIME structure parser.
 ;;; Ref: RFC2045/2046
 
 (defclass mime-part-parsed (mime-part)
   (
    (headers-size ;; in bytes. Includes the bytes for the blank line
     :accessor mime-part-headers-size :initform nil)
    (body-size ;; in bytes.
     :accessor mime-part-body-size :initform nil)
    (lines ;; line count of body (for non-multipart types)
     :accessor mime-part-lines :initform nil)
    (position ;; file position of start of headers
     :accessor mime-part-position :initform nil)
    (body-position ;; file position of start of body
     :accessor mime-part-body-position :initform nil)
    (message ;; for message/rfc822 encapsulated message.  
     ;; This will be a mime-part
     :accessor mime-part-message :initform nil)))
 
c781d4b2
 (defmacro get-header (name headers)
   `(cdr (assoc ,name ,headers :test #'equalp)))
f6cd6a72
 
c781d4b2
 (defun parse-mime-structure (stream &key mbox)
   (multiple-value-bind (part stop newpos)
       (parse-mime-structure-1 stream nil nil 0 mbox)
     (declare (ignore stop))
     (values part newpos)))
 
 ;; Returns values:
 ;; 1) The part
 ;; 2) The stop reason (:eof, :close-boundary, nil (meaning regular boundary))
 ;; 3) The new position
f6cd6a72
 
c781d4b2
 ;: mime-parse-message-rfc822, parse-mime-structure, mime-parse-multipart
 ;: 
f6cd6a72
 (defun parse-mime-structure-1 (stream boundary digest pos mbox)
   (let ((part (make-instance 'mime-part-parsed)))
     (setf (mime-part-position part) pos)
     (setf (mime-part-boundary part) boundary)
     (multiple-value-bind (headers bytes)
 	(parse-headers stream mbox)
       (setf (mime-part-headers-size part) bytes)
       (incf pos bytes)
       (setf (mime-part-body-position part) pos)
       (setf (mime-part-headers part) headers)
       
c781d4b2
       (let ((content-type (get-header "content-type" headers)))
 	(setf (mime-part-id part) (get-header "Content-Id" headers))
f6cd6a72
 	(setf (mime-part-description part) 
c781d4b2
 	  (get-header "Content-description" headers))
f6cd6a72
 	(setf (mime-part-encoding part) 
c781d4b2
 	  (or (get-header "Content-transfer-encoding" headers)
f6cd6a72
 	      "7bit"))
 	
 	(multiple-value-bind (type subtype params)
 	    (parse-content-type content-type)
 	  
 	  (if* (null type)
c781d4b2
 	     then (if* digest
 		     then (setf (mime-part-type part) "message")
f6cd6a72
 			  (setf (mime-part-subtype part) "rfc822")
 			  (setf (mime-part-parameters part) 
 			    '(("charset" . "us-ascii")))
 			  (mime-parse-message-rfc822 part stream boundary pos
 						     mbox)
c781d4b2
 		     else (setup-text-plain-part part stream boundary pos 
 						 mbox))
 	     else (setf (mime-part-type part) type)
f6cd6a72
 		  (setf (mime-part-subtype part) subtype)
 		  (setf (mime-part-parameters part) params)
 		  
 		  (cond 
 		   ((equalp type "multipart")
c781d4b2
 		    (mime-parse-multipart part stream boundary pos 
 					  mbox))
f6cd6a72
 		   ((message-rfc822-p type subtype)
c781d4b2
 		    (mime-parse-message-rfc822 part stream boundary pos 
 					       mbox))
f6cd6a72
 		   (t
c781d4b2
 		    (mime-parse-non-multipart part stream boundary pos 
 					      mbox)))))))))
 
 ;: skip-whitespace, parse-header-line, parse-headers
 ;: 
 (defmacro whitespace-char-p (char)
   (let ((c (gensym)))
     `(let ((,c ,char))
        (or (char= ,c #\space) (char= ,c #\tab) (char= ,c #\newline)))))
f6cd6a72
 
 ;; OK if 'string' is nil.
 ;; Might return nil
 ;; called by parse-mime-structure-1
c781d4b2
 ;: parse-mime-structure-1
 ;: 
f6cd6a72
 (defun parse-content-type (string)
   (block nil
     (if (null string)
 	(return))
     (let ((max (length string))
 	  pos type subtype)
       (multiple-value-setq (type pos)
 	(mime-get-token string 0 max))
       (if (string= type "")
 	  (return))
       
       (setf pos (skip-whitespace string pos max))
       
c781d4b2
       (if (or (>= pos max) (char/= (schar string pos) #\/))
f6cd6a72
 	  (return)) ;; bogus input
       
       (multiple-value-setq (subtype pos)
 	(mime-get-token string (1+ pos) max))
       
       (if (string= subtype "")
 	  (return)) ;; bogus input
       
       (values type subtype (parse-parameters string pos max)))))
 
c781d4b2
 
 
 
 
f6cd6a72
 ;; called by parse-content-type.
c781d4b2
 ;: parse-content-type
 ;: 
f6cd6a72
 (defun parse-parameters (string pos max)
   (let (char pairs param value)
     (while (< pos max)
       (setf pos (skip-whitespace string pos max))
c781d4b2
       (setf char (schar string pos))
f6cd6a72
       
       (if (char/= char #\;)
 	  (return))
       
       (multiple-value-setq (param pos)
 	(mime-get-token string (1+ pos) max))
       (setf pos (skip-whitespace string pos max))
c781d4b2
       (if (or (>= pos max) (char/= (schar string pos) #\=))
f6cd6a72
 	  (return))
       (multiple-value-setq (value pos)
 	(mime-get-parameter-value string (1+ pos) max))
 
       (push (cons param value) pairs))
     (values (nreverse pairs) pos)))
   
 
 (defconstant *mime-tspecials*
     '(#\( #\) #\< #\> #\@ 
       #\, #\; #\: #\\ #\" 
       #\/ #\[ #\] #\? #\=))
 
c781d4b2
 ;: parse-content-type, parse-parameters, mime-get-parameter-value
 ;: mime-get-token, blank-line-p, parse-header-line
 ;: 
f6cd6a72
 (defun skip-whitespace (string pos max)
c781d4b2
   (declare (optimize (speed 3) (safety 0))
f6cd6a72
 	   (fixnum pos max))
   (while (< pos max)
c781d4b2
     (if (not (whitespace-char-p (schar string pos)))
f6cd6a72
 	(return))
     (incf pos))
   pos)
 
c781d4b2
 ;: parse-parameters
 ;: 
f6cd6a72
 (defun mime-get-parameter-value (string pos max)
   (setf pos (skip-whitespace string pos max))
   (if* (>= pos max)
c781d4b2
      then (values "" pos)
      else (if (char= (schar string pos) #\")
f6cd6a72
 	      (mime-get-quoted-string string pos max)
 	    (mime-get-token string pos max))))
 
c781d4b2
 ;: parse-content-type, parse-parameters, mime-get-parameter-value
 ;: 
f6cd6a72
 (defun mime-get-token (string pos max)
   (setf pos (skip-whitespace string pos max))
   (let ((startpos pos)
 	char)
     (while (< pos max)
c781d4b2
       (setf char (schar string pos))
f6cd6a72
       (if (or (char= #\space char) (member char *mime-tspecials*))
 	  (return))
       (incf pos))
     (values (subseq string startpos pos) pos)))
 
 ;; Doesn't attempt to dequote
c781d4b2
 ;: mime-get-parameter-value
 ;: 
f6cd6a72
 (defun mime-get-quoted-string (string pos max)
   (let ((res (make-string (- max pos)))
 	(outpos 0)
 	char inquote inbackslash)
     (while (< pos max)
c781d4b2
       (setf char (schar string pos))
       
       (when (and (char= char #\") (not inbackslash))
 	(if* inquote
 	   then	(setf (schar res outpos) char)
 		(incf outpos)
 		(incf pos)
 		(return))
 	(setf inquote t))
f6cd6a72
       
       (if* inbackslash
c781d4b2
 	 then (setf inbackslash nil)
 	 else (if (char= char #\\)
f6cd6a72
 		  (setf inbackslash t)))
       
       (setf (schar res outpos) char)
       (incf outpos)
       (incf pos))
     
     (values (subseq res 0 outpos) pos)))
 
c781d4b2
 ;; mime-parse-multipart
 ;: 
f6cd6a72
 (defun mime-dequote (string)
   (block nil
c781d4b2
     (if (or (string= string "") (char/= (schar string 0) #\"))
f6cd6a72
 	(return string))
     
     (let* ((max (length string))
 	   (pos 1)
 	   (res (make-string max))
 	   (outpos 0)
 	   char inbackslash)
       
       (while (< pos max)
c781d4b2
 	(setf char (schar string pos))
f6cd6a72
 	
 	(if (and (char= char #\") (not inbackslash))
 	    (return))
 	
 	(if* (and (not inbackslash) (char= char #\\))
c781d4b2
 	   then	(setf inbackslash t)
f6cd6a72
 		(incf pos)
c781d4b2
 	   else	(setf (schar res outpos) char)
f6cd6a72
 		(incf outpos)
 		(incf pos)
 		(setf inbackslash nil)))
       
       (subseq res 0 outpos))))
 
c781d4b2
 ;: parse-mime-structure-1
 ;: 
f6cd6a72
 (defun setup-text-plain-part (part stream boundary pos mbox)
   (setf (mime-part-type part) "text")
   (setf (mime-part-subtype part) "plain")
   (setf (mime-part-parameters part) '(("charset" . "us-ascii")))
   (mime-parse-non-multipart part stream boundary pos mbox))
 
c781d4b2
 ;: setup-text-plain-part, parse-mime-structure-1
 ;: 
f6cd6a72
 (defun mime-parse-non-multipart (part stream boundary pos mbox)
   (let ((startpos pos))
c781d4b2
     (multiple-value-bind (size lines eof pos)
f6cd6a72
 	(read-until-boundary stream boundary pos mbox)
       
       (setf (mime-part-lines part) lines)
       (setf (mime-part-body-position part) startpos)
c781d4b2
       (setf (mime-part-body-size part) size)
f6cd6a72
       
       (values part eof pos))))
 
c781d4b2
 ;: parse-mime-structure-1
 ;: 
f6cd6a72
 (defun mime-parse-message-rfc822 (part stream boundary pos mbox)
   (let ((startpos pos))
     (multiple-value-bind (message eof pos)
 	(parse-mime-structure-1 stream boundary nil pos mbox)
       
       (setf (mime-part-message part) message)
       
       (setf (mime-part-body-position part) startpos)
       (setf (mime-part-body-size part) (- pos startpos))
       
       (values part eof pos))))
   
 
c781d4b2
 ;: parse-mime-structure-1
 ;: 
f6cd6a72
 (defun mime-parse-multipart (part stream parent-boundary pos mbox)
   (let* ((params (mime-part-parameters part))
 	 (boundary (cdr (assoc "boundary" params :test #'equalp)))
 	 (startpos pos)
 	 parts eof newpart)
     
     (setf (mime-part-boundary part) parent-boundary)
     
     ;; If boundary isn't specified.. try to compensate by using our
     ;; parent's boundary.
     (if (null boundary)
 	(setf boundary parent-boundary)
       (setf boundary (mime-dequote boundary)))
     
     ;; Locate the first boundary.
     (multiple-value-bind (ignore1 ignore2 ignore3 newpos)
 	(read-until-boundary stream boundary pos mbox)
       (declare (ignore ignore1 ignore2 ignore3))
       (setf pos newpos))
     
     (until eof
       (multiple-value-setq (newpart eof pos)
 	(parse-mime-structure-1 stream boundary
 				(equalp (mime-part-subtype part) "digest")
 				pos mbox))
       (push newpart parts))
     
     (setf (mime-part-parts part) (nreverse parts))
     
     ;; Discard everything that follows until we reach the parent-boundary.
     (multiple-value-bind (ignore1 ignore2 eof pos)
 	(read-until-boundary stream parent-boundary pos mbox)
       (declare (ignore ignore1 ignore2))
       
       (setf (mime-part-body-size part) (- pos startpos))
       
       (values part eof pos))))
c781d4b2
 
 
f6cd6a72
 ;; support
 
c781d4b2
 (defconstant *whitespace* '(#\space #\tab #\return #\newline))
 
f6cd6a72
 
c781d4b2
 ;: parse-headers
 ;: 
 (defun blank-line-p (line len)
   (declare (optimize (speed 3) (safety 0))
 	   (fixnum len))
   (= len (skip-whitespace line 0 len)))
 
 ;: parse-headers
 ;: 
 (defun parse-header-line (line len)
   (declare (optimize (speed 3) (safety 0)))
   (let ((pos 0)
 	colonpos
 	spacepos)
     (declare (fixnum len pos spacepos))
     
     (while (< pos len)
       (let ((char (schar line pos))) 
 	(when (char= char #\:)
 	  (setf colonpos pos)
 	  (return))
f6cd6a72
 	
c781d4b2
 	(if (and (null spacepos) (whitespace-char-p char))
 	    (setf spacepos pos)))
       
       (incf pos))
  
     (if (null colonpos) ;; bogus header line
 	(return-from parse-header-line))
f6cd6a72
     
c781d4b2
     (if (null spacepos)
 	(setf spacepos colonpos))
f6cd6a72
     
c781d4b2
     (if (= 0 spacepos) ;; bogus header line (no name)
 	(return-from parse-header-line))
f6cd6a72
     
c781d4b2
     (values (subseq line 0 spacepos)
 	    (subseq line (skip-whitespace line (1+ colonpos) len) len))))
 
 ;; Returns offset of end of line in buffer.  Or nil if EOF
 ;; Second value is the number of characters read (including EOL chars)
 ;; This is slower than a read-line call, but in the long run can
 ;; lead to big savings in gc time.
 ;: parse-headers, read-until-boundary, collect-message-data-from-mbox
 ;: 
 (defun mime-read-line (stream buffer)
   (declare (optimize (speed 3) (safety 0)))
   (let ((pos 0)
 	(end (length buffer))
 	(count 0)
 	char)
     (declare (fixnum pos end count))
     
     (while (and (< pos end) (setf char (read-char stream nil nil)))
       (incf count)
       (if (char= char #\newline)
 	  (return))
       (setf (schar buffer pos) char)
       (incf pos))
     
     (if* (= count 0)
        then nil ;; EOF
        else ;; Check for CR/LF combo
 	    (if (and (> pos 0) (char= (schar buffer (1- pos)) #\return))
 		(decf pos))
 	    
 	    (values pos count))))
 	    
 
 ;; Returns:
 ;; 1) headers alist
 ;; 2) # of characters composing the header and terminator.
 ;:
 ;: parse-mime-structure-1
 ;: 
 (defun parse-headers (stream mbox)
   (declare (optimize (speed 3) (safety 0)))
   (let ((count 0)
 	(line (make-array 1024 :element-type 'character))
 	headers 
 	lastcons
 	current)
     (declare (fixnum count)
 	     (dynamic-extent line))
 
     (loop
       (multiple-value-bind (end bytes)
 	  (mime-read-line stream line)
 	(declare (fixnum end))
f6cd6a72
 	
c781d4b2
 	(if (or (null end)
 		(and mbox (my-prefixp "From " line end)))
f6cd6a72
 	    (return))
c781d4b2
 
 	(incf count bytes)
f6cd6a72
 	
c781d4b2
 	(if (blank-line-p line end)
 	    (return))
f6cd6a72
 	
c781d4b2
 	(if* (whitespace-char-p (schar line 0))
 	   then ;; Continuation line
 		(if (null current)
 		    (return)) 
 	      
 		(let ((newcons (cons (subseq line 0 end) nil)))
 		  (setf (cdr lastcons) newcons)
 		  (setf lastcons newcons))
 	      
 	   else ;; Fresh header line
 		(multiple-value-bind (name value)
 		    (parse-header-line line end)
 		  (if (null name)
 		      (return)) 
 		
 		  (setf lastcons (cons value nil))
 		  (setf current (cons name lastcons))
 		  (push current headers)))))
 
     ;; Finalize strings.
     (dolist (header headers)
       (setf (cdr header) (coalesce-header header)))
     
     (values (nreverse headers) count)))
f6cd6a72
 
c781d4b2
 ;: parse-headers
 ;: 
 (defun coalesce-header (header)
f6cd6a72
   (declare (optimize (speed 3) (safety 0)))
c781d4b2
   (let ((stringlist (cdr header)))
     (if* (= (length stringlist) 1)
        then (first stringlist)
        else (let ((len 0))
 	      (declare (fixnum len))
 	      (dolist (string stringlist)
 		(incf len (1+ (the fixnum (length string)))))
 	      (decf len)
 	      (let ((res (make-string len))
 		    (pos 0)
 		    (first t))
 		(declare (fixnum pos))
 		(dolist (string stringlist)
 		  (if* first
 		     then (setf first nil)
 		     else (setf (schar res pos) #\newline)
 			  (incf pos))
 		  (dotimes (n (length string))
 		    (declare (fixnum n))
 		    (setf (schar res pos) (schar string n))
 		    (incf pos)))
 		res)))))
 
 ;; Returns: (1) size of part 
 ;;          (2) number of lines read
 ;;          (3) stop reason (:eof, :close-boundary, or nil (meaning regular
 ;;                                                          boundary)
 ;;          (4) new stream position (post boundary read)
 ;: mime-parse-multipart, mime-parse-non-multipart
 ;: 
 (defun read-until-boundary (stream boundary pos mbox)
   (declare (optimize (speed 3) (safety 0))
 	   (fixnum pos))
   (if* (and (null boundary) (null mbox))
      then 
 	  (multiple-value-bind (lines count)
 	      (count-lines-to-eof stream)
 	    (declare (fixnum count))
 	    (values count lines :eof (+ pos count)))
      else 
 	  (let ((line (make-array 16000 :element-type 'character))
 		(size 0)
 		(lines 0)
 		(stop-reason :eof)
 		delimiter close-delimiter)
 	    (declare (dynamic-extent line)
9d3f79bb
 		     (fixnum size lines))
c781d4b2
 	    
 	    (when boundary
 	      (setf delimiter (concatenate 'string "--" boundary))
 	      (setf close-delimiter (concatenate 'string delimiter "--")))
 	    
 	    (loop
 	      (multiple-value-bind (end bytes)
 		  (mime-read-line stream line)
 		(declare (fixnum end bytes))
 		
 		(if (or (null end)
 			(and mbox (my-prefixp "From " line end)))
 		    (return))
 		
 		(incf pos bytes)
 		
 		(when (my-prefixp delimiter line end)
 		  (if* (my-prefixp close-delimiter line end)
 		     then (setf stop-reason :close-boundary)
 		     else (setf stop-reason nil))
 		  (return))
 		
 		(incf size bytes)
 		(incf lines))) 
 	    
 	    (values size lines stop-reason pos))))
 
 ;; Returns:
 ;; 1) number of lines
 ;; 2) number of bytes read
 ;: read-until-boundary
 ;: 
 (defun count-lines-to-eof (stream)
   (declare (optimize (speed 3) (safety 0)))
   (let ((buffer (make-array 65536 :element-type '(unsigned-byte 8)))
 	(lines 0)
 	(pos 0)
 	(lastbyte -1)
 	(count 0)
 	end)
     (declare (dynamic-extent buffer)
 	     (fixnum lines pos end lastbyte count))
     ;; count 10's
     ;; XXX: The count will be off if the file has CR/LF convention and
     ;; there are bare LFs.  
     (loop
       (setf end (read-vector buffer stream))
       (incf count end)
       
       (if (= end 0)
f6cd6a72
 	  (return))
       
c781d4b2
       (while (< pos end)
 	(if (= (aref buffer pos) 10)
 	    (incf lines))
 	(incf pos))
f6cd6a72
       
c781d4b2
       (setf lastbyte (aref buffer (1- pos))))
f6cd6a72
     
c781d4b2
     ;; Count last partial line.
     (if (and (> lastbyte 0) (/= lastbyte 10))
 	(incf lines))
     
     (values lines count)))
f6cd6a72
 
c781d4b2
 (defun my-prefixp (prefix string &optional end)
   (declare (optimize (speed 3) (safety 0)))
   (let ((lenprefix (length prefix))
 	(end (or end (length string))))
9d3f79bb
     (declare (fixnum lenprefix end))
c781d4b2
     (when (>= end lenprefix)
       (dotimes (n lenprefix)
 	(declare (fixnum n))
 	(if (char/= (schar prefix n) (schar string n))
 	    (return-from my-prefixp)))
       t)))
 
 ;;; misc
 
 (defun stream-to-stream-copy (outstream instream count)
f6cd6a72
   (declare (optimize (speed 3))
 	   (fixnum count))
c781d4b2
   (let ((buf (make-array 4096 :element-type '(unsigned-byte 8))))
     (declare (dynamic-extent buf))
     (while (> count 0)
       (let ((got (read-sequence buf instream :end (min count 4096))))
 	(declare (fixnum got))
 	(if (zerop got)
 	    (error "Unexpected EOF while reading from ~a" instream))
 	(write-sequence buf outstream :end got)
 	(decf count got)))))
 
 ;; 'instream' must be positioned appropriately by the caller beforehand.
 (defmacro with-part-stream ((sym part instream &key (header t)) &body body)
   (let ((p (gensym))
 	(stream (gensym))
 	(count (gensym)))
     `(let* ((,p ,part)
 	    (,stream ,instream)
 	    (,count (mime-part-body-size ,p)))
        (if ,header
 	   (incf ,count (mime-part-headers-size ,p)))
        (excl:with-function-input-stream 
 	   (,sym #'stream-to-stream-copy ,stream ,count)
 	 ,@body))))
 	 
 
f6cd6a72
 
 ;;; testing
 
 #|
 (defun test-parse-mime (file &key (pretty t))
   (with-open-file (f file)
     (let ((res     (parse-mime-structure f)))
       (if pretty
 	  (pprint-rfc822-part res)
 	res))))
 
 (defun pprint-rfc822-part (thing &optional (prefix ""))
   (if (null thing)
       (error "Strange. something called pprint-rfc822-part with nil"))
   (let ((type (mime-part-type thing))
 	(subtype (mime-part-subtype thing)))
     (format t "~AHEADER ([RFC-2822] header of the message)~%" prefix)
     (format t "~ATEXT   ([RFC-2822] text body of the message) ~A/~A~%" prefix type subtype)
     
     (if* (message-rfc822-p type subtype)
        then ;; XXX .. what should the new prefix be? 
 	    (pprint-rfc822-part (mime-part-message thing) prefix)
      elseif (equalp type "multipart")
        then (pprint-multipart thing prefix))))
 
 (defun pprint-multipart (thing prefix)
   (let ((id 1))
     (dolist (part (mime-part-parts thing))
       (let ((type (mime-part-type part))
 	    (subtype (mime-part-subtype part)))
 	(format t "~a~a      ~a/~a~%" prefix id type subtype)
 	(if* (message-rfc822-p type subtype)
 	   then (pprint-rfc822-part (mime-part-message part) 
 				    (format nil "~a~a." prefix id))
 	 elseif (equalp type "multipart")
 	   then (pprint-multipart part (format nil "~a~a." prefix id)))
 	(incf id)))))
 |#