git.fiddlerwoaroof.com
Browse code

rfe7210

dancy authored on 05/06/2007 17:15:18
Showing 2 changed files
... ...
@@ -1,3 +1,7 @@
1
+2007-06-05  Ahmon Dancy  <dancy@dancy>
2
+
3
+	* rfe7210: rfc2822.cl: new extract-email-addresses function.
4
+
1 5
 2007-06-01  Ahmon Dancy  <dancy@dancy>
2 6
 
3 7
 	* mime-transfer-encoding.cl: New quoted-printable decoding utility
... ...
@@ -1,4 +1,4 @@
1
-;; -*- mode: common-lisp; package: net.post-office -*-
1
+;; -*- mode: common-lisp; package: net.mail -*-
2 2
 ;;
3 3
 ;; copyright (c) 1999-2002 Franz Inc, Berkeley, CA - All rights reserved.
4 4
 ;; copyright (c) 2002-2007 Franz Inc, Oakland, CA - All rights reserved.
... ...
@@ -14,7 +14,7 @@
14 14
 ;; merchantability or fitness for a particular purpose.  See the GNU
15 15
 ;; Lesser General Public License for more details.
16 16
 ;;
17
-;; $Id: rfc2822.cl,v 1.5 2007/06/01 16:21:38 dancy Exp $
17
+;; $Id: rfc2822.cl,v 1.6 2007/06/05 17:15:18 dancy Exp $
18 18
 
19 19
 #+(version= 8 0)
20 20
 (sys:defpatch "rfc2822" 0
... ...
@@ -22,9 +22,16 @@
22 22
   :type :system
23 23
   :post-loadable t)
24 24
 
25
+#+(version= 8 1 :beta)
26
+(sys:defpatch "rfc2822" 0
27
+  "v0: New extract-email-addresses function."
28
+  :type :system
29
+  :post-loadable t)
30
+
25 31
 (defpackage :net.mail
26 32
   (:use #:lisp #:excl)
27 33
   (:export #:parse-email-address
34
+	   #:extract-email-addresses
28 35
 	   #:valid-email-domain-p))
29 36
 
30 37
 (in-package :net.mail)
... ...
@@ -102,86 +109,15 @@ domain.
102 109
   
103 110
   (defconstant *specials* "()<>\\[\\]:;@\\,.\"")
104 111
   
105
-  (defconstant *no-ws-ctl* "\\x1-\\x8\\xb-\\xc\\xe-\\x1f\\x7f")
106
-  
107
-  (defconstant *fws* "(?:(?:[ \\t]*\\r?\\n)?[ \\t]+)")
108
-
109
-  (defconstant *text* "[^\\r\\n]")
110
-  
111
-  (defconstant *quoted-pair* (format nil "\\\\~a" *text*))
112
-
113
-  (defconstant *ctext* "[^\\s()\\\\]")
114
-
115
-  ;; 1 means (xx)
116
-  ;; 2 means (xxx (yyy) zzz)
117
-  (defconstant *max-comment-level* 2)
118
-  
119
-  (defparameter *ccontent nil)
120
-  (defparameter *comment* nil)
121
-  
122
-  (dotimes (n *max-comment-level*)
123
-    (if* (null *comment*)
124
-       then (setf *ccontent* (format nil "(?:~a|~a)" *ctext* *quoted-pair*))
125
-       else (setf *ccontent* (format nil "(?:~a|~a|~a)" 
126
-				     *ctext* *quoted-pair* *comment*)))
127
-    
128
-    (setf *comment* (format nil "\\((?:~a?~a)*~a?\\)"
129
-			    *fws* *ccontent* *fws*)))
130
-  
131
-  (defconstant *cfws* (format nil "(?:~a?~a)*(?:(?:~a?~a)|~a)"
132
-			      *fws* *comment* *fws* *comment* *fws*))
133
-  
134 112
   (defconstant *atext* 
135 113
       (format nil "[^\\s~a~a]" *controls* *specials*))
136
-  
137
-  (defconstant *atom* (format nil "~a?(~a+)~a?" *cfws* *atext* *cfws*))
138
-
139
-  (defconstant *dot-atom-text* (format nil "~a+(?:\\.~a+)*" *atext* *atext*))
140
-  
141
-  (defconstant *dot-atom* (format nil "~a?(~a)~a?" 
142
-				  *cfws* *dot-atom-text* *cfws*))
143
-  
144
-  ;; no control chars, no backslash, no quote
145
-  (defconstant *qtext* (format nil "[^~a\\\\\"]" *controls*))
146
-  
147
-  (defconstant *qcontent* (format nil "~a|~a" *qtext* *quoted-pair*))
148 114
 
149
-  (defconstant *quoted-string*
150
-      (format nil "~a?\"((?:~a?~a)*~a?)\"~a?"
151
-	      *cfws* *fws* *qcontent* *fws* *cfws*))
152
-  
153
-  (defconstant *local-part* 
154
-      (format nil "(~a)|(~a)" *dot-atom* *quoted-string*))
155
-  
156
-  (defconstant *local-part-x*
157
-      (format nil "^(?:~a)|(?:~a)" *dot-atom* *quoted-string*))
158
-  
159
-  ;; domain literals not supported.
160
-  (defconstant *domain* *dot-atom*)
161
-  
162
-  (defconstant *addr-spec* (format nil "(~a)@(~a)" *local-part* *domain*))
163
-  
164
-  (defconstant *angle-addr* (format nil "~a?<~a>~a?" 
165
-				    *cfws* *addr-spec* *cfws*))
166
-  
167
-  (defconstant *word* (format nil "(?:~a|~a)" *atom* *quoted-string*))
168
-  
169
-  (defconstant *phrase* (format nil "~a+" *word*))
170
-  
171
-  (defconstant *display-name* *phrase*)
172
-  
173
-  (defconstant *name-addr* (format nil "~a?~a" *display-name* *angle-addr*))
174
-  
175
-  (defconstant *mailbox* (format nil "(?:~a|~a)" *name-addr* *addr-spec*))
176
-  
177
-  (defconstant *mailbox-list* 
178
-      (format nil "(?:~a(?:,~a)*)" *mailbox* *mailbox*))
115
+  (defconstant *atom* (format nil "^~a+" *atext*))
179 116
 
180
-  (defconstant *group* 
181
-      (format nil "~a:(?:~a|~a)?;~a?" *display-name* *mailbox-list* *cfws*
182
-	      *cfws*))
117
+  (defconstant *dot-atom-text* (format nil "~a+(?:\\.~a+)*" *atext* *atext*))
183 118
   
184
-  ;; More strict than the RFC.
119
+  ;; More strict than the RFC, but good for verifying syntax of email
120
+  ;; addresses that a user supplies.
185 121
   
186 122
   (defconstant *email-address-re*
187 123
       (format nil "^\\s*(~a)(?:@(~a))?\\s*$" *dot-atom-text* *dot-atom-text*))
... ...
@@ -201,6 +137,284 @@ domain.
201 137
        then nil
202 138
        else (values user domain))))
203 139
 
140
+;; Returns a list of entries like so: 
141
+;;  (:mailbox display-name user domain)
142
+;;  or
143
+;;  (:group display-name mailbox-list)
144
+
145
+(defun extract-email-addresses (string &key (start 0) (end (length string))
146
+					    (require-domain t) (errorp t))
147
+  (declare (optimize (speed 3))
148
+	   (fixnum start end))
149
+  (with-underlying-simple-vector (string string disp)
150
+    (declare (simple-string string)
151
+	     (fixnum disp))
152
+    (incf start disp)
153
+    (incf end disp)
154
+    
155
+    ;; Unfold.
156
+    (when (match-re "\\r?\\n\\s" string :start start :end end)
157
+      (setf string (replace-re string "\\r?\\n\\s" " " 
158
+			       :start start :end end))
159
+      (setf start 0)
160
+      (setf end (length string)))
161
+    
162
+    (let ((res 
163
+	   (catch 'syntax-error
164
+	     (parse-address-list string start end require-domain))))
165
+      (if* (stringp res)
166
+	 then (if errorp (error res))
167
+       elseif (null res)
168
+	 then (if errorp 
169
+		  (error "Failed to parse: ~s" (subseq string start end)))
170
+	      nil
171
+	 else res))))
172
+
173
+(macrolet ((parse-special (char skip-ws)
174
+	     `(multiple-value-bind (type value newpos)
175
+		  (rfc2822-lex string start end ,skip-ws)
176
+		(declare (ignore type))
177
+		(when (eq value ,char)
178
+		  (setf start newpos)))))
179
+
180
+  ;; Supports obsolete format which allows for null members in the list.
181
+  (defun parse-address-list (string start end require-domain)
182
+    (let (res)
183
+      (loop
184
+	(while (parse-special #\, t))
185
+	(multiple-value-bind (addr newpos)
186
+	    (parse-address string start end require-domain)
187
+	  (if (null addr)
188
+	      (return))
189
+	  (setf start newpos)
190
+	  (push addr res)))
191
+      (values (nreverse res) start)))
192
+    
193
+  (defun parse-address (string start end require-domain)
194
+    (multiple-value-bind (mb newpos)
195
+	(parse-mailbox string start end require-domain)
196
+      (if* mb
197
+	 then (values mb newpos)
198
+	 else (parse-group string start end require-domain))))
199
+  
200
+  
201
+  (defun parse-mailbox (string start end require-domain)
202
+    (multiple-value-bind (ok display-name localpart domain newpos)
203
+	(parse-name-addr string start end require-domain)
204
+      (if ok
205
+	  (return-from parse-mailbox 
206
+	    (values
207
+	     (list :mailbox display-name localpart domain)
208
+	     newpos))))
209
+    (multiple-value-bind (localpart domain newpos)
210
+	(parse-addr-spec string start end require-domain)
211
+      (when localpart
212
+	(setf start newpos)
213
+	;; Check for a trailing comment and use that as the display name
214
+	(multiple-value-bind (display-name newpos)
215
+	    (grab-next-comment string start end)
216
+	  (if display-name
217
+	      (setf start newpos))
218
+	     (values
219
+	      (list :mailbox display-name localpart domain)
220
+	      start)))))
221
+  
222
+  (defun grab-next-comment (string start end)
223
+    (loop
224
+      (multiple-value-bind (type value newpos)
225
+	  (rfc2822-lex string start end nil)
226
+	(if (eq type :comment)
227
+	    (return (values (replace-re value "^\\((.*)\\)$" "\\1") newpos)))
228
+	(if* (eq type :wsp)
229
+	   then (setf start newpos)
230
+	   else (return)))))
231
+	
232
+  (defun parse-group (string start end require-domain)
233
+    (multiple-value-bind (display-name newpos)
234
+	(parse-phrase string start end)
235
+      (when display-name
236
+	(setf start newpos)
237
+	(when (parse-special #\: t)
238
+	  (multiple-value-bind (mailbox-list newpos)
239
+	      (parse-mailbox-list string start end require-domain)
240
+	    (setf start newpos)
241
+	    (when (parse-special #\; t)
242
+	      (values :group mailbox-list newpos)))))))
243
+
244
+  (defun parse-mailbox-list (string start end require-domain)
245
+    (let (res)
246
+      (loop
247
+	(multiple-value-bind (mailbox newpos)
248
+	    (parse-mailbox string start end require-domain)
249
+	  (if (null mailbox)
250
+	      (return))
251
+	  (push mailbox res)
252
+	  (setf start newpos)
253
+	  (if (not (parse-special #\, t))
254
+	      (return))))
255
+      (values (nreverse res) start)))
256
+  
257
+  (defun parse-name-addr (string start end require-domain)
258
+    (multiple-value-bind (display-name newpos)
259
+	(parse-phrase string start end)
260
+      (if display-name
261
+	  (setf start newpos))
262
+      (multiple-value-bind (localpart domain newpos)
263
+	  (parse-angle-addr string start end require-domain)
264
+	(when localpart
265
+	  (values t display-name localpart domain newpos)))))
266
+  
267
+  ;; This is obs-phrase, which is seen often.  For example:
268
+  ;; From: Mr. T <mr.t@pitythefool.com>
269
+  (defun parse-phrase (string start end)
270
+    (let ((first t)
271
+	  res type value newpos)
272
+      (loop
273
+	(multiple-value-setq (type value newpos)
274
+	  (rfc2822-lex string start end first))
275
+	(if (null type)
276
+	    nil)
277
+	(if* (or (eq type :atom)
278
+		 (eq type :quoted-string)
279
+		 (and (not first) (or (eq value #\.) (eq type :wsp))))
280
+	   then (push value res)
281
+		(setf first nil)
282
+		(setf start newpos)
283
+	   else (return)))
284
+      (if (and res (match-re "^\\s" (first res)))
285
+	  (pop res))
286
+      (if res 
287
+	  (values (list-to-delimited-string (nreverse res) "") start))))
288
+    
289
+  (defun parse-angle-addr (string start end require-domain)
290
+    (when (parse-special #\< t)
291
+      (multiple-value-bind (localpart domain newpos)
292
+	  (parse-addr-spec string start end require-domain)
293
+	(setf start newpos)
294
+	(when (and localpart (parse-special #\> t))
295
+	  (values localpart domain start)))))
296
+  
297
+  (defun parse-addr-spec (string start end require-domain)
298
+    (multiple-value-bind (localpart newpos)
299
+	(parse-local-part string start end)
300
+      (when localpart
301
+	(setf start newpos)
302
+	(when (not (parse-special #\@ t))
303
+	  (if* require-domain
304
+	     then (return-from parse-addr-spec)
305
+	     else (return-from parse-addr-spec 
306
+		    (values localpart nil start))))
307
+	(multiple-value-bind (domain newpos)
308
+	    (parse-dot-atom string start end)
309
+	  (when domain
310
+	    (values localpart domain newpos)))))))
311
+
312
+(defun parse-local-part (string start end)
313
+  (multiple-value-bind (type value newpos)
314
+      (rfc2822-lex string start end t)
315
+    (if* (eq type :quoted-string)
316
+       then (values value newpos)
317
+     elseif (eq type :atom)
318
+       then (parse-dot-atom string start end))))
319
+
320
+(defun parse-dot-atom (string start end)
321
+  (let ((first t)
322
+	res)
323
+    (loop
324
+      (multiple-value-bind (type value newpos)
325
+	  (rfc2822-lex string start end first)
326
+	(setf first nil)
327
+	(if (null type)
328
+	    (return))
329
+	(if* (eq type :atom)
330
+	   then (push value res)
331
+	 elseif (not (eq value #\.))
332
+	   then (return))
333
+	(setf start newpos)))
334
+    (if res
335
+	(values (list-to-delimited-string (nreverse res) #\.) start))))
336
+
337
+(eval-when (compile)
338
+  (defconstant *max-comment-nesting* 3)
339
+  
340
+  (defparameter *cchar* "(?:[^()\\\\]|\\\\.)")
341
+  (defparameter *comment* nil)
342
+  
343
+  (dotimes (n *max-comment-nesting*)
344
+    (if* *comment*
345
+       then (setf *comment* (format nil "(?:\\((?:~a|~a)*\\))"
346
+				   *cchar* *comment*))
347
+       else (setf *comment* (format nil "(?:\\(~a*\\))" *cchar*))))
348
+  
349
+  (setf *comment* (format nil "^~a" *comment*)))
350
+
351
+(defun rfc2822-lex (string start end skip-ws)
352
+  (declare (optimize (speed 3))
353
+	   (simple-string string)
354
+	   (fixnum start end))
355
+  (when (< start end)
356
+    (let ((char (schar string start)))
357
+      (if* (eq char #\") 
358
+	 then ;; quoted string.
359
+	      (multiple-value-bind (matched whole)
360
+		  (match-re "^\"((?:[^\\\\\"]|\\\\.)*)\"" string
361
+			    :start start :end end
362
+			    :return :index)
363
+		(if (not matched)
364
+		    (throw 'syntax-error "Unterminated quoted string"))
365
+		(values :quoted-string 
366
+			(subseq string (car whole) (cdr whole))
367
+			(cdr whole)))
368
+       elseif (or (eq char #\space) (eq char #\tab))
369
+	 then ;; whitespace
370
+	      (multiple-value-bind (x match)
371
+		  (match-re "^\\s+" string 
372
+			    :start start :end end :return :index)
373
+		(declare (ignore x))
374
+		(if* skip-ws
375
+		   then (rfc2822-lex string (cdr match) end t)
376
+		   else (values :wsp
377
+				(subseq string (car match) (cdr match))
378
+				(cdr match))))
379
+       elseif (eq char #\()
380
+	 then ;; comment
381
+	      (multiple-value-bind (matched whole)
382
+		  (match-re #.*comment* string
383
+			    :start start :end end :return :index)
384
+		(if (not matched)
385
+		    (throw 'syntax-error 
386
+		      "Unterminated comment or nesting too deep"))
387
+		(if* skip-ws
388
+		   then (rfc2822-lex string (cdr whole) end t)
389
+		   else (values :comment
390
+				(subseq string (car whole) (cdr whole))
391
+				(cdr whole))))
392
+	 else (multiple-value-bind (matched whole)
393
+		  (match-re *atom* string :start start :end end 
394
+			    :return :index)
395
+		(if* (not matched)
396
+		   then ;; must be a special
397
+			(values :special
398
+				char
399
+				(1+ start))
400
+		   else ;; atom
401
+			(values :atom
402
+				(subseq string (car whole) (cdr whole))
403
+				(cdr whole))))))))
404
+
405
+#+ignore
406
+(defun test ()
407
+  (dolist (file (command-output "find ~/mail/ -name \"[0-9][0-9]*\""))    
408
+    (with-open-file (f file)
409
+      (let* ((part (parse-mime-structure f))
410
+	     (hdrs (mime-part-headers part)))
411
+	(dolist (type '("From" "To" "Cc"))
412
+	  (let ((hdr (cdr (assoc type hdrs :test #'equalp))))
413
+	    (when hdr
414
+	      (if (null (extract-email-addresses hdr :require-domain nil
415
+						 :errorp nil))
416
+		  (format t "Failed to parse: ~s~%" hdr)))))))))
417
+
204 418
 ;; Ripped from maild:dns.cl and modified.
205 419
 
206 420
 (eval-when (compile load eval)