Browse code
Add flexible parsing framework
Ed Langley authored on 28/01/2019 11:02:34
Showing 2 changed files
Showing 2 changed files
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,50 @@ |
1 |
+(defpackage :access-log-reader |
|
2 |
+ (:shadowing-import-from :multi-fun :rest) |
|
3 |
+ (:shadowing-import-from :data-lens :pick :defun-ct) |
|
4 |
+ (:use :cl :multi-fun :data-lens :alexandria :fw.lu) |
|
5 |
+ (:export )) |
|
6 |
+(in-package :access-log-reader) |
|
7 |
+ |
|
8 |
+(defun make-timestamp (record) |
|
9 |
+ (let ((local-time:*default-timezone* local-time:+utc-zone+)) |
|
10 |
+ (destructuring-bind (day month year time) record |
|
11 |
+ (local-time:with-decoded-timestamp (:hour hour :minute minute :sec second :nsec nsec) time |
|
12 |
+ (local-time:encode-timestamp nsec second minute hour day month year))))) |
|
13 |
+ |
|
14 |
+(defun extract-query-params (s) |
|
15 |
+ (serapeum:mapply 'cons |
|
16 |
+ (map 'list |
|
17 |
+ (serapeum:op (coerce (fwoar.string-utils:split #\= _ :count 2) 'list)) |
|
18 |
+ (fwoar.string-utils:split #\& s)))) |
|
19 |
+(defparameter +access-log-parser+ |
|
20 |
+ (list (delimited-field #\:) |
|
21 |
+ (delimited-field #\space) |
|
22 |
+ (as (delimited-field #\space) (lambda (s) (subseq s 1 (1- (length s))))) |
|
23 |
+ (as (delimited-field #\space) (lambda (s) (unless (equal s "-") (parse-integer s)))) |
|
24 |
+ (as (delimited-field #\space) (lambda (s) (unless (equal s "-") (parse-integer s)))) |
|
25 |
+ (as (delimited-field #\space) #'parse-integer) |
|
26 |
+ (as (delimited-field #\space) (lambda (s) (unless (equal s "-") (parse-integer s)))) |
|
27 |
+ (as (subformat (v (ignore-char #\[) |
|
28 |
+ (as (delimited-field #\/) #'parse-integer) |
|
29 |
+ (as (delimited-field #\/) (serapeum:op (position _ local-time:+short-month-names+ :test 'equal))) |
|
30 |
+ (as (delimited-field #\:) #'parse-integer) |
|
31 |
+ (as (delimited-field #\]) (lambda (x) (local-time:parse-timestring (remove #\space x) :allow-missing-date-part t))))) |
|
32 |
+ 'make-timestamp) |
|
33 |
+ (whitespace) |
|
34 |
+ (as (delimited-field #\space) (lambda (s) (subseq s 1))) |
|
35 |
+ (splat-result (as (delimited-field #\space) |
|
36 |
+ (alexandria:compose |
|
37 |
+ (data-lens:juxt 'quri:uri-path |
|
38 |
+ (alexandria:compose 'extract-query-params 'quri:uri-query)) |
|
39 |
+ 'quri:uri))) |
|
40 |
+ (delimited-field #\") |
|
41 |
+ (delimited-field #\space) |
|
42 |
+ (delimited-field #\space) |
|
43 |
+ (delimited-field #\space) |
|
44 |
+ (ignore-char #\") |
|
45 |
+ (as (delimited-field #\") 'quri:uri) |
|
46 |
+ (whitespace) |
|
47 |
+ (rest))) |
|
48 |
+ |
|
49 |
+(defun parse-log (f) |
|
50 |
+ (parse-file +access-log-parser+ f)) |
0 | 51 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,219 @@ |
1 |
+(defpackage :multi-fun |
|
2 |
+ (:shadow :rest) |
|
3 |
+ (:use :cl ) |
|
4 |
+ (:export |
|
5 |
+ #:filter-functions |
|
6 |
+ #:number-recognizer |
|
7 |
+ #:delimited-field |
|
8 |
+ #:ignore-char |
|
9 |
+ #:whitespace |
|
10 |
+ #:iso-8601-timestamp |
|
11 |
+ #:parse-qs |
|
12 |
+ #:rest |
|
13 |
+ #:as |
|
14 |
+ #:splat-result |
|
15 |
+ #:defun |
|
16 |
+ #:month-recognizer |
|
17 |
+ #:take-field |
|
18 |
+ #:parse-format |
|
19 |
+ #:parse-file |
|
20 |
+ #:subformat |
|
21 |
+ #:treeify)) |
|
22 |
+(in-package :multi-fun) |
|
23 |
+ |
|
24 |
+(defun filter-functions (funs list) |
|
25 |
+ (loop |
|
26 |
+ for fun in funs |
|
27 |
+ for results = (loop |
|
28 |
+ for value in list |
|
29 |
+ for (result . num) = (multiple-value-list (funcall fun value)) |
|
30 |
+ when result |
|
31 |
+ collect (list (car num) result)) |
|
32 |
+ when (and results (= (length results) (length list))) |
|
33 |
+ collect fun)) |
|
34 |
+ |
|
35 |
+(defun number-recognizer (range) |
|
36 |
+ (destructuring-bind (min max) |
|
37 |
+ (etypecase range |
|
38 |
+ (integer (list 0 range)) |
|
39 |
+ (list range)) |
|
40 |
+ (flet ((in-range-p (n) |
|
41 |
+ (< min n (1+ max)))) |
|
42 |
+ (lambda (str) |
|
43 |
+ (loop for digits |
|
44 |
+ from (min (length str) |
|
45 |
+ (ceiling (log max 10))) |
|
46 |
+ downto (if (<= min 0) |
|
47 |
+ 1 |
|
48 |
+ (max 1 |
|
49 |
+ (floor (log min 10)))) |
|
50 |
+ for (result . rest) = (multiple-value-list (parse-integer str :end digits :junk-allowed t)) |
|
51 |
+ for relevant-result = (and result (in-range-p result)) |
|
52 |
+ until relevant-result |
|
53 |
+ finally (when relevant-result |
|
54 |
+ (return (values-list |
|
55 |
+ (cons result rest))))))))) |
|
56 |
+ |
|
57 |
+(defun delimited-field (delimiter) |
|
58 |
+ (lambda (str) |
|
59 |
+ (alexandria:if-let ((val (position delimiter str))) |
|
60 |
+ (values (subseq str 0 val) |
|
61 |
+ (min (length str) |
|
62 |
+ (1+ val))) |
|
63 |
+ (values str |
|
64 |
+ (length str))))) |
|
65 |
+ |
|
66 |
+(defun ignore-char (char) |
|
67 |
+ (lambda (str) |
|
68 |
+ (when (char= (elt str 0) char) |
|
69 |
+ (values 'drop 1)))) |
|
70 |
+ |
|
71 |
+(defun whitespace () |
|
72 |
+ (lambda (str) |
|
73 |
+ (values 'drop |
|
74 |
+ (position-if-not 'serapeum:whitespacep |
|
75 |
+ str)))) |
|
76 |
+ |
|
77 |
+(defun iso-8601-timestamp () |
|
78 |
+ (lambda (str) |
|
79 |
+ (values (local-time:parse-timestring (subseq str 0 32)) |
|
80 |
+ 32))) |
|
81 |
+ |
|
82 |
+(defun parse-qs (str) |
|
83 |
+ (serapeum:mapply 'cons |
|
84 |
+ (map 'list |
|
85 |
+ (serapeum:op (coerce (fwoar.string-utils:split #\= _ :count 2) 'list)) |
|
86 |
+ (fwoar.string-utils:split #\& str)))) |
|
87 |
+ |
|
88 |
+(defun rest (&optional (as #'identity)) |
|
89 |
+ (lambda (str) |
|
90 |
+ (values (funcall as str) |
|
91 |
+ (length str)))) |
|
92 |
+ |
|
93 |
+(defun as (fun transform) |
|
94 |
+ (flet ((do-transform (val) (funcall transform val))) |
|
95 |
+ (lambda (str) |
|
96 |
+ (fw.lu:transform-first-value (do-transform (funcall fun str)))))) |
|
97 |
+ |
|
98 |
+(defun splat-result (fun) |
|
99 |
+ (flet ((splatize (val) (list* 'splat (alexandria:ensure-list val)))) |
|
100 |
+ (lambda (str) |
|
101 |
+ (fw.lu:transform-first-value (splatize (funcall fun str)))))) |
|
102 |
+ |
|
103 |
+(defun month-recognizer (str) |
|
104 |
+ (macrolet ((match-char (elt &body body) |
|
105 |
+ (let ((cases (mapcar (serapeum:op |
|
106 |
+ (if (listp _1) |
|
107 |
+ `(,(elt (car _1) elt) (match-char ,(1+ elt) ,@_1)) |
|
108 |
+ `(,(elt _1 elt) (when (alexandria:starts-with-subseq ,_1 str) |
|
109 |
+ ,_1)))) |
|
110 |
+ body))) |
|
111 |
+ `(when (> (length str) ,elt) |
|
112 |
+ (case (elt str ,elt) |
|
113 |
+ ,@cases))))) |
|
114 |
+ (alexandria:if-let ((value (match-char 0 |
|
115 |
+ ("April" "August") |
|
116 |
+ "December" |
|
117 |
+ "February" |
|
118 |
+ ("January" ("July" "June")) |
|
119 |
+ (("March" "May")) |
|
120 |
+ "November" |
|
121 |
+ "October" |
|
122 |
+ "September"))) |
|
123 |
+ (values value (length value)) |
|
124 |
+ (values nil 0)))) |
|
125 |
+ |
|
126 |
+(defun take-field (fun strs) |
|
127 |
+ (loop for str in strs |
|
128 |
+ for (val chars-read) = (multiple-value-list (funcall fun str)) |
|
129 |
+ do (format t "~&VAL: ~s CHARS-READ: ~s" val chars-read) |
|
130 |
+ collect (list val (subseq str chars-read)))) |
|
131 |
+ |
|
132 |
+(defun parse-format (funs str) |
|
133 |
+ (loop |
|
134 |
+ for line = str then (subseq line chars-read) |
|
135 |
+ for fun in funs |
|
136 |
+ for (val chars-read) = (multiple-value-list (funcall fun line)) |
|
137 |
+ for total-chars = chars-read then (+ total-chars chars-read) |
|
138 |
+ if (and (consp val) (eql (car val) 'splat)) append (cdr val) into result |
|
139 |
+ else when (not (eql val 'drop)) collect val into result |
|
140 |
+ finally (return (values result |
|
141 |
+ total-chars)))) |
|
142 |
+ |
|
143 |
+(defmacro subformat ((v-sym &rest parsers) &body transform) |
|
144 |
+ `(flet ((transform (,v-sym) |
|
145 |
+ ,@(if transform |
|
146 |
+ transform |
|
147 |
+ (list v-sym)))) |
|
148 |
+ (lambda (str) |
|
149 |
+ (fw.lu:transform-first-value (transform (parse-format (list ,@parsers) str)))))) |
|
150 |
+ |
|
151 |
+(defun parse-file (format file &optional (get-record #'read-line)) |
|
152 |
+ (let* ((records ()) |
|
153 |
+ (cur records)) |
|
154 |
+ (flet ((collect-record (record) |
|
155 |
+ (let ((new-cdr (list record))) |
|
156 |
+ (if cur |
|
157 |
+ (setf (cdr cur) new-cdr) |
|
158 |
+ (setf records new-cdr)) |
|
159 |
+ (setf cur new-cdr)))) |
|
160 |
+ (loop (let* ((line (funcall get-record file nil 'eof))) |
|
161 |
+ (when (eql line 'eof) |
|
162 |
+ (return records)) |
|
163 |
+ (with-simple-restart (skip-record "Skip line ~s" line) |
|
164 |
+ (collect-record (coerce (parse-format format line) |
|
165 |
+ 'vector)))))))) |
|
166 |
+ |
|
167 |
+(defun treeify (strings) |
|
168 |
+ (declare (optimize (speed 3)) |
|
169 |
+ (inline data-lens:over data-lens:transform-tail data-lens:applicable-when data-lens:of-min-length |
|
170 |
+ data-lens:on data-lens:over data-lens:slice data-lens:compress-runs |
|
171 |
+ data-lens:combine-matching-lists data-lens:juxt data-lens:element data-lens:sorted)) |
|
172 |
+ (let* ((strip-prefixes |
|
173 |
+ (alexandria:compose |
|
174 |
+ (data-lens:over |
|
175 |
+ (data-lens:transform-tail |
|
176 |
+ (data-lens:over |
|
177 |
+ (data-lens:transform-head (data-lens:slice 1))))) |
|
178 |
+ (data-lens:compress-runs |
|
179 |
+ :collector 'data-lens:combine-matching-lists))) |
|
180 |
+ (extract-keys-and-sort |
|
181 |
+ (alexandria:compose (data-lens:over |
|
182 |
+ (data-lens:juxt (alexandria:compose |
|
183 |
+ (data-lens:element 0) |
|
184 |
+ (data-lens:element 0)) |
|
185 |
+ 'identity)) |
|
186 |
+ (data-lens:sorted 'char< |
|
187 |
+ :key (alexandria:compose |
|
188 |
+ (data-lens:element 0) |
|
189 |
+ (data-lens:element 0))))) |
|
190 |
+ (recurse |
|
191 |
+ (data-lens:over |
|
192 |
+ (data-lens:transform-tail |
|
193 |
+ (data-lens:applicable-when |
|
194 |
+ (lambda (x) |
|
195 |
+ (if (equal (caar x) "") |
|
196 |
+ (cons (cons nil (cdar x)) |
|
197 |
+ (treeify (cdr x))) |
|
198 |
+ (treeify x))) |
|
199 |
+ (data-lens:of-min-length 2))))) |
|
200 |
+ (step (data-lens:on strip-prefixes extract-keys-and-sort))) |
|
201 |
+ |
|
202 |
+ (funcall (alexandria:compose recurse step) strings))) |
|
203 |
+ |
|
204 |
+ |
|
205 |
+ |
|
206 |
+ |
|
207 |
+(defparameter +months+ |
|
208 |
+ '("January" |
|
209 |
+ "February" |
|
210 |
+ "March" |
|
211 |
+ "April" |
|
212 |
+ "May" |
|
213 |
+ "June" |
|
214 |
+ "July" |
|
215 |
+ "August" |
|
216 |
+ "September" |
|
217 |
+ "October" |
|
218 |
+ "November" |
|
219 |
+ "December")) |