git.fiddlerwoaroof.com
Browse code

Add optimized split function that acts like python

Implement the equivalent of str.split in lisp. Optimized a bit for
speed: significantly faster than cl-ppcre or the serapeum equivalent.

fiddlerwoaroof authored on 05/04/2017 08:35:28
Showing 1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,167 @@
1
+(defpackage :string-utils
2
+  (:use :cl))
3
+(in-package :string-utils)
4
+
5
+(deftype array-length ()
6
+  `(integer 0 ,array-dimension-limit))
7
+(deftype array-index ()
8
+  `(integer 0 ,(1- array-dimension-limit)))
9
+
10
+(defun get-part-modifier (char string &optional count)
11
+  (declare (optimize #+dev (debug 3) #-dev (speed 3))
12
+	   (type character char)
13
+	   (type string string))
14
+  (flet ((count-splits (string)
15
+	   (declare (optimize (speed 3))
16
+		    (type simple-string string))
17
+	   (do* ((x (the array-length 0) (1+ x))
18
+		 (cur-char #1=(aref string x) #1#)
19
+		 (result (the array-length 0) (if (char= cur-char char)
20
+						  (1+ result)
21
+						  result)))
22
+		((= x (1- (length string))) (1+ result))
23
+	     (declare (type array-length result)))))
24
+    (typecase string
25
+      ((and string (not simple-string))
26
+       (setf string (copy-seq string))))
27
+    (unless count
28
+      (setf count (count-splits string))))
29
+  (let ((parts (make-array count :initial-element nil :element-type '(or string null)))
30
+	(start-idx 0)
31
+	(target-spot 0))
32
+    (prog1 parts
33
+      (dotimes (end-idx (length string))
34
+	(when (typecase string
35
+		(simple-string (char= (aref string end-idx) char))
36
+		(string (char= (aref string end-idx) char)))
37
+	  (setf (aref parts target-spot)
38
+		(make-array (- end-idx start-idx) :displaced-to string :displaced-index-offset start-idx
39
+			    :element-type 'character))
40
+	  (incf target-spot)
41
+	  (setf start-idx (1+ end-idx))
42
+	  (when (= target-spot (1- count))
43
+	    (return))))
44
+      (when (<= start-idx (length string))
45
+	(setf (aref parts target-spot)
46
+	      (make-array (- (length string) start-idx)
47
+			  :displaced-to string :displaced-index-offset start-idx
48
+			  :element-type 'character))))))
49
+
50
+;; TODO: implement test
51
+(defun %split-on-char (divider string &key count (test nil))
52
+  (declare (optimize #+dev (debug 3) (speed 3) (space 2))
53
+	   (type (or null array-length) count)
54
+	   (type (or null function symbol) test)
55
+	   (type character divider)
56
+	   (type string string))
57
+  (typecase string
58
+    ((and string (not simple-string))
59
+     (setf string (copy-seq string))))
60
+  (check-type string simple-string)
61
+
62
+  (flet ((count-splits (string)
63
+	   (declare (optimize (speed 3))
64
+		    (type simple-string string))
65
+	   (do* ((x (the array-length 0) (1+ x))
66
+		 (cur-char #1=(aref string x) #1#)
67
+		 (result (the array-length 0) (if (char= cur-char divider)
68
+						  (1+ result)
69
+						  result)))
70
+		((= x (1- (length string))) (1+ result))
71
+	     (declare (type array-length result))))
72
+	 (find-pos (start-pos)
73
+	   (declare (optimize (speed 3))
74
+		    (type array-index start-pos))
75
+	   (etypecase test
76
+	     (function (position divider string :start start-pos :test test))
77
+	     (null (position divider string :start start-pos))
78
+	     (symbol (position divider string :start start-pos :test (symbol-function test))))))
79
+
80
+    (unless count
81
+      (setf count (count-splits string)))
82
+
83
+    (check-type count array-length)
84
+    (let ((parts (make-array (1+ count) :fill-pointer 0))
85
+	  (start-pos (the fixnum 0)))
86
+      (declare (dynamic-extent start-pos))
87
+      (prog1 parts
88
+	(loop 
89
+	   for end-pos = (find-pos start-pos)
90
+	   while end-pos 
91
+	   for num-parts from 1 below count
92
+	   do
93
+	     (vector-push (subseq string start-pos end-pos) parts)
94
+	     (setf start-pos (1+ end-pos))
95
+	   finally
96
+	     (when (< start-pos (length string))
97
+	       (vector-push (subseq string start-pos)
98
+			    parts)))))))
99
+
100
+(defmacro twice (&body body)
101
+  `(progn ,@body
102
+	  ,@body))
103
+
104
+
105
+(defun %split-on-string (divider string &key count (test nil))
106
+  (declare (optimize #+dev (debug 3) (speed 3))
107
+	   (type string divider string)
108
+	   (type (or null function symbol) test)
109
+	   (type (or null array-index) count))
110
+  (flet ((%search (start-pos)
111
+	   (declare (optimize (speed 3))
112
+		    (type array-index start-pos)
113
+		    (inline))
114
+	   (typecase divider
115
+	     (simple-string (typecase string
116
+			      (simple-string (search divider string :start2 start-pos))
117
+			      (string (search divider string :start2 start-pos))))
118
+	     (string (search divider string :start2 start-pos))))
119
+	 (%search-with-test (start-pos test)
120
+	   (declare (optimize (speed 3))
121
+		    (type array-index start-pos)
122
+		    (type function test)
123
+		    (inline))
124
+	   (typecase divider
125
+	     (simple-string (typecase string
126
+			      (simple-string (search divider string :start2 start-pos :test test))
127
+			      (string (search divider string :start2 start-pos :test test))))
128
+	     (string (search divider string :start2 start-pos :test test)))))
129
+    (declare (dynamic-extent (function %search)
130
+			     (function %search-with-test)))
131
+    (let ((num-parts (the array-length 0))
132
+	  (pattern-length (the array-length (length divider)))
133
+	  (search-test (typecase test
134
+			 (function test)
135
+			 (null)
136
+			 (symbol (symbol-function test))))
137
+	  (parts (make-array (if count count 100)
138
+			     :adjustable t
139
+			     :fill-pointer 0))
140
+	  (start-pos 0))
141
+      (loop 
142
+	 for end-pos = (typecase search-test
143
+			 (function (%search-with-test start-pos search-test))
144
+			 (null (%search start-pos)))
145
+	 do
146
+	   (vector-push-extend (subseq string start-pos end-pos) parts) 
147
+	   (incf (the array-length num-parts))
148
+	 while end-pos
149
+	 do (setf start-pos (the array-length (+ pattern-length end-pos)))
150
+	 until (and count (>= (1+ num-parts) count))
151
+	 finally
152
+	   (when (and count end-pos)
153
+	     (vector-push-extend (subseq string (+ pattern-length end-pos)) parts))
154
+	   (return parts)))))
155
+
156
+(defun split (divider string &key count (test nil))
157
+  (declare (optimize #+dev (debug 3) (speed 3) (space 3)))
158
+  (unless test
159
+    (setf test
160
+	  (typecase divider
161
+	    (string 'equal)
162
+	    (t 'eql))))
163
+  (etypecase divider
164
+    (character (%split-on-char divider string :count count :test test))
165
+    (string (if (= 1 (length divider))
166
+		(%split-on-char (aref divider 0) string :count count :test test)
167
+		(%split-on-string divider string :count count :test test)))))