git.fiddlerwoaroof.com
Browse code

Initial version

fiddlerwoaroof authored on 14/08/2017 06:39:16
Showing 4 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,16 @@
1
+;;; -*- Mode:Lisp; Syntax:ANSI-Common-Lisp; Package: ASDF-USER -*-
2
+(in-package :asdf-user)
3
+
4
+(defsystem :html-sanitizer 
5
+  :description ""
6
+  :author "Ed L <edward@elangley.org>"
7
+  :license "MIT"
8
+  :depends-on (:alexandria
9
+               :uiop
10
+               :serapeum
11
+               :plump
12
+               :lquery
13
+               :fiveam)
14
+  :serial t
15
+  :components ((:file "package")
16
+               (:file "html-sanitizer")))
0 17
new file mode 100644
1 18
Binary files /dev/null and b/html-sanitizer.fasl differ
2 19
new file mode 100644
... ...
@@ -0,0 +1,109 @@
1
+(in-package :html-sanitizer)
2
+
3
+(defparameter *serialization-mode* :xml)
4
+(defparameter +plump-dont-self-close-tags+ '("span" "div" "iframe" "script"))  ; insert more tags if needed
5
+
6
+(defmethod plump:serialize-object :around ((node plump:element))
7
+  (let ((tag-name (plump:tag-name node)))
8
+    (if (and (eq *serialization-mode* :html)
9
+             (= 0 (length (plump:children node)))
10
+             (member tag-name +plump-dont-self-close-tags+ :test #'string-equal))
11
+        (progn
12
+          (format plump:*stream* "<~A" tag-name)
13
+          (plump:serialize (plump:attributes node) plump:*stream*)
14
+          (format plump:*stream* "></~A>" tag-name))
15
+        (call-next-method node))))
16
+
17
+(defparameter +safe-tags+
18
+  (list "a" "abbr" "acronym" "address" "area" "article" "aside"
19
+        "audio" "b" "bdi" "bdo" "big" "blink" "blockquote" "body" "br"
20
+        "caption" "center" "cite" "code" "col" "colgroup" "content"
21
+        "data" "datalist" "dd" "decorator" "del" "details" "dfn" "dir"
22
+        "div" "dl" "dt" "element" "em" "fieldset" "figcaption"
23
+        "figure" "font" "footer" "form" "h1" "h2" "h3" "h4" "h5" "h6"
24
+        "head" "header" "hgroup" "hr" "html" "i" "img" "ins" "kbd"
25
+        "label" "legend" "li" "main" "map" "mark" "marquee" "menu"
26
+        "menuitem" "meter" "nav" "nobr" "ol" "optgroup" "option"
27
+        "output" "p" "pre" "progress" "q" "rp" "rt" "ruby" "s" "samp"
28
+        "section" "select" "shadow" "small" "source" "spacer" "span"
29
+        "strike" "strong" "sub" "summary" "sup" "table" "tbody" "td"
30
+        "template" "textarea" "tfoot" "th" "thead" "time" "tr" "track"
31
+        "tt" "u" "ul" "var" "video" "wbr"))
32
+
33
+
34
+(defparameter +safe-attrs+
35
+  (list "accept" "action" "align" "alt" "autocomplete" "background"
36
+        "bgcolor" "border" "cellpadding" "cellspacing" "checked" "cite"
37
+        "class" "clear" "color" "cols" "colspan" "coords" "datetime"
38
+        "default" "dir" "disabled" "download" "enctype" "face" "for"
39
+        "headers" "height" "hidden" "high" "href" "hreflang" "id"
40
+        "ismap" "label" "lang" "list" "loop" "low" "max" "maxlength"
41
+        "media" "method" "min" "multiple" "name" "noshade" "novalidate"
42
+        "nowrap" "open" "optimum" "pattern" "placeholder" "poster"
43
+        "preload" "pubdate" "radiogroup" "readonly" "rel" "required"
44
+        "rev" "reversed" "role" "rows" "rowspan" "spellcheck" "scope"
45
+        "selected" "shape" "size" "span" "srclang" "start" "src" "step"
46
+        "summary" "tabindex" "title" "type" "usemap" "valign" "value"
47
+        "width" "xmlns"))
48
+
49
+(defparameter *comment-mode* :strip
50
+  ;;TODO: strip-conditional to only strip conditional
51
+  ;;      comments (e.g. <!--[if ...]>...<![endif]-->)
52
+  )
53
+
54
+(defgeneric sanitize-node (node)
55
+  (:documentation "")
56
+  (:method (node)
57
+    node)
58
+
59
+  (:method ((node plump:comment))
60
+    (plump:remove-child node))
61
+
62
+  (:method :after ((node plump:element))
63
+           (map nil #'sanitize-node
64
+                (plump:children node)))
65
+
66
+  (:method ((node plump:element))
67
+    (if (member (plump:tag-name node)
68
+                +safe-tags+
69
+                :test 'equalp)
70
+        (progn
71
+          (loop for attr being the hash-keys in (plump:attributes node)
72
+             unless (member attr +safe-attrs+ :test 'equalp) do
73
+               (plump:remove-attribute node attr))
74
+          node)
75
+        (plump:remove-child node))))
76
+
77
+(defun sanitize (html)
78
+  (let ((result (plump:parse html))
79
+        (*serialization-mode* :html))
80
+    (map nil
81
+         #'sanitize-node 
82
+         (plump:children result))
83
+    (plump:serialize result nil)))
84
+
85
+(defpackage :html-sanitizer.test
86
+  (:use :cl :fiveam))
87
+(in-package :html-sanitizer.test)
88
+
89
+(eval-when (:compile-toplevel :load-toplevel :execute)
90
+  (import 'html-sanitizer::sanitize))
91
+
92
+(def-suite :html-sanitizer)
93
+(in-suite :html-sanitizer)
94
+
95
+(test removes-script-tags
96
+  (is (equal "<div></div>"
97
+             (sanitize "<div><script></script></div>"))))
98
+
99
+(test removes-style-tags
100
+  (is (equal "<div></div>"
101
+             (sanitize "<div><style></style></div>"))))
102
+
103
+(test removes-style-attrs
104
+  (is (equal "<div></div>"
105
+             (sanitize "<div style=\"a: 1\"></div>"))))
106
+
107
+(test removes-comments
108
+  (is (equal ""
109
+             (sanitize "<!-- -->"))))
0 110
new file mode 100644
... ...
@@ -0,0 +1,5 @@
1
+(cl:in-package :cl-user)
2
+
3
+(defpackage :html-sanitizer
4
+  (:use :cl :alexandria :serapeum)
5
+  (:export :sanitize))