git.fiddlerwoaroof.com
feed-archive.lisp
25a661c3
 (in-package :alimenta.feed-archive)
 
 (defvar *feeds*)
 (defvar *feed-base*)
 
 (defparameter +dirname-format+
   '((:year 4) #\- (:month 2) #\- (:day 2) #\/ (:hour 2) #\- (:min 2) #\/))
 
 (defun get-store-directory-name (timestamp)
79f9884d
   (flet ((make-dirname (timestamp)
e3a39cd6
            (merge-pathnames
             (local-time:format-timestring nil
                                           (local-time:timestamp-minimize-part timestamp
                                                                               :sec)
                                           :format +dirname-format+)
             *feed-base*)))
     (values (prog1-let ((result (make-dirname timestamp)))
               (ensure-directories-exist result)))))
79f9884d
 
 (defun test-feed-list ()
   (values '("http://feeds.feedburner.com/GamasutraFeatureArticles/"
d28d3987
             "http://edwardfeser.blogspot.com/feeds/posts/default"
dec54d4c
             "http://feeds.feedburner.com/undergroundthomist/yCSy"
114a2b3d
             "https://www.codinghorror.com/blog/index.xml"
             "https://sancrucensis.wordpress.com/feed/")
           #p"/tmp/feed-archive/"))
25a661c3
 
 (defun init-feeds (&key feed-list archive-root)
   (ubiquitous:restore 'alimenta.feed-archiver)
   (let ((default-root (or archive-root
114a2b3d
                           (merge-pathnames ".feed-archive/"
                                            (truename "~/")))))
25a661c3
     (values (ubiquitous:defaulted-value feed-list :feeds)
114a2b3d
             (ubiquitous:defaulted-value default-root :archive :root))))
25a661c3
 
 (defun add-feed (feed)
   (init-feeds)
   (pushnew feed
114a2b3d
            (ubiquitous:value :feeds)
            :test #'equalp))
25a661c3
 
e3a39cd6
 (defmacro lambda* ((&rest args) &body body)
   (let ((rest-arg (gensym "REST")))
     `(lambda (,@args &rest ,rest-arg)
        (declare (ignore ,rest-arg))
        ,@body)))
 
c1d0bd12
 (defun safe-pull-feed (feed-url &aux (pop-times 0))
f92f94ff
   "Handles date parsing errors in the feed: chronicity won't parse
    certain date formats, this catches the error and modifies the
    format to something chronicity can handle."
e3a39cd6
   (handler-bind ((warning #'muffle-warning)
                  (error (lambda* (c)
                           (when (find-restart 'alimenta:pop-token c)
                             (cond
                               ((< pop-times 50)
                                (incf pop-times)
                                (format t
                                        "~&Processing error, trying to pop a token (popped ~d times)~%"
                                        pop-times)
                                (alimenta:pop-token))
                               (t
                                (continue)))))))
     (prog1-bind (feed (alimenta.pull-feed:pull-feed feed-url))
       ;; Why am I decf-ing here?
       (alimenta:transform feed
c9158189
                           (lambda (entity)
                             (typecase entity
                               (alimenta:item
e3a39cd6
                               (let ((v (alimenta:content entity)))
                                 (when v
                                   (setf (alimenta:content entity)
c9158189
                                          (html-sanitizer:sanitize v))))))))
e3a39cd6
       (decf pop-times))))
f7e44666
 
36fc9de2
 (defmacro with-progress-message ((stream before after &optional (error-msg " ERROR~%~4t~a~%")) &body body)
   (once-only (before after stream)
     `(handler-bind ((error (op (format ,stream ,error-msg _))))
        (format ,stream "~&~a . . ." ,before)
        (multiple-value-prog1 (progn
114a2b3d
                                ,@body)
          (format ,stream " ~a~%" ,after)))))
0010b953
 
f7e44666
 (defun skip-feed ()
   (when-let ((restart (find-restart 'skip-feed)))
     (invoke-restart restart)))
25a661c3
 
2f3de0b8
 (defun save-feed (feed output-file &key (if-exists :supersede))
   (with-output-to-file (s output-file :if-exists if-exists)
     (plump:serialize (alimenta:doc feed) s)))
 
114a2b3d
 (defun log-pull (feed-puller feed-url stream)
   (let ((before-message (concatenate 'string "Trying to pull: " feed-url)))
     (with-progress-message (stream before-message "Success")
       (funcall feed-puller feed-url))))
 
 (defun normalize-feed (feed-url feed)
   (alimenta:filter-feed (coerce-feed-link feed-url feed)
                         (complement #'older-than-a-month)
                         :key 'alimenta:date))
 
 (defun log-serialization (feed-url stream feed path)
d28d3987
   (declare (ignorable feed-url stream feed path))
114a2b3d
   (with-progress-message (stream "Serializing XML" (format nil "done with ~a" feed-url))
     (save-feed feed (merge-pathnames "feed.xml" path))))
 
 (defun feed-relative-pathname (path &optional (feed-base *feed-base*))
   (uiop:enough-pathname path feed-base))
 
d28d3987
 (defun feed-index (index-stream pull-time references)
   (yason:with-output (index-stream :indent t)
     (yason:encode-object
9ef705d4
      (make-instance 'feed-index
                     :pull-time pull-time
                     :references (remove-if 'null references)))))
 
d28d3987
 
 (defun pull-and-store-feed (feed-url stream-provider &optional (feed-puller #'safe-pull-feed))
9ef705d4
   (declare (optimize (debug 3)))
36fc9de2
   (flet ((log-pull (stream)
114a2b3d
            (declare (inline) (dynamic-extent stream))
            (log-pull feed-puller feed-url stream))
          (log-serialization (stream feed path)
            (declare (inline) (dynamic-extent stream))
d28d3987
            (log-serialization feed-url stream feed
                               (merge-pathnames path
                                                (stream-provider:root stream-provider)))))
9ef705d4
     (handler-bind ((cl+ssl:ssl-error-verify
e3a39cd6
                      (lambda (c)
                        (declare (ignore c))
                        (format *error-output* "~&SSL Error while pulling ~a~%"
                                feed-url))))
9ef705d4
       (with-simple-restart (skip-feed "Stop processing for ~a" feed-url)
         (let* ((feed (with-retry ("Pull feed again.")
                        (normalize-feed feed-url (log-pull t)))))
           (trivia:match (store feed stream-provider)
             ((list title path)
              (log-serialization t feed path)
              (make-feed-reference (alimenta:feed-link feed)
                                   :title title
                                   :path (feed-relative-pathname
                                          (uiop:pathname-directory-pathname
                                           (merge-pathnames path
                                                            (stream-provider:root stream-provider))))))))))))
79f9884d
 
114a2b3d
 (defun archive-feeds (pull-time pull-directory index-stream)
d28d3987
   (prog1-bind (references (mapcar (op (pull-and-store-feed _ pull-directory))
                                   *feeds*))
     (feed-index index-stream pull-time references)))
114a2b3d
 
 (defun archive-feeds-nondeterm ()
0010b953
   (let* ((pull-time (local-time:now))
e3a39cd6
          (pull-directory (get-store-directory-name pull-time))
dec54d4c
          (index-path (merge-pathnames "index.json" pull-directory))
d28d3987
          (feed-stream-provider (make-instance 'alimenta.feed-archive.encoders:feed-stream-provider
                                               :if-exists :error
                                               :root pull-directory)))
0010b953
     (with-open-file (index index-path :direction :output)
d28d3987
       (archive-feeds pull-time
                      feed-stream-provider
                      index))
297698f5
     (format t "~&!! pull-directory ~a~%" (uiop:enough-pathname pull-directory *feed-base*))))
79f9884d
 
f92f94ff
 ;; This is an ungodly mess, we need to avoid funneling everything through fix-pathname-or-skip
79f9884d
 (defun command-line-main (&optional (feed-list-initializer #'init-feeds))
f92f94ff
   (labels ((feed-type-unsupported (c &key (restart 'skip-feed))
dec54d4c
              (format t "~&Feed type unsupported: ~a for feed ~a~%"
                      (alimenta:feed-type c)
                      (alimenta:feed-link c))
              (funcall restart))
e3a39cd6
            (fix-pathname-or-skip (c &key
                                       (restart 'skip-feed)
                                       (wrapped-condition nil wc-p))
dec54d4c
              (typecase (or wrapped-condition c)
                (alimenta:feed-type-unsupported (feed-type-unsupported c))
e3a39cd6
                (t
dec54d4c
                 (if (find-restart 'fix-pathname)
                     (fix-pathname)
e3a39cd6
                     (if (find-restart 'alimenta.pull-feed:skip-feed)
                         (alimenta.pull-feed:skip-feed c)
                         (progn
                           (unless (eq restart 'continue)
                             (format t "~&Skipping a feed... ~s~%"
                                     (if wc-p
                                         (alimenta.feed-archive.encoders:the-feed c)
                                         "Unknown")))
                           (funcall restart))))))))
f92f94ff
 
36fc9de2
     (let ((error-count 0))
e3a39cd6
       (handler-bind
           ((alimenta.feed-archive.encoders:feed-error
              (op (fix-pathname-or-skip
                   _1 :wrapped-condition
                   (alimenta.feed-archive.encoders:the-condition _1))))
            (alimenta:feed-type-unsupported #'feed-type-unsupported)
            ((or usocket:timeout-error usocket:ns-error cl+ssl:ssl-error-verify)
              (op (alimenta.pull-feed:skip-feed _)))
 
            (error
              (op
                (format t "~&Error signaled, ~a (count ~d)"
                        _1 error-count)
                (incf error-count)
                (unless (< error-count 15)
                  (format t " continuing~%")
                  (fix-pathname-or-skip _1 :restart 'continue)))))
         (multiple-value-bind (*feeds* *feed-base*)
             (funcall feed-list-initializer)
dec54d4c
           (alimenta.pull-feed:with-user-agent ("Feed Archiver v0.1b")
             (archive-feeds-nondeterm)))))))
114a2b3d
 
9ef705d4
 
 (defpackage :alimenta.feed-archive/tests
   (:use :cl :should-test)
   (:export ))
 (in-package :alimenta.feed-archive/tests)
 
 (eval-when (:compile-toplevel :load-toplevel :execute)
   (import 'alimenta.feed-archive::feed-index))
 
 (defun hash-table= (ht1 ht2 &key (key-test 'equal) (value-test 'equal))
   (let ((ht1-keys (alexandria:hash-table-keys ht1))
         (ht2-keys (alexandria:hash-table-keys ht2)))
     (and (= (length ht1-keys)
             (length ht2-keys))
          (every key-test ht1-keys ht2-keys)
          (every value-test
                 (alexandria:hash-table-values ht1)
                 (alexandria:hash-table-values ht2)))))
 
 (deftest feed-index ()
e3a39cd6
   (should be hash-table=
           (yason:parse
            (with-output-to-string (s)
              (feed-index s (local-time:encode-timestamp 0 0 0 0 1 1 1) '()))
            :object-as :hash-table :json-arrays-as-vectors nil)
           (alexandria:alist-hash-table
            '(("pull-time" . "0001-01-01T00:00:00.000000-08:00")
              ("feeds" . ())))))