git.fiddlerwoaroof.com
Raw Blame History
(in-package :alimenta.feed-archive)

(defvar *feeds*)
(defvar *feed-base*)

(defparameter +dirname-format+
  '((:year 4) #\- (:month 2) #\- (:day 2) #\/ (:hour 2) #\- (:min 2) #\/))

(defun get-store-directory-name (timestamp)
  (flet ((make-dirname (timestamp)
           (merge-pathnames
            (local-time:format-timestring nil
                                          (local-time:timestamp-minimize-part timestamp
                                                                              :sec)
                                          :format +dirname-format+)
            *feed-base*)))
    (values (prog1-let ((result (make-dirname timestamp)))
              (ensure-directories-exist result)))))

(defun test-feed-list ()
  (values '("http://feeds.feedburner.com/GamasutraFeatureArticles/"
            "http://edwardfeser.blogspot.com/feeds/posts/default"
            "http://feeds.feedburner.com/undergroundthomist/yCSy"
            "https://www.codinghorror.com/blog/index.xml"
            "https://sancrucensis.wordpress.com/feed/")
          #p"/tmp/feed-archive/"))

(defun init-feeds (&key feed-list archive-root)
  (ubiquitous:restore 'alimenta.feed-archiver)
  (let ((default-root (or archive-root
                          (merge-pathnames ".feed-archive/"
                                           (truename "~/")))))
    (values (ubiquitous:defaulted-value feed-list :feeds)
            (ubiquitous:defaulted-value default-root :archive :root))))

(defun add-feed (feed)
  (init-feeds)
  (pushnew feed
           (ubiquitous:value :feeds)
           :test #'equalp))

(defmacro lambda* ((&rest args) &body body)
  (let ((rest-arg (gensym "REST")))
    `(lambda (,@args &rest ,rest-arg)
       (declare (ignore ,rest-arg))
       ,@body)))

(defun safe-pull-feed (feed-url &aux (pop-times 0))
  "Handles date parsing errors in the feed: chronicity won't parse
   certain date formats, this catches the error and modifies the
   format to something chronicity can handle."
  (handler-bind ((warning #'muffle-warning)
                 (error (lambda* (c)
                          (when (find-restart 'alimenta:pop-token c)
                            (cond
                              ((< pop-times 50)
                               (incf pop-times)
                               (format t
                                       "~&Processing error, trying to pop a token (popped ~d times)~%"
                                       pop-times)
                               (alimenta:pop-token))
                              (t
                               (continue)))))))
    (prog1-bind (feed (alimenta.pull-feed:pull-feed feed-url))
      ;; Why am I decf-ing here?
      (alimenta:transform feed
                          (lambda (entity)
                            (typecase entity
                              (alimenta:item
                              (let ((v (alimenta:content entity)))
                                (when v
                                  (setf (alimenta:content entity)
                                         (html-sanitizer:sanitize v))))))))
      (decf pop-times))))

(defmacro with-progress-message ((stream before after &optional (error-msg " ERROR~%~4t~a~%")) &body body)
  (once-only (before after stream)
    `(handler-bind ((error (op (format ,stream ,error-msg _))))
       (format ,stream "~&~a . . ." ,before)
       (multiple-value-prog1 (progn
                               ,@body)
         (format ,stream " ~a~%" ,after)))))

(defun skip-feed ()
  (when-let ((restart (find-restart 'skip-feed)))
    (invoke-restart restart)))

(defun save-feed (feed output-file &key (if-exists :supersede))
  (with-output-to-file (s output-file :if-exists if-exists)
    (plump:serialize (alimenta:doc feed) s)))

(defun log-pull (feed-puller feed-url stream)
  (let ((before-message (concatenate 'string "Trying to pull: " feed-url)))
    (with-progress-message (stream before-message "Success")
      (funcall feed-puller feed-url))))

(defun normalize-feed (feed-url feed)
  (alimenta:filter-feed (coerce-feed-link feed-url feed)
                        (complement #'older-than-a-month)
                        :key 'alimenta:date))

(defun log-serialization (feed-url stream feed path)
  (declare (ignorable feed-url stream feed path))
  (with-progress-message (stream "Serializing XML" (format nil "done with ~a" feed-url))
    (save-feed feed (merge-pathnames "feed.xml" path))))

(defun feed-relative-pathname (path &optional (feed-base *feed-base*))
  (uiop:enough-pathname path feed-base))

(defun feed-index (index-stream pull-time references)
  (yason:with-output (index-stream :indent t)
    (yason:encode-object
     (make-instance 'feed-index
                    :pull-time pull-time
                    :references (remove-if 'null references)))))


(defun pull-and-store-feed (feed-url stream-provider &optional (feed-puller #'safe-pull-feed))
  (declare (optimize (debug 3)))
  (flet ((log-pull (stream)
           (declare (inline) (dynamic-extent stream))
           (log-pull feed-puller feed-url stream))
         (log-serialization (stream feed path)
           (declare (inline) (dynamic-extent stream))
           (log-serialization feed-url stream feed
                              (merge-pathnames path
                                               (stream-provider:root stream-provider)))))
    (handler-bind ((cl+ssl:ssl-error-verify
                     (lambda (c)
                       (declare (ignore c))
                       (format *error-output* "~&SSL Error while pulling ~a~%"
                               feed-url))))
      (with-simple-restart (skip-feed "Stop processing for ~a" feed-url)
        (let* ((feed (with-retry ("Pull feed again.")
                       (normalize-feed feed-url (log-pull t)))))
          (trivia:match (store feed stream-provider)
            ((list title path)
             (log-serialization t feed path)
             (make-feed-reference (alimenta:feed-link feed)
                                  :title title
                                  :path (feed-relative-pathname
                                         (uiop:pathname-directory-pathname
                                          (merge-pathnames path
                                                           (stream-provider:root stream-provider))))))))))))

(defun archive-feeds (pull-time pull-directory index-stream)
  (prog1-bind (references (mapcar (op (pull-and-store-feed _ pull-directory))
                                  *feeds*))
    (feed-index index-stream pull-time references)))

(defun archive-feeds-nondeterm ()
  (let* ((pull-time (local-time:now))
         (pull-directory (get-store-directory-name pull-time))
         (index-path (merge-pathnames "index.json" pull-directory))
         (feed-stream-provider (make-instance 'alimenta.feed-archive.encoders:feed-stream-provider
                                              :if-exists :error
                                              :root pull-directory)))
    (with-open-file (index index-path :direction :output)
      (archive-feeds pull-time
                     feed-stream-provider
                     index))
    (format t "~&!! pull-directory ~a~%" (uiop:enough-pathname pull-directory *feed-base*))))

;; This is an ungodly mess, we need to avoid funneling everything through fix-pathname-or-skip
(defun command-line-main (&optional (feed-list-initializer #'init-feeds))
  (labels ((feed-type-unsupported (c &key (restart 'skip-feed))
             (format t "~&Feed type unsupported: ~a for feed ~a~%"
                     (alimenta:feed-type c)
                     (alimenta:feed-link c))
             (funcall restart))
           (fix-pathname-or-skip (c &key
                                      (restart 'skip-feed)
                                      (wrapped-condition nil wc-p))
             (typecase (or wrapped-condition c)
               (alimenta:feed-type-unsupported (feed-type-unsupported c))
               (t
                (if (find-restart 'fix-pathname)
                    (fix-pathname)
                    (if (find-restart 'alimenta.pull-feed:skip-feed)
                        (alimenta.pull-feed:skip-feed c)
                        (progn
                          (unless (eq restart 'continue)
                            (format t "~&Skipping a feed... ~s~%"
                                    (if wc-p
                                        (alimenta.feed-archive.encoders:the-feed c)
                                        "Unknown")))
                          (funcall restart))))))))

    (let ((error-count 0))
      (handler-bind
          ((alimenta.feed-archive.encoders:feed-error
             (op (fix-pathname-or-skip
                  _1 :wrapped-condition
                  (alimenta.feed-archive.encoders:the-condition _1))))
           (alimenta:feed-type-unsupported #'feed-type-unsupported)
           ((or usocket:timeout-error usocket:ns-error cl+ssl:ssl-error-verify)
             (op (alimenta.pull-feed:skip-feed _)))

           (error
             (op
               (format t "~&Error signaled, ~a (count ~d)"
                       _1 error-count)
               (incf error-count)
               (unless (< error-count 15)
                 (format t " continuing~%")
                 (fix-pathname-or-skip _1 :restart 'continue)))))
        (multiple-value-bind (*feeds* *feed-base*)
            (funcall feed-list-initializer)
          (alimenta.pull-feed:with-user-agent ("Feed Archiver v0.1b")
            (archive-feeds-nondeterm)))))))


(defpackage :alimenta.feed-archive/tests
  (:use :cl :should-test)
  (:export ))
(in-package :alimenta.feed-archive/tests)

(eval-when (:compile-toplevel :load-toplevel :execute)
  (import 'alimenta.feed-archive::feed-index))

(defun hash-table= (ht1 ht2 &key (key-test 'equal) (value-test 'equal))
  (let ((ht1-keys (alexandria:hash-table-keys ht1))
        (ht2-keys (alexandria:hash-table-keys ht2)))
    (and (= (length ht1-keys)
            (length ht2-keys))
         (every key-test ht1-keys ht2-keys)
         (every value-test
                (alexandria:hash-table-values ht1)
                (alexandria:hash-table-values ht2)))))

(deftest feed-index ()
  (should be hash-table=
          (yason:parse
           (with-output-to-string (s)
             (feed-index s (local-time:encode-timestamp 0 0 0 0 1 1 1) '()))
           :object-as :hash-table :json-arrays-as-vectors nil)
          (alexandria:alist-hash-table
           '(("pull-time" . "0001-01-01T00:00:00.000000-08:00")
             ("feeds" . ())))))