(defpackage :fwoar.lisp-sandbox.reddit-dataset-creator
  (:use :cl)
  (:export))
(in-package :fwoar.lisp-sandbox.reddit-dataset-creator)

(defun get-data (subreddit)
  "Fetch SUBREDDIT's RSS feed (up to 100 entries) from reddit and parse
it into an alimenta feed object.  SUBREDDIT may be a string or symbol;
it is downcased when building the URL."
  (alimenta:to-feed
   (fw.lu:closing
    (plump:parse
     (drakma:http-request
      (puri:merge-uris (format nil "~(~a~).rss?limit=100" subreddit)
                       "https://www.reddit.com/r/")
      :want-stream t)))))

(defun get-data-json (subreddit)
  "Fetch SUBREDDIT's JSON listing (up to 100 entries) from reddit and
parse it with yason into nested hash tables / lists."
  (fw.lu:closing
   (yason:parse
    (drakma:http-request
     (puri:merge-uris (format nil "~(~a~).json?limit=100" subreddit)
                      "https://www.reddit.com/r/")
     :want-stream t))))

(defun subreddit-json-image-urls (subreddit &optional (fetch 'get-data-json-cached))
  "Extract preview-image URLs from SUBREDDIT's JSON listing.
For each post, picks the *smallest* preview image (the resolutions are
sorted ascending by \"width\" and element 0 is taken), skipping posts
with no previews.  FETCH is funcalled with SUBREDDIT and must return a
parsed JSON listing; it defaults to the caching fetcher."
  (data-lens.transducers:transduce
   (data-lens:•
    (data-lens.transducers:mapping (data-lens:key "data"))
    #+nil
    (data-lens.transducers:mapping
     (lambda (it)
       (format t "~&~s~%" (gethash "url" it))
       it))
    (data-lens.transducers:mapping
     (data-lens:•
      (lambda (it) (fw.lu:dive '("source" "url") it))
      (data-lens:applicable-when (data-lens:element 0)
                                 (complement #'null))
      (data-lens:sorted '< :key (data-lens:key "width"))
      (lambda (it) (fw.lu:dive '("preview" "images") it))))
    (data-lens.transducers:filtering 'identity))
   'data-lens.transducers:list-builder
   (fw.lu:dive '("data" "children")
               (funcall fetch subreddit))))

(defun subreddit-image-urls (subreddit &optional (fetch 'get-data-cached))
  "Extract image links (jpg/jpeg/png hrefs) from SUBREDDIT's RSS feed.
FETCH is funcalled with SUBREDDIT and must return an alimenta feed; it
defaults to the caching fetcher."
  (data-lens.transducers:transduce
   (data-lens:•
    (data-lens.transducers:mapping
     (data-lens:•
      (lambda (it)
        (lquery:$ (initialize it) "a" (attr "href")))
      'alimenta:content))
    (data-lens.transducers:catting)
    (data-lens.transducers:filtering
     (data-lens:regex-match "[.](jpe?g|png)$")))
   'data-lens.transducers:list-builder
   (funcall fetch subreddit)))

;; When true, the next call to either caching fetcher clears its cache
;; and resets this flag.
(defvar *reset-cache* nil)

(defun get-data-cached (subreddit)
  "Like GET-DATA, but memoizes results per SUBREDDIT for the lifetime
of the image.  Set *RESET-CACHE* to invalidate."
  ;; :test 'equal so string subreddit names are cache hits; the original
  ;; default EQL test made every string key a miss.
  (let ((cache (load-time-value (make-hash-table :test 'equal))))
    (when *reset-cache*
      (clrhash cache)
      (setf *reset-cache* nil))
    (alexandria:ensure-gethash subreddit cache
                               (get-data subreddit))))

(defun get-data-json-cached (subreddit)
  "Like GET-DATA-JSON, but memoizes results per SUBREDDIT for the
lifetime of the image.  Set *RESET-CACHE* to invalidate."
  ;; :test 'equal for the same reason as GET-DATA-CACHED.
  (let ((cache (load-time-value (make-hash-table :test 'equal))))
    (when *reset-cache*
      (clrhash cache)
      (setf *reset-cache* nil))
    (alexandria:ensure-gethash subreddit cache
                               (get-data-json subreddit))))

(defun dump-stream (input output &optional fn)
  "Copy INPUT to OUTPUT in 1000-character chunks via an echo stream.
After each read, FN (if supplied) is called with the buffer and its
length.  Returns NIL at end of input."
  (with-open-stream (echo (make-echo-stream input output))
    (loop with buffer-length = 1000
          with buffer = (make-array buffer-length :element-type 'character)
          for read-chars = (read-sequence buffer echo)
          do (when fn
               (funcall fn buffer buffer-length))
             ;; BUGFIX: the original condition was inverted
             ;; ((unless (< read-chars buffer-length) (return nil))),
             ;; which stopped after the first *full* chunk and spun
             ;; forever at EOF, where READ-SEQUENCE returns a short
             ;; count.  A short read is the end-of-stream signal.
             (when (< read-chars buffer-length)
               (return nil)))))

(defun connect-streams (input output &key (background t) fn)
  "This reads from input and writes output until the end of input is found."
  ;; NOTE(review): BACKGROUND is accepted for interface compatibility
  ;; but not implemented — the copy always runs synchronously in the
  ;; calling thread.
  (declare (ignore background))
  (dump-stream input output fn))

(defun store-image (base category url)
  "Download URL and store it under BASE/CATEGORY/<name>, where <name>
is the pathname-name of the URL's path.  Directories are created as
needed; an existing file is superseded."
  (let* ((uri (puri:parse-uri url))
         (fn (pathname-name (parse-namestring (puri:uri-path uri)))))
    (alexandria:with-output-to-file
        (s (ensure-directories-exist
            (merge-pathnames
             (make-pathname :directory (list :relative (string category))
                            :name fn)
             (parse-namestring base))))
      ;; NOTE(review): FW.LU:CLOSING wraps the (NIL) result of
      ;; CONNECT-STREAMS rather than the HTTP stream itself — presumably
      ;; intended to close the drakma stream; verify FW.LU:CLOSING's
      ;; semantics before relying on the socket being released.
      (fw.lu:closing
       (connect-streams (drakma:http-request url :want-stream t)
                        s
                        :background nil)))))

;; Debugging output: dumps various load/compile-context pathnames.
;; NOTE(review): references SBCL- and Swank-internal symbols, so this
;; top-level form only loads under SBCL with Swank present.
(format *standard-output* "~&~s ~s ~s ~s ~s ~s~%"
        sb-ext:*core-pathname*
        sb-ext:*runtime-pathname*
        asdf/user::*nil-pathname*
        *compile-file-pathname*
        *load-pathname*
        swank/sbcl::*buffer-name*)

;; Reader-suppressed scratch form: source-registry setup for the git
;; dependencies of this script.  Never evaluated.
#+(or)
(asdf:initialize-source-registry
 (fwoar.git-systems:define-dir-deps ()
   (:git "alimenta"
         "git@git.fiddlerwoaroof.com:u/edwlan/alimenta.git"
         "master")
   (:git "alimenta-feed-archive"
         "git@git.fiddlerwoaroof.com:u/edwlan/alimenta-feed-archive.git"
         "master")
   (:git "data-lens"
         "git@git.fiddlerwoaroof.com:data-lens.git"
         "master")
   (:git "collection-class"
         "git@git.fiddlerwoaroof.com:u/edwlan/collection-class.git"
         "master")))