(defpackage :cells-html-scraper
  (:use :cl :alexandria :serapeum :fw.lu :cells))
(in-package :cells-html-scraper)

;;; ------------------------------------------------------------------
;;; lQuery extensions
;;; ------------------------------------------------------------------

;; lQuery macro PROGN: runs ARGS as a nested lQuery chain starting from
;; the current node set.  Used below inside COMBINE so that each combined
;; column can be a multi-step query instead of a single operation.
(lquery:define-lquery-macro progn (nodes &rest args)
  `(lquery:$ (inline ,nodes) ,@args))

;; The three helpers below all start from ITEM, step to its next sibling
;; and query inside it.  Each returns the first result's text, or NIL
;; when nothing matches.

;; Text of the first ".score" match reached from ITEM's next sibling.
(lquery:define-lquery-function hn-score (item)
  (lquery:$1 (inline item)
    (next)
    ".score"
    (text)))

;; Text of the first ".age" match reached from ITEM's next sibling.
(lquery:define-lquery-function hn-age (item)
  (lquery:$1 (inline item)
    (next)
    ".age"
    (text)))

;; Text of the element three sibling steps after the ".age" match.
;; NOTE(review): presumably this lands on the comments link of the
;; subtext row -- confirm against the current page markup.
(lquery:define-lquery-function hn-comments (item)
  (lquery:$1 (inline item)
    (next)
    ".age"
    (lquery-funcs:next)
    (next)
    (next)
    (text)))

;;; ------------------------------------------------------------------
;;; Scraped item
;;; ------------------------------------------------------------------

(defclass hn-item ()
  ((%url :initarg :url :reader url)
   (%title :initarg :title :reader title)
   (%score :initarg :score :reader score)
   (%age :initarg :age :reader age)
   (%comments :initarg :comments :reader comments))
  (:documentation
   "One scraped story row: URL, title, score, age and comment count."))

(defun make-hn-item (url title score age comments)
  "Build an HN-ITEM from the raw strings scraped for one row.

URL is parsed with PURI:PARSE-URI.  SCORE and COMMENTS are parsed as
integers with :JUNK-ALLOWED, so trailing non-digit text is ignored and a
string with no leading integer yields NIL.  TITLE and AGE are stored
unchanged.  Any of URL, SCORE and COMMENTS may be NIL (the query matched
nothing for that row), in which case the corresponding slot is NIL."
  (make-instance 'hn-item
                 ;; Guard like SCORE/COMMENTS: a row without a matching
                 ;; link must not abort the whole scrape inside PARSE-URI.
                 :url (when url (puri:parse-uri url))
                 :title title
                 :score (when score (parse-integer score :junk-allowed t))
                 :age age
                 :comments (when comments
                             (parse-integer comments :junk-allowed t))))

;;; ------------------------------------------------------------------
;;; Cells models
;;; ------------------------------------------------------------------

;; Dataflow model turning an HTML string into a vector of HN-ITEMs.
;; HTML is the only input cell; every other slot is a rule derived from
;; the previous one:
;;   html -> %doc (parsed DOM) -> %hnmain ("#hnmain") -> %body (item rows)
;;        -> titles (one list of raw fields per row) -> items (HN-ITEMs).
(defmodel hn-scraped ()
  ((%html :initarg :html :accessor html :initform (c-in ""))
   (%doc :reader %doc
         :initform (c? (plump:parse (^html))))
   (%hnmain :reader %hnmain
            :initform (c? (lquery:$1 (inline (^%doc)) "#hnmain")))
   (%body :reader %body
          :initform (c? (lquery:$ (inline (^%hnmain)) ".itemlist tr.athing")))
   ;; Each element is (href title score age comments), in the argument
   ;; order MAKE-HN-ITEM expects.
   (%titles :reader titles
            :initform (c? (lquery:$ (inline (^%body))
                            (combine
                             (progn ".title .storylink" (attr "href") (node))
                             (progn ".title .storylink" (text) (node))
                             (hn-score)
                             (hn-age)
                             (hn-comments)))))
   (%items :reader items
           :initform (c? (map 'vector
                              (op (apply 'make-hn-item _*))
                              (^titles))))))

;; Dataflow model that fetches the body of URL.  URL is an input cell
;; (default NIL); TEXT is the result of DRAKMA:HTTP-REQUEST on it.
(defmodel url-getter ()
  ((%url :initarg :url :accessor url :initform (c-in '()))
   (%text :reader text
          :initform (c?
                      ;; No URL yet (the default): yield NIL rather than
                      ;; passing NIL to DRAKMA:HTTP-REQUEST.
                      (when (^url)
                        ;; Add ("application" . "json") to drakma's text
                        ;; content types for the duration of the request.
                        (let ((drakma:*text-content-types*
                                (acons "application" "json"
                                       drakma:*text-content-types*)))
                          (drakma:http-request (^url))))))))

;;; ------------------------------------------------------------------
;;; Feed discovery
;;; ------------------------------------------------------------------

(defun get-links (url)
  "Fetch URL and collect its <link rel=\"alternate\"> elements.

Returns two values.  Normally: a list of (URL HREF TYPE) lists, one per
matching <link>, and the empty string.  If the CONTINUE restart is
invoked while the request or the query is running: NIL and URL, so a
caller can record which URL was skipped."
  (restart-case
      (values (map 'list
                   ;; Drop the REL column and prefix the page URL.
                   (compose (op (list* url _)) #'cdr)
                   (remove-if-not
                    ;; Link-type keywords are case-insensitive in HTML,
                    ;; so compare with STRING-EQUAL.
                    (op (string-equal _ "alternate"))
                    (lquery:$ (initialize (drakma:http-request url))
                      "link"
                      (combine (attr "rel") (attr "href") (attr "type")))
                    :key #'car))
              "")
    (continue ()
      :report (lambda (stream)
                (format stream "skip url ~a" url))
      (values nil url))))