(defpackage :cells-html-scraper
(:use :cl :alexandria :serapeum :fw.lu :cells))
(in-package :cells-html-scraper)
;; lQuery macro PROGN: expands into a nested `lquery:$` pipeline that starts
;; from the current working NODES and then applies ARGS in order.  It lets
;; several steps be grouped into a single argument of another lQuery form
;; (for instance inside COMBINE).
(lquery:define-lquery-macro progn (nodes &rest args)
  ;; Build the form (lquery:$ (inline NODES) . ARGS) explicitly.
  (list* 'lquery:$
         (list 'inline nodes)
         args))
;; lQuery function HN-SCORE: starting from ITEM, move on with (next), select
;; ".score" within the result and take its text.  `lquery:$1` is used, so a
;; single value is returned instead of a result vector.
;; NOTE(review): presumably ITEM is a "tr.athing" story row and (next) reaches
;; the row holding the score/age/comments -- confirm against the page markup.
(lquery:define-lquery-function hn-score (item)
(lquery:$1 (inline item)
(next)
".score"
(text)))
;; lQuery function HN-AGE: starting from ITEM, move on with (next), select
;; ".age" within the result and take its text.  `lquery:$1` is used, so a
;; single value is returned instead of a result vector.
;; NOTE(review): presumably ITEM is a "tr.athing" story row and (next) reaches
;; the row holding the score/age/comments -- confirm against the page markup.
(lquery:define-lquery-function hn-age (item)
(lquery:$1 (inline item)
(next)
".age"
(text)))
;; lQuery function HN-COMMENTS: starting from ITEM, move on with (next),
;; select ".age" within the result, then step forward three more times with
;; (next) and take the text found there.  `lquery:$1` is used, so a single
;; value is returned instead of a result vector.
;; NOTE(review): the three hops assume a fixed element order after the ".age"
;; element (the third following element being the comments link) -- confirm
;; against the current page markup.
(lquery:define-lquery-function hn-comments (item)
  (lquery:$1 (inline item)
    (next)
    ".age"
    ;; All steps use the unqualified (next), matching HN-SCORE and HN-AGE;
    ;; lQuery resolves pipeline operators by name, so no package prefix is
    ;; needed.
    (next)
    (next)
    (next)
    (text)))
(defclass hn-item ()
  ((%url
    :reader url
    :initarg :url
    :documentation "Link target of the story, as supplied via :URL.")
   (%title
    :reader title
    :initarg :title
    :documentation "Headline of the story, as supplied via :TITLE.")
   (%score
    :reader score
    :initarg :score
    :documentation "Score of the story, as supplied via :SCORE.")
   (%age
    :reader age
    :initarg :age
    :documentation "Age of the story, as supplied via :AGE.")
   (%comments
    :reader comments
    :initarg :comments
    :documentation "Comment count of the story, as supplied via :COMMENTS."))
  (:documentation
   "Read-only record describing one scraped story.  Every slot is filled
through its initarg and exposed through a reader of the same name."))
(defun make-hn-item (url title score age comments)
  "Build an HN-ITEM from the raw strings produced by the scraper.

URL is parsed with PURI:PARSE-URI; SCORE and COMMENTS are reduced to their
leading integer (PARSE-INTEGER with :JUNK-ALLOWED, so trailing text is
ignored and a string without a leading number gives NIL).  TITLE and AGE are
stored unchanged.  Any of URL, SCORE and COMMENTS may be NIL, in which case
the corresponding slot is NIL."
  (flet ((leading-integer (text)
           ;; NIL stays NIL; otherwise parse as much of TEXT as is numeric.
           (when text
             (parse-integer text :junk-allowed t))))
    (make-instance 'hn-item
                   ;; Guard URL like SCORE and COMMENTS: a missing href must
                   ;; not reach PURI:PARSE-URI.
                   :url (when url (puri:parse-uri url))
                   :title title
                   :score (leading-integer score)
                   :age age
                   :comments (leading-integer comments))))
;; Cells model that turns a page of HTML into a vector of HN-ITEM objects.
;; HTML is the only input cell (c-in); every other slot is a formula (c?)
;; computed from the slot before it:
;;   html -> %doc -> %hnmain -> %body -> titles -> items
(defmodel hn-scraped ()
;; Raw HTML text; settable through the HTML accessor, initially "".
((%html :initarg :html
:accessor html
:initform (c-in ""))
;; Document obtained by parsing the HTML text with plump.
(%doc :reader %doc :initform (c? (plump:parse (^html))))
;; Single result of selecting "#hnmain" in the parsed document.
(%hnmain :reader %hnmain
:initform (c? (lquery:$1
(inline (^%doc))
"#hnmain")))
;; All matches of ".itemlist tr.athing" below %hnmain.
(%body :reader %body
:initform (c? (lquery:$
(inline (^%hnmain))
".itemlist tr.athing")))
;; For each node in %body, COMBINE gathers five values in this order:
;; href of ".title .storylink", text of ".title .storylink", hn-score,
;; hn-age, hn-comments.  The order matches MAKE-HN-ITEM's parameter list
;; (url title score age comments).
(%titles :reader titles
:initform (c? (lquery:$
(inline (^%body))
(combine (progn ".title .storylink" (attr "href")
(node))
(progn ".title .storylink" (text)
(node))
(hn-score)
(hn-age)
(hn-comments)))))
;; Vector with one MAKE-HN-ITEM result per entry of TITLES.
;; NOTE(review): this relies on how serapeum's OP treats `_*` inside the
;; APPLY call to spread each entry into five arguments -- confirm with the
;; serapeum version in use.
(%items :reader items :initform (c? (map 'vector
(op (apply 'make-hn-item _*))
(^titles))))))
;; Cells model that fetches the body of a URL.
;; URL is an input cell (c-in), settable through the URL accessor and NIL by
;; default.  TEXT is a formula (c?) over URL: the primary value returned by
;; DRAKMA:HTTP-REQUEST for that URL, or NIL while no URL has been set.
(defmodel url-getter ()
  ((%url :initarg :url
         :accessor url
         :initform (c-in '()))
   (%text :reader text
          :initform (c? (let ((target (^url)))
                          ;; The default URL is NIL; do not hand that to
                          ;; drakma, just report "no text yet".
                          (when target
                            ;; Add ("application" . "json") to the content
                            ;; types drakma treats as text for the duration
                            ;; of this request.
                            (let ((drakma:*text-content-types*
                                    (acons "application" "json"
                                           drakma:*text-content-types*)))
                              (drakma:http-request target))))))))
(defun get-links (url)
  "Fetch URL and collect its <link rel=\"alternate\"> elements.

Returns two values.  On success the first value is a list with one entry
(URL HREF TYPE) per matching link element and the second value is the empty
string.  While the request and parsing run, a CONTINUE restart (reported as
\"skip url ...\") is available; invoking it makes the function return NIL and
URL instead."
  (restart-case
      (let* ((links (lquery:$
                      (initialize (drakma:http-request url))
                      "link"
                      ;; One (rel href type) list per <link> element.
                      (combine (attr "rel") (attr "href") (attr "type"))))
             ;; HTML link types are ASCII case-insensitive, so compare the
             ;; rel value with STRING-EQUAL rather than STRING=.
             (alternates (remove-if-not (lambda (rel)
                                          (string-equal rel "alternate"))
                                        links
                                        :key #'car)))
        (values (map 'list
                     ;; Drop rel and put the page URL in front: (url href type).
                     (lambda (link) (list* url (cdr link)))
                     alternates)
                ""))
    (continue ()
      :report (lambda (stream) (format stream "skip url ~a" url))
      (values nil url))))