(defpackage warc-pkg (:use :cl)) (in-package :warc-pkg) (defparameter *cdx-file* (merge-pathnames "quicklisp/local-projects/warc-pkg/data/example-extra.cdx" (user-homedir-pathname))) (defparameter *cdx-file2* (merge-pathnames "quicklisp/local-projects/warc-pkg/data/example.cdx" (user-homedir-pathname))) (defparameter *warc-paths* (merge-pathnames "quicklisp/local-projects/warc-pkg/data/" (user-homedir-pathname))) (defclass SURT () ((host :initarg :host :initform (error "MUST provide a host") :reader host) (path :initarg :path :initform "" :reader path) (args :initarg :args :initform "" :reader args))) (defmethod print-object ((obj SURT) stream) (print-unreadable-object (obj stream :type t :identity t) (if (string= "" (args obj)) (princ (format nil "~A/~A" (host obj) (path obj)) stream) (princ (format nil "~A/~A?~A" (host obj) (path obj) (args obj)) stream)))) (defun make-surt (full-url) (labels ((parse-args (args) (let ((split-args (str:split "&" args))) (format nil "~{~A~^&~}" (stable-sort (copy-seq split-args) #'(lambda (x y) (let ((a (str:split "=" x)) (b (str:split "=" y))) (string-lessp (first a) (first b)))))))) (parse-path (path) (if (null path) "" path)) (parse-host (host) (format nil "~{~A~^,~})" (reverse (str:split "." host)))) (parse-url (url) (let* ((url-&-args (str:split "?" url)) (host-&-path (str:split "/" (second (str:split "://" (first url-&-args) :limit 2)) :limit 2))) (values (parse-host (first host-&-path)) (parse-path (second host-&-path)) (parse-args (or (second url-&-args) "")))))) (multiple-value-bind (host path args) (parse-url full-url) (make-instance 'SURT :host host :path path :args args)))) (defclass CDX () ((urlkey :initarg :urlkey :initform (error "MUST have a urlkey") :accessor urlkey) (datetime :initarg :datetime :initform (error "MUST have a datetime") :accessor datetime) (url :initarg :url :initform (error "MUST have a url") :accessor url) (mimetype :initarg :mimetype :initform (error "MUST have a mimetype") :accessor mimetype) (status-code :initarg :status-code :initform "-" :accessor status-code) (hash :initarg :hash :initform "" :accessor hash) (meta :initarg :meta :initform "-" :accessor meta) (robots :initarg :robots :initform "-" :accessor robots) (len :initarg :len :initform (error "MUST has a length") :accessor len) (offset :initarg :offset :initform (error "MUST have an offset") :accessor offset) (path :initarg :path :initform (error "Must has a path") :accessor path) (orig-len :initarg :orig-len :initform "-" :accessor orig-len) (orig-offset :initarg :orig-offset :initform "-" :accessor orig-offset) (orig-path :initarg :orig-path :initform "-" :accessor orig-path))) (defmethod print-object ((obj CDX) stream) (print-unreadable-object (obj stream :type t :identity t) (princ (format nil "~A: ~A" (urlkey obj) (datetime obj)) stream))) (defun make-cdx (line) (let* ((s (uiop:split-string line :separator " ")) (cdx-record (make-instance 'CDX :urlkey (make-surt (nth 2 s)) :datetime (nth 1 s) :url (nth 2 s) :mimetype (nth 3 s) :status-code (nth 4 s) :hash (nth 5 s) :meta (nth 6 s) :robots (nth 7 s) :len (parse-integer (nth 8 s)) :offset (parse-integer (nth 9 s)) :path (nth 10 s)))) (when (= 14 (length s)) (progn (setf (orig-len cdx-record) (parse-integer (nth 11 s))) (setf (orig-offset cdx-record) (parse-integer (nth 12 s))) (setf (orig-path cdx-record) (parse-integer (nth 13 s))))) cdx-record)) (defun read-cdx-file (file) (with-open-file (in file :direction :input) (when in (let ((lines (loop for line = (read-line in nil) while line collect (string-trim " " line)))) (mapcar #'make-cdx (remove-if #'(lambda (line) (string= (subseq line 0 3) "CDX")) lines)))))) (defun read-warcgz-record (file offset len) (with-open-file (in file :direction :input :element-type '(unsigned-byte 8)) (when in (file-position in offset) (read-bytes in len)))) (defun read-warc-record (file offset len) (with-open-file (in file :direction :input :element-type 'unsigned-byte) (when in (file-position in offset) (map 'string #'code-char (read-bytes in len))))) (defun read-warc-records (cdx-records) (let ((warc-records (mapcar #'(lambda (rec) (read-warc-record (merge-pathnames (path rec) *warc-paths*) (offset rec) (len rec))) cdx-records))) warc-records)) (defun read-warcgz-records (cdx-records) (let ((warc-records (mapcar #'(lambda (rec) (read-warcgz-record (merge-pathnames (path rec) *warc-paths*) (offset rec) (len rec))) cdx-records))) warc-records)) (defun read-bytes (file bytes) (let ((data (loop for i from bytes downto 0 collect (read-byte file)))) data)) (let ((cdx-records (read-cdx-file *cdx-file2*))) (read-warcgz-records cdx-records))