(defpackage :rubbish-newline-counter (:use :cl) (:export #:count-lines-in-file)) (in-package :rubbish-newline-counter) (defun count-lines-in-file (pathname) (mmap:with-mmap (pointer fd length pathname) (declare (ignore fd)) (when (zerop length) (return-from count-lines-in-file 0)) (%count-lines-in-file pointer length))) (sb-c:defknown %broadcast-byte ((unsigned-byte 8)) (sb-ext:simd-pack-256 integer) (sb-c:movable sb-c:flushable)) (sb-c:defknown %read-bytes (sb-alien:system-area-pointer (unsigned-byte 64)) (sb-ext:simd-pack-256 integer) (sb-c:foldable sb-c:movable sb-c:flushable)) (sb-c:defknown %zeroes () (sb-ext:simd-pack-256 integer) ;; Don't fold because XORing should be faster than loading from ;; memory. (sb-c:movable sb-c:flushable)) (sb-c:defknown %equals ((sb-ext:simd-pack-256 integer) (sb-ext:simd-pack-256 integer)) (sb-ext:simd-pack-256 integer) (sb-c:movable)) (sb-c:defknown %sum-absolute-differences ((sb-ext:simd-pack-256 integer) (sb-ext:simd-pack-256 integer)) (sb-ext:simd-pack-256 integer) (sb-c:movable)) (sb-c:defknown %subtract-ub8 ((sb-ext:simd-pack-256 integer) (sb-ext:simd-pack-256 integer)) (sb-ext:simd-pack-256 integer) (sb-c:movable)) (sb-c:defknown %add-ub64 ((sb-ext:simd-pack-256 integer) (sb-ext:simd-pack-256 integer)) (sb-ext:simd-pack-256 integer) (sb-c:movable)) (defmacro define-boring-vop (name args result &body generator) `(progn (sb-vm::define-vop (,name) (:translate ,name) (:policy :fast-safe) (:args ,@(loop for (name nil . rest) in args collect (cons name rest))) (:arg-types ,@(mapcar #'second args)) (:results (,(first result) ,@(rest (rest result)))) (:result-types ,(second result)) (:generator 1 ,@generator)) (defun ,name ,(mapcar #'first args) (,name ,@(mapcar #'first args))))) (in-package :sb-vm) (rubbish-newline-counter::define-boring-vop rubbish-newline-counter::%broadcast-byte ((byte unsigned-num :scs (unsigned-reg))) (broadcasted simd-pack-256-int :scs (int-avx2-reg)) (inst movq broadcasted byte) (inst vpbroadcastb broadcasted broadcasted)) (rubbish-newline-counter::define-boring-vop rubbish-newline-counter::%read-bytes ((pointer system-area-pointer :scs (sap-reg)) (index unsigned-num :scs (unsigned-reg))) (bytes simd-pack-256-int :scs (int-avx2-reg)) ;; We assume that MMAP will MMAP to at least 32-byte alignment, and ;; this is probably the case for any MMU you will ever use. (inst vmovdqa bytes (ea 0 pointer index 1))) (rubbish-newline-counter::define-boring-vop rubbish-newline-counter::%zeroes () (zeroes simd-pack-256-int :scs (int-avx2-reg)) (inst vpxor zeroes zeroes zeroes)) (rubbish-newline-counter::define-boring-vop rubbish-newline-counter::%equals ((a . #1=(simd-pack-256-int :scs (int-avx2-reg))) (b . #1#)) (eq . #1#) (inst vpcmpeqb eq a b)) (rubbish-newline-counter::define-boring-vop rubbish-newline-counter::%sum-absolute-differences ((a . #1=(simd-pack-256-int :scs (int-avx2-reg))) (b . #1#)) (sum . #1#) (inst vpsadbw sum a b)) (rubbish-newline-counter::define-boring-vop rubbish-newline-counter::%subtract-ub8 ((a . #1=(simd-pack-256-int :scs (int-avx2-reg))) (b . #1#)) (difference . #1#) (inst vpsubb difference a b)) (rubbish-newline-counter::define-boring-vop rubbish-newline-counter::%add-ub64 ((a . #1=(simd-pack-256-int :scs (int-avx2-reg))) (b . #1#)) (sum . #1#) (inst vpaddq sum a b)) (in-package :rubbish-newline-counter) (declaim (inline ub64)) (defun ub64 (x) (ldb (byte 64 0) x)) (defun %count-lines-in-file (pointer length) (declare (optimize (speed 3)) (fixnum length)) ;; Just what http://0x80.pl/notesen/2019-01-29-simd-count-byte.html#avx2-and-sse-vectorization does. (let ((newlines (%broadcast-byte 10)) (global-sum (%zeroes)) (simd-end (max 0 (- length 256)))) (loop for base-index of-type (unsigned-byte 64) from 0 below simd-end by 256 ;; Maintain smaller newline counters. do (let ((local-sum (%zeroes))) (loop for i from 0 below 256 by 32 for index = (ub64 (+ i base-index)) do (let* ((in (%read-bytes pointer index)) ;; This is either 0 for a miss or -1 for a match. (eq (%equals in newlines))) ;; So this adds 1 (subtracts -1) for every match. (setf local-sum (%subtract-ub8 local-sum eq)))) ;; Then add them to larger counters with this weird instruction. (setf global-sum (%add-ub64 global-sum (%sum-absolute-differences local-sum (%zeroes)))))) (multiple-value-bind (a b c d) (sb-ext:%simd-pack-256-ub64s global-sum) ;; There is one more line than there are #\Newline characters, ;; unless there are no characters, which we already tested. (loop for index from simd-end below length count (= (cffi:mem-aref pointer :char index) 10) into residual finally (return (ub64 (+ 1 a b c d residual)))))))