git.fiddlerwoaroof.com
levehnstein.lisp
5628d0bc
 (defpackage :levehnstein
   (:shadowing-import-from :data-lens :pick)
   (:use :cl
         :fw.lu
         :data-lens))
 (in-package :levehnstein)
 
   ;;;                                      ;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ;;;                                      ;;;
   ;;;                                      ;;;
   ;;;                                      ;;;
   ;;;      ;;;;;;;;;;;;;;;;;;;;;;;;;;      ;;;
   ;;;      ;;; ;;; ;;;  ;;;  ;;;  ;;;      ;;;
   ;;;      ;;;     c at h at      ;;;      ;;;
   ;;;      ;;;     co t ho t      ;;;      ;;;
   ;;;      ;;; oat coat           ;;;      ;;;
   ;;;      ;;;          ho  g dog ;;;      ;;;
   ;;;      ;;;  ;;;  ;;;; ;;; ;;; ;;;      ;;;
   ;;;      ;;;;;;;;;;;;;;;;;;;;;;;;;;      ;;;
   ;;;                                      ;;;
   ;;;                                      ;;;
   ;;;                                      ;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ;;;                                      ;;;
 
 (defun collect-by-distance (pivot items)
   (let ((items (sort (copy-seq items) '<
                      :key (op (levehnstein _ pivot)))))
     (nreverse
      (reduce (destructuring-lambda (acc (distance item))
                (if acc
                    (if (= (caar acc) distance)
                        (list* (list* (caar acc) item (cdar acc))
                               (cdr acc))
                        (list* (list distance item) acc))
                    (list (list distance item))))
              (map 'list (juxt (op (levehnstein _ pivot))
                               'identity)
                   items)
              :initial-value ()))))
 
 (defun levehnstein-1-p (a b)
   (let-by-length (a b)
     (or (equal a b)
         (cond ((= (length b)
                   (length a))
                (= (loop
                      for x from 0 below (length a)
                      when (not (eql (elt a x)
                                     (elt b x)))
                      count 1)
                   1))
               ((= (- (length b)
                      (length a))
                   1)
                (some 'identity
                      (map 'list
                           (op (equal a
                                      (remove _1 b
                                              :start _2
                                              :count 1)))
                           b (alexandria:iota (length b)))))))))
 
 (defun build-chain (items)
   (if (null items)
       ()
       (let* ((selection (find-if (op (levehnstein-1-p _1 (car items)))
                                  (fisher-yates (cdr items))))
              (next (when selection
                      (build-chain (cons selection
                                         (without selection
                                                  (cdr items)))))))
         (if selection
             (list* (list* (car items) (car next))
                    (cdr next))
             (list* (list (car items))
                    (build-chain (cdr items)))))))
 
 (defun get-edits (items)
   (mapcar (op (cons _1
                     (mapcar (lambda (x) (levehnstein-1 _1 x))
                             items)))
           items))
 
 #+null
 (defgeneric levehnstein-1 (a b)
   (:method ((a list) (b list))
     (if (or (null a) (listp (car a)))
         (call-next-method)
         (= (levehnstein (car (last a)) (car b))
            1)))
   (:method (a b)
     (= (levehnstein a b)
        1)))
 
 (defun fisher-yates (seq)
   (let* ((shuffled (alexandria:copy-sequence 'vector seq))
          (n (length shuffled)))
     (loop for i from (1- n) downto 1
        for j = (random (1+ i))
        do (rotatef (elt shuffled j)
                    (elt shuffled i)))
     (alexandria:copy-sequence (type-of seq) shuffled)))
 
 (defun levehnstein-sum (seq)
   (apply #'+
          (mapcar #'car
                  (cdr (funcall (derive 'levehnstein)
                                seq)))))
 
 (defun pick-sequence (seq1 seq2)
   (fw.lu:if-let* ((seq1-score (levehnstein-sum seq1))
                   (seq2-score (levehnstein-sum seq2))
                   (_ (< seq1-score seq2-score)))
     (values seq1 seq1-score)
     (values seq2 seq2-score)))
 
 (defun edit-distance-1-p (a b)
   (if (equal a b)
       t
       (if (eql (elt a 0)
                (elt b 0))
           (edit-distance-1-p (subseq a 1) (subseq b 1))
           (equal (subseq a 1) (subseq b 1)))))
 
 (defun find-highest-out-of-place (seq)
   (multiple-value-bind (value idx)
       (funcall (alexandria:compose (maximizing #'< #'car)
                                    #'cdr
                                    (derive 'levehnstein))
                seq)
     (values (1+ idx)
             (cdr value))))
 
 (defun swap-highest (seq)
   (let ((oop (find-highest-out-of-place seq)))
     (concatenate 'list
                  (subseq seq oop)
                  (subseq seq 0 oop))))
 
 (defun crossover (seq)
   (if (null seq)
       seq
       (if (oddp (length seq))
           (append (crossover (cdr seq))
                   (list (car seq)))
           (let ((candidate (1+ (random (1- (length seq))))))
             (append (list (elt seq candidate))
                     (subseq seq 1 candidate)
                     (list (elt seq 0))
                     (subseq seq (1+ candidate)))))))
 
 (defun evolve (seq steps fitness chance)
   (loop repeat steps
      for candidate = seq then (if (= 0 (random chance))
                                   (pick-sequence
                                    (and i-seq (swap-highest i-seq))
                                    (pick-sequence (crossover i-seq)
                                                   i-seq))
                                   i-seq)
      for (i-seq score) = (multiple-value-list
                           (funcall fitness seq (fisher-yates seq)))
      finally (return (values i-seq score))))
 
 (defun evolve-halves (seq steps fitness chance)
   (let* ((halfway (random (length seq)))
          (first-part (subseq seq 0 halfway))
          (second-part (subseq seq halfway))
          (result (append (pick-sequence first-part
                                         (evolve (fisher-yates first-part) steps fitness chance))
                          (pick-sequence second-part
                                         (evolve (fisher-yates second-part) steps fitness chance)))))
     (evolve (pick-sequence seq result)
             steps fitness chance)))
 
 (defun aggregate-items (items)
   (loop
      with candidates = items
      for (item . remainder) on items
      while remainder
      do
        (setf candidates (remove item candidates :test 'equal))
        (format t "~&~s ~s~%" item remainder)
      when (and item candidates)
      collect (loop
                 for match in candidates
                 when (and match (levehnstein-1 item match))
                 do
                   (format t "~&~s ~s~%" item match)
                   (return (append (alexandria:ensure-list item)
                                   (alexandria:ensure-list match))))))
 
 (defmacro let-order ((min max) (a b) &body body)
   (alexandria:once-only (a b)
     `(destructuring-bind (,min ,max) (if (<= (length ,a) (length ,b))
                                          (list ,a ,b)
                                          (list ,b ,a))
        ,@body)))
 
 (defmacro let-by-length ((a b) &body body)
   `(let-order (,a ,b) (,a ,b)
      ,@body))
 
 (defun levehnstein-equal (a b)
   (if (and (= (length a) (length b))
            (> (length a) 0))
       (if (eql (elt a 0) (elt b 0))
           (levehnstein-equal     (subseq a 1 nil) (subseq b 1 nil))
           (1+ (levehnstein-equal (subseq a 1 nil) (subseq b 1 nil))))
       0))
 
 (defun window (seq len &optional acc)
   (if (= (length seq) len)
       (cons seq acc)
       (window (subseq seq 1 nil)
               len
               (cons (subseq seq 0 len) acc))))
 
 (defun minimal-levehnstein (a b)
   (let-order (a b) (a b)
     (apply #'min
            (mapcar (op (levehnstein-equal a _))
                    (window b (length a))))))
 
 (defun align-things (a b)
   (let-by-length (a b)
     (cond
       ((equal (length a) (length b))
        (values (coerce a 'list)
                (coerce b 'list)))
       ((equal a "")
        (values (make-sequence 'list (length b)
                               :initial-element nil)
                (coerce b 'list)))
       ((equal (elt a 0) (elt b 0))
        (multiple-value-bind (as bs) (align-things (subseq a 1) (subseq b 1))
          (values (cons (elt a 0) as)
                  (cons (elt b 0) bs))))
       (t (multiple-value-bind (as bs) (align-things a (subseq b 1))
            (values (cons nil as)
                    (cons (elt b 0) bs)))))))
 
 (defun levehnstein (a b)
   (min (+ (minimal-levehnstein (coerce a 'list)
                                (coerce b 'list))
           (abs (- (length a)
                   (length b))))
        (multiple-value-call 'levehnstein-equal (align-things a b))))
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 #+nil
 (every 'identity
        (list (= (levehnstein "kitten" "sitting") 3)
              (= (levehnstein "closure" "clojure") (levehnstein "clojure" "closure") 1)
              (= (levehnstein "xyx" "xyyyx") 2)
              (= (levehnstein "" "123456") 6)
              (= (levehnstein "1234" "02345") 2)
              (= (levehnstein "abcd" "ad") 2)
              (= (levehnstein "Clojure" "Clojure") (levehnstein "" "")  0)
              (= (levehnstein "ttttattttctg" "tcaaccctaccat") 10)
              (= (levehnstein "gaattctaatctc" "caaacaaaaaattt") 9)))
 
 #+nil
 (defun levehnstein (a b)
   (labels ((window-internal (seq len acc)
              (if (= (length seq) len)
                  (cons seq acc)
                  (window-internal (subseq seq 1 nil)
                                   len
                                   (cons (subseq seq 0 len) acc))))
            (min-offset-internal (haystack needle &optional (x 0) min-x min-dist)
              (if (null haystack)
                  (list min-x min-dist (length needle))
                  (let* ((search (car haystack))
                         (dist (levehnstein needle search)))
                    (destructuring-bind (min-x min-dist)
                        (if (or (null min-dist) (< dist min-dist))
                            (list x dist)
                            (list min-x min-dist))
                      (min-offset-internal (cdr haystack)
                                           needle
                                           (1+ x)
                                           min-x
                                           min-dist)))))
            (min-offset (needle haystack)
              (min-offset-internal (window-internal haystack (length needle) nil)
                                   needle))
            (levehnstein-head (len a b)
              (levehnstein (subseq a 0 len)
                           (subseq b 0 len)))
            (min-offset-tail (offset a b)
              (min-offset (subseq a offset)
                          (subseq b offset)))
            (map-reduce (comb tr init seq1 seq2)
              (reduce (lambda (acc next)
                        (funcall comb acc (apply tr next)))
                      (map 'list 'list seq1 seq2)
                      :initial-value init))
            (difference-step (char-a char-b) (if (eql char-a char-b) 0 1))
            (uneven-loop (a b &optional (x 1) (min-x x) min-total-dist)
              (destructuring-bind (tail-offset dist overlap) (min-offset-tail x a b)
                (let* ((head-distance (levehnstein-head x a b))
                       (leftover-tail (- (- (length b) x)
                                         (+ overlap tail-offset)))
                       (total-dist (+ head-distance tail-offset dist leftover-tail)))
                  (destructuring-bind (min-total-dist min-x)
                      (if (< total-dist (or min-total-dist total-dist))
                          (list total-dist x)
                          (list (or min-total-dist total-dist) min-x))
                    (if (= x (1- (length a)))
                        min-total-dist
                        (uneven-loop a b (1+ x) min-x min-total-dist)))))))
     (cond ((> (length a) (length b)) (levehnstein b a))
           ((equal a "")              (length b))
           ((equal (subseq a 1) b)    1)
           ((equal (subseq b 1) a)    1)
           ((= (length a) (length b)) (map-reduce #'+ #'difference-step 0 a b))
           (t                         (uneven-loop a b)))))
 
 (defun without (el items)
   (remove el items :test 'equal))
 
 (defun all-pairs (items)
   (map 'list
        (lambda (item)
          (map 'list
               (lambda (item-2)
                 (list item item-2))
               (without item items)))
        items))
 
 (defun make-word-graph (words)
   (funcall (alexandria:compose
             (op (alexandria:write-string-into-file _ "/tmp/foo.dot" :if-exists :supersede))
             (op (format nil "~&graph {~%~{~a~%~}}~%" _))
             (op (remove-duplicates _ :test 'equal))
             (over (op (format nil "~{~s~^ -- ~};" (sort (subseq _ 0 2) 'string<))))
             (op (apply 'concatenate 'list _))
             (over (alexandria:compose
                    (lambda (x) (remove-if-not (op (= _ 1)) x
                                               :key (op (car (last _)))))
                    (lambda (x) (sort x '<
                                      :key (op (car (last _)))))
                    (over (lambda (it) (concatenate 'list it (list (apply 'levehnstein it)))))))
             'all-pairs)
            words))