levenshtein.ss
;;; @Package     levenshtein
;;; @Subtitle    Levenshtein Distance Metric in Scheme
;;; @HomePage    http://www.neilvandyke.org/levenshtein-scheme/
;;; @Author      Neil Van Dyke
;;; @Version     0.6
;;; @Date        2009-03-14
;;; @PLaneT      neil/levenshtein:1:3

;; $Id: levenshtein.ss,v 1.49 2009/03/14 07:32:45 neilpair Exp $

;;; @legal
;;; Copyright @copyright{} 2004--2009 Neil Van Dyke.  This program is Free
;;; Software; you can redistribute it and/or modify it under the terms of the
;;; GNU Lesser General Public License as published by the Free Software
;;; Foundation; either version 3 of the License (LGPL 3), or (at your option)
;;; any later version.  This program is distributed in the hope that it will be
;;; useful, but without any warranty; without even the implied warranty of
;;; merchantability or fitness for a particular purpose.  See
;;; @indicateurl{http://www.gnu.org/licenses/} for details.  For other licenses
;;; and consulting, please contact the author.
;;; @end legal

#lang scheme/base

;;; @section Introduction
;;;
;;; This is a Scheme implementation of the @dfn{Levenshtein Distance}
;;; algorithm, which is an @dfn{edit distance} metric of string similarity, due
;;; to Vladimir Levenshtein.  The Levenshtein Distance is a function of two
;;; strings that represents a count of single-character insertions, deletions,
;;; and substitions that will change the first string to the second.  More
;;; information is available in
;;; @uref{http://www.nist.gov/dads/HTML/Levenshtein.html, NIST DADS} and the
;;; Michael Gilleland article, ``@uref{http://www.merriampark.com/ld.htm,
;;; Levenshtein Distance in Three Flavors}.''
;;;
;;; This implementation is modeled after a
;;; @uref{http://www.mgilleland.com/ld/ldperl2.htm, space-efficient Perl
;;; implementation} by Jorge Mas Trullenque.  It has been written in R5RS
;;; Scheme, and extended to support heterogeneous combinations of Scheme types
;;; (strings, lists, vectors), user-supplied predicate functions, and
;;; optionally reusable scratch vectors.

(define (%identity x) x)

(define (%string-empty? v) (zero? (string-length v)))
(define (%vector-empty? v) (zero? (vector-length v)))

(define (%string->vector s)
  (list->vector (string->list s)))

;;; @section Basic Comparisons

;;; In the current implementation, all comparisons are done internally via
;;; vectors.

;;; @defproc vector-levenshtein/predicate/get-scratch a b pred get-scratch
;;;
;;; Few, if any, programs will use this procedure directly.  This is like
;;; @code{vector-levenshtein/predicate}, but allows @var{get-scratch} to be
;;; specified.  @var{get-scratch} is a procedure of one term, @i{n}, that
;;; yields a vector of length @i{n} or greater, which is used for
;;; record-keeping during execution of the Levenshtein algorithm.
;;; @code{make-vector} can be used for @var{get-scratch}, although some
;;; programs comparing a large size or quantity of vectors may wish to reuse a
;;; record-keeping vector, rather than each time allocating a new one that will
;;; need to be garbage-collected.

(define (vector-levenshtein/predicate/get-scratch a b pred get-scratch)
  (let ((a-len (vector-length a))
        (b-len (vector-length b)))
    (cond ((zero? a-len) b-len)
          ((zero? b-len) a-len)
          (else
           (let ((w    (get-scratch (+ 1 b-len)))
                 (next #f))
             (let fill ((k b-len))
               (vector-set! w k k)
               (or (zero? k) (fill (- k 1))))
             (let loop-i ((i 0))
               (if (= i a-len)
                   next
                   (let ((a-i (vector-ref a i)))
                     (let loop-j ((j   0)
                                  (cur (+ 1 i)))
                       (if (= j b-len)
                           (begin (vector-set! w b-len next)
                                  (loop-i (+ 1 i)))
                           ;; TODO: Make these costs parameters.
                           (begin (set! next (min (+ 1 (vector-ref w (+ 1 j)))
                                                  (+ 1 cur)
                                                  (if (pred a-i
                                                            (vector-ref b j))
                                                      (vector-ref w j)
                                                      (+ 1 (vector-ref w j)))))
                                  (vector-set! w j cur)
                                  (loop-j (+ 1 j) next))))))))))))

;;; @defproc  vector-levenshtein/predicate a b pred
;;; @defprocx vector-levenshtein/eq        a b
;;; @defprocx vector-levenshtein/eqv       a b
;;; @defprocx vector-levenshtein/equal     a b
;;; @defprocx vector-levenshtein           a b
;;;
;;; Calculate the Levenshtein Distance of vectors @var{a} and @var{b}.
;;; @var{pred} is the predicate procedure for determining if two elements are
;;; equal.  The @code{/eq}, @code{/eqv}, and @code{/equal} variants correspond
;;; to the standard equivalence predicates, @code{eq?}, @code{eqv?}, and
;;; @code{equal?}.  @code{vector-levenshtein} is an alias for
;;; @code{vector-levenshtein/equal}.
;;;
;;; @lisp
;;; (vector-levenshtein '#(6 6 6) '#(6 35 6 24 6 32)) @result{} 3
;;; @end lisp

(define (vector-levenshtein/predicate a b pred)
  (vector-levenshtein/predicate/get-scratch a b pred make-vector))

(define (vector-levenshtein/eq    a b)
  (vector-levenshtein/predicate a b eq?))
(define (vector-levenshtein/eqv   a b)
  (vector-levenshtein/predicate a b eqv?))
(define (vector-levenshtein/equal a b)
  (vector-levenshtein/predicate a b equal?))

(define (vector-levenshtein a b) (vector-levenshtein/equal a b))

;;; @defproc  list-levenshtein/predicate a b pred
;;; @defprocx list-levenshtein/eq        a b
;;; @defprocx list-levenshtein/eqv       a b
;;; @defprocx list-levenshtein/equal     a b
;;; @defprocx list-levenshtein           a b
;;;
;;; Calculate the Levenshtein Distance of lists @var{a} and @var{b}.
;;; @var{pred} is the predicate procedure for determining if two elements are
;;; equal.  The @code{/eq}, @code{/eqv}, and @code{/equal} variants correspond
;;; to the standard equivalence predicates, @code{eq?}, @code{eqv?}, and
;;; @code{equal?}.  @code{list-levenshtein} is an alias for
;;; @code{list-levenshtein/equal}.  Note that comparison of lists is less
;;; efficient than comparison of vectors.
;;;
;;; @lisp
;;; (list-levenshtein/eq '(b c e x f y) '(a b c d e f)) @result{} 4
;;; @end lisp

(define (list-levenshtein/predicate a b pred)
  (cond ((null? a) (length b))
        ((null? b) (length a))
        (else (vector-levenshtein/predicate (list->vector a)
                                            (list->vector b)
                                            pred))))

(define (list-levenshtein/eq    a b) (list-levenshtein/predicate a b eq?))
(define (list-levenshtein/eqv   a b) (list-levenshtein/predicate a b eqv?))
(define (list-levenshtein/equal a b) (list-levenshtein/predicate a b equal?))

(define (list-levenshtein       a b) (list-levenshtein/equal     a b))

;; TODO: Maybe make a version that does the O(n) access to the list elements in
;;       exchange for not allocating a vector.

;;; @defproc string-levenshtein a b
;;;
;;; Calculate the Levenshtein Distance of strings @var{a} and @var{b}.
;;;
;;; @lisp
;;; (string-levenshtein "adresse" "address") @result{} 2
;;; @end lisp

(define (string-levenshtein a b)
  ;; TODO: Maybe make a version that doesn't convert to vectors but also
  ;;       doesn't do lots of string-refs.
  (cond ((zero? (string-length a)) (string-length b))
        ((zero? (string-length b)) (string-length a))
        (else (vector-levenshtein/eqv
               (%string->vector a)
               (%string->vector b)))))

(define (%string-levenshtein/predicate a b pred)
  (cond ((zero? (string-length a)) (string-length b))
        ((zero? (string-length b)) (string-length a))
        (else (vector-levenshtein/predicate
               (%string->vector a)
               (%string->vector b)
               pred))))

;;; @section Type-Coercing Comparisons

;;; Procedures @code{levenshtein} and @code{levenshtein/predicate} provide a
;;; convenient interface for comparing a combination of vectors, lists, and
;;; strings, the types of which might not be known until runtime.

;;; @defproc levenshtein/predicate a b pred
;;;
;;; Calculates the Levenshtein Distance of two objects @var{a} and @var{b},
;;; which are vectors, lists, or strings.  @var{a} and @var{b} need not be of
;;; the same type.  @var{pred} is the element equivalence predicate used.
;;;
;;; @lisp
;;; (levenshtein/predicate '#(#\A #\B #\C #\D)
;;;                        "aBXcD"
;;;                        char-ci=?)
;;; @result{} 1
;;; @end lisp

(define levenshtein/predicate
  ;; TODO: Change this to a let-syntax.
  (let ((foo (lambda (a b pred a-emp a-len a-vec)
               (let ((bar (lambda (b-emp b-len b-vec)
                            (if (b-emp b)
                                (a-len a)
                                (vector-levenshtein/predicate (a-vec a)
                                                              (b-vec b)
                                                              pred)))))
                 (cond ((vector? b) (bar %vector-empty?
                                         vector-length
                                         %identity))
                       ((string? b) (bar %string-empty?
                                         string-length
                                         %string->vector))
                       ((list?   b) (bar null? length list->vector))
                       (else (error "term 2 must be vector, list, or string:"
                                    b)))))))
    (lambda (a b pred)
      (cond ((vector? a) (if (vector? b)
                             (vector-levenshtein/predicate a b pred)
                             (foo a b pred
                                  %vector-empty?
                                  vector-length
                                  %identity)))
            ((string? a) (if (string? b)
                             (%string-levenshtein/predicate
                              a b pred)
                             (foo a b pred
                                  %string-empty?
                                  string-length
                                  %string->vector)))
            ((list?   a) (if (list? b)
                             (list-levenshtein/predicate a b pred)
                             (foo a b pred null? length list->vector)))
            (else (error "term 1 must be vector, list, or string:" a))))))

;;; @defproc levenshtein a b
;;;
;;; Calculate the levenshtein distance of @var{a} and @var{b}, in a similar
;;; manner as using @code{levenshtein/predicate} with @code{equal?} as the
;;; predicate.
;;;
;;; @lisp
;;; (define g '#(#\g #\u #\m #\b #\o))
;;;
;;; (levenshtein g "gambol")  @result{} 2
;;; (levenshtein g "dumbo")   @result{} 1
;;; (levenshtein g "umbrage") @result{} 5
;;; @end lisp

(define (levenshtein a b)
  (if (and (string? a) (string? b))
      (string-levenshtein a b)
      (levenshtein/predicate a b equal?)))

;; @appendix Trullenque Perl Implementation
;;
;; For reference, the implementation from [Trullenque] is reproduced here.
;;
;; @verbatim
;; sub levenshtein($$){
;;   my @A=split //, lc shift;
;;   my @B=split //, lc shift;
;;   my @W=(0..@B);
;;   my ($i, $j, $cur, $next);
;;   for $i (0..$#A){
;;     $cur=$i+1;
;;     for $j (0..$#B){
;;             $next=min(
;;                     $W[$j+1]+1,
;;                     $cur+1,
;;                     ($A[$i] ne $B[$j])+$W[$j]
;;             );
;;             $W[$j]=$cur;
;;             $cur=$next;
;;     }
;;     $W[@B]=$next;
;;   }
;;   return $next;
;; }
;;
;; sub min($$$){
;;   if ($_[0] < $_[2]){ pop @_; } else { shift @_; }
;;   return $_[0] < $_[1]? $_[0]:$_[1];
;; }
;; @end verbatim

;;; @unnumberedsec History
;;;
;;; @table @asis
;;;
;;; @item Version 0.6 --- 2009-03-14 -- PLaneT @code{(1 3)}
;;; Documentation fixes.
;;;
;;; @item Version 0.5 --- 2009-02-24 -- PLaneT @code{(1 2)}
;;; License is now LGPL 3.  Tests moved out of main file.  Converted to
;;; author's new Scheme administration system.
;;;
;;; @item Version 0.4 --- 2005-07-10 -- PLaneT @code{(1 1)}
;;; Added Testeez tests.
;;;
;;; @item Version 0.3 --- 2005-07-09 -- PLaneT @code{(1 0)}
;;; PLaneT release, and minor documentation changes.
;;;
;;; @item Version 0.2 --- 2004-07-06
;;; Documentation changes.
;;;
;;; @item Version 0.1 --- 2004-05-13
;;; First release.  Tested only lightly, and today @emph{is} the 13th, so
;;; @i{caveat emptor}.
;;;
;;; @end table

(provide
 levenshtein
 levenshtein/predicate
 list-levenshtein
 list-levenshtein/eq
 list-levenshtein/equal
 list-levenshtein/eqv
 list-levenshtein/predicate
 string-levenshtein
 vector-levenshtein
 vector-levenshtein/eq
 vector-levenshtein/equal
 vector-levenshtein/eqv
 vector-levenshtein/predicate
 vector-levenshtein/predicate/get-scratch)