(module levenshtein mzscheme

;;; @Package     levenshtein.scm
;;; @Subtitle    Levenshtein Distance Metric in Scheme
;;; @HomePage    http://www.neilvandyke.org/levenshtein-scm/
;;; @Author      Neil W. Van Dyke
;;; @AuthorEmail neil@@neilvandyke.org
;;; @Version     0.3
;;; @Date        2005-07-09

;; $Id: levenshtein.scm,v 1.39 2005/07/09 20:13:21 neil Exp $

;;; @legal
;;; Copyright @copyright{} 2004 - 2005 Neil W. Van Dyke.  This program is Free
;;; Software; you can redistribute it and/or modify it under the terms of the
;;; GNU Lesser General Public License as published by the Free Software
;;; Foundation; either version 2.1 of the License, or (at your option) any
;;; later version.  This program is distributed in the hope that it will be
;;; useful, but without any warranty; without even the implied warranty of
;;; merchantability or fitness for a particular purpose.  See
;;; @indicateurl{http://www.gnu.org/copyleft/lesser.html} for details.  For
;;; other license options and consulting, contact the author.
;;; @end legal

;;; @section Introduction
;;; This is a Scheme implementation of the @dfn{Levenshtein Distance}
;;; algorithm, which is an @dfn{edit distance} metric of string similarity, due
;;; to Vladimir Levenshtein.  The Levenshtein Distance is a function of two
;;; strings that represents a count of single-character insertions, deletions,
;;; and substitions that will change the first string to the second.  More
;;; information is available in
;;; @uref{http://www.nist.gov/dads/HTML/Levenshtein.html, NIST DADS} and
;;; Michael Gilleland article, ``@uref{http://www.merriampark.com/ld.htm,
;;; Levenshtein Distance, in Three Flavors}.''
;;; This implementation is modeled after a
;;; @uref{http://www.mgilleland.com/ld/ldperl2.htm, space-efficient Perl
;;; implementation} by Jorge Mas Trullenque.  It has been written in R5RS
;;; Scheme (with an @code{error} procedure such as the one in SRFI-23), and
;;; extended to support heterogeneous combinations of Scheme types (strings,
;;; lists, vectors), user-supplied predicate functions, and optionally reusable
;;; scratch vectors.
;;; This version 0.1 is an early release that has been tested only lightly.

;; Note: Identifiers beginning with "levenshtein-internal:" are for internal
;; use only.

(define (levenshtein-internal:error p m o) (error (string-append p " - " m) o))

(define (levenshtein-internal:identity x) x)

(define (levenshtein-internal:string-empty? v) (zero? (string-length v)))
(define (levenshtein-internal:vector-empty? v) (zero? (vector-length v)))

(define (levenshtein-internal:string->vector s)
  (list->vector (string->list s)))

;;; @section Basic Comparisons

;;; In the current implementation, all comparisons are done internally via
;;; vectors.

;;; @defproc vector-levenshtein/predicate/get-scratch a b pred get-scratch
;;; Few, if any, programs will use this procedure directly.  This is like
;;; @code{vector-levenshtein/predicate}, but allows @var{get-scratch} to be
;;; specified.  @var{get-scratch} is a procedure of one term, @i{n}, that
;;; yields a vector of length @i{n} or greater, which is used for
;;; record-keeping during execution of the Levenshtein algorithm.
;;; @code{make-vector} can be used for @var{get-scratch}, although some
;;; programs comparing a large size or quantity of vectors may wish to reuse a
;;; record-keeping vector, rather than each time allocating a new one that will
;;; need to be garbage-collected.

(define (vector-levenshtein/predicate/get-scratch a b pred get-scratch)
  (let ((a-len (vector-length a))
        (b-len (vector-length b)))
    (cond ((zero? a-len) b-len)
          ((zero? b-len) a-len)
           (let ((w    (get-scratch (+ 1 b-len)))
                 (next #f))
             (let fill ((k b-len))
               (vector-set! w k k)
               (or (zero? k) (fill (- k 1))))
             (let loop-i ((i 0))
               (if (= i a-len)
                   (let ((a-i (vector-ref a i)))
                     (let loop-j ((j   0)
                                  (cur (+ 1 i)))
                       (if (= j b-len)
                           (begin (vector-set! w b-len next)
                                  (loop-i (+ 1 i)))
                           ;; TODO: Make these costs parameters.
                           (begin (set! next (min (+ 1 (vector-ref w (+ 1 j)))
                                                  (+ 1 cur)
                                                  (if (pred a-i
                                                            (vector-ref b j))
                                                      (vector-ref w j)
                                                      (+ 1 (vector-ref w j)))))
                                  (vector-set! w j cur)
                                  (loop-j (+ 1 j) next))))))))))))

;;; @defproc  vector-levenshtein/predicate a b pred
;;; @defprocx vector-levenshtein/eq        a b
;;; @defprocx vector-levenshtein/eqv       a b
;;; @defprocx vector-levenshtein/equal     a b
;;; @defprocx vector-levenshtein           a b
;;; Calculate the Levenshtein Distance of vectors @var{a} and @var{b}.
;;; @var{pred} is the predicate procedure for determining if two elements are
;;; equal.  The @code{/eq}, @code{/eqv}, and @code{/equal} variants correspond
;;; to the standard equivalence predicates, @code{eq?}, @code{eqv?}, and
;;; @code{equal?}.  @code{vector-levenshtein} is an alias for
;;; @code{vector-levenshtein/equal}.
;;; @lisp
;;; (vector-levenshtein '#(6 6 6) '#(6 35 6 24 6 32)) @result{} 3
;;; @end lisp

(define (vector-levenshtein/predicate a b pred)
  (vector-levenshtein/predicate/get-scratch a b pred make-vector))

(define (vector-levenshtein/eq    a b)
  (vector-levenshtein/predicate a b eq?))
(define (vector-levenshtein/eqv   a b)
  (vector-levenshtein/predicate a b eqv?))
(define (vector-levenshtein/equal a b)
  (vector-levenshtein/predicate a b equal?))

(define vector-levenshtein vector-levenshtein/equal)

;;; @defproc  list-levenshtein/predicate a b pred
;;; @defprocx list-levenshtein/eq        a b
;;; @defprocx list-levenshtein/eqv       a b
;;; @defprocx list-levenshtein/equal     a b
;;; @defprocx list-levenshtein           a b
;;; Calculate the Levenshtein Distance of lists @var{a} and @var{b}.
;;; @var{pred} is the predicate procedure for determining if two elements are
;;; equal.  The @code{/eq}, @code{/eqv}, and @code{/equal} variants correspond
;;; to the standard equivalence predicates, @code{eq?}, @code{eqv?}, and
;;; @code{equal?}.  @code{list-levenshtein} is an alias for
;;; @code{list-levenshtein/equal}.  Note that comparison of lists is less
;;; efficient than comparison of vectors.
;;; @lisp
;;; (list-levenshtein/eq '(b c e x f y) '(a b c d e f)) @result{} 4
;;; @end lisp

(define (list-levenshtein/predicate a b pred)
  (cond ((null? a) (length b))
        ((null? b) (length a))
        (else (vector-levenshtein/predicate (list->vector a)
                                            (list->vector b)

(define (list-levenshtein/eq    a b) (list-levenshtein/predicate a b eq?))
(define (list-levenshtein/eqv   a b) (list-levenshtein/predicate a b eqv?))
(define (list-levenshtein/equal a b) (list-levenshtein/predicate a b equal?))

(define list-levenshtein list-levenshtein/equal)

;; TODO: Maybe make a version that does the O(n) access to the list elements in
;;       exchange for not allocating a vector.

;;; @defproc string-levenshtein a b
;;; Calculate the Levenshtein Distance of strings @var{a} and @var{b}.
;;; @lisp
;;; (string-levenshtein "adresse" "address") @result{} 2
;;; @end lisp

(define (string-levenshtein a b)
  ;; TODO: Maybe make a version that doesn't convert to vectors but also
  ;;       doesn't do lots of string-refs.
  (cond ((zero? (string-length a)) (string-length b))
        ((zero? (string-length b)) (string-length a))
        (else (vector-levenshtein/eqv
               (levenshtein-internal:string->vector a)
               (levenshtein-internal:string->vector b)))))

(define (levenshtein-internal:string-levenshtein/predicate a b pred)
  (cond ((zero? (string-length a)) (string-length b))
        ((zero? (string-length b)) (string-length a))
        (else (vector-levenshtein/predicate
               (levenshtein-internal:string->vector a)
               (levenshtein-internal:string->vector b)

;;; @section Type-Coercing Comparisons

;;; Procedures @code{levenshtein} and @code{levenshtein/predicate} provide a
;;; convenient interface for comparing a combination of vectors, lists, and
;;; strings, the types of which might not be known until runtime.

;;; @defproc levenshtein/predicate a b pred
;;; Calculates the Levenshtein Distance of two objects @var{a} and @var{b},
;;; which are vectors, lists, or strings.  @var{a} and @var{b} need not be of
;;; the same type.  @var{pred} is the element equivalence predicate used.
;;; @lisp
;;; (levenshtein/predicate '#(#\A #\B #\C #\D)
;;;                        "aBXcD"
;;;                        char-ci=?)
;;; @result{} 1
;;; @end lisp

(define levenshtein/predicate
  (let ((foo (lambda (a b pred a-emp a-len a-vec)
               (let ((bar (lambda (b-emp b-len b-vec)
                            (if (b-emp b)
                                (a-len a)
                                (vector-levenshtein/predicate (a-vec a)
                                                              (b-vec b)
                 (cond ((vector? b) (bar levenshtein-internal:vector-empty?
                       ((string? b) (bar levenshtein-internal:string-empty?
                       ((list?   b) (bar null? length list->vector))
                       (else (levenshtein-internal:error
                              "term 2 must be vector, list, or string"
    (lambda (a b pred)
      (cond ((vector? a) (if (vector? b)
                             (vector-levenshtein/predicate a b pred)
                             (foo a b pred
            ((string? a) (if (string? b)
                              a b pred)
                             (foo a b pred
            ((list?   a) (if (list? b)
                             (list-levenshtein/predicate a b pred)
                             (foo a b pred null? length list->vector)))
            (else (levenshtein-internal:error
                   "term 1 must be vector, list, or string"

;;; @defproc levenshtein a b
;;; Calculate the levenshtein distance of @var{a} and @var{b}, in a similar
;;; manner as using @code{levenshtein/predicate} with @code{equal?} as the
;;; predicate.
;;; @lisp
;;; (define g '#(#\g #\u #\m #\b #\o))
;;; (levenshtein g "gambol")  @result{} 2
;;; (levenshtein g "dumbo")   @result{} 1
;;; (levenshtein g "umbrage") @result{} 5
;;; @end lisp

(define (levenshtein a b)
  (if (and (string? a) (string? b))
      (string-levenshtein a b)
      (levenshtein/predicate a b equal?)))

;; TODO: Test it.
;; (list (levenshtein "adresse" "address")
;;       (levenshtein "adresse" "addressee")
;;       (levenshtein "gambol"  "gumbo")
;;       (levenshtein "gumbo"   "gambol")
;;       (levenshtein "gumbo"  "bumble")
;;       (levenshtein "gumbo"  '#(#\b #\u #\m #\b #\l #\e))
;;       (levenshtein '#(#\g #\u #\m #\b #\o) '#(#\b #\u #\m #\b #\l #\e))
;;       (levenshtein '(#\g #\u #\m #\b #\o)  '#(#\b #\u #\m #\b #\l #\e))
;;       (levenshtein '#(#\g #\u #\m #\b #\o) '(#\b #\u #\m #\b #\l #\e))
;;       (levenshtein '#(#\g #\u #\m #\b #\o) '(#\b #\u #\m #\b #\l #\e))
;;       (levenshtein "a"      "abcde") ; 4
;;       (levenshtein "abcde"      "a") ; 4
;;       (levenshtein '#(6 6 6) '(1 2 3 4 5 6)) ;5
;;       (levenshtein '#(6 6 6) '(1 2 3 4 5 6 7)) ; 6
;;       (levenshtein '#(6 6 6) '(1 2 3 4 5 6 7 6 6)) ; 6
;;       )

;; @appendix Trullenque Perl Implementation
;; For reference, the implementation from [Trullenque] is reproduced here.
;; @verbatim
;; sub levenshtein($$){
;;   my @A=split //, lc shift;
;;   my @B=split //, lc shift;
;;   my @W=(0..@B);
;;   my ($i, $j, $cur, $next);
;;   for $i (0..$#A){
;;     $cur=$i+1;
;;     for $j (0..$#B){
;;             $next=min(
;;                     $W[$j+1]+1,
;;                     $cur+1,
;;                     ($A[$i] ne $B[$j])+$W[$j]
;;             );
;;             $W[$j]=$cur;
;;             $cur=$next;
;;     }
;;     $W[@B]=$next;
;;   }
;;   return $next;
;; }
;; sub min($$$){
;;   if ($_[0] < $_[2]){ pop @_; } else { shift @_; }
;;   return $_[0] < $_[1]? $_[0]:$_[1];
;; }
;; @end verbatim

;;; @unnumberedsec History
;;; @table @asis
;;; @item Version 0.3 --- 2005-07-09
;;; PLaneT release, and minor documentation changes.
;;; @item Version 0.2 --- 2004-07-06
;;; Documentation changes.
;;; @item Version 0.1 --- 2004-05-13
;;; First release.  Tested only lightly, and today @emph{is} the 13th, so
;;; @i{caveat emptor}.
;;; @end table

(provide (all-defined)))