soundex.ss
;;; @Package     soundex
;;; @Subtitle    Soundex Index Keying in Scheme
;;; @HomePage    http://www.neilvandyke.org/soundex-scheme/
;;; @Author      Neil Van Dyke
;;; @Version     0.3
;;; @Date        2009-02-24
;;; @PLaneT      neil/soundex:1:1

;; $Id: soundex.ss,v 1.42 2009/02/24 05:24:54 neilpair Exp $

;;; @legal
;;; Copyright @copyright{} 2004--2009 Neil Van Dyke.  This program is Free
;;; Software; you can redistribute it and/or modify it under the terms of the
;;; GNU Lesser General Public License as published by the Free Software
;;; Foundation; either version 3 of the License (LGPL 3), or (at your option)
;;; any later version.  This program is distributed in the hope that it will be
;;; useful, but without any warranty; without even the implied warranty of
;;; merchantability or fitness for a particular purpose.  See
;;; @indicateurl{http://www.gnu.org/licenses/} for details.  For other licenses
;;; and consulting, please contact the author.
;;; @end legal

#lang scheme/base

;;; @section Introduction

;;; The @b{soundex} library provides an implementation in Scheme of the Soundex
;;; indexing hash function as specified somewhat loosely by US National
;;; Archives and Records Administration (NARA) publication [Soundex], and
;;; verified empirically against test cases from various sources.  Both the
;;; current NARA function and the older version with different handling of `H'
;;; and `W' are supported.
;;;
;;; Additionally, a nonstandard prefix-guessing function that is an invention
;;; of this lirbray function permits additional Soundex keys to be generated
;;; from a string, increasing recall.
;;;
;;; This library should work under any R5RS Scheme implementation for which
;;; @code{char->integer} yields ASCII values.
;;;
;;; @itemize
;;;
;;; @item
;;; [GIL-55] US National Archives and Records Administration, ``Using the
;;; Census Soundex,'' General Information Leaflet 55, 1995.
;;;
;;; @item
;;; [Soundex] US National Archives and Records Administration, ``The Soundex
;;; Indexing System,'' 2000-02-19.
;;;
;;; @end itemize

;;; @section Characters, Ordinals, and Codes

;;; To facilitate possible future support of other input character sets, this
;;; library employs a @dfn{character ordinal} abstract representation of the
;;; letters used by Soundex.  The ordinal value is an integer from 0 to
;;; 25---corresponding to the 26 letters `A' through `Z', respectively---and
;;; can be used for fast mapping via vectors.  Most applications need not be
;;; aware of this.

;;; @defproc soundex-ordinal chr
;;;
;;; Yields the Soundex ordinal value of character @var{chr}, of @var{#f} if the
;;; character is not considered a letter.
;;;
;;; @lisp
;;; (soundex-ordinal #\a) @result{} 0
;;; (soundex-ordinal #\A) @result{} 0
;;; (soundex-ordinal #\Z) @result{} 25
;;; (soundex-ordinal #\3) @result{} #f
;;; (soundex-ordinal #\.) @result{} #f
;;; @end lisp

(define (soundex-ordinal chr)
  (let ((x (char->integer chr)))
    (cond ((< x 65)  #f)
          ((< x 91)  (- x 65))
          ((< x 97)  #f)
          ((< x 123) (- x 97))
          (else      #f))))

;;; @defproc soundex-ordinal->char ord
;;;
;;; Yields the upper-case letter character that corresponds to the character
;;; ordinal value @var{ord}.  For example:
;;;
;;; @lisp
;;; (soundex-ordinal->char (soundex-ordinal #\a)) @result{} #\A
;;; @end lisp
;;;
;;; Note that an @code{#f} value as a result of applying @code{soundex-ordinal}
;;; is @emph{not} an ordinal value, and is not mapped to a character by
;;; @code{soundex-ordinal->char}.  For example:
;;;
;;; @lisp
;;; (soundex-ordinal->char (soundex-ordinal #\')) @error{}
;;; @end lisp

(define soundex-ordinal->char
  (let ((letters
         '#(#\A #\B #\C #\D #\E #\F #\G #\H #\I #\J #\K #\L #\M
            #\N #\O #\P #\Q #\R #\S #\T #\U #\V #\W #\X #\Y #\Z)))
    (lambda (ord)
      (vector-ref letters ord))))

;;; @defproc soundex-ordinal->soundex-code ord
;;;
;;; Yields a library-specific Soundex code for character ordinal @var{ord}.
;;;
;;; @lisp
;;; (soundex-ordinal->soundex-code (soundex-ordinal #\a)) @result{} aeiou
;;; (soundex-ordinal->soundex-code (soundex-ordinal #\c)) @result{} #\2
;;; (soundex-ordinal->soundex-code (soundex-ordinal #\N)) @result{} #\5
;;; (soundex-ordinal->soundex-code (soundex-ordinal #\w)) @result{} hw
;;; (soundex-ordinal->soundex-code (soundex-ordinal #\y)) @result{} y
;;; @end lisp

(define soundex-ordinal->soundex-code
  (let ((code-vector
         '#(aeiou #\1 #\2 #\3 aeiou #\1 #\2 hw aeiou #\2 #\2 #\4 #\5
                  #\5 aeiou #\1 #\2 #\6 #\2 #\3 aeiou #\1 hw #\2 y #\2)))
    (lambda (ord)
      (if ord (vector-ref code-vector ord) #f))))

;;; @defproc char->soundex-code chr
;;;
;;; Yields a library-specific Soundex code for character @var{chr}.  This is
;;; equivalent to: @code{(soundex-ordinal->soundex-code (soundex-ordinal
;;; @var{chr}))}.

(define (char->soundex-code chr)
  (soundex-ordinal->soundex-code (soundex-ordinal chr)))

;;; @section Hashing

;;; Soundex hashes of strings can be generated with @code{soundex-nara},
;;; @code{soundex-old}, and @code{soundex}.

;;; @defproc soundex/narahw/start str narahw? start
;;;
;;; This is an internal procedure.
;;;
;;; @lisp
;;; (soundex/narahw/start "van Dam" #t 4) @result{} "D500"
;;; (soundex/narahw/start ".0,!"    #t 0) @result{} #f
;;; @end lisp

(define (soundex/narahw/start str narahw? start)
  (let ((len (string-length str)))
    (let find-first-alpha ((i start))
      (if (>= i len)
          #f
          (let* ((ord (soundex-ordinal (string-ref str i))))
            (if ord
                (let ((result      (make-string 4 #\0))
                      (result-used 1))
                  (string-set! result 0 (soundex-ordinal->char ord))
                  (let scan ((i          (+ 1 i))
                             (prior-code (soundex-ordinal->soundex-code ord)))
                    (if (>= i len)
                        result
                        (let ((code (char->soundex-code (string-ref str i))))
                          (case code
                            ((aeiou y) (scan (+ 1 i) code))
                            ((hw) (scan (+ 1 i) (if narahw? prior-code code)))
                            ((#\1 #\2 #\3 #\4 #\5 #\6)
                             (if (eqv? code prior-code)
                                 (scan (+ 1 i) prior-code)
                                 (begin (string-set! result result-used code)
                                        (if (= result-used 3)
                                            result
                                            (begin (set! result-used
                                                         (+ 1 result-used))
                                                   (scan (+ 1 i) code))))))
                            (else (scan (+ 1 i) #f)))))))
                (find-first-alpha (+ 1 i))))))))

;;; @defproc  soundex-nara str
;;; @defprocx soundex-old  str
;;; @defprocx soundex      str
;;;
;;; Yields a Soundex hash key of string @var{str}, or @code{#f} if not even an
;;; initial letter could be found.  @code{soundex-nara} generates NARA hashes,
;;; and @code{soundex-old} generates older-style hashes.  @code{soundex} is an
;;; alias for @code{soundex-nara}.
;;;
;;; @lisp
;;; (soundex-nara "Ashcraft") @result{} "A261"
;;; (soundex-old  "Ashcraft") @result{} "A226"
;;; (soundex      "Ashcraft") @result{} "A261"
;;; (soundex      "")         @result{} #f
;;; @end lisp

(define (soundex-nara str) (soundex/narahw/start str #t 0))

(define (soundex-old  str) (soundex/narahw/start str #f 0))

(define (soundex str) (soundex-nara str))

;;; @section Prefixing

;;; Multiple Soundex hashes from a single string can be generated by
;;; @code{soundex-nara/prefixing}, @code{soundex-old/prefixing}, and
;;; @code{soundex/p}, which consider the string with and without various common
;;; surname prefixes.

;;; @defproc soundex-prefix-starts str
;;;
;;; Yields a list of Soundex start points in string @var{str}, as character
;;; index integers, for making hash keys with and without prefixes.  A prefix
;;; must be followed by at least two letters, although they can be interspersed
;;; with non-letter characters.  The exact behavior of this function is subject
;;; to change in future versions of this library.
;;;
;;; @lisp
;;; (soundex-prefix-starts "Smith")          @result{} (0)
;;; (soundex-prefix-starts "  Jones")        @result{} (2)
;;; (soundex-prefix-starts "vanderlinden")   @result{} (0 3 6)
;;; (soundex-prefix-starts "van der linden") @result{} (0 3 7)
;;; (soundex-prefix-starts "")               @result{} ()
;;; (soundex-prefix-starts "123")            @result{} ()
;;; (soundex-prefix-starts "dea")            @result{} (0)
;;; (soundex-prefix-starts "dea ")           @result{} (0)
;;; (soundex-prefix-starts "dean")           @result{} (0)
;;; (soundex-prefix-starts "delasol")        @result{} (0 2 3 4)
;;; @end lisp

(define (soundex-prefix-starts str)
  ;; TODO: Maybe someday find a really elegant way to integrate this into the
  ;;       coding pass, or cache the ordinals.  At the same time, make it
  ;;       data-driven, so that it's easier to make a prefixing constructor
  ;;       from a user-provided list of prefixes.
  (letrec ((len (string-length str))
           (i   0)
           (ord #f)
           (next-ord
            (lambda ()
              (if (= i len)
                  'end
                  (begin (set! ord (soundex-ordinal (string-ref str i)))
                         (set! i (+ 1 i))
                         (or ord (next-ord))))))
           (trailed?
            (lambda ()
              (let ((saved-i i)
                    (result  (let loop ((needed 2))
                               (if (> needed 0)
                                   (case (next-ord)
                                     ((end) #f)
                                     ((#f)  (loop needed))
                                     (else  (loop (- needed 1))))
                                   #t))))
                (set! i saved-i)
                result))))
    (let find-first ()
      (case (next-ord)
        ((end) '())
        ((#f)  (find-first))
        (else
         ;; A=0  B=1  C=2  D=3  E=4  F=5  G=6  H=7  I=8  J=9  K=10 L=11 M=12
         ;; N=13 O=14 P=15 Q=16 R=17 S=18 T=19 U=20 V=21 W=22 X=23 Y=24 Z=25
         (cons (- i 1)
               (case ord
                 ((2) ;; C
                  (if (and (eq? (next-ord) 14) ;; (C)O
                           (eq? (next-ord) 13) ;; (CO)N
                           (trailed?))
                      (list i)
                      '()))
                 ((3) ;; D
                  (case (next-ord)
                    ((4) ;; (D)E
                     (if (trailed?)
                         (cons i
                               (case (next-ord)
                                 ((11) ;; (DE)L
                                  (if (trailed?)
                                      (cons i
                                            (if (and (eq? (next-ord) 0)
                                                     ;; (DEL)A
                                                     (trailed?))
                                                (list i)
                                                '()))
                                      '()))
                                 ((18) ;; (DE)S
                                  (if (trailed?) (list i) '()))
                                 (else '())))
                         '()))
                    ((8 20) ;; (D)I, (D)U
                     (if (trailed?) (list i) '()))
                    (else '())))
                 ((11) ;; L
                  (case (next-ord)
                    ((0 4) ;; (L)A, (L)E
                     (if (trailed?) (list i) '()))
                    (else '())))
                 ((21) ;; V
                  (case (next-ord)
                    ((0 14) ;; (V)A, (V)O
                     (if (eq? (next-ord) 13) ;; (V*)N
                         (cons i (if (and (eq? (next-ord) 3) ;; (V*N)D
                                          (eq? (next-ord) 4) ;; (V*ND)E
                                          (trailed?))
                                     (case (next-ord)
                                       ((13 17) ;; (V*NDE)N, (V*NDE)R
                                        (if (trailed?) (list i) '()))
                                       (else '()))
                                     '()))
                         '()))
                    (else '())))
                 (else '()))))))))

;;; @defproc soundex/narahw str narahw?
;;;
;;; This is an internal procedure.

(define (soundex/prefixing/narahw str narahw?)
  (let ((result '()))
    (for-each (lambda (start)
                (let ((sx (soundex/narahw/start str narahw? start)))
                  (and sx
                       (not (member sx result))
                       (set! result (cons sx result)))))
              (soundex-prefix-starts str))
    (reverse result)))

;;; @defproc  soundex-nara/prefixing str
;;; @defprocx soundex-old/prefixing  str
;;; @defprocx soundex/p              str
;;;
;;; Yields a list of zero or more Soundex hash keys from string @var{str},
;;; based on the whole string and the string with various prefixes skipped.
;;; All elements of the list are mutually unique.
;;; @code{soundex-nara/prefixing} generates NARA hashes, and
;;; @code{soundex-old/prefixing} generates older-style hashes.
;;; @code{soundex/p} is an alias for @code{soundex-nara/prefixing}.
;;;
;;; @lisp
;;; (soundex/p "Van Damme") @result{} ("V535" "D500")
;;; (soundex/p "vanvoom")   @result{} ("V515" "V500")
;;; (soundex/p "vanvanvan") @result{} ("V515")
;;; (soundex/p "DeLaSol")   @result{} ("D424" "L240" "A240" "S400")
;;; (soundex/p "")          @result{} ()
;;; @end lisp

(define (soundex-nara/prefixing str) (soundex/prefixing/narahw str #t))

(define (soundex-old/prefixing  str) (soundex/prefixing/narahw str #f))

(define (soundex/p str) (soundex-nara/prefixing str))

;;; @unnumberedsec History

;;; @table @asis
;;;
;;; @item Version 0.3 --- 2009-02-24 -- PLaneT @code{(1 0)}
;;; Licensed under LGPL 3.  Converted to author's new Scheme administration
;;; system.  Made test suite executable.  Minor documentation changes.
;;;
;;; @item Version 0.2 --- 2004-08-02
;;; Minor documentation change.  Version frozen for PLaneT packaging.
;;;
;;; @item Version 0.1 --- 2004-05-10
;;; First release.
;;;
;;; @end table

(provide
 char->soundex-code
 soundex
 soundex-nara
 soundex-nara/prefixing
 soundex-old
 soundex-old/prefixing
 soundex-ordinal
 soundex-ordinal->char
 soundex-ordinal->soundex-code
 soundex-prefix-starts
 soundex/narahw/start
 soundex/p
 soundex/prefixing/narahw)