;;; @Package soundex ;;; @Subtitle Soundex Index Keying in Scheme ;;; @HomePage http://www.neilvandyke.org/soundex-scheme/ ;;; @Author Neil Van Dyke ;;; @Version 0.5 ;;; @Date 2009-02-24 ;;; @PLaneT neil/soundex:1:2 ;; $Id: soundex.ss,v 1.44 2009/02/24 13:32:04 neilpair Exp $ ;;; @legal ;;; Copyright @copyright{} 2004--2009 Neil Van Dyke. This program is Free ;;; Software; you can redistribute it and/or modify it under the terms of the ;;; GNU Lesser General Public License as published by the Free Software ;;; Foundation; either version 3 of the License (LGPL 3), or (at your option) ;;; any later version. This program is distributed in the hope that it will be ;;; useful, but without any warranty; without even the implied warranty of ;;; merchantability or fitness for a particular purpose. See ;;; @indicateurl{http://www.gnu.org/licenses/} for details. For other licenses ;;; and consulting, please contact the author. ;;; @end legal #lang scheme/base ;;; @section Introduction ;;; The @b{soundex} library provides an implementation in Scheme of the Soundex ;;; indexing hash function as specified somewhat loosely by US National ;;; Archives and Records Administration (NARA) publication [Soundex], and ;;; verified empirically against test cases from various sources. Both the ;;; current NARA function and the older version with different handling of `H' ;;; and `W' are supported. ;;; ;;; Additionally, a nonstandard prefix-guessing function that is an invention ;;; of this lirbray function permits additional Soundex keys to be generated ;;; from a string, increasing recall. ;;; ;;; This library should work under any R5RS Scheme implementation for which ;;; @code{char->integer} yields ASCII values. ;;; ;;; @itemize ;;; ;;; @item ;;; [GIL-55] US National Archives and Records Administration, ``Using the ;;; Census Soundex,'' General Information Leaflet 55, 1995. ;;; ;;; @item ;;; [Soundex] US National Archives and Records Administration, ``The Soundex ;;; Indexing System,'' 2000-02-19. ;;; ;;; @end itemize ;;; @section Characters, Ordinals, and Codes ;;; To facilitate possible future support of other input character sets, this ;;; library employs a @dfn{character ordinal} abstract representation of the ;;; letters used by Soundex. The ordinal value is an integer from 0 to ;;; 25---corresponding to the 26 letters `A' through `Z', respectively---and ;;; can be used for fast mapping via vectors. Most applications need not be ;;; aware of this. ;;; @defproc soundex-ordinal chr ;;; ;;; Yields the Soundex ordinal value of character @var{chr}, of @var{#f} if the ;;; character is not considered a letter. ;;; ;;; @lisp ;;; (soundex-ordinal #\a) @result{} 0 ;;; (soundex-ordinal #\A) @result{} 0 ;;; (soundex-ordinal #\Z) @result{} 25 ;;; (soundex-ordinal #\3) @result{} #f ;;; (soundex-ordinal #\.) @result{} #f ;;; @end lisp (define (soundex-ordinal chr) (let ((x (char->integer chr))) (cond ((< x 65) #f) ((< x 91) (- x 65)) ((< x 97) #f) ((< x 123) (- x 97)) (else #f)))) ;;; @defproc soundex-ordinal->char ord ;;; ;;; Yields the upper-case letter character that corresponds to the character ;;; ordinal value @var{ord}. For example: ;;; ;;; @lisp ;;; (soundex-ordinal->char (soundex-ordinal #\a)) @result{} #\A ;;; @end lisp ;;; ;;; Note that an @code{#f} value as a result of applying @code{soundex-ordinal} ;;; is @emph{not} an ordinal value, and is not mapped to a character by ;;; @code{soundex-ordinal->char}. For example: ;;; ;;; @lisp ;;; (soundex-ordinal->char (soundex-ordinal #\')) @error{} ;;; @end lisp (define soundex-ordinal->char (let ((letters '#(#\A #\B #\C #\D #\E #\F #\G #\H #\I #\J #\K #\L #\M #\N #\O #\P #\Q #\R #\S #\T #\U #\V #\W #\X #\Y #\Z))) (lambda (ord) (vector-ref letters ord)))) ;;; @defproc soundex-ordinal->soundex-code ord ;;; ;;; Yields a library-specific Soundex code for character ordinal @var{ord}. ;;; ;;; @lisp ;;; (soundex-ordinal->soundex-code (soundex-ordinal #\a)) @result{} aeiou ;;; (soundex-ordinal->soundex-code (soundex-ordinal #\c)) @result{} #\2 ;;; (soundex-ordinal->soundex-code (soundex-ordinal #\N)) @result{} #\5 ;;; (soundex-ordinal->soundex-code (soundex-ordinal #\w)) @result{} hw ;;; (soundex-ordinal->soundex-code (soundex-ordinal #\y)) @result{} y ;;; @end lisp (define soundex-ordinal->soundex-code (let ((code-vector '#(aeiou #\1 #\2 #\3 aeiou #\1 #\2 hw aeiou #\2 #\2 #\4 #\5 #\5 aeiou #\1 #\2 #\6 #\2 #\3 aeiou #\1 hw #\2 y #\2))) (lambda (ord) (if ord (vector-ref code-vector ord) #f)))) ;;; @defproc char->soundex-code chr ;;; ;;; Yields a library-specific Soundex code for character @var{chr}. This is ;;; equivalent to: @code{(soundex-ordinal->soundex-code (soundex-ordinal ;;; @var{chr}))}. (define (char->soundex-code chr) (soundex-ordinal->soundex-code (soundex-ordinal chr))) ;;; @section Hashing ;;; Soundex hashes of strings can be generated with @code{soundex-nara}, ;;; @code{soundex-old}, and @code{soundex}. ;; @defproc %soundex/narahw/start str narahw? start ;; ;; This is an internal procedure. ;; ;; @lisp ;; (%soundex/narahw/start "van Dam" #t 4) @result{} "D500" ;; (%soundex/narahw/start ".0,!" #t 0) @result{} #f ;; @end lisp (define (%soundex/narahw/start str narahw? start) (let ((len (string-length str))) (let find-first-alpha ((i start)) (if (>= i len) #f (let* ((ord (soundex-ordinal (string-ref str i)))) (if ord (let ((result (make-string 4 #\0)) (result-used 1)) (string-set! result 0 (soundex-ordinal->char ord)) (let scan ((i (+ 1 i)) (prior-code (soundex-ordinal->soundex-code ord))) (if (>= i len) result (let ((code (char->soundex-code (string-ref str i)))) (case code ((aeiou y) (scan (+ 1 i) code)) ((hw) (scan (+ 1 i) (if narahw? prior-code code))) ((#\1 #\2 #\3 #\4 #\5 #\6) (if (eqv? code prior-code) (scan (+ 1 i) prior-code) (begin (string-set! result result-used code) (if (= result-used 3) result (begin (set! result-used (+ 1 result-used)) (scan (+ 1 i) code)))))) (else (scan (+ 1 i) #f))))))) (find-first-alpha (+ 1 i)))))))) ;;; @defproc soundex-nara str ;;; @defprocx soundex-old str ;;; @defprocx soundex str ;;; ;;; Yields a Soundex hash key of string @var{str}, or @code{#f} if not even an ;;; initial letter could be found. @code{soundex-nara} generates NARA hashes, ;;; and @code{soundex-old} generates older-style hashes. @code{soundex} is an ;;; alias for @code{soundex-nara}. ;;; ;;; @lisp ;;; (soundex-nara "Ashcraft") @result{} "A261" ;;; (soundex-old "Ashcraft") @result{} "A226" ;;; (soundex "Ashcraft") @result{} "A261" ;;; (soundex "") @result{} #f ;;; @end lisp (define (soundex-nara str) (%soundex/narahw/start str #t 0)) (define (soundex-old str) (%soundex/narahw/start str #f 0)) (define (soundex str) (soundex-nara str)) ;;; @section Prefixing ;;; Multiple Soundex hashes from a single string can be generated by ;;; @code{soundex-nara/prefixing}, @code{soundex-old/prefixing}, and ;;; @code{soundex/p}, which consider the string with and without various common ;;; surname prefixes. ;;; @defproc soundex-prefix-starts str ;;; ;;; Yields a list of Soundex start points in string @var{str}, as character ;;; index integers, for making hash keys with and without prefixes. A prefix ;;; must be followed by at least two letters, although they can be interspersed ;;; with non-letter characters. The exact behavior of this function is subject ;;; to change in future versions of this library. ;;; ;;; @lisp ;;; (soundex-prefix-starts "Smith") @result{} (0) ;;; (soundex-prefix-starts " Jones") @result{} (2) ;;; (soundex-prefix-starts "vanderlinden") @result{} (0 3 6) ;;; (soundex-prefix-starts "van der linden") @result{} (0 3 7) ;;; (soundex-prefix-starts "") @result{} () ;;; (soundex-prefix-starts "123") @result{} () ;;; (soundex-prefix-starts "dea") @result{} (0) ;;; (soundex-prefix-starts "dea ") @result{} (0) ;;; (soundex-prefix-starts "dean") @result{} (0) ;;; (soundex-prefix-starts "delasol") @result{} (0 2 3 4) ;;; @end lisp (define (soundex-prefix-starts str) ;; TODO: Maybe someday find a really elegant way to integrate this into the ;; coding pass, or cache the ordinals. At the same time, make it ;; data-driven, so that it's easier to make a prefixing constructor ;; from a user-provided list of prefixes. (letrec ((len (string-length str)) (i 0) (ord #f) (next-ord (lambda () (if (= i len) 'end (begin (set! ord (soundex-ordinal (string-ref str i))) (set! i (+ 1 i)) (or ord (next-ord)))))) (trailed? (lambda () (let ((saved-i i) (result (let loop ((needed 2)) (if (> needed 0) (case (next-ord) ((end) #f) ((#f) (loop needed)) (else (loop (- needed 1)))) #t)))) (set! i saved-i) result)))) (let find-first () (case (next-ord) ((end) '()) ((#f) (find-first)) (else ;; A=0 B=1 C=2 D=3 E=4 F=5 G=6 H=7 I=8 J=9 K=10 L=11 M=12 ;; N=13 O=14 P=15 Q=16 R=17 S=18 T=19 U=20 V=21 W=22 X=23 Y=24 Z=25 (cons (- i 1) (case ord ((2) ;; C (if (and (eq? (next-ord) 14) ;; (C)O (eq? (next-ord) 13) ;; (CO)N (trailed?)) (list i) '())) ((3) ;; D (case (next-ord) ((4) ;; (D)E (if (trailed?) (cons i (case (next-ord) ((11) ;; (DE)L (if (trailed?) (cons i (if (and (eq? (next-ord) 0) ;; (DEL)A (trailed?)) (list i) '())) '())) ((18) ;; (DE)S (if (trailed?) (list i) '())) (else '()))) '())) ((8 20) ;; (D)I, (D)U (if (trailed?) (list i) '())) (else '()))) ((11) ;; L (case (next-ord) ((0 4) ;; (L)A, (L)E (if (trailed?) (list i) '())) (else '()))) ((21) ;; V (case (next-ord) ((0 14) ;; (V)A, (V)O (if (eq? (next-ord) 13) ;; (V*)N (cons i (if (and (eq? (next-ord) 3) ;; (V*N)D (eq? (next-ord) 4) ;; (V*ND)E (trailed?)) (case (next-ord) ((13 17) ;; (V*NDE)N, (V*NDE)R (if (trailed?) (list i) '())) (else '())) '())) '())) (else '()))) (else '())))))))) ;; @defproc soundex/narahw str narahw? ;; ;; This is an internal procedure. (define (%soundex/prefixing/narahw str narahw?) (let ((result '())) (for-each (lambda (start) (let ((sx (%soundex/narahw/start str narahw? start))) (and sx (not (member sx result)) (set! result (cons sx result))))) (soundex-prefix-starts str)) (reverse result))) ;;; @defproc soundex-nara/prefixing str ;;; @defprocx soundex-old/prefixing str ;;; @defprocx soundex/p str ;;; ;;; Yields a list of zero or more Soundex hash keys from string @var{str}, ;;; based on the whole string and the string with various prefixes skipped. ;;; All elements of the list are mutually unique. ;;; @code{soundex-nara/prefixing} generates NARA hashes, and ;;; @code{soundex-old/prefixing} generates older-style hashes. ;;; @code{soundex/p} is an alias for @code{soundex-nara/prefixing}. ;;; ;;; @lisp ;;; (soundex/p "Van Damme") @result{} ("V535" "D500") ;;; (soundex/p "vanvoom") @result{} ("V515" "V500") ;;; (soundex/p "vanvanvan") @result{} ("V515") ;;; (soundex/p "DeLaSol") @result{} ("D424" "L240" "A240" "S400") ;;; (soundex/p "") @result{} () ;;; @end lisp (define (soundex-nara/prefixing str) (%soundex/prefixing/narahw str #t)) (define (soundex-old/prefixing str) (%soundex/prefixing/narahw str #f)) (define (soundex/p str) (soundex-nara/prefixing str)) ;;; @unnumberedsec History ;;; @table @asis ;;; ;;; @item Version 0.5 --- 2009-02-24 -- PLaneT @code{(1 2)} ;;; Ahem. ;;; ;;; @item Version 0.4 --- 2009-02-24 -- PLaneT @code{(1 1)} ;;; Removed internal-use-only procedures from documentation. ;;; ;;; @item Version 0.3 --- 2009-02-24 -- PLaneT @code{(1 0)} ;;; Licensed under LGPL 3. Converted to author's new Scheme administration ;;; system. Made test suite executable. Minor documentation changes. ;;; ;;; @item Version 0.2 --- 2004-08-02 ;;; Minor documentation change. Version frozen for PLaneT packaging. ;;; ;;; @item Version 0.1 --- 2004-05-10 ;;; First release. ;;; ;;; @end table (provide char->soundex-code soundex soundex-nara soundex-nara/prefixing soundex-old soundex-old/prefixing soundex-ordinal soundex-ordinal->char soundex-ordinal->soundex-code soundex-prefix-starts soundex/p)