#lang racket/base
(require racket/contract
(planet neil/mcfly))
(doc (section "Introduction"))
(doc "The "
(bold "soundex")
" library provides an implementation in Racket of the Soundex indexing
hash function as specified somewhat loosely by US National Archives and
Records Administration (NARA) publication [Soundex], and verified
empirically against test cases from various sources. Both the current
NARA function and the older version with different handling of `H' and
`W' are supported.")
(doc (itemlist
(item "[GIL-55] US National Archives and Records Administration, ``Using
the Census Soundex,'' General Information Leaflet 55, 1995.")
(item "[Soundex] US National Archives and Records Administration, ``The
Soundex Indexing System,'' 2000-02-19.")))
(doc "Additionally, a nonstandard prefix-guessing function that is an invention
of this package permits additional Soundex keys to be generated from a
string,increasing recall.")
(doc (section "Characters, Ordinals, and Codes"))
(doc "To facilitate possible future support of other input character sets, this
library employs a "
(italic "character ordinal")
" abstract representation of the letters used by Soundex. The ordinal
value is an integer from 0 to 25---corresponding to the 26 letters `A'
through `Z', respectively---and can be used for fast mapping via vectors.
Most applications need not be aware of this.")
(doc procedure soundex-ordinal?
"Predicate for whether or not "
(racket x)
" is a Soundex ordinal.")
(provide/contract (soundex-ordinal? (-> any/c boolean?)))
(define (soundex-ordinal? x)
(and (integer? x)
(<= 0 x 25)
#t))
(doc procedure soundex-ordinal
"Yields the Soundex ordinal value of character "
(racket chr)
", or "
(racket #f)
" if the character is not considered a letter."
(racketinput (soundex-ordinal #\a)
#,(racketresult 0))
(racketinput (soundex-ordinal #\A)
#,(racketresult 0))
(racketinput (soundex-ordinal #\Z)
#,(racketresult 25))
(racketinput (soundex-ordinal #\3)
#,(racketresult #f))
(racketinput (soundex-ordinal #\.)
#,(racketresult #f)))
(provide/contract (soundex-ordinal (-> char? (or/c soundex-ordinal? #f))))
(define (soundex-ordinal chr)
(let ((x (char->integer chr)))
(cond ((< x 65) #f)
((< x 91) (- x 65))
((< x 97) #f)
((< x 123) (- x 97))
(else #f))))
(doc procedure soundex-ordinal->char
"Yields the upper-case letter character that corresponds to the character
ordinal value "
(racket ord)
". For example:"
(racketinput (soundex-ordinal->char (soundex-ordinal #\a))
#,(racketresult #\A))
(para "Note that a "
(racket #f)
" value as a result of applying "
(racket soundex-ordinal)
" is "
(italic "not")
" an ordinal value, and is not mapped to a character by "
(racket soundex-ordinal->char)
". For example:")
(racketinput (soundex-ordinal->char (soundex-ordinal #\'))
#,(racketerror
"soundex-ordinal->char: contract violation, expected: soundex-ordinal?, given: #f")))
(provide/contract (soundex-ordinal->char (-> soundex-ordinal? char?)))
(define soundex-ordinal->char
(let ((letters
'#(#\A #\B #\C #\D #\E #\F #\G #\H #\I #\J #\K #\L #\M
#\N #\O #\P #\Q #\R #\S #\T #\U #\V #\W #\X #\Y #\Z)))
(lambda (ord)
(vector-ref letters ord))))
(doc procedure soundex-code?
"Predicate for whether or not "
(racket x)
" is a Soundex code.")
(provide/contract (soundex-code? (-> any/c boolean?)))
(define (soundex-code? x)
(and (memq x '(aeiou hw y #\1 #\2 #\3 #\4 #\5 #\6))
#t))
(doc procedure soundex-ordinal->soundex-code
(para "Yields a library-specific Soundex code for character ordinal "
(racket ord)
".")
(racketinput (soundex-ordinal->soundex-code (soundex-ordinal #\a))
#,(racketresult aeiou))
(racketinput (soundex-ordinal->soundex-code (soundex-ordinal #\c))
#,(racketresult #\2))
(racketinput (soundex-ordinal->soundex-code (soundex-ordinal #\N))
#,(racketresult #\5))
(racketinput (soundex-ordinal->soundex-code (soundex-ordinal #\w))
#,(racketresult hw))
(racketinput (soundex-ordinal->soundex-code (soundex-ordinal #\y))
#,(racketresult y)))
(provide/contract (soundex-ordinal->soundex-code (-> soundex-ordinal? soundex-code?)))
(define soundex-ordinal->soundex-code
(let ((code-vector
'#(aeiou #\1 #\2 #\3 aeiou #\1 #\2 hw aeiou #\2 #\2 #\4 #\5
#\5 aeiou #\1 #\2 #\6 #\2 #\3 aeiou #\1 hw #\2 y #\2)))
(lambda (ord)
(if ord (vector-ref code-vector ord) #f))))
(doc procedure char->soundex-code
"Yields a library-specific Soundex code for character "
(racket chr)
". This is equivalent to: "
(racketblock (soundex-ordinal->soundex-code (soundex-ordinal #,(italic (racket chr))))))
(provide/contract (char->soundex-code (-> char? soundex-code?)))
(define (char->soundex-code chr)
(soundex-ordinal->soundex-code (soundex-ordinal chr)))
(doc (section "Hashing"))
(doc "Soundex hashes of strings can be generated with "
(racket soundex-nara)
", "
(racket soundex-old)
", and "
(racket soundex)
".")
(define (%soundex/narahw/start str narahw? start)
(let ((len (string-length str)))
(let find-first-alpha ((i start))
(if (>= i len)
#f
(let* ((ord (soundex-ordinal (string-ref str i))))
(if ord
(let ((result (make-string 4 #\0))
(result-used 1))
(string-set! result 0 (soundex-ordinal->char ord))
(let scan ((i (+ 1 i))
(prior-code (soundex-ordinal->soundex-code ord)))
(if (>= i len)
result
(let ((code (char->soundex-code (string-ref str i))))
(case code
((aeiou y) (scan (+ 1 i) code))
((hw) (scan (+ 1 i) (if narahw? prior-code code)))
((#\1 #\2 #\3 #\4 #\5 #\6)
(if (eqv? code prior-code)
(scan (+ 1 i) prior-code)
(begin (string-set! result result-used code)
(if (= result-used 3)
result
(begin (set! result-used
(+ 1 result-used))
(scan (+ 1 i) code))))))
(else (scan (+ 1 i) #f)))))))
(find-first-alpha (+ 1 i))))))))
(doc procedures (soundex-nara soundex-old soundex)
"Yields a Soundex hash key of string "
(racket str)
", or "
(racket #f)
" if not even an initial letter could be found. "
(racket soundex-nara)
" generates NARA hashes,and "
(racket soundex-old)
" generates older-style hashes. "
(racket soundex)
" is an alias for "
(racket soundex-nara)
"."
(racketinput (soundex-nara "Ashcraft")
#,(racketresult "A261"))
(racketinput (soundex-old "Ashcraft")
#,(racketresult "A226"))
(racketinput (soundex "Ashcraft")
#,(racketresult "A261"))
(racketinput (soundex "")
#,(racketresult #f)))
(provide/contract (soundex-nara (-> string? string?))
(soundex-old (-> string? string?))
(soundex (-> string? string?)))
(define (soundex-nara str) (%soundex/narahw/start str #t 0))
(define (soundex-old str) (%soundex/narahw/start str #f 0))
(define (soundex str) (soundex-nara str))
(doc (section "Prefixing"))
(doc "Multiple Soundex hashes from a single string can be generated by "
(racket soundex-nara/prefixing)
", "
(racket soundex-old/prefixing)
", and "
(racket soundex/p)
", which consider the string with and without various common surname prefixes.")
(doc procedure soundex-prefix-starts
"Yields a list of Soundex start points in string "
(racket str)
", as character index integers, for making hash keys with and without
prefixes. A prefix must be followed by at least two letters, although
they can be interspersed with non-letter characters. The exact behavior
of this function is subject to change in future versions of this library."
(racketinput (soundex-prefix-starts "Smith")
#,(racketresult (0)))
(racketinput (soundex-prefix-starts " Jones")
#,(racketresult (2)))
(racketinput (soundex-prefix-starts "vanderlinden")
#,(racketresult (0 3 6)))
(racketinput (soundex-prefix-starts "van der linden")
#,(racketresult (0 3 7)))
(racketinput (soundex-prefix-starts "")
#,(racketresult ()))
(racketinput (soundex-prefix-starts "123")
#,(racketresult ()))
(racketinput (soundex-prefix-starts "dea")
#,(racketresult (0)))
(racketinput (soundex-prefix-starts "dea ")
#,(racketresult (0)))
(racketinput (soundex-prefix-starts "dean")
#,(racketresult (0)))
(racketinput (soundex-prefix-starts "delasol")
#,(racketresult (0 2 3 4))))
(provide/contract (soundex-prefix-starts (-> string? (listof exact-nonnegative-integer?))))
(define (soundex-prefix-starts str)
(letrec ((len (string-length str))
(i 0)
(ord #f)
(next-ord
(lambda ()
(if (= i len)
'end
(begin (set! ord (soundex-ordinal (string-ref str i)))
(set! i (+ 1 i))
(or ord (next-ord))))))
(trailed?
(lambda ()
(let ((saved-i i)
(result (let loop ((needed 2))
(if (> needed 0)
(case (next-ord)
((end) #f)
((#f) (loop needed))
(else (loop (- needed 1))))
#t))))
(set! i saved-i)
result))))
(let find-first ()
(case (next-ord)
((end) '())
((#f) (find-first))
(else
(cons (- i 1)
(case ord
((2) (if (and (eq? (next-ord) 14) (eq? (next-ord) 13) (trailed?))
(list i)
'()))
((3) (case (next-ord)
((4) (if (trailed?)
(cons i
(case (next-ord)
((11) (if (trailed?)
(cons i
(if (and (eq? (next-ord) 0)
(trailed?))
(list i)
'()))
'()))
((18) (if (trailed?) (list i) '()))
(else '())))
'()))
((8 20) (if (trailed?) (list i) '()))
(else '())))
((11) (case (next-ord)
((0 4) (if (trailed?) (list i) '()))
(else '())))
((21) (case (next-ord)
((0 14) (if (eq? (next-ord) 13) (cons i (if (and (eq? (next-ord) 3) (eq? (next-ord) 4) (trailed?))
(case (next-ord)
((13 17) (if (trailed?) (list i) '()))
(else '()))
'()))
'()))
(else '())))
(else '()))))))))
(define (%soundex/prefixing/narahw str narahw?)
(let ((result '()))
(for-each (lambda (start)
(let ((sx (%soundex/narahw/start str narahw? start)))
(and sx
(not (member sx result))
(set! result (cons sx result)))))
(soundex-prefix-starts str))
(reverse result)))
(doc procedures (soundex-nara/prefixing soundex-old/prefixing soundex/p)
"Yields a list of zero or more Soundex hash keys from string "
(racket str)
" based on the whole string and the string with various prefixes skipped.
All elements of the list are mutually unique. "
(racket soundex-nara/prefixing)
" generates NARA hashes, and "
(racket soundex-old/prefixing)
" generates older-style hashes. "
(racket soundex/p)
" is an alias for "
(racket soundex-nara/prefixing)
"."
(racketinput (soundex/p "Van Damme")
#,(racketresult ("V535" "D500")))
(racketinput (soundex/p "vanvoom")
#,(racketresult ("V515" "V500")))
(racketinput (soundex/p "vanvanvan")
#,(racketresult ("V515")))
(racketinput (soundex/p "DeLaSol")
#,(racketresult ("D424" "L240" "A240" "S400")))
(racketinput (soundex/p "")
#,(racketresult ())))
(provide/contract (soundex-nara/prefixing (-> string? (listof string?)))
(soundex-old/prefixing (-> string? (listof string?)))
(soundex/p (-> string? (listof string?))))
(define (soundex-nara/prefixing str) (%soundex/prefixing/narahw str #t))
(define (soundex-old/prefixing str) (%soundex/prefixing/narahw str #f))
(define (soundex/p str) (soundex-nara/prefixing str))
(doc history
(#:planet 2:0 #:date "2012-06-12"
"Converted to McFly and Overeasy. Added contracts. Changed
references from Scheme to Racket.")
(#:version "0.6" #:planet 1:3 #:date "2009-03-14"
"Documentation fix.")
(#:version "0.5" #:planet 1:2 #:date "2009-02-24"
"Ahem.")
(#:version "0.4" #:planet 1:1 #:date "2009-02-24"
"Removed internal-use-only procedures from documentation.")
(#:version "0.3" #:planet 1:0 #:date "2009-02-24"
"Licensed under LGPL 3. Converted to author's new Scheme
administration system. Made test suite executable. Minor
documentation changes.")
(#:version "0.2" #:date "2004-08-02"
"Minor documentation change. Version frozen for PLaneT
packaging.")
(#:version "0.1" #:date "2004-05-10"
"First release."))