main.rkt
#lang racket/base
;; For legal info, see file "info.rkt".

(require racket/contract
         (planet neil/mcfly))

;; (doc (define-syntax ==>
;;        (syntax-rules ()
;;          ((_ RESULT0 RESULTn ...)
;;           (span (larger (italic "\u21d2"))
;;                 (racketinput RESULT0 RESULTn ...))))))
;;            
;; (doc (define ==> (span (larger (italic "\u21d2")))))
             
(doc (section "Introduction"))

(doc "The "
     (bold "soundex")
     " library provides an implementation in Racket of the Soundex indexing
      hash function as specified somewhat loosely by US National Archives and
      Records Administration (NARA) publication [Soundex], and verified
      empirically against test cases from various sources.  Both the current
      NARA function and the older version with different handling of `H' and
      `W' are supported.")

(doc (itemlist

      (item "[GIL-55] US National Archives and Records Administration, ``Using
             the Census Soundex,'' General Information Leaflet 55, 1995.")

      (item "[Soundex] US National Archives and Records Administration, ``The
             Soundex Indexing System,'' 2000-02-19.")))

(doc "Additionally, a nonstandard prefix-guessing function that is an invention
      of this package permits additional Soundex keys to be generated from a
      string,increasing recall.")

(doc (section "Characters, Ordinals, and Codes"))

(doc "To facilitate possible future support of other input character sets, this
      library employs a "
     (italic "character ordinal")
     " abstract representation of the letters used by Soundex.  The ordinal
      value is an integer from 0 to 25---corresponding to the 26 letters `A'
      through `Z', respectively---and can be used for fast mapping via vectors.
      Most applications need not be aware of this.")

(doc procedure soundex-ordinal?
     "Predicate for whether or not "
     (racket x)
     " is a Soundex ordinal.")
(provide/contract (soundex-ordinal? (-> any/c boolean?)))
(define (soundex-ordinal? x)
  (and (integer? x)
       (<= 0 x 25)
       #t))

(doc procedure soundex-ordinal
     "Yields the Soundex ordinal value of character "
     (racket chr)
     ", or "
     (racket #f)
     " if the character is not considered a letter."
     (racketinput (soundex-ordinal #\a)
                  #,(racketresult 0))
     (racketinput (soundex-ordinal #\A)
                  #,(racketresult 0))
     (racketinput (soundex-ordinal #\Z)
                  #,(racketresult 25))
     (racketinput (soundex-ordinal #\3)
                  #,(racketresult #f))
     (racketinput (soundex-ordinal #\.)
                  #,(racketresult #f)))
(provide/contract (soundex-ordinal (-> char? (or/c soundex-ordinal? #f))))
(define (soundex-ordinal chr)
  (let ((x (char->integer chr)))
    (cond ((< x 65)  #f)
          ((< x 91)  (- x 65))
          ((< x 97)  #f)
          ((< x 123) (- x 97))
          (else      #f))))

(doc procedure soundex-ordinal->char
     "Yields the upper-case letter character that corresponds to the character
      ordinal value "
     (racket ord)
     ".  For example:"
     (racketinput (soundex-ordinal->char (soundex-ordinal #\a))
                  #,(racketresult #\A))
     (para "Note that a "
           (racket #f)
           " value as a result of applying "
           (racket soundex-ordinal)
           " is "
           (italic "not")
           " an ordinal value, and is not mapped to a character by "
           (racket soundex-ordinal->char)
           ".  For example:")
     (racketinput (soundex-ordinal->char (soundex-ordinal #\'))
                  #,(racketerror
                     "soundex-ordinal->char: contract violation, expected: soundex-ordinal?, given: #f")))
(provide/contract (soundex-ordinal->char (-> soundex-ordinal? char?)))
(define soundex-ordinal->char
  (let ((letters
         '#(#\A #\B #\C #\D #\E #\F #\G #\H #\I #\J #\K #\L #\M
            #\N #\O #\P #\Q #\R #\S #\T #\U #\V #\W #\X #\Y #\Z)))
    (lambda (ord)
      (vector-ref letters ord))))

(doc procedure soundex-code?
     "Predicate for whether or not "
     (racket x)
     " is a Soundex code.")
(provide/contract (soundex-code? (-> any/c boolean?)))
(define (soundex-code? x)
  (and (memq x '(aeiou hw y #\1 #\2 #\3 #\4 #\5 #\6))
       #t))

(doc procedure soundex-ordinal->soundex-code
     (para "Yields a library-specific Soundex code for character ordinal "
           (racket ord)
           ".")
     (racketinput (soundex-ordinal->soundex-code (soundex-ordinal #\a))
                  #,(racketresult aeiou))
     (racketinput (soundex-ordinal->soundex-code (soundex-ordinal #\c))
                  #,(racketresult #\2))
     (racketinput (soundex-ordinal->soundex-code (soundex-ordinal #\N))
                  #,(racketresult #\5))
     (racketinput (soundex-ordinal->soundex-code (soundex-ordinal #\w))
                  #,(racketresult hw))
     (racketinput (soundex-ordinal->soundex-code (soundex-ordinal #\y))
                  #,(racketresult y)))
(provide/contract (soundex-ordinal->soundex-code (-> soundex-ordinal? soundex-code?)))
(define soundex-ordinal->soundex-code
  (let ((code-vector
         '#(aeiou #\1 #\2 #\3 aeiou #\1 #\2 hw aeiou #\2 #\2 #\4 #\5
                  #\5 aeiou #\1 #\2 #\6 #\2 #\3 aeiou #\1 hw #\2 y #\2)))
    (lambda (ord)
      (if ord (vector-ref code-vector ord) #f))))

(doc procedure char->soundex-code
     "Yields a library-specific Soundex code for character "
     (racket chr)
     ".  This is equivalent to: "
     (racketblock (soundex-ordinal->soundex-code (soundex-ordinal #,(italic (racket chr))))))
(provide/contract (char->soundex-code (-> char? soundex-code?)))
(define (char->soundex-code chr)
  (soundex-ordinal->soundex-code (soundex-ordinal chr)))

(doc (section "Hashing"))

(doc "Soundex hashes of strings can be generated with "
     (racket soundex-nara)
     ", "
     (racket soundex-old)
     ", and "
     (racket soundex)
     ".")

(define (%soundex/narahw/start str narahw? start)
  (let ((len (string-length str)))
    (let find-first-alpha ((i start))
      (if (>= i len)
          #f
          (let* ((ord (soundex-ordinal (string-ref str i))))
            (if ord
                (let ((result      (make-string 4 #\0))
                      (result-used 1))
                  (string-set! result 0 (soundex-ordinal->char ord))
                  (let scan ((i          (+ 1 i))
                             (prior-code (soundex-ordinal->soundex-code ord)))
                    (if (>= i len)
                        result
                        (let ((code (char->soundex-code (string-ref str i))))
                          (case code
                            ((aeiou y) (scan (+ 1 i) code))
                            ((hw) (scan (+ 1 i) (if narahw? prior-code code)))
                            ((#\1 #\2 #\3 #\4 #\5 #\6)
                             (if (eqv? code prior-code)
                                 (scan (+ 1 i) prior-code)
                                 (begin (string-set! result result-used code)
                                        (if (= result-used 3)
                                            result
                                            (begin (set! result-used
                                                         (+ 1 result-used))
                                                   (scan (+ 1 i) code))))))
                            (else (scan (+ 1 i) #f)))))))
                (find-first-alpha (+ 1 i))))))))

(doc procedures (soundex-nara soundex-old soundex)
     "Yields a Soundex hash key of string "
     (racket str)
     ", or "
     (racket #f)
     " if not even an initial letter could be found.  "
     (racket soundex-nara)
     " generates NARA hashes,and "
     (racket soundex-old)
     " generates older-style hashes.  "
     (racket soundex)
     " is an alias for "
     (racket soundex-nara)
     "."
     (racketinput (soundex-nara "Ashcraft")
                  #,(racketresult "A261"))
     (racketinput (soundex-old  "Ashcraft")
                  #,(racketresult "A226"))
     (racketinput (soundex      "Ashcraft")
                  #,(racketresult "A261"))
     (racketinput (soundex      "")
                  #,(racketresult #f)))
(provide/contract (soundex-nara (-> string? string?))
                  (soundex-old  (-> string? string?))
                  (soundex      (-> string? string?)))
(define (soundex-nara str) (%soundex/narahw/start str #t 0))
(define (soundex-old  str) (%soundex/narahw/start str #f 0))
(define (soundex str) (soundex-nara str))

(doc (section "Prefixing"))

(doc "Multiple Soundex hashes from a single string can be generated by "
     (racket soundex-nara/prefixing)
     ", "
     (racket soundex-old/prefixing)
     ", and "
     (racket soundex/p)
     ", which consider the string with and without various common surname prefixes.")

(doc procedure soundex-prefix-starts
     "Yields a list of Soundex start points in string "
     (racket str)
     ", as character index integers, for making hash keys with and without
      prefixes.  A prefix must be followed by at least two letters, although
      they can be interspersed with non-letter characters.  The exact behavior
      of this function is subject to change in future versions of this library."
     (racketinput (soundex-prefix-starts "Smith")
                  #,(racketresult (0)))
     (racketinput (soundex-prefix-starts "  Jones")
                  #,(racketresult (2)))
     (racketinput (soundex-prefix-starts "vanderlinden")
                  #,(racketresult (0 3 6)))
     (racketinput (soundex-prefix-starts "van der linden")
                  #,(racketresult (0 3 7)))
     (racketinput (soundex-prefix-starts "")
                  #,(racketresult ()))
     (racketinput (soundex-prefix-starts "123")
                  #,(racketresult ()))
     (racketinput (soundex-prefix-starts "dea")
                  #,(racketresult (0)))
     (racketinput (soundex-prefix-starts "dea ")
                  #,(racketresult (0)))
     (racketinput (soundex-prefix-starts "dean")
                  #,(racketresult (0)))
     (racketinput (soundex-prefix-starts "delasol")
                  #,(racketresult (0 2 3 4))))
(provide/contract (soundex-prefix-starts (-> string? (listof exact-nonnegative-integer?))))
(define (soundex-prefix-starts str)
  ;; TODO: Maybe someday find a really elegant way to integrate this into the
  ;;       coding pass, or cache the ordinals.  At the same time, make it
  ;;       data-driven, so that it's easier to make a prefixing constructor
  ;;       from a user-provided list of prefixes.
  (letrec ((len (string-length str))
           (i   0)
           (ord #f)
           (next-ord
            (lambda ()
              (if (= i len)
                  'end
                  (begin (set! ord (soundex-ordinal (string-ref str i)))
                         (set! i (+ 1 i))
                         (or ord (next-ord))))))
           (trailed?
            (lambda ()
              (let ((saved-i i)
                    (result  (let loop ((needed 2))
                               (if (> needed 0)
                                   (case (next-ord)
                                     ((end) #f)
                                     ((#f)  (loop needed))
                                     (else  (loop (- needed 1))))
                                   #t))))
                (set! i saved-i)
                result))))
    (let find-first ()
      (case (next-ord)
        ((end) '())
        ((#f)  (find-first))
        (else
         ;; A=0  B=1  C=2  D=3  E=4  F=5  G=6  H=7  I=8  J=9  K=10 L=11 M=12
         ;; N=13 O=14 P=15 Q=16 R=17 S=18 T=19 U=20 V=21 W=22 X=23 Y=24 Z=25
         (cons (- i 1)
               (case ord
                 ((2) ;; C
                  (if (and (eq? (next-ord) 14) ;; (C)O
                           (eq? (next-ord) 13) ;; (CO)N
                           (trailed?))
                      (list i)
                      '()))
                 ((3) ;; D
                  (case (next-ord)
                    ((4) ;; (D)E
                     (if (trailed?)
                         (cons i
                               (case (next-ord)
                                 ((11) ;; (DE)L
                                  (if (trailed?)
                                      (cons i
                                            (if (and (eq? (next-ord) 0)
                                                     ;; (DEL)A
                                                     (trailed?))
                                                (list i)
                                                '()))
                                      '()))
                                 ((18) ;; (DE)S
                                  (if (trailed?) (list i) '()))
                                 (else '())))
                         '()))
                    ((8 20) ;; (D)I, (D)U
                     (if (trailed?) (list i) '()))
                    (else '())))
                 ((11) ;; L
                  (case (next-ord)
                    ((0 4) ;; (L)A, (L)E
                     (if (trailed?) (list i) '()))
                    (else '())))
                 ((21) ;; V
                  (case (next-ord)
                    ((0 14) ;; (V)A, (V)O
                     (if (eq? (next-ord) 13) ;; (V*)N
                         (cons i (if (and (eq? (next-ord) 3) ;; (V*N)D
                                          (eq? (next-ord) 4) ;; (V*ND)E
                                          (trailed?))
                                     (case (next-ord)
                                       ((13 17) ;; (V*NDE)N, (V*NDE)R
                                        (if (trailed?) (list i) '()))
                                       (else '()))
                                     '()))
                         '()))
                    (else '())))
                 (else '()))))))))

(define (%soundex/prefixing/narahw str narahw?)
  (let ((result '()))
    (for-each (lambda (start)
                (let ((sx (%soundex/narahw/start str narahw? start)))
                  (and sx
                       (not (member sx result))
                       (set! result (cons sx result)))))
              (soundex-prefix-starts str))
    (reverse result)))

(doc procedures (soundex-nara/prefixing soundex-old/prefixing soundex/p)
     "Yields a list of zero or more Soundex hash keys from string "
     (racket str)
     " based on the whole string and the string with various prefixes skipped.
      All elements of the list are mutually unique. "
     (racket soundex-nara/prefixing)
     " generates NARA hashes, and "
     (racket soundex-old/prefixing)
     " generates older-style hashes. "
     (racket soundex/p)
     " is an alias for "
     (racket soundex-nara/prefixing)
     "."
     (racketinput (soundex/p "Van Damme")
                  #,(racketresult ("V535" "D500")))
     (racketinput (soundex/p "vanvoom")
                  #,(racketresult ("V515" "V500")))
     (racketinput (soundex/p "vanvanvan")
                  #,(racketresult ("V515")))
     (racketinput (soundex/p "DeLaSol")
                  #,(racketresult ("D424" "L240" "A240" "S400")))
     (racketinput (soundex/p "")
                  #,(racketresult ())))
(provide/contract (soundex-nara/prefixing (-> string? (listof string?)))
                  (soundex-old/prefixing  (-> string? (listof string?)))
                  (soundex/p              (-> string? (listof string?))))
(define (soundex-nara/prefixing str) (%soundex/prefixing/narahw str #t))
(define (soundex-old/prefixing  str) (%soundex/prefixing/narahw str #f))
(define (soundex/p              str) (soundex-nara/prefixing str))

(doc history

     (#:planet 2:0 #:date "2012-06-12"
                "Converted to McFly and Overeasy.  Added contracts.  Changed
references from Scheme to Racket.")

     (#:version "0.6" #:planet 1:3 #:date "2009-03-14"
                "Documentation fix.")

     (#:version "0.5" #:planet 1:2 #:date "2009-02-24"
                "Ahem.")

     (#:version "0.4" #:planet 1:1 #:date "2009-02-24"
                "Removed internal-use-only procedures from documentation.")

     (#:version "0.3" #:planet 1:0 #:date "2009-02-24"
                "Licensed under LGPL 3.  Converted to author's new Scheme
                 administration system.  Made test suite executable.  Minor
                 documentation changes.")

     (#:version "0.2" #:date "2004-08-02"
                "Minor documentation change.  Version frozen for PLaneT
                 packaging.")

     (#:version "0.1" #:date "2004-05-10"
                "First release."))