1 | ;;;; levenshtein-sequence-functor.scm -*- Scheme -*- |
---|
2 | ;;;; Kon Lovett, Mar '20 |
---|
3 | ;;;; Kon Lovett, Apr '12 |
---|
4 | ;;;; Kon Lovett, May '06 |
---|
5 | |
---|
6 | ;; Issues |
---|
7 | ;; |
---|
8 | ;; - When insert-cost <> delete-cost is the initialization of the work-vector correct? |
---|
9 | ;; What about when we swap the source & target? |
---|
10 | ;; |
---|
11 | ;; - Don't have a "real" sequence abstraction, only recognizes vector, list, and string. |
---|
12 | ;; What about byte-vector, u8vector, ..., and stream? |
---|
13 | |
---|
14 | (include "levenshtein-cost-interface") |
---|
15 | (include "levenshtein-sequence-interface") |
---|
16 | |
---|
17 | (functor (levenshtein-sequence-functor (CO COST-OPER) (SO SEQUENCE-OPER)) |
---|
18 | |
---|
19 | (;export |
---|
20 | levenshtein-distance/sequence) |
---|
21 | |
---|
22 | (import scheme) |
---|
23 | (import (chicken base)) |
---|
24 | (import (chicken type)) |
---|
25 | (import (srfi 1)) |
---|
26 | (import (srfi 63)) |
---|
27 | (import vector-lib) |
---|
28 | (import miscmacros) |
---|
29 | (import type-checks) |
---|
30 | (import type-errors) |
---|
31 | (import levenshtein-operators) |
---|
32 | (import CO) |
---|
33 | (import SO) |
---|
34 | |
---|
35 | ;;; |
---|
36 | |
---|
37 | ;moremacros |
---|
38 | (define-syntax swap! |
---|
39 | (syntax-rules () |
---|
40 | ((swap! ?a ?b) |
---|
41 | (let ( |
---|
42 | (_tmp ?a) ) |
---|
43 | (set! ?a ?b) |
---|
44 | (set! ?b _tmp)) ) ) ) |
---|
45 | |
---|
46 | (define (levenshtein-distance/sequence source target |
---|
47 | #!key |
---|
48 | (insert-cost 1) (delete-cost 1) (substitute-cost 1) |
---|
49 | (get-work-vector make-vector) |
---|
50 | (elm-eql eqv?) |
---|
51 | (limit-cost #f)) |
---|
52 | |
---|
53 | ; Validate |
---|
54 | (check-sequence 'levenshtein-distance/generic-sequence source) |
---|
55 | (check-sequence 'levenshtein-distance/generic-sequence target) |
---|
56 | (check-procedure 'levenshtein-distance/generic-sequence elm-eql "elm-eql") |
---|
57 | (check-procedure 'levenshtein-distance/generic-sequence get-work-vector "get-work-vector") |
---|
58 | |
---|
59 | ; |
---|
60 | (let ((source-length (sequence-length source)) |
---|
61 | (target-length (sequence-length target))) |
---|
62 | |
---|
63 | (cond |
---|
64 | |
---|
65 | ; Quit when source or target empty |
---|
66 | ((zero? source-length) |
---|
67 | (cost-multiply target-length insert-cost)) |
---|
68 | ((zero? target-length) |
---|
69 | (cost-multiply source-length insert-cost)) |
---|
70 | |
---|
71 | ; Otherwise need to calculate distance |
---|
72 | (else |
---|
73 | |
---|
74 | ; "Strip" common prefix & suffix |
---|
75 | (let ((prefix-length (sequence-prefix-length elm-eql source target)) |
---|
76 | (suffix-length (sequence-suffix-length elm-eql source target))) |
---|
77 | |
---|
78 | (let ((stripped-source-start prefix-length) |
---|
79 | (stripped-source-end (- source-length suffix-length)) |
---|
80 | (stripped-target-start prefix-length) |
---|
81 | (stripped-target-end (- target-length suffix-length))) |
---|
82 | |
---|
83 | (let ((stripped-source-length (- stripped-source-end stripped-source-start)) |
---|
84 | (stripped-target-length (- stripped-target-end stripped-target-start))) |
---|
85 | |
---|
86 | ; Prefix overlaps suffix? |
---|
87 | (unless (and (<= 0 stripped-source-length) (<= 0 stripped-target-length)) |
---|
88 | |
---|
89 | ; Use the longest match & revert to the full string otherwise |
---|
90 | (if (< prefix-length suffix-length) |
---|
91 | (begin |
---|
92 | (set! stripped-source-start 0) |
---|
93 | (set! stripped-target-start 0)) |
---|
94 | (begin |
---|
95 | (set! stripped-source-end source-length) |
---|
96 | (set! stripped-target-end target-length))) |
---|
97 | |
---|
98 | ; Re-calc stripped lengths |
---|
99 | (set! stripped-source-length (- stripped-source-end stripped-source-start)) |
---|
100 | (set! stripped-target-length (- stripped-target-end stripped-target-start))) |
---|
101 | |
---|
102 | (cond |
---|
103 | |
---|
104 | ; Stripped source or target empty? |
---|
105 | ((zero? stripped-source-length) |
---|
106 | (cost-multiply stripped-target-length insert-cost)) |
---|
107 | ((zero? stripped-target-length) |
---|
108 | (cost-multiply stripped-source-length insert-cost)) |
---|
109 | |
---|
110 | ; Otherwise need to calculate distance |
---|
111 | (else |
---|
112 | |
---|
113 | ; Perform distance calculation on "stripped" source & target |
---|
114 | (let ((source |
---|
115 | (subsequence/shared source stripped-source-start stripped-source-end)) |
---|
116 | (target |
---|
117 | (subsequence/shared target stripped-target-start stripped-target-end)) |
---|
118 | (source-length stripped-source-length) |
---|
119 | (target-length stripped-target-length)) |
---|
120 | |
---|
121 | ; Swap so target is the shorter of source & target |
---|
122 | (when (< source-length target-length) |
---|
123 | (swap! source-length target-length) |
---|
124 | (swap! source target)) |
---|
125 | |
---|
126 | ; Allocate matrix row/column work vector |
---|
127 | (let ((work (get-work-vector (add1 target-length)))) |
---|
128 | |
---|
129 | ; Initialize work vector |
---|
130 | (do ((k 0 (add1 k)) |
---|
131 | (cost 0 (cost-add cost insert-cost))) |
---|
132 | ((> k target-length)) |
---|
133 | (vector-set! work k cost)) |
---|
134 | |
---|
135 | ; "Early" return is needed |
---|
136 | (let/cc return |
---|
137 | |
---|
138 | ; Calculate edit "cost" |
---|
139 | (let ((total-cost #f) |
---|
140 | (cost-at-source delete-cost)) |
---|
141 | |
---|
142 | ; For each source element |
---|
143 | (sequence-for-each |
---|
144 | (lambda (source-index source-elm) |
---|
145 | |
---|
146 | ; Every element costs |
---|
147 | (let ((current-cost cost-at-source)) |
---|
148 | |
---|
149 | ; For each target element |
---|
150 | (sequence-for-each |
---|
151 | (lambda (target-index target-elm) |
---|
152 | |
---|
153 | ; Calculate cost to this position |
---|
154 | (set! total-cost |
---|
155 | (cost-minimum |
---|
156 | (cost-add insert-cost (vector-ref work (add1 target-index))) |
---|
157 | (cost-add delete-cost current-cost) |
---|
158 | (let ((cost-at-target (vector-ref work target-index))) |
---|
159 | (if (elm-eql source-elm target-elm) cost-at-target |
---|
160 | (cost-add substitute-cost cost-at-target))))) |
---|
161 | |
---|
162 | ; Quit when past limit |
---|
163 | (when (and limit-cost (cost-less-than limit-cost total-cost)) |
---|
164 | (return limit-cost)) |
---|
165 | |
---|
166 | ; Save the cost to this point |
---|
167 | (vector-set! work target-index current-cost) |
---|
168 | (set! current-cost total-cost) ) |
---|
169 | target) |
---|
170 | |
---|
171 | ; Save total-cost at target |
---|
172 | (vector-set! work target-length total-cost) ) |
---|
173 | |
---|
174 | ; Bump to next source cost |
---|
175 | ; Assumes indexing from 0 to end |
---|
176 | (set! cost-at-source (cost-add cost-at-source delete-cost)) ) |
---|
177 | source) |
---|
178 | |
---|
179 | ; Result is the total cost of edit |
---|
180 | total-cost ) ) ) ) ) ) ) ) ) ) ) ) ) |
---|
181 | |
---|
182 | ) ;functor levenshtein-sequence-functor |
---|