source: project/release/4/mbox/trunk/mbox-string.scm @ 34045

Last change on this file since 34045 was 34045, checked in by Ivan Raikov, 3 years ago

mbox: added test cases

File size: 8.7 KB
Line 
1;;
2;;  mbox parser combinators specialized for strings.
3;;
4;;   Copyright 2009-2017 Ivan Raikov
5;;
6;;
7;;   This program is free software: you can redistribute it and/or
8;;   modify it under the terms of the GNU General Public License as
9;;   published by the Free Software Foundation, either version 3 of
10;;   the License, or (at your option) any later version.
11;;
12;;   This program is distributed in the hope that it will be useful,
13;;   but WITHOUT ANY WARRANTY; without even the implied warranty of
14;;   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15;;   General Public License for more details.
16;;
17;;   A full copy of the GPL license can be found at
18;;   <http://www.gnu.org/licenses/>.
19
20(module mbox-string
21
22        (
23         mbox-file->messages
24         message? message-envelope message-headers message-body
25         )
26
27   (import scheme chicken foreign )
28
29   (require-extension utils data-structures srfi-1 srfi-13 typeclass input-classes)
30
31   (require-library abnf internet-message mbox)
32   (import (only abnf <CoreABNF>  <Token> <CharLex> 
33                 CharLex->CoreABNF Input->Token 
34                 Token->CharLex 
35                 )
36           (only internet-message <InetMessage> CoreABNF->InetMessage)
37           (only mbox <Mbox> Input+.CoreABNF->Mbox message? message-envelope message-headers message-body )
38
39           )
40
41
42;;
43;;
44;; Fast sub-sequence search, based on work by Boyer, Moore, Horspool,
45;; Sunday, and Lundh.
46;;
47;; Based on code from the Haskell text library by Tom Harper and Bryan
48;; O'Sullivan. http://hackage.haskell.org/package/text
49;;
50;;
51;; References:
52;;
53;; * R. S. Boyer, J. S. Moore: A Fast String Searching Algorithm.
54;;   Communications of the ACM, 20, 10, 762-772 (1977)
55;;
56;; * R. N. Horspool: Practical Fast Searching in Strings.  Software -
57;;   Practice and Experience 10, 501-506 (1980)
58;;
59;; * D. M. Sunday: A Very Fast Substring Search Algorithm.
60;;   Communications of the ACM, 33, 8, 132-142 (1990)
61;;
62;; * F. Lundh: The Fast Search Algorithm.
63;;   <http://effbot.org/zone/stringlib.htm> (2006)
64;;
65;; From http://effbot.org/zone/stringlib.htm:
66;;
67;; When designing the new algorithm, I used the following constraints:
68;;
69;;     * should be faster than the current brute-force algorithm for
70;;       all test cases (based on real-life code), including Jim
71;;       Hugunin’s worst-case test
72;;
73;;     * small setup overhead; no dynamic allocation in the fast path
74;;       (O(m) for speed, O(1) for storage)
75;;
76;;     * sublinear search behaviour in good cases (O(n/m))
77;;
78;;     * no worse than the current algorithm in worst case (O(nm))
79;;
80;;     * should work well for both 8-bit strings and 16-bit or 32-bit
81;;       Unicode strings (no O(σ) dependencies)
82;;
83;;     * many real-life searches should be good, very few should be
84;;       worst case
85;;
86;;     * reasonably simple implementation
87;;
88;;  This rules out most standard algorithms (Knuth-Morris-Pratt is not
89;;  sublinear, Boyer-Moore needs tables that depend on both the
90;;  alphabet size and the pattern size, most Boyer-Moore variants need
91;;  tables that depend on the pattern size, etc.).
92;;
93;;  After some tweaking, I came up with a simplication of Boyer-Moore,
94;;  incorporating ideas from Horspool and Sunday. Here’s an outline:
95;;
96;; def find(s, p):
97;;     # find first occurrence of p in s
98;;     n = len(s)
99;;     m = len(p)
100;;     skip = delta1(p)[p[m-1]]
101;;     i = 0
102;;     while i <= n-m:
103;;         if s[i+m-1] == p[m-1]: # (boyer-moore)
104;;             # potential match
105;;             if s[i:i+m-1] == p[:m-1]:
106;;                 return i
107;;             if s[i+m] not in p:
108;;                 i = i + m + 1 # (sunday)
109;;             else:
110;;                 i = i + skip # (horspool)
111;;         else:
112;;             # skip
113;;             if s[i+m] not in p:
114;;                 i = i + m + 1 # (sunday)
115;;             else:
116;;                 i = i + 1
117;;     return -1 # not found
118;;
119;; The delta1(p)[p[m-1]] value is simply the Boyer-Moore delta1 (or
120;; bad-character skip) value for the last character in the pattern.
121;;
122;; For the s[i+m] not in p test, I use a 32-bit bitmask, using the 5
123;; least significant bits of the character as the key. This could be
124;; described as a simple Bloom filter.
125;;
126;; Note that the above Python code may access s[n], which would result in
127;; an IndexError exception. For the CPython implementation, this is not
128;; really a problem, since CPython adds trailing NULL entries to both
129;; 8-bit and Unicode strings. 
130
131;;
132;; /O(n+m)/ Find the offsets of all non-overlapping indices of
133;; needle within haystack.
134;;
135;; In (unlikely) bad cases, this algorithm's complexity degrades
136;; towards /O(n*m)/.
137;;
138
139(define swizzle
140    (foreign-lambda* int ((char k))
141#<<END
142     int result;
143
144     result = 1 << (k & 0x3F);
145
146     C_return(result);
147END
148))
149
150(define (make-table nlast nindex nlen z)
151  (lambda (i msk skp)
152    (let loop ((i i) (msk msk) (skp skp))
153      (cond ((>= i nlast) 
154             (values (bitwise-ior msk (swizzle z)) skp))
155            (else        
156             (let* ((c    (nindex i))
157                    (skp1 (cond ((char=? c z)  (- nlen i 2))
158                                (else     skp))))
159               (loop (+ 1 i) (bitwise-ior msk (swizzle c)) skp1)))))))
160             
161(define (scan1 hindex hlen c)
162  (let loop ((i 0) (ax '()))
163    (cond ((>= i hlen)        (reverse ax))
164          ((char=? (hindex i) c)   (loop (+ 1 i) (cons i ax)))
165          (else               (loop (+ 1 i) ax)))))
166
167(define (scan nindex hindex nlast nlen ldiff z mask skip i)
168
169  (define (candidate-match i j)
170    (cond ((>= j nlast)  #t)
171          ((not (char=? (hindex (+ i j)) (nindex j)))  #f)
172          (else (candidate-match i (+ 1 j)))))
173
174  (let loop ((i i) (ax '()))
175
176    (if (>= i ldiff) (reverse ax)
177
178        (let ((c (hindex (+ i nlast))))
179          (cond
180           ;;
181           ((and (char=? c z) (candidate-match i 0))
182            (loop (+ i nlen) (cons i ax)))
183           ;;
184           (else
185            (let* ((next-in-pattern?
186                    (zero? (bitwise-and mask (swizzle (hindex (+ i nlen))))))
187                    (delta (cond (next-in-pattern? (+ 1 nlen))
188                                 ((char=? c z)  (+ 1 skip))
189                                 (else     1))))
190              (loop (+ i delta) ax))))))))
191
192
193(define (subsequence-search needle haystack)
194
195  (let ((nobj  needle)
196        (nlen  (string-length needle))
197        (hobj  haystack)
198        (hlen  (string-length haystack)))
199    (let* ((nindex   (lambda (k) 
200                       (string-ref nobj k)))
201           (hindex   (lambda (k) 
202                       (string-ref hobj k)))
203           (ldiff    (- hlen nlen))
204           (nlast    (- nlen 1))
205           (z        (nindex nlast))
206           (tbl      (make-table nlast nindex nlen z)))
207
208      (let-values 
209       (((mask skip)  (tbl 0 0 (- nlen 2))))
210       (cond ((= 1 nlen) 
211              (scan1 hindex hlen (nindex 0)))
212             ((or (<= nlen 0) (negative? ldiff)) 
213              '())
214             (else
215              (scan nindex hindex nlast nlen ldiff z mask skip 0)))))))
216
217;;
218;; Based on code from the Haskell text library by Tom Harper and Bryan
219;; O'Sullivan. http://hackage.haskell.org/package/text
220;;
221;;    /O(n+m)/ Find all non-overlapping instances of needle in
222;;  haystack.  The first element of the returned pair is the prefix
223;;  of haystack prior to any matches of needle.  The second is a
224;;  list of pairs.
225;;
226;;  The first element of each pair in the list is a span from the
227;;  beginning of a match to the beginning of the next match, while the
228;;  second is a span from the beginning of the match to the end of the
229;;  input.
230;;
231;;  Examples:
232;;
233;;  > find "::" ""
234;;  > ==> ("", [])
235;;  > find "/" "a/b/c/d"
236;;  > ==> ("a", [("/b","/b/c/d"), ("/c","/c/d"), ("/d","/d")])
237;;
238;;  In (unlikely) bad cases, this function's time complexity degrades
239;;  towards /O(n*m)/.
240
241;; find :: Text * Text -> (Text, [(Text, Text)])
242
243(define (string-find needle haystack)
244  (cond ((string-null? needle) 
245         (error 'find "empty pattern" needle))
246        (else
247         (let ((r (subsequence-search needle haystack)))
248
249           (cond ((null? r) 
250                  (list haystack '()))
251                 (else      
252                  (let* ((hlen  (string-length haystack))
253                         (chunk (lambda (n l) 
254                                  (string-copy haystack n l)))
255                         (go    (lambda (s xs)
256
257
258                                  (let loop ((s s) (xs xs) (ax '()))
259
260                                    (if (null? xs)
261                                        (let ((c (chunk s hlen)))
262                                          (reverse (cons (list c c) ax)))
263                                        (let ((x (car xs)) (xs (cdr xs)))
264                                          (loop x xs
265                                                (cons (list (chunk s x) 
266                                                            (chunk s hlen)) 
267                                                      ax)))))))
268                         )
269                    (list (chunk 0 (car r))
270                          (go (car r) (cdr r)))))))
271         )))
272
273
274(define (string-car x) (string-ref x 0))
275
276(define (string-cdr x) (string-drop x 1))
277
278(define string-<Input>
279  (make-<Input> string-null? string-car string-cdr))
280
281(define string-<Token>
282  (Input->Token string-<Input>))
283
284(define string-<CharLex>
285  (Token->CharLex string-<Token>))
286
287(define string-<CoreABNF>
288  (CharLex->CoreABNF string-<CharLex>))
289
290(define string-<Input+>
291  (make-<Input+> string-<Input> 
292                 string-find
293                 identity
294                 (lambda (x . rest)
295                   (print "string-input: x = " x)
296                   (read-all x))
297                 ))
298
299(define string-<Mbox>
300  (Input+.CoreABNF->Mbox string-<Input+>
301                         string-<CoreABNF>
302                         ))
303
304(import-instance (<Mbox> string-<Mbox>) )
305
306
307)
Note: See TracBrowser for help on using the repository browser.