source: project/release/4/mbox/trunk/mbox-string.scm @ 24969

Last change on this file since 24969 was 24969, checked in by Ivan Raikov, 9 years ago

mbox: added mbox-string module

File size: 8.7 KB
Line 
1;;
2;;  mbox parser combinators specialized for strings.
3;;
4;;   Copyright 2009-2011 Ivan Raikov and the Okinawa Institute of Science
5;;   and Technology.
6;;
7;;
8;;   This program is free software: you can redistribute it and/or
9;;   modify it under the terms of the GNU General Public License as
10;;   published by the Free Software Foundation, either version 3 of
11;;   the License, or (at your option) any later version.
12;;
13;;   This program is distributed in the hope that it will be useful,
14;;   but WITHOUT ANY WARRANTY; without even the implied warranty of
15;;   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16;;   General Public License for more details.
17;;
18;;   A full copy of the GPL license can be found at
19;;   <http://www.gnu.org/licenses/>.
20
21(module mbox-string
22
23        (
24         mbox-file->messages
25         message? message-envelope message-headers message-body
26         )
27
28   (import scheme chicken foreign )
29
30   (require-extension utils data-structures srfi-1 srfi-13 typeclass input-classes)
31
32   (require-library abnf internet-message mbox)
33   (import (only abnf <CoreABNF>  <Token> <CharLex> 
34                 CharLex->CoreABNF Input->Token 
35                 Token->CharLex 
36                 )
37           (only internet-message <InetMessage> CoreABNF->InetMessage)
38           (only mbox <Mbox> Input+.CoreABNF->Mbox message? message-envelope message-headers message-body )
39
40           )
41
42
43;;
44;;
45;; Fast sub-sequence search, based on work by Boyer, Moore, Horspool,
46;; Sunday, and Lundh.
47;;
48;; Based on code from the Haskell text library by Tom Harper and Bryan
49;; O'Sullivan. http://hackage.haskell.org/package/text
50;;
51;;
52;; References:
53;;
54;; * R. S. Boyer, J. S. Moore: A Fast String Searching Algorithm.
55;;   Communications of the ACM, 20, 10, 762-772 (1977)
56;;
57;; * R. N. Horspool: Practical Fast Searching in Strings.  Software -
58;;   Practice and Experience 10, 501-506 (1980)
59;;
60;; * D. M. Sunday: A Very Fast Substring Search Algorithm.
61;;   Communications of the ACM, 33, 8, 132-142 (1990)
62;;
63;; * F. Lundh: The Fast Search Algorithm.
64;;   <http://effbot.org/zone/stringlib.htm> (2006)
65;;
66;; From http://effbot.org/zone/stringlib.htm:
67;;
68;; When designing the new algorithm, I used the following constraints:
69;;
70;;     * should be faster than the current brute-force algorithm for
71;;       all test cases (based on real-life code), including Jim
72;;       Hugunin’s worst-case test
73;;
74;;     * small setup overhead; no dynamic allocation in the fast path
75;;       (O(m) for speed, O(1) for storage)
76;;
77;;     * sublinear search behaviour in good cases (O(n/m))
78;;
79;;     * no worse than the current algorithm in worst case (O(nm))
80;;
81;;     * should work well for both 8-bit strings and 16-bit or 32-bit
82;;       Unicode strings (no O(σ) dependencies)
83;;
84;;     * many real-life searches should be good, very few should be
85;;       worst case
86;;
87;;     * reasonably simple implementation
88;;
89;;  This rules out most standard algorithms (Knuth-Morris-Pratt is not
90;;  sublinear, Boyer-Moore needs tables that depend on both the
91;;  alphabet size and the pattern size, most Boyer-Moore variants need
92;;  tables that depend on the pattern size, etc.).
93;;
94;;  After some tweaking, I came up with a simplication of Boyer-Moore,
95;;  incorporating ideas from Horspool and Sunday. Here’s an outline:
96;;
97;; def find(s, p):
98;;     # find first occurrence of p in s
99;;     n = len(s)
100;;     m = len(p)
101;;     skip = delta1(p)[p[m-1]]
102;;     i = 0
103;;     while i <= n-m:
104;;         if s[i+m-1] == p[m-1]: # (boyer-moore)
105;;             # potential match
106;;             if s[i:i+m-1] == p[:m-1]:
107;;                 return i
108;;             if s[i+m] not in p:
109;;                 i = i + m + 1 # (sunday)
110;;             else:
111;;                 i = i + skip # (horspool)
112;;         else:
113;;             # skip
114;;             if s[i+m] not in p:
115;;                 i = i + m + 1 # (sunday)
116;;             else:
117;;                 i = i + 1
118;;     return -1 # not found
119;;
120;; The delta1(p)[p[m-1]] value is simply the Boyer-Moore delta1 (or
121;; bad-character skip) value for the last character in the pattern.
122;;
123;; For the s[i+m] not in p test, I use a 32-bit bitmask, using the 5
124;; least significant bits of the character as the key. This could be
125;; described as a simple Bloom filter.
126;;
127;; Note that the above Python code may access s[n], which would result in
128;; an IndexError exception. For the CPython implementation, this is not
129;; really a problem, since CPython adds trailing NULL entries to both
130;; 8-bit and Unicode strings. 
131
132;;
133;; /O(n+m)/ Find the offsets of all non-overlapping indices of
134;; needle within haystack.
135;;
136;; In (unlikely) bad cases, this algorithm's complexity degrades
137;; towards /O(n*m)/.
138;;
139
140(define swizzle
141    (foreign-lambda* int ((char k))
142#<<END
143     int result;
144
145     result = 1 << (k & 0x3F);
146
147     C_return(result);
148END
149))
150
151(define (make-table nlast nindex nlen z)
152  (lambda (i msk skp)
153    (let loop ((i i) (msk msk) (skp skp))
154      (cond ((>= i nlast) 
155             (values (bitwise-ior msk (swizzle z)) skp))
156            (else        
157             (let* ((c    (nindex i))
158                    (skp1 (cond ((char=? c z)  (- nlen i 2))
159                                (else     skp))))
160               (loop (+ 1 i) (bitwise-ior msk (swizzle c)) skp1)))))))
161             
162(define (scan1 hindex hlen c)
163  (let loop ((i 0) (ax '()))
164    (cond ((>= i hlen)        (reverse ax))
165          ((char=? (hindex i) c)   (loop (+ 1 i) (cons i ax)))
166          (else               (loop (+ 1 i) ax)))))
167
168(define (scan nindex hindex nlast nlen ldiff z mask skip i)
169
170  (define (candidate-match i j)
171    (cond ((>= j nlast)  #t)
172          ((not (char=? (hindex (+ i j)) (nindex j)))  #f)
173          (else (candidate-match i (+ 1 j)))))
174
175  (let loop ((i i) (ax '()))
176
177    (if (>= i ldiff) (reverse ax)
178
179        (let ((c (hindex (+ i nlast))))
180          (cond
181           ;;
182           ((and (char=? c z) (candidate-match i 0))
183            (loop (+ i nlen) (cons i ax)))
184           ;;
185           (else
186            (let* ((next-in-pattern?
187                    (zero? (bitwise-and mask (swizzle (hindex (+ i nlen))))))
188                    (delta (cond (next-in-pattern? (+ 1 nlen))
189                                 ((char=? c z)  (+ 1 skip))
190                                 (else     1))))
191              (loop (+ i delta) ax))))))))
192
193
194(define (subsequence-search needle haystack)
195
196  (let ((nobj  needle)
197        (nlen  (string-length needle))
198        (hobj  haystack)
199        (hlen  (string-length haystack)))
200    (let* ((nindex   (lambda (k) 
201                       (string-ref nobj k)))
202           (hindex   (lambda (k) 
203                       (string-ref hobj k)))
204           (ldiff    (- hlen nlen))
205           (nlast    (- nlen 1))
206           (z        (nindex nlast))
207           (tbl      (make-table nlast nindex nlen z)))
208
209      (let-values 
210       (((mask skip)  (tbl 0 0 (- nlen 2))))
211       (cond ((= 1 nlen) 
212              (scan1 hindex hlen (nindex 0)))
213             ((or (<= nlen 0) (negative? ldiff)) 
214              '())
215             (else
216              (scan nindex hindex nlast nlen ldiff z mask skip 0)))))))
217
218;;
219;; Based on code from the Haskell text library by Tom Harper and Bryan
220;; O'Sullivan. http://hackage.haskell.org/package/text
221;;
222;;    /O(n+m)/ Find all non-overlapping instances of needle in
223;;  haystack.  The first element of the returned pair is the prefix
224;;  of haystack prior to any matches of needle.  The second is a
225;;  list of pairs.
226;;
227;;  The first element of each pair in the list is a span from the
228;;  beginning of a match to the beginning of the next match, while the
229;;  second is a span from the beginning of the match to the end of the
230;;  input.
231;;
232;;  Examples:
233;;
234;;  > find "::" ""
235;;  > ==> ("", [])
236;;  > find "/" "a/b/c/d"
237;;  > ==> ("a", [("/b","/b/c/d"), ("/c","/c/d"), ("/d","/d")])
238;;
239;;  In (unlikely) bad cases, this function's time complexity degrades
240;;  towards /O(n*m)/.
241
242;; find :: Text * Text -> (Text, [(Text, Text)])
243
244(define (string-find needle haystack)
245  (cond ((string-null? needle) 
246         (error 'find "empty pattern" needle))
247        (else
248         (let ((r (subsequence-search needle haystack)))
249
250           (cond ((null? r) 
251                  (list haystack '()))
252                 (else      
253                  (let* ((hlen  (string-length haystack))
254                         (chunk (lambda (n l) 
255                                  (string-copy haystack n l)))
256                         (go    (lambda (s xs)
257
258
259                                  (let loop ((s s) (xs xs) (ax '()))
260
261                                    (if (null? xs)
262                                        (let ((c (chunk s hlen)))
263                                          (reverse (cons (list c c) ax)))
264                                        (let ((x (car xs)) (xs (cdr xs)))
265                                          (loop x xs
266                                                (cons (list (chunk s x) 
267                                                            (chunk s hlen)) 
268                                                      ax)))))))
269                         )
270                    (list (chunk 0 (car r))
271                          (go (car r) (cdr r)))))))
272         )))
273
274
275(define (string-car x) (string-ref x 0))
276
277(define (string-cdr x) (string-drop x 1))
278
279(define string-<Input>
280  (make-<Input> string-null? string-car string-cdr))
281
282(define string-<Token>
283  (Input->Token string-<Input>))
284
285(define string-<CharLex>
286  (Token->CharLex string-<Token>))
287
288(define string-<CoreABNF>
289  (CharLex->CoreABNF string-<CharLex>))
290
291(define string-<Input+>
292  (make-<Input+> string-<Input> 
293                 string-find
294                 identity
295                 (lambda (x . rest) (read-all x))
296                 ))
297
298(define string-<Mbox>
299  (Input+.CoreABNF->Mbox string-<Input+>
300                         string-<CoreABNF>
301                         ))
302
303(import-instance (<Mbox> string-<Mbox>) )
304
305
306)
Note: See TracBrowser for help on using the repository browser.