source: project/release/4/lexgen/trunk/lexgen.scm @ 30885

Last change on this file since 30885 was 30885, checked in by Ivan Raikov, 6 years ago

lexgen: ensure that that bind* calls the binding procedure even on empty input

File size: 9.2 KB
Line 
1;;
2;;  Lexer combinator library.
3;;
4;;  Based on the SML lexer generator by Thant Tessman.
5;;
6;;  Copyright 2009-2014 Ivan Raikov and the Okinawa Institute of
7;;  Science and Technology.
8;;
9;;
10;; This program is free software: you can redistribute it and/or
11;; modify it under the terms of the GNU General Public License as
12;; published by the Free Software Foundation, either version 3 of the
13;; License, or (at your option) any later version.
14;;
15;; This program is distributed in the hope that it will be useful, but
16;; WITHOUT ANY WARRANTY; without even the implied warranty of
17;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18;; General Public License for more details.
19;;
20;; A full copy of the GPL license can be found at
21;; <http://www.gnu.org/licenses/>.
22
23(module lexgen
24
25  ( seq star bar 
26    try pass pos opt lst eos 
27    bind bind* rebind rebind* drop
28
29    lex 
30    (Input->Token tok) <Token>
31    (Token->CharLex char range set lit) <CharLex>
32    )
33
34
35  (import scheme chicken data-structures)
36  (require-library srfi-1)
37  (import (only srfi-1 first second filter-map fold concatenate every lset<= ))
38  (require-extension typeclass)
39  (require-library input-classes)
40  (import (only input-classes <Input>))
41
42  (require-extension utf8 utf8-srfi-14)
43
44
45;;
46;;   This is a lexer generator comprised in its core of five small
47;;   functions. The programmer assembles these functions into regular
48;;   expression pattern-matching functions.
49;;
50;;   The idea is that a pattern matcher function takes a list of
51;;   streams, and returns a new list of streams advanced by every
52;;   combination allowed by the pattern matcher function.
53;;
54;;   A stream is a list that can take one of two forms:
55;;   
56;;   1) A list of two elements: the first element is a list of
57;;   elements consumed by the pattern matcher; the second element is a
58;;   list of characters not yet consumed. E.g., the list
59;;
60;;   ((a) (b c d e))
61;;
62;;   represents a stream that contains the consumed character a,
63;;   and the unconsumed characters b c d e.
64;;
65;;   2) A list of three elements: the first two elements are as
66;;   before, but the third element is a procedure that is applied to
67;;   the tail of the unconsumed list, in order to obtain the next
68;;   character. E.g., the list:
69;;
70;;   ((a) (b <port>) <procedure (lambda (in) (list (read-char in) in))>
71;;
72;;   represents a stream that contains the consumed character a, the
73;;   unconsumed character b, and an input port to read subsequent
74;;   character from; and a procedure that reads one character from the
75;;   input port, and returns it along with the modified port. Note
76;;   that the use of side-effecting structures such as ports will lead
77;;   to erroneous results with backtracking parsers.
78;;
79;;   Also note that the number of streams returned by the function
80;;   typically won't match the number of streams passed in. If the
81;;   pattern doesn't match at all, the empty list is returned.
82;;
83
84
85;; This matches a sequence of patterns.
86
87(define (seq p1 p2)
88  (lambda (sk fk strm)
89    (p1 (lambda (strm1) (p2 sk fk strm1)) fk strm)))
90
91;; This matches either one of two patterns. It's analogous to patterns
92;; separated by the '|' in regular expressions.
93
94(define (bar p1 p2)
95  (lambda (sk fk strm)
96    (p1 sk (lambda _ (p2 sk fk strm)) strm)))
97
98
99;; Kleene closure. Analogous to '*'
100
101(define (star p)
102  (lambda (sk fk strm)
103        (p (lambda (strm1) 
104            (if (eoi? (cadr strm1)) (sk strm1)
105             ((star p) sk sk strm1))) sk strm)))
106
107;; this parser always succeeds
108
109(define (pass sk fk s) (sk s))
110
111;; this parser matches end of input
112
113(define (eos sk fk s) (if (eoi? (cadr s)) (sk s) (fk s)))
114 
115;; Positive closure. Analogous to '+'
116
117(define (pos pat) (seq pat (star pat)))
118
119;; Optional pattern. Analogous to '?'
120
121(define (opt pat) (bar pat pass))
122
123;; Matches a consecutive list of patterns
124
125(define (lst ps)
126  (let ((ps (reverse ps)))
127    (let recur ((ps (cdr ps)) (p1 (car ps)))
128      (cond ((null? ps) p1)
129            (else (recur (cdr ps) (seq (car ps) p1)))))))
130 
131
132;; datatype used by bind and drop
133(define-record-type box (make-box contents)
134  box? (contents box-contents ))
135
136(define box make-box)
137(define unbox box-contents)
138
139;; Given a list (X_1 ... X_n), returns a list ( (X_1 ... X_(n-1))  X_n )
140(define-inline (split-at-last x)
141  (if (null? x) (list #f (list))
142      (let loop ((prev (list (car x))) (rest (cdr x)))
143        (cond ((null? rest)
144               (if (null? (cdr prev))
145                   (list '() (car prev))
146                   (list (reverse (cdr prev)) (car prev))))
147              (else (loop (cons (car rest) prev) (cdr rest)))))))
148
149;; helpers for bind
150(define-inline (bind-apply f)
151  (lambda (s) 
152    (cond ((pair? s)
153           (let ((eaten (car s))
154                 (food  (cadr s)))
155             (let* ((ep     (split-at-last eaten))
156                    (eaten1 (car ep))
157                    (eaten0 (cadr ep)))
158               (assert (box? eaten0))
159               (let ((x   (and (list? eaten1) (f eaten1))))
160                 (if x (list (append x (unbox eaten0)) food)
161                     (list (unbox eaten0) food)))
162               )))
163          (else s))))
164
165(define-inline (box-stream s)
166  (cond ((pair? s)
167         (let ((eaten (car s))
168               (food  (cadr s)))
169           (list (list (box eaten)) food)))
170        (else s)))
171
172;; Binds a procedure f to the consumed tokens returned by p
173;; Calls failure on empty input
174(define (bind f p)
175  (let ((ba (bind-apply f)))
176    (lambda (sk fk s)
177      (if (eoi? (cadr s)) 
178          (fk s)
179          (let ((sk1 (lambda (s1) (sk (ba s1))))
180                (fk1 (lambda (s1) (fk s))))
181            (p sk1 fk1 (box-stream s)))))))
182
183;; Same as bind, but calls success on empty input
184(define (bind* f p)
185  (let ((ba (bind-apply f)))
186    (lambda (sk fk s)
187      (if (eoi? (cadr s)) 
188          (sk (ba (box-stream s)))
189          (let ((sk1 (lambda (s1) (sk (ba s1))))
190                (fk1 (lambda (s1) (fk s))))
191            (p sk1 fk1 (box-stream s)))))))
192
193
194(define (drop p)
195  (bind (lambda x #f) p))
196
197
198;; helpers for rebind
199(define-inline (rebind-apply g)
200  (lambda (i s) 
201    (cond ((pair? s)
202           (let ((eaten (car s))
203                 (food  (cdr s)))
204             (let* ((ep (split-at-last eaten))
205                    (eaten1 (car ep))
206                    (eaten0 (cadr ep)))
207               (assert (box? eaten0))
208               (let* ((x   (and (list? eaten1) (g i eaten1)))
209                      (res (if x (cons (append x (unbox eaten0)) food) 
210                               (cons (unbox eaten0) food))))
211                 res))))
212          (else s))))
213
214;; Applies a procedure f to the un-consumed tokens, then applies
215;; procedure g to the result of f and the tokens returned by p
216;; Calls failure on empty input
217(define (rebind f g p)
218  (let ((ra (rebind-apply g)))
219    (lambda (sk fk s)
220      (if (eoi? (cadr s)) 
221          (fk s)
222          (let* ((info   ((compose f cadr) s))
223                 (sk1    (lambda (s) (sk (ra info s)))))
224            (p sk1 fk (box-stream s)))))))
225
226;; Same as rebind, but calls success on empty input
227(define (rebind* f g p)
228  (let ((ra (rebind-apply g)))
229    (lambda (sk fk s)
230      (if (eoi? (cadr s)) 
231          (sk s)
232          (let* ((info   ((compose f cadr) s))
233                 (sk1    (lambda (s) (sk (ra info s)))))
234            (p sk1 fk (box-stream s)))))))
235
236
237;; This takes a pattern and a string, turns the string into a list of
238;; streams (containing one stream), applies the pattern, and returns
239;; the longest match.
240
241(define (->char-list s)
242  (if (string? s) (list (string->list s)) s))
243
244(define (lex pat error ss)
245  (let* ((stream (cond ((string? ss) `(() . ,(->char-list ss)))
246                       ((pair? ss)   ss)
247                       (else  (error ss)))))
248    (pat (lambda (s) (list (reverse (first s)) (second s)))
249         (lambda (s) (error s)) stream)))
250
251(define-record-type eoi (make-eoi) eoi?)
252(define (stream-eoi? strm) (eoi? (cadr strm)))
253
254(define-class <Token> (<Input> input)  tok)
255
256 
257;; 'tok' builds a pattern matcher function that applies procedure p to
258;; a given token and an input character. If the procedure returns a
259;; true value, that value is prepended to the list of consumed
260;; elements, and the input character is removed from the list of input
261;; elements.
262
263(define=> (tok <Input>)
264  (lambda (t p )
265    (lambda (sk fk strm)
266      (and (pair? strm) 
267           (let ((c (car strm))
268                 (u (cadr strm)))
269             (cond ((eoi? u)    (fk strm))
270                   ((empty? u)  (fk (list c (make-eoi))))
271                   ((p t (head u)) =>
272                    (lambda (ans) (sk (list (cons ans c) (tail u)))))
273                   (else  (fk strm))
274                   )))
275      )))
276
277
278                               
279;; Converts a binary predicate procedure to a binary procedure that
280;; returns its right argument when the predicate is true, and false
281;; otherwise.
282
283(define (try p) (lambda (x y) (let ((res (p x y))) (and res y))))
284 
285(define (Input->Token I) (make-<Token> I (tok I)))
286
287(define-class <CharLex> (<Token> T)  char set range lit)
288
289;; Matches a single character
290
291(define=> (char <Token>)
292  (lambda (c) (tok c (try char=?))))
293 
294;; Matches any of a SRFI-14 set of characters.
295
296(define=> (set <Token>)
297  (lambda (s)
298    (let ((cs (if (char-set? s) s (list->char-set (if (string? s) (string->list s) s)))))
299      (tok cs (try char-set-contains?)))))
300
301;; Range of characters. Analogous to character class '[]'
302
303(define=> (range <Token>)
304  (letrec ((range0 
305            (lambda (a b)
306              (if (char<? b a) (range0 b a)
307                  (tok (ucs-range->char-set (char->integer a) (+ 1 (char->integer b))) 
308                       (try char-set-contains?))))))
309    range0))
310
311;; Matches a literal string s
312
313(define=> (lit <Token>)
314  (lambda (s)
315    (let ((f (lambda (t) (tok t (try char=?)))))
316      (lst (map f (if (string? s) (string->list s) s))))))
317
318
319(define (Token->CharLex T)
320  (make-<CharLex> T
321                  (char T)
322                  (set T)
323                  (range T)
324                  (lit T)
325                  ))
326
327
328)
Note: See TracBrowser for help on using the repository browser.