source: project/release/4/lexgen/trunk/lexgen.scm @ 30860

Last change on this file since 30860 was 30860, checked in by Ivan Raikov, 6 years ago

lexgen: added bind* and rebind* variants of bind and rebind

File size: 9.1 KB
Line 
1;;
2;;  Lexer combinator library.
3;;
4;;  Based on the SML lexer generator by Thant Tessman.
5;;
6;;  Copyright 2009-2014 Ivan Raikov and the Okinawa Institute of
7;;  Science and Technology.
8;;
9;;
10;; This program is free software: you can redistribute it and/or
11;; modify it under the terms of the GNU General Public License as
12;; published by the Free Software Foundation, either version 3 of the
13;; License, or (at your option) any later version.
14;;
15;; This program is distributed in the hope that it will be useful, but
16;; WITHOUT ANY WARRANTY; without even the implied warranty of
17;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18;; General Public License for more details.
19;;
20;; A full copy of the GPL license can be found at
21;; <http://www.gnu.org/licenses/>.
22
23(module lexgen
24
25  ( seq star bar 
26    try pass pos opt lst 
27    bind bind* rebind rebind* drop
28
29    lex 
30
31    (Input->Token tok) <Token>
32    (Token->CharLex char range set lit) <CharLex>
33    )
34
35
36  (import scheme chicken data-structures)
37  (require-library srfi-1)
38  (import (only srfi-1 first second filter-map fold concatenate every lset<= ))
39  (require-extension typeclass)
40  (require-library input-classes)
41  (import (only input-classes <Input>))
42
43  (require-extension utf8 utf8-srfi-14)
44
45
46;;
47;;   This is a lexer generator comprised in its core of five small
48;;   functions. The programmer assembles these functions into regular
49;;   expression pattern-matching functions.
50;;
51;;   The idea is that a pattern matcher function takes a list of
52;;   streams, and returns a new list of streams advanced by every
53;;   combination allowed by the pattern matcher function.
54;;
55;;   A stream is a list that can take one of two forms:
56;;   
57;;   1) A list of two elements: the first element is a list of
58;;   elements consumed by the pattern matcher; the second element is a
59;;   list of characters not yet consumed. E.g., the list
60;;
61;;   ((a) (b c d e))
62;;
63;;   represents a stream that contains the consumed character a,
64;;   and the unconsumed characters b c d e.
65;;
66;;   2) A list of three elements: the first two elements are as
67;;   before, but the third element is a procedure that is applied to
68;;   the tail of the unconsumed list, in order to obtain the next
69;;   character. E.g., the list:
70;;
71;;   ((a) (b <port>) <procedure (lambda (in) (list (read-char in) in))>
72;;
73;;   represents a stream that contains the consumed character a, the
74;;   unconsumed character b, and an input port to read subsequent
75;;   character from; and a procedure that reads one character from the
76;;   input port, and returns it along with the modified port. Note
77;;   that the use of side-effecting structures such as ports will lead
78;;   to erroneous results with backtracking parsers.
79;;
80;;   Also note that the number of streams returned by the function
81;;   typically won't match the number of streams passed in. If the
82;;   pattern doesn't match at all, the empty list is returned.
83;;
84
85
86;; This matches a sequence of patterns.
87
88(define (seq p1 p2)
89  (lambda (sk fk strm)
90    (p1 (lambda (strm1) (p2 sk fk strm1)) fk strm)))
91
92;; This matches either one of two patterns. It's analogous to patterns
93;; separated by the '|' in regular expressions.
94
95(define (bar p1 p2)
96  (lambda (sk fk strm)
97    (p1 sk (lambda _ (p2 sk fk strm)) strm)))
98
99
100;; Kleene closure. Analogous to '*'
101
102(define (star p)
103  (lambda (sk fk strm)
104    (p (lambda (strm1) 
105         (if (eoi? (cadr strm1)) (sk strm1)
106             ((star p) sk sk strm1))) sk strm)))
107
108;; this parser always succeeds
109(define (pass sk fk s) (sk s))
110 
111;; Positive closure. Analogous to '+'
112
113(define (pos pat) (seq pat (star pat)))
114
115;; Optional pattern. Analogous to '?'
116
117(define (opt pat) (bar pat pass))
118
119;; Matches a consecutive list of patterns
120
121(define (lst ps)
122  (let ((ps (reverse ps)))
123    (let recur ((ps (cdr ps)) (p1 (car ps)))
124      (cond ((null? ps) p1)
125            (else (recur (cdr ps) (seq (car ps) p1)))))))
126 
127
128;; datatype used by bind and drop
129(define-record-type box (make-box contents)
130  box? (contents box-contents ))
131
132(define box make-box)
133(define unbox box-contents)
134
135;; Given a list (X_1 ... X_n), returns a list ( (X_1 ... X_(n-1))  X_n )
136(define-inline (split-at-last x)
137  (if (null? x) (list #f (list))
138      (let loop ((prev (list (car x))) (rest (cdr x)))
139        (cond ((null? rest)
140               (if (null? (cdr prev))
141                   (list '() (car prev))
142                   (list (reverse (cdr prev)) (car prev))))
143              (else (loop (cons (car rest) prev) (cdr rest)))))))
144
145;; helpers for bind
146(define-inline (bind-apply f)
147  (lambda (s) 
148    (cond ((pair? s)
149           (let ((eaten (car s))
150                 (food  (cadr s)))
151             (let* ((ep     (split-at-last eaten))
152                    (eaten1 (car ep))
153                    (eaten0 (cadr ep)))
154               (assert (box? eaten0))
155               (let ((x   (and (list? eaten1) (f eaten1))))
156                 (if x (list (append x (unbox eaten0)) food)
157                     (list (unbox eaten0) food)))
158               )))
159          (else s))))
160
161(define-inline (box-stream s)
162  (cond ((pair? s)
163         (let ((eaten (car s))
164               (food  (cadr s)))
165           (list (list (box eaten)) food)))
166        (else s)))
167
168;; Binds a procedure f to the consumed tokens returned by p
169;; Calls failure on empty input
170(define (bind f p)
171  (let ((ba (bind-apply f)))
172    (lambda (sk fk s)
173      (if (eoi? (cadr s)) 
174          (fk s)
175          (let ((sk1 (lambda (s1) (sk (ba s1))))
176                (fk1 (lambda (s1) (fk s))))
177            (p sk1 fk1 (box-stream s)))))))
178
179;; Same as bind, but calls success on empty input
180(define (bind* f p)
181  (let ((ba (bind-apply f)))
182    (lambda (sk fk s)
183      (if (eoi? (cadr s)) 
184          (sk s)
185          (let ((sk1 (lambda (s1) (sk (ba s1))))
186                (fk1 (lambda (s1) (fk s))))
187            (p sk1 fk1 (box-stream s)))))))
188
189
190(define (drop p)
191  (bind (lambda x #f) p))
192
193
194;; helpers for rebind
195(define-inline (rebind-apply g)
196  (lambda (i s) 
197    (cond ((pair? s)
198           (let ((eaten (car s))
199                 (food  (cdr s)))
200             (let* ((ep (split-at-last eaten))
201                    (eaten1 (car ep))
202                    (eaten0 (cadr ep)))
203               (assert (box? eaten0))
204               (let* ((x   (and (list? eaten1) (g i eaten1)))
205                      (res (if x (cons (append x (unbox eaten0)) food) 
206                               (cons (unbox eaten0) food))))
207                 res))))
208          (else s))))
209
210;; Applies a procedure f to the un-consumed tokens, then applies
211;; procedure g to the result of f and the tokens returned by p
212;; Calls failure on empty input
213(define (rebind f g p)
214  (let ((ra (rebind-apply g)))
215    (lambda (sk fk s)
216      (if (eoi? (cadr s)) 
217          (fk s)
218          (let* ((info   ((compose f cadr) s))
219                 (sk1    (lambda (s) (sk (ra info s)))))
220            (p sk1 fk (box-stream s)))))))
221
222;; Same as rebind, but calls success on empty input
223(define (rebind* f g p)
224  (let ((ra (rebind-apply g)))
225    (lambda (sk fk s)
226      (if (eoi? (cadr s)) 
227          (sk s)
228          (let* ((info   ((compose f cadr) s))
229                 (sk1    (lambda (s) (sk (ra info s)))))
230            (p sk1 fk (box-stream s)))))))
231
232
233;; This takes a pattern and a string, turns the string into a list of
234;; streams (containing one stream), applies the pattern, and returns
235;; the longest match.
236
237(define (->char-list s)
238  (if (string? s) (list (string->list s)) s))
239
240(define (lex pat error ss)
241  (let* ((stream (cond ((string? ss) `(() . ,(->char-list ss)))
242                       ((pair? ss)   ss)
243                       (else  (error ss)))))
244    (pat (lambda (s) (list (reverse (first s)) (second s)))
245         (lambda (s) (error s)) stream)))
246
247(define-record-type eoi (make-eoi) eoi?)
248
249
250;; The following procedures are specific to the input class type
251
252(define-class <Token> (<Input> input)  tok)
253
254 
255;; 'tok' builds a pattern matcher function that applies procedure p to
256;; a given token and an input character. If the procedure returns a
257;; true value, that value is prepended to the list of consumed
258;; elements, and the input character is removed from the list of input
259;; elements.
260
261(define=> (tok <Input>)
262  (lambda (t p )
263    (lambda (sk fk strm)
264      (and (pair? strm) 
265           (let ((c (car strm))
266                 (u (cadr strm)))
267             (cond ((eoi? u)    (fk strm))
268                   ((empty? u)  (fk (list c (make-eoi))))
269                   ((p t (head u)) =>
270                    (lambda (ans) (sk (list (cons ans c) (tail u)))))
271                   (else  (fk strm))
272                   )))
273      )))
274
275
276                               
277;; Converts a binary predicate procedure to a binary procedure that
278;; returns its right argument when the predicate is true, and false
279;; otherwise.
280
281(define (try p) (lambda (x y) (let ((res (p x y))) (and res y))))
282 
283(define (Input->Token I) (make-<Token> I (tok I)))
284
285(define-class <CharLex> (<Token> T)  char set range lit)
286
287;; Matches a single character
288
289(define=> (char <Token>)
290  (lambda (c) (tok c (try char=?))))
291 
292;; Matches any of a SRFI-14 set of characters.
293
294(define=> (set <Token>)
295  (lambda (s)
296    (let ((cs (if (char-set? s) s (list->char-set (if (string? s) (string->list s) s)))))
297      (tok cs (try char-set-contains?)))))
298
299;; Range of characters. Analogous to character class '[]'
300
301(define=> (range <Token>)
302  (letrec ((range0 
303            (lambda (a b)
304              (if (char<? b a) (range0 b a)
305                  (tok (ucs-range->char-set (char->integer a) (+ 1 (char->integer b))) 
306                       (try char-set-contains?))))))
307    range0))
308
309;; Matches a literal string s
310
311(define=> (lit <Token>)
312  (lambda (s)
313    (let ((f (lambda (t) (tok t (try char=?)))))
314      (lst (map f (if (string? s) (string->list s) s))))))
315
316
317(define (Token->CharLex T)
318  (make-<CharLex> T
319                  (char T)
320                  (set T)
321                  (range T)
322                  (lit T)
323                  ))
324
325
326)
Note: See TracBrowser for help on using the repository browser.