;;
;; Lexer combinator library.
;;
;; Based on the SML lexer generator by Thant Tessman.
;;
;; Ported to Chicken Scheme by Ivan Raikov.
;; Copyright 2009 Ivan Raikov.
;;
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions
;; are met:
;;
;; - Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; - Redistributions in binary form must reproduce the above
;; copyright notice, this list of conditions and the following
;; disclaimer in the documentation and/or other materials provided
;; with the distribution.
;;
;; - Neither name of the copyright holders nor the names of its
;; contributors may be used to endorse or promote products derived
;; from this software without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND THE
;; CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
;; INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
;; MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR THE
;; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
;; USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
;; AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;; LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;; ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;; POSSIBILITY OF SUCH DAMAGE.
;;
(module lexgen
( tok seq star bar
try pass pos opt char
set range lst lit
longest lex )
(import scheme chicken data-structures srfi-14)
(require-extension srfi-1 matchable)
;;
;; This is a lexer generator comprised in its core of four small
;; functions. The programmer assembles these functions into regular
;; expression pattern-matching functions.
;;
;; The idea is that a pattern matcher function takes a list of
;; streams, and returns a new list of streams advanced by every
;; combination allowed by the pattern matcher function. In this
;; implementation, a stream is simply a tuple containing a list of
;; elements consumed by the pattern matcher, and a list of
;; characters not yet consumed.
;;
;; Note that the number of streams returned by the function
;; typically won't match the number of streams passed in. If the
;; pattern doesn't match at all, the empty list is returned.
;;
;; 'tok' builds a pattern matcher function that applies procedure p to
;; a given token and an input character. If the procedure returns a
;; true value, that value is prepended to the list of consumed
;; elements, and the input character is removed from the list of input
;; elements.
(define (tok t p)
(let ((f (lambda (s)
(match s ((c (h . r))
(let ((ans (p t h)))
(and ans (list (cons ans c) r))))
((c ()) s)
(else #f)))))
(lambda (a r streams)
(let ((streams1 (filter-map f streams)))
(if (null? streams1) (r streams) (a streams1))))))
;; This matches a sequence of patterns.
(define (seq p1 p2)
(lambda (a r streams)
(p1 (lambda (streams1) (p2 a r streams1)) r streams)))
;; This matches either one of two patterns. It's analogous to patterns
;; separated by the '|' in regular expressions.
(define (bar p1 p2)
(lambda (a r streams)
(let ((r1 (lambda (streams1) (p2 a (lambda (streams2) (r streams1)) streams)) ))
(p1 a r1 streams))))
;; Kleene closure. Analogous to '*'
(define (star p)
(lambda (a r streams)
(let ((a1 (lambda (streams1) (a (concatenate (list streams streams1))))))
(p (lambda (streams1) (if (equal? streams streams1) (a streams1)
((star p) a1 a1 streams1)))
a streams))))
;; The rest of these are built from the previous four and are provided
;; for convenience.
;; this parser always succeeds
(define (pass a r s) (a s))
;; Positive closure. Analogous to '+'
(define (pos pat) (seq pat (star pat)))
;; Optional pattern. Analogous to '?'
(define (opt pat) (bar pat pass))
;; Converts a binary predicate procedure to a binary procedure that
;; returns its right argument when the predicate is true, and false
;; otherwise.
(define (try p) (lambda (x y) (let ((res (p x y))) (and res y))))
;; Matches a single character
(define (char c) (tok c (try char=?)))
;; Matches any of a SRFI-14 set of characters.
(define (set s)
(let ((cs (if (char-set? s) s (list->char-set (if (string? s) (string->list s) s)))))
(tok cs (try char-set-contains?))))
;; Range of characters. Analogous to character class '[]'
(define (range a b)
(if (charchar-set
(char->integer a) (char->integer b)))))
;; Matches a consecutive list of patterns
(define (lst ps)
(let ((ps (reverse ps)))
(let loop ((ps (cdr ps)) (p1 (car ps)))
(cond ((null? ps) p1)
((null? (cdr ps)) (seq (car ps) p1))
(else (loop (cdr ps) (seq (car ps) p1)))))))
;; Matches a literal string s
(define (lit s)
(let ((f (lambda (t) (tok t (try char=?)))))
(lst (map f (if (string? s) (string->list s) s)))))
;; Takes the resulting streams produced by the application of a
;; pattern on a stream (or streams) and selects the longest match if
;; one exists.
(define (longest streams)
(match-let (((count stream)
(fold (lambda (stream max)
(match (list stream max)
(((eaten food) (max-count max-stream))
(if (< max-count (length eaten))
(list (length eaten) stream) max))
(else (error 'longest "invalid stream" stream))))
(list 0 `(() ()))
streams)))
(and (positive? count) stream)))
;; This takes a pattern and a string, turns the string into a list of
;; streams (containing one stream), applies the pattern, and returns
;; the longest match.
(define (->char-list s)
(if (string? s) (string->list s) s))
(define (lex pat error s)
(let* ((stream (->char-list s))
(res (longest (pat identity error `((() ,stream))))))
(and res (list (reverse (first res)) (second res)))))
)