source: project/ssax/ssax-core.scm @ 2701

Last change on this file since 2701 was 2701, checked in by daishi, 13 years ago

ssax: fix for improving performance a little

File size: 114.1 KB
Line 
1; May be compiled with:
2;       chicken snapshot.scm -output-file snapshot.c
3;  and
4;       gcc -O3 -fomit-frame-pointer snapshot.c -lchicken -lm -lstuffed-chicken
5
6(declare
7  (fixnum)
8  (uses srfi-1 srfi-13)
9  (export SSAX:warn SSAX:skip-pi attlist-fold
10          SSAX:Prefix-XML SSAX:complete-start-tag SSAX:skip-S SSAX:read-markup-token
11          SSAX:assert-token parser-error SSAX:read-char-data 
12          SSAX:skip-internal-dtd SSAX:S-chars SSAX:read-QName
13          assert-curr-char SSAX:read-NCNname name-compare SSAX:resolve-name
14          SSAX:largest-unres-name SSAX:read-cdata-body SSAX:read-attributes
15          SSAX:ncname-starting-char? SSAX:read-external-ID SSAX:scan-Misc
16          SSAX:handle-parsed-entity SSAX:complete-start-tag xml-token-head xml-token-kind
17          SSAX:uri-string->symbol string-whitespace? SSAX:read-pi-body-as-string) )
18
19;(define pp pretty-print)
20;(define command-line argv)
21;KL
22; Chicken-enabled version of Oleg Kiselyov's Standard Scheme "Prelude"
23;
24
25;                  My Standard Scheme "Prelude"
26;
27; This version of the prelude contains several forms and procedures
28; that are specific to a Gambit-C 3.0 system.
29; See myenv-scm.scm, myenv-bigloo.scm, etc. for versions
30; of this prelude that are tuned to other Scheme systems.
31;
32; $Id: ssax-core.scm,v 1.2 2004/05/31 00:05:04 flw Exp $
33
34
35;;; assert truth of an expression (or a sequence of expressions)
36;;; if there is more than one expression, they're 'AND'ed
37(define-macro (assert . x)
38                  (if (null? (cdr x))
39                    `(or ,@x (error "failed assertion" ',@x))
40                    `(or (and ,@x) (error "failed assertion" '(,@x)))))
41
42;;;
43(define (assure exp error-msg)
44  (or exp (error error-msg)))
45
46
47;;; Gambit's include and declare are disabled
48;(define-macro (include file) `(begin #f))
49;(define-macro (declare . x)  `(begin #f))
50
51
52;;; like cout << arguments << args
53;;;   where argument can be any Scheme object. If it's a procedure
54;;;   (without args) it's executed rather than printed (like newline)
55(define (cout . args)
56  (for-each (lambda (x)
57              (if (procedure? x) (x) (display x)))
58            args))
59
60(define (cerr . args)
61  (for-each (lambda (x)
62    (if (procedure? x) (x (current-error-port)) (display x (current-error-port))))
63            args))
64
65(define nl (string #\newline))
66
67(define (identify-error msg args . disposition-msgs)
68  (let ((port (current-error-port)))
69    (newline port)
70    (display "ERROR" port)
71    (display msg port)
72    (for-each (lambda (msg) (display msg port))
73              (append args disposition-msgs))
74    (newline port)))
75
76; Some useful increment/decrement operators
77; Note, ##fixnum prefix is Gambit-specific, it means that the
78; operands assumed FIXNUM (as they ought to be anyway).
79; This perfix could be safely removed: it'll leave the code just as
80; correct, but more portable (and less efficient)
81
82; Mutable increment
83(define-macro (++! x) `(set! ,x (+ 1 ,x)))
84
85; Read-only increment
86(define-macro (++ x) `(add1 ,x))
87
88; Mutable decrement
89(define-macro (--! x) `(set! ,x (- ,x 1)))
90
91; Read-only decrement
92(define-macro (-- x) `(sub1 ,x))
93
94
95; Some useful control operators
96
97                        ; if condition is true, execute stmts in turn
98                        ; and return the result of the last statement
99                        ; otherwise, return #f
100(define-macro (when condition . stmts)
101  `(and ,condition (begin ,@stmts)))
102 
103
104                        ; if condition is false execute stmts in turn
105                        ; and return the result of the last statement
106                        ; otherwise, return #t
107                        ; This primitive is often called 'unless'
108(define-macro (whennot condition . stmts)
109  `(or ,condition (begin ,@stmts)))
110
111
112                        ; Execute a sequence of forms and return the
113                        ; result of the _first_ one. Like PROG1 in Lisp.
114                        ; Typically used to evaluate one or more forms with
115                        ; side effects and return a value that must be
116                        ; computed before some or all of the side effects
117                        ; happen.
118(define-macro (begin0 form . forms)
119  (let ((var (gensym)))
120    `(let ((,var ,form)) ,@forms ,var)))
121
122                        ; Prepend an ITEM to a LIST, like a Lisp macro PUSH
123                        ; an ITEM can be an expression, but ls must be a VAR
124(define-macro (push! item ls)
125  `(set! ,ls (cons ,item ,ls)))
126
127                        ; Is str the empty string?
128                        ; string-null? str -> bool
129                        ; See Olin Shiver's Underground String functions
130(define (string-null? str) (zero? (string-length str)))
131
132
133; Like let* but allowing for multiple-value bindings
134(define-macro (let-values* bindings . body)
135  (if (null? bindings) (cons 'begin body)
136      (apply (lambda (vars initializer)
137         (let ((cont 
138                (cons 'let-values* 
139                      (cons (cdr bindings) body))))
140           (cond
141            ((not (pair? vars))         ; regular let case, a single var
142             `(let ((,vars ,initializer)) ,cont))
143            ((null? (cdr vars))         ; single var, see the prev case
144             `(let ((,(car vars) ,initializer)) ,cont))
145           (else                        ; the most generic case
146            `(receive ,vars ,initializer ,cont)))))
147       (car bindings))))
148
149
150                        ; assoc-primitives with a default clause
151                        ; If the search in the assoc list fails, the
152                        ; default action argument is returned. If this
153                        ; default action turns out to be a thunk,
154                        ; the result of its evaluation is returned.
155                        ; If the default action is not given, an error
156                        ; is signaled
157
158(define-macro (assq-def key alist . default-action-arg)
159  (let ((default-action
160        (if (null? default-action-arg)
161          `(error "failed to assq key '" ,key "' in a list " ,alist)
162          (let ((defact-symb (car default-action-arg)))
163            `(if (procedure? ,defact-symb) (,defact-symb) ,defact-symb)))))
164    `(or (assq ,key ,alist) ,default-action)))
165
166(define-macro (assv-def key alist . default-action-arg)
167  (let ((default-action
168        (if (null? default-action-arg)
169          `(error "failed to assv key '" ,key "' in a list " ,alist)
170          (let ((defact-symb (car default-action-arg)))
171            `(if (procedure? ,defact-symb) (,defact-symb) ,defact-symb)))))
172    `(or (assv ,key ,alist) ,default-action)))
173
174(define-macro (assoc-def key alist . default-action-arg)
175  (let ((default-action
176        (if (null? default-action-arg)
177          `(error "failed to assoc key '" ,key "' in a list " ,alist)
178          (let ((defact-symb (car default-action-arg)))
179            `(if (procedure? ,defact-symb) (,defact-symb) ,defact-symb)))))
180    `(or (assoc ,key ,alist) ,default-action)))
181
182
183; Convenience macros to avoid quoting of symbols
184; being deposited/looked up in the environment
185(define-macro (env.find key) `(%%env.find ',key))
186(define-macro (env.demand key) `(%%env.demand ',key))
187(define-macro (env.bind key value) `(%%env.bind ',key ,value))
188
189; This code provides informative error messages
190;   for SSAX (S)XML parser.
191;
192;
193; NOTE: Chicken-specific !
194; It was tested with SSAX version 4.5
195;
196; $Id: ssax-core.scm,v 1.2 2004/05/31 00:05:04 flw Exp $
197
198
199
200;****************************************************************************
201;                       My Scheme misc utility functions
202;               (mainly dealing with string and list manipulations)
203;
204; $Id: ssax-core.scm,v 1.2 2004/05/31 00:05:04 flw Exp $
205
206(declare                        ; Gambit-compiler optimization options
207 (standard-bindings)
208 (fixnum)
209)
210;(include "myenv.scm") ; include target dependent stuff
211
212
213;------------------------------------------------------------------------
214;                               Iterator ANY?
215;
216; -- procedure+: any? PRED COLLECTION
217;       Searches for the first element in the collection satisfying a
218;       given predicate
219;       That is, the procedure applies PRED to every element of the
220;       COLLECTION in turn.
221;       The first element for which PRED returns non-#f stops the iteration;
222;       the value of the predicate is returned.
223;       If none of the elements of the COLLECTION satisfy the predicate,
224;       the return value from the procedure is #f
225;       COLLECTION can be a list, a vector, a string, or an input port.
226
227(define (any? <pred?> coll)
228  (cond
229    ((list? coll)
230      (let loop ((curr-l coll))
231        (if (null? curr-l) #f
232          (or (<pred?> (car curr-l)) (loop (cdr curr-l))))))
233         
234    ((vector? coll)
235      (let ((len (vector-length coll)))
236       (let loop ((i 0))
237        (if (>= i len) #f
238          (or (<pred?> (vector-ref coll i)) (loop (+ 1 i)))))))
239
240    ((string? coll)
241      (let ((len (string-length coll)))
242       (let loop ((i 0))
243        (if (>= i len) #f
244          (or (<pred?> (string-ref coll i)) (loop (+ 1 i)))))))
245
246    ((input-port? coll)
247      (let loop ((c (read-char coll)))
248        (if (eof-object? c) #f
249          (or (<pred?> c) (loop (read-char coll))))))
250
251    (else (error "any? on an invalid collection"))))
252
253(define (test-any?)
254
255  (define (test-driver pred? coll expected-result)
256    (let ((res (any? pred? coll)))
257      (if (not (eqv? res expected-result))
258        (error "computed result " res "differs from the expected one "
259               expected-result))))
260  (define (eq-a? x) (if (char=? x #\a) x #f))
261  (define (gt1? x) (if (> x 1) x #f))
262 
263  (cout "finding an element in a list" nl)
264  (test-driver gt1? '(1 2 3 4 5) 2)
265  (test-driver gt1? '(1 1 1 1 1) #f)
266  (test-driver gt1? '(4 1 1 1 1) 4)
267  (test-driver gt1? '(4 5 6 1 9) 4)
268  (test-driver gt1? '(-4 -5 -6 1 9) 9)
269  (test-driver eq-a? '(#\b #\c #\a #\k) #\a)
270 
271  (cout "finding an element in a vector" nl)
272  (test-driver gt1? '#(1 2 3 4 5) 2)
273  (test-driver gt1? '#(1 1 1 1 1) #f)
274  (test-driver gt1? '#(4 1 1 1 1) 4)
275  (test-driver gt1? '#(4 5 6 1 9) 4)
276  (test-driver gt1? '#(-4 -5 -6 1 9) 9)
277  (test-driver eq-a? '#(#\b #\c #\a #\k) #\a)
278 
279  (cout "done" nl nl)
280)
281
282
283
284;------------------------------------------------------------------------
285;               Some list manipulation functions
286
287; -- procedure+: list-intersperse SRC-L ELEM
288; inserts ELEM between elements of the SRC-L, returning a freshly allocated
289; list (cells, that is)
290     
291(define (list-intersperse src-l elem)
292  (if (null? src-l) src-l
293    (let loop ((l (cdr src-l)) (dest (cons (car src-l) '())))
294      (if (null? l) (reverse dest)
295        (loop (cdr l) (cons (car l) (cons elem dest)))))))
296
297
298; -- procedure+: list-intersperse! SRC-L ELEM
299; inserts ELEM between elements of the SRC-L inplace
300     
301(define (list-intersperse! src-l elem)
302  (if (null? src-l) src-l
303    (let loop ((l src-l))
304      (let ((next-l (cdr l)))
305        (if (null? next-l) src-l
306          (begin
307            (set-cdr! l (cons elem next-l))
308            (loop next-l)))))))
309
310
311        ; List-tail-difference: given two lists, list1 and list2 where
312        ; list2 is presumably a tail of list1, return
313        ; a (freshly allocated) list which is a difference between list1
314        ; and list2. If list2 is *not* a tail of list1, the entire list1
315        ; is returned.
316(define (list-tail-diff list1 list2)
317  (let loop ((l1-curr list1) (difference '()))
318    (cond
319      ((eq? l1-curr list2) (reverse difference))
320      ((null? l1-curr) (reverse difference))
321      (else (loop (cdr l1-curr) (cons (car l1-curr) difference))))))
322
323
324;------------------------------------------------------------------------
325
326
327                ; Return the index of the first occurence of a-char in str, or #f
328(define (string-index str a-char)
329  (let loop ((pos 0))
330    (cond
331      ((>= pos (string-length str)) #f) ; whole string has been searched, in vain
332      ((char=? a-char (string-ref str pos)) pos)
333      (else (loop (+ 1 pos))))))
334
335                ; Return the index of the last occurence of a-char in str, or #f
336(define (string-rindex str a-char)
337  (let loop ((pos (-- (string-length str))))
338    (cond
339      ((negative? pos) #f)              ; whole string has been searched, in vain
340      ((char=? a-char (string-ref str pos)) pos)
341      (else (loop (-- pos))))))
342
343
344
345;
346; -- procedure+: substring? PATTERN STRING
347;     Searches STRING to see if it contains the substring PATTERN.
348;     Returns the index of the first substring of STRING that is equal
349;     to PATTERN; or `#f' if STRING does not contain PATTERN.
350;
351;          (substring? "rat" "pirate")             =>  2
352;          (substring? "rat" "outrage")            =>  #f
353;          (substring? "" any-string)              =>  0
354
355(define (substring? pattern str)
356  (let* ((pat-len (string-length pattern))
357         (search-span (- (string-length str) pat-len))
358         (c1 (if (zero? pat-len) #f (string-ref pattern 0)))
359         (c2 (if (<= pat-len 1) #f (string-ref pattern 1))))
360    (cond
361     ((not c1) 0)           ; empty pattern, matches upfront
362     ((not c2) (string-index str c1)) ; one-char pattern
363     (else                  ; matching a pattern of at least two chars
364        (let outer ((pos 0))
365          (cond
366            ((> pos search-span) #f)    ; nothing was found thru the whole str
367            ((not (char=? c1 (string-ref str pos)))
368                (outer (+ 1 pos)))      ; keep looking for the right beginning
369            ((not (char=? c2 (string-ref str (+ 1 pos))))
370                (outer (+ 1 pos)))      ; could've done pos+2 if c1 == c2....
371            (else                       ; two char matched: high probability
372                                        ; the rest will match too
373                (let inner ((i-pat 2) (i-str (+ 2 pos)))
374                   (if (>= i-pat pat-len) pos ; whole pattern matched
375                      (if (char=? (string-ref pattern i-pat)
376                                  (string-ref str i-str))
377                        (inner (+ 1 i-pat) (+ 1 i-str))
378                        (outer (+ 1 pos))))))))))))     ; mismatch after partial match
379
380; Here are some specialized substring? functions
381;
382; -- procedure+: string-prefix? PATTERN STRING
383; -- procedure+: string-prefix-ci? PATTERN STRING
384; checks to make sure that PATTERN is a prefix of STRING
385;
386;          (string-prefix? "pir" "pirate")             =>  #t
387;          (string-prefix? "rat" "outrage")            =>  #f
388;          (string-prefix? "" any-string)              =>  #t
389;          (string-prefix? any-string any-string)      =>  #t
390
391(define (string-prefix? pattern str)
392  (let loop ((i 0))
393    (cond
394      ((>= i (string-length pattern)) #t)
395      ((>= i (string-length str)) #f)
396      ((char=? (string-ref pattern i) (string-ref str i))
397        (loop (++ i)))
398      (else #f))))
399
400(define (string-prefix-ci? pattern str)
401  (let loop ((i 0))
402    (cond
403      ((>= i (string-length pattern)) #t)
404      ((>= i (string-length str)) #f)
405      ((char-ci=? (string-ref pattern i) (string-ref str i))
406        (loop (++ i)))
407      (else #f))))
408
409; -- procedure+: string-suffix? PATTERN STRING
410; -- procedure+: string-suffix-ci? PATTERN STRING
411; checks to make sure that PATTERN is a suffix of STRING
412;
413;          (string-suffix? "ate" "pirate")             =>  #t
414;          (string-suffix? "rag" "outrage")            =>  #f
415;          (string-suffix? "" any-string)              =>  #t
416;          (string-suffix? any-string any-string)      =>  #t
417
418(define (string-suffix? pattern str)
419  (let loop ((i (-- (string-length pattern))) (j (-- (string-length str))))
420    (cond
421      ((negative? i) #t)
422      ((negative? j) #f)
423      ((char=? (string-ref pattern i) (string-ref str j))
424        (loop (-- i) (-- j)))
425      (else #f))))
426
427(define (string-suffix-ci? pattern str)
428  (let loop ((i (-- (string-length pattern))) (j (-- (string-length str))))
429    (cond
430      ((negative? i) #t)
431      ((negative? j) #f)
432      ((char-ci=? (string-ref pattern i) (string-ref str j))
433        (loop (-- i) (-- j)))
434      (else #f))))
435
436
437;               String case modification functions
438
439                        ; Return a new string made of characters of the
440                        ; original string in the lower case
441(define (string-downcase str)
442  (do ((target-str (make-string (string-length str))) (i 0 (++ i)))
443      ((>= i (string-length str)) target-str)
444      (string-set! target-str i (char-downcase (string-ref str i)))))
445
446                        ; Return a new string made of characters of the
447                        ; original string in the upper case
448(define (string-upcase str)
449  (do ((target-str (make-string (string-length str))) (i 0 (++ i)))
450      ((>= i (string-length str)) target-str)
451      (string-set! target-str i (char-upcase (string-ref str i)))))
452
453                        ; Lower the case of string's characters inplace
454(define (string-downcase! str)
455  (do ((i 0 (++ i))) ((>= i (string-length str)))
456    (string-set! str i (char-downcase (string-ref str i)))))
457
458                        ; Raise the case of string's characters inplace
459(define (string-upcase! str)
460  (do ((i 0 (++ i))) ((>= i (string-length str)))
461    (string-set! str i (char-upcase (string-ref str i)))))
462
463
464;
465; -- procedure+: string->integer STR START END
466;
467; Makes sure a substring of the STR from START (inclusive) till END (exclusive)
468; is a representation of a non-negative integer in decimal notation. If so, this
469; integer is returned. Otherwise -- when the substring contains non-decimal
470; characters, or when the range from START till END is not within STR, the
471; result is #f.
472;
473; This procedure is a simplification of the standard string->number.
474; The latter is far more generic: for example, it will try to read
475; strings like "1/2" "1S2" "1.34" and even "1/0" (the latter causing
476; a zero-divide error). Note that to string->number,  "1S2" is a valid
477; representation of an _inexact_ integer (100 to be precise).
478; Oftentimes we want to be more restrictive about what we consider a number; we
479; want merely to read an integral label.
480
481(define (string->integer str start end)
482  (and (< -1 start end (++ (string-length str)))
483    (let loop ((pos start) (accum 0))
484      (cond
485        ((>= pos end) accum)
486        ((char-numeric? (string-ref str pos))
487          (loop (++ pos) (+ (char->integer (string-ref str pos)) 
488              (- (char->integer #\0)) (* 10 accum))))
489        (else #f)))))
490
491
492;
493; -- procedure+: string-split STRING
494; -- procedure+: string-split STRING '()
495; -- procedure+: string-split STRING '() MAXSPLIT
496;
497; Returns a list of whitespace delimited words in STRING.
498; If STRING is empty or contains only whitespace, then the empty list
499; is returned. Leading and trailing whitespaces are trimmed.
500; If MAXSPLIT is specified and positive, the resulting list will contain at most
501; MAXSPLIT elements, the last of which is the string remaining after
502; (MAXSPLIT - 1) splits. If MAXSPLIT is specified and non-positive,
503; the empty list is returned. "In time critical applications it behooves
504; you not to split into more fields than you really need."
505;
506; -- procedure+: string-split STRING CHARSET
507; -- procedure+: string-split STRING CHARSET MAXSPLIT
508;
509; Returns a list of words delimited by the characters in CHARSET in
510; STRING. CHARSET is a list of characters that are treated as delimiters.
511; Leading or trailing delimeters are NOT trimmed. That is, the resulting
512; list will have as many initial empty string elements as there are
513; leading delimiters in STRING.
514;
515; If MAXSPLIT is specified and positive, the resulting list will contain at most
516; MAXSPLIT elements, the last of which is the string remaining after
517; (MAXSPLIT - 1) splits. If MAXSPLIT is specified and non-positive,
518; the empty list is returned. "In time critical applications it behooves
519; you not to split into more fields than you really need."
520;
521; This is based on the split function in Python/Perl
522;
523; (string-split " abc d e f  ") ==> ("abc" "d" "e" "f")
524; (string-split " abc d e f  " '() 1) ==> ("abc d e f  ")
525; (string-split " abc d e f  " '() 0) ==> ()
526; (string-split ":abc:d:e::f:" '(#\:)) ==> ("" "abc" "d" "e" "" "f" "")
527; (string-split ":" '(#\:)) ==> ("" "")
528; (string-split "root:x:0:0:Lord" '(#\:) 2) ==> ("root" "x:0:0:Lord")
529; (string-split "/usr/local/bin:/usr/bin:/usr/ucb/bin" '(#\:))
530; ==> ("/usr/local/bin" "/usr/bin" "/usr/ucb/bin")
531; (string-split "/usr/local/bin" '(#\/)) ==> ("" "usr" "local" "bin")
532
533(define (string-split str . rest)
534                ; maxsplit is a positive number
535  (define (split-by-whitespace str maxsplit)
536    (define (skip-ws i yet-to-split-count)
537      (cond
538        ((>= i (string-length str)) '())
539        ((char-whitespace? (string-ref str i))
540          (skip-ws (++ i) yet-to-split-count))
541        (else (scan-beg-word (++ i) i yet-to-split-count))))
542    (define (scan-beg-word i from yet-to-split-count)
543      (cond
544        ((zero? yet-to-split-count)
545          (cons (substring str from (string-length str)) '()))
546        (else (scan-word i from yet-to-split-count))))
547    (define (scan-word i from yet-to-split-count)
548      (cond
549        ((>= i (string-length str))
550          (cons (substring str from i) '()))
551        ((char-whitespace? (string-ref str i))
552          (cons (substring str from i) 
553            (skip-ws (++ i) (-- yet-to-split-count))))
554        (else (scan-word (++ i) from yet-to-split-count))))
555    (skip-ws 0 (-- maxsplit)))
556
557                ; maxsplit is a positive number
558                ; str is not empty
559  (define (split-by-charset str delimeters maxsplit)
560    (define (scan-beg-word from yet-to-split-count)
561      (cond
562        ((>= from (string-length str)) '(""))
563        ((zero? yet-to-split-count)
564          (cons (substring str from (string-length str)) '()))
565        (else (scan-word from from yet-to-split-count))))
566    (define (scan-word i from yet-to-split-count)
567      (cond
568        ((>= i (string-length str))
569          (cons (substring str from i) '()))
570        ((memq (string-ref str i) delimeters)
571          (cons (substring str from i) 
572            (scan-beg-word (++ i) (-- yet-to-split-count))))
573        (else (scan-word (++ i) from yet-to-split-count))))
574    (scan-beg-word 0 (-- maxsplit)))
575
576                        ; resolver of overloading...
577                        ; if omitted, maxsplit defaults to
578                        ; (++ (string-length str))
579  (if (string-null? str) '()
580    (if (null? rest) 
581      (split-by-whitespace str (++ (string-length str)))
582      (let ((charset (car rest))
583          (maxsplit
584            (if (pair? (cdr rest)) (cadr rest) (++ (string-length str)))))
585        (cond
586          ((not (positive? maxsplit)) '())
587          ((null? charset) (split-by-whitespace str maxsplit))
588          (else (split-by-charset str charset maxsplit))))))
589)
590
591;------------------------------------------------------------------------
592;                       EXEC-PATH facility
593; The EXEC-PATH is the path used to load .scm files or run executable files
594; with.
595;
596; The path could be specified as an env-variable SCMPATH
597; if this env variable isn't set, the path of the executable file
598; is used
599
600; (define EXEC-PATH:PATH
601
602;       ; extract the path from argv[0] and make it absolute
603;   (letrec ((extract-argv0-path
604;         (lambda ()
605;           (let ((argv0-rel-path
606;                 (let ((argv0 (vector-ref argv 0)))
607;                       ; reverse search for the last PATH-SEPARATOR-CHAR
608;                   (let loop ((i (-- (string-length argv0))))
609;                     (cond
610;                       ((not (positive? i)) #f)
611;                       ((char=? (string-ref argv0 i) PATH-SEPARATOR-CHAR)
612;                         (substring argv0 0 (+ i 1)))
613;                       (else (loop (-- i)))))))
614;               (curr-path (OS:getcwd)))
615;             (cond
616;               ((not argv0-rel-path) curr-path)
617;                       ; check if argv0-rel-path was an absolute path
618;               ((char=? PATH-SEPARATOR-CHAR (string-ref argv0-rel-path 0))
619;                 argv0-rel-path)
620;               (else (string-append curr-path (string PATH-SEPARATOR-CHAR)
621;                   argv0-rel-path))))))
622             
623;               ; cached path
624;       (path-itself #f)
625;                       ; returned the cached path
626;       (get-path (lambda () path-itself))
627     
628;                       ; Compute the cached path, runs only the first time around
629;       (set-path
630;         (lambda ()
631;           (let ((path-being-computed
632;                 (or (OS:getenv "SCMPATH") (extract-argv0-path))))
633;             (assert path-being-computed)
634;             (set! path-itself
635;               (if (char=? PATH-SEPARATOR-CHAR
636;                   (string-ref path-being-computed
637;                     (-- (string-length path-being-computed))))
638;                 path-being-computed
639;            ; make sure path-being-computed ends in PATH-SEPARATOR-CHAR
640;                 (string-append path-being-computed (string PATH-SEPARATOR-CHAR))))
641;             (cerr nl "EXEC PATH is set to " path-itself nl)
642;             (set! EXEC-PATH:PATH get-path)
643;             (get-path)))))
644   
645;     set-path))
646
647   
648
649; (define (EXEC-PATH:load scm-file-name)
650;   (load (string-append (EXEC-PATH:PATH) scm-file-name)))
651
652; (define (EXEC-PATH:help)
653;   (cerr nl "Environment variable SCMPATH could be set up to point" nl)
654;   (cerr "to a directory containing dynamically loadable dictionaries/conf files" nl)
655;   (cerr "if this variable is not set, the directory where this executable resides" nl)
656;   (cerr "will be used" nl nl)
657;   (cerr "The current path is set to " (EXEC-PATH:PATH) nl nl))
658
659; ; "shell-out" to run a separately compiled executable with given
660; ; arguments, from the current EXEC-PATH
661; ; all 'args' (which must be strings) are "concatenated" together
662; ; to form a command-line for the executable
663; (define (EXEC-PATH:system executable . args)
664;   (let ((command-line
665;         (apply string-append
666;           (cons (EXEC-PATH:PATH) (cons executable (cons " " args))))))
667;     (cerr "about to execute: " command-line nl)
668;     (OS:system command-line)))
669
670;****************************************************************************
671;                       Simple Parsing of input
672;
673; The following simple functions surprisingly often suffice to parse
674; an input stream. They either skip, or build and return tokens,
675; according to inclusion or delimiting semantics. The list of
676; characters to expect, include, or to break at may vary from one
677; invocation of a function to another. This allows the functions to
678; easily parse even context-sensitive languages.
679;
680; EOF is generally frowned on, and thrown up upon if encountered.
681; Exceptions are mentioned specifically. The list of expected characters
682; (characters to skip until, or break-characters) may include an EOF
683; "character", which is to be coded as symbol *eof*
684;
685; The input stream to parse is specified as a PORT, which is usually
686; the last (and optional) argument. It defaults to the current input
687; port if omitted.
688;
689; IMPORT
690; This package relies on a function parser-error, which must be defined
691; by a user of the package. The function has the following signature:
692;       parser-error PORT MESSAGE SPECIALISING-MSG*
693; Many procedures of this package call parser-error to report a parsing
694; error.  The first argument is a port, which typically points to the
695; offending character or its neighborhood. Most of the Scheme systems
696; let the user query a PORT for the current position. MESSAGE is the
697; description of the error. Other arguments supply more details about
698; the problem.
699;
700; $Id: ssax-core.scm,v 1.2 2004/05/31 00:05:04 flw Exp $
701
702(declare                        ; Gambit-compiler optimization options
703 (standard-bindings)
704 (extended-bindings)            ; Needed for #!optional arguments, DSSSL-style
705 (fixnum)                       ; optional, keyword and rest arguments
706)
707;(include "myenv.scm") ; include target dependent stuff
708
709;------------------------------------------------------------------------
710;                    Preparation and tuning section
711
712; This package is heavily used. Therefore, we take time to tune it in,
713; in particular for Gambit.
714
715
716; Concise and efficient definition of a function that takes one or two
717; optional arguments, e.g.,
718;
719; (define-opt (foo arg1 arg2 (optional (arg3 init3) (arg4 init4))) body)
720;
721; define-opt is identical to a regular define, with one exception: the
722; last argument may have a form
723;       (optional (binding init) ... )
724
725(cond-expand
726 ((or bigloo gambit)
727
728    ; For Gambit and Bigloo, which support DSSSL extended lambdas,
729    ; define-opt like the one in the example above is re-written into
730    ; (define-opt (foo arg1 arg2 #!optional (arg3 init3) (arg4 init4)) body)
731  (define-macro (define-opt bindings body . body-rest)
732    (let* ((rev-bindings (reverse bindings))
733           (opt-bindings
734            (and (pair? rev-bindings) (pair? (car rev-bindings))
735                 (eq? 'optional (caar rev-bindings))
736                 (cdar rev-bindings))))
737      (if opt-bindings
738        `(define ,(append (reverse
739                           (cons (with-input-from-string "#!optional" read)
740                                 (cdr rev-bindings)))
741                          opt-bindings)
742           ,body ,@body-rest)
743        `(define ,bindings ,body ,@body-rest))))
744  )
745 (else
746
747    ; For Scheme systems without DSSSL extensions, we rewrite the definition
748    ; of foo of the example above into the following:
749    ;   (define (foo arg1 arg2 . rest)
750    ;      (let* ((arg3 (if (null? rest) init3 (car rest)))
751    ;             (arg4 (if (or (null? rest) (null? (cdr rest))) init4
752    ;                       (cadr rest)))
753    ;        body))
754    ; We won't handle more than two optional arguments
755
756  (define-macro define-opt (lambda (bindings body . body-rest)
757    (let* ((rev-bindings (reverse bindings))
758           (opt-bindings
759            (and (pair? rev-bindings) (pair? (car rev-bindings))
760                 (eq? 'optional (caar rev-bindings))
761                 (cdar rev-bindings))))
762      (cond
763       ((not opt-bindings)              ; No optional arguments
764        `(define ,bindings ,body ,@body-rest))
765       ((null? opt-bindings)
766        `(define ,bindings ,body ,@body-rest))
767       ((or (null? (cdr opt-bindings)) (null? (cddr opt-bindings)))
768        (let* ((rest (gensym))          ; One or two optional args
769               (first-opt (car opt-bindings))
770               (second-opt (and (pair? (cdr opt-bindings))
771                                (cadr opt-bindings))))
772          `(define ,(let loop ((bindings bindings))
773                      (if (null? (cdr bindings)) rest
774                          (cons (car bindings) (loop (cdr bindings)))))
775             (let* ((,(car first-opt) (if (null? ,rest)
776                                          ,(cadr first-opt)
777                                          (car ,rest)))
778                    ,@(if second-opt
779                          `((,(car second-opt) 
780                             (if (or (null? ,rest) (null? (cdr ,rest)))
781                                 ,(cadr second-opt)
782                                 (cadr ,rest))))
783                          '()))
784               ,body ,@body-rest))))
785       (else
786        '(error "At most two options are supported"))))))
787  ))
788
789(cond-expand
790 (gambit
791      ; The following macro makes a macro that turns (read-char port)
792      ; into (##read-char port). We can't enter such a macro-converter
793      ; directly as readers of SCM and Bigloo, for ones, don't like
794      ; identifiers with two leading # characters
795   (define-macro (gambitize clause)
796     `(define-macro ,clause
797        ,(list 'quasiquote
798            (cons
799             (string->symbol (string-append "##"
800                                            (symbol->string (car clause))))
801             (map (lambda (id) (list 'unquote id)) (cdr clause))))))
802   (gambitize (read-char port))
803   (gambitize (peek-char port))
804   (gambitize (eof-object? port))
805   ;(gambitize (string-append a b))
806   )
807 (chicken
808   (define-macro (read-char port) `(##sys#read-char-0 ,port))
809   (define-macro (peek-char port) `(##sys#peek-char-0 ,port))
810   )
811 (else #t))
812
813
814
815;------------------------------------------------------------------------
816
817; -- procedure+: peek-next-char [PORT]
818;       advances to the next character in the PORT and peeks at it.
819;       This function is useful when parsing LR(1)-type languages
820;       (one-char-read-ahead).
821;       The optional argument PORT defaults to the current input port.
822
823(define-opt (peek-next-char (optional (port (current-input-port))))
824  (read-char port) 
825  (peek-char port)) 
826
827
828;------------------------------------------------------------------------
829
830; -- procedure+: assert-curr-char CHAR-LIST STRING [PORT]
831;       Reads a character from the PORT and looks it up
832;       in the CHAR-LIST of expected characters
833;       If the read character was found among expected, it is returned
834;       Otherwise, the procedure writes a nasty message using STRING
835;       as a comment, and quits.
836;       The optional argument PORT defaults to the current input port.
837;
838(define-opt (assert-curr-char expected-chars comment
839                              (optional (port (current-input-port))))
840  (let ((c (read-char port)))
841    (if (memq c expected-chars) c
842    (parser-error port "Wrong character " c
843           " (0x" (if (eof-object? c) "*eof*"
844                    (number->string (char->integer c) 16)) ") "
845           comment ". " expected-chars " expected"))))
846           
847
848; -- procedure+: skip-until CHAR-LIST [PORT]
849;       Reads and skips characters from the PORT until one of the break
850;       characters is encountered. This break character is returned.
851;       The break characters are specified as the CHAR-LIST. This list
852;       may include EOF, which is to be coded as a symbol *eof*
853;
854; -- procedure+: skip-until NUMBER [PORT]
855;       Skips the specified NUMBER of characters from the PORT and returns #f
856;
857;       The optional argument PORT defaults to the current input port.
858
859
860(define-opt (skip-until arg (optional (port (current-input-port))) )
861  (cond
862   ((number? arg)               ; skip 'arg' characters
863      (do ((i arg (-- i)))
864          ((<= i 0) #f)
865          (if (eof-object? (read-char port))
866            (parser-error port "Unexpected EOF while skipping "
867                         arg " characters"))))
868   (else                        ; skip until break-chars (=arg)
869     (let loop ((c (read-char port)))
870       (cond
871         ((memv c arg) c)
872         ((eof-object? c)
873           (if (memv '*eof* arg) c
874             (parser-error port "Unexpected EOF while skipping until " arg)))
875         (else (loop (read-char port))))))))
876
877
878; -- procedure+: skip-while CHAR-LIST [PORT]
879;       Reads characters from the PORT and disregards them,
880;       as long as they are mentioned in the CHAR-LIST.
881;       The first character (which may be EOF) peeked from the stream
882;       that is NOT a member of the CHAR-LIST is returned. This character
883;       is left on the stream.
884;       The optional argument PORT defaults to the current input port.
885
886(define-opt (skip-while skip-chars (optional (port (current-input-port))) )
887  (do ((c (peek-char port) (peek-char port)))
888      ((not (memv c skip-chars)) c)
889      (read-char port)))
890 
891; whitespace const
892
893;------------------------------------------------------------------------
894;                               Stream tokenizers
895
896
897; -- procedure+:
898;    next-token PREFIX-CHAR-LIST BREAK-CHAR-LIST [COMMENT-STRING] [PORT]
899;       skips any number of the prefix characters (members of the
900;       PREFIX-CHAR-LIST), if any, and reads the sequence of characters
901;       up to (but not including) a break character, one of the
902;       BREAK-CHAR-LIST.
903;       The string of characters thus read is returned.
904;       The break character is left on the input stream
905;       The list of break characters may include EOF, which is to be coded as
906;       a symbol *eof*. Otherwise, EOF is fatal, generating an error message
907;       including a specified COMMENT-STRING (if any)
908;
909;       The optional argument PORT defaults to the current input port.
910;
911; Note: since we can't tell offhand how large the token being read is
912; going to be, we make a guess, pre-allocate a string, and grow it by
913; quanta if necessary. The quantum is always the length of the string
914; before it was extended the last time. Thus the algorithm does
915; a Fibonacci-type extension, which has been proven optimal.
916; Note, explicit port specification in read-char, peek-char helps.
917
918; Procedure input-parse:init-buffer
919; returns an initial buffer for next-token* procedures.
920; The input-parse:init-buffer may allocate a new buffer per each invocation:
921;       (define (input-parse:init-buffer) (make-string 32))
922; Size 32 turns out to be fairly good, on average.
923; That policy is good only when a Scheme system is multi-threaded with
924; preemptive scheduling, or when a Scheme system supports shared substrings.
925; In all the other cases, it's better for input-parse:init-buffer to
926; return the same static buffer. next-token* functions return a copy
927; (a substring) of accumulated data, so the same buffer can be reused.
928; We shouldn't worry about new token being too large: next-token will use
929; a larger buffer automatically. Still, the best size for the static buffer
930; is to allow most of the tokens to fit in.
931; Using a static buffer _dramatically_ reduces the amount of produced garbage
932; (e.g., during XML parsing).
933(define input-parse-buffer (make-parameter #f))
934
935(define input-parse:init-buffer
936  (lambda ()
937    (or
938      (input-parse-buffer)
939      (let ((buffer (make-string 512)))
940        (input-parse-buffer buffer)
941        buffer))))
942
943(define-opt (next-token prefix-skipped-chars break-chars
944                        (optional (comment "") (port (current-input-port))) )
945  (let* ((buffer (input-parse:init-buffer))
946         (curr-buf-len (string-length buffer)) (quantum 16))
947    (let loop ((i 0) (c (skip-while prefix-skipped-chars port)))
948      (cond
949        ((memq c break-chars) (substring buffer 0 i))
950        ((eof-object? c)
951          (if (memq '*eof* break-chars)
952            (substring buffer 0 i)              ; was EOF expected?
953            (parser-error port "EOF while reading a token " comment)))
954        (else
955          (if (>= i curr-buf-len)       ; make space for i-th char in buffer
956            (begin                      ; -> grow the buffer by the quantum
957              (set! buffer (string-append buffer (make-string quantum)))
958              (set! quantum curr-buf-len)
959              (set! curr-buf-len (string-length buffer))))
960          (string-set! buffer i c)
961          (read-char port)                      ; move to the next char
962          (loop (++ i) (peek-char port))
963          )))))
964       
965
966; Another version of next-token, accumulating characters in a list rather
967; than in a string buffer. I heard that it tends to work faster.
968; In reality, it works just as fast as the string buffer version above,
969; but it allocates 50% more memory and thus has to run garbage collection
970; 50% as many times. See next-token-comp.scm
971
972(define-opt (next-token-list-based prefix-skipped-chars break-chars
973                  (optional (comment "") (port (current-input-port))) )
974  (let* ((first-char (skip-while prefix-skipped-chars port))
975         (accum-chars (cons first-char '())))
976    (cond
977      ((eof-object? first-char)
978        (if (memq '*eof* break-chars) ""
979          (parser-error port "EOF while skipping before reading token "
980                       comment)))
981      ((memq first-char break-chars) "")
982      (else
983        (read-char port)                ; consume the first-char
984        (let loop ((tail accum-chars) (c (peek-char port)))
985          (cond
986            ((memq c break-chars) (list->string accum-chars))
987            ((eof-object? c)
988              (if (memq '*eof* break-chars)
989                (list->string accum-chars)              ; was EOF expected?
990                (parser-error port "EOF while reading a token " comment)))
991            (else
992              (read-char port)          ; move to the next char
993              (set-cdr! tail (cons c '()))
994              (loop (cdr tail) (peek-char port))
995        )))))))
996
997
998; -- procedure+: next-token-of INC-CHARSET [PORT]
999;       Reads characters from the PORT that belong to the list of characters
1000;       INC-CHARSET. The reading stops at the first character which is not
1001;       a member of the set. This character is left on the stream.
1002;       All the read characters are returned in a string.
1003;
1004; -- procedure+: next-token-of PRED [PORT]
1005;       Reads characters from the PORT for which PRED (a procedure of one
1006;       argument) returns non-#f. The reading stops at the first character
1007;       for which PRED returns #f. That character is left on the stream.
1008;       All the results of evaluating of PRED up to #f are returned in a
1009;       string.
1010;
1011;       PRED is a procedure that takes one argument (a character
1012;       or the EOF object) and returns a character or #f. The returned
1013;       character does not have to be the same as the input argument
1014;       to the PRED. For example,
1015;       (next-token-of (lambda (c)
1016;                         (cond ((eof-object? c) #f)
1017;                               ((char-alphabetic? c) (char-downcase c))
1018;                               (else #f))))
1019;       will try to read an alphabetic token from the current
1020;       input port, and return it in lower case.
1021;
1022;       The optional argument PORT defaults to the current input port.
1023;
1024; Note: since we can't tell offhand how large the token being read is
1025; going to be, we make a guess, pre-allocate a string, and grow it by
1026; quanta if necessary. The quantum is always the length of the string
1027; before it was extended the last time. Thus the algorithm does
1028; a Fibonacci-type extension, which has been proven optimal.
1029;
1030; This procedure is similar to next-token but only it implements
1031; an inclusion rather than delimiting semantics.
1032
1033(define-opt (next-token-of incl-list/pred
1034                           (optional (port (current-input-port))) )
1035  (let* ((buffer (input-parse:init-buffer))
1036         (curr-buf-len (string-length buffer)) (quantum 16))
1037  (if (procedure? incl-list/pred)
1038    (let loop ((i 0) (c (peek-char port)))
1039      (cond
1040        ((incl-list/pred c) =>
1041          (lambda (c)
1042            (if (>= i curr-buf-len)     ; make space for i-th char in buffer
1043              (begin                    ; -> grow the buffer by the quantum
1044                (set! buffer (string-append buffer (make-string quantum)))
1045                (set! quantum curr-buf-len)
1046                (set! curr-buf-len (string-length buffer))))
1047            (string-set! buffer i c)
1048            (read-char port)                    ; move to the next char
1049            (loop (++ i) (peek-char port))))
1050        (else (substring buffer 0 i))))
1051                        ; incl-list/pred is a list of allowed characters
1052    (let loop ((i 0) (c (peek-char port)))
1053      (cond
1054        ((not (memq c incl-list/pred)) (substring buffer 0 i))
1055        (else
1056          (if (>= i curr-buf-len)       ; make space for i-th char in buffer
1057            (begin                      ; -> grow the buffer by the quantum
1058              (set! buffer (string-append buffer (make-string quantum)))
1059              (set! quantum curr-buf-len)
1060              (set! curr-buf-len (string-length buffer))))
1061          (string-set! buffer i c)
1062          (read-char port)                      ; move to the next char
1063          (loop (++ i) (peek-char port))
1064          ))))))
1065
1066
1067; -- procedure+: read-line [PORT]
1068;       Reads one line of text from the PORT, and returns it as a string.
1069;       A line is a (possibly empty) sequence of characters terminated
1070;       by CR, CRLF or LF (or even the end of file).
1071;       The terminating character (or CRLF combination) is removed from
1072;       the input stream. The terminating character(s) is not a part
1073;       of the return string either.
1074;       If EOF is encountered before any character is read, the return
1075;       value is EOF.
1076;
1077;       The optional argument PORT defaults to the current input port.
1078
1079(define-opt (read-line (optional (port (current-input-port))) )
1080  (if (eof-object? (peek-char port)) (peek-char port)
1081    (let* ((line
1082             (next-token '() '(#\newline #\return *eof*)
1083                         "reading a line" port))
1084           (c (read-char port)))        ; must be either \n or \r or EOF
1085       (and (eq? c #\return) (eq? (peek-char port) #\newline)
1086         (read-char port))                      ; skip \n that follows \r
1087       line)))
1088
1089
1090; -- procedure+: read-string N [PORT]
1091;       Reads N characters from the PORT, and  returns them in a string.
1092;       If EOF is encountered before N characters are read, a shorter string
1093;       will be returned.
1094;       If N is not positive, an empty string will be returned.
1095;       The optional argument PORT defaults to the current input port.
1096
1097(define-opt (read-string n (optional (port (current-input-port))) )
1098  (if (not (positive? n)) ""
1099    (let ((buffer (make-string n)))
1100      (let loop ((i 0) (c (read-char port)))
1101        (if (eof-object? c) (substring buffer 0 i)
1102          (let ((i1 (++ i)))
1103            (string-set! buffer i c)
1104            (if (= i1 n) buffer
1105              (loop i1 (read-char port)))))))))
1106
1107
1108; -- Function: find-string-from-port? STR IN-PORT MAX-NO-CHARS
1109;    Looks for a string STR within the first MAX-NO-CHARS chars of the
1110;    input port IN-PORT
1111;    MAX-NO-CHARS may be omitted: in that case, the search span would be
1112;    limited only by the end of the input stream.
1113;    When the STR is found, the function returns the number of
1114;    characters it has read from the port, and the port is set
1115;    to read the first char after that (that is, after the STR)
1116;    The function returns #f when the string wasn't found
1117; Note the function reads the port *STRICTLY* sequentially, and does not
1118; perform any buffering. So the function can be used even if the port is open
1119; on a pipe or other communication channel.
1120;
1121; Probably can be classified as misc-io.
1122;
1123; Notes on the algorithm.
1124; A special care should be taken in a situation when one had achieved a partial
1125; match with (a head of) STR, and then some unexpected character appeared in
1126; the stream. It'll be rash to discard all already read characters. Consider
1127; an example of string "acab" and the stream "bacacab...", specifically when
1128;    a  c  a _b_
1129; b  a  c  a  c  a  b ...
1130; that is, when 'aca' had matched, but then 'c' showed up in the stream
1131; while we were looking for 'b'. In that case, discarding all already read
1132; characters and starting the matching process from scratch, that is,
1133; from 'c a b ...', would miss a certain match.
1134; Note, we don't actually need to keep already read characters, or at least
1135; strlen(str) characters in some kind of buffer. If there has been no match,
1136; we can safely discard read characters. If there was some partial match,
1137; we already know the characters before, they are in the STR itself, so
1138; we don't need a special buffer for that.
1139
1140;;; "MISCIO" Search for string from port.
1141; Written 1995 by Oleg Kiselyov (oleg@ponder.csci.unt.edu)
1142; Modified 1996 by A. Jaffer (jaffer@ai.mit.edu)
1143;
1144; This code is in the public domain.
1145
1146(define (MISCIO:find-string-from-port? str <input-port> . max-no-char)
1147  (set! max-no-char (if (null? max-no-char) #f (car max-no-char)))
1148  (letrec
1149      ((no-chars-read 0)
1150       (my-peek-char                    ; Return a peeked char or #f
1151        (lambda () (and (or (not max-no-char) (< no-chars-read max-no-char))
1152                        (let ((c (peek-char <input-port>)))
1153                          (if (eof-object? c) #f c)))))
1154       (next-char (lambda () (read-char <input-port>)
1155                          (set! no-chars-read  (+ 1 no-chars-read))))
1156       (match-1st-char                  ; of the string str
1157        (lambda ()
1158          (let ((c (my-peek-char)))
1159            (if (not c) #f
1160                (begin (next-char)
1161                       (if (char=? c (string-ref str 0))
1162                           (match-other-chars 1)
1163                           (match-1st-char)))))))
1164       ;; There has been a partial match, up to the point pos-to-match
1165       ;; (for example, str[0] has been found in the stream)
1166       ;; Now look to see if str[pos-to-match] for would be found, too
1167       (match-other-chars
1168        (lambda (pos-to-match)
1169          (if (>= pos-to-match (string-length str))
1170              no-chars-read             ; the entire string has matched
1171              (let ((c (my-peek-char)))
1172                (and c
1173                     (if (not (char=? c (string-ref str pos-to-match)))
1174                         (backtrack 1 pos-to-match)
1175                         (begin (next-char)
1176                                (match-other-chars (+ 1 pos-to-match)))))))))
1177
1178       ;; There had been a partial match, but then a wrong char showed up.
1179       ;; Before discarding previously read (and matched) characters, we check
1180       ;; to see if there was some smaller partial match. Note, characters read
1181       ;; so far (which matter) are those of str[0..matched-substr-len - 1]
1182       ;; In other words, we will check to see if there is such i>0 that
1183       ;; substr(str,0,j) = substr(str,i,matched-substr-len)
1184       ;; where j=matched-substr-len - i
1185       (backtrack
1186        (lambda (i matched-substr-len)
1187          (let ((j (- matched-substr-len i)))
1188            (if (<= j 0)
1189              (match-1st-char)  ; backed off completely to the begining of str
1190              (let loop ((k 0))
1191                (if (>= k j)
1192                   (match-other-chars j) ; there was indeed a shorter match
1193                   (if (char=? (string-ref str k)
1194                               (string-ref str (+ i k)))
1195                     (loop (+ 1 k))
1196                     (backtrack (+ 1 i) matched-substr-len))))))))
1197       )
1198    (match-1st-char)))
1199
1200(define find-string-from-port? MISCIO:find-string-from-port?)
1201
1202
1203;-----------------------------------------------------------------------------
1204;   This is a test driver for miscio:find-string-from-port?, to make sure it
1205;                       really works as intended
1206
1207; moved to vinput-parse.scm
1208;       Functional XML parsing framework: SAX/DOM and SXML parsers
1209;             with support for XML Namespaces and validation
1210;
1211; This is a package of low-to-high level lexing and parsing procedures
1212; that can be combined to yield a SAX, a DOM, a validating parsers, or
1213; a parser intended for a particular document type. The procedures in
1214; the package can be used separately to tokenize or parse various
1215; pieces of XML documents. The package supports XML Namespaces,
1216; internal and external parsed entities, user-controlled handling of
1217; whitespace, and validation. This module therefore is intended to be
1218; a framework, a set of "Lego blocks" you can use to build a parser
1219; following any discipline and performing validation to any degree. As
1220; an example of the parser construction, this file includes a
1221; semi-validating SXML parser.
1222
1223; The present XML framework has a "sequential" feel of SAX yet a
1224; "functional style" of DOM. Like a SAX parser, the framework scans
1225; the document only once and permits incremental processing. An
1226; application that handles document elements in order can run as
1227; efficiently as possible. _Unlike_ a SAX parser, the framework does
1228; not require an application register stateful callbacks and surrender
1229; control to the parser. Rather, it is the application that can drive
1230; the framework -- calling its functions to get the current lexical or
1231; syntax element. These functions do not maintain or mutate any state
1232; save the input port. Therefore, the framework permits parsing of XML
1233; in a pure functional style, with the input port being a monad (or a
1234; linear, read-once parameter).
1235
1236; Besides the PORT, there is another monad -- SEED. Most of the
1237; middle- and high-level parsers are single-threaded through the
1238; seed. The functions of this framework do not process or affect the
1239; SEED in any way: they simply pass it around as an instance of an
1240; opaque datatype.  User functions, on the other hand, can use the
1241; seed to maintain user's state, to accumulate parsing results, etc. A
1242; user can freely mix his own functions with those of the
1243; framework. On the other hand, the user may wish to instantiate a
1244; high-level parser: SSAX:make-elem-parser or SSAX:make-parser.  In
1245; the latter case, the user must provide functions of specific
1246; signatures, which are called at predictable moments during the
1247; parsing: to handle character data, element data, or processing
1248; instructions (PI). The functions are always given the SEED, among
1249; other parameters, and must return the new SEED.
1250
1251; From a functional point of view, XML parsing is a combined
1252; pre-post-order traversal of a "tree" that is the XML document
1253; itself. This down-and-up traversal tells the user about an element
1254; when its start tag is encountered. The user is notified about the
1255; element once more, after all element's children have been
1256; handled. The process of XML parsing therefore is a fold over the
1257; raw XML document. Unlike a fold over trees defined in [1], the
1258; parser is necessarily single-threaded -- obviously as elements
1259; in a text XML document are laid down sequentially. The parser
1260; therefore is a tree fold that has been transformed to accept an
1261; accumulating parameter [1,2].
1262
1263; Formally, the denotational semantics of the parser can be expressed
1264; as
1265; parser:: (Start-tag -> Seed -> Seed) ->
1266;          (Start-tag -> Seed -> Seed -> Seed) ->
1267;          (Char-Data -> Seed -> Seed) ->
1268;          XML-text-fragment -> Seed -> Seed
1269; parser fdown fup fchar "<elem attrs> content </elem>" seed
1270;  = fup "<elem attrs>" seed
1271;       (parser fdown fup fchar "content" (fdown "<elem attrs>" seed))
1272;
1273; parser fdown fup fchar "char-data content" seed
1274;  = parser fdown fup fchar "content" (fchar "char-data" seed)
1275;
1276; parser fdown fup fchar "elem-content content" seed
1277;  = parser fdown fup fchar "content" (
1278;       parser fdown fup fchar "elem-content" seed)
1279
1280; Compare the last two equations with the left fold
1281; fold-left kons elem:list seed = fold-left kons list (kons elem seed)
1282
1283; The real parser created my SSAX:make-parser is slightly more complicated,
1284; to account for processing instructions, entity references, namespaces,
1285; processing of document type declaration, etc.
1286
1287
1288; The XML standard document referred to in this module is
1289;       http://www.w3.org/TR/1998/REC-xml-19980210.html
1290;
1291; The present file also defines a procedure that parses the text of an
1292; XML document or of a separate element into SXML, an
1293; S-expression-based model of an XML Information Set. SXML is also an
1294; Abstract Syntax Tree of an XML document. SXML is similar
1295; but not identical to DOM; SXML is particularly suitable for
1296; Scheme-based XML/HTML authoring, SXPath queries, and tree
1297; transformations. See SXML.html for more details.
1298; SXML is a term implementation of evaluation of the XML document [3].
1299; The other implementation is context-passing.
1300
1301; The present frameworks fully supports the XML Namespaces Recommendation:
1302;       http://www.w3.org/TR/REC-xml-names/
1303; Other links:
1304; [1] Jeremy Gibbons, Geraint Jones, "The Under-appreciated Unfold,"
1305; Proc. ICFP'98, 1998, pp. 273-279.
1306; [2] Richard S. Bird, The promotion and accumulation strategies in
1307; transformational programming, ACM Trans. Progr. Lang. Systems,
1308; 6(4):487-504, October 1984.
1309; [3] Ralf Hinze, "Deriving Backtracking Monad Transformers,"
1310; Functional Pearl. Proc ICFP'00, pp. 186-197.
1311
1312; IMPORT
1313; parser-error SSAX:warn, see Handling of errors, below
1314; functions declared in files util.scm, input-parse.scm and look-for-str.scm
1315
1316; Handling of errors
1317; This package relies on a function parser-error, which must be defined
1318; by a user of the package. The function has the following signature:
1319;       parser-error PORT MESSAGE SPECIALISING-MSG*
1320; Many procedures of this package call 'parser-error' whenever a
1321; parsing, well-formedness or validation error is encountered. The
1322; first argument is a port, which typically points to the offending
1323; character or its neighborhood. Most of the Scheme systems let the
1324; user query a PORT for the current position. The MESSAGE argument
1325; indicates a failed XML production or a failed XML constraint. The
1326; latter is referred to by its anchor name in the XML Recommendation
1327; or XML Namespaces Recommendation. The parsing library (e.g.,
1328; next-token, assert-curr-char) invoke 'parser-error' as well, in
1329; exactly the same way.  See input-parse.scm for more details.
1330; See
1331;       http://pair.com/lisovsky/download/parse-error.scm
1332; for an excellent example of such a redefined parser-error function.
1333;
1334; In addition, the present code invokes a function SSAX:warn
1335;   SSAX:warn PORT MESSAGE SPECIALISING-MSG*
1336; to notify the user about warnings that are NOT errors but still
1337; may alert the user.
1338;
1339; Again, parser-error and SSAX:warn are supposed to be defined by the
1340; user. However, if a run-test macro below is set to include
1341; self-tests, this present code does provide the definitions for these
1342; functions to allow tests to run.
1343;
1344; $Id: ssax-core.scm,v 1.2 2004/05/31 00:05:04 flw Exp $
1345
1346
1347        ; To run this code under Gambit, just evaluate or compile it along
1348        ; with the IMPORT functions mentioned above.
1349
1350        ; To run this code under SCM, do
1351        ; scm myenv-scm.scm util.scm input-parse.scm look-for-str.scm SSAX.scm
1352
1353        ; To run this code under MIT Scheme, do
1354        ; scheme -load myenv-mit.scm input-parse.scm util.scm \
1355        ; look-for-str.scm SSAX.scm
1356
1357        ; Current versions of SSAX ports to other Scheme systems
1358        ; (including Bigloo, Guile, Chicken, and PLT Scheme) are available from
1359        ;       http://pair.com/lisovsky/download/ssax/
1360        ; I'm deeply grateful to Kirill Lisovsky for developing and
1361        ; maintaining these versions. His comments and suggestions are
1362        ; appreciated indeed.
1363
1364        ; See http://pobox.com/~oleg/ftp/Scheme/
1365        ; for myenv.scm and other input parsing functions used
1366        ; in the present code.
1367; Move inside the run-test, just as catch-error.scm below???
1368; (include "myenv.scm")
1369
1370
1371; The following macro runs built-in test cases -- or does not run,
1372; depending on which of the two lines below you commented out
1373(define-macro run-test (lambda body '(begin #f)))
1374
1375; The following macro could've been defined just as
1376; (define-macro run-test (lambda body `(begin (display "\n-->Test\n") ,@body)))
1377;
1378; Instead, it's more involved, to make up for case-insensitivity of
1379; symbols on some Scheme systems. In Gambit, symbols are case
1380; sensitive: (eq? 'A 'a) is #f and (eq? 'Aa (string->symbol "Aa")) is
1381; #t.  On some systems, symbols are case-insensitive and just the
1382; opposite is true.  Therefore, we introduce a notation '"ASymbol" (a
1383; quoted string) that stands for a case-_sensitive_ ASymbol -- on any
1384; R5RS Scheme system. This notation is valid only within the body of
1385; run-test.
1386; The notation is implemented by scanning the run-test's
1387; body and replacing every occurrence of (quote "str") with the result
1388; of (string->symbol "str"). We can do such a replacement at macro-expand
1389; time (rather than at run time).
1390
1391 ;(define-macro run-test
1392 ;  (lambda body
1393 ;    (define (re-write body)
1394 ;      (cond
1395 ;       ((vector? body)
1396 ;      (list->vector (re-write (vector->list body))))
1397 ;       ((not (pair? body)) body)
1398 ;       ((and (eq? 'quote (car body)) (pair? (cdr body))
1399 ;           (string? (cadr body)))
1400 ;      (string->symbol (cadr body)))
1401 ;       (else (cons (re-write (car body)) (re-write (cdr body))))))
1402 ;    (cons 'begin (re-write body))))
1403
1404
1405 ;(run-test
1406 ; (include "catch-error.scm")
1407 ;)
1408;========================================================================
1409;                               Data Types
1410
1411; TAG-KIND
1412;       a symbol 'START, 'END, 'PI, 'DECL, 'COMMENT, 'CDSECT
1413;               or 'ENTITY-REF that identifies a markup token
1414
1415; UNRES-NAME
1416;       a name (called GI in the XML Recommendation) as given in an xml
1417;       document for a markup token: start-tag, PI target, attribute name.
1418;       If a GI is an NCName, UNRES-NAME is this NCName converted into
1419;       a Scheme symbol. If a GI is a QName, UNRES-NAME is a pair of
1420;       symbols: (PREFIX . LOCALPART)
1421
1422; RES-NAME
1423;       An expanded name, a resolved version of an UNRES-NAME.
1424;       For an element or an attribute name with a non-empty namespace URI,
1425;       RES-NAME is a pair of symbols, (URI-SYMB . LOCALPART).
1426;       Otherwise, it's a single symbol.
1427
1428; ELEM-CONTENT-MODEL
1429;       A symbol:
1430;       ANY       - anything goes, expect an END tag.
1431;       EMPTY-TAG - no content, and no END-tag is coming
1432;       EMPTY     - no content, expect the END-tag as the next token
1433;       PCDATA    - expect character data only, and no children elements
1434;       MIXED
1435;       ELEM-CONTENT
1436
1437; URI-SYMB
1438;       A symbol representing a namespace URI -- or other symbol chosen
1439;       by the user to represent URI. In the former case,
1440;       URI-SYMB is created by %-quoting of bad URI characters and
1441;       converting the resulting string into a symbol.
1442
1443; NAMESPACES
1444;       A list representing namespaces in effect. An element of the list
1445;       has one of the following forms:
1446;       (PREFIX URI-SYMB . URI-SYMB) or
1447;       (PREFIX USER-PREFIX . URI-SYMB)
1448;               USER-PREFIX is a symbol chosen by the user
1449;               to represent the URI.
1450;       (#f USER-PREFIX . URI-SYMB)
1451;               Specification of the user-chosen prefix and a URI-SYMBOL.
1452;       (*DEFAULT* USER-PREFIX . URI-SYMB)
1453;               Declaration of the default namespace
1454;       (*DEFAULT* #f . #f)
1455;               Un-declaration of the default namespace. This notation
1456;               represents overriding of the previous declaration
1457;       A NAMESPACES list may contain several elements for the same PREFIX.
1458;       The one closest to the beginning of the list takes effect.
1459
1460; ATTLIST
1461;       An ordered collection of (NAME . VALUE) pairs, where NAME is
1462;       a RES-NAME or an UNRES-NAME. The collection is an ADT
1463
1464; STR-HANDLER
1465;       A procedure of three arguments: STRING1 STRING2 SEED
1466;       returning a new SEED
1467;       The procedure is supposed to handle a chunk of character data
1468;       STRING1 followed by a chunk of character data STRING2.
1469;       STRING2 is a short string, often "\n" and even ""
1470
1471; ENTITIES
1472;       An assoc list of pairs:
1473;          (named-entity-name . named-entity-body)
1474;       where named-entity-name is a symbol under which the entity was
1475;       declared, named-entity-body is either a string, or
1476;       (for an external entity) a thunk that will return an
1477;       input port (from which the entity can be read).
1478;       named-entity-body may also be #f. This is an indication that a
1479;       named-entity-name is currently being expanded. A reference to
1480;       this named-entity-name will be an error: violation of the
1481;       WFC nonrecursion.
1482
1483; XML-TOKEN -- a record
1484
1485; In Gambit, you can use the following declaration:
1486; (define-structure xml-token kind head)
1487; The following declaration is "standard" as it follows SRFI-9:
1488;;(define-record-type  xml-token  (make-xml-token kind head)  xml-token?
1489;;  (kind  xml-token-kind)
1490;;  (head  xml-token-head) )
1491; No field mutators are declared as SSAX is a pure functional parser
1492;
1493; But to make the code more portable, we define xml-token simply as
1494; a pair. It suffices for us. Furthermore, xml-token-kind and xml-token-head
1495; can be defined as simple procedures. However, they are declared as
1496; macros below for efficiency.
1497
1498#|
1499(define (make-xml-token kind head) (cons kind head))
1500(define xml-token? pair?)
1501(define-macro xml-token-kind (lambda (token) `(car ,token)))
1502(define-macro xml-token-head (lambda (token) `(cdr ,token)))
1503|#
1504
1505(define-record xml-token kind head)
1506
1507
1508; This record represents a markup, which is, according to the XML
1509; Recommendation, "takes the form of start-tags, end-tags, empty-element tags,
1510; entity references, character references, comments, CDATA section delimiters,
1511; document type declarations, and processing instructions."
1512;
1513;       kind -- a TAG-KIND
1514;       head -- an UNRES-NAME. For xml-tokens of kinds 'COMMENT and
1515;               'CDSECT, the head is #f
1516;
1517; For example,
1518;       <P>  => kind='START, head='P
1519;       </P> => kind='END, head='P
1520;       <BR/> => kind='EMPTY-EL, head='BR
1521;       <!DOCTYPE OMF ...> => kind='DECL, head='DOCTYPE
1522;       <?xml version="1.0"?> => kind='PI, head='xml
1523;       &my-ent; => kind = 'ENTITY-REF, head='my-ent
1524;
1525; Character references are not represented by xml-tokens as these references
1526; are transparently resolved into the corresponding characters.
1527;
1528
1529
1530
1531; XML-DECL -- a record
1532
1533; The following is Gambit-specific, see below for a portable declaration
1534;(define-structure xml-decl elems entities notations)
1535
1536; The record represents a datatype of an XML document: the list of
1537; declared elements and their attributes, declared notations, list of
1538; replacement strings or loading procedures for parsed general
1539; entities, etc. Normally an xml-decl record is created from a DTD or
1540; an XML Schema, although it can be created and filled in in many other
1541; ways (e.g., loaded from a file).
1542;
1543; elems: an (assoc) list of decl-elem or #f. The latter instructs
1544;       the parser to do no validation of elements and attributes.
1545;
1546; decl-elem: declaration of one element:
1547;       (elem-name elem-content decl-attrs)
1548;       elem-name is an UNRES-NAME for the element.
1549;       elem-content is an ELEM-CONTENT-MODEL.
1550;       decl-attrs is an ATTLIST, of (ATTR-NAME . VALUE) associations
1551; !!!This element can declare a user procedure to handle parsing of an
1552; element (e.g., to do a custom validation, or to build a hash of
1553; IDs as they're encountered).
1554;
1555; decl-attr: an element of an ATTLIST, declaration of one attribute
1556;       (attr-name content-type use-type default-value)
1557;       attr-name is an UNRES-NAME for the declared attribute
1558;       content-type is a symbol: CDATA, NMTOKEN, NMTOKENS, ...
1559;               or a list of strings for the enumerated type.
1560;       use-type is a symbol: REQUIRED, IMPLIED, FIXED
1561;       default-value is a string for the default value, or #f if not given.
1562;
1563;
1564
1565; see a function make-empty-xml-decl to make a XML declaration entry
1566; suitable for a non-validating parsing.
1567
1568
1569;-------------------------
1570; Utilities
1571
1572; A rather useful utility from SRFI-1
1573; cons* elt1 elt2 ... -> object
1574;    Like LIST, but the last argument provides the tail of the constructed
1575;    list -- i.e., (cons* a1 a2 ... an) = (cons a1 (cons a2 (cons ... an))).
1576;
1577;   (cons* 1 2 3 4) => (1 2 3 . 4)
1578;   (cons* 1) => 1
1579#;(define (cons* first . rest)
1580  (let recur ((x first) (rest rest))
1581    (if (pair? rest)
1582        (cons x (recur (car rest) (cdr rest)))
1583        x)))
1584
1585;   SSAX:warn PORT MESSAGE SPECIALISING-MSG*
1586; to notify the user about warnings that are NOT errors but still
1587; may alert the user.
1588; Result is unspecified.
1589; We need to define the function to allow the self-tests to run.
1590; Normally the definition of SSAX:warn is to be provided by the user.
1591;(run-test
1592(define (SSAX:warn port msg . other-msg)
1593  (apply cerr (cons* "\nWarning: " msg other-msg))
1594  (newline (current-error-port)) )
1595;)
1596
1597
1598;   parser-error PORT MESSAGE SPECIALISING-MSG*
1599; to let the user know of a syntax error or a violation of a
1600; well-formedness or validation constraint.
1601; Result is unspecified.
1602; We need to define the function to allow the self-tests to run.
1603; Normally the definition of parser-error is to be provided by the user.
1604(run-test
1605 (define (parser-error port msg . specializing-msgs)
1606   (apply error (cons msg specializing-msgs)))
1607)
1608
1609; The following is a function that is often used in validation tests,
1610; to make sure that the computed result matches the expected one.
1611; This function is a standard equal? predicate with one exception.
1612; On Scheme systems where (string->symbol "A") and a symbol A
1613; are the same, equal_? is precisely equal?
1614; On other Scheme systems, we compare symbols disregarding their case.
1615; Since this function is used only in tests, we don't have to
1616; strive to make it efficient.
1617(run-test
1618 (define (equal_? e1 e2)
1619   (if (eq? 'A (string->symbol "A")) (equal? e1 e2)
1620       (cond
1621        ((symbol? e1)
1622         (and (symbol? e2) 
1623              (string-ci=? (symbol->string e1) (symbol->string e2))))
1624        ((pair? e1)
1625         (and (pair? e2)
1626              (equal_? (car e1) (car e2)) (equal_? (cdr e1) (cdr e2))))
1627        ((vector? e1)
1628         (and (vector? e2) (equal_? (vector->list e1) (vector->list e2))))
1629        (else
1630         (equal? e1 e2)))))
1631)
1632             
1633
1634; Test if a string is made of only whitespace
1635; An empty string is considered made of whitespace as well
1636(define (string-whitespace? str)
1637  (let ((len (string-length str)))
1638    (cond
1639     ((zero? len) #t)
1640     ((= 1 len) (char-whitespace? (string-ref str 0)))
1641     ((= 2 len) (and (char-whitespace? (string-ref str 0))
1642                     (char-whitespace? (string-ref str 1))))
1643     (else
1644      (let loop ((i 0))
1645        (or (>= i len)
1646            (and (char-whitespace? (string-ref str i))
1647                 (loop (++ i)))))))))
1648
1649; Find val in alist
1650; Return (values found-el remaining-alist) or
1651;        (values #f alist)
1652
1653(define (assq-values val alist)
1654  (let loop ((alist alist) (scanned '()))
1655    (cond
1656     ((null? alist) (values #f scanned))
1657     ((equal? val (caar alist))
1658      (values (car alist) (append scanned (cdr alist))))
1659     (else
1660      (loop (cdr alist) (cons (car alist) scanned))))))
1661
1662; From SRFI-1
1663(define (fold-right kons knil lis1)
1664    (let recur ((lis lis1))
1665       (if (null? lis) knil
1666            (let ((head (car lis)))
1667              (kons head (recur (cdr lis)))))))
1668
1669; Left fold combinator for a single list
1670(define (fold kons knil lis1)
1671  (let lp ((lis lis1) (ans knil))
1672    (if (null? lis) ans
1673      (lp (cdr lis) (kons (car lis) ans)))))
1674
1675
1676;========================================================================
1677;               Lower-level parsers and scanners
1678;
1679; They deal with primitive lexical units (Names, whitespaces, tags)
1680; and with pieces of more generic productions. Most of these parsers
1681; must be called in appropriate context. For example, SSAX:complete-start-tag
1682; must be called only when the start-tag has been detected and its GI
1683; has been read.
1684
1685;------------------------------------------------------------------------
1686;                       Low-level parsing code
1687
1688; Skip the S (whitespace) production as defined by
1689; [3] S ::= (#x20 | #x9 | #xD | #xA)
1690; The procedure returns the first not-whitespace character it
1691; encounters while scanning the PORT. This character is left
1692; on the input stream.
1693
1694(define SSAX:S-chars '(#\space #\tab #\return #\newline))
1695
1696(define (SSAX:skip-S port)
1697  (skip-while SSAX:S-chars port))
1698
1699
1700; Read a Name lexem and return it as string
1701; [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':'
1702;                  | CombiningChar | Extender
1703; [5] Name ::= (Letter | '_' | ':') (NameChar)*
1704;
1705; This code supports the XML Namespace Recommendation REC-xml-names,
1706; which modifies the above productions as follows:
1707;
1708; [4] NCNameChar ::= Letter | Digit | '.' | '-' | '_'
1709;                       | CombiningChar | Extender
1710; [5] NCName ::= (Letter | '_') (NCNameChar)*
1711; As the Rec-xml-names says,
1712; "An XML document conforms to this specification if all other tokens
1713; [other than element types and attribute names] in the document which
1714; are required, for XML conformance, to match the XML production for
1715; Name, match this specification's production for NCName."
1716; Element types and attribute names must match the production QName,
1717; defined below.
1718
1719; Check to see if a-char may start a NCName
1720(define (SSAX:ncname-starting-char? a-char)
1721  (and (char? a-char)
1722    (or
1723      (char-alphabetic? a-char)
1724      (char=? #\_ a-char))))
1725
1726
1727; Read a NCName starting from the current position in the PORT and
1728; return it as a symbol.
1729(define (SSAX:read-NCName port)
1730  (let ((first-char (peek-char port)))
1731    (or (SSAX:ncname-starting-char? first-char)
1732      (parser-error port "XMLNS [4] for '" first-char "'")))
1733  (string->symbol
1734    (next-token-of
1735      (lambda (c)
1736        (cond
1737          ((eof-object? c) #f)
1738          ((char-alphabetic? c) c)
1739          ((string-index "0123456789.-_" c) c)
1740          (else #f)))
1741      port)))
1742
1743; Read a (namespace-) Qualified Name, QName, from the current
1744; position in the PORT.
1745; From REC-xml-names:
1746;       [6] QName ::= (Prefix ':')? LocalPart
1747;       [7] Prefix ::= NCName
1748;       [8] LocalPart ::= NCName
1749; Return: an UNRES-NAME
1750(define (SSAX:read-QName port)
1751  (let ((prefix-or-localpart (SSAX:read-NCName port)))
1752    (case (peek-char port)
1753      ((#\:)                    ; prefix was given after all
1754       (read-char port)         ; consume the colon
1755       (cons prefix-or-localpart (SSAX:read-NCName port)))
1756      (else prefix-or-localpart) ; Prefix was omitted
1757      )))
1758
1759; The prefix of the pre-defined XML namespace
1760(define SSAX:Prefix-XML (string->symbol "xml"))
1761
1762(run-test
1763 (assert (eq? '_
1764                 (call-with-input-string "_" SSAX:read-NCName)))
1765 (assert (eq? '_
1766                 (call-with-input-string "_" SSAX:read-QName)))
1767 (assert (eq? (string->symbol "_abc_")
1768              (call-with-input-string "_abc_;" SSAX:read-NCName)))
1769 (assert (eq? (string->symbol "_abc_")
1770              (call-with-input-string "_abc_;" SSAX:read-QName)))
1771 (assert (eq? (string->symbol "_a.b")
1772              (call-with-input-string "_a.b " SSAX:read-QName)))
1773 (assert (equal? (cons (string->symbol "_a.b") (string->symbol "d.1-ef-"))
1774              (call-with-input-string "_a.b:d.1-ef-;" SSAX:read-QName)))
1775 (assert (equal? (cons (string->symbol "a") (string->symbol "b"))
1776              (call-with-input-string "a:b:c" SSAX:read-QName)))
1777
1778 (assert (failed? (call-with-input-string ":abc" SSAX:read-NCName)))
1779 (assert (failed? (call-with-input-string "1:bc" SSAX:read-NCName)))
1780)
1781
1782; Compare one RES-NAME or an UNRES-NAME with the other.
1783; Return a symbol '<, '>, or '= depending on the result of
1784; the comparison.
1785; Names without PREFIX are always smaller than those with the PREFIX.
1786(define name-compare
1787  (letrec ((symbol-compare
1788            (lambda (symb1 symb2)
1789              (cond
1790               ((eq? symb1 symb2) '=)
1791               ((string<? (symbol->string symb1) (symbol->string symb2))
1792                '<)
1793               (else '>)))))
1794    (lambda (name1 name2)
1795      (cond
1796       ((symbol? name1) (if (symbol? name2) (symbol-compare name1 name2)
1797                            '<))
1798       ((symbol? name2) '>)
1799       ((eq? name2 SSAX:largest-unres-name) '<)
1800       ((eq? name1 SSAX:largest-unres-name) '>)
1801       ((eq? (car name1) (car name2))   ; prefixes the same
1802        (symbol-compare (cdr name1) (cdr name2)))
1803       (else (symbol-compare (car name1) (car name2)))))))
1804
1805; An UNRES-NAME that is postulated to be larger than anything that can occur in
1806; a well-formed XML document.
1807; name-compare enforces this postulate.
1808(define SSAX:largest-unres-name (cons (gensym) (gensym)))
1809
1810(run-test
1811 (assert (eq? '= (name-compare 'ABC 'ABC)))
1812 (assert (eq? '< (name-compare 'ABC 'ABCD)))
1813 (assert (eq? '> (name-compare 'XB 'ABCD)))
1814 (assert (eq? '> (name-compare '(HTML . PRE) 'PRE)))
1815 (assert (eq? '< (name-compare 'HTML '(HTML . PRE))))
1816 (assert (eq? '= (name-compare '(HTML . PRE) '(HTML . PRE))))
1817 (assert (eq? '< (name-compare '(HTML . PRE) '(XML . PRE))))
1818 (assert (eq? '> (name-compare '(HTML . PRE) '(HTML . P))))
1819 (assert (eq? '< (name-compare '(HTML . PRE) SSAX:largest-unres-name)))
1820 (assert (eq? '< (name-compare '(ZZZZ . ZZZ) SSAX:largest-unres-name)))
1821 (assert (eq? '> (name-compare SSAX:largest-unres-name '(ZZZZ . ZZZ) )))
1822)
1823
1824
1825
1826; procedure:    SSAX:read-markup-token PORT
1827; This procedure starts parsing of a markup token. The current position
1828; in the stream must be #\<. This procedure scans enough of the input stream
1829; to figure out what kind of a markup token it is seeing. The procedure returns
1830; an xml-token structure describing the token. Note, generally reading
1831; of the current markup is not finished! In particular, no attributes of
1832; the start-tag token are scanned.
1833;
1834; Here's a detailed break out of the return values and the position in the PORT
1835; when that particular value is returned:
1836;       PI-token:       only PI-target is read.
1837;                       To finish the Processing Instruction and disregard it,
1838;                       call SSAX:skip-pi. SSAX:read-attributes may be useful
1839;                       as well (for PIs whose content is attribute-value
1840;                       pairs)
1841;       END-token:      The end tag is read completely; the current position
1842;                       is right after the terminating #\> character.   
1843;       COMMENT         is read and skipped completely. The current position
1844;                       is right after "-->" that terminates the comment.
1845;       CDSECT          The current position is right after "<!CDATA["
1846;                       Use SSAX:read-CDATA-body to read the rest.
1847;       DECL            We have read the keyword (the one that follows "<!")
1848;                       identifying this declaration markup. The current
1849;                       position is after the keyword (usually a
1850;                       whitespace character)
1851;
1852;       START-token     We have read the keyword (GI) of this start tag.
1853;                       No attributes are scanned yet. We don't know if this
1854;                       tag has an empty content either.
1855;                       Use SSAX:complete-start-tag to finish parsing of
1856;                       the token.
1857
1858(define SSAX:read-markup-token ; procedure SSAX:read-markup-token port
1859 (let ()
1860                ; we have read "<!-". Skip through the rest of the comment
1861                ; Return the 'COMMENT token as an indication we saw a comment
1862                ; and skipped it.
1863  (define (skip-comment port)
1864    (assert-curr-char '(#\-) "XML [15], second dash" port)
1865    (if (not (find-string-from-port? "-->" port))
1866      (parser-error port "XML [15], no -->"))
1867    (make-xml-token 'COMMENT #f))
1868
1869                ; we have read "<![" that must begin a CDATA section
1870  (define (read-CDATA port)
1871    (assert (string=? "CDATA[" (read-string 6 port)))
1872    (make-xml-token 'CDSECT #f))
1873
1874  (lambda (port)
1875    (assert-curr-char '(#\<) "start of the token" port)
1876    (case (peek-char port)
1877      ((#\/) (read-char port)
1878       (begin0 (make-xml-token 'END (SSAX:read-QName port))
1879               (SSAX:skip-S port)
1880               (assert-curr-char '(#\>) "XML [42]" port)))
1881      ((#\?) (read-char port) (make-xml-token 'PI (SSAX:read-NCName port)))
1882      ((#\!)
1883       (case (peek-next-char port)
1884         ((#\-) (read-char port) (skip-comment port))
1885         ((#\[) (read-char port) (read-CDATA port))
1886         (else (make-xml-token 'DECL (SSAX:read-NCName port)))))
1887      (else (make-xml-token 'START (SSAX:read-QName port)))))
1888))
1889
1890
1891; The current position is inside a PI. Skip till the rest of the PI
1892(define (SSAX:skip-pi port)     
1893  (if (not (find-string-from-port? "?>" port))
1894    (parser-error port "Failed to find ?> terminating the PI")))
1895
1896
1897; The current position is right after reading the PITarget. We read the
1898; body of PI and return is as a string. The port will point to the
1899; character right sfter '?>' combination that terminates PI.
1900; [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
1901
1902(define (SSAX:read-pi-body-as-string port)
1903  (SSAX:skip-S port)            ; skip WS after the PI target name
1904  (string-concatenate
1905    (let loop ()
1906      (let ((pi-fragment
1907             (next-token '() '(#\?) "reading PI content" port)))
1908        (if (eq? #\> (peek-next-char port))
1909            (begin
1910              (read-char port)
1911              (cons pi-fragment '()))
1912            (cons* pi-fragment "?" (loop)))))))
1913
1914(run-test
1915 (assert (equal? "p1 content "
1916    (call-with-input-string "<?pi1  p1 content ?>"
1917      (lambda (port)
1918        (SSAX:read-markup-token port)
1919        (SSAX:read-pi-body-as-string port)))))
1920 (assert (equal? "pi2? content? ?"
1921    (call-with-input-string "<?pi2 pi2? content? ??>"
1922      (lambda (port)
1923        (SSAX:read-markup-token port)
1924        (SSAX:read-pi-body-as-string port)))))
1925)
1926
1927;(define (SSAX:read-pi-body-as-name-values port)
1928
1929; The current pos in the port is inside an internal DTD subset
1930; (e.g., after reading #\[ that begins an internal DTD subset)
1931; Skip until the "]>" combination that terminates this DTD
1932(define (SSAX:skip-internal-dtd port)     
1933  (if (not (find-string-from-port? "]>" port))
1934    (parser-error port
1935                  "Failed to find ]> terminating the internal DTD subset")))
1936
1937
1938; procedure+:   SSAX:read-CDATA-body PORT STR-HANDLER SEED
1939;
1940; This procedure must be called after we have read a string "<![CDATA["
1941; that begins a CDATA section. The current position must be the first
1942; position of the CDATA body. This function reads _lines_ of the CDATA
1943; body and passes them to a STR-HANDLER, a character data consumer.
1944;
1945; The str-handler is a STR-HANDLER, a procedure STRING1 STRING2 SEED.
1946; The first STRING1 argument to STR-HANDLER never contains a newline.
1947; The second STRING2 argument often will. On the first invocation of
1948; the STR-HANDLER, the seed is the one passed to SSAX:read-CDATA-body
1949; as the third argument. The result of this first invocation will be
1950; passed as the seed argument to the second invocation of the line
1951; consumer, and so on. The result of the last invocation of the
1952; STR-HANDLER is returned by the SSAX:read-CDATA-body.  Note a
1953; similarity to the fundamental 'fold' iterator.
1954;
1955; Within a CDATA section all characters are taken at their face value,
1956; with only three exceptions:
1957;       CR, LF, and CRLF are treated as line delimiters, and passed
1958;       as a single #\newline to the STR-HANDLER
1959;       "]]>" combination is the end of the CDATA section.
1960;       &gt; is treated as an embedded #\> character
1961; Note, &lt; and &amp; are not specially recognized (and are not expanded)!
1962
1963(define SSAX:read-CDATA-body 
1964  (let ((nl-str (string #\newline)))
1965
1966    (lambda (port str-handler seed)
1967      (let loop ((seed seed))
1968        (let ((fragment (next-token '() '(#\return #\newline #\] #\&)
1969                                    "reading CDATA" port)))
1970                        ; that is, we're reading the char after the 'fragment'
1971     (case (read-char port)     
1972       ((#\newline) (loop (str-handler fragment nl-str seed)))
1973       ((#\return)              ; if the next char is #\newline, skip it
1974         (if (eqv? (peek-char port) #\newline) (read-char port))
1975         (loop (str-handler fragment nl-str seed)))
1976       ((#\])
1977        (if (not (eqv? (peek-char port) #\]))
1978            (loop (str-handler fragment "]" seed))
1979            (let check-after-second-braket
1980                ((seed (if (string-null? fragment) seed
1981                           (str-handler fragment "" seed))))
1982              (case (peek-next-char port)       ; after the second bracket
1983                ((#\>) (read-char port) seed)   ; we have read "]]>"
1984                ((#\]) (check-after-second-braket
1985                        (str-handler "]" "" seed)))
1986                (else (loop (str-handler "]]" "" seed)))))))
1987       ((#\&)           ; Note that #\& within CDATA may stand for itself
1988        (let ((ent-ref  ; it does not have to start an entity ref
1989               (next-token-of (lambda (c) 
1990                 (and (not (eof-object? c)) (char-alphabetic? c) c)) port)))
1991          (cond         ; "&gt;" is to be replaced with #\>
1992           ((and (string=? "gt" ent-ref) (eq? (peek-char port) #\;))
1993            (read-char port)
1994            (loop (str-handler fragment ">" seed)))
1995           (else
1996            (loop 
1997             (str-handler ent-ref ""
1998                          (str-handler fragment "&" seed)))))))
1999       (else
2000         (parser-error port "can't happen"))))))))
2001
2002; a few lines of validation code
2003(run-test (letrec
2004  ((consumer (lambda (fragment foll-fragment seed)
2005     (cons* (if (equal? foll-fragment (string #\newline))
2006                " NL\n" foll-fragment) fragment seed)))
2007   (test (lambda (str expected-result)
2008          (display "\nbody: ") (write str) (display "\nResult: ")
2009          (let ((result
2010                 (reverse
2011                  (call-with-input-string str
2012                    (lambda (port) (SSAX:read-CDATA-body port consumer '()))
2013                    ))))
2014            (write result)
2015            (assert (equal? result expected-result)))))
2016   )
2017  (test "]]>" '())
2018  (test "abcd]]>" '("abcd" ""))
2019  (test "abcd]]]>" '("abcd" "" "]" ""))
2020  (test "abcd]]]]>" '("abcd" "" "]" "" "]" ""))
2021  (test "abcd]]]]]>" '("abcd" "" "]" "" "]" "" "]" ""))
2022  (test "abcd]]]a]]>" '("abcd" "" "]" "" "]]" "" "a" ""))
2023  (test "abc\r\ndef\n]]>" '("abc" " NL\n" "def" " NL\n"))
2024  (test "\r\n\r\n]]>" '("" " NL\n" "" " NL\n"))
2025  (test "\r\n\r\na]]>" '("" " NL\n" "" " NL\n" "a" ""))
2026  (test "abc&!!!]]>" '("abc" "&" "" "" "!!!" ""))
2027  (test "abc]]&gt;&gt&amp;]]]&gt;and]]>"
2028    '("abc" "" "]]" "" "" ">" "" "&" "gt" "" "" "&" "amp" "" ";" "" "]" ""
2029      "]]" "" "" ">" "and" ""))
2030))
2031
2032           
2033; procedure+:   SSAX:read-char-ref PORT
2034;
2035; [66]  CharRef ::=  '&#' [0-9]+ ';'
2036;                  | '&#x' [0-9a-fA-F]+ ';'
2037;
2038; This procedure must be called after we we have read "&#"
2039; that introduces a char reference.
2040; The procedure reads this reference and returns the corresponding char
2041; The current position in PORT will be after ";" that terminates
2042; the char reference
2043; Faults detected:
2044;       WFC: XML-Spec.html#wf-Legalchar
2045
2046(define (SSAX:read-char-ref port)
2047  (let* ((base
2048           (cond ((eq? (peek-char port) #\x) (read-char port) 16)
2049                 (else 10)))
2050         (name (next-token '() '(#\;) "XML [66]" port))
2051         (char-code (string->number name base)))
2052    (read-char port)    ; read the terminating #\; char
2053    (if (integer? char-code) (integer->char char-code)
2054      (parser-error port "[wf-Legalchar] broken for '" name "'"))))
2055
2056
2057; procedure+:   SSAX:handle-parsed-entity PORT NAME ENTITIES
2058;               CONTENT-HANDLER STR-HANDLER SEED
2059;
2060; Expand and handle a parsed-entity reference
2061; port - a PORT
2062; name - the name of the parsed entity to expand, a symbol
2063; entities - see ENTITIES
2064; content-handler -- procedure PORT ENTITIES SEED
2065;       that is supposed to return a SEED
2066; str-handler - a STR-HANDLER. It is called if the entity in question
2067; turns out to be a pre-declared entity
2068;
2069; The result is the one returned by CONTENT-HANDLER or STR-HANDLER
2070; Faults detected:
2071;       WFC: XML-Spec.html#wf-entdeclared
2072;       WFC: XML-Spec.html#norecursion
2073
2074(define SSAX:predefined-parsed-entities
2075  `(
2076    (,(string->symbol "amp") . "&")
2077    (,(string->symbol "lt") . "<")
2078    (,(string->symbol "gt") . ">")
2079    (,(string->symbol "apos") . "'")
2080    (,(string->symbol "quot") . "\"")))
2081
2082(define (SSAX:handle-parsed-entity port name entities
2083                                   content-handler str-handler seed)
2084  (cond   ; First we check the list of the declared entities
2085   ((assq name entities) =>
2086    (lambda (decl-entity)
2087      (let ((ent-body (cdr decl-entity)) ; mark the list to prevent recursion
2088            (new-entities (cons (cons name #f) entities)))
2089        (cond
2090         ((string? ent-body)
2091          (call-with-input-string ent-body
2092             (lambda (port) (content-handler port new-entities seed))))
2093         ((procedure? ent-body)
2094          (let ((port (ent-body)))
2095            (begin0
2096             (content-handler port new-entities seed)
2097             (close-input-port port))))
2098         (else
2099          (parser-error port "[norecursion] broken for " name))))))
2100    ((assq name SSAX:predefined-parsed-entities)
2101     => (lambda (decl-entity)
2102          (str-handler (cdr decl-entity) "" seed)))
2103    (else (parser-error port "[wf-entdeclared] broken for " name))))
2104
2105
2106
2107; The ATTLIST Abstract Data Type
2108; Currently is implemented as an assoc list sorted in the ascending
2109; order of NAMES.
2110
2111(define (make-empty-attlist) '())
2112
2113; Add a name-value pair to the existing attlist preserving the order
2114; Return the new list, in the sorted ascending order.
2115; Return #f if a pair with the same name already exists in the attlist
2116
2117(define (attlist-add attlist name-value)
2118  (if (null? attlist) (cons name-value attlist)
2119      (case (name-compare (car name-value) (caar attlist))
2120        ((=) #f)
2121        ((<) (cons name-value attlist))
2122        (else (cons (car attlist) (attlist-add (cdr attlist) name-value)))
2123        )))
2124
2125(define attlist-null? null?)
2126
2127; Given an non-null attlist, return a pair of values: the top and the rest
2128(define (attlist-remove-top attlist)
2129  (values (car attlist) (cdr attlist)))
2130
2131(define (attlist->alist attlist) attlist)
2132(define attlist-fold fold)
2133
2134; procedure+:   SSAX:read-attributes PORT ENTITIES
2135;
2136; This procedure reads and parses a production Attribute*
2137; [41] Attribute ::= Name Eq AttValue
2138; [10] AttValue ::=  '"' ([^<&"] | Reference)* '"'
2139;                 | "'" ([^<&'] | Reference)* "'"
2140; [25] Eq ::= S? '=' S?
2141;
2142;
2143; The procedure returns an ATTLIST, of Name (as UNRES-NAME), Value (as string)
2144; pairs. The current character on the PORT is a non-whitespace character
2145; that is not an ncname-starting character.
2146;
2147; Note the following rules to keep in mind when reading an 'AttValue'
2148; "Before the value of an attribute is passed to the application
2149; or checked for validity, the XML processor must normalize it as follows:
2150; - a character reference is processed by appending the referenced
2151;   character to the attribute value
2152; - an entity reference is processed by recursively processing the
2153;   replacement text of the entity [see ENTITIES]
2154;   [named entities amp lt gt quot apos are assumed pre-declared]
2155; - a whitespace character (#x20, #xD, #xA, #x9) is processed by appending #x20
2156;   to the normalized value, except that only a single #x20 is appended for a
2157;   "#xD#xA" sequence that is part of an external parsed entity or the
2158;   literal entity value of an internal parsed entity
2159; - other characters are processed by appending them to the normalized value "
2160;
2161;
2162; Faults detected:
2163;       WFC: XML-Spec.html#CleanAttrVals
2164;       WFC: XML-Spec.html#uniqattspec
2165
2166(define SSAX:read-attributes  ; SSAX:read-attributes port entities
2167 (let ()
2168                ; Read the AttValue from the PORT up to the delimiter
2169                ; (which can be a single or double-quote character,
2170                ; or even a symbol *eof*)
2171                ; 'prev-fragments' is the list of string fragments, accumulated
2172                ; so far, in reverse order.
2173                ; Return the list of fragments with newly read fragments
2174                ; prepended.
2175  (define (read-attrib-value delimiter port entities prev-fragments)
2176    (let* ((new-fragments
2177            (cons
2178             (next-token '() (cons delimiter
2179                                   '(#\newline #\return #\space #\tab #\< #\&))
2180                         "XML [10]" port)
2181             prev-fragments))
2182           (cterm (read-char port)))
2183      (if (or (eof-object? cterm) (eqv? cterm delimiter))
2184          new-fragments
2185          (case cterm
2186            ((#\newline #\space #\tab)
2187              (read-attrib-value delimiter port entities
2188                                 (cons " " new-fragments)))
2189            ((#\return)
2190              (if (eqv? (peek-char port) #\newline) (read-char port))
2191              (read-attrib-value delimiter port entities
2192                                 (cons " " new-fragments)))
2193            ((#\&)
2194              (cond
2195                ((eqv? (peek-char port) #\#)
2196                  (read-char port)
2197                  (read-attrib-value delimiter port entities
2198                     (cons (string (SSAX:read-char-ref port)) new-fragments)))
2199                (else
2200                 (read-attrib-value delimiter port entities
2201                     (read-named-entity port entities new-fragments)))))
2202            ((#\<) (parser-error port "[CleanAttrVals] broken"))
2203            (else (parser-error port "Can't happen"))))))
2204
2205                ; we have read "&" that introduces a named entity reference.
2206                ; read this reference and return the result of
2207                ; normalizing of the corresponding string
2208                ; (that is, read-attrib-value is applied to the replacement
2209                ; text of the entity)
2210                ; The current position will be after ";" that terminates
2211                ; the entity reference
2212  (define (read-named-entity port entities fragments)
2213    (let ((name (SSAX:read-NCName port)))
2214      (assert-curr-char '(#\;) "XML [68]" port)
2215      (SSAX:handle-parsed-entity port name entities
2216        (lambda (port entities fragments)
2217          (read-attrib-value '*eof* port entities fragments))
2218        (lambda (str1 str2 fragments)
2219          (if (equal? "" str2) (cons str1 fragments)
2220              (cons* str2 str1 fragments)))
2221        fragments)))
2222
2223  (lambda (port entities)
2224    (let loop ((attr-list (make-empty-attlist)))
2225      (if (not (SSAX:ncname-starting-char? (SSAX:skip-S port))) attr-list
2226          (let ((name (SSAX:read-QName port)))
2227            (SSAX:skip-S port)
2228            (assert-curr-char '(#\=) "XML [25]" port)
2229            (SSAX:skip-S port)
2230            (let ((delimiter 
2231                   (assert-curr-char '(#\' #\" ) "XML [10]" port)))
2232              (loop 
2233               (or (attlist-add attr-list 
2234                     (cons name 
2235                           (string-concatenate
2236                            (reverse
2237                             (read-attrib-value delimiter port entities
2238                                                '())))))
2239                   (parser-error port "[uniqattspec] broken for " name))))))))
2240))
2241
2242; a few lines of validation code
2243(run-test (letrec
2244    ((test (lambda (str decl-entities expected-res)
2245          (display "\ninput: ") (write str) (display "\nResult: ")
2246          (let ((result
2247                 (call-with-input-string str
2248              (lambda (port) (SSAX:read-attributes port decl-entities)))))
2249            (write result) (newline)
2250            (assert (equal? result expected-res))))))
2251    (test "" '() '())
2252    (test "href='http://a\tb\r\n\r\n\nc'" '()
2253          `((,(string->symbol "href") . "http://a b   c")))
2254    (test "_1 ='12&amp;' _2= \"\r\n\t12&#10;3\">" '()
2255          '((_1 . "12&") (_2 . "  12\n3")))
2256    (test "\tAbc='&lt;&amp;&gt;&#x0A;'\nNext='12&ent;34' />" 
2257          '((ent . "&lt;xx&gt;"))
2258          `((,(string->symbol "Abc") . ,(string-append "<&>"
2259                                            (string #\newline)))
2260            (,(string->symbol "Next") . "12<xx>34")))
2261    (test "\tAbc='&lt;&amp;&gt;&#x0A;'\nNext='12&en;34' />" 
2262          `((en . ,(lambda () (open-input-string "&quot;xx&apos;"))))
2263          `((,(string->symbol "Abc") . ,(string-append "<&>"
2264                                            (string #\newline)))
2265            (,(string->symbol "Next") . "12\"xx'34")))
2266    (test "\tAbc='&lt;&amp;&gt;&#x0A;'\nNext='12&ent;34' />" 
2267          '((ent . "&lt;&ent1;T;&gt;") (ent1 . "&amp;"))
2268          `((,(string->symbol "Abc") . ,(string-append "<&>"
2269                                            (string #\newline)))
2270            (,(string->symbol "Next") . "12<&T;>34")))
2271    (assert (failed?
2272        (test "\tAbc='&lt;&amp;&gt;&#x0A;'\nNext='12&ent;34' />" 
2273          '((ent . "<&ent1;T;&gt;") (ent1 . "&amp;")) '())))
2274    (assert (failed?
2275        (test "\tAbc='&lt;&amp;&gt;&#x0A;'\nNext='12&ent;34' />" 
2276          '((ent . "&lt;&ent;T;&gt;") (ent1 . "&amp;")) '())))
2277    (assert (failed?
2278        (test "\tAbc='&lt;&amp;&gt;&#x0A;'\nNext='12&ent;34' />" 
2279          '((ent . "&lt;&ent1;T;&gt;") (ent1 . "&ent;")) '())))
2280    (test "html:href='http://a\tb\r\n\r\n\nc'" '()
2281          `(((,(string->symbol "html") . ,(string->symbol "href"))
2282             . "http://a b   c")))
2283    (test "html:href='ref1' html:src='ref2'" '()
2284          `(((,(string->symbol "html") . ,(string->symbol "href"))
2285             . "ref1")
2286            ((,(string->symbol "html") . ,(string->symbol "src"))
2287             . "ref2")))
2288    (test "html:href='ref1' xml:html='ref2'" '()
2289          `(((,(string->symbol "html") . ,(string->symbol "href"))
2290             . "ref1")
2291            ((,SSAX:Prefix-XML . ,(string->symbol "html"))
2292             . "ref2")))
2293    (assert (failed? (test "html:href='ref1' html:href='ref2'" '() '())))
2294    (assert (failed? (test "html:href='<' html:href='ref2'" '() '())))
2295    (assert (failed? (test "html:href='ref1' html:href='&ref2;'" '() '())))
2296))
2297
2298; SSAX:resolve-name PORT UNRES-NAME NAMESPACES apply-default-ns?
2299;
2300; Convert an UNRES-NAME to a RES-NAME given the appropriate NAMESPACES
2301; declarations.
2302; the last parameter apply-default-ns? determines if the default
2303; namespace applies (for instance, it does not for attribute names)
2304;
2305; Per REC-xml-names/#nsc-NSDeclared, "xml" prefix is considered pre-declared
2306; and bound to the namespace name "http://www.w3.org/XML/1998/namespace".
2307;
2308; This procedure tests for the namespace constraints:
2309; http://www.w3.org/TR/REC-xml-names/#nsc-NSDeclared
2310
2311(define (SSAX:resolve-name port unres-name namespaces apply-default-ns?)
2312  (cond
2313   ((pair? unres-name)          ; it's a QNAME
2314    (cons
2315     (cond
2316     ((assq (car unres-name) namespaces) => cadr)
2317     ((eq? (car unres-name) SSAX:Prefix-XML) SSAX:Prefix-XML)
2318     (else
2319      (parser-error port "[nsc-NSDeclared] broken; prefix " (car unres-name))))
2320     (cdr unres-name)))
2321   (apply-default-ns?           ; Do apply the default namespace, if any
2322    (let ((default-ns (assq '*DEFAULT* namespaces)))
2323      (if (and default-ns (cadr default-ns))
2324          (cons (cadr default-ns) unres-name)
2325          unres-name)))         ; no default namespace declared
2326   (else unres-name)))          ; no prefix, don't apply the default-ns
2327           
2328         
2329(run-test
2330 (let* ((namespaces
2331        '((HTML UHTML . URN-HTML)
2332          (HTML UHTML-1 . URN-HTML)
2333          (A    UHTML . URN-HTML)))
2334        (namespaces-def
2335         (cons
2336          '(*DEFAULT* DEF . URN-DEF) namespaces))
2337        (namespaces-undef
2338         (cons
2339          '(*DEFAULT* #f . #f) namespaces-def))
2340        (port (current-input-port)))
2341
2342   (assert (equal? 'ABC 
2343                   (SSAX:resolve-name port 'ABC namespaces #t)))
2344   (assert (equal? '(DEF . ABC)
2345                   (SSAX:resolve-name port 'ABC namespaces-def #t)))
2346   (assert (equal? 'ABC
2347                   (SSAX:resolve-name port 'ABC namespaces-def #f)))
2348   (assert (equal? 'ABC
2349                   (SSAX:resolve-name port 'ABC namespaces-undef #t)))
2350   (assert (equal? '(UHTML . ABC)
2351                   (SSAX:resolve-name port '(HTML . ABC) namespaces-def #t)))
2352   (assert (equal? '(UHTML . ABC)
2353                   (SSAX:resolve-name port '(HTML . ABC) namespaces-def #f)))
2354   (assert (equal? `(,SSAX:Prefix-XML . space)
2355                   (SSAX:resolve-name port 
2356                       `(,(string->symbol "xml") . space) namespaces-def #f)))
2357   (assert (failed?
2358                   (SSAX:resolve-name port '(XXX . ABC) namespaces-def #f)))
2359))
2360
2361
2362; procedure+:   SSAX:uri-string->symbol URI-STR
2363; Convert a URI-STR to an appropriate symbol
2364(define (SSAX:uri-string->symbol uri-str)
2365  (string->symbol uri-str))
2366
2367; procedure+:   SSAX:complete-start-tag TAG PORT ELEMS ENTITIES NAMESPACES
2368;
2369; This procedure is to complete parsing of a start-tag markup. The
2370; procedure must be called after the start tag token has been
2371; read. TAG is an UNRES-NAME. ELEMS is an instance of xml-decl::elems;
2372; it can be #f to tell the function to do _no_ validation of elements
2373; and their attributes.
2374;
2375; This procedure returns several values:
2376;  ELEM-GI: a RES-NAME.
2377;  ATTRIBUTES: element's attributes, an ATTLIST of (RES-NAME . STRING)
2378;       pairs. The list does NOT include xmlns attributes.
2379;  NAMESPACES: the input list of namespaces amended with namespace
2380;       (re-)declarations contained within the start-tag under parsing
2381;  ELEM-CONTENT-MODEL
2382
2383; On exit, the current position in PORT will be the first character after
2384; #\> that terminates the start-tag markup.
2385;
2386; Faults detected:
2387;       VC: XML-Spec.html#enum
2388;       VC: XML-Spec.html#RequiredAttr
2389;       VC: XML-Spec.html#FixedAttr
2390;       VC: XML-Spec.html#ValueType
2391;       WFC: XML-Spec.html#uniqattspec (after namespaces prefixes are resolved)
2392;       VC: XML-Spec.html#elementvalid
2393;       WFC: REC-xml-names/#dt-NSName
2394
2395; Note, although XML Recommendation does not explicitly say it,
2396; xmlns and xmlns: attributes don't have to be declared (although they
2397; can be declared, to specify their default value)
2398
2399; Procedure:  SSAX:complete-start-tag tag-head port elems entities namespaces
2400(define SSAX:complete-start-tag
2401
2402 (let ((xmlns (string->symbol "xmlns"))
2403       (largest-dummy-decl-attr (list SSAX:largest-unres-name #f #f #f)))
2404
2405  ; Scan through the attlist and validate it, against decl-attrs
2406  ; Return an assoc list with added fixed or implied attrs.
2407  ; Note that both attlist and decl-attrs are ATTLISTs, and therefore,
2408  ; sorted
2409  (define (validate-attrs port attlist decl-attrs)
2410
2411    ; Check to see decl-attr is not of use type REQUIRED. Add
2412    ; the association with the default value, if any declared
2413    (define (add-default-decl decl-attr result)
2414      (let-values*
2415         (((attr-name content-type use-type default-value)
2416           (apply values decl-attr)))
2417         (and (eq? use-type 'REQUIRED)
2418              (parser-error port "[RequiredAttr] broken for" attr-name))
2419         (if default-value
2420             (cons (cons attr-name default-value) result)
2421             result)))
2422
2423    (let loop ((attlist attlist) (decl-attrs decl-attrs) (result '()))
2424      (if (attlist-null? attlist)
2425          (attlist-fold add-default-decl result decl-attrs)
2426          (let-values*
2427           (((attr attr-others)
2428             (attlist-remove-top attlist))
2429            ((decl-attr other-decls)
2430             (if (attlist-null? decl-attrs)
2431                 (values largest-dummy-decl-attr decl-attrs)
2432                 (attlist-remove-top decl-attrs)))
2433            )
2434           (case (name-compare (car attr) (car decl-attr))
2435             ((<) 
2436              (if (or (eq? xmlns (car attr))
2437                      (and (pair? (car attr)) (eq? xmlns (caar attr))))
2438                  (loop attr-others decl-attrs (cons attr result))
2439                  (parser-error port "[ValueType] broken for " attr)))
2440             ((>) 
2441              (loop attlist other-decls 
2442                    (add-default-decl decl-attr result)))
2443             (else      ; matched occurrence of an attr with its declaration
2444              (let-values*
2445               (((attr-name content-type use-type default-value)
2446                 (apply values decl-attr)))
2447               ; Run some tests on the content of the attribute
2448               (cond
2449                ((eq? use-type 'FIXED)
2450                 (or (equal? (cdr attr) default-value)
2451                     (parser-error port "[FixedAttr] broken for " attr-name)))
2452                ((eq? content-type 'CDATA) #t) ; everything goes
2453                ((pair? content-type)
2454                 (or (member (cdr attr) content-type)
2455                     (parser-error port "[enum] broken for " attr-name "="
2456                            (cdr attr))))
2457                (else
2458                 (SSAX:warn port "declared content type " content-type
2459                       " not verified yet")))
2460               (loop attr-others other-decls (cons attr result)))))
2461           ))))
2462           
2463
2464  ; Add a new namespace declaration to namespaces.
2465  ; First we convert the uri-str to a uri-symbol and search namespaces for
2466  ; an association (_ user-prefix . uri-symbol).
2467  ; If found, we return the argument namespaces with an association
2468  ; (prefix user-prefix . uri-symbol) prepended.
2469  ; Otherwise, we prepend (prefix uri-symbol . uri-symbol)
2470  (define (add-ns port prefix uri-str namespaces)
2471    (and (equal? "" uri-str)
2472         (parser-error port "[dt-NSName] broken for " prefix))
2473    (let ((uri-symbol (SSAX:uri-string->symbol uri-str)))
2474      (let loop ((nss namespaces))
2475        (cond
2476         ((null? nss)
2477          (cons (cons* prefix uri-symbol uri-symbol) namespaces))
2478         ((eq? uri-symbol (cddar nss))
2479          (cons (cons* prefix (cadar nss) uri-symbol) namespaces))
2480         (else (loop (cdr nss)))))))
2481     
2482  ; partition attrs into proper attrs and new namespace declarations
2483  ; return two values: proper attrs and the updated namespace declarations
2484  (define (adjust-namespace-decl port attrs namespaces)
2485    (let loop ((attrs attrs) (proper-attrs '()) (namespaces namespaces))
2486      (cond
2487       ((null? attrs) (values proper-attrs namespaces))
2488       ((eq? xmlns (caar attrs))        ; re-decl of the default namespace
2489        (loop (cdr attrs) proper-attrs 
2490              (if (equal? "" (cdar attrs))      ; un-decl of the default ns
2491                  (cons (cons* '*DEFAULT* #f #f) namespaces)
2492                  (add-ns port '*DEFAULT* (cdar attrs) namespaces))))
2493       ((and (pair? (caar attrs)) (eq? xmlns (caaar attrs)))
2494        (loop (cdr attrs) proper-attrs
2495              (add-ns port (cdaar attrs) (cdar attrs) namespaces)))
2496       (else
2497        (loop (cdr attrs) (cons (car attrs) proper-attrs) namespaces)))))
2498
2499    ; The body of the function
2500 (lambda (tag-head port elems entities namespaces)
2501  (let-values* 
2502   ((attlist (SSAX:read-attributes port entities))
2503    (empty-el-tag?
2504     (begin
2505       (SSAX:skip-S port)
2506       (and
2507        (eqv? #\/ 
2508              (assert-curr-char '(#\> #\/) "XML [40], XML [44], no '>'" port))
2509        (assert-curr-char '(#\>) "XML [44], no '>'" port))))
2510    ((elem-content decl-attrs)  ; see xml-decl for their type
2511     (if elems                  ; elements declared: validate!
2512         (cond
2513          ((assoc tag-head elems) =>
2514           (lambda (decl-elem)          ; of type xml-decl::decl-elem
2515             (values
2516              (if empty-el-tag? 'EMPTY-TAG (cadr decl-elem))
2517              (caddr decl-elem))))
2518          (else
2519           (parser-error port "[elementvalid] broken, no decl for " tag-head)))
2520         (values                ; non-validating parsing
2521          (if empty-el-tag? 'EMPTY-TAG 'ANY)
2522          #f)                   ; no attributes declared
2523         ))
2524    (merged-attrs (if decl-attrs (validate-attrs port attlist decl-attrs)
2525                      (attlist->alist attlist)))
2526    ((proper-attrs namespaces)
2527     (adjust-namespace-decl port merged-attrs namespaces))
2528    )
2529   ;(cerr "proper attrs: " proper-attrs nl)
2530   ; build the return value
2531   (values
2532    (SSAX:resolve-name port tag-head namespaces #t)
2533    (fold-right
2534     (lambda (name-value attlist)
2535       (or
2536        (attlist-add attlist
2537           (cons (SSAX:resolve-name port (car name-value) namespaces #f)
2538                 (cdr name-value)))
2539        (parser-error port "[uniqattspec] after NS expansion broken for " 
2540               name-value)))
2541     (make-empty-attlist)
2542     proper-attrs)
2543    namespaces
2544    elem-content)))))
2545
2546(run-test
2547 (let* ((urn-a (string->symbol "urn:a"))
2548        (urn-b (string->symbol "urn:b"))
2549        (urn-html (string->symbol "http://w3c.org/html"))
2550        (namespaces
2551         `((#f '"UHTML" . ,urn-html)
2552           ('"A"  '"UA" . ,urn-a)))
2553          (test
2554           (lambda (tag-head-name elems str)
2555             (call-with-input-string str
2556                (lambda (port)
2557                  (call-with-values
2558                      (lambda ()
2559                              (SSAX:complete-start-tag
2560                               (call-with-input-string tag-head-name
2561                                      (lambda (port) (SSAX:read-QName port)))
2562                               port
2563                               elems '() namespaces))
2564                    list))))))
2565
2566   ; First test with no validation of elements
2567   ;(test "TAG1" #f "")
2568   (assert (equal? `('"TAG1" () ,namespaces ANY)
2569                   (test "TAG1" #f ">")))
2570   (assert (equal? `('"TAG1" () ,namespaces EMPTY-TAG)
2571                   (test "TAG1" #f "/>")))
2572   (assert (equal? `('"TAG1" (('"HREF" . "a")) ,namespaces EMPTY-TAG)
2573                   (test "TAG1" #f "HREF='a'/>")))
2574   (assert (equal? `(('"UA" . '"TAG1") (('"HREF" . "a"))
2575                     ,(cons `(*DEFAULT* '"UA" . ,urn-a) namespaces) ANY)
2576                   (test "TAG1" #f "HREF='a' xmlns='urn:a'>")))
2577   (assert (equal? `('"TAG1" (('"HREF" . "a"))
2578                     ,(cons '(*DEFAULT* #f . #f) namespaces) ANY)
2579                   (test "TAG1" #f "HREF='a' xmlns=''>")))
2580   (assert (failed? (test "UA:TAG1" #f "HREF='a' xmlns=''/>")))
2581   (assert (equal? `(('"UA" . '"TAG1") ((('"UA" . '"HREF") . "a"))
2582                     ,(cons '(*DEFAULT* #f . #f) namespaces) ANY)
2583                   (test "A:TAG1" #f "A:HREF='a' xmlns=''>")))
2584   (assert (equal? `(('"UA" . '"TAG1") ((('"UA" . '"HREF") . "a"))
2585                     ,(cons `(*DEFAULT* ,urn-b . ,urn-b) namespaces) ANY)
2586                   (test "A:TAG1" #f "A:HREF='a' xmlns='urn:b'>")))
2587   (assert (failed? (test "B:TAG1" #f "A:HREF='a' xmlns:b=''/>")))
2588   (assert (equal? `((,urn-b . '"TAG1") ((('"UA" . '"HREF") . "a"))
2589                     ,(cons `('"B" ,urn-b . ,urn-b) namespaces) ANY)
2590                   (test "B:TAG1" #f "A:HREF='a' xmlns:B='urn:b'>")))
2591   (assert (equal? `((,urn-b . '"TAG1") ((('"UA" . '"HREF") . "a")
2592                                         ((,urn-b . '"SRC") . "b"))
2593                     ,(cons `('"B" ,urn-b . ,urn-b) namespaces) ANY)
2594                   (test "B:TAG1" #f 
2595                         "B:SRC='b' A:HREF='a' xmlns:B='urn:b'>")))
2596   (assert (equal? `((,urn-b . '"TAG1") ((('"UA" . '"HREF") . "a")
2597                                         ((,urn-b . '"HREF") . "b"))
2598                          ,(cons `('"B" ,urn-b . ,urn-b) namespaces) ANY)
2599                   (test "B:TAG1" #f 
2600                         "B:HREF=\"b\" A:HREF='a' xmlns:B='urn:b'>")))
2601   ; must be an error! Duplicate attr
2602   (assert (failed? (test "B:TAG1" #f
2603                          "HREF=\"b\" HREF='a' xmlns:B='urn:a'/>")))
2604   ; must be an error! Duplicate attr after ns expansion
2605   (assert (failed? (test "B:TAG1" #f 
2606                          "B:HREF=\"b\" A:HREF='a' xmlns:B='urn:a'/>")))
2607   (assert (equal? `(('"UA" . '"TAG1") (('"HREF" . "a")
2608                                        (('"UA" . '"HREF") . "b"))
2609                     ,(cons `(*DEFAULT* '"UA" . ,urn-a) namespaces) ANY)
2610                   (test "TAG1" #f 
2611                         "A:HREF=\"b\" HREF='a' xmlns='urn:a'>")))
2612   (assert (equal? `('"TAG1" ((('"UHTML" . '"HREF") . "a")
2613                              ((,urn-b . '"HREF") . "b"))
2614                     ,(append `(
2615                         ('"HTML" '"UHTML" . ,urn-html)
2616                         ('"B" ,urn-b . ,urn-b))
2617                              namespaces) ANY)
2618                   (test "TAG1" #f 
2619                         "B:HREF=\"b\" xmlns:B='urn:b' xmlns:HTML='http://w3c.org/html' HTML:HREF='a' >")))
2620
2621   ; Now test the validating parsing
2622   ; No decl for tag1
2623   (assert (failed? (test "TAG1" '((TAG2 ANY ()))
2624                          "B:HREF='b' xmlns:B='urn:b'>")))
2625   ; No decl for HREF elem
2626   (cond-expand
2627    ((not (or scm mit-scheme))  ; Regretfully, SCM treats '() as #f
2628     (assert (failed?
2629              (test "TAG1" '(('"TAG1" ANY ()))
2630                    "B:HREF='b' xmlns:B='urn:b'>"))))
2631    (else #t))
2632   ; No decl for HREF elem
2633   (assert (failed?
2634            (test "TAG1" '(('"TAG1" ANY (('"HREF1" CDATA IMPLIED #f))))
2635            "B:HREF='b' xmlns:B='urn:b'>")))
2636   (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces EMPTY-TAG)
2637       (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA REQUIRED #f))))
2638             "HREF='b'/>")))
2639   (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces PCDATA)
2640       (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA REQUIRED #f))))
2641             "HREF='b'>")))
2642   ; Req'd attribute not given error
2643   (assert (failed? 
2644            (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA REQUIRED #f))))
2645                  ">")))
2646   ; Wrong content-type of the attribute
2647   (assert (failed? 
2648       (test "TAG1" '(('"TAG1" PCDATA (('"HREF" ("c") REQUIRED #f))))
2649             "HREF='b'>")))
2650   (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces PCDATA)
2651       (test "TAG1" '(('"TAG1" PCDATA (('"HREF" ("c" "b") IMPLIED #f))))
2652             "HREF='b'>")))
2653   (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces PCDATA)
2654       (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA IMPLIED "c"))))
2655             "HREF='b'>")))
2656   ; Bad fixed attribute
2657   (assert (failed? 
2658         (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA FIXED "c"))))
2659               "HREF='b'>")))
2660   (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces PCDATA)
2661       (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA FIXED "b"))))
2662             "HREF='b'>")))
2663   (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces PCDATA)
2664       (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA FIXED "b")))) ">")))
2665   (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces PCDATA)
2666       (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA IMPLIED "b")))) ">")))
2667   (assert (equal? `('"TAG1" () ,namespaces PCDATA)
2668       (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA IMPLIED #f)))) ">")))
2669   ; Undeclared attr
2670   (assert (failed? 
2671        (test "TAG1"
2672              '(('"TAG1" PCDATA ((('"A" . '"HREF") CDATA IMPLIED "c"))))
2673              "HREF='b'>")))
2674   (assert (equal? `('"TAG1" (('"HREF" . "b") (('"UA" . '"HREF") . "c"))
2675                          ,namespaces PCDATA)
2676       (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA REQUIRED #f)
2677                                       (('"A" . '"HREF") CDATA IMPLIED "c"))))
2678             "HREF='b'>")))
2679   (assert (equal? `(('"UA" . '"TAG1")
2680                     (('"HREF" . "b") (('"UA" . '"HREF") . "c"))
2681                     ,namespaces PCDATA)
2682       (test "A:TAG1" '((('"A" . '"TAG1") PCDATA
2683                         (('"HREF" NMTOKEN REQUIRED #f)
2684                          (('"A" . '"HREF") CDATA IMPLIED "c"))))
2685             "HREF='b'>")))
2686   (assert (equal? `((,urn-b . '"TAG1") (('"HREF" . "b"))
2687                     ,(cons `('"B" ,urn-b . ,urn-b) namespaces) PCDATA)
2688       (test "B:TAG1" '((('"B" . '"TAG1") PCDATA (('"HREF" CDATA REQUIRED #f)
2689                           (('"xmlns" . '"B") CDATA IMPLIED "urn:b"))))
2690             "HREF='b'>")))
2691   (assert (equal? `((,urn-b . '"TAG1") (((,urn-b . '"HREF") . "b"))
2692                          ,(cons `('"B" ,urn-b . ,urn-b) namespaces) PCDATA)
2693       (test "B:TAG1" '((('"B" . '"TAG1") PCDATA
2694                         ((('"B" . '"HREF") CDATA REQUIRED #f)
2695                          (('"xmlns" . '"B") CDATA IMPLIED "urn:b"))))
2696             "B:HREF='b'>")))
2697   (assert (equal? `((,urn-b . '"TAG1") (('"HREF" . "b"))
2698                     ,(cons `(*DEFAULT* ,urn-b . ,urn-b) namespaces) PCDATA)
2699       (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA REQUIRED #f)
2700                           ('"xmlns" CDATA IMPLIED "urn:b"))))
2701             "HREF='b'>")))
2702   ; xmlns not declared
2703   (assert (equal? `((,urn-b . '"TAG1") (('"HREF" . "b"))
2704                     ,(cons `(*DEFAULT* ,urn-b . ,urn-b) namespaces) PCDATA)
2705       (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA REQUIRED #f)
2706                           )))
2707             "HREF='b' xmlns='urn:b'>")))
2708   ; xmlns:B not declared
2709   (assert (equal? `((,urn-b . '"TAG1") (((,urn-b . '"HREF") . "b"))
2710                     ,(cons `('"B" ,urn-b . ,urn-b) namespaces) PCDATA)
2711       (test "B:TAG1" '((('"B" . '"TAG1") PCDATA
2712                         ((('"B" . '"HREF") CDATA REQUIRED #f)
2713                           )))
2714             "B:HREF='b' xmlns:B='urn:b'>")))
2715))
2716
2717; procedure+:   SSAX:read-external-ID PORT
2718;
2719; This procedure parses an ExternalID production:
2720; [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2721;               | 'PUBLIC' S PubidLiteral S SystemLiteral
2722; [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2723; [12] PubidLiteral ::=  '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2724; [13] PubidChar ::=  #x20 | #xD | #xA | [a-zA-Z0-9]
2725;                         | [-'()+,./:=?;!*#@$_%]
2726;
2727; This procedure is supposed to be called when an ExternalID is expected;
2728; that is, the current character must be either #\S or #\P that start
2729; correspondingly a SYSTEM or PUBLIC token. This procedure returns the
2730; SystemLiteral as a string. A PubidLiteral is disregarded if present.
2731 
2732(define (SSAX:read-external-ID port)
2733  (let ((discriminator (SSAX:read-NCName port)))
2734    (assert-curr-char SSAX:S-chars "space after SYSTEM or PUBLIC" port)
2735    (SSAX:skip-S port)
2736    (let ((delimiter 
2737          (assert-curr-char '(#\' #\" ) "XML [11], XML [12]" port)))
2738      (cond
2739        ((eq? discriminator (string->symbol "SYSTEM"))
2740          (begin0
2741            (next-token '() (list delimiter) "XML [11]" port)
2742            (read-char port)    ; reading the closing delim
2743            ))
2744         ((eq? discriminator (string->symbol "PUBLIC"))
2745           (skip-until (list delimiter) port)
2746           (assert-curr-char SSAX:S-chars "space after PubidLiteral" port)
2747           (SSAX:skip-S port)
2748           (let* ((delimiter 
2749                  (assert-curr-char '(#\' #\" ) "XML [11]" port))
2750                  (systemid
2751                    (next-token '() (list delimiter) "XML [11]" port)))
2752                (read-char port)        ; reading the closing delim
2753                systemid))
2754         (else
2755           (parser-error port "XML [75], " discriminator 
2756                  " rather than SYSTEM or PUBLIC"))))))
2757
2758
2759;-----------------------------------------------------------------------------
2760;                       Higher-level parsers and scanners
2761;
2762; They parse productions corresponding to the whole (document) entity
2763; or its higher-level pieces (prolog, root element, etc).
2764
2765
2766; Scan the Misc production in the context
2767; [1]  document ::=  prolog element Misc*
2768; [22] prolog ::= XMLDecl? Misc* (doctypedec l Misc*)?
2769; [27] Misc ::= Comment | PI |  S
2770;
2771; The following function should be called in the prolog or epilog contexts.
2772; In these contexts, whitespaces are completely ignored.
2773; The return value from SSAX:scan-Misc is either a PI-token,
2774; a DECL-token, a START token, or EOF.
2775; Comments are ignored and not reported.
2776
2777(define (SSAX:scan-Misc port)
2778  (let loop ((c (SSAX:skip-S port)))
2779    (cond
2780      ((eof-object? c) c)
2781      ((not (char=? c #\<))
2782        (parser-error port "XML [22], char '" c "' unexpected"))
2783      (else
2784        (let ((token (SSAX:read-markup-token port)))
2785          (case (xml-token-kind token)
2786            ((COMMENT) (loop (SSAX:skip-S port)))
2787            ((PI DECL START) token)
2788            (else
2789              (parser-error port "XML [22], unexpected token of kind "
2790                     (xml-token-kind token)
2791                     ))))))))
2792
2793; procedure+:   SSAX:read-char-data PORT EXPECT-EOF? STR-HANDLER SEED
2794;
2795; This procedure is to read the character content of an XML document
2796; or an XML element.
2797; [43] content ::=
2798;       (element | CharData | Reference | CDSect | PI
2799;       | Comment)*
2800; To be more precise, the procedure reads CharData, expands CDSect
2801; and character entities, and skips comments. The procedure stops
2802; at a named reference, EOF, at the beginning of a PI or a start/end tag.
2803;
2804; port
2805;       a PORT to read
2806; expect-eof?
2807;       a boolean indicating if EOF is normal, i.e., the character
2808;       data may be terminated by the EOF. EOF is normal
2809;       while processing a parsed entity.
2810; str-handler
2811;       a STR-HANDLER
2812; seed
2813;       an argument passed to the first invocation of STR-HANDLER.
2814;
2815; The procedure returns two results: SEED and TOKEN.
2816; The SEED is the result of the last invocation of STR-HANDLER, or the
2817; original seed if STR-HANDLER was never called.
2818;
2819; TOKEN can be either an eof-object (this can happen only if
2820; expect-eof? was #t), or:
2821;     - an xml-token describing a START tag or an END-tag;
2822;       For a start token, the caller has to finish reading it.
2823;     - an xml-token describing the beginning of a PI. It's up to an
2824;       application to read or skip through the rest of this PI;
2825;     - an xml-token describing a named entity reference.
2826;
2827; CDATA sections and character references are expanded inline and
2828; never returned. Comments are silently disregarded.
2829;
2830; As the XML Recommendation requires, all whitespace in character data
2831; must be preserved. However, a CR character (#xD) must be disregarded
2832; if it appears before a LF character (#xA), or replaced by a #xA character
2833; otherwise. See Secs. 2.10 and 2.11 of the XML Recommendation. See also
2834; the canonical XML Recommendation.
2835
2836        ; SSAX:read-char-data port expect-eof? str-handler seed
2837(define SSAX:read-char-data
2838 (let
2839     ((terminators-usual '(#\< #\& #\return))
2840      (terminators-usual-eof '(#\< *eof* #\& #\return))
2841
2842      (handle-fragment
2843       (lambda (fragment str-handler seed)
2844         (if (string-null? fragment) seed
2845             (str-handler fragment "" seed))))
2846      )
2847
2848   (lambda (port expect-eof? str-handler seed)
2849
2850     ; Very often, the first character we encounter is #\<
2851     ; Therefore, we handle this case in a special, fast path
2852     (if (eqv? #\< (peek-char port))
2853
2854         ; The fast path
2855         (let ((token (SSAX:read-markup-token port)))
2856           (case (xml-token-kind token)
2857             ((START END)       ; The most common case
2858              (values seed token))
2859             ((CDSECT)
2860              (let ((seed (SSAX:read-CDATA-body port str-handler seed)))
2861                (SSAX:read-char-data port expect-eof? str-handler seed)))
2862             ((COMMENT) (SSAX:read-char-data port expect-eof?
2863                                             str-handler seed))
2864             (else
2865              (values seed token))))
2866
2867
2868         ; The slow path
2869         (let ((char-data-terminators
2870                (if expect-eof? terminators-usual-eof terminators-usual)))
2871
2872           (let loop ((seed seed))
2873             (let* ((fragment
2874                     (next-token '() char-data-terminators 
2875                                 "reading char data" port))
2876                    (term-char (peek-char port)) ; one of char-data-terminators
2877                    )
2878               (if (eof-object? term-char)
2879                   (values
2880                    (handle-fragment fragment str-handler seed)
2881                    term-char)
2882                   (case term-char
2883                     ((#\<)
2884                      (let ((token (SSAX:read-markup-token port)))
2885                        (case (xml-token-kind token)
2886                          ((CDSECT)
2887                           (loop
2888                            (SSAX:read-CDATA-body port str-handler
2889                                (handle-fragment fragment str-handler seed))))
2890                          ((COMMENT)
2891                           (loop (handle-fragment fragment str-handler seed)))
2892                          (else
2893                           (values
2894                            (handle-fragment fragment str-handler seed)
2895                            token)))))
2896                     ((#\&)
2897                      (case (peek-next-char port)
2898                        ((#\#) (read-char port) 
2899                         (loop (str-handler fragment
2900                                       (string (SSAX:read-char-ref port))
2901                                       seed)))
2902                        (else
2903                         (let ((name (SSAX:read-NCName port)))
2904                           (assert-curr-char '(#\;) "XML [68]" port)
2905                           (values
2906                            (handle-fragment fragment str-handler seed)
2907                            (make-xml-token 'ENTITY-REF name))))))
2908                     (else              ; This must be a CR character
2909                      (if (eqv? (peek-next-char port) #\newline)
2910                          (read-char port))
2911                      (loop (str-handler fragment (string #\newline) seed))))
2912                   ))))))))
2913
2914
2915; a few lines of validation code
2916(run-test (letrec
2917  ((a-tag (make-xml-token 'START (string->symbol "BR")))
2918   (a-ref (make-xml-token 'ENTITY-REF (string->symbol "lt")))
2919   (eof-object (with-input-from-string "" read))
2920   (str-handler (lambda (fragment foll-fragment seed)
2921     (if (string-null? foll-fragment) (cons fragment seed)
2922         (cons* foll-fragment fragment seed))))
2923   (test (lambda (str expect-eof? expected-data expected-token)
2924           (display "\nbody: ") (write str) (display "\nResult: ")
2925          (let-values*
2926           (((seed token)
2927             (call-with-input-string str
2928                (lambda (port)
2929                 (SSAX:read-char-data port expect-eof? str-handler '()))))
2930            (result (reverse seed)))
2931           (write result)
2932           (display " ")
2933           (display token)
2934           (assert (equal? result expected-data)
2935                   (equal? token expected-token)))))
2936   )
2937  (test "" #t '() eof-object)
2938  (assert (failed? (test "" #f '() eof-object)))
2939  (test "  " #t '("  ") eof-object)
2940  (test "<BR/>" #f '() a-tag)
2941  (test " <BR  />" #f '(" ") a-tag)
2942
2943  (test " &lt;" #f '(" ") a-ref)
2944  (test " a&lt;" #f '(" a") a-ref)
2945  (test " a &lt;" #f '(" a ") a-ref)
2946
2947  (test " <!-- comment--> a  a<BR/>" #f '(" " " a  a") a-tag)
2948  (test " <!-- comment-->\ra  a<BR/>" #f '(" " "" "\n" "a  a") a-tag)
2949  (test " <!-- comment-->\r\na  a<BR/>" #f '(" " "" "\n" "a  a") a-tag)
2950  (test " <!-- comment-->\r\na\t\r\r\na<BR/>" #f
2951        '(" " "" "\n" "a\t" "\n" "" "\n" "a") a-tag)
2952  (test "a<!-- comment--> a  a<BR/>" #f '("a" " a  a") a-tag)
2953  (test "&#x21;<BR/>" #f '("" "!") a-tag)
2954  (test "&#x21;\n<BR/>" #f '("" "!" "\n") a-tag)
2955  (test "\t&#x21;\n<BR/>" #f '("\t" "!" "\n") a-tag)
2956  (test "\t&#x21;\na a<BR/>" #f '("\t" "!" "\na a") a-tag)
2957  (test "\t&#x21;\ra a<BR/>" #f '("\t" "!" "" "\n" "a a") a-tag)
2958
2959  (test " \ta &#x21;   b <BR/>" #f '(" \ta " "!" "   b ") a-tag)
2960  (test " \ta &#x20;   b <BR/>" #f '(" \ta " " " "   b ") a-tag)
2961
2962  (test "<![CDATA[<]]><BR/>" #f '("<") a-tag)
2963  (test "<![CDATA[]]]><BR/>" #f '("]") a-tag)
2964  (test "\t<![CDATA[<]]><BR/>" #f '("\t" "<") a-tag)
2965  (test "\t<![CDATA[<]]>a b<BR/>" #f '("\t" "<" "a b") a-tag)
2966  (test "\t<![CDATA[<]]>  a b<BR/>" #f '("\t" "<" "  a b") a-tag)
2967
2968  (test "\td <![CDATA[  <\r\r\n]]>  a b<BR/>" #f 
2969        '("\td " "  <" "\n" "" "\n" "  a b") a-tag)
2970))
2971
2972
2973
2974; procedure+:   SSAX:assert-token TOKEN KIND GI
2975; Make sure that TOKEN is of anticipated KIND and has anticipated GI
2976; Note GI argument may actually be a pair of two symbols, Namespace
2977; URI or the prefix, and of the localname.
2978; If the assertion fails, error-cont is evaluated by passing it
2979; three arguments: token kind gi. The result of error-cont is returned.
2980(define (SSAX:assert-token token kind gi error-cont)
2981  (or
2982    (and (xml-token? token)
2983      (eq? kind (xml-token-kind token))
2984      (equal? gi (xml-token-head token)))
2985    (error-cont token kind gi)))
2986
Note: See TracBrowser for help on using the repository browser.