source: project/release/3/ssax/ssax-core.scm @ 13362

Last change on this file since 13362 was 13362, checked in by Jim Ursetto, 11 years ago

ssax: remove STRING from standard bindings, allows Unicode entities

File size: 113.9 KB
Line 
1; May be compiled with:
2;       chicken snapshot.scm -output-file snapshot.c
3;  and
4;       gcc -O3 -fomit-frame-pointer snapshot.c -lchicken -lm -lstuffed-chicken
5
6(declare
7  (fixnum)
8  (not standard-bindings string) ; you must load utf8 to convert numeric entities > 255
9  (export SSAX:warn SSAX:skip-pi attlist-fold
10          SSAX:Prefix-XML SSAX:complete-start-tag SSAX:skip-S SSAX:read-markup-token
11          SSAX:assert-token parser-error SSAX:read-char-data 
12          SSAX:skip-internal-dtd SSAX:S-chars SSAX:read-QName
13          assert-curr-char SSAX:read-NCNname name-compare SSAX:resolve-name
14          SSAX:largest-unres-name SSAX:read-cdata-body SSAX:read-attributes
15          SSAX:ncname-starting-char? SSAX:read-external-ID SSAX:scan-Misc
16          SSAX:handle-parsed-entity SSAX:complete-start-tag xml-token-head xml-token-kind
17          SSAX:uri-string->symbol string-whitespace? SSAX:read-pi-body-as-string) )
18
19(use srfi-1 srfi-13)
20
21;(define pp pretty-print)
22;(define command-line argv)
23;KL
24; Chicken-enabled version of Oleg Kiselyov's Standard Scheme "Prelude"
25;
26
27;                  My Standard Scheme "Prelude"
28;
29; This version of the prelude contains several forms and procedures
30; that are specific to a Gambit-C 3.0 system.
31; See myenv-scm.scm, myenv-bigloo.scm, etc. for versions
32; of this prelude that are tuned to other Scheme systems.
33;
34; $Id: ssax-core.scm,v 1.2 2004/05/31 00:05:04 flw Exp $
35
36
37;;; assert truth of an expression (or a sequence of expressions)
38;;; if there is more than one expression, they're 'AND'ed
39(define-macro (assert . x)
40                  (if (null? (cdr x))
41                    `(or ,@x (error "failed assertion" ',@x))
42                    `(or (and ,@x) (error "failed assertion" '(,@x)))))
43
44;;;
45(define (assure exp error-msg)
46  (or exp (error error-msg)))
47
48
49;;; Gambit's include and declare are disabled
50;(define-macro (include file) `(begin #f))
51;(define-macro (declare . x)  `(begin #f))
52
53
54;;; like cout << arguments << args
55;;;   where argument can be any Scheme object. If it's a procedure
56;;;   (without args) it's executed rather than printed (like newline)
57(define (cout . args)
58  (for-each (lambda (x)
59              (if (procedure? x) (x) (display x)))
60            args))
61
62(define (cerr . args)
63  (for-each (lambda (x)
64    (if (procedure? x) (x (current-error-port)) (display x (current-error-port))))
65            args))
66
67(define nl (string #\newline))
68
69(define (identify-error msg args . disposition-msgs)
70  (let ((port (current-error-port)))
71    (newline port)
72    (display "ERROR" port)
73    (display msg port)
74    (for-each (lambda (msg) (display msg port))
75              (append args disposition-msgs))
76    (newline port)))
77
78; Some useful increment/decrement operators
79; Note, ##fixnum prefix is Gambit-specific, it means that the
80; operands assumed FIXNUM (as they ought to be anyway).
81; This perfix could be safely removed: it'll leave the code just as
82; correct, but more portable (and less efficient)
83
84; Mutable increment
85(define-macro (++! x) `(set! ,x (+ 1 ,x)))
86
87; Read-only increment
88(define-macro (++ x) `(add1 ,x))
89
90; Mutable decrement
91(define-macro (--! x) `(set! ,x (- ,x 1)))
92
93; Read-only decrement
94(define-macro (-- x) `(sub1 ,x))
95
96
97; Some useful control operators
98
99                        ; if condition is true, execute stmts in turn
100                        ; and return the result of the last statement
101                        ; otherwise, return #f
102(define-macro (when condition . stmts)
103  `(and ,condition (begin ,@stmts)))
104 
105
106                        ; if condition is false execute stmts in turn
107                        ; and return the result of the last statement
108                        ; otherwise, return #t
109                        ; This primitive is often called 'unless'
110(define-macro (whennot condition . stmts)
111  `(or ,condition (begin ,@stmts)))
112
113
114                        ; Execute a sequence of forms and return the
115                        ; result of the _first_ one. Like PROG1 in Lisp.
116                        ; Typically used to evaluate one or more forms with
117                        ; side effects and return a value that must be
118                        ; computed before some or all of the side effects
119                        ; happen.
120(define-macro (begin0 form . forms)
121  (let ((var (gensym)))
122    `(let ((,var ,form)) ,@forms ,var)))
123
124                        ; Prepend an ITEM to a LIST, like a Lisp macro PUSH
125                        ; an ITEM can be an expression, but ls must be a VAR
126(define-macro (push! item ls)
127  `(set! ,ls (cons ,item ,ls)))
128
129                        ; Is str the empty string?
130                        ; string-null? str -> bool
131                        ; See Olin Shiver's Underground String functions
132(define (string-null? str) (zero? (string-length str)))
133
134
135; Like let* but allowing for multiple-value bindings
136(define-macro (let-values* bindings . body)
137  (if (null? bindings) (cons 'begin body)
138      (apply (lambda (vars initializer)
139         (let ((cont 
140                (cons 'let-values* 
141                      (cons (cdr bindings) body))))
142           (cond
143            ((not (pair? vars))         ; regular let case, a single var
144             `(let ((,vars ,initializer)) ,cont))
145            ((null? (cdr vars))         ; single var, see the prev case
146             `(let ((,(car vars) ,initializer)) ,cont))
147           (else                        ; the most generic case
148            `(receive ,vars ,initializer ,cont)))))
149       (car bindings))))
150
151
152                        ; assoc-primitives with a default clause
153                        ; If the search in the assoc list fails, the
154                        ; default action argument is returned. If this
155                        ; default action turns out to be a thunk,
156                        ; the result of its evaluation is returned.
157                        ; If the default action is not given, an error
158                        ; is signaled
159
160(define-macro (assq-def key alist . default-action-arg)
161  (let ((default-action
162        (if (null? default-action-arg)
163          `(error "failed to assq key '" ,key "' in a list " ,alist)
164          (let ((defact-symb (car default-action-arg)))
165            `(if (procedure? ,defact-symb) (,defact-symb) ,defact-symb)))))
166    `(or (assq ,key ,alist) ,default-action)))
167
168(define-macro (assv-def key alist . default-action-arg)
169  (let ((default-action
170        (if (null? default-action-arg)
171          `(error "failed to assv key '" ,key "' in a list " ,alist)
172          (let ((defact-symb (car default-action-arg)))
173            `(if (procedure? ,defact-symb) (,defact-symb) ,defact-symb)))))
174    `(or (assv ,key ,alist) ,default-action)))
175
176(define-macro (assoc-def key alist . default-action-arg)
177  (let ((default-action
178        (if (null? default-action-arg)
179          `(error "failed to assoc key '" ,key "' in a list " ,alist)
180          (let ((defact-symb (car default-action-arg)))
181            `(if (procedure? ,defact-symb) (,defact-symb) ,defact-symb)))))
182    `(or (assoc ,key ,alist) ,default-action)))
183
184
185; Convenience macros to avoid quoting of symbols
186; being deposited/looked up in the environment
187(define-macro (env.find key) `(%%env.find ',key))
188(define-macro (env.demand key) `(%%env.demand ',key))
189(define-macro (env.bind key value) `(%%env.bind ',key ,value))
190
191; This code provides informative error messages
192;   for SSAX (S)XML parser.
193;
194;
195; NOTE: Chicken-specific !
196; It was tested with SSAX version 4.5
197;
198; $Id: ssax-core.scm,v 1.2 2004/05/31 00:05:04 flw Exp $
199
200
201
202;****************************************************************************
203;                       My Scheme misc utility functions
204;               (mainly dealing with string and list manipulations)
205;
206; $Id: ssax-core.scm,v 1.2 2004/05/31 00:05:04 flw Exp $
207
208;(include "myenv.scm") ; include target dependent stuff
209
210
211;------------------------------------------------------------------------
212;                               Iterator ANY?
213;
214; -- procedure+: any? PRED COLLECTION
215;       Searches for the first element in the collection satisfying a
216;       given predicate
217;       That is, the procedure applies PRED to every element of the
218;       COLLECTION in turn.
219;       The first element for which PRED returns non-#f stops the iteration;
220;       the value of the predicate is returned.
221;       If none of the elements of the COLLECTION satisfy the predicate,
222;       the return value from the procedure is #f
223;       COLLECTION can be a list, a vector, a string, or an input port.
224
225(define (any? <pred?> coll)
226  (cond
227    ((list? coll)
228      (let loop ((curr-l coll))
229        (if (null? curr-l) #f
230          (or (<pred?> (car curr-l)) (loop (cdr curr-l))))))
231         
232    ((vector? coll)
233      (let ((len (vector-length coll)))
234       (let loop ((i 0))
235        (if (>= i len) #f
236          (or (<pred?> (vector-ref coll i)) (loop (+ 1 i)))))))
237
238    ((string? coll)
239      (let ((len (string-length coll)))
240       (let loop ((i 0))
241        (if (>= i len) #f
242          (or (<pred?> (string-ref coll i)) (loop (+ 1 i)))))))
243
244    ((input-port? coll)
245      (let loop ((c (read-char coll)))
246        (if (eof-object? c) #f
247          (or (<pred?> c) (loop (read-char coll))))))
248
249    (else (error "any? on an invalid collection"))))
250
251(define (test-any?)
252
253  (define (test-driver pred? coll expected-result)
254    (let ((res (any? pred? coll)))
255      (if (not (eqv? res expected-result))
256        (error "computed result " res "differs from the expected one "
257               expected-result))))
258  (define (eq-a? x) (if (char=? x #\a) x #f))
259  (define (gt1? x) (if (> x 1) x #f))
260 
261  (cout "finding an element in a list" nl)
262  (test-driver gt1? '(1 2 3 4 5) 2)
263  (test-driver gt1? '(1 1 1 1 1) #f)
264  (test-driver gt1? '(4 1 1 1 1) 4)
265  (test-driver gt1? '(4 5 6 1 9) 4)
266  (test-driver gt1? '(-4 -5 -6 1 9) 9)
267  (test-driver eq-a? '(#\b #\c #\a #\k) #\a)
268 
269  (cout "finding an element in a vector" nl)
270  (test-driver gt1? '#(1 2 3 4 5) 2)
271  (test-driver gt1? '#(1 1 1 1 1) #f)
272  (test-driver gt1? '#(4 1 1 1 1) 4)
273  (test-driver gt1? '#(4 5 6 1 9) 4)
274  (test-driver gt1? '#(-4 -5 -6 1 9) 9)
275  (test-driver eq-a? '#(#\b #\c #\a #\k) #\a)
276 
277  (cout "done" nl nl)
278)
279
280
281
282;------------------------------------------------------------------------
283;               Some list manipulation functions
284
285; -- procedure+: list-intersperse SRC-L ELEM
286; inserts ELEM between elements of the SRC-L, returning a freshly allocated
287; list (cells, that is)
288     
289(define (list-intersperse src-l elem)
290  (if (null? src-l) src-l
291    (let loop ((l (cdr src-l)) (dest (cons (car src-l) '())))
292      (if (null? l) (reverse dest)
293        (loop (cdr l) (cons (car l) (cons elem dest)))))))
294
295
296; -- procedure+: list-intersperse! SRC-L ELEM
297; inserts ELEM between elements of the SRC-L inplace
298     
299(define (list-intersperse! src-l elem)
300  (if (null? src-l) src-l
301    (let loop ((l src-l))
302      (let ((next-l (cdr l)))
303        (if (null? next-l) src-l
304          (begin
305            (set-cdr! l (cons elem next-l))
306            (loop next-l)))))))
307
308
309        ; List-tail-difference: given two lists, list1 and list2 where
310        ; list2 is presumably a tail of list1, return
311        ; a (freshly allocated) list which is a difference between list1
312        ; and list2. If list2 is *not* a tail of list1, the entire list1
313        ; is returned.
314(define (list-tail-diff list1 list2)
315  (let loop ((l1-curr list1) (difference '()))
316    (cond
317      ((eq? l1-curr list2) (reverse difference))
318      ((null? l1-curr) (reverse difference))
319      (else (loop (cdr l1-curr) (cons (car l1-curr) difference))))))
320
321
322;------------------------------------------------------------------------
323
324
325                ; Return the index of the first occurence of a-char in str, or #f
326(define (string-index str a-char)
327  (let loop ((pos 0))
328    (cond
329      ((>= pos (string-length str)) #f) ; whole string has been searched, in vain
330      ((char=? a-char (string-ref str pos)) pos)
331      (else (loop (+ 1 pos))))))
332
333                ; Return the index of the last occurence of a-char in str, or #f
334(define (string-rindex str a-char)
335  (let loop ((pos (-- (string-length str))))
336    (cond
337      ((negative? pos) #f)              ; whole string has been searched, in vain
338      ((char=? a-char (string-ref str pos)) pos)
339      (else (loop (-- pos))))))
340
341
342
343;
344; -- procedure+: substring? PATTERN STRING
345;     Searches STRING to see if it contains the substring PATTERN.
346;     Returns the index of the first substring of STRING that is equal
347;     to PATTERN; or `#f' if STRING does not contain PATTERN.
348;
349;          (substring? "rat" "pirate")             =>  2
350;          (substring? "rat" "outrage")            =>  #f
351;          (substring? "" any-string)              =>  0
352
353(define (substring? pattern str)
354  (let* ((pat-len (string-length pattern))
355         (search-span (- (string-length str) pat-len))
356         (c1 (if (zero? pat-len) #f (string-ref pattern 0)))
357         (c2 (if (<= pat-len 1) #f (string-ref pattern 1))))
358    (cond
359     ((not c1) 0)           ; empty pattern, matches upfront
360     ((not c2) (string-index str c1)) ; one-char pattern
361     (else                  ; matching a pattern of at least two chars
362        (let outer ((pos 0))
363          (cond
364            ((> pos search-span) #f)    ; nothing was found thru the whole str
365            ((not (char=? c1 (string-ref str pos)))
366                (outer (+ 1 pos)))      ; keep looking for the right beginning
367            ((not (char=? c2 (string-ref str (+ 1 pos))))
368                (outer (+ 1 pos)))      ; could've done pos+2 if c1 == c2....
369            (else                       ; two char matched: high probability
370                                        ; the rest will match too
371                (let inner ((i-pat 2) (i-str (+ 2 pos)))
372                   (if (>= i-pat pat-len) pos ; whole pattern matched
373                      (if (char=? (string-ref pattern i-pat)
374                                  (string-ref str i-str))
375                        (inner (+ 1 i-pat) (+ 1 i-str))
376                        (outer (+ 1 pos))))))))))))     ; mismatch after partial match
377
378; Here are some specialized substring? functions
379;
380; -- procedure+: string-prefix? PATTERN STRING
381; -- procedure+: string-prefix-ci? PATTERN STRING
382; checks to make sure that PATTERN is a prefix of STRING
383;
384;          (string-prefix? "pir" "pirate")             =>  #t
385;          (string-prefix? "rat" "outrage")            =>  #f
386;          (string-prefix? "" any-string)              =>  #t
387;          (string-prefix? any-string any-string)      =>  #t
388
389(define (string-prefix? pattern str)
390  (let loop ((i 0))
391    (cond
392      ((>= i (string-length pattern)) #t)
393      ((>= i (string-length str)) #f)
394      ((char=? (string-ref pattern i) (string-ref str i))
395        (loop (++ i)))
396      (else #f))))
397
398(define (string-prefix-ci? pattern str)
399  (let loop ((i 0))
400    (cond
401      ((>= i (string-length pattern)) #t)
402      ((>= i (string-length str)) #f)
403      ((char-ci=? (string-ref pattern i) (string-ref str i))
404        (loop (++ i)))
405      (else #f))))
406
407; -- procedure+: string-suffix? PATTERN STRING
408; -- procedure+: string-suffix-ci? PATTERN STRING
409; checks to make sure that PATTERN is a suffix of STRING
410;
411;          (string-suffix? "ate" "pirate")             =>  #t
412;          (string-suffix? "rag" "outrage")            =>  #f
413;          (string-suffix? "" any-string)              =>  #t
414;          (string-suffix? any-string any-string)      =>  #t
415
416(define (string-suffix? pattern str)
417  (let loop ((i (-- (string-length pattern))) (j (-- (string-length str))))
418    (cond
419      ((negative? i) #t)
420      ((negative? j) #f)
421      ((char=? (string-ref pattern i) (string-ref str j))
422        (loop (-- i) (-- j)))
423      (else #f))))
424
425(define (string-suffix-ci? pattern str)
426  (let loop ((i (-- (string-length pattern))) (j (-- (string-length str))))
427    (cond
428      ((negative? i) #t)
429      ((negative? j) #f)
430      ((char-ci=? (string-ref pattern i) (string-ref str j))
431        (loop (-- i) (-- j)))
432      (else #f))))
433
434
435;               String case modification functions
436
437                        ; Return a new string made of characters of the
438                        ; original string in the lower case
439(define (string-downcase str)
440  (do ((target-str (make-string (string-length str))) (i 0 (++ i)))
441      ((>= i (string-length str)) target-str)
442      (string-set! target-str i (char-downcase (string-ref str i)))))
443
444                        ; Return a new string made of characters of the
445                        ; original string in the upper case
446(define (string-upcase str)
447  (do ((target-str (make-string (string-length str))) (i 0 (++ i)))
448      ((>= i (string-length str)) target-str)
449      (string-set! target-str i (char-upcase (string-ref str i)))))
450
451                        ; Lower the case of string's characters inplace
452(define (string-downcase! str)
453  (do ((i 0 (++ i))) ((>= i (string-length str)))
454    (string-set! str i (char-downcase (string-ref str i)))))
455
456                        ; Raise the case of string's characters inplace
457(define (string-upcase! str)
458  (do ((i 0 (++ i))) ((>= i (string-length str)))
459    (string-set! str i (char-upcase (string-ref str i)))))
460
461
462;
463; -- procedure+: string->integer STR START END
464;
465; Makes sure a substring of the STR from START (inclusive) till END (exclusive)
466; is a representation of a non-negative integer in decimal notation. If so, this
467; integer is returned. Otherwise -- when the substring contains non-decimal
468; characters, or when the range from START till END is not within STR, the
469; result is #f.
470;
471; This procedure is a simplification of the standard string->number.
472; The latter is far more generic: for example, it will try to read
473; strings like "1/2" "1S2" "1.34" and even "1/0" (the latter causing
474; a zero-divide error). Note that to string->number,  "1S2" is a valid
475; representation of an _inexact_ integer (100 to be precise).
476; Oftentimes we want to be more restrictive about what we consider a number; we
477; want merely to read an integral label.
478
479(define (string->integer str start end)
480  (and (< -1 start end (++ (string-length str)))
481    (let loop ((pos start) (accum 0))
482      (cond
483        ((>= pos end) accum)
484        ((char-numeric? (string-ref str pos))
485          (loop (++ pos) (+ (char->integer (string-ref str pos)) 
486              (- (char->integer #\0)) (* 10 accum))))
487        (else #f)))))
488
489
490;
491; -- procedure+: string-split STRING
492; -- procedure+: string-split STRING '()
493; -- procedure+: string-split STRING '() MAXSPLIT
494;
495; Returns a list of whitespace delimited words in STRING.
496; If STRING is empty or contains only whitespace, then the empty list
497; is returned. Leading and trailing whitespaces are trimmed.
498; If MAXSPLIT is specified and positive, the resulting list will contain at most
499; MAXSPLIT elements, the last of which is the string remaining after
500; (MAXSPLIT - 1) splits. If MAXSPLIT is specified and non-positive,
501; the empty list is returned. "In time critical applications it behooves
502; you not to split into more fields than you really need."
503;
504; -- procedure+: string-split STRING CHARSET
505; -- procedure+: string-split STRING CHARSET MAXSPLIT
506;
507; Returns a list of words delimited by the characters in CHARSET in
508; STRING. CHARSET is a list of characters that are treated as delimiters.
509; Leading or trailing delimeters are NOT trimmed. That is, the resulting
510; list will have as many initial empty string elements as there are
511; leading delimiters in STRING.
512;
513; If MAXSPLIT is specified and positive, the resulting list will contain at most
514; MAXSPLIT elements, the last of which is the string remaining after
515; (MAXSPLIT - 1) splits. If MAXSPLIT is specified and non-positive,
516; the empty list is returned. "In time critical applications it behooves
517; you not to split into more fields than you really need."
518;
519; This is based on the split function in Python/Perl
520;
521; (string-split " abc d e f  ") ==> ("abc" "d" "e" "f")
522; (string-split " abc d e f  " '() 1) ==> ("abc d e f  ")
523; (string-split " abc d e f  " '() 0) ==> ()
524; (string-split ":abc:d:e::f:" '(#\:)) ==> ("" "abc" "d" "e" "" "f" "")
525; (string-split ":" '(#\:)) ==> ("" "")
526; (string-split "root:x:0:0:Lord" '(#\:) 2) ==> ("root" "x:0:0:Lord")
527; (string-split "/usr/local/bin:/usr/bin:/usr/ucb/bin" '(#\:))
528; ==> ("/usr/local/bin" "/usr/bin" "/usr/ucb/bin")
529; (string-split "/usr/local/bin" '(#\/)) ==> ("" "usr" "local" "bin")
530
531(define (string-split str . rest)
532                ; maxsplit is a positive number
533  (define (split-by-whitespace str maxsplit)
534    (define (skip-ws i yet-to-split-count)
535      (cond
536        ((>= i (string-length str)) '())
537        ((char-whitespace? (string-ref str i))
538          (skip-ws (++ i) yet-to-split-count))
539        (else (scan-beg-word (++ i) i yet-to-split-count))))
540    (define (scan-beg-word i from yet-to-split-count)
541      (cond
542        ((zero? yet-to-split-count)
543          (cons (substring str from (string-length str)) '()))
544        (else (scan-word i from yet-to-split-count))))
545    (define (scan-word i from yet-to-split-count)
546      (cond
547        ((>= i (string-length str))
548          (cons (substring str from i) '()))
549        ((char-whitespace? (string-ref str i))
550          (cons (substring str from i) 
551            (skip-ws (++ i) (-- yet-to-split-count))))
552        (else (scan-word (++ i) from yet-to-split-count))))
553    (skip-ws 0 (-- maxsplit)))
554
555                ; maxsplit is a positive number
556                ; str is not empty
557  (define (split-by-charset str delimeters maxsplit)
558    (define (scan-beg-word from yet-to-split-count)
559      (cond
560        ((>= from (string-length str)) '(""))
561        ((zero? yet-to-split-count)
562          (cons (substring str from (string-length str)) '()))
563        (else (scan-word from from yet-to-split-count))))
564    (define (scan-word i from yet-to-split-count)
565      (cond
566        ((>= i (string-length str))
567          (cons (substring str from i) '()))
568        ((memq (string-ref str i) delimeters)
569          (cons (substring str from i) 
570            (scan-beg-word (++ i) (-- yet-to-split-count))))
571        (else (scan-word (++ i) from yet-to-split-count))))
572    (scan-beg-word 0 (-- maxsplit)))
573
574                        ; resolver of overloading...
575                        ; if omitted, maxsplit defaults to
576                        ; (++ (string-length str))
577  (if (string-null? str) '()
578    (if (null? rest) 
579      (split-by-whitespace str (++ (string-length str)))
580      (let ((charset (car rest))
581          (maxsplit
582            (if (pair? (cdr rest)) (cadr rest) (++ (string-length str)))))
583        (cond
584          ((not (positive? maxsplit)) '())
585          ((null? charset) (split-by-whitespace str maxsplit))
586          (else (split-by-charset str charset maxsplit))))))
587)
588
589;------------------------------------------------------------------------
590;                       EXEC-PATH facility
591; The EXEC-PATH is the path used to load .scm files or run executable files
592; with.
593;
594; The path could be specified as an env-variable SCMPATH
595; if this env variable isn't set, the path of the executable file
596; is used
597
598; (define EXEC-PATH:PATH
599
600;       ; extract the path from argv[0] and make it absolute
601;   (letrec ((extract-argv0-path
602;         (lambda ()
603;           (let ((argv0-rel-path
604;                 (let ((argv0 (vector-ref argv 0)))
605;                       ; reverse search for the last PATH-SEPARATOR-CHAR
606;                   (let loop ((i (-- (string-length argv0))))
607;                     (cond
608;                       ((not (positive? i)) #f)
609;                       ((char=? (string-ref argv0 i) PATH-SEPARATOR-CHAR)
610;                         (substring argv0 0 (+ i 1)))
611;                       (else (loop (-- i)))))))
612;               (curr-path (OS:getcwd)))
613;             (cond
614;               ((not argv0-rel-path) curr-path)
615;                       ; check if argv0-rel-path was an absolute path
616;               ((char=? PATH-SEPARATOR-CHAR (string-ref argv0-rel-path 0))
617;                 argv0-rel-path)
618;               (else (string-append curr-path (string PATH-SEPARATOR-CHAR)
619;                   argv0-rel-path))))))
620             
621;               ; cached path
622;       (path-itself #f)
623;                       ; returned the cached path
624;       (get-path (lambda () path-itself))
625     
626;                       ; Compute the cached path, runs only the first time around
627;       (set-path
628;         (lambda ()
629;           (let ((path-being-computed
630;                 (or (OS:getenv "SCMPATH") (extract-argv0-path))))
631;             (assert path-being-computed)
632;             (set! path-itself
633;               (if (char=? PATH-SEPARATOR-CHAR
634;                   (string-ref path-being-computed
635;                     (-- (string-length path-being-computed))))
636;                 path-being-computed
637;            ; make sure path-being-computed ends in PATH-SEPARATOR-CHAR
638;                 (string-append path-being-computed (string PATH-SEPARATOR-CHAR))))
639;             (cerr nl "EXEC PATH is set to " path-itself nl)
640;             (set! EXEC-PATH:PATH get-path)
641;             (get-path)))))
642   
643;     set-path))
644
645   
646
647; (define (EXEC-PATH:load scm-file-name)
648;   (load (string-append (EXEC-PATH:PATH) scm-file-name)))
649
650; (define (EXEC-PATH:help)
651;   (cerr nl "Environment variable SCMPATH could be set up to point" nl)
652;   (cerr "to a directory containing dynamically loadable dictionaries/conf files" nl)
653;   (cerr "if this variable is not set, the directory where this executable resides" nl)
654;   (cerr "will be used" nl nl)
655;   (cerr "The current path is set to " (EXEC-PATH:PATH) nl nl))
656
657; ; "shell-out" to run a separately compiled executable with given
658; ; arguments, from the current EXEC-PATH
659; ; all 'args' (which must be strings) are "concatenated" together
660; ; to form a command-line for the executable
661; (define (EXEC-PATH:system executable . args)
662;   (let ((command-line
663;         (apply string-append
664;           (cons (EXEC-PATH:PATH) (cons executable (cons " " args))))))
665;     (cerr "about to execute: " command-line nl)
666;     (OS:system command-line)))
667
668;****************************************************************************
669;                       Simple Parsing of input
670;
671; The following simple functions surprisingly often suffice to parse
672; an input stream. They either skip, or build and return tokens,
673; according to inclusion or delimiting semantics. The list of
674; characters to expect, include, or to break at may vary from one
675; invocation of a function to another. This allows the functions to
676; easily parse even context-sensitive languages.
677;
678; EOF is generally frowned on, and thrown up upon if encountered.
679; Exceptions are mentioned specifically. The list of expected characters
680; (characters to skip until, or break-characters) may include an EOF
681; "character", which is to be coded as symbol *eof*
682;
683; The input stream to parse is specified as a PORT, which is usually
684; the last (and optional) argument. It defaults to the current input
685; port if omitted.
686;
687; IMPORT
688; This package relies on a function parser-error, which must be defined
689; by a user of the package. The function has the following signature:
690;       parser-error PORT MESSAGE SPECIALISING-MSG*
691; Many procedures of this package call parser-error to report a parsing
692; error.  The first argument is a port, which typically points to the
693; offending character or its neighborhood. Most of the Scheme systems
694; let the user query a PORT for the current position. MESSAGE is the
695; description of the error. Other arguments supply more details about
696; the problem.
697;
698; $Id: ssax-core.scm,v 1.2 2004/05/31 00:05:04 flw Exp $
699
700;(include "myenv.scm") ; include target dependent stuff
701
702;------------------------------------------------------------------------
703;                    Preparation and tuning section
704
705; This package is heavily used. Therefore, we take time to tune it in,
706; in particular for Gambit.
707
708
709; Concise and efficient definition of a function that takes one or two
710; optional arguments, e.g.,
711;
712; (define-opt (foo arg1 arg2 (optional (arg3 init3) (arg4 init4))) body)
713;
714; define-opt is identical to a regular define, with one exception: the
715; last argument may have a form
716;       (optional (binding init) ... )
717
718(cond-expand
719 ((or bigloo gambit)
720
721    ; For Gambit and Bigloo, which support DSSSL extended lambdas,
722    ; define-opt like the one in the example above is re-written into
723    ; (define-opt (foo arg1 arg2 #!optional (arg3 init3) (arg4 init4)) body)
724  (define-macro (define-opt bindings body . body-rest)
725    (let* ((rev-bindings (reverse bindings))
726           (opt-bindings
727            (and (pair? rev-bindings) (pair? (car rev-bindings))
728                 (eq? 'optional (caar rev-bindings))
729                 (cdar rev-bindings))))
730      (if opt-bindings
731        `(define ,(append (reverse
732                           (cons (with-input-from-string "#!optional" read)
733                                 (cdr rev-bindings)))
734                          opt-bindings)
735           ,body ,@body-rest)
736        `(define ,bindings ,body ,@body-rest))))
737  )
738 (else
739
740    ; For Scheme systems without DSSSL extensions, we rewrite the definition
741    ; of foo of the example above into the following:
742    ;   (define (foo arg1 arg2 . rest)
743    ;      (let* ((arg3 (if (null? rest) init3 (car rest)))
744    ;             (arg4 (if (or (null? rest) (null? (cdr rest))) init4
745    ;                       (cadr rest)))
746    ;        body))
747    ; We won't handle more than two optional arguments
748
749  (define-macro define-opt (lambda (bindings body . body-rest)
750    (let* ((rev-bindings (reverse bindings))
751           (opt-bindings
752            (and (pair? rev-bindings) (pair? (car rev-bindings))
753                 (eq? 'optional (caar rev-bindings))
754                 (cdar rev-bindings))))
755      (cond
756       ((not opt-bindings)              ; No optional arguments
757        `(define ,bindings ,body ,@body-rest))
758       ((null? opt-bindings)
759        `(define ,bindings ,body ,@body-rest))
760       ((or (null? (cdr opt-bindings)) (null? (cddr opt-bindings)))
761        (let* ((rest (gensym))          ; One or two optional args
762               (first-opt (car opt-bindings))
763               (second-opt (and (pair? (cdr opt-bindings))
764                                (cadr opt-bindings))))
765          `(define ,(let loop ((bindings bindings))
766                      (if (null? (cdr bindings)) rest
767                          (cons (car bindings) (loop (cdr bindings)))))
768             (let* ((,(car first-opt) (if (null? ,rest)
769                                          ,(cadr first-opt)
770                                          (car ,rest)))
771                    ,@(if second-opt
772                          `((,(car second-opt) 
773                             (if (or (null? ,rest) (null? (cdr ,rest)))
774                                 ,(cadr second-opt)
775                                 (cadr ,rest))))
776                          '()))
777               ,body ,@body-rest))))
778       (else
779        '(error "At most two options are supported"))))))
780  ))
781
782(cond-expand
783 (gambit
784      ; The following macro makes a macro that turns (read-char port)
785      ; into (##read-char port). We can't enter such a macro-converter
786      ; directly as readers of SCM and Bigloo, for ones, don't like
787      ; identifiers with two leading # characters
788   (define-macro (gambitize clause)
789     `(define-macro ,clause
790        ,(list 'quasiquote
791            (cons
792             (string->symbol (string-append "##"
793                                            (symbol->string (car clause))))
794             (map (lambda (id) (list 'unquote id)) (cdr clause))))))
795   (gambitize (read-char port))
796   (gambitize (peek-char port))
797   (gambitize (eof-object? port))
798   ;(gambitize (string-append a b))
799   )
800 (chicken
801   (define-macro (read-char port) `(##sys#read-char-0 ,port))
802   (define-macro (peek-char port) `(##sys#peek-char-0 ,port))
803   )
804 (else #t))
805
806
807
808;------------------------------------------------------------------------
809
810; -- procedure+: peek-next-char [PORT]
811;       advances to the next character in the PORT and peeks at it.
812;       This function is useful when parsing LR(1)-type languages
813;       (one-char-read-ahead).
814;       The optional argument PORT defaults to the current input port.
815
816(define-opt (peek-next-char (optional (port (current-input-port))))
817  (read-char port) 
818  (peek-char port)) 
819
820
821;------------------------------------------------------------------------
822
823; -- procedure+: assert-curr-char CHAR-LIST STRING [PORT]
824;       Reads a character from the PORT and looks it up
825;       in the CHAR-LIST of expected characters
826;       If the read character was found among expected, it is returned
827;       Otherwise, the procedure writes a nasty message using STRING
828;       as a comment, and quits.
829;       The optional argument PORT defaults to the current input port.
830;
831(define-opt (assert-curr-char expected-chars comment
832                              (optional (port (current-input-port))))
833  (let ((c (read-char port)))
834    (if (memq c expected-chars) c
835    (parser-error port "Wrong character " c
836           " (0x" (if (eof-object? c) "*eof*"
837                    (number->string (char->integer c) 16)) ") "
838           comment ". " expected-chars " expected"))))
839           
840
841; -- procedure+: skip-until CHAR-LIST [PORT]
842;       Reads and skips characters from the PORT until one of the break
843;       characters is encountered. This break character is returned.
844;       The break characters are specified as the CHAR-LIST. This list
845;       may include EOF, which is to be coded as a symbol *eof*
846;
847; -- procedure+: skip-until NUMBER [PORT]
848;       Skips the specified NUMBER of characters from the PORT and returns #f
849;
850;       The optional argument PORT defaults to the current input port.
851
852
853(define-opt (skip-until arg (optional (port (current-input-port))) )
854  (cond
855   ((number? arg)               ; skip 'arg' characters
856      (do ((i arg (-- i)))
857          ((<= i 0) #f)
858          (if (eof-object? (read-char port))
859            (parser-error port "Unexpected EOF while skipping "
860                         arg " characters"))))
861   (else                        ; skip until break-chars (=arg)
862     (let loop ((c (read-char port)))
863       (cond
864         ((memv c arg) c)
865         ((eof-object? c)
866           (if (memv '*eof* arg) c
867             (parser-error port "Unexpected EOF while skipping until " arg)))
868         (else (loop (read-char port))))))))
869
870
871; -- procedure+: skip-while CHAR-LIST [PORT]
872;       Reads characters from the PORT and disregards them,
873;       as long as they are mentioned in the CHAR-LIST.
874;       The first character (which may be EOF) peeked from the stream
875;       that is NOT a member of the CHAR-LIST is returned. This character
876;       is left on the stream.
877;       The optional argument PORT defaults to the current input port.
878
879(define-opt (skip-while skip-chars (optional (port (current-input-port))) )
880  (do ((c (peek-char port) (peek-char port)))
881      ((not (memv c skip-chars)) c)
882      (read-char port)))
883 
884; whitespace const
885
886;------------------------------------------------------------------------
887;                               Stream tokenizers
888
889
890; -- procedure+:
891;    next-token PREFIX-CHAR-LIST BREAK-CHAR-LIST [COMMENT-STRING] [PORT]
892;       skips any number of the prefix characters (members of the
893;       PREFIX-CHAR-LIST), if any, and reads the sequence of characters
894;       up to (but not including) a break character, one of the
895;       BREAK-CHAR-LIST.
896;       The string of characters thus read is returned.
897;       The break character is left on the input stream
898;       The list of break characters may include EOF, which is to be coded as
899;       a symbol *eof*. Otherwise, EOF is fatal, generating an error message
900;       including a specified COMMENT-STRING (if any)
901;
902;       The optional argument PORT defaults to the current input port.
903;
904; Note: since we can't tell offhand how large the token being read is
905; going to be, we make a guess, pre-allocate a string, and grow it by
906; quanta if necessary. The quantum is always the length of the string
907; before it was extended the last time. Thus the algorithm does
908; a Fibonacci-type extension, which has been proven optimal.
909; Note, explicit port specification in read-char, peek-char helps.
910
911; Procedure input-parse:init-buffer
912; returns an initial buffer for next-token* procedures.
913; The input-parse:init-buffer may allocate a new buffer per each invocation:
914;       (define (input-parse:init-buffer) (make-string 32))
915; Size 32 turns out to be fairly good, on average.
916; That policy is good only when a Scheme system is multi-threaded with
917; preemptive scheduling, or when a Scheme system supports shared substrings.
918; In all the other cases, it's better for input-parse:init-buffer to
919; return the same static buffer. next-token* functions return a copy
920; (a substring) of accumulated data, so the same buffer can be reused.
921; We shouldn't worry about new token being too large: next-token will use
922; a larger buffer automatically. Still, the best size for the static buffer
923; is to allow most of the tokens to fit in.
924; Using a static buffer _dramatically_ reduces the amount of produced garbage
925; (e.g., during XML parsing).
926(define input-parse-buffer (make-parameter #f))
927
928(define input-parse:init-buffer
929  (lambda ()
930    (or
931      (input-parse-buffer)
932      (let ((buffer (make-string 512)))
933        (input-parse-buffer buffer)
934        buffer))))
935
936(define-opt (next-token prefix-skipped-chars break-chars
937                        (optional (comment "") (port (current-input-port))) )
938  (let* ((buffer (input-parse:init-buffer))
939         (curr-buf-len (string-length buffer)) (quantum 16))
940    (let loop ((i 0) (c (skip-while prefix-skipped-chars port)))
941      (cond
942        ((memq c break-chars) (substring buffer 0 i))
943        ((eof-object? c)
944          (if (memq '*eof* break-chars)
945            (substring buffer 0 i)              ; was EOF expected?
946            (parser-error port "EOF while reading a token " comment)))
947        (else
948          (if (>= i curr-buf-len)       ; make space for i-th char in buffer
949            (begin                      ; -> grow the buffer by the quantum
950              (set! buffer (string-append buffer (make-string quantum)))
951              (set! quantum curr-buf-len)
952              (set! curr-buf-len (string-length buffer))))
953          (string-set! buffer i c)
954          (read-char port)                      ; move to the next char
955          (loop (++ i) (peek-char port))
956          )))))
957       
958
959; Another version of next-token, accumulating characters in a list rather
960; than in a string buffer. I heard that it tends to work faster.
961; In reality, it works just as fast as the string buffer version above,
962; but it allocates 50% more memory and thus has to run garbage collection
963; 50% as many times. See next-token-comp.scm
964
965(define-opt (next-token-list-based prefix-skipped-chars break-chars
966                  (optional (comment "") (port (current-input-port))) )
967  (let* ((first-char (skip-while prefix-skipped-chars port))
968         (accum-chars (cons first-char '())))
969    (cond
970      ((eof-object? first-char)
971        (if (memq '*eof* break-chars) ""
972          (parser-error port "EOF while skipping before reading token "
973                       comment)))
974      ((memq first-char break-chars) "")
975      (else
976        (read-char port)                ; consume the first-char
977        (let loop ((tail accum-chars) (c (peek-char port)))
978          (cond
979            ((memq c break-chars) (list->string accum-chars))
980            ((eof-object? c)
981              (if (memq '*eof* break-chars)
982                (list->string accum-chars)              ; was EOF expected?
983                (parser-error port "EOF while reading a token " comment)))
984            (else
985              (read-char port)          ; move to the next char
986              (set-cdr! tail (cons c '()))
987              (loop (cdr tail) (peek-char port))
988        )))))))
989
990
991; -- procedure+: next-token-of INC-CHARSET [PORT]
992;       Reads characters from the PORT that belong to the list of characters
993;       INC-CHARSET. The reading stops at the first character which is not
994;       a member of the set. This character is left on the stream.
995;       All the read characters are returned in a string.
996;
997; -- procedure+: next-token-of PRED [PORT]
998;       Reads characters from the PORT for which PRED (a procedure of one
999;       argument) returns non-#f. The reading stops at the first character
1000;       for which PRED returns #f. That character is left on the stream.
1001;       All the results of evaluating of PRED up to #f are returned in a
1002;       string.
1003;
1004;       PRED is a procedure that takes one argument (a character
1005;       or the EOF object) and returns a character or #f. The returned
1006;       character does not have to be the same as the input argument
1007;       to the PRED. For example,
1008;       (next-token-of (lambda (c)
1009;                         (cond ((eof-object? c) #f)
1010;                               ((char-alphabetic? c) (char-downcase c))
1011;                               (else #f))))
1012;       will try to read an alphabetic token from the current
1013;       input port, and return it in lower case.
1014;
1015;       The optional argument PORT defaults to the current input port.
1016;
1017; Note: since we can't tell offhand how large the token being read is
1018; going to be, we make a guess, pre-allocate a string, and grow it by
1019; quanta if necessary. The quantum is always the length of the string
1020; before it was extended the last time. Thus the algorithm does
1021; a Fibonacci-type extension, which has been proven optimal.
1022;
1023; This procedure is similar to next-token but only it implements
1024; an inclusion rather than delimiting semantics.
1025
1026(define-opt (next-token-of incl-list/pred
1027                           (optional (port (current-input-port))) )
1028  (let* ((buffer (input-parse:init-buffer))
1029         (curr-buf-len (string-length buffer)) (quantum 16))
1030  (if (procedure? incl-list/pred)
1031    (let loop ((i 0) (c (peek-char port)))
1032      (cond
1033        ((incl-list/pred c) =>
1034          (lambda (c)
1035            (if (>= i curr-buf-len)     ; make space for i-th char in buffer
1036              (begin                    ; -> grow the buffer by the quantum
1037                (set! buffer (string-append buffer (make-string quantum)))
1038                (set! quantum curr-buf-len)
1039                (set! curr-buf-len (string-length buffer))))
1040            (string-set! buffer i c)
1041            (read-char port)                    ; move to the next char
1042            (loop (++ i) (peek-char port))))
1043        (else (substring buffer 0 i))))
1044                        ; incl-list/pred is a list of allowed characters
1045    (let loop ((i 0) (c (peek-char port)))
1046      (cond
1047        ((not (memq c incl-list/pred)) (substring buffer 0 i))
1048        (else
1049          (if (>= i curr-buf-len)       ; make space for i-th char in buffer
1050            (begin                      ; -> grow the buffer by the quantum
1051              (set! buffer (string-append buffer (make-string quantum)))
1052              (set! quantum curr-buf-len)
1053              (set! curr-buf-len (string-length buffer))))
1054          (string-set! buffer i c)
1055          (read-char port)                      ; move to the next char
1056          (loop (++ i) (peek-char port))
1057          ))))))
1058
1059
1060; -- procedure+: read-line [PORT]
1061;       Reads one line of text from the PORT, and returns it as a string.
1062;       A line is a (possibly empty) sequence of characters terminated
1063;       by CR, CRLF or LF (or even the end of file).
1064;       The terminating character (or CRLF combination) is removed from
1065;       the input stream. The terminating character(s) is not a part
1066;       of the return string either.
1067;       If EOF is encountered before any character is read, the return
1068;       value is EOF.
1069;
1070;       The optional argument PORT defaults to the current input port.
1071
1072(define-opt (read-line (optional (port (current-input-port))) )
1073  (if (eof-object? (peek-char port)) (peek-char port)
1074    (let* ((line
1075             (next-token '() '(#\newline #\return *eof*)
1076                         "reading a line" port))
1077           (c (read-char port)))        ; must be either \n or \r or EOF
1078       (and (eq? c #\return) (eq? (peek-char port) #\newline)
1079         (read-char port))                      ; skip \n that follows \r
1080       line)))
1081
1082
1083; -- procedure+: read-string N [PORT]
1084;       Reads N characters from the PORT, and  returns them in a string.
1085;       If EOF is encountered before N characters are read, a shorter string
1086;       will be returned.
1087;       If N is not positive, an empty string will be returned.
1088;       The optional argument PORT defaults to the current input port.
1089
1090(define-opt (read-string n (optional (port (current-input-port))) )
1091  (if (not (positive? n)) ""
1092    (let ((buffer (make-string n)))
1093      (let loop ((i 0) (c (read-char port)))
1094        (if (eof-object? c) (substring buffer 0 i)
1095          (let ((i1 (++ i)))
1096            (string-set! buffer i c)
1097            (if (= i1 n) buffer
1098              (loop i1 (read-char port)))))))))
1099
1100
1101; -- Function: find-string-from-port? STR IN-PORT MAX-NO-CHARS
1102;    Looks for a string STR within the first MAX-NO-CHARS chars of the
1103;    input port IN-PORT
1104;    MAX-NO-CHARS may be omitted: in that case, the search span would be
1105;    limited only by the end of the input stream.
1106;    When the STR is found, the function returns the number of
1107;    characters it has read from the port, and the port is set
1108;    to read the first char after that (that is, after the STR)
1109;    The function returns #f when the string wasn't found
1110; Note the function reads the port *STRICTLY* sequentially, and does not
1111; perform any buffering. So the function can be used even if the port is open
1112; on a pipe or other communication channel.
1113;
1114; Probably can be classified as misc-io.
1115;
1116; Notes on the algorithm.
1117; A special care should be taken in a situation when one had achieved a partial
1118; match with (a head of) STR, and then some unexpected character appeared in
1119; the stream. It'll be rash to discard all already read characters. Consider
1120; an example of string "acab" and the stream "bacacab...", specifically when
1121;    a  c  a _b_
1122; b  a  c  a  c  a  b ...
1123; that is, when 'aca' had matched, but then 'c' showed up in the stream
1124; while we were looking for 'b'. In that case, discarding all already read
1125; characters and starting the matching process from scratch, that is,
1126; from 'c a b ...', would miss a certain match.
1127; Note, we don't actually need to keep already read characters, or at least
1128; strlen(str) characters in some kind of buffer. If there has been no match,
1129; we can safely discard read characters. If there was some partial match,
1130; we already know the characters before, they are in the STR itself, so
1131; we don't need a special buffer for that.
1132
1133;;; "MISCIO" Search for string from port.
1134; Written 1995 by Oleg Kiselyov (oleg@ponder.csci.unt.edu)
1135; Modified 1996 by A. Jaffer (jaffer@ai.mit.edu)
1136;
1137; This code is in the public domain.
1138
1139(define (MISCIO:find-string-from-port? str <input-port> . max-no-char)
1140  (set! max-no-char (if (null? max-no-char) #f (car max-no-char)))
1141  (letrec
1142      ((no-chars-read 0)
1143       (my-peek-char                    ; Return a peeked char or #f
1144        (lambda () (and (or (not max-no-char) (< no-chars-read max-no-char))
1145                        (let ((c (peek-char <input-port>)))
1146                          (if (eof-object? c) #f c)))))
1147       (next-char (lambda () (read-char <input-port>)
1148                          (set! no-chars-read  (+ 1 no-chars-read))))
1149       (match-1st-char                  ; of the string str
1150        (lambda ()
1151          (let ((c (my-peek-char)))
1152            (if (not c) #f
1153                (begin (next-char)
1154                       (if (char=? c (string-ref str 0))
1155                           (match-other-chars 1)
1156                           (match-1st-char)))))))
1157       ;; There has been a partial match, up to the point pos-to-match
1158       ;; (for example, str[0] has been found in the stream)
1159       ;; Now look to see if str[pos-to-match] for would be found, too
1160       (match-other-chars
1161        (lambda (pos-to-match)
1162          (if (>= pos-to-match (string-length str))
1163              no-chars-read             ; the entire string has matched
1164              (let ((c (my-peek-char)))
1165                (and c
1166                     (if (not (char=? c (string-ref str pos-to-match)))
1167                         (backtrack 1 pos-to-match)
1168                         (begin (next-char)
1169                                (match-other-chars (+ 1 pos-to-match)))))))))
1170
1171       ;; There had been a partial match, but then a wrong char showed up.
1172       ;; Before discarding previously read (and matched) characters, we check
1173       ;; to see if there was some smaller partial match. Note, characters read
1174       ;; so far (which matter) are those of str[0..matched-substr-len - 1]
1175       ;; In other words, we will check to see if there is such i>0 that
1176       ;; substr(str,0,j) = substr(str,i,matched-substr-len)
1177       ;; where j=matched-substr-len - i
1178       (backtrack
1179        (lambda (i matched-substr-len)
1180          (let ((j (- matched-substr-len i)))
1181            (if (<= j 0)
1182              (match-1st-char)  ; backed off completely to the begining of str
1183              (let loop ((k 0))
1184                (if (>= k j)
1185                   (match-other-chars j) ; there was indeed a shorter match
1186                   (if (char=? (string-ref str k)
1187                               (string-ref str (+ i k)))
1188                     (loop (+ 1 k))
1189                     (backtrack (+ 1 i) matched-substr-len))))))))
1190       )
1191    (match-1st-char)))
1192
1193(define find-string-from-port? MISCIO:find-string-from-port?)
1194
1195
1196;-----------------------------------------------------------------------------
1197;   This is a test driver for miscio:find-string-from-port?, to make sure it
1198;                       really works as intended
1199
1200; moved to vinput-parse.scm
1201;       Functional XML parsing framework: SAX/DOM and SXML parsers
1202;             with support for XML Namespaces and validation
1203;
1204; This is a package of low-to-high level lexing and parsing procedures
1205; that can be combined to yield a SAX, a DOM, a validating parsers, or
1206; a parser intended for a particular document type. The procedures in
1207; the package can be used separately to tokenize or parse various
1208; pieces of XML documents. The package supports XML Namespaces,
1209; internal and external parsed entities, user-controlled handling of
1210; whitespace, and validation. This module therefore is intended to be
1211; a framework, a set of "Lego blocks" you can use to build a parser
1212; following any discipline and performing validation to any degree. As
1213; an example of the parser construction, this file includes a
1214; semi-validating SXML parser.
1215
1216; The present XML framework has a "sequential" feel of SAX yet a
1217; "functional style" of DOM. Like a SAX parser, the framework scans
1218; the document only once and permits incremental processing. An
1219; application that handles document elements in order can run as
1220; efficiently as possible. _Unlike_ a SAX parser, the framework does
1221; not require an application register stateful callbacks and surrender
1222; control to the parser. Rather, it is the application that can drive
1223; the framework -- calling its functions to get the current lexical or
1224; syntax element. These functions do not maintain or mutate any state
1225; save the input port. Therefore, the framework permits parsing of XML
1226; in a pure functional style, with the input port being a monad (or a
1227; linear, read-once parameter).
1228
1229; Besides the PORT, there is another monad -- SEED. Most of the
1230; middle- and high-level parsers are single-threaded through the
1231; seed. The functions of this framework do not process or affect the
1232; SEED in any way: they simply pass it around as an instance of an
1233; opaque datatype.  User functions, on the other hand, can use the
1234; seed to maintain user's state, to accumulate parsing results, etc. A
1235; user can freely mix his own functions with those of the
1236; framework. On the other hand, the user may wish to instantiate a
1237; high-level parser: SSAX:make-elem-parser or SSAX:make-parser.  In
1238; the latter case, the user must provide functions of specific
1239; signatures, which are called at predictable moments during the
1240; parsing: to handle character data, element data, or processing
1241; instructions (PI). The functions are always given the SEED, among
1242; other parameters, and must return the new SEED.
1243
1244; From a functional point of view, XML parsing is a combined
1245; pre-post-order traversal of a "tree" that is the XML document
1246; itself. This down-and-up traversal tells the user about an element
1247; when its start tag is encountered. The user is notified about the
1248; element once more, after all element's children have been
1249; handled. The process of XML parsing therefore is a fold over the
1250; raw XML document. Unlike a fold over trees defined in [1], the
1251; parser is necessarily single-threaded -- obviously as elements
1252; in a text XML document are laid down sequentially. The parser
1253; therefore is a tree fold that has been transformed to accept an
1254; accumulating parameter [1,2].
1255
1256; Formally, the denotational semantics of the parser can be expressed
1257; as
1258; parser:: (Start-tag -> Seed -> Seed) ->
1259;          (Start-tag -> Seed -> Seed -> Seed) ->
1260;          (Char-Data -> Seed -> Seed) ->
1261;          XML-text-fragment -> Seed -> Seed
1262; parser fdown fup fchar "<elem attrs> content </elem>" seed
1263;  = fup "<elem attrs>" seed
1264;       (parser fdown fup fchar "content" (fdown "<elem attrs>" seed))
1265;
1266; parser fdown fup fchar "char-data content" seed
1267;  = parser fdown fup fchar "content" (fchar "char-data" seed)
1268;
1269; parser fdown fup fchar "elem-content content" seed
1270;  = parser fdown fup fchar "content" (
1271;       parser fdown fup fchar "elem-content" seed)
1272
1273; Compare the last two equations with the left fold
1274; fold-left kons elem:list seed = fold-left kons list (kons elem seed)
1275
1276; The real parser created my SSAX:make-parser is slightly more complicated,
1277; to account for processing instructions, entity references, namespaces,
1278; processing of document type declaration, etc.
1279
1280
1281; The XML standard document referred to in this module is
1282;       http://www.w3.org/TR/1998/REC-xml-19980210.html
1283;
1284; The present file also defines a procedure that parses the text of an
1285; XML document or of a separate element into SXML, an
1286; S-expression-based model of an XML Information Set. SXML is also an
1287; Abstract Syntax Tree of an XML document. SXML is similar
1288; but not identical to DOM; SXML is particularly suitable for
1289; Scheme-based XML/HTML authoring, SXPath queries, and tree
1290; transformations. See SXML.html for more details.
1291; SXML is a term implementation of evaluation of the XML document [3].
1292; The other implementation is context-passing.
1293
1294; The present frameworks fully supports the XML Namespaces Recommendation:
1295;       http://www.w3.org/TR/REC-xml-names/
1296; Other links:
1297; [1] Jeremy Gibbons, Geraint Jones, "The Under-appreciated Unfold,"
1298; Proc. ICFP'98, 1998, pp. 273-279.
1299; [2] Richard S. Bird, The promotion and accumulation strategies in
1300; transformational programming, ACM Trans. Progr. Lang. Systems,
1301; 6(4):487-504, October 1984.
1302; [3] Ralf Hinze, "Deriving Backtracking Monad Transformers,"
1303; Functional Pearl. Proc ICFP'00, pp. 186-197.
1304
1305; IMPORT
1306; parser-error SSAX:warn, see Handling of errors, below
1307; functions declared in files util.scm, input-parse.scm and look-for-str.scm
1308
1309; Handling of errors
1310; This package relies on a function parser-error, which must be defined
1311; by a user of the package. The function has the following signature:
1312;       parser-error PORT MESSAGE SPECIALISING-MSG*
1313; Many procedures of this package call 'parser-error' whenever a
1314; parsing, well-formedness or validation error is encountered. The
1315; first argument is a port, which typically points to the offending
1316; character or its neighborhood. Most of the Scheme systems let the
1317; user query a PORT for the current position. The MESSAGE argument
1318; indicates a failed XML production or a failed XML constraint. The
1319; latter is referred to by its anchor name in the XML Recommendation
1320; or XML Namespaces Recommendation. The parsing library (e.g.,
1321; next-token, assert-curr-char) invoke 'parser-error' as well, in
1322; exactly the same way.  See input-parse.scm for more details.
1323; See
1324;       http://pair.com/lisovsky/download/parse-error.scm
1325; for an excellent example of such a redefined parser-error function.
1326;
1327; In addition, the present code invokes a function SSAX:warn
1328;   SSAX:warn PORT MESSAGE SPECIALISING-MSG*
1329; to notify the user about warnings that are NOT errors but still
1330; may alert the user.
1331;
1332; Again, parser-error and SSAX:warn are supposed to be defined by the
1333; user. However, if a run-test macro below is set to include
1334; self-tests, this present code does provide the definitions for these
1335; functions to allow tests to run.
1336;
1337; $Id: ssax-core.scm,v 1.2 2004/05/31 00:05:04 flw Exp $
1338
1339
1340        ; To run this code under Gambit, just evaluate or compile it along
1341        ; with the IMPORT functions mentioned above.
1342
1343        ; To run this code under SCM, do
1344        ; scm myenv-scm.scm util.scm input-parse.scm look-for-str.scm SSAX.scm
1345
1346        ; To run this code under MIT Scheme, do
1347        ; scheme -load myenv-mit.scm input-parse.scm util.scm \
1348        ; look-for-str.scm SSAX.scm
1349
1350        ; Current versions of SSAX ports to other Scheme systems
1351        ; (including Bigloo, Guile, Chicken, and PLT Scheme) are available from
1352        ;       http://pair.com/lisovsky/download/ssax/
1353        ; I'm deeply grateful to Kirill Lisovsky for developing and
1354        ; maintaining these versions. His comments and suggestions are
1355        ; appreciated indeed.
1356
1357        ; See http://pobox.com/~oleg/ftp/Scheme/
1358        ; for myenv.scm and other input parsing functions used
1359        ; in the present code.
1360; Move inside the run-test, just as catch-error.scm below???
1361; (include "myenv.scm")
1362
1363
1364; The following macro runs built-in test cases -- or does not run,
1365; depending on which of the two lines below you commented out
1366(define-macro run-test (lambda body '(begin #f)))
1367
1368; The following macro could've been defined just as
1369; (define-macro run-test (lambda body `(begin (display "\n-->Test\n") ,@body)))
1370;
1371; Instead, it's more involved, to make up for case-insensitivity of
1372; symbols on some Scheme systems. In Gambit, symbols are case
1373; sensitive: (eq? 'A 'a) is #f and (eq? 'Aa (string->symbol "Aa")) is
1374; #t.  On some systems, symbols are case-insensitive and just the
1375; opposite is true.  Therefore, we introduce a notation '"ASymbol" (a
1376; quoted string) that stands for a case-_sensitive_ ASymbol -- on any
1377; R5RS Scheme system. This notation is valid only within the body of
1378; run-test.
1379; The notation is implemented by scanning the run-test's
1380; body and replacing every occurrence of (quote "str") with the result
1381; of (string->symbol "str"). We can do such a replacement at macro-expand
1382; time (rather than at run time).
1383
1384 ;(define-macro run-test
1385 ;  (lambda body
1386 ;    (define (re-write body)
1387 ;      (cond
1388 ;       ((vector? body)
1389 ;      (list->vector (re-write (vector->list body))))
1390 ;       ((not (pair? body)) body)
1391 ;       ((and (eq? 'quote (car body)) (pair? (cdr body))
1392 ;           (string? (cadr body)))
1393 ;      (string->symbol (cadr body)))
1394 ;       (else (cons (re-write (car body)) (re-write (cdr body))))))
1395 ;    (cons 'begin (re-write body))))
1396
1397
1398 ;(run-test
1399 ; (include "catch-error.scm")
1400 ;)
1401;========================================================================
1402;                               Data Types
1403
1404; TAG-KIND
1405;       a symbol 'START, 'END, 'PI, 'DECL, 'COMMENT, 'CDSECT
1406;               or 'ENTITY-REF that identifies a markup token
1407
1408; UNRES-NAME
1409;       a name (called GI in the XML Recommendation) as given in an xml
1410;       document for a markup token: start-tag, PI target, attribute name.
1411;       If a GI is an NCName, UNRES-NAME is this NCName converted into
1412;       a Scheme symbol. If a GI is a QName, UNRES-NAME is a pair of
1413;       symbols: (PREFIX . LOCALPART)
1414
1415; RES-NAME
1416;       An expanded name, a resolved version of an UNRES-NAME.
1417;       For an element or an attribute name with a non-empty namespace URI,
1418;       RES-NAME is a pair of symbols, (URI-SYMB . LOCALPART).
1419;       Otherwise, it's a single symbol.
1420
1421; ELEM-CONTENT-MODEL
1422;       A symbol:
1423;       ANY       - anything goes, expect an END tag.
1424;       EMPTY-TAG - no content, and no END-tag is coming
1425;       EMPTY     - no content, expect the END-tag as the next token
1426;       PCDATA    - expect character data only, and no children elements
1427;       MIXED
1428;       ELEM-CONTENT
1429
1430; URI-SYMB
1431;       A symbol representing a namespace URI -- or other symbol chosen
1432;       by the user to represent URI. In the former case,
1433;       URI-SYMB is created by %-quoting of bad URI characters and
1434;       converting the resulting string into a symbol.
1435
1436; NAMESPACES
1437;       A list representing namespaces in effect. An element of the list
1438;       has one of the following forms:
1439;       (PREFIX URI-SYMB . URI-SYMB) or
1440;       (PREFIX USER-PREFIX . URI-SYMB)
1441;               USER-PREFIX is a symbol chosen by the user
1442;               to represent the URI.
1443;       (#f USER-PREFIX . URI-SYMB)
1444;               Specification of the user-chosen prefix and a URI-SYMBOL.
1445;       (*DEFAULT* USER-PREFIX . URI-SYMB)
1446;               Declaration of the default namespace
1447;       (*DEFAULT* #f . #f)
1448;               Un-declaration of the default namespace. This notation
1449;               represents overriding of the previous declaration
1450;       A NAMESPACES list may contain several elements for the same PREFIX.
1451;       The one closest to the beginning of the list takes effect.
1452
1453; ATTLIST
1454;       An ordered collection of (NAME . VALUE) pairs, where NAME is
1455;       a RES-NAME or an UNRES-NAME. The collection is an ADT
1456
1457; STR-HANDLER
1458;       A procedure of three arguments: STRING1 STRING2 SEED
1459;       returning a new SEED
1460;       The procedure is supposed to handle a chunk of character data
1461;       STRING1 followed by a chunk of character data STRING2.
1462;       STRING2 is a short string, often "\n" and even ""
1463
1464; ENTITIES
1465;       An assoc list of pairs:
1466;          (named-entity-name . named-entity-body)
1467;       where named-entity-name is a symbol under which the entity was
1468;       declared, named-entity-body is either a string, or
1469;       (for an external entity) a thunk that will return an
1470;       input port (from which the entity can be read).
1471;       named-entity-body may also be #f. This is an indication that a
1472;       named-entity-name is currently being expanded. A reference to
1473;       this named-entity-name will be an error: violation of the
1474;       WFC nonrecursion.
1475
1476; XML-TOKEN -- a record
1477
1478; In Gambit, you can use the following declaration:
1479; (define-structure xml-token kind head)
1480; The following declaration is "standard" as it follows SRFI-9:
1481;;(define-record-type  xml-token  (make-xml-token kind head)  xml-token?
1482;;  (kind  xml-token-kind)
1483;;  (head  xml-token-head) )
1484; No field mutators are declared as SSAX is a pure functional parser
1485;
1486; But to make the code more portable, we define xml-token simply as
1487; a pair. It suffices for us. Furthermore, xml-token-kind and xml-token-head
1488; can be defined as simple procedures. However, they are declared as
1489; macros below for efficiency.
1490
1491#|
1492(define (make-xml-token kind head) (cons kind head))
1493(define xml-token? pair?)
1494(define-macro xml-token-kind (lambda (token) `(car ,token)))
1495(define-macro xml-token-head (lambda (token) `(cdr ,token)))
1496|#
1497
1498(define-record xml-token kind head)
1499
1500
1501; This record represents a markup, which is, according to the XML
1502; Recommendation, "takes the form of start-tags, end-tags, empty-element tags,
1503; entity references, character references, comments, CDATA section delimiters,
1504; document type declarations, and processing instructions."
1505;
1506;       kind -- a TAG-KIND
1507;       head -- an UNRES-NAME. For xml-tokens of kinds 'COMMENT and
1508;               'CDSECT, the head is #f
1509;
1510; For example,
1511;       <P>  => kind='START, head='P
1512;       </P> => kind='END, head='P
1513;       <BR/> => kind='EMPTY-EL, head='BR
1514;       <!DOCTYPE OMF ...> => kind='DECL, head='DOCTYPE
1515;       <?xml version="1.0"?> => kind='PI, head='xml
1516;       &my-ent; => kind = 'ENTITY-REF, head='my-ent
1517;
1518; Character references are not represented by xml-tokens as these references
1519; are transparently resolved into the corresponding characters.
1520;
1521
1522
1523
1524; XML-DECL -- a record
1525
1526; The following is Gambit-specific, see below for a portable declaration
1527;(define-structure xml-decl elems entities notations)
1528
1529; The record represents a datatype of an XML document: the list of
1530; declared elements and their attributes, declared notations, list of
1531; replacement strings or loading procedures for parsed general
1532; entities, etc. Normally an xml-decl record is created from a DTD or
1533; an XML Schema, although it can be created and filled in in many other
1534; ways (e.g., loaded from a file).
1535;
1536; elems: an (assoc) list of decl-elem or #f. The latter instructs
1537;       the parser to do no validation of elements and attributes.
1538;
1539; decl-elem: declaration of one element:
1540;       (elem-name elem-content decl-attrs)
1541;       elem-name is an UNRES-NAME for the element.
1542;       elem-content is an ELEM-CONTENT-MODEL.
1543;       decl-attrs is an ATTLIST, of (ATTR-NAME . VALUE) associations
1544; !!!This element can declare a user procedure to handle parsing of an
1545; element (e.g., to do a custom validation, or to build a hash of
1546; IDs as they're encountered).
1547;
1548; decl-attr: an element of an ATTLIST, declaration of one attribute
1549;       (attr-name content-type use-type default-value)
1550;       attr-name is an UNRES-NAME for the declared attribute
1551;       content-type is a symbol: CDATA, NMTOKEN, NMTOKENS, ...
1552;               or a list of strings for the enumerated type.
1553;       use-type is a symbol: REQUIRED, IMPLIED, FIXED
1554;       default-value is a string for the default value, or #f if not given.
1555;
1556;
1557
1558; see a function make-empty-xml-decl to make a XML declaration entry
1559; suitable for a non-validating parsing.
1560
1561
1562;-------------------------
1563; Utilities
1564
1565; A rather useful utility from SRFI-1
1566; cons* elt1 elt2 ... -> object
1567;    Like LIST, but the last argument provides the tail of the constructed
1568;    list -- i.e., (cons* a1 a2 ... an) = (cons a1 (cons a2 (cons ... an))).
1569;
1570;   (cons* 1 2 3 4) => (1 2 3 . 4)
1571;   (cons* 1) => 1
1572#;(define (cons* first . rest)
1573  (let recur ((x first) (rest rest))
1574    (if (pair? rest)
1575        (cons x (recur (car rest) (cdr rest)))
1576        x)))
1577
1578;   SSAX:warn PORT MESSAGE SPECIALISING-MSG*
1579; to notify the user about warnings that are NOT errors but still
1580; may alert the user.
1581; Result is unspecified.
1582; We need to define the function to allow the self-tests to run.
1583; Normally the definition of SSAX:warn is to be provided by the user.
1584;(run-test
1585(define (SSAX:warn port msg . other-msg)
1586  (apply cerr (cons* "\nWarning: " msg other-msg))
1587  (newline (current-error-port)) )
1588;)
1589
1590
1591;   parser-error PORT MESSAGE SPECIALISING-MSG*
1592; to let the user know of a syntax error or a violation of a
1593; well-formedness or validation constraint.
1594; Result is unspecified.
1595; We need to define the function to allow the self-tests to run.
1596; Normally the definition of parser-error is to be provided by the user.
1597(run-test
1598 (define (parser-error port msg . specializing-msgs)
1599   (apply error (cons msg specializing-msgs)))
1600)
1601
1602; The following is a function that is often used in validation tests,
1603; to make sure that the computed result matches the expected one.
1604; This function is a standard equal? predicate with one exception.
1605; On Scheme systems where (string->symbol "A") and a symbol A
1606; are the same, equal_? is precisely equal?
1607; On other Scheme systems, we compare symbols disregarding their case.
1608; Since this function is used only in tests, we don't have to
1609; strive to make it efficient.
1610(run-test
1611 (define (equal_? e1 e2)
1612   (if (eq? 'A (string->symbol "A")) (equal? e1 e2)
1613       (cond
1614        ((symbol? e1)
1615         (and (symbol? e2) 
1616              (string-ci=? (symbol->string e1) (symbol->string e2))))
1617        ((pair? e1)
1618         (and (pair? e2)
1619              (equal_? (car e1) (car e2)) (equal_? (cdr e1) (cdr e2))))
1620        ((vector? e1)
1621         (and (vector? e2) (equal_? (vector->list e1) (vector->list e2))))
1622        (else
1623         (equal? e1 e2)))))
1624)
1625             
1626
1627; Test if a string is made of only whitespace
1628; An empty string is considered made of whitespace as well
1629(define (string-whitespace? str)
1630  (let ((len (string-length str)))
1631    (cond
1632     ((zero? len) #t)
1633     ((= 1 len) (char-whitespace? (string-ref str 0)))
1634     ((= 2 len) (and (char-whitespace? (string-ref str 0))
1635                     (char-whitespace? (string-ref str 1))))
1636     (else
1637      (let loop ((i 0))
1638        (or (>= i len)
1639            (and (char-whitespace? (string-ref str i))
1640                 (loop (++ i)))))))))
1641
1642; Find val in alist
1643; Return (values found-el remaining-alist) or
1644;        (values #f alist)
1645
1646(define (assq-values val alist)
1647  (let loop ((alist alist) (scanned '()))
1648    (cond
1649     ((null? alist) (values #f scanned))
1650     ((equal? val (caar alist))
1651      (values (car alist) (append scanned (cdr alist))))
1652     (else
1653      (loop (cdr alist) (cons (car alist) scanned))))))
1654
1655; From SRFI-1
1656(define (fold-right kons knil lis1)
1657    (let recur ((lis lis1))
1658       (if (null? lis) knil
1659            (let ((head (car lis)))
1660              (kons head (recur (cdr lis)))))))
1661
1662; Left fold combinator for a single list
1663(define (fold kons knil lis1)
1664  (let lp ((lis lis1) (ans knil))
1665    (if (null? lis) ans
1666      (lp (cdr lis) (kons (car lis) ans)))))
1667
1668
1669;========================================================================
1670;               Lower-level parsers and scanners
1671;
1672; They deal with primitive lexical units (Names, whitespaces, tags)
1673; and with pieces of more generic productions. Most of these parsers
1674; must be called in appropriate context. For example, SSAX:complete-start-tag
1675; must be called only when the start-tag has been detected and its GI
1676; has been read.
1677
1678;------------------------------------------------------------------------
1679;                       Low-level parsing code
1680
1681; Skip the S (whitespace) production as defined by
1682; [3] S ::= (#x20 | #x9 | #xD | #xA)
1683; The procedure returns the first not-whitespace character it
1684; encounters while scanning the PORT. This character is left
1685; on the input stream.
1686
1687(define SSAX:S-chars '(#\space #\tab #\return #\newline))
1688
1689(define (SSAX:skip-S port)
1690  (skip-while SSAX:S-chars port))
1691
1692
1693; Read a Name lexem and return it as string
1694; [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':'
1695;                  | CombiningChar | Extender
1696; [5] Name ::= (Letter | '_' | ':') (NameChar)*
1697;
1698; This code supports the XML Namespace Recommendation REC-xml-names,
1699; which modifies the above productions as follows:
1700;
1701; [4] NCNameChar ::= Letter | Digit | '.' | '-' | '_'
1702;                       | CombiningChar | Extender
1703; [5] NCName ::= (Letter | '_') (NCNameChar)*
1704; As the Rec-xml-names says,
1705; "An XML document conforms to this specification if all other tokens
1706; [other than element types and attribute names] in the document which
1707; are required, for XML conformance, to match the XML production for
1708; Name, match this specification's production for NCName."
1709; Element types and attribute names must match the production QName,
1710; defined below.
1711
1712; Check to see if a-char may start a NCName
1713(define (SSAX:ncname-starting-char? a-char)
1714  (and (char? a-char)
1715    (or
1716      (char-alphabetic? a-char)
1717      (char=? #\_ a-char))))
1718
1719
1720; Read a NCName starting from the current position in the PORT and
1721; return it as a symbol.
1722(define (SSAX:read-NCName port)
1723  (let ((first-char (peek-char port)))
1724    (or (SSAX:ncname-starting-char? first-char)
1725      (parser-error port "XMLNS [4] for '" first-char "'")))
1726  (string->symbol
1727    (next-token-of
1728      (lambda (c)
1729        (cond
1730          ((eof-object? c) #f)
1731          ((char-alphabetic? c) c)
1732          ((string-index "0123456789.-_" c) c)
1733          (else #f)))
1734      port)))
1735
1736; Read a (namespace-) Qualified Name, QName, from the current
1737; position in the PORT.
1738; From REC-xml-names:
1739;       [6] QName ::= (Prefix ':')? LocalPart
1740;       [7] Prefix ::= NCName
1741;       [8] LocalPart ::= NCName
1742; Return: an UNRES-NAME
1743(define (SSAX:read-QName port)
1744  (let ((prefix-or-localpart (SSAX:read-NCName port)))
1745    (case (peek-char port)
1746      ((#\:)                    ; prefix was given after all
1747       (read-char port)         ; consume the colon
1748       (cons prefix-or-localpart (SSAX:read-NCName port)))
1749      (else prefix-or-localpart) ; Prefix was omitted
1750      )))
1751
1752; The prefix of the pre-defined XML namespace
1753(define SSAX:Prefix-XML (string->symbol "xml"))
1754
1755(run-test
1756 (assert (eq? '_
1757                 (call-with-input-string "_" SSAX:read-NCName)))
1758 (assert (eq? '_
1759                 (call-with-input-string "_" SSAX:read-QName)))
1760 (assert (eq? (string->symbol "_abc_")
1761              (call-with-input-string "_abc_;" SSAX:read-NCName)))
1762 (assert (eq? (string->symbol "_abc_")
1763              (call-with-input-string "_abc_;" SSAX:read-QName)))
1764 (assert (eq? (string->symbol "_a.b")
1765              (call-with-input-string "_a.b " SSAX:read-QName)))
1766 (assert (equal? (cons (string->symbol "_a.b") (string->symbol "d.1-ef-"))
1767              (call-with-input-string "_a.b:d.1-ef-;" SSAX:read-QName)))
1768 (assert (equal? (cons (string->symbol "a") (string->symbol "b"))
1769              (call-with-input-string "a:b:c" SSAX:read-QName)))
1770
1771 (assert (failed? (call-with-input-string ":abc" SSAX:read-NCName)))
1772 (assert (failed? (call-with-input-string "1:bc" SSAX:read-NCName)))
1773)
1774
1775; Compare one RES-NAME or an UNRES-NAME with the other.
1776; Return a symbol '<, '>, or '= depending on the result of
1777; the comparison.
1778; Names without PREFIX are always smaller than those with the PREFIX.
1779(define name-compare
1780  (letrec ((symbol-compare
1781            (lambda (symb1 symb2)
1782              (cond
1783               ((eq? symb1 symb2) '=)
1784               ((string<? (symbol->string symb1) (symbol->string symb2))
1785                '<)
1786               (else '>)))))
1787    (lambda (name1 name2)
1788      (cond
1789       ((symbol? name1) (if (symbol? name2) (symbol-compare name1 name2)
1790                            '<))
1791       ((symbol? name2) '>)
1792       ((eq? name2 SSAX:largest-unres-name) '<)
1793       ((eq? name1 SSAX:largest-unres-name) '>)
1794       ((eq? (car name1) (car name2))   ; prefixes the same
1795        (symbol-compare (cdr name1) (cdr name2)))
1796       (else (symbol-compare (car name1) (car name2)))))))
1797
1798; An UNRES-NAME that is postulated to be larger than anything that can occur in
1799; a well-formed XML document.
1800; name-compare enforces this postulate.
1801(define SSAX:largest-unres-name (cons (gensym) (gensym)))
1802
1803(run-test
1804 (assert (eq? '= (name-compare 'ABC 'ABC)))
1805 (assert (eq? '< (name-compare 'ABC 'ABCD)))
1806 (assert (eq? '> (name-compare 'XB 'ABCD)))
1807 (assert (eq? '> (name-compare '(HTML . PRE) 'PRE)))
1808 (assert (eq? '< (name-compare 'HTML '(HTML . PRE))))
1809 (assert (eq? '= (name-compare '(HTML . PRE) '(HTML . PRE))))
1810 (assert (eq? '< (name-compare '(HTML . PRE) '(XML . PRE))))
1811 (assert (eq? '> (name-compare '(HTML . PRE) '(HTML . P))))
1812 (assert (eq? '< (name-compare '(HTML . PRE) SSAX:largest-unres-name)))
1813 (assert (eq? '< (name-compare '(ZZZZ . ZZZ) SSAX:largest-unres-name)))
1814 (assert (eq? '> (name-compare SSAX:largest-unres-name '(ZZZZ . ZZZ) )))
1815)
1816
1817
1818
1819; procedure:    SSAX:read-markup-token PORT
1820; This procedure starts parsing of a markup token. The current position
1821; in the stream must be #\<. This procedure scans enough of the input stream
1822; to figure out what kind of a markup token it is seeing. The procedure returns
1823; an xml-token structure describing the token. Note, generally reading
1824; of the current markup is not finished! In particular, no attributes of
1825; the start-tag token are scanned.
1826;
1827; Here's a detailed break out of the return values and the position in the PORT
1828; when that particular value is returned:
1829;       PI-token:       only PI-target is read.
1830;                       To finish the Processing Instruction and disregard it,
1831;                       call SSAX:skip-pi. SSAX:read-attributes may be useful
1832;                       as well (for PIs whose content is attribute-value
1833;                       pairs)
1834;       END-token:      The end tag is read completely; the current position
1835;                       is right after the terminating #\> character.   
1836;       COMMENT         is read and skipped completely. The current position
1837;                       is right after "-->" that terminates the comment.
1838;       CDSECT          The current position is right after "<!CDATA["
1839;                       Use SSAX:read-CDATA-body to read the rest.
1840;       DECL            We have read the keyword (the one that follows "<!")
1841;                       identifying this declaration markup. The current
1842;                       position is after the keyword (usually a
1843;                       whitespace character)
1844;
1845;       START-token     We have read the keyword (GI) of this start tag.
1846;                       No attributes are scanned yet. We don't know if this
1847;                       tag has an empty content either.
1848;                       Use SSAX:complete-start-tag to finish parsing of
1849;                       the token.
1850
1851(define SSAX:read-markup-token ; procedure SSAX:read-markup-token port
1852 (let ()
1853                ; we have read "<!-". Skip through the rest of the comment
1854                ; Return the 'COMMENT token as an indication we saw a comment
1855                ; and skipped it.
1856  (define (skip-comment port)
1857    (assert-curr-char '(#\-) "XML [15], second dash" port)
1858    (if (not (find-string-from-port? "-->" port))
1859      (parser-error port "XML [15], no -->"))
1860    (make-xml-token 'COMMENT #f))
1861
1862                ; we have read "<![" that must begin a CDATA section
1863  (define (read-CDATA port)
1864    (assert (string=? "CDATA[" (read-string 6 port)))
1865    (make-xml-token 'CDSECT #f))
1866
1867  (lambda (port)
1868    (assert-curr-char '(#\<) "start of the token" port)
1869    (case (peek-char port)
1870      ((#\/) (read-char port)
1871       (begin0 (make-xml-token 'END (SSAX:read-QName port))
1872               (SSAX:skip-S port)
1873               (assert-curr-char '(#\>) "XML [42]" port)))
1874      ((#\?) (read-char port) (make-xml-token 'PI (SSAX:read-NCName port)))
1875      ((#\!)
1876       (case (peek-next-char port)
1877         ((#\-) (read-char port) (skip-comment port))
1878         ((#\[) (read-char port) (read-CDATA port))
1879         (else (make-xml-token 'DECL (SSAX:read-NCName port)))))
1880      (else (make-xml-token 'START (SSAX:read-QName port)))))
1881))
1882
1883
1884; The current position is inside a PI. Skip till the rest of the PI
1885(define (SSAX:skip-pi port)     
1886  (if (not (find-string-from-port? "?>" port))
1887    (parser-error port "Failed to find ?> terminating the PI")))
1888
1889
1890; The current position is right after reading the PITarget. We read the
1891; body of PI and return is as a string. The port will point to the
1892; character right sfter '?>' combination that terminates PI.
1893; [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
1894
1895(define (SSAX:read-pi-body-as-string port)
1896  (SSAX:skip-S port)            ; skip WS after the PI target name
1897  (string-concatenate
1898    (let loop ()
1899      (let ((pi-fragment
1900             (next-token '() '(#\?) "reading PI content" port)))
1901        (if (eq? #\> (peek-next-char port))
1902            (begin
1903              (read-char port)
1904              (cons pi-fragment '()))
1905            (cons* pi-fragment "?" (loop)))))))
1906
1907(run-test
1908 (assert (equal? "p1 content "
1909    (call-with-input-string "<?pi1  p1 content ?>"
1910      (lambda (port)
1911        (SSAX:read-markup-token port)
1912        (SSAX:read-pi-body-as-string port)))))
1913 (assert (equal? "pi2? content? ?"
1914    (call-with-input-string "<?pi2 pi2? content? ??>"
1915      (lambda (port)
1916        (SSAX:read-markup-token port)
1917        (SSAX:read-pi-body-as-string port)))))
1918)
1919
1920;(define (SSAX:read-pi-body-as-name-values port)
1921
1922; The current pos in the port is inside an internal DTD subset
1923; (e.g., after reading #\[ that begins an internal DTD subset)
1924; Skip until the "]>" combination that terminates this DTD
1925(define (SSAX:skip-internal-dtd port)     
1926  (if (not (find-string-from-port? "]>" port))
1927    (parser-error port
1928                  "Failed to find ]> terminating the internal DTD subset")))
1929
1930
1931; procedure+:   SSAX:read-CDATA-body PORT STR-HANDLER SEED
1932;
1933; This procedure must be called after we have read a string "<![CDATA["
1934; that begins a CDATA section. The current position must be the first
1935; position of the CDATA body. This function reads _lines_ of the CDATA
1936; body and passes them to a STR-HANDLER, a character data consumer.
1937;
1938; The str-handler is a STR-HANDLER, a procedure STRING1 STRING2 SEED.
1939; The first STRING1 argument to STR-HANDLER never contains a newline.
1940; The second STRING2 argument often will. On the first invocation of
1941; the STR-HANDLER, the seed is the one passed to SSAX:read-CDATA-body
1942; as the third argument. The result of this first invocation will be
1943; passed as the seed argument to the second invocation of the line
1944; consumer, and so on. The result of the last invocation of the
1945; STR-HANDLER is returned by the SSAX:read-CDATA-body.  Note a
1946; similarity to the fundamental 'fold' iterator.
1947;
1948; Within a CDATA section all characters are taken at their face value,
1949; with only three exceptions:
1950;       CR, LF, and CRLF are treated as line delimiters, and passed
1951;       as a single #\newline to the STR-HANDLER
1952;       "]]>" combination is the end of the CDATA section.
1953;       &gt; is treated as an embedded #\> character
1954; Note, &lt; and &amp; are not specially recognized (and are not expanded)!
1955
1956(define SSAX:read-CDATA-body 
1957  (let ((nl-str (string #\newline)))
1958
1959    (lambda (port str-handler seed)
1960      (let loop ((seed seed))
1961        (let ((fragment (next-token '() '(#\return #\newline #\] #\&)
1962                                    "reading CDATA" port)))
1963                        ; that is, we're reading the char after the 'fragment'
1964     (case (read-char port)     
1965       ((#\newline) (loop (str-handler fragment nl-str seed)))
1966       ((#\return)              ; if the next char is #\newline, skip it
1967         (if (eqv? (peek-char port) #\newline) (read-char port))
1968         (loop (str-handler fragment nl-str seed)))
1969       ((#\])
1970        (if (not (eqv? (peek-char port) #\]))
1971            (loop (str-handler fragment "]" seed))
1972            (let check-after-second-braket
1973                ((seed (if (string-null? fragment) seed
1974                           (str-handler fragment "" seed))))
1975              (case (peek-next-char port)       ; after the second bracket
1976                ((#\>) (read-char port) seed)   ; we have read "]]>"
1977                ((#\]) (check-after-second-braket
1978                        (str-handler "]" "" seed)))
1979                (else (loop (str-handler "]]" "" seed)))))))
1980       ((#\&)           ; Note that #\& within CDATA may stand for itself
1981        (let ((ent-ref  ; it does not have to start an entity ref
1982               (next-token-of (lambda (c) 
1983                 (and (not (eof-object? c)) (char-alphabetic? c) c)) port)))
1984          (cond         ; "&gt;" is to be replaced with #\>
1985           ((and (string=? "gt" ent-ref) (eq? (peek-char port) #\;))
1986            (read-char port)
1987            (loop (str-handler fragment ">" seed)))
1988           (else
1989            (loop 
1990             (str-handler ent-ref ""
1991                          (str-handler fragment "&" seed)))))))
1992       (else
1993         (parser-error port "can't happen"))))))))
1994
1995; a few lines of validation code
1996(run-test (letrec
1997  ((consumer (lambda (fragment foll-fragment seed)
1998     (cons* (if (equal? foll-fragment (string #\newline))
1999                " NL\n" foll-fragment) fragment seed)))
2000   (test (lambda (str expected-result)
2001          (display "\nbody: ") (write str) (display "\nResult: ")
2002          (let ((result
2003                 (reverse
2004                  (call-with-input-string str
2005                    (lambda (port) (SSAX:read-CDATA-body port consumer '()))
2006                    ))))
2007            (write result)
2008            (assert (equal? result expected-result)))))
2009   )
2010  (test "]]>" '())
2011  (test "abcd]]>" '("abcd" ""))
2012  (test "abcd]]]>" '("abcd" "" "]" ""))
2013  (test "abcd]]]]>" '("abcd" "" "]" "" "]" ""))
2014  (test "abcd]]]]]>" '("abcd" "" "]" "" "]" "" "]" ""))
2015  (test "abcd]]]a]]>" '("abcd" "" "]" "" "]]" "" "a" ""))
2016  (test "abc\r\ndef\n]]>" '("abc" " NL\n" "def" " NL\n"))
2017  (test "\r\n\r\n]]>" '("" " NL\n" "" " NL\n"))
2018  (test "\r\n\r\na]]>" '("" " NL\n" "" " NL\n" "a" ""))
2019  (test "abc&!!!]]>" '("abc" "&" "" "" "!!!" ""))
2020  (test "abc]]&gt;&gt&amp;]]]&gt;and]]>"
2021    '("abc" "" "]]" "" "" ">" "" "&" "gt" "" "" "&" "amp" "" ";" "" "]" ""
2022      "]]" "" "" ">" "and" ""))
2023))
2024
2025           
2026; procedure+:   SSAX:read-char-ref PORT
2027;
2028; [66]  CharRef ::=  '&#' [0-9]+ ';'
2029;                  | '&#x' [0-9a-fA-F]+ ';'
2030;
2031; This procedure must be called after we we have read "&#"
2032; that introduces a char reference.
2033; The procedure reads this reference and returns the corresponding char
2034; The current position in PORT will be after ";" that terminates
2035; the char reference
2036; Faults detected:
2037;       WFC: XML-Spec.html#wf-Legalchar
2038
2039(define (SSAX:read-char-ref port)
2040  (let* ((base
2041           (cond ((eq? (peek-char port) #\x) (read-char port) 16)
2042                 (else 10)))
2043         (name (next-token '() '(#\;) "XML [66]" port))
2044         (char-code (string->number name base)))
2045    (read-char port)    ; read the terminating #\; char
2046    (if (integer? char-code) (integer->char char-code)
2047      (parser-error port "[wf-Legalchar] broken for '" name "'"))))
2048
2049
2050; procedure+:   SSAX:handle-parsed-entity PORT NAME ENTITIES
2051;               CONTENT-HANDLER STR-HANDLER SEED
2052;
2053; Expand and handle a parsed-entity reference
2054; port - a PORT
2055; name - the name of the parsed entity to expand, a symbol
2056; entities - see ENTITIES
2057; content-handler -- procedure PORT ENTITIES SEED
2058;       that is supposed to return a SEED
2059; str-handler - a STR-HANDLER. It is called if the entity in question
2060; turns out to be a pre-declared entity
2061;
2062; The result is the one returned by CONTENT-HANDLER or STR-HANDLER
2063; Faults detected:
2064;       WFC: XML-Spec.html#wf-entdeclared
2065;       WFC: XML-Spec.html#norecursion
2066
2067(define SSAX:predefined-parsed-entities
2068  `(
2069    (,(string->symbol "amp") . "&")
2070    (,(string->symbol "lt") . "<")
2071    (,(string->symbol "gt") . ">")
2072    (,(string->symbol "apos") . "'")
2073    (,(string->symbol "quot") . "\"")))
2074
2075(define (SSAX:handle-parsed-entity port name entities
2076                                   content-handler str-handler seed)
2077  (cond   ; First we check the list of the declared entities
2078   ((assq name entities) =>
2079    (lambda (decl-entity)
2080      (let ((ent-body (cdr decl-entity)) ; mark the list to prevent recursion
2081            (new-entities (cons (cons name #f) entities)))
2082        (cond
2083         ((string? ent-body)
2084          (call-with-input-string ent-body
2085             (lambda (port) (content-handler port new-entities seed))))
2086         ((procedure? ent-body)
2087          (let ((port (ent-body)))
2088            (begin0
2089             (content-handler port new-entities seed)
2090             (close-input-port port))))
2091         (else
2092          (parser-error port "[norecursion] broken for " name))))))
2093    ((assq name SSAX:predefined-parsed-entities)
2094     => (lambda (decl-entity)
2095          (str-handler (cdr decl-entity) "" seed)))
2096    (else (parser-error port "[wf-entdeclared] broken for " name))))
2097
2098
2099
2100; The ATTLIST Abstract Data Type
2101; Currently is implemented as an assoc list sorted in the ascending
2102; order of NAMES.
2103
2104(define (make-empty-attlist) '())
2105
2106; Add a name-value pair to the existing attlist preserving the order
2107; Return the new list, in the sorted ascending order.
2108; Return #f if a pair with the same name already exists in the attlist
2109
2110(define (attlist-add attlist name-value)
2111  (if (null? attlist) (cons name-value attlist)
2112      (case (name-compare (car name-value) (caar attlist))
2113        ((=) #f)
2114        ((<) (cons name-value attlist))
2115        (else (cons (car attlist) (attlist-add (cdr attlist) name-value)))
2116        )))
2117
2118(define attlist-null? null?)
2119
2120; Given an non-null attlist, return a pair of values: the top and the rest
2121(define (attlist-remove-top attlist)
2122  (values (car attlist) (cdr attlist)))
2123
2124(define (attlist->alist attlist) attlist)
2125(define attlist-fold fold)
2126
2127; procedure+:   SSAX:read-attributes PORT ENTITIES
2128;
2129; This procedure reads and parses a production Attribute*
2130; [41] Attribute ::= Name Eq AttValue
2131; [10] AttValue ::=  '"' ([^<&"] | Reference)* '"'
2132;                 | "'" ([^<&'] | Reference)* "'"
2133; [25] Eq ::= S? '=' S?
2134;
2135;
2136; The procedure returns an ATTLIST, of Name (as UNRES-NAME), Value (as string)
2137; pairs. The current character on the PORT is a non-whitespace character
2138; that is not an ncname-starting character.
2139;
2140; Note the following rules to keep in mind when reading an 'AttValue'
2141; "Before the value of an attribute is passed to the application
2142; or checked for validity, the XML processor must normalize it as follows:
2143; - a character reference is processed by appending the referenced
2144;   character to the attribute value
2145; - an entity reference is processed by recursively processing the
2146;   replacement text of the entity [see ENTITIES]
2147;   [named entities amp lt gt quot apos are assumed pre-declared]
2148; - a whitespace character (#x20, #xD, #xA, #x9) is processed by appending #x20
2149;   to the normalized value, except that only a single #x20 is appended for a
2150;   "#xD#xA" sequence that is part of an external parsed entity or the
2151;   literal entity value of an internal parsed entity
2152; - other characters are processed by appending them to the normalized value "
2153;
2154;
2155; Faults detected:
2156;       WFC: XML-Spec.html#CleanAttrVals
2157;       WFC: XML-Spec.html#uniqattspec
2158
2159(define SSAX:read-attributes  ; SSAX:read-attributes port entities
2160 (let ()
2161                ; Read the AttValue from the PORT up to the delimiter
2162                ; (which can be a single or double-quote character,
2163                ; or even a symbol *eof*)
2164                ; 'prev-fragments' is the list of string fragments, accumulated
2165                ; so far, in reverse order.
2166                ; Return the list of fragments with newly read fragments
2167                ; prepended.
2168  (define (read-attrib-value delimiter port entities prev-fragments)
2169    (let* ((new-fragments
2170            (cons
2171             (next-token '() (cons delimiter
2172                                   '(#\newline #\return #\space #\tab #\< #\&))
2173                         "XML [10]" port)
2174             prev-fragments))
2175           (cterm (read-char port)))
2176      (if (or (eof-object? cterm) (eqv? cterm delimiter))
2177          new-fragments
2178          (case cterm
2179            ((#\newline #\space #\tab)
2180              (read-attrib-value delimiter port entities
2181                                 (cons " " new-fragments)))
2182            ((#\return)
2183              (if (eqv? (peek-char port) #\newline) (read-char port))
2184              (read-attrib-value delimiter port entities
2185                                 (cons " " new-fragments)))
2186            ((#\&)
2187              (cond
2188                ((eqv? (peek-char port) #\#)
2189                  (read-char port)
2190                  (read-attrib-value delimiter port entities
2191                     (cons (string (SSAX:read-char-ref port)) new-fragments)))
2192                (else
2193                 (read-attrib-value delimiter port entities
2194                     (read-named-entity port entities new-fragments)))))
2195            ((#\<) (parser-error port "[CleanAttrVals] broken"))
2196            (else (parser-error port "Can't happen"))))))
2197
2198                ; we have read "&" that introduces a named entity reference.
2199                ; read this reference and return the result of
2200                ; normalizing of the corresponding string
2201                ; (that is, read-attrib-value is applied to the replacement
2202                ; text of the entity)
2203                ; The current position will be after ";" that terminates
2204                ; the entity reference
2205  (define (read-named-entity port entities fragments)
2206    (let ((name (SSAX:read-NCName port)))
2207      (assert-curr-char '(#\;) "XML [68]" port)
2208      (SSAX:handle-parsed-entity port name entities
2209        (lambda (port entities fragments)
2210          (read-attrib-value '*eof* port entities fragments))
2211        (lambda (str1 str2 fragments)
2212          (if (equal? "" str2) (cons str1 fragments)
2213              (cons* str2 str1 fragments)))
2214        fragments)))
2215
2216  (lambda (port entities)
2217    (let loop ((attr-list (make-empty-attlist)))
2218      (if (not (SSAX:ncname-starting-char? (SSAX:skip-S port))) attr-list
2219          (let ((name (SSAX:read-QName port)))
2220            (SSAX:skip-S port)
2221            (assert-curr-char '(#\=) "XML [25]" port)
2222            (SSAX:skip-S port)
2223            (let ((delimiter 
2224                   (assert-curr-char '(#\' #\" ) "XML [10]" port)))
2225              (loop 
2226               (or (attlist-add attr-list 
2227                     (cons name 
2228                           (string-concatenate
2229                            (reverse
2230                             (read-attrib-value delimiter port entities
2231                                                '())))))
2232                   (parser-error port "[uniqattspec] broken for " name))))))))
2233))
2234
2235; a few lines of validation code
2236(run-test (letrec
2237    ((test (lambda (str decl-entities expected-res)
2238          (display "\ninput: ") (write str) (display "\nResult: ")
2239          (let ((result
2240                 (call-with-input-string str
2241              (lambda (port) (SSAX:read-attributes port decl-entities)))))
2242            (write result) (newline)
2243            (assert (equal? result expected-res))))))
2244    (test "" '() '())
2245    (test "href='http://a\tb\r\n\r\n\nc'" '()
2246          `((,(string->symbol "href") . "http://a b   c")))
2247    (test "_1 ='12&amp;' _2= \"\r\n\t12&#10;3\">" '()
2248          '((_1 . "12&") (_2 . "  12\n3")))
2249    (test "\tAbc='&lt;&amp;&gt;&#x0A;'\nNext='12&ent;34' />" 
2250          '((ent . "&lt;xx&gt;"))
2251          `((,(string->symbol "Abc") . ,(string-append "<&>"
2252                                            (string #\newline)))
2253            (,(string->symbol "Next") . "12<xx>34")))
2254    (test "\tAbc='&lt;&amp;&gt;&#x0A;'\nNext='12&en;34' />" 
2255          `((en . ,(lambda () (open-input-string "&quot;xx&apos;"))))
2256          `((,(string->symbol "Abc") . ,(string-append "<&>"
2257                                            (string #\newline)))
2258            (,(string->symbol "Next") . "12\"xx'34")))
2259    (test "\tAbc='&lt;&amp;&gt;&#x0A;'\nNext='12&ent;34' />" 
2260          '((ent . "&lt;&ent1;T;&gt;") (ent1 . "&amp;"))
2261          `((,(string->symbol "Abc") . ,(string-append "<&>"
2262                                            (string #\newline)))
2263            (,(string->symbol "Next") . "12<&T;>34")))
2264    (assert (failed?
2265        (test "\tAbc='&lt;&amp;&gt;&#x0A;'\nNext='12&ent;34' />" 
2266          '((ent . "<&ent1;T;&gt;") (ent1 . "&amp;")) '())))
2267    (assert (failed?
2268        (test "\tAbc='&lt;&amp;&gt;&#x0A;'\nNext='12&ent;34' />" 
2269          '((ent . "&lt;&ent;T;&gt;") (ent1 . "&amp;")) '())))
2270    (assert (failed?
2271        (test "\tAbc='&lt;&amp;&gt;&#x0A;'\nNext='12&ent;34' />" 
2272          '((ent . "&lt;&ent1;T;&gt;") (ent1 . "&ent;")) '())))
2273    (test "html:href='http://a\tb\r\n\r\n\nc'" '()
2274          `(((,(string->symbol "html") . ,(string->symbol "href"))
2275             . "http://a b   c")))
2276    (test "html:href='ref1' html:src='ref2'" '()
2277          `(((,(string->symbol "html") . ,(string->symbol "href"))
2278             . "ref1")
2279            ((,(string->symbol "html") . ,(string->symbol "src"))
2280             . "ref2")))
2281    (test "html:href='ref1' xml:html='ref2'" '()
2282          `(((,(string->symbol "html") . ,(string->symbol "href"))
2283             . "ref1")
2284            ((,SSAX:Prefix-XML . ,(string->symbol "html"))
2285             . "ref2")))
2286    (assert (failed? (test "html:href='ref1' html:href='ref2'" '() '())))
2287    (assert (failed? (test "html:href='<' html:href='ref2'" '() '())))
2288    (assert (failed? (test "html:href='ref1' html:href='&ref2;'" '() '())))
2289))
2290
2291; SSAX:resolve-name PORT UNRES-NAME NAMESPACES apply-default-ns?
2292;
2293; Convert an UNRES-NAME to a RES-NAME given the appropriate NAMESPACES
2294; declarations.
2295; the last parameter apply-default-ns? determines if the default
2296; namespace applies (for instance, it does not for attribute names)
2297;
2298; Per REC-xml-names/#nsc-NSDeclared, "xml" prefix is considered pre-declared
2299; and bound to the namespace name "http://www.w3.org/XML/1998/namespace".
2300;
2301; This procedure tests for the namespace constraints:
2302; http://www.w3.org/TR/REC-xml-names/#nsc-NSDeclared
2303
2304(define (SSAX:resolve-name port unres-name namespaces apply-default-ns?)
2305  (cond
2306   ((pair? unres-name)          ; it's a QNAME
2307    (cons
2308     (cond
2309     ((assq (car unres-name) namespaces) => cadr)
2310     ((eq? (car unres-name) SSAX:Prefix-XML) SSAX:Prefix-XML)
2311     (else
2312      (parser-error port "[nsc-NSDeclared] broken; prefix " (car unres-name))))
2313     (cdr unres-name)))
2314   (apply-default-ns?           ; Do apply the default namespace, if any
2315    (let ((default-ns (assq '*DEFAULT* namespaces)))
2316      (if (and default-ns (cadr default-ns))
2317          (cons (cadr default-ns) unres-name)
2318          unres-name)))         ; no default namespace declared
2319   (else unres-name)))          ; no prefix, don't apply the default-ns
2320           
2321         
2322(run-test
2323 (let* ((namespaces
2324        '((HTML UHTML . URN-HTML)
2325          (HTML UHTML-1 . URN-HTML)
2326          (A    UHTML . URN-HTML)))
2327        (namespaces-def
2328         (cons
2329          '(*DEFAULT* DEF . URN-DEF) namespaces))
2330        (namespaces-undef
2331         (cons
2332          '(*DEFAULT* #f . #f) namespaces-def))
2333        (port (current-input-port)))
2334
2335   (assert (equal? 'ABC 
2336                   (SSAX:resolve-name port 'ABC namespaces #t)))
2337   (assert (equal? '(DEF . ABC)
2338                   (SSAX:resolve-name port 'ABC namespaces-def #t)))
2339   (assert (equal? 'ABC
2340                   (SSAX:resolve-name port 'ABC namespaces-def #f)))
2341   (assert (equal? 'ABC
2342                   (SSAX:resolve-name port 'ABC namespaces-undef #t)))
2343   (assert (equal? '(UHTML . ABC)
2344                   (SSAX:resolve-name port '(HTML . ABC) namespaces-def #t)))
2345   (assert (equal? '(UHTML . ABC)
2346                   (SSAX:resolve-name port '(HTML . ABC) namespaces-def #f)))
2347   (assert (equal? `(,SSAX:Prefix-XML . space)
2348                   (SSAX:resolve-name port 
2349                       `(,(string->symbol "xml") . space) namespaces-def #f)))
2350   (assert (failed?
2351                   (SSAX:resolve-name port '(XXX . ABC) namespaces-def #f)))
2352))
2353
2354
2355; procedure+:   SSAX:uri-string->symbol URI-STR
2356; Convert a URI-STR to an appropriate symbol
2357(define (SSAX:uri-string->symbol uri-str)
2358  (string->symbol uri-str))
2359
2360; procedure+:   SSAX:complete-start-tag TAG PORT ELEMS ENTITIES NAMESPACES
2361;
2362; This procedure is to complete parsing of a start-tag markup. The
2363; procedure must be called after the start tag token has been
2364; read. TAG is an UNRES-NAME. ELEMS is an instance of xml-decl::elems;
2365; it can be #f to tell the function to do _no_ validation of elements
2366; and their attributes.
2367;
2368; This procedure returns several values:
2369;  ELEM-GI: a RES-NAME.
2370;  ATTRIBUTES: element's attributes, an ATTLIST of (RES-NAME . STRING)
2371;       pairs. The list does NOT include xmlns attributes.
2372;  NAMESPACES: the input list of namespaces amended with namespace
2373;       (re-)declarations contained within the start-tag under parsing
2374;  ELEM-CONTENT-MODEL
2375
2376; On exit, the current position in PORT will be the first character after
2377; #\> that terminates the start-tag markup.
2378;
2379; Faults detected:
2380;       VC: XML-Spec.html#enum
2381;       VC: XML-Spec.html#RequiredAttr
2382;       VC: XML-Spec.html#FixedAttr
2383;       VC: XML-Spec.html#ValueType
2384;       WFC: XML-Spec.html#uniqattspec (after namespaces prefixes are resolved)
2385;       VC: XML-Spec.html#elementvalid
2386;       WFC: REC-xml-names/#dt-NSName
2387
2388; Note, although XML Recommendation does not explicitly say it,
2389; xmlns and xmlns: attributes don't have to be declared (although they
2390; can be declared, to specify their default value)
2391
2392; Procedure:  SSAX:complete-start-tag tag-head port elems entities namespaces
2393(define SSAX:complete-start-tag
2394
2395 (let ((xmlns (string->symbol "xmlns"))
2396       (largest-dummy-decl-attr (list SSAX:largest-unres-name #f #f #f)))
2397
2398  ; Scan through the attlist and validate it, against decl-attrs
2399  ; Return an assoc list with added fixed or implied attrs.
2400  ; Note that both attlist and decl-attrs are ATTLISTs, and therefore,
2401  ; sorted
2402  (define (validate-attrs port attlist decl-attrs)
2403
2404    ; Check to see decl-attr is not of use type REQUIRED. Add
2405    ; the association with the default value, if any declared
2406    (define (add-default-decl decl-attr result)
2407      (let-values*
2408         (((attr-name content-type use-type default-value)
2409           (apply values decl-attr)))
2410         (and (eq? use-type 'REQUIRED)
2411              (parser-error port "[RequiredAttr] broken for" attr-name))
2412         (if default-value
2413             (cons (cons attr-name default-value) result)
2414             result)))
2415
2416    (let loop ((attlist attlist) (decl-attrs decl-attrs) (result '()))
2417      (if (attlist-null? attlist)
2418          (attlist-fold add-default-decl result decl-attrs)
2419          (let-values*
2420           (((attr attr-others)
2421             (attlist-remove-top attlist))
2422            ((decl-attr other-decls)
2423             (if (attlist-null? decl-attrs)
2424                 (values largest-dummy-decl-attr decl-attrs)
2425                 (attlist-remove-top decl-attrs)))
2426            )
2427           (case (name-compare (car attr) (car decl-attr))
2428             ((<) 
2429              (if (or (eq? xmlns (car attr))
2430                      (and (pair? (car attr)) (eq? xmlns (caar attr))))
2431                  (loop attr-others decl-attrs (cons attr result))
2432                  (parser-error port "[ValueType] broken for " attr)))
2433             ((>) 
2434              (loop attlist other-decls 
2435                    (add-default-decl decl-attr result)))
2436             (else      ; matched occurrence of an attr with its declaration
2437              (let-values*
2438               (((attr-name content-type use-type default-value)
2439                 (apply values decl-attr)))
2440               ; Run some tests on the content of the attribute
2441               (cond
2442                ((eq? use-type 'FIXED)
2443                 (or (equal? (cdr attr) default-value)
2444                     (parser-error port "[FixedAttr] broken for " attr-name)))
2445                ((eq? content-type 'CDATA) #t) ; everything goes
2446                ((pair? content-type)
2447                 (or (member (cdr attr) content-type)
2448                     (parser-error port "[enum] broken for " attr-name "="
2449                            (cdr attr))))
2450                (else
2451                 (SSAX:warn port "declared content type " content-type
2452                       " not verified yet")))
2453               (loop attr-others other-decls (cons attr result)))))
2454           ))))
2455           
2456
2457  ; Add a new namespace declaration to namespaces.
2458  ; First we convert the uri-str to a uri-symbol and search namespaces for
2459  ; an association (_ user-prefix . uri-symbol).
2460  ; If found, we return the argument namespaces with an association
2461  ; (prefix user-prefix . uri-symbol) prepended.
2462  ; Otherwise, we prepend (prefix uri-symbol . uri-symbol)
2463  (define (add-ns port prefix uri-str namespaces)
2464    (and (equal? "" uri-str)
2465         (parser-error port "[dt-NSName] broken for " prefix))
2466    (let ((uri-symbol (SSAX:uri-string->symbol uri-str)))
2467      (let loop ((nss namespaces))
2468        (cond
2469         ((null? nss)
2470          (cons (cons* prefix uri-symbol uri-symbol) namespaces))
2471         ((eq? uri-symbol (cddar nss))
2472          (cons (cons* prefix (cadar nss) uri-symbol) namespaces))
2473         (else (loop (cdr nss)))))))
2474     
2475  ; partition attrs into proper attrs and new namespace declarations
2476  ; return two values: proper attrs and the updated namespace declarations
2477  (define (adjust-namespace-decl port attrs namespaces)
2478    (let loop ((attrs attrs) (proper-attrs '()) (namespaces namespaces))
2479      (cond
2480       ((null? attrs) (values proper-attrs namespaces))
2481       ((eq? xmlns (caar attrs))        ; re-decl of the default namespace
2482        (loop (cdr attrs) proper-attrs 
2483              (if (equal? "" (cdar attrs))      ; un-decl of the default ns
2484                  (cons (cons* '*DEFAULT* #f #f) namespaces)
2485                  (add-ns port '*DEFAULT* (cdar attrs) namespaces))))
2486       ((and (pair? (caar attrs)) (eq? xmlns (caaar attrs)))
2487        (loop (cdr attrs) proper-attrs
2488              (add-ns port (cdaar attrs) (cdar attrs) namespaces)))
2489       (else
2490        (loop (cdr attrs) (cons (car attrs) proper-attrs) namespaces)))))
2491
2492    ; The body of the function
2493 (lambda (tag-head port elems entities namespaces)
2494  (let-values* 
2495   ((attlist (SSAX:read-attributes port entities))
2496    (empty-el-tag?
2497     (begin
2498       (SSAX:skip-S port)
2499       (and
2500        (eqv? #\/ 
2501              (assert-curr-char '(#\> #\/) "XML [40], XML [44], no '>'" port))
2502        (assert-curr-char '(#\>) "XML [44], no '>'" port))))
2503    ((elem-content decl-attrs)  ; see xml-decl for their type
2504     (if elems                  ; elements declared: validate!
2505         (cond
2506          ((assoc tag-head elems) =>
2507           (lambda (decl-elem)          ; of type xml-decl::decl-elem
2508             (values
2509              (if empty-el-tag? 'EMPTY-TAG (cadr decl-elem))
2510              (caddr decl-elem))))
2511          (else
2512           (parser-error port "[elementvalid] broken, no decl for " tag-head)))
2513         (values                ; non-validating parsing
2514          (if empty-el-tag? 'EMPTY-TAG 'ANY)
2515          #f)                   ; no attributes declared
2516         ))
2517    (merged-attrs (if decl-attrs (validate-attrs port attlist decl-attrs)
2518                      (attlist->alist attlist)))
2519    ((proper-attrs namespaces)
2520     (adjust-namespace-decl port merged-attrs namespaces))
2521    )
2522   ;(cerr "proper attrs: " proper-attrs nl)
2523   ; build the return value
2524   (values
2525    (SSAX:resolve-name port tag-head namespaces #t)
2526    (fold-right
2527     (lambda (name-value attlist)
2528       (or
2529        (attlist-add attlist
2530           (cons (SSAX:resolve-name port (car name-value) namespaces #f)
2531                 (cdr name-value)))
2532        (parser-error port "[uniqattspec] after NS expansion broken for " 
2533               name-value)))
2534     (make-empty-attlist)
2535     proper-attrs)
2536    namespaces
2537    elem-content)))))
2538
2539(run-test
2540 (let* ((urn-a (string->symbol "urn:a"))
2541        (urn-b (string->symbol "urn:b"))
2542        (urn-html (string->symbol "http://w3c.org/html"))
2543        (namespaces
2544         `((#f '"UHTML" . ,urn-html)
2545           ('"A"  '"UA" . ,urn-a)))
2546          (test
2547           (lambda (tag-head-name elems str)
2548             (call-with-input-string str
2549                (lambda (port)
2550                  (call-with-values
2551                      (lambda ()
2552                              (SSAX:complete-start-tag
2553                               (call-with-input-string tag-head-name
2554                                      (lambda (port) (SSAX:read-QName port)))
2555                               port
2556                               elems '() namespaces))
2557                    list))))))
2558
2559   ; First test with no validation of elements
2560   ;(test "TAG1" #f "")
2561   (assert (equal? `('"TAG1" () ,namespaces ANY)
2562                   (test "TAG1" #f ">")))
2563   (assert (equal? `('"TAG1" () ,namespaces EMPTY-TAG)
2564                   (test "TAG1" #f "/>")))
2565   (assert (equal? `('"TAG1" (('"HREF" . "a")) ,namespaces EMPTY-TAG)
2566                   (test "TAG1" #f "HREF='a'/>")))
2567   (assert (equal? `(('"UA" . '"TAG1") (('"HREF" . "a"))
2568                     ,(cons `(*DEFAULT* '"UA" . ,urn-a) namespaces) ANY)
2569                   (test "TAG1" #f "HREF='a' xmlns='urn:a'>")))
2570   (assert (equal? `('"TAG1" (('"HREF" . "a"))
2571                     ,(cons '(*DEFAULT* #f . #f) namespaces) ANY)
2572                   (test "TAG1" #f "HREF='a' xmlns=''>")))
2573   (assert (failed? (test "UA:TAG1" #f "HREF='a' xmlns=''/>")))
2574   (assert (equal? `(('"UA" . '"TAG1") ((('"UA" . '"HREF") . "a"))
2575                     ,(cons '(*DEFAULT* #f . #f) namespaces) ANY)
2576                   (test "A:TAG1" #f "A:HREF='a' xmlns=''>")))
2577   (assert (equal? `(('"UA" . '"TAG1") ((('"UA" . '"HREF") . "a"))
2578                     ,(cons `(*DEFAULT* ,urn-b . ,urn-b) namespaces) ANY)
2579                   (test "A:TAG1" #f "A:HREF='a' xmlns='urn:b'>")))
2580   (assert (failed? (test "B:TAG1" #f "A:HREF='a' xmlns:b=''/>")))
2581   (assert (equal? `((,urn-b . '"TAG1") ((('"UA" . '"HREF") . "a"))
2582                     ,(cons `('"B" ,urn-b . ,urn-b) namespaces) ANY)
2583                   (test "B:TAG1" #f "A:HREF='a' xmlns:B='urn:b'>")))
2584   (assert (equal? `((,urn-b . '"TAG1") ((('"UA" . '"HREF") . "a")
2585                                         ((,urn-b . '"SRC") . "b"))
2586                     ,(cons `('"B" ,urn-b . ,urn-b) namespaces) ANY)
2587                   (test "B:TAG1" #f 
2588                         "B:SRC='b' A:HREF='a' xmlns:B='urn:b'>")))
2589   (assert (equal? `((,urn-b . '"TAG1") ((('"UA" . '"HREF") . "a")
2590                                         ((,urn-b . '"HREF") . "b"))
2591                          ,(cons `('"B" ,urn-b . ,urn-b) namespaces) ANY)
2592                   (test "B:TAG1" #f 
2593                         "B:HREF=\"b\" A:HREF='a' xmlns:B='urn:b'>")))
2594   ; must be an error! Duplicate attr
2595   (assert (failed? (test "B:TAG1" #f
2596                          "HREF=\"b\" HREF='a' xmlns:B='urn:a'/>")))
2597   ; must be an error! Duplicate attr after ns expansion
2598   (assert (failed? (test "B:TAG1" #f 
2599                          "B:HREF=\"b\" A:HREF='a' xmlns:B='urn:a'/>")))
2600   (assert (equal? `(('"UA" . '"TAG1") (('"HREF" . "a")
2601                                        (('"UA" . '"HREF") . "b"))
2602                     ,(cons `(*DEFAULT* '"UA" . ,urn-a) namespaces) ANY)
2603                   (test "TAG1" #f 
2604                         "A:HREF=\"b\" HREF='a' xmlns='urn:a'>")))
2605   (assert (equal? `('"TAG1" ((('"UHTML" . '"HREF") . "a")
2606                              ((,urn-b . '"HREF") . "b"))
2607                     ,(append `(
2608                         ('"HTML" '"UHTML" . ,urn-html)
2609                         ('"B" ,urn-b . ,urn-b))
2610                              namespaces) ANY)
2611                   (test "TAG1" #f 
2612                         "B:HREF=\"b\" xmlns:B='urn:b' xmlns:HTML='http://w3c.org/html' HTML:HREF='a' >")))
2613
2614   ; Now test the validating parsing
2615   ; No decl for tag1
2616   (assert (failed? (test "TAG1" '((TAG2 ANY ()))
2617                          "B:HREF='b' xmlns:B='urn:b'>")))
2618   ; No decl for HREF elem
2619   (cond-expand
2620    ((not (or scm mit-scheme))  ; Regretfully, SCM treats '() as #f
2621     (assert (failed?
2622              (test "TAG1" '(('"TAG1" ANY ()))
2623                    "B:HREF='b' xmlns:B='urn:b'>"))))
2624    (else #t))
2625   ; No decl for HREF elem
2626   (assert (failed?
2627            (test "TAG1" '(('"TAG1" ANY (('"HREF1" CDATA IMPLIED #f))))
2628            "B:HREF='b' xmlns:B='urn:b'>")))
2629   (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces EMPTY-TAG)
2630       (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA REQUIRED #f))))
2631             "HREF='b'/>")))
2632   (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces PCDATA)
2633       (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA REQUIRED #f))))
2634             "HREF='b'>")))
2635   ; Req'd attribute not given error
2636   (assert (failed? 
2637            (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA REQUIRED #f))))
2638                  ">")))
2639   ; Wrong content-type of the attribute
2640   (assert (failed? 
2641       (test "TAG1" '(('"TAG1" PCDATA (('"HREF" ("c") REQUIRED #f))))
2642             "HREF='b'>")))
2643   (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces PCDATA)
2644       (test "TAG1" '(('"TAG1" PCDATA (('"HREF" ("c" "b") IMPLIED #f))))
2645             "HREF='b'>")))
2646   (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces PCDATA)
2647       (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA IMPLIED "c"))))
2648             "HREF='b'>")))
2649   ; Bad fixed attribute
2650   (assert (failed? 
2651         (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA FIXED "c"))))
2652               "HREF='b'>")))
2653   (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces PCDATA)
2654       (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA FIXED "b"))))
2655             "HREF='b'>")))
2656   (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces PCDATA)
2657       (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA FIXED "b")))) ">")))
2658   (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces PCDATA)
2659       (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA IMPLIED "b")))) ">")))
2660   (assert (equal? `('"TAG1" () ,namespaces PCDATA)
2661       (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA IMPLIED #f)))) ">")))
2662   ; Undeclared attr
2663   (assert (failed? 
2664        (test "TAG1"
2665              '(('"TAG1" PCDATA ((('"A" . '"HREF") CDATA IMPLIED "c"))))
2666              "HREF='b'>")))
2667   (assert (equal? `('"TAG1" (('"HREF" . "b") (('"UA" . '"HREF") . "c"))
2668                          ,namespaces PCDATA)
2669       (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA REQUIRED #f)
2670                                       (('"A" . '"HREF") CDATA IMPLIED "c"))))
2671             "HREF='b'>")))
2672   (assert (equal? `(('"UA" . '"TAG1")
2673                     (('"HREF" . "b") (('"UA" . '"HREF") . "c"))
2674                     ,namespaces PCDATA)
2675       (test "A:TAG1" '((('"A" . '"TAG1") PCDATA
2676                         (('"HREF" NMTOKEN REQUIRED #f)
2677                          (('"A" . '"HREF") CDATA IMPLIED "c"))))
2678             "HREF='b'>")))
2679   (assert (equal? `((,urn-b . '"TAG1") (('"HREF" . "b"))
2680                     ,(cons `('"B" ,urn-b . ,urn-b) namespaces) PCDATA)
2681       (test "B:TAG1" '((('"B" . '"TAG1") PCDATA (('"HREF" CDATA REQUIRED #f)
2682                           (('"xmlns" . '"B") CDATA IMPLIED "urn:b"))))
2683             "HREF='b'>")))
2684   (assert (equal? `((,urn-b . '"TAG1") (((,urn-b . '"HREF") . "b"))
2685                          ,(cons `('"B" ,urn-b . ,urn-b) namespaces) PCDATA)
2686       (test "B:TAG1" '((('"B" . '"TAG1") PCDATA
2687                         ((('"B" . '"HREF") CDATA REQUIRED #f)
2688                          (('"xmlns" . '"B") CDATA IMPLIED "urn:b"))))
2689             "B:HREF='b'>")))
2690   (assert (equal? `((,urn-b . '"TAG1") (('"HREF" . "b"))
2691                     ,(cons `(*DEFAULT* ,urn-b . ,urn-b) namespaces) PCDATA)
2692       (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA REQUIRED #f)
2693                           ('"xmlns" CDATA IMPLIED "urn:b"))))
2694             "HREF='b'>")))
2695   ; xmlns not declared
2696   (assert (equal? `((,urn-b . '"TAG1") (('"HREF" . "b"))
2697                     ,(cons `(*DEFAULT* ,urn-b . ,urn-b) namespaces) PCDATA)
2698       (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA REQUIRED #f)
2699                           )))
2700             "HREF='b' xmlns='urn:b'>")))
2701   ; xmlns:B not declared
2702   (assert (equal? `((,urn-b . '"TAG1") (((,urn-b . '"HREF") . "b"))
2703                     ,(cons `('"B" ,urn-b . ,urn-b) namespaces) PCDATA)
2704       (test "B:TAG1" '((('"B" . '"TAG1") PCDATA
2705                         ((('"B" . '"HREF") CDATA REQUIRED #f)
2706                           )))
2707             "B:HREF='b' xmlns:B='urn:b'>")))
2708))
2709
2710; procedure+:   SSAX:read-external-ID PORT
2711;
2712; This procedure parses an ExternalID production:
2713; [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2714;               | 'PUBLIC' S PubidLiteral S SystemLiteral
2715; [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2716; [12] PubidLiteral ::=  '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2717; [13] PubidChar ::=  #x20 | #xD | #xA | [a-zA-Z0-9]
2718;                         | [-'()+,./:=?;!*#@$_%]
2719;
2720; This procedure is supposed to be called when an ExternalID is expected;
2721; that is, the current character must be either #\S or #\P that start
2722; correspondingly a SYSTEM or PUBLIC token. This procedure returns the
2723; SystemLiteral as a string. A PubidLiteral is disregarded if present.
2724 
2725(define (SSAX:read-external-ID port)
2726  (let ((discriminator (SSAX:read-NCName port)))
2727    (assert-curr-char SSAX:S-chars "space after SYSTEM or PUBLIC" port)
2728    (SSAX:skip-S port)
2729    (let ((delimiter 
2730          (assert-curr-char '(#\' #\" ) "XML [11], XML [12]" port)))
2731      (cond
2732        ((eq? discriminator (string->symbol "SYSTEM"))
2733          (begin0
2734            (next-token '() (list delimiter) "XML [11]" port)
2735            (read-char port)    ; reading the closing delim
2736            ))
2737         ((eq? discriminator (string->symbol "PUBLIC"))
2738           (skip-until (list delimiter) port)
2739           (assert-curr-char SSAX:S-chars "space after PubidLiteral" port)
2740           (SSAX:skip-S port)
2741           (let* ((delimiter 
2742                  (assert-curr-char '(#\' #\" ) "XML [11]" port))
2743                  (systemid
2744                    (next-token '() (list delimiter) "XML [11]" port)))
2745                (read-char port)        ; reading the closing delim
2746                systemid))
2747         (else
2748           (parser-error port "XML [75], " discriminator 
2749                  " rather than SYSTEM or PUBLIC"))))))
2750
2751
2752;-----------------------------------------------------------------------------
2753;                       Higher-level parsers and scanners
2754;
2755; They parse productions corresponding to the whole (document) entity
2756; or its higher-level pieces (prolog, root element, etc).
2757
2758
2759; Scan the Misc production in the context
2760; [1]  document ::=  prolog element Misc*
2761; [22] prolog ::= XMLDecl? Misc* (doctypedec l Misc*)?
2762; [27] Misc ::= Comment | PI |  S
2763;
2764; The following function should be called in the prolog or epilog contexts.
2765; In these contexts, whitespaces are completely ignored.
2766; The return value from SSAX:scan-Misc is either a PI-token,
2767; a DECL-token, a START token, or EOF.
2768; Comments are ignored and not reported.
2769
2770(define (SSAX:scan-Misc port)
2771  (let loop ((c (SSAX:skip-S port)))
2772    (cond
2773      ((eof-object? c) c)
2774      ((not (char=? c #\<))
2775        (parser-error port "XML [22], char '" c "' unexpected"))
2776      (else
2777        (let ((token (SSAX:read-markup-token port)))
2778          (case (xml-token-kind token)
2779            ((COMMENT) (loop (SSAX:skip-S port)))
2780            ((PI DECL START) token)
2781            (else
2782              (parser-error port "XML [22], unexpected token of kind "
2783                     (xml-token-kind token)
2784                     ))))))))
2785
2786; procedure+:   SSAX:read-char-data PORT EXPECT-EOF? STR-HANDLER SEED
2787;
2788; This procedure is to read the character content of an XML document
2789; or an XML element.
2790; [43] content ::=
2791;       (element | CharData | Reference | CDSect | PI
2792;       | Comment)*
2793; To be more precise, the procedure reads CharData, expands CDSect
2794; and character entities, and skips comments. The procedure stops
2795; at a named reference, EOF, at the beginning of a PI or a start/end tag.
2796;
2797; port
2798;       a PORT to read
2799; expect-eof?
2800;       a boolean indicating if EOF is normal, i.e., the character
2801;       data may be terminated by the EOF. EOF is normal
2802;       while processing a parsed entity.
2803; str-handler
2804;       a STR-HANDLER
2805; seed
2806;       an argument passed to the first invocation of STR-HANDLER.
2807;
2808; The procedure returns two results: SEED and TOKEN.
2809; The SEED is the result of the last invocation of STR-HANDLER, or the
2810; original seed if STR-HANDLER was never called.
2811;
2812; TOKEN can be either an eof-object (this can happen only if
2813; expect-eof? was #t), or:
2814;     - an xml-token describing a START tag or an END-tag;
2815;       For a start token, the caller has to finish reading it.
2816;     - an xml-token describing the beginning of a PI. It's up to an
2817;       application to read or skip through the rest of this PI;
2818;     - an xml-token describing a named entity reference.
2819;
2820; CDATA sections and character references are expanded inline and
2821; never returned. Comments are silently disregarded.
2822;
2823; As the XML Recommendation requires, all whitespace in character data
2824; must be preserved. However, a CR character (#xD) must be disregarded
2825; if it appears before a LF character (#xA), or replaced by a #xA character
2826; otherwise. See Secs. 2.10 and 2.11 of the XML Recommendation. See also
2827; the canonical XML Recommendation.
2828
2829        ; SSAX:read-char-data port expect-eof? str-handler seed
2830(define SSAX:read-char-data
2831 (let
2832     ((terminators-usual '(#\< #\& #\return))
2833      (terminators-usual-eof '(#\< *eof* #\& #\return))
2834
2835      (handle-fragment
2836       (lambda (fragment str-handler seed)
2837         (if (string-null? fragment) seed
2838             (str-handler fragment "" seed))))
2839      )
2840
2841   (lambda (port expect-eof? str-handler seed)
2842
2843     ; Very often, the first character we encounter is #\<
2844     ; Therefore, we handle this case in a special, fast path
2845     (if (eqv? #\< (peek-char port))
2846
2847         ; The fast path
2848         (let ((token (SSAX:read-markup-token port)))
2849           (case (xml-token-kind token)
2850             ((START END)       ; The most common case
2851              (values seed token))
2852             ((CDSECT)
2853              (let ((seed (SSAX:read-CDATA-body port str-handler seed)))
2854                (SSAX:read-char-data port expect-eof? str-handler seed)))
2855             ((COMMENT) (SSAX:read-char-data port expect-eof?
2856                                             str-handler seed))
2857             (else
2858              (values seed token))))
2859
2860
2861         ; The slow path
2862         (let ((char-data-terminators
2863                (if expect-eof? terminators-usual-eof terminators-usual)))
2864
2865           (let loop ((seed seed))
2866             (let* ((fragment
2867                     (next-token '() char-data-terminators 
2868                                 "reading char data" port))
2869                    (term-char (peek-char port)) ; one of char-data-terminators
2870                    )
2871               (if (eof-object? term-char)
2872                   (values
2873                    (handle-fragment fragment str-handler seed)
2874                    term-char)
2875                   (case term-char
2876                     ((#\<)
2877                      (let ((token (SSAX:read-markup-token port)))
2878                        (case (xml-token-kind token)
2879                          ((CDSECT)
2880                           (loop
2881                            (SSAX:read-CDATA-body port str-handler
2882                                (handle-fragment fragment str-handler seed))))
2883                          ((COMMENT)
2884                           (loop (handle-fragment fragment str-handler seed)))
2885                          (else
2886                           (values
2887                            (handle-fragment fragment str-handler seed)
2888                            token)))))
2889                     ((#\&)
2890                      (case (peek-next-char port)
2891                        ((#\#) (read-char port) 
2892                         (loop (str-handler fragment
2893                                       (string (SSAX:read-char-ref port))
2894                                       seed)))
2895                        (else
2896                         (let ((name (SSAX:read-NCName port)))
2897                           (assert-curr-char '(#\;) "XML [68]" port)
2898                           (values
2899                            (handle-fragment fragment str-handler seed)
2900                            (make-xml-token 'ENTITY-REF name))))))
2901                     (else              ; This must be a CR character
2902                      (if (eqv? (peek-next-char port) #\newline)
2903                          (read-char port))
2904                      (loop (str-handler fragment (string #\newline) seed))))
2905                   ))))))))
2906
2907
2908; a few lines of validation code
2909(run-test (letrec
2910  ((a-tag (make-xml-token 'START (string->symbol "BR")))
2911   (a-ref (make-xml-token 'ENTITY-REF (string->symbol "lt")))
2912   (eof-object (with-input-from-string "" read))
2913   (str-handler (lambda (fragment foll-fragment seed)
2914     (if (string-null? foll-fragment) (cons fragment seed)
2915         (cons* foll-fragment fragment seed))))
2916   (test (lambda (str expect-eof? expected-data expected-token)
2917           (display "\nbody: ") (write str) (display "\nResult: ")
2918          (let-values*
2919           (((seed token)
2920             (call-with-input-string str
2921                (lambda (port)
2922                 (SSAX:read-char-data port expect-eof? str-handler '()))))
2923            (result (reverse seed)))
2924           (write result)
2925           (display " ")
2926           (display token)
2927           (assert (equal? result expected-data)
2928                   (equal? token expected-token)))))
2929   )
2930  (test "" #t '() eof-object)
2931  (assert (failed? (test "" #f '() eof-object)))
2932  (test "  " #t '("  ") eof-object)
2933  (test "<BR/>" #f '() a-tag)
2934  (test " <BR  />" #f '(" ") a-tag)
2935
2936  (test " &lt;" #f '(" ") a-ref)
2937  (test " a&lt;" #f '(" a") a-ref)
2938  (test " a &lt;" #f '(" a ") a-ref)
2939
2940  (test " <!-- comment--> a  a<BR/>" #f '(" " " a  a") a-tag)
2941  (test " <!-- comment-->\ra  a<BR/>" #f '(" " "" "\n" "a  a") a-tag)
2942  (test " <!-- comment-->\r\na  a<BR/>" #f '(" " "" "\n" "a  a") a-tag)
2943  (test " <!-- comment-->\r\na\t\r\r\na<BR/>" #f
2944        '(" " "" "\n" "a\t" "\n" "" "\n" "a") a-tag)
2945  (test "a<!-- comment--> a  a<BR/>" #f '("a" " a  a") a-tag)
2946  (test "&#x21;<BR/>" #f '("" "!") a-tag)
2947  (test "&#x21;\n<BR/>" #f '("" "!" "\n") a-tag)
2948  (test "\t&#x21;\n<BR/>" #f '("\t" "!" "\n") a-tag)
2949  (test "\t&#x21;\na a<BR/>" #f '("\t" "!" "\na a") a-tag)
2950  (test "\t&#x21;\ra a<BR/>" #f '("\t" "!" "" "\n" "a a") a-tag)
2951
2952  (test " \ta &#x21;   b <BR/>" #f '(" \ta " "!" "   b ") a-tag)
2953  (test " \ta &#x20;   b <BR/>" #f '(" \ta " " " "   b ") a-tag)
2954
2955  (test "<![CDATA[<]]><BR/>" #f '("<") a-tag)
2956  (test "<![CDATA[]]]><BR/>" #f '("]") a-tag)
2957  (test "\t<![CDATA[<]]><BR/>" #f '("\t" "<") a-tag)
2958  (test "\t<![CDATA[<]]>a b<BR/>" #f '("\t" "<" "a b") a-tag)
2959  (test "\t<![CDATA[<]]>  a b<BR/>" #f '("\t" "<" "  a b") a-tag)
2960
2961  (test "\td <![CDATA[  <\r\r\n]]>  a b<BR/>" #f 
2962        '("\td " "  <" "\n" "" "\n" "  a b") a-tag)
2963))
2964
2965
2966
2967; procedure+:   SSAX:assert-token TOKEN KIND GI
2968; Make sure that TOKEN is of anticipated KIND and has anticipated GI
2969; Note GI argument may actually be a pair of two symbols, Namespace
2970; URI or the prefix, and of the localname.
2971; If the assertion fails, error-cont is evaluated by passing it
2972; three arguments: token kind gi. The result of error-cont is returned.
2973(define (SSAX:assert-token token kind gi error-cont)
2974  (or
2975    (and (xml-token? token)
2976      (eq? kind (xml-token-kind token))
2977      (equal? gi (xml-token-head token)))
2978    (error-cont token kind gi)))
2979
Note: See TracBrowser for help on using the repository browser.