source: project/release/3/ssax/ssax.scm @ 13362

Last change on this file since 13362 was 1, checked in by azul, 15 years ago

Import everything.

File size: 15.7 KB
Line 
1;========================================================================
2;               Highest-level parsers: XML to SXML
3
4(define-macro (let-values* bindings . body)
5  (if (null? bindings) (cons 'begin body)
6      (apply (lambda (vars initializer)
7         (let ((cont 
8                (cons 'let-values* 
9                      (cons (cdr bindings) body))))
10           (cond
11            ((not (pair? vars))         ; regular let case, a single var
12             `(let ((,vars ,initializer)) ,cont))
13            ((null? (cdr vars))         ; single var, see the prev case
14             `(let ((,(car vars) ,initializer)) ,cont))
15           (else                        ; the most generic case
16            `(receive ,vars ,initializer ,cont)))))
17       (car bindings))))
18
19; These parsers are a set of syntactic forms to instantiate a SSAX parser.
20; A user can instantiate the parser to do the full validation, or
21; no validation, or any particular validation. The user specifies
22; which PI he wants to be notified about. The user tells what to do
23; with the parsed character and element data. The latter handlers
24; determine if the parsing follows a SAX or a DOM model.
25
26; syntax: SSAX:make-pi-parser my-pi-handlers
27; Create a parser to parse and process one Processing Element (PI).
28
29; my-pi-handlers
30;       An assoc list of pairs (PI-TAG . PI-HANDLER)
31;       where PI-TAG is an NCName symbol, the PI target, and
32;       PI-HANDLER is a procedure PORT PI-TAG SEED
33;       where PORT points to the first symbol after the PI target.
34;       The handler should read the rest of the PI up to and including
35;       the combination '?>' that terminates the PI. The handler should
36;       return a new seed.
37;       One of the PI-TAGs may be a symbol *DEFAULT*. The corresponding
38;       handler will handle PIs that no other handler will. If the
39;       *DEFAULT* PI-TAG is not specified, SSAX:make-pi-parser will make
40;       one, which skips the body of the PI
41;       
42; The output of the SSAX:make-pi-parser is a procedure
43;       PORT PI-TAG SEED
44; that will parse the current PI accoding to user-specified handlers.
45
46(define-macro SSAX:make-pi-parser
47  (lambda (my-pi-handlers)
48  `(lambda (port target seed)
49    (case target
50        ; Generate the body of the case statement
51      ,@(let loop ((pi-handlers my-pi-handlers) (default #f))
52         (cond
53          ((null? pi-handlers)
54           (if default `((else (,default port target seed)))
55               '((else
56                  (SSAX:warn port "\nSkipping PI: " target "\n")
57                  (SSAX:skip-pi port)
58                  seed))))
59          ((eq? '*DEFAULT* (caar pi-handlers))
60           (loop (cdr pi-handlers) (cdar pi-handlers)))
61          (else
62           (cons
63            `((,(caar pi-handlers)) (,(cdar pi-handlers) port target seed))
64            (loop (cdr pi-handlers) default)))))))))
65
66
67; syntax: SSAX:make-elem-parser my-new-level-seed my-finish-element
68;                               my-char-data-handler my-pi-handlers
69
70; Create a parser to parse and process one element, including its
71; character content or children elements. The parser is typically
72; applied to the root element of a document.
73
74; my-new-level-seed
75;       procedure ELEM-GI ATTRIBUTES NAMESPACES EXPECTED-CONTENT SEED
76;               where ELEM-GI is a RES-NAME of the element
77;               about to be processed.
78;       This procedure is to generate the seed to be passed
79;       to handlers that process the content of the element.
80;       This is the function identified as 'fdown' in the denotational
81;       semantics of the XML parser given in the title comments to this
82;       file.
83;
84; my-finish-element
85;       procedure ELEM-GI ATTRIBUTES NAMESPACES PARENT-SEED SEED
86;       This procedure is called when parsing of ELEM-GI is finished.
87;       The SEED is the result from the last content parser (or
88;       from my-new-level-seed if the element has the empty content).
89;       PARENT-SEED is the same seed as was passed to my-new-level-seed.
90;       The procedure is to generate a seed that will be the result
91;       of the element parser.
92;       This is the function identified as 'fup' in the denotational
93;       semantics of the XML parser given in the title comments to this
94;       file.
95;
96; my-char-data-handler
97;       A STR-HANDLER
98;
99; my-pi-handlers
100;       See SSAX:make-pi-handler above
101;
102
103; The generated parser is a
104;       procedure START-TAG-HEAD PORT ELEMS ENTITIES
105;       NAMESPACES PRESERVE-WS? SEED
106; The procedure must be called after the start tag token has been
107; read. START-TAG-HEAD is an UNRES-NAME from the start-element tag.
108; ELEMS is an instance of xml-decl::elems.
109; See SSAX:complete-start-tag::preserve-ws?
110
111; Faults detected:
112;       VC: XML-Spec.html#elementvalid
113;       WFC: XML-Spec.html#GIMatch
114
115
116
117(define-macro SSAX:make-elem-parser
118  (lambda (my-new-level-seed my-finish-element
119           my-char-data-handler my-pi-handlers)
120 
121  `(lambda (start-tag-head port elems entities namespaces
122                           preserve-ws? seed)
123
124     (define xml-space-gi (cons SSAX:Prefix-XML
125                                (string->symbol "space")))
126
127     (let handle-start-tag ((start-tag-head start-tag-head)
128                            (port port) (entities entities)
129                            (namespaces namespaces)
130                            (preserve-ws? preserve-ws?) (parent-seed seed))
131       (let-values*
132        (((elem-gi attributes namespaces expected-content)
133          (SSAX:complete-start-tag start-tag-head port elems
134                                   entities namespaces))
135         (seed
136          (,my-new-level-seed elem-gi attributes
137                              namespaces expected-content parent-seed)))
138        (case expected-content
139          ((EMPTY-TAG)
140           (,my-finish-element
141            elem-gi attributes namespaces parent-seed seed))
142          ((EMPTY)              ; The end tag must immediately follow
143           (SSAX:assert-token 
144            (and (eqv? #\< (SSAX:skip-S port)) (SSAX:read-markup-token port))
145            'END  start-tag-head
146            (lambda (token exp-kind exp-head)
147              (parser-error port "[elementvalid] broken for " token 
148                     " while expecting "
149                     exp-kind exp-head)))
150           (,my-finish-element
151            elem-gi attributes namespaces parent-seed seed))
152          (else         ; reading the content...
153           (let ((preserve-ws?  ; inherit or set the preserve-ws? flag
154                  (cond
155                   ((assoc xml-space-gi attributes) =>
156                    (lambda (name-value)
157                      (equal? "preserve" (cdr name-value))))
158                   (else preserve-ws?))))
159             (let loop ((port port) (entities entities)
160                        (expect-eof? #f) (seed seed))
161               (let-values*
162                (((seed term-token)
163                  (SSAX:read-char-data port expect-eof?
164                                       ,my-char-data-handler seed)))
165                (if (eof-object? term-token)
166                    seed
167                    (case (xml-token-kind term-token)
168                      ((END)
169                       (SSAX:assert-token term-token 'END  start-tag-head
170                          (lambda (token exp-kind exp-head)
171                            (parser-error port "[GIMatch] broken for "
172                                   term-token " while expecting "
173                                   exp-kind exp-head)))
174                       (,my-finish-element
175                        elem-gi attributes namespaces parent-seed seed))
176                      ((PI)
177                       (let ((seed 
178                          ((SSAX:make-pi-parser ,my-pi-handlers)
179                           port (xml-token-head term-token) seed)))
180                         (loop port entities expect-eof? seed)))
181                      ((ENTITY-REF)
182                       (let ((seed
183                              (SSAX:handle-parsed-entity
184                               port (xml-token-head term-token)
185                               entities
186                               (lambda (port entities seed)
187                                 (loop port entities #t seed))
188                               ,my-char-data-handler
189                               seed))) ; keep on reading the content after ent
190                         (loop port entities expect-eof? seed)))
191                      ((START)          ; Start of a child element
192                       (if (eq? expected-content 'PCDATA)
193                           (parser-error port "[elementvalid] broken for "
194                                  elem-gi
195                                  " with char content only; unexpected token "
196                                  term-token))
197                           ; Do other validation of the element content
198                           (let ((seed
199                                  (handle-start-tag
200                                     (xml-token-head term-token)
201                                     port entities namespaces
202                                     preserve-ws? seed)))
203                             (loop port entities expect-eof? seed)))
204                      (else
205                       (parser-error port "XML [43] broken for "
206                                     term-token))))))))
207          )))
208)))
209
210; syntax: SSAX:make-parser user-handler-tag user-handler-proc ...
211;
212; Create an XML parser, an instance of the XML parsing framework.
213; This will be a SAX, a DOM, or a specialized parser depending
214; on the supplied user-handlers.
215
216; user-handler-tag is a symbol that identifies a procedural expression
217; that follows the tag. Given below are tags and signatures of the
218; corresponding procedures. Not all tags have to be specified. If some
219; are omitted, reasonable defaults will apply.
220;
221
222; tag: DOCTYPE
223; handler-procedure: PORT DOCNAME SYSTEMID INTERNAL-SUBSET? SEED
224; If internal-subset? is #t, the current position in the port
225; is right after we have read #\[ that begins the internal DTD subset.
226; We must finish reading of this subset before we return
227; (or must call skip-internal-subset if we aren't interested in reading it).
228; The port at exit must be at the first symbol after the whole
229; DOCTYPE declaration.
230; The handler-procedure must generate four values:
231;       ELEMS ENTITIES NAMESPACES SEED
232; See xml-decl::elems for ELEMS. It may be #f to switch off the validation.
233; NAMESPACES will typically contain USER-PREFIXes for selected URI-SYMBs.
234; The default handler-procedure skips the internal subset,
235; if any, and returns (values #f '() '() seed)
236
237; tag: UNDECL-ROOT
238; handler-procedure: ELEM-GI SEED
239; where ELEM-GI is an UNRES-NAME of the root element. This procedure
240; is called when an XML document under parsing contains _no_ DOCTYPE
241; declaration.
242; The handler-procedure, as a DOCTYPE handler procedure above,
243; must generate four values:
244;       ELEMS ENTITIES NAMESPACES SEED
245; The default handler-procedure returns (values #f '() '() seed)
246
247; tag: DECL-ROOT
248; handler-procedure: ELEM-GI SEED
249; where ELEM-GI is an UNRES-NAME of the root element. This procedure
250; is called when an XML document under parsing does contains the DOCTYPE
251; declaration.
252; The handler-procedure must generate a new SEED (and verify
253; that the name of the root element matches the doctype, if the handler
254; so wishes).
255; The default handler-procedure is the identity function.
256
257; tag: NEW-LEVEL-SEED
258; handler-procedure: see SSAX:make-elem-parser, my-new-level-seed
259
260; tag: FINISH-ELEMENT
261; handler-procedure: see SSAX:make-elem-parser, my-finish-element
262
263; tag: CHAR-DATA-HANDLER
264; handler-procedure: see SSAX:make-elem-parser, my-char-data-handler
265
266; tag: PI
267; handler-procedure: see SSAX:make-pi-parser
268; The default value is '()
269 
270; The generated parser is a
271;       procedure PORT SEED
272
273; This procedure parses the document prolog and then exits to
274; an element parser (created by SSAX:make-elem-parser) to handle
275; the rest.
276;
277; [1]  document ::=  prolog element Misc*
278; [22] prolog ::= XMLDecl? Misc* (doctypedec | Misc*)?
279; [27] Misc ::= Comment | PI |  S
280;
281; [28] doctypedecl ::=  '<!DOCTYPE' S Name (S ExternalID)? S?
282;                       ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
283; [29] markupdecl ::= elementdecl | AttlistDecl
284;                      | EntityDecl
285;                      | NotationDecl | PI
286;                      | Comment
287;
288
289
290(define-macro SSAX:make-parser
291  (lambda user-handlers
292
293  ; An assoc list of user-handler-tag and default handlers
294  (define all-handlers
295    '((DOCTYPE .
296        (lambda (port docname systemid internal-subset? seed)
297          (when internal-subset?
298              (SSAX:warn port "Internal DTD subset is not currently handled ")
299              (SSAX:skip-internal-dtd port))
300          (SSAX:warn port "DOCTYPE DECL " docname " " 
301                systemid " found and skipped")
302          (values #f '() '() seed)
303          ))
304      (UNDECL-ROOT .
305        (lambda (elem-gi seed) (values #f '() '() seed)))
306      (DECL-ROOT .
307        (lambda (elem-gi seed) seed))
308      (NEW-LEVEL-SEED . REQD)   ; required
309      (FINISH-ELEMENT . REQD)   ; required
310      (CHAR-DATA-HANDLER . REQD)        ; required
311      (PI . ())
312      ))
313
314  ; Delete an association with the tag from alist
315  ; exit to cont, passing the tag, tag-association, and the list
316  ; of remaining associations.
317  ; It's an error if the association with the tag does not exist
318  ; in alist
319  (define (delete-assoc alist tag cont)
320    (let loop ((alist alist) (scanned '()))
321      (cond
322       ((null? alist) (error "Unknown user-handler-tag: " tag))
323       ((eq? tag (caar alist))
324        (cont tag (cdar alist) (append scanned (cdr alist))))
325       (else (loop (cdr alist) (cons (car alist) scanned))))))
326
327  ; create an assoc list of tags and handlers
328  ; based on the defaults and on the given handlers
329  (define (merge-handlers declared-handlers given-handlers)
330    (cond
331     ((null? given-handlers)            ; the arguments are exhausted...
332      (cond
333       ((null? declared-handlers) '())
334       ((not (eq? 'REQD (cdar declared-handlers))) ; default value was given:
335        (cons (car declared-handlers)              ; use it
336              (merge-handlers (cdr declared-handlers) given-handlers)))
337       (else (error "The handler for the tag " (caar declared-handlers)
338                    " must be specified"))))
339     ((null? (cdr given-handlers))
340      (error "Odd number of arguments to SSAX:make-parser"))
341     (else
342      (delete-assoc declared-handlers (car given-handlers)
343          (lambda (tag value alist)
344            (cons (cons tag (cadr given-handlers))
345                  (merge-handlers alist (cddr given-handlers))))))))
346
347  (let ((user-handlers (merge-handlers all-handlers user-handlers)))
348
349    (define (get-handler tag)
350      (cond
351       ((assq tag user-handlers) => cdr)
352       (else (error "unknown tag: " tag))))
353
354
355  `(lambda (port seed)
356
357     ; We must've just scanned the DOCTYPE token
358     ; Handle the doctype declaration and exit to
359     ; scan-for-significant-prolog-token-2, and eventually, to the
360     ; element parser.
361     (define (handle-decl port token-head seed)
362       (or (eq? (string->symbol "DOCTYPE") token-head)
363           (parser-error port "XML [22], expected DOCTYPE declaration, found "
364                  token-head))
365       (assert-curr-char SSAX:S-chars "XML [28], space after DOCTYPE" port)
366       (SSAX:skip-S port)
367       (let-values* 
368        ((docname (SSAX:read-QName port))
369         (systemid
370          (and (SSAX:ncname-starting-char? (SSAX:skip-S port))
371               (SSAX:read-external-ID port)))
372         (internal-subset?
373          (begin (SSAX:skip-S port)
374            (eqv? #\[ (assert-curr-char '(#\> #\[)
375                                        "XML [28], end-of-DOCTYPE" port))))
376         ((elems entities namespaces seed)
377          (,(get-handler 'DOCTYPE) port docname systemid
378                            internal-subset? seed))
379         )
380        (scan-for-significant-prolog-token-2 port elems entities namespaces
381                                             seed)))
382
383
384     ; Scan the leading PIs until we encounter either a doctype declaration
385     ; or a start token (of the root element)
386     ; In the latter two cases, we exit to the appropriate continuation
387     (define (scan-for-significant-prolog-token-1 port seed)
388       (let ((token (SSAX:scan-Misc port)))
389         (if (eof-object? token)
390             (parser-error port "XML [22], unexpected EOF")
391             (case (xml-token-kind token)
392               ((PI)
393                (let ((seed 
394                       ((SSAX:make-pi-parser ,(get-handler 'PI))
395                        port (xml-token-head token) seed)))
396                  (scan-for-significant-prolog-token-1 port seed)))
397               ((DECL) (handle-decl port (xml-token-head token) seed))
398               ((START)
399                (let-values*
400                 (((elems entities namespaces seed)
401                   (,(get-handler 'UNDECL-ROOT) (xml-token-head token) seed)))
402                 (element-parser (xml-token-head token) port elems
403                                 entities namespaces #f seed)))
404               (else (parser-error port "XML [22], unexpected markup "
405                                   token))))))
406
407
408     ; Scan PIs after the doctype declaration, till we encounter
409     ; the start tag of the root element. After that we exit
410     ; to the element parser
411     (define (scan-for-significant-prolog-token-2 port elems entities
412                                                  namespaces seed)
413       (let ((token (SSAX:scan-Misc port)))
414         (if (eof-object? token)
415             (parser-error port "XML [22], unexpected EOF")
416             (case (xml-token-kind token)
417               ((PI)
418                (let ((seed 
419                       ((SSAX:make-pi-parser ,(get-handler 'PI))
420                        port (xml-token-head token) seed)))
421                  (scan-for-significant-prolog-token-2 port elems entities
422                                                       namespaces seed)))
423               ((START)
424                (element-parser (xml-token-head token) port elems
425                  entities namespaces #f
426                  (,(get-handler 'DECL-ROOT) (xml-token-head token) seed)))
427               (else (parser-error port "XML [22], unexpected markup "
428                                   token))))))
429
430
431     ; A procedure start-tag-head port elems entities namespaces
432     ;           preserve-ws? seed
433     (define element-parser
434       (SSAX:make-elem-parser ,(get-handler 'NEW-LEVEL-SEED)
435                              ,(get-handler 'FINISH-ELEMENT)
436                              ,(get-handler 'CHAR-DATA-HANDLER)
437                              ,(get-handler 'PI)))
438
439     ; Get the ball rolling ...
440     (scan-for-significant-prolog-token-1 port seed)
441))))
Note: See TracBrowser for help on using the repository browser.