source: project/chicken/trunk/srfi-14.scm @ 15575

Last change on this file since 15575 was 10712, checked in by felix winkelmann, 12 years ago

various tests and improvements

File size: 29.7 KB
Line 
1;;;; srfi-14.scm - Shivers' reference implementation of SRFI-14
2
3
4(declare
5  (unit srfi-14)
6  (fixnum)
7  (disable-interrupts)
8  (standard-bindings)
9  (extended-bindings) 
10  (hide %char-set:s/check %string-iter %char-set-diff+intersection! %char->latin1 %latin1->char
11        %ucs-range->char-set! %string->char-set! %list->char-set! %set-char-set! %char-set-unfold!
12        %char-set-algebra %char-set-cursor-next %char-set-filter! %set-char-set c0 c1 %string-copy
13        %default-base) )
14
15(cond-expand 
16 [paranoia]
17 [else
18  (declare
19    (no-procedure-checks-for-usual-bindings)
20    (bound-to-procedure
21     char-set char-set-complement ucs-range->char-set! ucs-range->char-set char-set-union
22     char-set-adjoin string->char-set list->char-set string-copy make-char-set char-set-copy
23     char-set? char-set-size char-set:s)
24    (no-bound-checks) ) ] )
25
26(include "unsafe-declarations.scm")
27
28(register-feature! 'srfi-14)
29
30
31(define (%latin1->char n) (integer->char n))
32(define (%char->latin1 c) (char->integer c))
33
34
35;;; SRFI-14 character-sets library                              -*- Scheme -*-
36;;;
37;;; - Ported from MIT Scheme runtime by Brian D. Carlstrom.
38;;; - Massively rehacked & extended by Olin Shivers 6/98.
39;;; - Massively redesigned and rehacked 5/2000 during SRFI process.
40;;; At this point, the code bears the following relationship to the
41;;; MIT Scheme code: "This is my grandfather's axe. My father replaced
42;;; the head, and I have replaced the handle." Nonetheless, we preserve
43;;; the MIT Scheme copyright:
44;;;     Copyright (c) 1988-1995 Massachusetts Institute of Technology
45;;; The MIT Scheme license is a "free software" license. See the end of
46;;; this file for the tedious details.
47
48;;; Exports:
49;;; char-set? char-set= char-set<=
50;;; char-set-hash
51;;; char-set-cursor char-set-ref char-set-cursor-next end-of-char-set?
52;;; char-set-fold char-set-unfold char-set-unfold!
53;;; char-set-for-each char-set-map
54;;; char-set-copy
55;;;
56;;; char-set  list->char-set  string->char-set
57;;; char-set! list->char-set! string->char-set!
58;;;
59;;; filterchar-set  ucs-range->char-set  ->char-set
60;;; filterchar-set! ucs-range->char-set!
61;;;
62;;; char-set->list char-set->string
63;;;
64;;; char-set-size char-set-count char-set-contains?
65;;; char-set-every char-set-any
66;;;
67;;; char-set-adjoin  char-set-delete
68;;; char-set-adjoin! char-set-delete!
69;;;
70;;; char-set-complement  char-set-union  char-set-intersection  char-set-difference
71;;; char-set-complement! char-set-union! char-set-intersection! char-set-difference!
72;;;
73;;; char-set-difference char-set-xor  char-set-diff+intersection
74;;; char-set-difference! char-set-xor! char-set-diff+intersection!
75;;;
76;;; char-set:lower-case         char-set:upper-case     char-set:title-case
77;;; char-set:letter             char-set:digit          char-set:letter+digit
78;;; char-set:graphic            char-set:printing       char-set:whitespace
79;;; char-set:iso-control        char-set:punctuation    char-set:symbol
80;;; char-set:hex-digit          char-set:blank          char-set:ascii
81;;; char-set:empty              char-set:full
82
83;;; Imports
84;;; This code has the following non-R5RS dependencies:
85;;; - ERROR
86;;; - %LATIN1->CHAR %CHAR->LATIN1
87;;; - LET-OPTIONALS* and :OPTIONAL macros for parsing, checking & defaulting
88;;;   optional arguments from rest lists.
89;;; - BITWISE-AND for CHAR-SET-HASH
90;;; - The SRFI-19 DEFINE-RECORD-TYPE record macro
91;;; - A simple CHECK-ARG procedure:
92;;;   (lambda (pred val caller) (if (not (pred val)) (error val caller)))
93
94;;; This is simple code, not great code. Char sets are represented as 256-char
95;;; strings. If char I is ASCII/Latin-1 0, then it isn't in the set; if char I
96;;; is ASCII/Latin-1 1, then it is in the set.
97;;; - Should be rewritten to use bit strings or byte vecs.
98;;; - Is Latin-1 specific. Would certainly have to be rewritten for Unicode.
99
100;;; See the end of the file for porting and performance-tuning notes.
101;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
102
103(define (make-char-set s) (##sys#make-structure 'char-set s))
104(define (char-set:s cs) (##sys#slot cs 1))
105(define (char-set? x) (##sys#structure? x 'char-set))
106
107#|
108(define-record-type :char-set
109  (make-char-set s)
110  char-set?
111  (s char-set:s))
112|#
113
114
115(define (%string-copy s) (substring s 0 (string-length s)))
116
117;;; Parse, type-check & default a final optional BASE-CS parameter from
118;;; a rest argument. Return a *fresh copy* of the underlying string.
119;;; The default is the empty set. The PROC argument is to help us
120;;; generate informative error exceptions.
121
122(define (%default-base maybe-base proc)
123  (if (pair? maybe-base)
124      (let ((bcs  (car maybe-base))
125            (tail (cdr maybe-base)))
126        (if (null? tail)
127            (if (char-set? bcs) (%string-copy (char-set:s bcs))
128                (##sys#error "BASE-CS parameter not a char-set" proc bcs))
129            (##sys#error "Expected final base char set -- too many parameters"
130                   proc maybe-base)))
131      (make-string 256 (%latin1->char 0))))
132
133;;; If CS is really a char-set, do CHAR-SET:S, otw report an error msg on
134;;; behalf of our caller, PROC. This procedure exists basically to provide
135;;; explicit error-checking & reporting.
136
137(define (%char-set:s/check cs proc)
138  (let lp ((cs cs))
139    (if (char-set? cs) (char-set:s cs)
140        (lp (##sys#error proc "Not a char-set" cs)))))
141
142
143
144;;; These internal functions hide a lot of the dependency on the
145;;; underlying string representation of char sets. They should be
146;;; inlined if possible.
147
148(define-inline (si=0? s i) (zero? (%char->latin1 (string-ref s i))))
149(define-inline (si=1? s i) (not (si=0? s i)))
150(define-inline (si s i) (%char->latin1 (string-ref s i)))
151(define-inline (%set0! s i) (string-set! s i c0))
152(define-inline (%set1! s i) (string-set! s i c1))
153
154(define c0 (%latin1->char 0))
155(define c1 (%latin1->char 1))
156
157;;; These do various "s[i] := s[i] op val" operations -- see
158;;; %CHAR-SET-ALGEBRA. They are used to implement the various
159;;; set-algebra procedures.
160(define-inline (setv!   s i v) (string-set! s i (%latin1->char v))) ; SET to a Value.
161(define-inline (%not!   s i v) (setv! s i (- 1 v)))
162(define-inline (%and!   s i v) (if (zero? v) (%set0! s i)))
163(define-inline (%or!    s i v) (if (not (zero? v)) (%set1! s i)))
164(define-inline (%minus! s i v) (if (not (zero? v)) (%set0! s i)))
165(define-inline (%xor!   s i v) (if (not (zero? v)) (setv! s i (- 1 (si s i)))))
166
167(define (char-set-copy cs)
168  (make-char-set (%string-copy (%char-set:s/check cs 'char-set-copy))))
169
170(define char-set= 
171  (lambda rest
172    (or (null? rest)
173        (let* ((cs1  (car rest))
174               (rest (cdr rest))
175               (s1 (%char-set:s/check cs1 'char-set=)))
176          (let lp ((rest rest))
177            (or (not (pair? rest))
178                (and (string=? s1 (%char-set:s/check (car rest) 'char-set=))
179                     (lp (cdr rest)))))))))
180
181(define char-set<=
182  (lambda rest
183    (or (null? rest)
184        (let ((cs1  (car rest))
185              (rest (cdr rest)))
186          (let lp ((s1 (%char-set:s/check cs1 'char-set<=))  (rest rest))
187            (or (not (pair? rest))
188                (let ((s2 (%char-set:s/check (car rest) 'char-set<=))
189                      (rest (cdr rest)))
190                  (if (eq? s1 s2) (lp s2 rest) ; Fast path
191                      (let lp2 ((i 255))        ; Real test
192                        (if (< i 0) (lp s2 rest)
193                            (and (<= (si s1 i) (si s2 i))
194                                 (lp2 (- i 1))))))))))) ))
195
196;;; Hash
197;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
198;;; Compute (c + 37 c + 37^2 c + ...) modulo BOUND.
199;;; If you keep BOUND small enough, the intermediate calculations will
200;;; always be fixnums. How small is dependent on the underlying Scheme system;
201;;; we use a default BOUND of 2^22 = 4194304, which should hack it in
202;;; Schemes that give you at least 29 signed bits for fixnums. The core
203;;; calculation that you don't want to overflow is, worst case,
204;;;     (+ 65535 (* 37 (- bound 1)))
205;;; where 65535 is the max character code. Choose the default BOUND to be the
206;;; biggest power of two that won't cause this expression to fixnum overflow,
207;;; and everything will be copacetic.
208
209(define (char-set-hash cs . maybe-bound)
210  (let ((bound (optional maybe-bound 4194304)))
211    (if (zero? bound) (set! bound 4194304))
212    (##sys#check-exact bound 'char-set-hash)
213    (let* ((s (%char-set:s/check cs 'char-set-hash))
214           ;; Compute a 111...1 mask that will cover BOUND-1:
215           (mask (let lp ((i #x10000))  ; Let's skip first 16 iterations, eh?
216                   (if (>= i bound) (- i 1) (lp (+ i i))))))
217      (let lp ((i 255) (ans 0))
218        (if (< i 0) (modulo ans bound)
219            (lp (- i 1)
220                (if (si=0? s i) ans
221                    (fxand mask (+ (* 37 ans) i)))))))) )
222
223
224(define (char-set-contains? cs char)
225  (##sys#check-char char 'char-set-contains?)
226  (si=1? (%char-set:s/check cs 'char-set-contains?)
227;        (%char->latin1 (check-arg char? char char-set-contains?))))
228         (%char->latin1 char) ) )
229
230(define (char-set-size cs)
231  (let ((s (%char-set:s/check cs 'char-set-size)))
232    (let lp ((i 255) (size 0))
233      (if (< i 0) size
234          (lp (- i 1) (+ size (si s i)))))))
235
236(define (char-set-count pred cset)
237;  (check-arg procedure? pred char-set-count)
238  (let ((s (%char-set:s/check cset 'char-set-count)))
239    (let lp ((i 255) (count 0))
240      (if (< i 0) count
241          (lp (- i 1)
242              (if (and (si=1? s i) (pred (%latin1->char i)))
243                  (+ count 1)
244                  count))))))
245
246
247;;; -- Adjoin & delete
248
249(define (%set-char-set set proc cs chars)
250  (let ((s (%string-copy (%char-set:s/check cs proc))))
251    (for-each (lambda (c) (set s (%char->latin1 c)))
252              chars)
253    (make-char-set s)))
254
255(define (%set-char-set! set proc cs chars)
256  (let ((s (%char-set:s/check cs proc)))
257    (for-each (lambda (c) (set s (%char->latin1 c)))
258              chars))
259  cs)
260
261(define (char-set-adjoin cs . chars)
262  (%set-char-set  %set1! 'char-set-adjoin cs chars))
263(define (char-set-adjoin! cs . chars)
264  (%set-char-set! %set1! 'char-set-adjoin! cs chars))
265(define (char-set-delete cs . chars)
266  (%set-char-set  %set0! 'char-set-delete cs chars))
267(define (char-set-delete! cs . chars)
268  (%set-char-set! %set0! 'char-set-delete! cs chars))
269
270
271;;; Cursors
272;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
273;;; Simple implementation. A cursors is an integer index into the
274;;; mark vector, and -1 for the end-of-char-set cursor.
275;;;
276;;; If we represented char sets as a bit set, we could do the following
277;;; trick to pick the lowest bit out of the set:
278;;;   (count-bits (xor (- cset 1) cset))
279;;; (But first mask out the bits already scanned by the cursor first.)
280
281(define (char-set-cursor cset)
282  (%char-set-cursor-next cset 256 'char-set-cursor))
283 
284(define (end-of-char-set? cursor) (< cursor 0))
285
286(define (char-set-ref cset cursor) (%latin1->char cursor))
287
288(define (char-set-cursor-next cset cursor)
289  (##sys#check-exact cursor 'char-set-cursor-next)
290;  (check-arg (lambda (i) (and (integer? i) (exact? i) (<= 0 i 255))) cursor
291;            char-set-cursor-next)
292  (%char-set-cursor-next cset cursor 'char-set-cursor-next))
293
294(define (%char-set-cursor-next cset cursor proc)        ; Internal
295  (let ((s (%char-set:s/check cset proc)))
296    (let lp ((cur cursor))
297      (let ((cur (- cur 1)))
298        (if (or (< cur 0) (si=1? s cur)) cur
299            (lp cur))))))
300
301
302;;; -- for-each map fold unfold every any
303
304(define (char-set-for-each proc cs)
305;  (check-arg procedure? proc char-set-for-each)
306  (let ((s (%char-set:s/check cs 'char-set-for-each)))
307    (let lp ((i 255))
308      (cond ((>= i 0)
309             (if (si=1? s i) (proc (%latin1->char i)))
310             (lp (- i 1)))))))
311
312(define (char-set-map proc cs)
313;  (check-arg procedure? proc char-set-map)
314  (let ((s (%char-set:s/check cs 'char-set-map))
315        (ans (make-string 256 c0)))
316    (let lp ((i 255))
317      (cond ((>= i 0)
318             (if (si=1? s i)
319                 (%set1! ans (%char->latin1 (proc (%latin1->char i)))))
320             (lp (- i 1)))))
321    (make-char-set ans)))
322
323(define (char-set-fold kons knil cs)
324;  (check-arg procedure? kons char-set-fold)
325  (let ((s (%char-set:s/check cs 'char-set-fold)))
326    (let lp ((i 255) (ans knil))
327      (if (< i 0) ans
328          (lp (- i 1)
329              (if (si=0? s i) ans
330                  (kons (%latin1->char i) ans)))))))
331
332(define (char-set-every pred cs)
333;  (check-arg procedure? pred char-set-every)
334  (let ((s (%char-set:s/check cs 'char-set-every)))
335    (let lp ((i 255))
336      (or (< i 0)
337          (and (or (si=0? s i) (pred (%latin1->char i)))
338               (lp (- i 1)))))))
339
340(define (char-set-any pred cs)
341;  (check-arg procedure? pred char-set-any)
342  (let ((s (%char-set:s/check cs 'char-set-any)))
343    (let lp ((i 255))
344      (and (>= i 0)
345           (or (and (si=1? s i) (pred (%latin1->char i)))
346               (lp (- i 1)))))))
347
348
349(define (%char-set-unfold! proc p f g s seed)
350;  (check-arg procedure? p proc)
351;  (check-arg procedure? f proc)
352;  (check-arg procedure? g proc)
353  (let lp ((seed seed))
354    (cond ((not (p seed))                       ; P says we are done.
355           (%set1! s (%char->latin1 (f seed)))  ; Add (F SEED) to set.
356           (lp (g seed))))))                    ; Loop on (G SEED).
357
358(define (char-set-unfold p f g seed . maybe-base)
359  (let ((bs (%default-base maybe-base char-set-unfold)))
360    (%char-set-unfold! char-set-unfold p f g bs seed)
361    (make-char-set bs)))
362
363(define (char-set-unfold! p f g seed base-cset)
364  (%char-set-unfold! char-set-unfold! p f g
365                     (%char-set:s/check base-cset 'char-set-unfold!)
366                     seed)
367  base-cset)
368
369
370
371;;; list <--> char-set
372
373(define (%list->char-set! chars s)
374  (for-each (lambda (char) (%set1! s (%char->latin1 char)))
375            chars))
376
377(define (char-set . chars)
378  (let ((s (make-string 256 c0)))
379    (%list->char-set! chars s)
380    (make-char-set s)))
381
382(define (list->char-set chars . maybe-base)
383  (let ((bs (%default-base maybe-base list->char-set)))
384    (%list->char-set! chars bs)
385    (make-char-set bs)))
386
387(define (list->char-set! chars base-cs)
388  (%list->char-set! chars (%char-set:s/check base-cs 'list->char-set!))
389  base-cs)
390
391
392(define (char-set->list cs)
393  (let ((s (%char-set:s/check cs 'char-set->list)))
394    (let lp ((i 255) (ans '()))
395      (if (< i 0) ans
396          (lp (- i 1)
397              (if (si=0? s i) ans
398                  (cons (%latin1->char i) ans)))))))
399
400
401
402;;; string <--> char-set
403
404(define (%string->char-set! str bs proc)
405  (##sys#check-string str proc)
406;  (check-arg string? str proc)
407  (do ((i (- (string-length str) 1) (- i 1)))
408      ((< i 0))
409    (%set1! bs (%char->latin1 (string-ref str i)))))
410
411(define (string->char-set str . maybe-base)
412  (let ((bs (%default-base maybe-base string->char-set)))
413    (%string->char-set! str bs 'string->char-set)
414    (make-char-set bs)))
415
416(define (string->char-set! str base-cs)
417  (%string->char-set! str (%char-set:s/check base-cs 'string->char-set!)
418                      'string->char-set!)
419  base-cs)
420
421
422(define (char-set->string cs)
423  (let* ((s (%char-set:s/check cs 'char-set->string))
424         (ans (make-string (char-set-size cs))))
425    (let lp ((i 255) (j 0))
426      (if (< i 0) ans
427          (let ((j (if (si=0? s i) j
428                       (begin (string-set! ans j (%latin1->char i))
429                              (+ j 1)))))
430            (lp (- i 1) j))))))
431
432
433;;; -- UCS-range -> char-set
434
435(define (%ucs-range->char-set! lower upper error? bs proc)
436  (##sys#check-exact lower proc)
437  (##sys#check-exact upper proc)
438;  (check-arg (lambda (x) (and (integer? x) (exact? x) (<= 0 x))) lower proc)
439;  (check-arg (lambda (x) (and (integer? x) (exact? x) (<= lower x))) upper proc)
440
441  (if (and (< lower upper) (< 256 upper) error?)
442      (##sys#error "Requested UCS range contains unavailable characters -- this implementation only supports Latin-1"
443             proc lower upper))
444
445  (let lp ((i (- (min upper 256) 1)))
446    (cond ((<= lower i) (%set1! bs i) (lp (- i 1))))))
447
448(define (ucs-range->char-set lower upper . rest)
449  (let-optionals* rest ((error? #f) rest)
450    (let ((bs (%default-base rest ucs-range->char-set)))
451      (%ucs-range->char-set! lower upper error? bs 'ucs-range->char-set)
452      (make-char-set bs))))
453
454(define (ucs-range->char-set! lower upper error? base-cs)
455  (%ucs-range->char-set! lower upper error?
456                         (%char-set:s/check base-cs 'ucs-range->char-set!)
457                         'ucs-range->char-set)
458  base-cs)
459
460
461;;; -- predicate -> char-set
462
463(define (%char-set-filter! pred ds bs proc)
464;  (check-arg procedure? pred proc)
465  (let lp ((i 255))
466    (cond ((>= i 0)
467           (if (and (si=1? ds i) (pred (%latin1->char i)))
468               (%set1! bs i))
469           (lp (- i 1))))))
470
471(define (char-set-filter predicate domain . maybe-base)
472  (let ((bs (%default-base maybe-base char-set-filter)))
473    (%char-set-filter! predicate
474                       (%char-set:s/check domain 'char-set-filter!)
475                       bs
476                       char-set-filter)
477    (make-char-set bs)))
478
479(define (char-set-filter! predicate domain base-cs)
480  (%char-set-filter! predicate
481                     (%char-set:s/check domain 'char-set-filter!)
482                     (%char-set:s/check base-cs 'char-set-filter!)
483                     char-set-filter!)
484  base-cs)
485
486
487;;; {string, char, char-set, char predicate} -> char-set
488
489(define (->char-set x)
490  (cond ((char-set? x) x)
491        ((string? x) (string->char-set x))
492        ((char? x) (char-set x))
493        (else (##sys#error '->char-set "Not a charset, string or char." x))))
494
495
496
497;;; Set algebra
498;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
499;;; The exported ! procs are "linear update" -- allowed, but not required, to
500;;; side-effect their first argument when computing their result. In other
501;;; words, you must use them as if they were completely functional, just like
502;;; their non-! counterparts, and you must additionally ensure that their
503;;; first arguments are "dead" at the point of call. In return, we promise a
504;;; more efficient result, plus allowing you to always assume char-sets are
505;;; unchangeable values.
506
507;;; Apply P to each index and its char code in S: (P I VAL).
508;;; Used by the set-algebra ops.
509
510(define (%string-iter p s)
511  (let lp ((i (- (string-length s) 1)))
512    (cond ((>= i 0)
513           (p i (%char->latin1 (string-ref s i)))
514           (lp (- i 1))))))
515
516;;; String S represents some initial char-set. (OP s i val) does some
517;;; kind of s[i] := s[i] op val update. Do
518;;;     S := S OP CSETi
519;;; for all the char-sets in the list CSETS. The n-ary set-algebra ops
520;;; all use this internal proc.
521
522(define (%char-set-algebra s csets op proc)
523  (for-each (lambda (cset)
524              (let ((s2 (%char-set:s/check cset proc)))
525                (let lp ((i 255))
526                  (cond ((>= i 0)
527                         (op s i (si s2 i))
528                         (lp (- i 1)))))))
529            csets))
530
531
532;;; -- Invert
533
534(define (char-set-complement cs)
535  (let ((s (%char-set:s/check cs 'char-set-complement))
536        (ans (make-string 256)))
537    (%string-iter (lambda (i v) (%not! ans i v)) s)
538    (make-char-set ans)))
539
540(define (char-set-complement! cset)
541  (let ((s (%char-set:s/check cset 'char-set-complement!)))
542    (%string-iter (lambda (i v) (%not! s i v)) s))
543  cset)
544
545
546;;; -- Union
547
548(define (char-set-union! cset1 . csets)
549  (%char-set-algebra (%char-set:s/check cset1 'char-set-union!)
550                     csets %or! 'char-set-union!)
551  cset1)
552
553(define (char-set-union . csets)
554  (if (pair? csets)
555      (let ((s (%string-copy (%char-set:s/check (car csets) 'char-set-union))))
556        (%char-set-algebra s (cdr csets) %or! 'char-set-union)
557        (make-char-set s))
558      (char-set-copy char-set:empty)))
559
560
561;;; -- Intersection
562
563(define (char-set-intersection! cset1 . csets)
564  (%char-set-algebra (%char-set:s/check cset1 'char-set-intersection!)
565                     csets %and! 'char-set-intersection!)
566  cset1)
567
568(define (char-set-intersection . csets)
569  (if (pair? csets)
570      (let ((s (%string-copy (%char-set:s/check (car csets) 'char-set-intersection))))
571        (%char-set-algebra s (cdr csets) %and! 'char-set-intersection)
572        (make-char-set s))
573      (char-set-copy char-set:full)))
574
575
576;;; -- Difference
577
578(define (char-set-difference! cset1 . csets)
579  (%char-set-algebra (%char-set:s/check cset1 'char-set-difference!)
580                     csets %minus! 'char-set-difference!)
581  cset1)
582
583(define (char-set-difference cs1 . csets)
584  (if (pair? csets)
585      (let ((s (%string-copy (%char-set:s/check cs1 'char-set-difference))))
586        (%char-set-algebra s csets %minus! 'char-set-difference)
587        (make-char-set s))
588      (char-set-copy cs1)))
589
590
591;;; -- Xor
592
593(define (char-set-xor! cset1 . csets)
594  (%char-set-algebra (%char-set:s/check cset1 'char-set-xor!)
595                      csets %xor! 'char-set-xor!)
596  cset1)
597
598(define (char-set-xor . csets)
599  (if (pair? csets)
600      (let ((s (%string-copy (%char-set:s/check (car csets) 'char-set-xor))))
601        (%char-set-algebra s (cdr csets) %xor! 'char-set-xor)
602        (make-char-set s))
603      (char-set-copy char-set:empty)))
604
605
606;;; -- Difference & intersection
607
608(define (%char-set-diff+intersection! diff int csets proc)
609  (for-each (lambda (cs)
610              (%string-iter (lambda (i v)
611                              (if (not (zero? v))
612                                  (cond ((si=1? diff i)
613                                         (%set0! diff i)
614                                         (%set1! int  i)))))
615                            (%char-set:s/check cs proc)))
616            csets))
617
618(define (char-set-diff+intersection! cs1 cs2 . csets)
619  (let ((s1 (%char-set:s/check cs1 'char-set-diff+intersection!))
620        (s2 (%char-set:s/check cs2 'char-set-diff+intersection!)))
621    (%string-iter (lambda (i v) (if (zero? v)
622                                    (%set0! s2 i)
623                                    (if (si=1? s2 i) (%set0! s1 i))))
624                  s1)
625    (%char-set-diff+intersection! s1 s2 csets 'char-set-diff+intersection!))
626  (values cs1 cs2))
627
628(define (char-set-diff+intersection cs1 . csets)
629  (let ((diff (string-copy (%char-set:s/check cs1 'char-set-diff+intersection)))
630        (int  (make-string 256 c0)))
631    (%char-set-diff+intersection! diff int csets 'char-set-diff+intersection)
632    (values (make-char-set diff) (make-char-set int))))
633
634
635;;;; System character sets
636;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
637;;; These definitions are for Latin-1.
638;;;
639;;; If your Scheme implementation allows you to mark the underlying strings
640;;; as immutable, you should do so -- it would be very, very bad if a client's
641;;; buggy code corrupted these constants.
642
643(define char-set:empty (char-set))
644(define char-set:full (char-set-complement char-set:empty))
645
646(define char-set:lower-case
647  (let* ((a-z (ucs-range->char-set #x61 #x7B))
648         (latin1 (ucs-range->char-set! #xdf #xf7  #t a-z))
649         (latin2 (ucs-range->char-set! #xf8 #x100 #t latin1)))
650    (char-set-adjoin! latin2 (%latin1->char #xb5))))
651
652(define char-set:upper-case
653  (let ((A-Z (ucs-range->char-set #x41 #x5B)))
654    ;; Add in the Latin-1 upper-case chars.
655    (ucs-range->char-set! #xd8 #xdf #t
656                          (ucs-range->char-set! #xc0 #xd7 #t A-Z))))
657
658(define char-set:title-case char-set:empty)
659
660(define char-set:letter
661  (let ((u/l (char-set-union char-set:upper-case char-set:lower-case)))
662    (char-set-adjoin! u/l
663                      (%latin1->char #xaa)      ; FEMININE ORDINAL INDICATOR
664                      (%latin1->char #xba))))   ; MASCULINE ORDINAL INDICATOR
665
666(define char-set:digit     (string->char-set "0123456789"))
667(define char-set:hex-digit (string->char-set "0123456789abcdefABCDEF"))
668
669(define char-set:letter+digit
670  (char-set-union char-set:letter char-set:digit))
671
672(define char-set:punctuation
673  (let ((ascii (string->char-set "!\"#%&'()*,-./:;?@[\\]_{}"))
674        (latin-1-chars (map %latin1->char '(#xA1 ; INVERTED EXCLAMATION MARK
675                                            #xAB ; LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
676                                            #xAD ; SOFT HYPHEN
677                                            #xB7 ; MIDDLE DOT
678                                            #xBB ; RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
679                                            #xBF)))) ; INVERTED QUESTION MARK
680    (list->char-set! latin-1-chars ascii)))
681
682(define char-set:symbol
683  (let ((ascii (string->char-set "$+<=>^`|~"))
684        (latin-1-chars (map %latin1->char '(#x00A2 ; CENT SIGN
685                                            #x00A3 ; POUND SIGN
686                                            #x00A4 ; CURRENCY SIGN
687                                            #x00A5 ; YEN SIGN
688                                            #x00A6 ; BROKEN BAR
689                                            #x00A7 ; SECTION SIGN
690                                            #x00A8 ; DIAERESIS
691                                            #x00A9 ; COPYRIGHT SIGN
692                                            #x00AC ; NOT SIGN
693                                            #x00AE ; REGISTERED SIGN
694                                            #x00AF ; MACRON
695                                            #x00B0 ; DEGREE SIGN
696                                            #x00B1 ; PLUS-MINUS SIGN
697                                            #x00B4 ; ACUTE ACCENT
698                                            #x00B6 ; PILCROW SIGN
699                                            #x00B8 ; CEDILLA
700                                            #x00D7 ; MULTIPLICATION SIGN
701                                            #x00F7)))) ; DIVISION SIGN
702    (list->char-set! latin-1-chars ascii)))
703 
704
705(define char-set:graphic
706  (char-set-union char-set:letter+digit char-set:punctuation char-set:symbol))
707
708(define char-set:whitespace
709  (list->char-set (map %latin1->char '(#x09 ; HORIZONTAL TABULATION
710                                       #x0A ; LINE FEED         
711                                       #x0B ; VERTICAL TABULATION
712                                       #x0C ; FORM FEED
713                                       #x0D ; CARRIAGE RETURN
714                                       #x20 ; SPACE
715                                       #xA0))))
716
717(define char-set:printing (char-set-union char-set:whitespace char-set:graphic)) ; NO-BREAK SPACE
718
719(define char-set:blank
720  (list->char-set (map %latin1->char '(#x09 ; HORIZONTAL TABULATION
721                                       #x20 ; SPACE
722                                       #xA0)))) ; NO-BREAK SPACE
723
724
725(define char-set:iso-control
726  (ucs-range->char-set! #x7F #xA0 #t (ucs-range->char-set 0 32)))
727
728(define char-set:ascii (ucs-range->char-set 0 128))
729
730
731;;; Porting & performance-tuning notes
732;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
733;;; See the section at the beginning of this file on external dependencies.
734;;;
735;;; First and foremost, rewrite this code to use bit vectors of some sort.
736;;; This will give big speedup and memory savings.
737;;;
738;;; - LET-OPTIONALS* macro.
739;;; This is only used once. You can rewrite the use, port the hairy macro
740;;; definition (which is implemented using a Clinger-Rees low-level
741;;; explicit-renaming macro system), or port the simple, high-level
742;;; definition, which is less efficient.
743;;;
744;;; - :OPTIONAL macro
745;;; Very simply defined using an R5RS high-level macro.
746;;;
747;;; Implementations that can arrange for the base char sets to be immutable
748;;; should do so. (E.g., Scheme 48 allows one to mark a string as immutable,
749;;; which can be used to protect the underlying strings.) It would be very,
750;;; very bad if a client's buggy code corrupted these constants.
751;;;
752;;; There is a fair amount of argument checking. This is, strictly speaking,
753;;; unnecessary -- the actual body of the procedures will blow up if an
754;;; illegal value is passed in. However, the error message will not be as good
755;;; as if the error were caught at the "higher level." Also, a very, very
756;;; smart Scheme compiler may be able to exploit having the type checks done
757;;; early, so that the actual body of the procedures can assume proper values.
758;;; This isn't likely; this kind of compiler technology isn't common any
759;;; longer.
760;;;
761;;; The overhead of optional-argument parsing is irritating. The optional
762;;; arguments must be consed into a rest list on entry, and then parsed out.
763;;; Function call should be a matter of a few register moves and a jump; it
764;;; should not involve heap allocation! Your Scheme system may have a superior
765;;; non-R5RS optional-argument system that can eliminate this overhead. If so,
766;;; then this is a prime candidate for optimising these procedures,
767;;; *especially* the many optional BASE-CS parameters.
768;;;
769;;; Note that optional arguments are also a barrier to procedure integration.
770;;; If your Scheme system permits you to specify alternate entry points
771;;; for a call when the number of optional arguments is known in a manner
772;;; that enables inlining/integration, this can provide performance
773;;; improvements.
774;;;
775;;; There is enough *explicit* error checking that *all* internal operations
776;;; should *never* produce a type or index-range error. Period. Feel like
777;;; living dangerously? *Big* performance win to be had by replacing string
778;;; and record-field accessors and setters with unsafe equivalents in the
779;;; code. Similarly, fixnum-specific operators can speed up the arithmetic
780;;; done on the index values in the inner loops. The only arguments that are
781;;; not completely error checked are
782;;;   - string lists (complete checking requires time proportional to the
783;;;     length of the list)
784;;;   - procedure arguments, such as char->char maps & predicates.
785;;;     There is no way to check the range & domain of procedures in Scheme.
786;;; Procedures that take these parameters cannot fully check their
787;;; arguments. But all other types to all other procedures are fully
788;;; checked.
789;;;
790;;; This does open up the alternate possibility of simply *removing* these
791;;; checks, and letting the safe primitives raise the errors. On a dumb
792;;; Scheme system, this would provide speed (by eliminating the redundant
793;;; error checks) at the cost of error-message clarity.
794;;;
795;;; In an interpreted Scheme, some of these procedures, or the internal
796;;; routines with % prefixes, are excellent candidates for being rewritten
797;;; in C.
798;;;
799;;; It would also be nice to have the ability to mark some of these
800;;; routines as candidates for inlining/integration.
801;;;
802;;; See the comments preceding the hash function code for notes on tuning
803;;; the default bound so that the code never overflows your implementation's
804;;; fixnum size into bignum calculation.
805;;;
806;;; All the %-prefixed routines in this source code are written
807;;; to be called internally to this library. They do *not* perform
808;;; friendly error checks on the inputs; they assume everything is
809;;; proper. They also do not take optional arguments. These two properties
810;;; save calling overhead and enable procedure integration -- but they
811;;; are not appropriate for exported routines.
812
813;;; Copyright notice
814;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
815;;; Copyright (c) 1988-1995 Massachusetts Institute of Technology
816;;;
817;;; This material was developed by the Scheme project at the Massachusetts
818;;; Institute of Technology, Department of Electrical Engineering and
819;;; Computer Science.  Permission to copy and modify this software, to
820;;; redistribute either the original software or a modified version, and
821;;; to use this software for any purpose is granted, subject to the
822;;; following restrictions and understandings.
823;;;
824;;; 1. Any copy made of this software must include this copyright notice
825;;; in full.
826;;;
827;;; 2. Users of this software agree to make their best efforts (a) to
828;;; return to the MIT Scheme project any improvements or extensions that
829;;; they make, so that these may be included in future releases; and (b)
830;;; to inform MIT of noteworthy uses of this software.
831;;;
832;;; 3. All materials developed as a consequence of the use of this
833;;; software shall duly acknowledge such use, in accordance with the usual
834;;; standards of acknowledging credit in academic research.
835;;;
836;;; 4. MIT has made no warrantee or representation that the operation of
837;;; this software will be error-free, and MIT is under no obligation to
838;;; provide any services, by way of maintenance, update, or otherwise.
839;;;
840;;; 5. In conjunction with products arising from the use of this material,
841;;; there shall be no use of the name of the Massachusetts Institute of
842;;; Technology nor of any adaptation thereof in any advertising,
843;;; promotional, or sales literature without prior written consent from
844;;; MIT in each case.
Note: See TracBrowser for help on using the repository browser.