Ticket #1182: utf8-validation.diff
File utf8-validation.diff, 4.4 KB (added by , 10 years ago) |
---|
-
tests/utf8-test.scm
1 1 2 (use test utf8 utf8-srfi-13 utf8-srfi-14) 2 (use test utf8 utf8-srfi-13 utf8-srfi-14 utf8-validation) 3 (import (prefix scheme core-)) 3 4 4 5 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 5 6 ;; R5RS … … 547 548 (test-assert (char-set-contains? char-set:full #\a)) 548 549 (test-assert (char-set-contains? char-set:full (string-ref "ã" 0))) 549 550 551 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 552 ;; utf8-validation 553 554 (test-assert "stray continuation byte" 555 (not (utf8-string? (core-string #\x (integer->char #b10101010) #\x)))) 556 557 (test-assert "overlong" 558 (not (utf8-string? (core-list->string `(#\x ,@(map integer->char '(#b11000000 #b10100111)) #\x))))) 559 560 (test-assert "incomplete" 561 (not (utf8-string? (core-string #\x (integer->char #b11000001))))) 562 563 (test-assert (utf8-string? "Ð")) 564 (test-assert (utf8-string? "â")) 565 (test-assert (utf8-string? "ð")) 566 (test-assert (utf8-string? "xÐâðx")) 567 550 568 (test-end) -
utf8-validation.scm
1 (module utf8-validation 2 3 (utf8-string?) 4 5 (import chicken 6 scheme) 7 8 (define (fxand= a b c) 9 (fx= (fxand a b) c)) 10 11 (define (char<= a b c) 12 (and (char<=? a b) 13 (char<=? b c))) 14 15 (define (utf8-tail? c) 16 (char<= #\x80 c #\xBF)) 17 18 (define (utf8-2? c0 c1) 19 (and (char<= #\xC2 c0 #\xDF) 20 (utf8-tail? c1))) 21 22 (define (utf8-3? c0 c1 c2) 23 (and (utf8-tail? c2) 24 (cond ((char=? c0 #\xE0) 25 (char<= #\xA0 c1 #\xBF)) 26 ((char<= #\xE1 c0 #\xEC) 27 (utf8-tail? c1)) 28 ((char=? c0 #\xED) 29 (char<= #\x80 c1 #\x9F)) 30 ((char<= #\xEE c0 #\xEF) 31 (utf8-tail? c1)) 32 (else #f)))) 33 34 (define (utf8-4? c0 c1 c2 c3) 35 (and (utf8-tail? c2) 36 (utf8-tail? c3) 37 (cond ((char=? c0 #\xF0) 38 (char<= #\x90 c1 #\xBF)) 39 ((char<= #\xF1 c0 #\xF3) 40 (utf8-tail? c1)) 41 ((char=? c0 #\xF4) 42 (char<= #\x80 c1 #\x8F)) 43 (else #f)))) 44 45 (define (utf8-string? str) 46 (let ((len (string-length str))) 47 (let loop ((pos 0)) 48 (or (fx= pos len) 49 (let* ((c0 (string-ref str pos)) 50 (i0 (char->integer c0))) 51 (let-syntax ((validate (syntax-rules () 52 ((_ valid? c ... n) 53 (and (fx<= n (fx- len pos)) 54 (valid? c0 (string-ref str (fx+ pos c)) ...) 55 (loop (fx+ pos n))))))) 56 (cond ((fx<= i0 #x7F) 57 (loop (fx+ pos 1))) 58 ((fxand= i0 #xE0 #xC0) 59 (validate utf8-2? 1 2)) 60 ((fxand= i0 #xF0 #xE0) 61 (validate utf8-3? 1 2 3)) 62 ((fxand= i0 #xF8 #xF0) 63 (validate utf8-4? 1 2 3 4)) 64 (else #f)))))))) 65 66 ) -
utf8.setup
30 30 ("utf8-srfi-13.so" ("utf8-srfi-13.scm") 31 31 (compile -fixnum-arithmetic -inline -local -s -O2 -d1 32 32 -j utf8-srfi-13 utf8-srfi-13.scm) 33 (compile -s -O2 -d0 utf8-srfi-13.import.scm))) 33 (compile -s -O2 -d0 utf8-srfi-13.import.scm)) 34 ("utf8-validation.so" ("utf8-validation.scm") 35 (compile -s -O3 -d1 36 -j utf8-validation utf8-validation.scm) 37 (compile -s -O2 -d0 utf8-validation.import.scm))) 34 38 '("utf8-lolevel.so" "utf8.so" "utf8-srfi-14.so" 35 "unicode-char-sets.so" "utf8-case-map.so" "utf8-srfi-13.so")) 39 "unicode-char-sets.so" "utf8-case-map.so" "utf8-srfi-13.so" 40 "utf8-validation.so")) 36 41 37 42 (install-extension 'utf8-lolevel 38 43 '("utf8-lolevel.so" "utf8-lolevel.import.so") … … 58 63 (install-extension 'utf8-srfi-13 59 64 '("utf8-srfi-13.so" "utf8-srfi-13.import.so") 60 65 `((version ,version))) 66 67 (install-extension 'utf8-validation 68 '("utf8-validation.so" "utf8-validation.import.so") 69 `((version ,version)))