Ticket #1182: utf8-validation-2.diff
File utf8-validation-2.diff, 4.2 KB (added by , 10 years ago) |
---|
-
tests/utf8-test.scm
1 1 2 2 (use test utf8 utf8-srfi-13 utf8-srfi-14) 3 (import (prefix (only scheme string list->string) byte-)) 3 4 4 5 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 5 6 ;; R5RS … … 547 548 (test-assert (char-set-contains? char-set:full #\a)) 548 549 (test-assert (char-set-contains? char-set:full (string-ref "ã" 0))) 549 550 551 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 552 ;; utf8-validation 553 554 (test-assert "stray continuation byte" 555 (not (valid-string? (byte-string #\x (integer->char #b10101010) #\x)))) 556 557 (test-assert "overlong" 558 (not (valid-string? (byte-list->string `(#\x ,@(map integer->char '(#b11000000 #b10100111)) #\x))))) 559 560 (test-assert "incomplete" 561 (not (valid-string? (byte-string #\x (integer->char #b11000001))))) 562 563 (test-assert (valid-string? "Ð")) 564 (test-assert (valid-string? "â")) 565 (test-assert (valid-string? "ð")) 566 (test-assert (valid-string? "xÐâðx")) 567 550 568 (test-end) -
utf8-lolevel.scm
38 38 utf8-string-ref utf8-string-set! utf8-string-length 39 39 utf8-substring 40 40 utf8-string->list utf8-prev-char utf8-next-char 41 make-utf8-string 41 make-utf8-string utf8-string? 42 42 with-substring-offsets with-two-substring-offsets 43 43 ;; string-pointers 44 44 make-string-pointer string-pointer? sp-copy … … 381 381 (bitwise-and #b00111111 b2)) 382 382 (- i 1)))))))))))) 383 383 384 (define (char<= a b c) 385 (and (char<=? a b) 386 (char<=? b c))) 387 388 (define (utf8-tail? c) 389 (char<= #\x80 c #\xBF)) 390 391 (define (utf8-2? c0 c1) 392 (and (char<= #\xC2 c0 #\xDF) 393 (utf8-tail? c1))) 394 395 (define (utf8-3? c0 c1 c2) 396 (and (utf8-tail? c2) 397 (cond ((char=? c0 #\xE0) 398 (char<= #\xA0 c1 #\xBF)) 399 ((char<= #\xE1 c0 #\xEC) 400 (utf8-tail? c1)) 401 ((char=? c0 #\xED) 402 (char<= #\x80 c1 #\x9F)) 403 ((char<= #\xEE c0 #\xEF) 404 (utf8-tail? c1)) 405 (else #f)))) 406 407 (define (utf8-4? c0 c1 c2 c3) 408 (and (utf8-tail? c2) 409 (utf8-tail? c3) 410 (cond ((char=? c0 #\xF0) 411 (char<= #\x90 c1 #\xBF)) 412 ((char<= #\xF1 c0 #\xF3) 413 (utf8-tail? c1)) 414 ((char=? c0 #\xF4) 415 (char<= #\x80 c1 #\x8F)) 416 (else #f)))) 417 418 (define (utf8-string? str) 419 (let ((len (string-length str))) 420 (let loop ((pos 0)) 421 (or (fx= pos len) 422 (let ((c0 (string-ref str pos))) 423 (let-syntax ((validate (syntax-rules () 424 ((_ valid? c ... n) 425 (and (fx<= n (fx- len pos)) 426 (valid? c0 (string-ref str (fx+ pos c)) ...) 427 (loop (fx+ pos n))))))) 428 (if (char<=? c0 #\x7F) 429 (loop (fx+ pos 1)) 430 (case (utf8-start-byte->length (char->integer c0)) 431 ((2) (validate utf8-2? 1 2)) 432 ((3) (validate utf8-3? 1 2 3)) 433 ((4) (validate utf8-4? 1 2 3 4)) 434 (else #f))))))))) 435 436 384 437 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 385 438 386 439 (define-syntax in-utf8-string -
utf8.scm
225 225 string-match string-match-positions string-match-offsets 226 226 string-search string-search-positions string-search-offsets 227 227 ;; new 228 string-set 228 string-set valid-string? 229 229 ) 230 230 231 231 (import (rename (except scheme … … 258 258 (string-substitute* byte-string-substitute*) 259 259 (string-split-fields byte-string-split-fields)) 260 260 ports 261 utf8-lolevel) 261 (rename utf8-lolevel 262 (utf8-string? valid-string?))) 262 263 263 264 (require-library regex utf8-lolevel) 264 265