Ticket #1182: utf8-validation-2.diff

File utf8-validation-2.diff, 4.2 KB (added by Moritz Heidkamp, 5 years ago)
  • tests/utf8-test.scm

     
    11
    22(use test utf8 utf8-srfi-13 utf8-srfi-14)
     3(import (prefix (only scheme string list->string) byte-))
    34
    45;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    56;; R5RS
     
    547548(test-assert (char-set-contains? char-set:full #\a))
    548549(test-assert (char-set-contains? char-set:full (string-ref "あ" 0)))
    549550
     551;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     552;; utf8-validation
     553
     554(test-assert "stray continuation byte"
     555             (not (valid-string? (byte-string #\x (integer->char #b10101010) #\x))))
     556
     557(test-assert "overlong"
     558             (not (valid-string? (byte-list->string `(#\x ,@(map integer->char '(#b11000000 #b10100111)) #\x)))))
     559
     560(test-assert "incomplete"
     561             (not (valid-string? (byte-string #\x (integer->char #b11000001)))))
     562
     563(test-assert (valid-string? "Є"))
     564(test-assert (valid-string? "☭"))
     565(test-assert (valid-string? "😈"))
     566(test-assert (valid-string? "xЄ☭😈x"))
     567
    550568(test-end)
  • utf8-lolevel.scm

     
    3838   utf8-string-ref utf8-string-set! utf8-string-length
    3939   utf8-substring
    4040   utf8-string->list utf8-prev-char utf8-next-char
    41    make-utf8-string
     41   make-utf8-string utf8-string?
    4242   with-substring-offsets with-two-substring-offsets
    4343   ;; string-pointers
    4444   make-string-pointer string-pointer? sp-copy
     
    381381                                      (bitwise-and #b00111111 b2))
    382382                         (- i 1))))))))))))
    383383
     384(define (char<= a b c)
     385  (and (char<=? a b)
     386       (char<=? b c)))
     387
     388(define (utf8-tail? c)
     389  (char<= #\x80 c #\xBF))
     390
     391(define (utf8-2? c0 c1)
     392  (and (char<= #\xC2 c0 #\xDF)
     393       (utf8-tail? c1)))
     394
     395(define (utf8-3? c0 c1 c2)
     396  (and (utf8-tail? c2)
     397       (cond ((char=? c0 #\xE0)
     398              (char<= #\xA0 c1 #\xBF))
     399             ((char<= #\xE1 c0 #\xEC)
     400              (utf8-tail? c1))
     401             ((char=? c0 #\xED)
     402              (char<= #\x80 c1 #\x9F))
     403             ((char<= #\xEE c0 #\xEF)
     404              (utf8-tail? c1))
     405             (else #f))))
     406
     407(define (utf8-4? c0 c1 c2 c3)
     408  (and (utf8-tail? c2)
     409       (utf8-tail? c3)
     410       (cond ((char=? c0 #\xF0)
     411              (char<= #\x90 c1 #\xBF))
     412             ((char<= #\xF1 c0 #\xF3)
     413              (utf8-tail? c1))
     414             ((char=? c0 #\xF4)
     415              (char<= #\x80 c1 #\x8F))
     416             (else #f))))
     417
     418(define (utf8-string? str)
     419  (let ((len (string-length str)))
     420    (let loop ((pos 0))
     421      (or (fx= pos len)
     422          (let ((c0 (string-ref str pos)))
     423            (let-syntax ((validate (syntax-rules ()
     424                                     ((_ valid? c ... n)
     425                                      (and (fx<= n (fx- len pos))
     426                                           (valid? c0 (string-ref str (fx+ pos c)) ...)
     427                                           (loop (fx+ pos n)))))))
     428              (if (char<=? c0 #\x7F)
     429                  (loop (fx+ pos 1))
     430                  (case (utf8-start-byte->length (char->integer c0))
     431                    ((2) (validate utf8-2? 1 2))
     432                    ((3) (validate utf8-3? 1 2 3))
     433                    ((4) (validate utf8-4? 1 2 3 4))
     434                    (else #f)))))))))
     435
     436
    384437;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    385438
    386439(define-syntax in-utf8-string
  • utf8.scm

     
    225225  string-match string-match-positions string-match-offsets
    226226  string-search string-search-positions string-search-offsets
    227227  ;; new
    228   string-set
     228  string-set valid-string?
    229229  )
    230230
    231231(import (rename (except scheme
     
    258258                (string-substitute* byte-string-substitute*)
    259259                (string-split-fields byte-string-split-fields))
    260260        ports
    261         utf8-lolevel)
     261        (rename utf8-lolevel
     262                (utf8-string? valid-string?)))
    262263
    263264(require-library regex utf8-lolevel)
    264265