Changeset 28086 in project


Ignore:
Timestamp:
01/14/13 08:43:47 (7 years ago)
Author:
Ivan Raikov
Message:

uri-generic: created a branch for experimentation with utf8

Location:
release/4/uri-generic/branches
Files:
1 added
3 edited
1 copied

Legend:

Unmodified
Added
Removed
  • release/4/uri-generic/branches/utf8/tests/run.scm

    r25789 r28086  
    549549            rfc5122-refs))
    550550
     551
     552(define  utf8-cases
     553  '(;; URI representation of IRIs test cases
     554    ("http://example.com/삌계탕"
     555     "http://example.com/test")
     556    ))
     557
     558(test-group "URI representation of IRIs"
     559  (for-each (lambda (s)
     560              (test (cadr s) (uri->string (iri/uri-map (car s)))))
     561            utf8-cases))
     562
    551563(define make-cases
    552564  `(("http://example.com:123/foo/bar?a=b;c=d#location"
  • release/4/uri-generic/branches/utf8/uri-generic.scm

    r25789 r28086  
    4141
    4242(module uri-generic
    43   (uri-reference make-uri update-uri update-authority
     43  (uri-reference iri->uri make-uri update-uri update-authority
    4444   uri-reference? uri-auth uri-authority uri-scheme uri-path uri-query
    4545   uri-fragment uri-host uri-port uri-username uri-password
     
    5858(import chicken scheme extras data-structures ports)
    5959 
    60 (require-extension matchable defstruct srfi-1 srfi-4 srfi-13 srfi-14)
    61 
    62 ;; What to do with these?
    63 #;(cond-expand
    64    (utf8-strings (use utf8-srfi-13 utf8-srfi-14))
    65    (else (use srfi-13 srfi-14)))
     60(require-extension matchable defstruct srfi-1)
     61(require-library srfi-13 srfi-14)
     62(import
     63 (only srfi-13
     64       string-index string-upcase string-downcase)
     65 (only srfi-14
     66       char-set? char-set char-set-complement char-set-union char-set->string char-set-contains?
     67       string->char-set char-set:letter+digit char-set:letter char-set:digit char-set:hex-digit char-set:full
     68       char-set:lower-case char-set:upper-case char-set:whitespace
     69       ))
     70
    6671
    6772(defstruct URI      scheme authority path query fragment)
     
    152157(define (pct-encoded? c)      (match c ((#\% h1 h2) (and (hexdigit-char? h1) (hexdigit-char? h2)))
    153158                                     (else #f)))
    154 
    155 
     159   
    156160;; Helper functions for character parsing
    157161 
     
    231235;; RFC 3986, section 2.1
    232236;;
     237
     238(define (hex-digit i)
     239  (and (>= i 0) (< i 16)
     240       (string-ref (string-upcase (number->string i 16)) 0)))
     241
    233242;; Returns a 'pct-encoded' sequence of octets.
    234243;;
     244
    235245(define (pct-encode char-list char-set)
    236   (define (hex-digit i)
    237     (and (>= i 0) (< i 16)
    238          (car (string->list (string-upcase (number->string i 16))))))
    239246  (reverse (fold (lambda (c cl)
    240247                   (if (char-set-contains? char-set c)
     
    323330            (match-let* (((ua rst)  (authority rst))
    324331                         ((up rst)  (path-abempty rst)))
     332                        (print "ua = " ua)
     333                        (print "up = " up)
    325334                        (list ua up rst)))
    326335         (else (match-let (((up rst) (or (path-abs s) (path-rootless s) (list '() s))))
     
    680689    (and s (or (uri s) (relative-ref s)))))
    681690
     691;; IRI to URI mapping (RFC3987, section 3.1)
     692
     693(define (iri->uri s)
     694  (let ((s (if (string? s) (uri-string->char-list s) s)))
     695    (print "s = " s)
     696    (and s (or (uri s) (relative-ref s)))))
     697
     698
    682699;; (define uri-reference? URI) ; Already defined as URI? (struct predicate)
    683700
     
    834851           (cond ((char? x) (cons x ax))
    835852                 ((list? x) (append-reverse x ax)))) (list) s)))
     853
     854
     855;; from SRFI-33, useful in splitting up the bit patterns used to
     856;; represent unicode values in utf8
     857(define-inline (extract-bit-field size position n)
     858  (bitwise-and (bitwise-not (arithmetic-shift -1 size))
     859               (arithmetic-shift n (- position))))
     860
     861
     862;; The following is borrowed from the utf8 library by Alex Shinn:
     863;; number of total bytes in a utf8 char given the 1st byte
     864(define utf8-start-byte-length
     865'#(
     8661 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 0x
     8671 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 1x
     8681 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 2x
     8691 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 3x
     8701 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 4x
     8711 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 5x
     8721 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 6x
     8731 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 7x
     8741 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 8x
     8751 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 9x
     8761 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; ax
     8771 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; bx
     8782 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; cx
     8792 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; dx
     8803 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 ; ex
     8814 4 4 4 4 4 4 4 5 5 5 5 6 6 0 0 ; fx
     882))
     883
     884
     885;; The following routine is based on the read-utf8-char routine
     886;; from the utf8 library by Alex Shinn:
     887(define-inline (utf8-char-car lst)
     888  (let* ((b1 (car lst)) (bi1 (char->integer b1))
     889         (len (vector-ref utf8-start-byte-length bi1)))
     890    (if (<= len 1)
     891        (list (list b1) (cdr lst))
     892        (let loop ((res (list bi1)) (b (cdr lst)) (i (- len 1)))
     893          (if (zero? i)
     894              (list ;; converts UTF-8 sequences to pct-encoded char lists
     895               ;; the pct-encoded list is built in reverse order
     896               (fold-right
     897                (lambda (x ax)
     898                  (let ((h1 (hex-digit (quotient x 16)))
     899                        (h2 (hex-digit (remainder x 16))))
     900                       (cons  (list #\% h1 h2) ax)))
     901                '() res)
     902               b)
     903              (let ((bi2 (char->integer (car b))))
     904                (cond
     905                 ((not (= #b10 (extract-bit-field 2 6 bi2)))
     906                  (error 'utf8-char-car "invalid utf8 sequence" bi1 bi2))
     907                 (else
     908                  (loop (cons bi2 res) (cdr b) (- i 1)))
     909                 ))
     910              ))
     911        )
     912    ))
     913
    836914   
    837915;; Convert a string to a URI character list
     
    841919    (if (null? lst) (reverse cs)
    842920        (match lst
    843                ((#\% h1 h2 . rst)  (and (hexdigit-char? h1) (hexdigit-char? h2)
    844                                         (loop (cons (list #\% h1 h2) cs) rst)))
    845                (((and c (? char?)) . rst)  (loop (cons c cs) rst))))))
     921               ((#\% h1 h2 . rst) 
     922                (and (hexdigit-char? h1) (hexdigit-char? h2)
     923                     (loop (cons (list #\% h1 h2) cs) rst)))
     924               (else
     925                (match-let (((c rst) (utf8-char-car lst)))
     926                           (loop (append c cs) rst)) )
     927               ))
     928    ))
     929
    846930   
    847931;;
  • release/4/uri-generic/branches/utf8/uri-generic.setup

    r25790 r28086  
    11;; -*- Hen -*-
    22
    3 (compile -s -O2 uri-generic.scm -j uri-generic)
     3(compile -s -O -d2 -S uri-generic.scm -j uri-generic)
    44(compile -s -O2 uri-generic.import.scm)
    55
Note: See TracChangeset for help on using the changeset viewer.