1 | ;;;; ucs.scm - unicode operations |
---|
2 | |
---|
3 | |
---|
4 | ;;XXX all hopelessly ugly and inefficient |
---|
5 | |
---|
6 | (define (string->ucs str) |
---|
7 | (let* ((len (string-length str)) |
---|
8 | (ucs (make-u16vector len))) |
---|
9 | (do ((i 0 (add1 i))) |
---|
10 | ((>= i len) ucs) |
---|
11 | (u16vector-set! ucs i (char->integer (string-ref str i)))))) |
---|
12 | |
---|
13 | (define (symbol->ucs-symbol sym) |
---|
14 | (ucs-string->symbol (string->ucs (symbol->string sym)))) |
---|
15 | |
---|
16 | (define (ucs->string uv) |
---|
17 | (let* ((len (u16vector-length uv)) |
---|
18 | (str (make-string len))) |
---|
19 | (do ((i 0 (add1 i))) |
---|
20 | ((>= i len) str) |
---|
21 | (let ((c (u16vector-ref uv i))) |
---|
22 | (if (> c 255) |
---|
23 | (error 'ucs->string "can not convert UCS character to string" c) |
---|
24 | (string-set! str i (integer->char c))))))) |
---|
25 | |
---|
26 | ;; only for testing |
---|
27 | (define (ucs->ascii-string uv) |
---|
28 | (string-concatenate |
---|
29 | (map (lambda (c) |
---|
30 | (if (< c 256) |
---|
31 | (string (integer->char c)) |
---|
32 | (sprintf "{~x}" c))) |
---|
33 | (u16vector->list uv)))) |
---|
34 | |
---|
35 | (define (ucs->raw-string uv) |
---|
36 | (blob->string (u16vector->blob/shared uv))) |
---|
37 | |
---|
38 | (define ucs-length u16vector-length) |
---|
39 | |
---|
40 | (define (ucs-append u1 u2) |
---|
41 | (let* ((len1 (u16vector-length u1)) |
---|
42 | (len2 (u16vector-length u2)) |
---|
43 | (u (make-u16vector (+ len1 len2)))) |
---|
44 | (do ((i 0 (add1 i))) |
---|
45 | ((>= i len1)) |
---|
46 | (u16vector-set! u i (u16vector-ref u1 i))) |
---|
47 | (do ((i 0 (add1 i))) |
---|
48 | ((>= i len2)) |
---|
49 | (u16vector-set! u (+ len1 i) (u16vector-ref u2 i))) |
---|
50 | u)) |
---|
51 | |
---|
52 | (define (utf8-string->ucs str) |
---|
53 | (list->u16vector (utf8->list str))) |
---|
54 | |
---|
55 | (define (ucs-string->symbol uv) |
---|
56 | (string->symbol (ucs->raw-string uv))) |
---|
57 | |
---|
58 | |
---|
59 | ;;; This is taken from Alex Shinn's UTF8 egg: |
---|
60 | |
---|
61 | (define (ucs->utf8 i) |
---|
62 | (cond ((fx<= i #x7F) |
---|
63 | (string (integer->char i)) ) |
---|
64 | ((fx<= i #x7FF) |
---|
65 | (string (integer->char (fxior #b11000000 (fxshr i 6))) |
---|
66 | (integer->char (fxior #b10000000 (fxand i #b111111)))) ) |
---|
67 | ((fx<= i #xFFFF) |
---|
68 | (string (integer->char (fxior #b11100000 (fxshr i 12))) |
---|
69 | (integer->char (fxior #b10000000 (fxand (fxshr i 6) #b111111))) |
---|
70 | (integer->char (fxior #b10000000 (fxand i #b111111)))) ) |
---|
71 | ((fx<= i #x1FFFFF) |
---|
72 | (string (integer->char (fxior #b11110000 (fxshr i 18))) |
---|
73 | (integer->char (fxior #b10000000 (fxand (fxshr i 12) #b111111))) |
---|
74 | (integer->char (fxior #b10000000 (fxand (fxshr i 6) #b111111))) |
---|
75 | (integer->char (fxior #b10000000 (fxand i #b111111)))) ) |
---|
76 | (else |
---|
77 | (error 'char->utf8 "UTF-8 codepoint out of range" i) ) ) ) |
---|
78 | |
---|
79 | (define (string-int-ref s i) |
---|
80 | (char->integer (string-ref s i))) |
---|
81 | |
---|
82 | (define utf8-start-byte->length |
---|
83 | (let ((table '#( |
---|
84 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 0x |
---|
85 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 1x |
---|
86 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 2x |
---|
87 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 3x |
---|
88 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 4x |
---|
89 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 5x |
---|
90 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 6x |
---|
91 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 7x |
---|
92 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 8x |
---|
93 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 9x |
---|
94 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; ax |
---|
95 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; bx |
---|
96 | 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; cx |
---|
97 | 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; dx |
---|
98 | 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 ; ex |
---|
99 | 4 4 4 4 4 4 4 4 5 5 5 5 6 6 0 0 ; fx |
---|
100 | ))) |
---|
101 | (lambda (i) (vector-ref table i)))) |
---|
102 | |
---|
103 | (define (extract-bit-field size position n) |
---|
104 | (bitwise-and (bitwise-not (arithmetic-shift -1 size)) |
---|
105 | (arithmetic-shift n (- position)))) |
---|
106 | |
---|
107 | (define (utf8-ref s off) |
---|
108 | (let* ((c (string-int-ref s off)) |
---|
109 | (len (utf8-start-byte->length c)) |
---|
110 | (limit (string-length s))) |
---|
111 | (if (<= len 1) |
---|
112 | c |
---|
113 | (let ((end (+ off len))) |
---|
114 | (if (> end limit) |
---|
115 | (error 'utf8-ref "utf8 trailing char overflow" s off) |
---|
116 | (let loop ((i (+ off 1)) (res (extract-bit-field (- 7 len) 0 c))) |
---|
117 | (if (= i end) |
---|
118 | res |
---|
119 | (loop (+ i 1) |
---|
120 | (bitwise-ior (arithmetic-shift res 6) |
---|
121 | (bitwise-and #b00111111 |
---|
122 | (string-int-ref s i))))))))))) |
---|
123 | |
---|
124 | (define (utf8->list str) |
---|
125 | (let ((limit (string-length str))) |
---|
126 | (let lp ((i 0) (res '())) |
---|
127 | (if (>= i limit) |
---|
128 | (reverse res) |
---|
129 | (lp (+ i (utf8-start-byte->length (string-int-ref str i))) |
---|
130 | (cons (utf8-ref str i) res)))))) |
---|