1 | ;;;; utf8-lolevel.scm -- encoding utils |
---|
2 | ;; |
---|
3 | ;; Copyright (c) 2004-2009 Alex Shinn. All rights reserved. |
---|
4 | ;; BSD-style license: http://synthcode.com/license.txt |
---|
5 | |
---|
6 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
---|
7 | ;; This is an internal library used by the utf8 interface. |
---|
8 | ;; You probably don't want to use this. |
---|
9 | ;; |
---|
10 | ;; Notes: |
---|
11 | ;; |
---|
12 | ;; 'pos' and 'index' refer to conceptual utf8 indices (Unicode codepoints). |
---|
13 | ;; 'off' and 'pointer' refer to actual byte offsets. |
---|
14 | ;; 'sp-' is a string-pointer function referring to offsets. |
---|
15 | |
---|
16 | ;; Uses ##sys#become! since all types are correct at runtime. |
---|
17 | ;; |
---|
18 | ;; Assumes string-length, string-ref & string-set! are rewritten by |
---|
19 | ;; the compiler. |
---|
20 | |
---|
21 | (declare |
---|
22 | (no-argc-checks) |
---|
23 | (no-bound-checks) |
---|
24 | (no-procedure-checks) |
---|
25 | (bound-to-procedure |
---|
26 | ##sys#char->utf8-string ##sys#become!)) |
---|
27 | |
---|
28 | (require-library data-structures lolevel) |
---|
29 | |
---|
30 | (module utf8-lolevel |
---|
31 | ( |
---|
32 | ;; utils |
---|
33 | string-int-ref string-int-set! ascii-string? |
---|
34 | ;; utf8 encoding |
---|
35 | string-set-at-byte-in-place! string-set-at-byte |
---|
36 | utf8-start-byte->length ucs-integer->length |
---|
37 | utf8-index->offset utf8-offset->index |
---|
38 | utf8-string-ref utf8-string-set! utf8-string-length |
---|
39 | utf8-substring |
---|
40 | utf8-string->list utf8-prev-char utf8-next-char |
---|
41 | make-utf8-string |
---|
42 | with-substring-offsets with-two-substring-offsets |
---|
43 | ;; string-pointers |
---|
44 | make-string-pointer string-pointer? sp-copy |
---|
45 | sp-first sp-last sp-next sp-prev sp-ref sp-ref->string sp-set! |
---|
46 | sp-before sp-after sp-substring |
---|
47 | sp-check? sp-check-lo? sp-check-hi? |
---|
48 | ;; I/O |
---|
49 | read-utf8-char write-utf8-char char->utf8-string |
---|
50 | ;; fast-loop iterator |
---|
51 | in-utf8-string |
---|
52 | ) |
---|
53 | |
---|
54 | (import scheme chicken extras lolevel) |
---|
55 | |
---|
56 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
---|
57 | |
---|
58 | ;; utilities which take and return integers |
---|
59 | (define (string-int-ref s i) |
---|
60 | (char->integer (string-ref s i))) |
---|
61 | (define (string-int-set! s i c) |
---|
62 | (string-set! s i (integer->char c))) |
---|
63 | |
---|
64 | ;; determine if a string only has 7-bit ASCII characters |
---|
65 | (define (ascii-string? str) |
---|
66 | (let ((limit (string-length str))) |
---|
67 | (let loop ((i 0)) |
---|
68 | (or (= i limit) |
---|
69 | (and (> 128 (string-int-ref str i)) |
---|
70 | (loop (+ i 1))))))) |
---|
71 | |
---|
72 | ;; from SRFI-33, useful in splitting up the bit patterns used to |
---|
73 | ;; represent unicode values in utf8 |
---|
74 | (define (extract-bit-field size position n) |
---|
75 | (bitwise-and (bitwise-not (arithmetic-shift -1 size)) |
---|
76 | (arithmetic-shift n (- position)))) |
---|
77 | |
---|
78 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
---|
79 | ;; indexing utils |
---|
80 | |
---|
81 | ;; number of total bytes in a utf8 char given the 1st byte |
---|
82 | (define utf8-start-byte->length |
---|
83 | (let ((table '#( |
---|
84 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 0x |
---|
85 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 1x |
---|
86 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 2x |
---|
87 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 3x |
---|
88 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 4x |
---|
89 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 5x |
---|
90 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 6x |
---|
91 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 7x |
---|
92 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 8x |
---|
93 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 9x |
---|
94 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; ax |
---|
95 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; bx |
---|
96 | 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; cx |
---|
97 | 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; dx |
---|
98 | 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 ; ex |
---|
99 | 4 4 4 4 4 4 4 4 5 5 5 5 6 6 0 0 ; fx |
---|
100 | ))) |
---|
101 | (lambda (i) (vector-ref table i)))) |
---|
102 | |
---|
103 | (define (ucs-integer->length x) |
---|
104 | (cond |
---|
105 | ((<= x #x7F) 1) |
---|
106 | ((<= x #x7FF) 2) |
---|
107 | ((<= x #xFFFF) 3) |
---|
108 | ((<= x #x1FFFFF) 4) |
---|
109 | (else (error "unicode codepoint out of range:" x)))) |
---|
110 | |
---|
111 | (define (utf8-index->offset s pos) |
---|
112 | (if (zero? pos) |
---|
113 | 0 |
---|
114 | (let ((limit (string-length s))) |
---|
115 | (let loop ((i 0) (count 0)) |
---|
116 | (cond |
---|
117 | ((= count pos) i) |
---|
118 | ((>= i limit) (error "index out of range" s pos)) |
---|
119 | (else |
---|
120 | (loop (+ i (utf8-start-byte->length (string-int-ref s i))) |
---|
121 | (+ count 1)))))))) |
---|
122 | |
---|
123 | (define (utf8-offset->index s off) |
---|
124 | (let ((limit (string-length s))) |
---|
125 | (let loop ((i 0) (count 0)) |
---|
126 | (cond |
---|
127 | ((>= i off) (if (= i off) count (- count 1))) |
---|
128 | ((>= i limit) (error "index out of range" s off)) |
---|
129 | (else |
---|
130 | (loop (+ i (utf8-start-byte->length (string-int-ref s i))) |
---|
131 | (+ count 1))))))) |
---|
132 | |
---|
133 | ;; return offset of previous char, or #f if at start of string |
---|
134 | (define (utf8-prev-char s off) |
---|
135 | (let loop ((i (- off 1))) |
---|
136 | (cond |
---|
137 | ((negative? i) #f) |
---|
138 | ((= #b10000000 (bitwise-and #b11000000 (string-int-ref s i))) |
---|
139 | (loop (- i 1))) |
---|
140 | (else i)))) |
---|
141 | |
---|
142 | ;; return offset of next char, or #f if at end of string |
---|
143 | (define (utf8-next-char s off) |
---|
144 | (let ((limit (string-length s))) |
---|
145 | (and (< off limit) |
---|
146 | (let ((res (+ off (utf8-start-byte->length (string-int-ref s off))))) |
---|
147 | (and (<= res limit) res))))) |
---|
148 | |
---|
149 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
---|
150 | ;; redefine string primitives |
---|
151 | |
---|
152 | (define (utf8-substring s start . opt) |
---|
153 | (with-substring-offsets substring s (cons start opt))) |
---|
154 | |
---|
155 | (define (utf8-string-length s) |
---|
156 | (let ((limit (string-length s))) |
---|
157 | (let lp ((i 0) (res 0)) |
---|
158 | (if (>= i limit) |
---|
159 | res |
---|
160 | (lp (+ i (utf8-start-byte->length (string-int-ref s i))) |
---|
161 | (+ res 1)))))) |
---|
162 | |
---|
163 | (define (with-substring-offsets proc s opt) |
---|
164 | (let* ((start (if (pair? opt) (car opt) 0)) |
---|
165 | (b1 (utf8-index->offset s start)) |
---|
166 | (opt2 (if (pair? opt) (cdr opt) '()))) |
---|
167 | (let ((limit (string-length s))) |
---|
168 | (if (pair? opt2) |
---|
169 | (let ((end (car opt2))) |
---|
170 | (let lp ((b2 b1) (count start)) |
---|
171 | (cond |
---|
172 | ((= count end) (proc s b1 b2)) |
---|
173 | ((> b2 limit) (error "index out of range" s end)) |
---|
174 | (else |
---|
175 | (lp (+ b2 (utf8-start-byte->length (string-int-ref s b2))) |
---|
176 | (+ count 1)))))) |
---|
177 | (proc s b1 limit)))) ) |
---|
178 | |
---|
179 | (define (with-two-substring-offsets proc s1 s2 opt) |
---|
180 | (with-substring-offsets |
---|
181 | (lambda (s1 start1 end1) |
---|
182 | (with-substring-offsets |
---|
183 | (lambda (s2 start2 end2) |
---|
184 | (proc s1 s2 start1 end1 start2 end2)) |
---|
185 | s2 (if (and (pair? opt) (pair? (cdr opt))) (cddr opt) '()))) |
---|
186 | s1 opt) ) |
---|
187 | |
---|
188 | (define (utf8-string->list str) |
---|
189 | (let ((limit (string-length str))) |
---|
190 | (let lp ((i 0) (res '())) |
---|
191 | (if (>= i limit) |
---|
192 | (reverse res) |
---|
193 | (lp (+ i (utf8-start-byte->length (string-int-ref str i))) |
---|
194 | (cons (sp-ref str i) res)))))) |
---|
195 | |
---|
196 | (define (make-utf8-string len . opt) |
---|
197 | (if (pair? opt) |
---|
198 | (let* ((c (car opt)) |
---|
199 | (c-i (char->integer c)) |
---|
200 | (c-len (ucs-integer->length c-i))) |
---|
201 | (if (<= c-len 1) |
---|
202 | (make-string len c) |
---|
203 | (let* ((size (* len c-len)) |
---|
204 | (res (make-string size))) |
---|
205 | (let lp ((i 0)) |
---|
206 | (if (>= i size) |
---|
207 | res |
---|
208 | (begin |
---|
209 | (string-set-at-byte-in-place! res size c-len i c-i) |
---|
210 | (lp (+ i c-len)))))))) |
---|
211 | (make-string len))) |
---|
212 | |
---|
213 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
---|
214 | ;; accessors |
---|
215 | |
---|
216 | (define (sp-ref s off) |
---|
217 | (let* ((c (string-int-ref s off)) |
---|
218 | (len (utf8-start-byte->length c)) |
---|
219 | (limit (string-length s))) |
---|
220 | (if (<= len 1) |
---|
221 | (integer->char c) |
---|
222 | (let ((end (+ off len))) |
---|
223 | (if (> end limit) |
---|
224 | (error "utf8 trailing char overflow" s off) |
---|
225 | (let loop ((i (+ off 1)) (res (extract-bit-field (- 7 len) 0 c))) |
---|
226 | (if (= i end) |
---|
227 | (integer->char res) |
---|
228 | (loop (+ i 1) |
---|
229 | (bitwise-ior (arithmetic-shift res 6) |
---|
230 | (bitwise-and #b00111111 |
---|
231 | (string-int-ref s i))))))))))) |
---|
232 | |
---|
233 | (define (sp-ref->string s off) |
---|
234 | (let* ((c (string-int-ref s off)) |
---|
235 | (len (utf8-start-byte->length c)) |
---|
236 | (limit (string-length s)) |
---|
237 | (end (+ off len))) |
---|
238 | (if (> end limit) |
---|
239 | (error "utf8 trailing char overflow" s off) |
---|
240 | (substring s off end)))) |
---|
241 | |
---|
242 | (define (utf8-string-ref s pos) |
---|
243 | (sp-ref s (utf8-index->offset s pos))) |
---|
244 | |
---|
245 | (define (string-set-at-byte-in-place! s limit c-len off val-i) |
---|
246 | (let ((end (+ off c-len))) |
---|
247 | (cond |
---|
248 | ((> end limit) |
---|
249 | (error "utf8 trailing char overflow" s off)) |
---|
250 | ((<= c-len 1) |
---|
251 | (string-int-set! s off val-i)) |
---|
252 | (else |
---|
253 | (let* ((tag (- (expt 2 c-len) 1)) |
---|
254 | (tag-shift (arithmetic-shift tag (- 8 c-len))) |
---|
255 | (body (extract-bit-field (- 7 c-len) |
---|
256 | (* 6 (- c-len 1)) |
---|
257 | val-i)) |
---|
258 | (b1 (bitwise-ior tag-shift body))) |
---|
259 | (string-int-set! s off b1)) |
---|
260 | (let loop ((i 1)) |
---|
261 | (unless (= i c-len) |
---|
262 | (let ((b (bitwise-ior |
---|
263 | #b10000000 |
---|
264 | (extract-bit-field 6 (* 6 (- c-len i 1)) val-i)))) |
---|
265 | (string-int-set! s (+ off i) b) |
---|
266 | (loop (+ i 1))))))))) |
---|
267 | |
---|
268 | (define (string-set-at-byte s size byte c-len val) |
---|
269 | (let ((s1 (substring s 0 byte)) |
---|
270 | (s2 (char->utf8-string val)) |
---|
271 | (s3 (substring s (+ byte c-len) size))) |
---|
272 | (string-append s1 s2 s3))) |
---|
273 | |
---|
274 | (define (sp-set! s off val) |
---|
275 | (let* ((limit (string-length s)) |
---|
276 | (c (string-int-ref s off)) |
---|
277 | (c-len (utf8-start-byte->length c)) |
---|
278 | (val-i (char->integer val)) |
---|
279 | (val-len (ucs-integer->length val-i))) |
---|
280 | (if (not (= c-len val-len)) |
---|
281 | ;; different size, allocate & become new string |
---|
282 | (let ((res (string-set-at-byte s limit off c-len val))) |
---|
283 | (##sys#become! (list (cons s res)))) |
---|
284 | ;; modify in place |
---|
285 | (string-set-at-byte-in-place! s limit c-len off val-i)))) |
---|
286 | |
---|
287 | (define (utf8-string-set! s pos val) |
---|
288 | (sp-set! s (utf8-index->offset s pos) val)) |
---|
289 | |
---|
290 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
---|
291 | ;; String Pointers |
---|
292 | |
---|
293 | (define (make-string-pointer s . opt) |
---|
294 | (if (pair? opt) |
---|
295 | (let ((pos (car opt))) |
---|
296 | (if (negative? pos) |
---|
297 | (sp-prev s (utf8-prev-char s (string-length s)) (+ pos 1)) |
---|
298 | (sp-next s 0 pos))) |
---|
299 | 0)) |
---|
300 | |
---|
301 | (define (string-pointer? obj) |
---|
302 | (integer? obj)) |
---|
303 | |
---|
304 | (define (sp-copy sp) sp) |
---|
305 | |
---|
306 | (define (sp-check-lo? s sp) |
---|
307 | (positive? sp)) |
---|
308 | |
---|
309 | (define (sp-check-hi? s sp) |
---|
310 | (< sp (string-length s))) |
---|
311 | |
---|
312 | (define (sp-check? s sp) |
---|
313 | (and (sp-check-lo? s sp) (sp-check-hi? s sp))) |
---|
314 | |
---|
315 | ;; returns the next string-pointer, or the string-length (an invalid |
---|
316 | ;; pointer) otherwise |
---|
317 | (define (sp-next s sp . opt) |
---|
318 | (let loop ((i (if (pair? opt) (car opt) 1)) |
---|
319 | (sp sp)) |
---|
320 | (if (positive? i) |
---|
321 | (let ((res (utf8-next-char s sp))) |
---|
322 | (if res |
---|
323 | (loop (- i 1) res) |
---|
324 | (string-length s))) |
---|
325 | sp))) |
---|
326 | |
---|
327 | (define (sp-prev s sp . opt) |
---|
328 | (let loop ((i (if (pair? opt) (car opt) 1)) |
---|
329 | (sp sp)) |
---|
330 | (if (positive? i) |
---|
331 | (let ((res (utf8-prev-char s sp))) |
---|
332 | (if res |
---|
333 | (loop (- i 1) res) |
---|
334 | -1)) |
---|
335 | sp))) |
---|
336 | |
---|
337 | (define (sp-first s) 0) |
---|
338 | (define (sp-last s) (string-length s)) |
---|
339 | |
---|
340 | (define (sp-before s sp) |
---|
341 | (substring s 0 sp)) |
---|
342 | |
---|
343 | (define (sp-after s sp) |
---|
344 | (substring s sp)) |
---|
345 | |
---|
346 | (define (sp-substring s . opt) |
---|
347 | (if (null? opt) |
---|
348 | (substring s 0) |
---|
349 | (apply substring s opt))) |
---|
350 | |
---|
351 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
---|
352 | ;; Basic I/O |
---|
353 | |
---|
354 | ;; now in the core library |
---|
355 | (define (char->utf8-string c) |
---|
356 | (##sys#char->utf8-string c)) |
---|
357 | |
---|
358 | (define (write-utf8-char c . opt) |
---|
359 | (display (char->utf8-string c) |
---|
360 | (if (pair? opt) (car opt) (current-output-port)))) |
---|
361 | |
---|
362 | (define (read-utf8-char . opt) |
---|
363 | (let* ((p (if (pair? opt) (car opt) (current-input-port))) |
---|
364 | (b1 (read-byte p))) |
---|
365 | (if (eof-object? b1) |
---|
366 | b1 |
---|
367 | (let ((len (utf8-start-byte->length b1))) |
---|
368 | (if (<= len 1) |
---|
369 | (integer->char b1) |
---|
370 | (let loop ((res (extract-bit-field (- 7 len) 0 b1)) |
---|
371 | (i (- len 1))) |
---|
372 | (if (zero? i) |
---|
373 | (integer->char res) |
---|
374 | (let ((b2 (read-byte p))) |
---|
375 | (cond |
---|
376 | ((eof-object? b2) b2) |
---|
377 | ((not (= #b10 (extract-bit-field 2 6 b2))) |
---|
378 | (error "invalid utf8 sequence")) |
---|
379 | (else |
---|
380 | (loop (bitwise-ior (arithmetic-shift res 6) |
---|
381 | (bitwise-and #b00111111 b2)) |
---|
382 | (- i 1)))))))))))) |
---|
383 | |
---|
384 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
---|
385 | |
---|
386 | (define-syntax in-utf8-string |
---|
387 | (syntax-rules () |
---|
388 | ((in-utf8-string ((var) (str)) next . rest) |
---|
389 | (in-utf8-string ((var off) (str)) next . rest)) |
---|
390 | ((in-utf8-string ((var off) (str)) next . rest) |
---|
391 | (next ((tmp str) (lim (sp-last tmp))) |
---|
392 | ((off (sp-first tmp) |
---|
393 | (fx+ off (utf8-start-byte->length (string-int-ref str off))))) |
---|
394 | ((fx>= off lim)) |
---|
395 | ((var (sp-ref tmp off))) |
---|
396 | () |
---|
397 | . rest)))) |
---|
398 | |
---|
399 | ) |
---|