Ticket #1851: generate-sets.scm

File generate-sets.scm, 12.0 KB (added by zaifir, 33 hours ago)

Script to generate Unicode char set modules from current UCD data.

Line 
1;;; $Id: generate-sets.scm,v 1.57 2025/07/01 22:26:21 wcm Exp wcm $
2;;;
3;;; Copyright (C) 2025 Wolfgang Corcoran-Mathe <wcm@sigwinch.xyz>
4;;;
5;;; Permission is hereby granted, free of charge, to any person
6;;; obtaining a copy of this software and associated documentation
7;;; files (the "Software"), to deal in the Software without
8;;; restriction, including without limitation the rights to use,
9;;; copy, modify, merge, publish, distribute, sublicense, and/or sell
10;;; copies of the Software, and to permit persons to whom the Software
11;;; is furnished to do so, subject to the following conditions:
12;;;
13;;; The above copyright notice and this permission notice shall be
14;;; included in all copies or substantial nortions of the Software.
15;;;
16;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
17;;; KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
18;;; WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE
19;;; AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
20;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
21;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22;;; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23;;; OTHER DEALINGS IN THE SOFTWARE.
24
25;;; This script generates modules for a collection of Unicode character
26;;; sets. Each set has its own module, e.g. (unicode-char-sets han).
27
28;;; Each set is constructed by taking the union of a number of singleton
29;;; or range sets.
30
31(import (scheme)
32        (chicken base)
33        (chicken format)
34        (chicken irregex)
35        (chicken io)
36        (chicken pathname)
37        (chicken port)
38        (chicken process)
39        (chicken string)
40        (chicken time posix)
41        (only (srfi 1) append-map)
42        (srfi 69))
43
44;;; Parse Unicode Scripts.txt tables & produce a table associating
45;;; each script property with a list of codepoint ranges.
46
47;; Range = Integer | (List Integer)
48
49;; Pattern describing a single UCD record. A range is either a
50;; single codepoint ("single") or a codepoint interval ("low" &
51;; "high").
52(define property-record-pattern
53  (irregex '(: (or (: (submatch-named low (+ hex-digit))  ; range
54                      ".."
55                      (submatch-named high (+ hex-digit)))
56                   (submatch-named single (+ hex-digit)))
57               (* whitespace) ";" (* whitespace)
58               (submatch-named property
59                 (+ (or alphabetic ("_"))))
60               (* any))))
61
62;; parse-property-record : String -> (Pair String Range)
63;; Parse a UCD record into property & range components.
64(define (parse-property-record string)
65  (cond ((irregex-match property-record-pattern string) =>
66         (lambda (m)
67           (cons (extract-property string m)
68                 (parse-range string m))))
69        (else (error 'parse-property-record
70                     "invalid record"
71                     string))))
72
73;; extract-property-record : String x Irx-Match -> String
74(define (extract-property string match)
75  (or (irregex-match-substring match 'property)
76      (error 'extract-property
77             "invalid record: no property field?"
78             string)))
79
80;; parse-range : String x Irx-Match -> Range
81(define (parse-range string match)
82  (cond ((irregex-match-substring match 'low) =>
83         (lambda (low)
84           (cond ((irregex-match-substring match 'high) =>
85                  (lambda (high)
86                    (list (string->number low 16)
87                          (string->number high 16))))
88                 (else (error 'parse-range
89                              "upper bound of range missing"
90                              string)))))
91        ((irregex-match-substring match 'single) =>
92         (lambda (s) (string->number s 16)))
93        (else (error 'parse-range
94                     "invalid record: couldn't parse range"
95                     string))))
96
97;; parse-property-records : Input-Port ->
98;;                            (Hash-table String (List Range))
99;;
100;; Parse all UCD records read from *port* & return a table mapping
101;; each property to a list of codepoint ranges with that property.
102(define (parse-property-records port)
103  (let ((table (make-hash-table string=? string-hash)))
104    (port-for-each
105     (lambda (s)
106       (unless (blank-or-comment-line? s)
107         (let ((p (parse-property-record s)))
108           (hash-table-update! table
109                               (car p)
110                               (lambda (rs) (cons (cdr p) rs))
111                               (lambda () '())))))
112     (lambda () (read-line port)))
113    table))
114
115(define blank-or-comment-pattern
116  (irregex '(: (* whitespace) (? "#" (* any)))))
117
118(define (blank-or-comment-line? string)
119  (irregex-match? blank-or-comment-pattern string))
120
121;;; Code emitters
122
123;; emit-property-set-definition : (Hash-table String (List Range)) x
124;;                                  String x
125;;                                  (List String) -> Void
126;;
127;; Print the definition of a single char set containing all of
128;; the codepoints with a property in *properties*.
129(define (emit-property-set-definition property-table name properties)
130  (let ((ranges (collect-ranges property-table properties)))
131    (when (null? ranges)
132      (warning "Emitting empty set!" name properties))
133    (printf "(define ~a~%" name)
134    (printf "(let ((ranges '~S))~%" ranges)
135    (print "(foldl adjoin-range (iset) ranges)))")
136    (printf "(iset-optimize! ~a)~%~%" name)))
137
138;; collect-ranges : (Hash-table String (List Range)) x
139;;                    (List String) -> (List Range)
140;;
141;; Merge ranges for the given property-keys.
142(define (collect-ranges table keys)
143  (append-map (lambda (k)
144                (hash-table-ref table k))
145              keys))
146
147;; emit-timestamp : () -> Void
148;;
149;; Print a comment containing a UTC timestamp in ISO 8601 format.
150(define (emit-timestamp)
151  (printf ";;; Generated ~a~%"
152          (time->string (seconds->utc-time) "%FT%TZ")))
153
154;; Prefix for all emitted modules.
155(define library-namespace "unicode-char-sets")
156
157;; name->module-path : String -> Pathname
158(define (name->module-path name)
159  (make-pathname #f
160                 (string-append library-namespace "." name)
161                 "scm"))
162
163;;; Set specifications
164
165;;; Char set specifications
166
167;; Generated from Scripts.txt
168(define script-set-specs
169  '((arabic              Arabic)
170    (armenian            Armenian)
171    (bengali             Bengali)
172    (bopomofo            Bopomofo)
173    (braille             Braille)
174    (buhid               Buhid)
175    (canadian-aboriginal Canadian_Aboriginal)
176    (cherokee            Cherokee)
177    (common              Common)
178    (cypriot             Cypriot)
179    (cyrillic            Cyrillic)
180    (deseret             Deseret)
181    (devanagari          Devanagari)
182    (ethiopic            Ethiopic)
183    (georgian            Georgian)
184    (gothic              Gothic)
185    (greek               Greek)
186    (gujarati            Gujarati)
187    (gurmukhi            Gurmukhi)
188    (han                 Han)
189    (hangul              Hangul)
190    (hanunoo             Hanunoo)
191    (hebrew              Hebrew)
192    (hiragana            Hiragana)
193    (inherited           Inherited)
194    (kannada             Kannada)
195    (katakana            Katakana)
196    (khmer               Khmer)
197    (lao                 Lao)
198    (latin               Latin)
199    (limbu               Limbu)
200    (linear-b            Linear_B)
201    (malayalam           Malayalam)
202    (mongolian           Mongolian)
203    (myanmar             Myanmar)
204    (ogham               Ogham)
205    (old-italic          Old_Italic)
206    (oriya               Oriya)
207    (osmanya             Osmanya)
208    (runic               Runic)
209    (sinhala             Sinhala)
210    ))
211
212;; Generated from PropList.txt
213(define proplist-set-specs
214  '((bidi-control         Bidi_Control)
215    (white-space          White_Space)
216    (dash                 Dash)
217    (deprecated           Deprecated)
218    (diacritic            Diacritic)
219    (extender             Extender)
220    (grapheme-extend      Other_Grapheme_Extend)
221    (hex-digit            Hex_Digit)
222    (hyphen               Hyphen)
223    (id-start             Other_ID_Start)
224    (id-continue          Other_ID_Continue)
225    (ideographic          Ideographic)
226    (ids-unary-operator   IDS_Unary_Operator)
227    (ids-binary-operator  IDS_Binary_Operator)
228    (ids-trinary-operator IDS_Trinary_Operator)
229    (join-control         Join_Control)
230    (other-alphabetic     Other_Alphabetic)
231    (other-math           Other_Math)
232    (quotation-mark       Quotation_Mark)
233    (radical              Radical)
234    (default-ignorable-code-point Other_Default_Ignorable_Code_Point)
235    (logical-order-exception Logical_Order_Exception)
236    (terminal-punctuation Terminal_Punctuation)
237    ))
238
239;; Generated from DerivedCoreProperties.txt
240(define derivedcore-set-specs
241  '((alphabetic Alphabetic)
242    (grapheme-base Grapheme_Base)
243    (grapheme-link Grapheme_Link)
244    (lowercase  Lowercase)
245    (math       Math)
246    (uppercase  Uppercase)))
247
248;; Generated from DerivedNumericType.txt
249(define numeric-set-specs
250  '((numeric Decimal Digit Numeric)
251    (digit   Decimal Digit)
252    ))
253
254;; Generate from emoji-data.txt
255(define emoji-set-specs
256  '((emoji Emoji)
257  ))
258
259;;; Driver
260
261;; emit-set-modules : () -> Void
262;;
263;; Emit all modules.
264(define (emit-set-modules)
265  (for-each
266   (lambda (p)
267     (let ((table (call-with-input-file (car p)
268                                        parse-property-records)))
269       (emit-property-set-modules table (cdr p))))
270   `(("Scripts.txt" . ,script-set-specs)
271     ("PropList.txt" . ,proplist-set-specs)
272     ("DerivedCoreProperties.txt" . ,derivedcore-set-specs)
273     ("DerivedNumericType.txt" . ,numeric-set-specs)
274     ("emoji-data.txt" . ,emoji-set-specs)
275     ))
276  (exit 0))
277
278(define (emit-adjoin-range)
279  (display #<<END
280(define (adjoin-range set r)
281  (cond ((integer? r) (iset-adjoin set r))
282        ((pair? r)
283         (iset-union set (make-iset (car r) (cadr r))))))
284END
285  )
286  (newline)
287  (newline))
288
289;; emit-property-set-modules : (Hash-table String (List Range)) x
290;;                              (List (Pair Symbol Symbol)) ->
291;;                              Void
292;;
293;; For each pair in *set-specs*, emit a Scheme source file which
294;; defines a module exporting a single char-set definition.
295(define (emit-property-set-modules prop-table set-specs)
296  (for-each
297   (lambda (p)
298     (let* ((name (symbol->string (car p)))
299            (set-name (string-append "char-set:" name))
300            (prop (map symbol->string (cdr p))))
301       (with-output-to-file
302        (name->module-path name)
303        (lambda ()
304          (emit-timestamp)
305          (printf "(module (~a ~a) (~A)~%"
306                  library-namespace
307                  name
308                  set-name)
309          (print "(import (scheme) (chicken base) (iset))")
310          (emit-adjoin-range)
311          (emit-property-set-definition prop-table set-name prop)
312          (print ")")))))
313   set-specs))
314
315;;; UCD file fetching
316
317(define unicode-version "16.0.0")
318
319(define url-base "ftp://ftp.unicode.org/Public")
320
321;; Everything is *not* in the same place, so adjust these carefully.
322(define urls-to-download
323  (list (sprintf "~a/~a/ucd/Scripts.txt" url-base unicode-version)
324        (sprintf "~a/~a/ucd/PropList.txt" url-base unicode-version)
325        (sprintf "~a/~a/ucd/DerivedCoreProperties.txt"
326                 url-base
327                 unicode-version)
328        (sprintf "~a/~a/ucd/extracted/DerivedNumericType.txt"
329                 url-base
330                 unicode-version)
331        (sprintf "~a/~a/ucd/emoji/emoji-data.txt"
332                 url-base
333                 unicode-version)
334        ))
335
336;; fetch-file : String -> Void
337;;
338;; Attempt to retrieve a file from the Internet. This is a hack:
339;; we try curl(1), we try wget(1), & then we give up.
340(define (fetch-file url)
341  (let ((r (system (string-append "curl -OL " url))))
342    (unless (zero? r)
343      (let ((r* (system (string-append "wget " url))))
344        (unless (zero? r*)
345          (error 'fetch-file "failed to fetch file" url))))))
346
347;; fetch-all : () -> Void
348(define (fetch-all)
349  (print "Fetching Unicode data files ...")
350  (for-each fetch-file urls-to-download))
351
352;;; Main script entry point
353
354(fetch-all)
355(emit-set-modules)