;;; $Id: generate-sets.scm,v 1.57 2025/07/01 22:26:21 wcm Exp wcm $
;;;
;;; Copyright (C) 2025 Wolfgang Corcoran-Mathe <wcm@sigwinch.xyz>
;;;
;;; Permission is hereby granted, free of charge, to any person
;;; obtaining a copy of this software and associated documentation
;;; files (the "Software"), to deal in the Software without
;;; restriction, including without limitation the rights to use,
;;; copy, modify, merge, publish, distribute, sublicense, and/or sell
;;; copies of the Software, and to permit persons to whom the Software
;;; is furnished to do so, subject to the following conditions:
;;;
;;; The above copyright notice and this permission notice shall be
;;; included in all copies or substantial nortions of the Software.
;;;
;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
;;; KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
;;; WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE
;;; AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
;;; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
;;; OTHER DEALINGS IN THE SOFTWARE.

;;; This script generates modules for a collection of Unicode character
;;; sets. Each set has its own module, e.g. (unicode-char-sets han).

;;; Each set is constructed by taking the union of a number of singleton
;;; or range sets.

(import (scheme)
        (chicken base)
        (chicken format)
        (chicken irregex)
        (chicken io)
        (chicken pathname)
        (chicken port)
        (chicken process)
        (chicken string)
        (chicken time posix)
        (only (srfi 1) append-map)
        (srfi 69))

;;; Parse Unicode Scripts.txt tables & produce a table associating
;;; each script property with a list of codepoint ranges.

;; Range = Integer | (List Integer)

;; Pattern describing a single UCD record. A range is either a
;; single codepoint ("single") or a codepoint interval ("low" &
;; "high").
(define property-record-pattern
  (irregex '(: (or (: (submatch-named low (+ hex-digit))  ; range
                      ".."
                      (submatch-named high (+ hex-digit)))
                   (submatch-named single (+ hex-digit)))
               (* whitespace) ";" (* whitespace)
               (submatch-named property
                 (+ (or alphabetic ("_"))))
               (* any))))

;; parse-property-record : String -> (Pair String Range)
;; Parse a UCD record into property & range components.
(define (parse-property-record string)
  (cond ((irregex-match property-record-pattern string) =>
         (lambda (m)
           (cons (extract-property string m)
                 (parse-range string m))))
        (else (error 'parse-property-record
                     "invalid record"
                     string))))

;; extract-property-record : String x Irx-Match -> String
(define (extract-property string match)
  (or (irregex-match-substring match 'property)
      (error 'extract-property
             "invalid record: no property field?"
             string)))

;; parse-range : String x Irx-Match -> Range
(define (parse-range string match)
  (cond ((irregex-match-substring match 'low) =>
         (lambda (low)
           (cond ((irregex-match-substring match 'high) =>
                  (lambda (high)
                    (list (string->number low 16)
                          (string->number high 16))))
                 (else (error 'parse-range
                              "upper bound of range missing"
                              string)))))
        ((irregex-match-substring match 'single) =>
         (lambda (s) (string->number s 16)))
        (else (error 'parse-range
                     "invalid record: couldn't parse range"
                     string))))

;; parse-property-records : Input-Port ->
;;                            (Hash-table String (List Range))
;;
;; Parse all UCD records read from *port* & return a table mapping
;; each property to a list of codepoint ranges with that property.
(define (parse-property-records port)
  (let ((table (make-hash-table string=? string-hash)))
    (port-for-each
     (lambda (s)
       (unless (blank-or-comment-line? s)
         (let ((p (parse-property-record s)))
           (hash-table-update! table
                               (car p)
                               (lambda (rs) (cons (cdr p) rs))
                               (lambda () '())))))
     (lambda () (read-line port)))
    table))

(define blank-or-comment-pattern
  (irregex '(: (* whitespace) (? "#" (* any)))))

(define (blank-or-comment-line? string)
  (irregex-match? blank-or-comment-pattern string))

;;; Code emitters

;; emit-property-set-definition : (Hash-table String (List Range)) x
;;                                  String x
;;                                  (List String) -> Void
;;
;; Print the definition of a single char set containing all of
;; the codepoints with a property in *properties*.
(define (emit-property-set-definition property-table name properties)
  (let ((ranges (collect-ranges property-table properties)))
    (when (null? ranges)
      (warning "Emitting empty set!" name properties))
    (printf "(define ~a~%" name)
    (printf "(let ((ranges '~S))~%" ranges)
    (print "(foldl adjoin-range (iset) ranges)))")
    (printf "(iset-optimize! ~a)~%~%" name)))

;; collect-ranges : (Hash-table String (List Range)) x
;;                    (List String) -> (List Range)
;;
;; Merge ranges for the given property-keys.
(define (collect-ranges table keys)
  (append-map (lambda (k)
                (hash-table-ref table k))
              keys))

;; emit-timestamp : () -> Void
;;
;; Print a comment containing a UTC timestamp in ISO 8601 format.
(define (emit-timestamp)
  (printf ";;; Generated ~a~%"
          (time->string (seconds->utc-time) "%FT%TZ")))

;; Prefix for all emitted modules.
(define library-namespace "unicode-char-sets")

;; name->module-path : String -> Pathname
(define (name->module-path name)
  (make-pathname #f
                 (string-append library-namespace "." name)
                 "scm"))

;;; Set specifications

;;; Char set specifications

;; Generated from Scripts.txt
(define script-set-specs
  '((arabic              Arabic)
    (armenian            Armenian)
    (bengali             Bengali)
    (bopomofo            Bopomofo)
    (braille             Braille)
    (buhid               Buhid)
    (canadian-aboriginal Canadian_Aboriginal)
    (cherokee            Cherokee)
    (common              Common)
    (cypriot             Cypriot)
    (cyrillic            Cyrillic)
    (deseret             Deseret)
    (devanagari          Devanagari)
    (ethiopic            Ethiopic)
    (georgian            Georgian)
    (gothic              Gothic)
    (greek               Greek)
    (gujarati            Gujarati)
    (gurmukhi            Gurmukhi)
    (han                 Han)
    (hangul              Hangul)
    (hanunoo             Hanunoo)
    (hebrew              Hebrew)
    (hiragana            Hiragana)
    (inherited           Inherited)
    (kannada             Kannada)
    (katakana            Katakana)
    (khmer               Khmer)
    (lao                 Lao)
    (latin               Latin)
    (limbu               Limbu)
    (linear-b            Linear_B)
    (malayalam           Malayalam)
    (mongolian           Mongolian)
    (myanmar             Myanmar)
    (ogham               Ogham)
    (old-italic          Old_Italic)
    (oriya               Oriya)
    (osmanya             Osmanya)
    (runic               Runic)
    (sinhala             Sinhala)
    ))

;; Generated from PropList.txt
(define proplist-set-specs
  '((bidi-control         Bidi_Control)
    (white-space          White_Space)
    (dash                 Dash)
    (deprecated           Deprecated)
    (diacritic            Diacritic)
    (extender             Extender)
    (grapheme-extend      Other_Grapheme_Extend)
    (hex-digit            Hex_Digit)
    (hyphen               Hyphen)
    (id-start             Other_ID_Start)
    (id-continue          Other_ID_Continue)
    (ideographic          Ideographic)
    (ids-unary-operator   IDS_Unary_Operator)
    (ids-binary-operator  IDS_Binary_Operator)
    (ids-trinary-operator IDS_Trinary_Operator)
    (join-control         Join_Control)
    (other-alphabetic     Other_Alphabetic)
    (other-math           Other_Math)
    (quotation-mark       Quotation_Mark)
    (radical              Radical)
    (default-ignorable-code-point Other_Default_Ignorable_Code_Point)
    (logical-order-exception Logical_Order_Exception)
    (terminal-punctuation Terminal_Punctuation)
    ))

;; Generated from DerivedCoreProperties.txt
(define derivedcore-set-specs
  '((alphabetic Alphabetic)
    (grapheme-base Grapheme_Base)
    (grapheme-link Grapheme_Link)
    (lowercase  Lowercase)
    (math       Math)
    (uppercase  Uppercase)))

;; Generated from DerivedNumericType.txt
(define numeric-set-specs
  '((numeric Decimal Digit Numeric)
    (digit   Decimal Digit)
    ))

;; Generate from emoji-data.txt
(define emoji-set-specs
  '((emoji Emoji)
  ))

;;; Driver

;; emit-set-modules : () -> Void
;;
;; Emit all modules.
(define (emit-set-modules)
  (for-each
   (lambda (p)
     (let ((table (call-with-input-file (car p)
                                        parse-property-records)))
       (emit-property-set-modules table (cdr p))))
   `(("Scripts.txt" . ,script-set-specs)
     ("PropList.txt" . ,proplist-set-specs)
     ("DerivedCoreProperties.txt" . ,derivedcore-set-specs)
     ("DerivedNumericType.txt" . ,numeric-set-specs)
     ("emoji-data.txt" . ,emoji-set-specs)
     ))
  (exit 0))

(define (emit-adjoin-range)
  (display #<<END
(define (adjoin-range set r)
  (cond ((integer? r) (iset-adjoin set r))
        ((pair? r)
         (iset-union set (make-iset (car r) (cadr r))))))
END
  )
  (newline)
  (newline))

;; emit-property-set-modules : (Hash-table String (List Range)) x
;;                              (List (Pair Symbol Symbol)) ->
;;                              Void
;;
;; For each pair in *set-specs*, emit a Scheme source file which
;; defines a module exporting a single char-set definition.
(define (emit-property-set-modules prop-table set-specs)
  (for-each
   (lambda (p)
     (let* ((name (symbol->string (car p)))
            (set-name (string-append "char-set:" name))
            (prop (map symbol->string (cdr p))))
       (with-output-to-file
        (name->module-path name)
        (lambda ()
          (emit-timestamp)
          (printf "(module (~a ~a) (~A)~%"
                  library-namespace
                  name
                  set-name)
          (print "(import (scheme) (chicken base) (iset))")
          (emit-adjoin-range)
          (emit-property-set-definition prop-table set-name prop)
          (print ")")))))
   set-specs))

;;; UCD file fetching

(define unicode-version "16.0.0")

(define url-base "ftp://ftp.unicode.org/Public")

;; Everything is *not* in the same place, so adjust these carefully.
(define urls-to-download
  (list (sprintf "~a/~a/ucd/Scripts.txt" url-base unicode-version)
        (sprintf "~a/~a/ucd/PropList.txt" url-base unicode-version)
        (sprintf "~a/~a/ucd/DerivedCoreProperties.txt"
                 url-base
                 unicode-version)
        (sprintf "~a/~a/ucd/extracted/DerivedNumericType.txt"
                 url-base
                 unicode-version)
        (sprintf "~a/~a/ucd/emoji/emoji-data.txt"
                 url-base
                 unicode-version)
        ))

;; fetch-file : String -> Void
;;
;; Attempt to retrieve a file from the Internet. This is a hack:
;; we try curl(1), we try wget(1), & then we give up.
(define (fetch-file url)
  (let ((r (system (string-append "curl -OL " url))))
    (unless (zero? r)
      (let ((r* (system (string-append "wget " url))))
        (unless (zero? r*)
          (error 'fetch-file "failed to fetch file" url))))))

;; fetch-all : () -> Void
(define (fetch-all)
  (print "Fetching Unicode data files ...")
  (for-each fetch-file urls-to-download))

;;; Main script entry point

(fetch-all)
(emit-set-modules)
