Changeset 14307 in project


Ignore:
Timestamp:
04/20/09 04:48:56 (11 years ago)
Author:
Alex Shinn
Message:

charconv for chicken 4

Location:
release/4/charconv
Files:
1 deleted
3 edited
1 copied

Legend:

Unmodified
Added
Removed
  • release/4/charconv/trunk/charconv.meta

    r9914 r14307  
    1 ;;; charconv.meta -*- Hen -*-
     1;;; charconv.meta -*- scheme -*-
     2
    23((egg "charconv.egg")
    34 (synopsis "Character encoding utilities")
    45 (category parsing)
    5  (needs iconv autoload)
     6 (needs iconv)
    67 (license "BSD")
    78 (doc-from-wiki)
  • release/4/charconv/trunk/charconv.scm

    r9919 r14307  
    11;;;; charconv.scm -- encoding utils
    22;;
    3 ;; Copyright (c) 2004-2005 Alex Shinn
     3;; Copyright (c) 2004-2009 Alex Shinn
    44;; All rights reserved.
    55;;
     
    9393;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    9494
    95 (cond-expand
    96  (compiling
    97   (declare
    98    (fixnum)
    99    (export
    100     make-encoded-input-port make-encoded-output-port
    101     open-encoded-input-file open-encoded-output-file
    102     with-input-from-encoded-file with-output-to-encoded-file
    103     call-with-encoded-input-file call-with-encoded-output-file
    104     detect-encoding detect-file-encoding
    105     ces-equivalent? ces-upper-compatible? ces-convert
    106     #;ces-converted-length
    107     read-encoded-string
    108     )))
    109  (else
    110   ))
    111 
    112 (require-for-syntax 'autoload)
    113 
    114 (require-extension regex posix srfi-69)
    115 
    116 (autoload iconv iconv iconv-open)
     95(require-library regex iconv)
     96
     97(module charconv
     98  (
     99   make-encoded-input-port make-encoded-output-port
     100   open-encoded-input-file open-encoded-output-file
     101   with-input-from-encoded-file with-output-to-encoded-file
     102   call-with-encoded-input-file call-with-encoded-output-file
     103   detect-encoding detect-file-encoding
     104   ces-equivalent? ces-upper-compatible? ces-convert
     105   #;ces-converted-length
     106   read-encoded-string
     107   )
     108
     109(import scheme chicken extras regex ports posix srfi-69 iconv)
    117110
    118111;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     
    466459
    467460;; There is very little overlap in different encoding types, so rather
    468 ;; than a data-driven state-machine or statistical methods we just code
    469 ;; the states directly, which is fast and uses very little memory.  The
    470 ;; next step will be to use statistical analysis to detect between
    471 ;; languages (which can be avoided if the user has a preferred locale,
    472 ;; or if the file is in UTF-8).
    473 
    474 (define-macro (make-detect-state-machine next limit i c . states)
    475   (define (expand-clauses ls)
    476     (let lp ((ls ls) (res '()))
    477       (if (null? ls)
    478         (reverse res)
    479         (let ((check (car ls))
    480               (state (caddr ls)))
    481           (case (cadr ls)
    482             ((-->)
    483              (lp (cdddr ls) (cons `(,check ,state) res)))
    484             ((->)
    485              (if (and (pair? (cdddr ls)) (number? (cadddr ls)))
    486                (lp (cddddr ls) (cons `(,check (,state (+ ,i ,(cadddr ls)))) res))
    487                (lp (cdddr ls) (cons `(,check (,state (+ ,i 1))) res))))
    488             (else (error "invalid state machine: " ls)))))))
    489   (define (make-state ls)
    490     (let ((name (car ls))
    491           (final (cadr ls)))
    492       `(define (,name ,i)
    493          (if (>= ,i ,limit)
    494            ,final
    495            (let ((,c (,next i)))
    496              (cond
    497                ,@(expand-clauses (cddr ls))))))))
    498   `(begin ,@(map make-state states)))
     461;; than a data-driven state-machine or statistical methods we just
     462;; code the states directly, which is fast and uses very little
     463;; memory, but doesn't help distinguish between the single-byte 8-bit
     464;; encodings.  To address this we'll need to use statistical analysis
     465;; to detect between languages.  Hopefully, though, UTF-8 is replacing
     466;; most uses of the old 8-bit encodings.
     467
     468(define-syntax make-detect-state-machine
     469  (lambda (expr rename compare)
     470    (apply
     471     (lambda (next limit i c . states)
     472       (define (expand-clauses ls)
     473         (let lp ((ls ls) (res '()))
     474           (if (null? ls)
     475               (reverse res)
     476               (let ((check (car ls))
     477                     (state (caddr ls)))
     478                 (case (cadr ls)
     479                   ((-->)
     480                    (lp (cdddr ls) (cons `(,check ,state) res)))
     481                   ((->)
     482                    (if (and (pair? (cdddr ls)) (number? (cadddr ls)))
     483                        (lp (cddddr ls)
     484                            (cons `(,check
     485                                    (,state (,(rename '+) ,i ,(cadddr ls))))
     486                                  res))
     487                        (lp (cdddr ls)
     488                            (cons `(,check (,state (,(rename '+) ,i 1))) res))))
     489                   (else (error "invalid state machine: " ls)))))))
     490       (define (make-state ls)
     491         (let ((name (car ls))
     492               (final (cadr ls)))
     493           `(,(rename 'define) (,name ,i)
     494             (,(rename 'if) (,(rename '>=) ,i ,limit)
     495              ,final
     496              (,(rename 'let) ((,c (,next i)))
     497               (,(rename 'cond)
     498                ,@(expand-clauses (cddr ls))))))))
     499       `(,(rename 'begin) ,@(map make-state states)))
     500     (cdr expr))))
    499501
    500502(define (detect-encoding-type str)
     
    629631
    630632(define detect-file-encoding
    631   (let ((rx (regexp"^(?:[^\n]*\n)?[^\n]*-\\*-[^\n]*\\bcoding:\\s*\\b(\\S+)\\b[^\n]*-\\*-"#t))
     633  (let ((rx (regexp "^(?:[^\n]*\n)?[^\n]*-\\*-[^\n]*\\bcoding:\\s*\\b(\\S+)\\b[^\n]*-\\*-" #t))
    632634        (cache (make-hash-table string=?)))
    633635    (lambda (file . o)
     
    644646              res)))))))
    645647
     648)
  • release/4/charconv/trunk/charconv.setup

    r9915 r14307  
    11;; -*- mode: scheme -*-
    22
    3 (compile -s -O2 -d1 charconv.scm)
     3(compile -s -O2 -d1 -j charconv charconv.scm)
     4(compile -s -O2 -d1 charconv.import.scm)
     5
    46(install-extension
    57 'charconv
    6  '("charconv.html" "charconv.setup" "charconv.so")
    7  '((version "1.2.1")
     8 '("charconv.html" "charconv.so" "charconv.import.so")
     9 '((version "1.3.0")
    810   (documentation "charconv.html")))
    911
Note: See TracChangeset for help on using the changeset viewer.