source: project/release/3/fancypants/fancypants.scm @ 14852

Last change on this file since 14852 was 14852, checked in by sjamaan, 11 years ago

Add small bugfixes

File size: 10.8 KB
Line 
1;;; fancypants - Automatic ASCII smart quotes and ligature handling for SXML
2;
3; Copyright (c) 2006-2009 Peter Bex (Peter.Bex@xs4all.nl)
4; All rights reserved.
5;
6; Redistribution and use in source and binary forms, with or without
7; modification, are permitted provided that the following conditions
8; are met:
9; 1. Redistributions of source code must retain the above copyright
10;    notice, this list of conditions and the following disclaimer.
11; 2. Redistributions in binary form must reproduce the above copyright
12;    notice, this list of conditions and the following disclaimer in the
13;    documentation and/or other materials provided with the distribution.
14; 3. Neither the name of Peter Bex nor the names of any contributors may
15;    be used to endorse or promote products derived from this software
16;    without specific prior written permission.
17;
18; THIS SOFTWARE IS PROVIDED BY PETER BEX AND CONTRIBUTORS ``AS IS'' AND ANY
19; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21; DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS BE LIABLE
22; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26; LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27; OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28; SUCH DAMAGE.
29
30(declare
31 (hide point-split-string parts->regex flatten-string))
32
33(use srfi-1 srfi-13 sxml-transforms regex)
34
35;; Split up a string at predefined points, returning a list with the pieces.
36(define (point-split-string string points)
37  (let loop ((p points))
38    (cond
39     ((null? p) (list string))
40     ((= (string-length string) 0) '())
41     ((string-contains string (car p)) => (lambda (start)
42                                            (let ((len (string-length (car p))))
43                                              (append (point-split-string (string-take string start) points)
44                                                      (list (string-copy string start (+ start len)))
45                                                      (point-split-string (string-drop string (+ start len)) points)))))
46     (else (loop (cdr p))))))
47
48;; See http://www.unicode.org/charts/PDF/UFB00.pdf
49(define default-ligature-map
50  '(("ffi" . (& "#xfb03"))
51    ("ffl" . (& "#xfb04"))
52    ("ff"  . (& "#xfb00"))
53    ("fi"  . (& "#xfb01"))
54    ("fl"  . (& "#xfb02"))
55;   ("st"  . (& "#xfb06"))  ;; This one is too conspicuous in standard fonts
56    ("ft"  . (& "#xfb05"))))
57
58;; See http://www.unicode.org/charts/PDF/U2000.pdf
59(define default-punctuation-map
60  '(("..."   . (& "#x2026"))
61    (".."    . (& "#x2025"))
62    (". . ." . (& "#x2026"))
63    ; We could also use #x2013, #x2014 instead.
64    ("---"   . (& "mdash"))
65    ("--"    . (& "ndash"))))
66
67;; See http://www.unicode.org/charts/PDF/U2190.pdf
68(define default-arrow-map
69  '(("->>" . (& "#x21a0"))
70    ("<<-" . (& "#x219e"))
71    ("->|" . (& "#x21e5"))
72    ("|<-" . (& "#x21e4"))
73    ("<->" . (& "#x2194"))
74    ("->"  . (& "#x2192"))
75    ("<-"  . (& "#x2190"))
76    ("<=>" . (& "#x21d4"))
77    ("=>"  . (& "#x21d2"))
78    ("<="  . (& "#x21d0"))))
79
80(define default-map `(,@default-ligature-map
81                      ,@default-punctuation-map
82                      ,@default-arrow-map))
83
84;; Don't try to do anything with these
85(define default-exceptions '(head script pre code kbd samp @))
86
87;; Split ASCII "ligatures" and such from their surrounding strings and change
88;; them to their respective Unicode ligature. ie, "fine" => ((& "#xfb01") "ne")
89(define (fancify string character-map)
90  (map (lambda (piece)
91         (alist-ref piece character-map string=? piece))  ; alist-ref is Chicken-specific
92       (point-split-string string (map car character-map))))
93
94(define (make-fancy-rules . rest)
95  (let-optionals rest ((exceptions default-exceptions)
96                       (character-map default-map))
97    `(,@(map (lambda (x)
98               `(,x *preorder* . ,(lambda x x))) exceptions)
99      (*text* . ,(lambda (tag str)
100                   (if (string? str)
101                       (cons '*flatten* (fancify str character-map))
102                       str)))
103      (*default* . ,(lambda contents
104                      (flatten-strings contents))))))
105
106;; Structure of these lists: (pre match post how counts?)
107;; pre is the part of the string that's before the quote to match, post is the
108;; string that is after the match (any may be #f).
109;; how is one of single, double, single-open, double-open, single-close
110;; or double-close.
111;; counts? is a boolean describing whether the quote should influence the
112;; nesting of the next quote or not.  (ie, "isn't" => #f, since the '
113;; doesn't mean an opening quote is closed by the quote).
114;; Note that you (currently) can't use brackets in these regexes, since that
115;; messes up the expected structure of the result of string-search-positions.
116(define all-quotes '(("n" "'"  "t"  single-close #f) ; Aren't you?
117                     (#f  "'"  "re" single-close #f) ; We're here
118                     (#f  "'"  "s"  single-close #f) ; Jack's widget
119                     ("s" "'" " |$" single-close #f) ; The Jacksons' car
120                     ("^| " "'" "[0-9]+s" single-close #f) ; The '90s
121                     (#f  "\"" #f   double #t)
122                     (#f  "``" #f   double-open #t)
123                     (#f  "''" #f   double-close #t)
124                     (#f  "'"  #f   single #t)))
125
126;; Note we might need to escape parens.  In the end I probably want to
127;; replace this hack with SREs (scsh-regexp), which are composable.
128(define (parts->regex pre match post)
129  (string-append "(" (or pre "") ")"
130                 "(" (or match "") ")"
131                 "(" (or post "") ")"))
132
133;; See http://www.unicode.org/charts/PDF/U2000.pdf
134;; This is pretty ugly code.
135(define (smarten-quotes contents #!optional (quotes all-quotes) (exceptions default-exceptions))
136  (let ((single-open-count 0)
137        (double-open-count 0)
138        (big-regex  (string-join (map (lambda (parts)
139                                        (parts->regex (first parts) (second parts) (third parts)))
140                                      quotes) "|")))
141    (let loop ((contents contents)
142               (result '()))
143      (cond
144       ((null? contents) (reverse result))
145       ((member (car contents) exceptions) (append (reverse result) contents))
146       ((pair? (car contents)) (loop (cdr contents) (cons (loop (car contents) '()) result)))
147       ((string? (car contents))
148        (let string-loop ((str (car contents))
149                          (result-strings '()))
150          (let ((pos (string-search-positions big-regex str)))
151            (if (not pos)
152                (let ((string-list (append result-strings (list str))))
153                  (if (null? string-list)
154                      (loop (cdr contents) result)
155                      (loop (cdr contents) (cons (cons '*flatten* string-list) result))))
156                (let* ((before (string-take str (caar pos))) ;; non-matching part
157                       (after  (string-drop str (cadar pos))) ;; non-matching part
158                       (match-pos  (list-index identity (cdr pos)))
159                       (parts (car (drop quotes (quotient match-pos 3)))) ;; Three parts of the matching quotes
160                       (matching (drop (cdr pos) match-pos)) ;; Matching positions (corresponding to parts)
161                       (pre  (string-copy str (car (first matching)) (cadr (first matching))))
162                       (post (string-copy str (car (third matching)) (cadr (third matching))))
163                       (new-quote 
164                        (case (fourth parts)
165                          ((single-open)
166                           (when (fifth parts)
167                               (set! single-open-count (add1 single-open-count)))
168                           '(& "#x2018"))
169                          ((single-close)
170                           (when (and (fifth parts) (> single-open-count 0))
171                               (set! single-open-count (sub1 single-open-count)))
172                           '(& "#x2019"))
173                          ((double-open)
174                           (when (fifth parts)
175                               (set! double-open-count (add1 double-open-count)))
176                           '(& "#x201c"))
177                          ((double-close)
178                           (when (and (fifth parts) (> double-open-count 0))
179                               (set! double-open-count (sub1 double-open-count)))
180                           '(& "#x201d"))
181                          ;; For the balanced ones, close it if it was open,
182                          ;; open it if it was closed
183                          ((single)
184                           (if (> single-open-count 0)
185                               (begin
186                                 (when (fifth parts) (set! single-open-count (sub1 single-open-count)))
187                                 '(& "#x2019"))
188                               (begin
189                                 (when (fifth parts) (set! single-open-count (add1 single-open-count)))
190                                 '(& "#x2018"))))
191                          ((double)
192                           (if (> double-open-count 0)
193                               (begin
194                                 (when (fifth parts) (set! double-open-count (sub1 double-open-count)))
195                                 '(& "#x201d"))
196                               (begin
197                                 (when (fifth parts) (set! double-open-count (add1 double-open-count)))
198                                 '(& "#x201c"))))
199                          (else (error 'smarten-quotes "Unkown quote matching type: " (fourth parts))))))
200                  (string-loop
201                   after
202                   ;; XXX don't use empty strings for before/pre and post
203                   (append result-strings
204                           (list (string-append before pre) new-quote post))))))))
205       (else (loop (cdr contents) (cons (car contents) result)))))))
206
207;; We have to jump through some hoops to get the SXML normalized again.
208;; That's what you get when trying to map one string to a list of strings :)
209;;
210;; NOTE: There's probably a way to eliminate all this reversing.
211;; This is inefficient.
212(define (flatten-strings data)
213  (let loop ((data data)
214             (result '()))
215    (cond
216     ((null? data) (reverse result))
217     ((not (pair? data)) (cons data result))
218     ((and (pair? (car data)) (eq? (caar data) '*flatten*))
219      (loop (cdr data) (append (reverse (loop (cdar data) '())) result)))
220     ((pair? (car data)) (loop (cdr data) (cons (loop (car data) '()) result)))
221     (else (loop (cdr data) (cons (car data) result))))))
222
223;; This is a very simple implementation, it doesn't really use pre-post-order.
224;; It's handy nonetheless because you can easily integrate it into an existing
225;; pre-post-order chain with sxml-apply-rules.
226(define (make-smart-quote-rules . rest)
227  (let-optionals rest ((exceptions default-exceptions)
228                       (quotes all-quotes))
229     `((*text* . ,(lambda (tag data) data))  ;; Not needed?
230       (*default* *preorder* . ,(lambda (tag . contents)
231                                  (flatten-strings (cons tag (smarten-quotes contents quotes exceptions))))))))
Note: See TracBrowser for help on using the repository browser.