source: project/release/4/uri-common/trunk/uri-common.scm @ 13176

Last change on this file since 13176 was 13176, checked in by sjamaan, 11 years ago

Work on normalization and add tests for that

File size: 12.7 KB
Line 
1;;
2;; URI-common provides URI handling procedures for common URI schemes
3;; that are based on the generic syntax such as http, https, file, ftp.
4;; It also provides automatic form-urlencoded query argument
5;; encoding/decoding
6;;
7; Copyright (c) 2008-2009, Peter Bex
8; All rights reserved.
9;
10; Redistribution and use in source and binary forms, with or without
11; modification, are permitted provided that the following conditions
12; are met:
13;
14; 1. Redistributions of source code must retain the above copyright
15;    notice, this list of conditions and the following disclaimer.
16; 2. Redistributions in binary form must reproduce the above copyright
17;    notice, this list of conditions and the following disclaimer in the
18;    documentation and/or other materials provided with the distribution.
19; 3. Neither the name of the author nor the names of its
20;    contributors may be used to endorse or promote products derived
21;    from this software without specific prior written permission.
22;
23; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26; FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27; COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
28; INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
29; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31; HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
32; STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
33; ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
34; OF THE POSSIBILITY OF SUCH DAMAGE.
35;
36; Please report bugs, suggestions and ideas to the Chicken Trac
37; ticket tracking system (assign tickets to user 'sjamaan'):
38; http://trac.callcc.org
39
40(provide 'uri-common)
41
42(module uri-common
43  (uri-reference absolute-uri uri->uri-generic uri-generic->uri
44   update-uri uri? uri-scheme uri-username uri-password
45   uri-host uri-port uri-path uri-query uri-fragment
46   uri->string form-urlencode form-urldecode
47   uri-relative-to uri-relative-from
48   uri-normalize-path-segments uri-normalize-case
49   char-set:query/fragment)
50
51(import chicken scheme)
52(require-extension srfi-1 srfi-13 srfi-14 extras data-structures
53                   defstruct matchable)
54(require-library uri-generic)
55(import (prefix uri-generic generic:))
56
57;; We could use the hostinfo egg for this, but that would be yet another
58;; dependency. Besides, not all service names have a matching URI scheme
59;; nor do all URI schemes have a matching service name.
60(define default-ports
61  '((http . 80)              ; RFC 2616
62    (https . 443)            ; RFC 2818
63    (shttp . 80)             ; RFC 2660
64    (ftp . 21)               ; RFC 959; no official URI scheme defined
65    ;; nonstandard, but could be useful
66    (svn+ssh . 22)
67    (svn . 3690)
68    ))
69
70;; A common URI is a generic URI plus stored decoded versions of most components
71(defstruct URI-common
72  generic username password host path query fragment)
73
74(define-record-printer (URI-common x out)
75  (fprintf out "#(URI-common scheme=~S port=~S host=~S path=~S query=~S fragment=~S"
76           (generic:uri-scheme (URI-common-generic x))
77           (generic:uri-port (URI-common-generic x))
78           (URI-common-host x)
79           (URI-common-path x)
80           (URI-common-query x)
81           (URI-common-fragment x)))
82
83;;; Conversion procedures
84(define (uri->uri-generic uri)
85  (URI-common-generic uri))
86
87(define (uri-reference u)
88  (uri-generic->uri (generic:uri-reference u)))
89
90(define (absolute-uri u)
91  (uri-generic->uri (generic:absolute-uri u)))
92
93(define (uri-generic->uri uri)
94  (make-URI-common generic: uri
95                   username: (decode-string* (generic:uri-username uri))
96                   password: (decode-string* (generic:uri-password uri))
97                   host:     (decode-string* (generic:uri-host uri))
98                   path:     (decode-path (generic:uri-path uri))
99                   query:    (form-urldecode (generic:uri-query uri))
100                   fragment: (decode-string* (generic:uri-fragment uri))))
101
102(define (decode-string* s)
103  (and s (generic:uri-decode-string s)))
104
105;;; Accessors
106(define uri?         URI-common?)
107(define uri-scheme   (compose generic:uri-scheme URI-common-generic))
108(define uri-username URI-common-username)
109(define uri-password URI-common-password)
110(define uri-host     URI-common-host)
111(define uri-query    URI-common-query)
112(define uri-fragment URI-common-fragment)
113
114(define (uri-path uc)
115  (let ((path (URI-common-path uc)))
116    ;; does not apply for relative references
117    ;; XXX: Make a "relative-ref?" predicate (or something like that)
118    (if (and (uri-host uc)
119             (or (eq? path '()) (eq? path #f)))
120        '(/ "")
121        path)))
122
123(define (uri-port uc)
124  (let ((u (URI-common-generic uc)))
125    (or (generic:uri-port u)
126        (alist-ref (generic:uri-scheme u) default-ports))))
127
128;;; Updaters
129(define update-uri
130  (let ((unset (list 'unset)))
131    (lambda (uc #!key
132                (scheme unset) (username unset) (password unset)
133                (host unset) (port unset)
134                (path unset) (query unset) (fragment unset))
135      (let* ((uc (update-URI-common uc)) ; new copy
136             (actual-scheme (if (eq? scheme unset)
137                                (generic:uri-scheme (URI-common-generic uc))
138                                scheme))
139             (path (if (and actual-scheme (or (eq? path #f) (eq? path '())))
140                       '(/ "") ; normalize path
141                       path))
142             ;; XXX is this really the desired behaviour?
143             ;; maybe simpler is better: do not reset to #f on default port?
144             (port (if (or
145                        (and (not (eq? scheme unset)) ; scheme specified...
146                             (eq? port unset)) ; ...and no explicit port?
147                        (default-port? port actual-scheme)) ; or default port?
148                       #f               ; then clear port
149                       port)))
150        ;; This code is ugly!
151        (unless (eq? scheme unset)
152          (URI-common-generic-set!
153           uc (generic:update-uri (URI-common-generic uc) scheme: scheme)))
154        (unless (eq? username unset)
155          (URI-common-generic-set!
156           uc (generic:update-uri (URI-common-generic uc)
157                                  username: (encode-string* username)))
158          (URI-common-username-set! uc username))
159        (unless (eq? password unset)
160          (URI-common-generic-set!
161           uc (generic:update-uri (URI-common-generic uc)
162                                  password: (encode-string* password)))
163          (URI-common-password-set! uc password))
164        (unless (eq? host unset)
165          (URI-common-generic-set!
166           uc (generic:update-uri (URI-common-generic uc)
167                                  host: (encode-string* host)))
168          (URI-common-host-set! uc host))
169        (unless (eq? port unset)
170          (URI-common-generic-set!
171           uc (generic:update-uri (URI-common-generic uc) port: port)))
172        (unless (eq? path unset)
173          (URI-common-generic-set!
174           uc (generic:update-uri (URI-common-generic uc)
175                                  path: (encode-path path)))
176          (URI-common-path-set! uc path))
177        (unless (eq? query unset)
178          (URI-common-generic-set!
179           uc (generic:update-uri (URI-common-generic uc)
180                                  query: (form-urlencode query)))
181          (URI-common-query-set! uc query))
182        (unless (eq? fragment unset)
183          (URI-common-generic-set!
184           uc (generic:update-uri (URI-common-generic uc)
185                                  fragment: (encode-string*
186                                             fragment
187                                             char-set:query/fragment)))
188          (URI-common-fragment-set! uc fragment))
189        uc))))
190
191(define (encode-string* s . rest)
192  (and s (apply generic:uri-encode-string s rest)))
193
194(define (default-port? port scheme)
195  (eqv? port (alist-ref scheme default-ports)))
196
197(define (encode-path p)
198  (and p (match p
199                (('/ . rst) (cons '/ (map generic:uri-encode-string rst)))
200                (else (map generic:uri-encode-string p)))))
201
202;; Characters allowed in queries and fragments
203(define char-set:query/fragment
204  (char-set-difference
205   (char-set-complement generic:char-set:uri-unreserved)
206   (string->char-set ":@?/")
207   generic:char-set:sub-delims))
208
209;;; Handling of application/x-www-form-urlencoded data
210;;
211;; This implements both HTML 4's specification
212;; (http://www.w3.org/TR/html401/interact/forms.html#h-17.13.4.1)
213;; and XHTML XForms' specification
214;; (http://www.w3.org/TR/xforms/#structure-model-submission)
215;;
216;; The latter is a more generalised form of the former, as it allows
217;; the user to specify a custom separator character.  The HTML 4
218;; spec also contains a recommendation
219;; (http://www.w3.org/TR/html401/appendix/notes.html#h-B.2.2)
220;; that semicolons should be used instead of ampersands as a separator.
221;; However, it provides no mechanism to select the separator to use
222;; when submitting a form, which makes it a pretty useless recommendation.
223;; This recommendation also complicates matters on the server because one
224;; would need to handle both form-generated GET query parameters and
225;; hardcoded GET query parameters as specified in anchors.
226;;
227;; There's also a 2006 Internet-Draft by Bjoern Hoehrmann that was
228;; intended to standardize this, but it was allowed to expire in 2007:
229;; http://ietfreport.isoc.org/idref/draft-hoehrmann-urlencoded
230
231(define form-urlencoded-separator (make-parameter ";&"))
232
233(define (form-urlencode alist #!key (separator (form-urlencoded-separator)))
234  (and alist (not (null? alist))
235       (let* ((separator-chars (->char-set separator))
236              (join-string (string-take
237                            (if (string? separator-chars)
238                                separator-chars
239                                (char-set->string separator-chars)) 1))
240              (enc (lambda (s)
241                     (string-translate*
242                      (generic:uri-encode-string
243                       s
244                       (char-set-union separator-chars
245                                       (char-set #\= #\+)
246                                       (char-set-delete char-set:query/fragment
247                                                        #\space)))
248                      '((" " . "+"))))))
249         (string-join
250          (reverse (fold
251                    (lambda (arg query)
252                      (match arg
253                             ((a . #f) query)
254                             ((a . #t) (cons (enc (->string a)) query))
255                             ((a . b) (cons
256                                       (sprintf "~A=~A"
257                                                (enc (->string a))
258                                                (enc b))
259                                       query))))
260                    '() alist))
261          join-string))))
262
263(define (form-urldecode query #!key (separator (form-urlencoded-separator)))
264  (if query
265      (map (lambda (part)
266             (let ((idx (string-index part #\=))
267                   (decode (lambda (s)
268                             (generic:uri-decode-string
269                              (string-translate* s '(("+" . "%20")))))))
270               (if idx
271                   (cons (string->symbol (decode (string-take part idx)))
272                         (decode (string-drop part (add1 idx))))
273                   (cons (string->symbol (decode part))
274                         #t))))
275           (string-split query (char-set->string (->char-set separator)) #t))
276      '())) ; _always_ provide a list interface for the query, even if not there
277
278(define (decode-path p)
279  (and p (match p
280                (('/ . rst) (cons '/ (map generic:uri-decode-string rst)))
281                (else (map generic:uri-decode-string p)))))
282
283;;; Miscellaneous procedures
284
285;; Simple convenience procedures
286(define (uri->string uri . args)
287  (apply generic:uri->string (URI-common-generic uri) args))
288
289(define (wrap proc)
290  (lambda args
291    (uri-generic->uri (apply proc (map URI-common-generic args)))))
292
293;; TODO: What about normalization issues here? Right now uri-relative-from
294;; gives a nonempty reference when uri1 has path=() and uri2 has path=(/ "")
295;; This could be considered a bug.  Same for uri->string and with port-nrs
296;; However, URIs with paths updated by this egg do not have that problem.
297(define uri-relative-to             (wrap generic:uri-relative-to))
298(define uri-relative-from           (wrap generic:uri-relative-from))
299(define uri-normalize-case          (wrap generic:uri-normalize-case))
300(define uri-normalize-path-segments (wrap generic:uri-normalize-path-segments))
301
302)
Note: See TracBrowser for help on using the repository browser.