1 | """RFC 3986 URI parsing and relative reference resolution / absolutization. |
---|
2 | |
---|
3 | (aka splitting and joining) |
---|
4 | |
---|
5 | Copyright 2006 John J. Lee <jjl@pobox.com> |
---|
6 | |
---|
7 | This code is free software; you can redistribute it and/or modify it under |
---|
8 | the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt |
---|
9 | included with the distribution). |
---|
10 | |
---|
11 | """ |
---|
12 | |
---|
13 | # XXX Wow, this is ugly. Overly-direct translation of the RFC ATM. |
---|
14 | |
---|
15 | import sys, re, posixpath, urllib |
---|
16 | |
---|
17 | ## def chr_range(a, b): |
---|
18 | ## return "".join(map(chr, range(ord(a), ord(b)+1))) |
---|
19 | |
---|
20 | ## UNRESERVED_URI_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" |
---|
21 | ## "abcdefghijklmnopqrstuvwxyz" |
---|
22 | ## "0123456789" |
---|
23 | ## "-_.~") |
---|
24 | ## RESERVED_URI_CHARS = "!*'();:@&=+$,/?#[]" |
---|
25 | ## URI_CHARS = RESERVED_URI_CHARS+UNRESERVED_URI_CHARS+'%' |
---|
26 | # this re matches any character that's not in URI_CHARS |
---|
# Matches any single character that is NOT allowed to appear in a URI, i.e.
# anything outside the unreserved set (letters, digits, "-_.~"), the reserved
# set ("!*'();:@&=+$,/?#[]") and the percent sign.
# A raw string is used so that the regex escapes "\-" and "\]" reach the re
# module untouched instead of relying on Python leaving unknown string
# escapes alone (which triggers invalid-escape warnings on newer Pythons).
BAD_URI_CHARS_RE = re.compile(r"[^A-Za-z0-9\-_.~!*'();:@&=+$,/?%#[\]]")
---|
28 | |
---|
29 | |
---|
def clean_url(url, encoding):
    """Percent-encode the characters of *url* that are illegal in a URI.

    url: the URL, either as text or as a bytestring.  A bytestring is first
        decoded using *encoding*, with undecodable bytes replaced rather
        than raising.
    encoding: codec name used both to decode a bytestring input and to
        encode the stripped text before quoting.

    Returns the URL with leading/trailing whitespace removed and every
    character outside the URI character set percent-encoded.  Existing
    "%" characters are left alone, so already-escaped input is not
    double-escaped.

    Raises UnicodeEncodeError if the text cannot be represented in
    *encoding*.
    """
    # Historical notes kept from the original author:
    # Trying to come up with test cases for this gave me a headache; revisit
    # when the code base switches to unicode.
    # Somebody else's comments (lost the attribution):
    #  - IE will return you the url in the encoding you send it
    #  - Mozilla/Firefox will send you latin-1 if there's no non latin-1
    #    characters in your link.  It will send you utf-8 however if there
    #    are...

    # quote() lives in different places on Python 2 and Python 3; importing
    # it here keeps this function working on both.
    try:
        from urllib import quote  # Python 2
    except ImportError:
        from urllib.parse import quote  # Python 3

    # "bytes" is an alias of "str" on Python 2, so this test covers the
    # bytestring case on both major versions (and str subclasses too, which
    # an exact type() comparison would miss).
    if isinstance(url, bytes):
        url = url.decode(encoding, "replace")
    url = url.strip()
    # For the second param to quote(), we want URI_CHARS, minus the
    # 'always_safe' characters that quote() never percent-encodes.
    return quote(url.encode(encoding), "!*'();:@&=+$,/?%#[]~")
---|
44 | |
---|
def is_clean_uri(uri):
    """Return True if uri contains no character matched by BAD_URI_CHARS_RE.

    >>> is_clean_uri("ABC!")
    True
    >>> is_clean_uri(u"ABC!")
    True
    >>> is_clean_uri("ABC|")
    False
    >>> is_clean_uri(u"ABC|")
    False
    >>> is_clean_uri("http://example.com/0")
    True
    >>> is_clean_uri(u"http://example.com/0")
    True
    """
    # Note module re treats bytestrings as though they were decoded as
    # latin-1, so this function accepts both unicode and bytestrings.
    first_bad_char = BAD_URI_CHARS_RE.search(uri)
    return first_bad_char is None
---|
63 | |
---|
64 | |
---|
# Bound match() of the URI-splitting pattern.  Groups 2, 4, 5, 7 and 9 hold
# the scheme, authority, path, query and fragment; the remaining groups are
# the same components together with their delimiters.
SPLIT_MATCH = re.compile(
    r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?"
).match


def urlsplit(absolute_uri):
    """Split a URI into (scheme, authority, path, query, fragment).

    A component that does not occur in the input is returned as None; the
    path group always takes part in the match, so it is a (possibly empty)
    string.  None is returned if the pattern does not match.
    """
    found = SPLIT_MATCH(absolute_uri)
    if found is None:
        return None
    # Pick out only the delimiter-free groups.
    return found.group(2, 4, 5, 7, 9)
---|
73 | |
---|
def urlunsplit(parts):
    """Reassemble a URI string from its five components.

    parts: a 5-item sequence (scheme, authority, path, query, fragment).
        Any component other than path may be None, in which case it is
        left out along with its delimiter; an empty string still emits the
        delimiter.
    """
    scheme, authority, path, query, fragment = parts
    pieces = []
    if scheme is not None:
        pieces.extend((scheme, ":"))
    if authority is not None:
        pieces.extend(("//", authority))
    # The path is always emitted, even when empty.
    pieces.append(path)
    if query is not None:
        pieces.extend(("?", query))
    if fragment is not None:
        pieces.extend(("#", fragment))
    return "".join(pieces)
---|
92 | |
---|
def urljoin(base_uri, uri_reference):
    """Resolve uri_reference against base_uri and return the result as a string.

    Both arguments are split into components, combined by urljoin_parts(),
    and the resulting components are reassembled with urlunsplit().
    """
    base_parts = urlsplit(base_uri)
    reference_parts = urlsplit(uri_reference)
    target_parts = urljoin_parts(base_parts, reference_parts)
    return urlunsplit(target_parts)
---|
96 | |
---|
97 | # oops, this doesn't do the same thing as the literal translation |
---|
98 | # from the RFC below |
---|
99 | ## def urljoin_parts(base_parts, reference_parts): |
---|
100 | ## scheme, authority, path, query, fragment = base_parts |
---|
101 | ## rscheme, rauthority, rpath, rquery, rfragment = reference_parts |
---|
102 | |
---|
103 | ## # compute target URI path |
---|
104 | ## if rpath == "": |
---|
105 | ## tpath = path |
---|
106 | ## else: |
---|
107 | ## tpath = rpath |
---|
108 | ## if not tpath.startswith("/"): |
---|
109 | ## tpath = merge(authority, path, tpath) |
---|
110 | ## tpath = posixpath.normpath(tpath) |
---|
111 | |
---|
112 | ## if rscheme is not None: |
---|
113 | ## return (rscheme, rauthority, tpath, rquery, rfragment) |
---|
114 | ## elif rauthority is not None: |
---|
115 | ## return (scheme, rauthority, tpath, rquery, rfragment) |
---|
116 | ## elif rpath == "": |
---|
117 | ## if rquery is not None: |
---|
118 | ## tquery = rquery |
---|
119 | ## else: |
---|
120 | ## tquery = query |
---|
121 | ## return (scheme, authority, tpath, tquery, rfragment) |
---|
122 | ## else: |
---|
123 | ## return (scheme, authority, tpath, rquery, rfragment) |
---|
124 | |
---|
def urljoin_parts(base_parts, reference_parts):
    """Combine split base and reference URIs into split target components.

    base_parts, reference_parts: 5-item sequences of
        (scheme, authority, path, query, fragment).

    Returns a (scheme, authority, path, query, fragment) tuple.  The
    fragment always comes from the reference; the base fragment is ignored.
    """
    base_scheme, base_authority, base_path, base_query, _base_fragment = (
        base_parts)
    ref_scheme, ref_authority, ref_path, ref_query, ref_fragment = (
        reference_parts)

    # A reference whose scheme equals the base scheme is handled as if it
    # carried no scheme at all.
    if ref_scheme == base_scheme:
        ref_scheme = None

    # Reference has its own scheme: only its dot segments need removing.
    if ref_scheme is not None:
        return (ref_scheme, ref_authority, remove_dot_segments(ref_path),
                ref_query, ref_fragment)

    # Reference has an authority: inherit just the scheme from the base.
    if ref_authority is not None:
        return (base_scheme, ref_authority, remove_dot_segments(ref_path),
                ref_query, ref_fragment)

    # Empty reference path: keep the base path, and the base query too
    # unless the reference supplies one.
    if ref_path == "":
        if ref_query is None:
            target_query = base_query
        else:
            target_query = ref_query
        return (base_scheme, base_authority, base_path, target_query,
                ref_fragment)

    # Non-empty reference path: absolute paths replace the base path,
    # relative ones are merged onto it first.
    if ref_path.startswith("/"):
        target_path = remove_dot_segments(ref_path)
    else:
        merged_path = merge(base_authority, base_path, ref_path)
        target_path = remove_dot_segments(merged_path)
    return (base_scheme, base_authority, target_path, ref_query, ref_fragment)
---|
157 | |
---|
158 | # um, something *vaguely* like this is what I want, but I have to generate |
---|
159 | # lots of test cases first, if only to understand what it is that |
---|
160 | # remove_dot_segments really does... |
---|
161 | ## def remove_dot_segments(path): |
---|
162 | ## if path == '': |
---|
163 | ## return '' |
---|
164 | ## comps = path.split('/') |
---|
165 | ## new_comps = [] |
---|
166 | ## for comp in comps: |
---|
167 | ## if comp in ['.', '']: |
---|
168 | ## if not new_comps or new_comps[-1]: |
---|
169 | ## new_comps.append('') |
---|
170 | ## continue |
---|
171 | ## if comp != '..': |
---|
172 | ## new_comps.append(comp) |
---|
173 | ## elif new_comps: |
---|
174 | ## new_comps.pop() |
---|
175 | ## return '/'.join(new_comps) |
---|
176 | |
---|
177 | |
---|
def remove_dot_segments(path):
    """Return path with its "." and ".." segments resolved.

    The input is consumed from the front while finished segments are
    collected in an output list; the step letters in the comments are the
    ones the original code used for its RFC-derived cases.
    """
    output = []
    while path:
        if path.startswith("../"):
            # A: drop a leading "../"
            path = path[3:]
        elif path.startswith("./"):
            # A: drop a leading "./"
            path = path[2:]
        elif path.startswith("/./"):
            # B: "/./" collapses to "/"
            path = path[2:]
        elif path == "/.":
            # B: a trailing "/." becomes "/"
            path = "/"
        elif path.startswith("/../"):
            # C: "/../" collapses to "/" and cancels the previous segment
            path = path[3:]
            if output:
                output.pop()
        elif path == "/..":
            # C: a trailing "/.." becomes "/" and cancels the previous segment
            path = "/"
            if output:
                output.pop()
        elif path in (".", ".."):
            # D: a lone "." or ".." vanishes
            path = ""
        else:
            # E: move the first segment (with its leading "/", if any) to the
            # output.  Searching from index 1 skips a leading slash; a path
            # without one cannot have "/" at index 0 anyway.
            boundary = path.find("/", 1)
            if boundary < 0:
                # No further slash: the remainder is the last segment.
                output.append(path)
                break
            output.append(path[:boundary])
            path = path[boundary:]
    return "".join(output)
---|
225 | |
---|
def merge(base_authority, base_path, ref_path):
    """Merge a relative reference path onto a base path.

    An empty base path yields "/" + ref_path.  Otherwise ref_path replaces
    everything after the last "/" of base_path; when base_path has no "/",
    ref_path is returned unchanged.

    base_authority is accepted but not consulted (see the note below).
    """
    # XXXX Oddly, the sample Perl implementation of this by Roy Fielding
    # doesn't even take base_authority as a parameter, despite the wording in
    # the RFC suggesting otherwise.  Perhaps I'm missing some obvious
    # identity.  The stricter test would have been:
    #   if base_authority is not None and base_path == "":
    if base_path == "":
        return "/" + ref_path
    last_slash = base_path.rfind("/")
    if last_slash < 0:
        # Nothing of the base path is kept.
        return ref_path
    return base_path[:last_slash + 1] + ref_path
---|
237 | |
---|
if __name__ == "__main__":
    # When run as a script, execute the doctests embedded in this module.
    from doctest import testmod
    testmod()
---|