| 1 | """RFC 3986 URI parsing and relative reference resolution / absolutization. |
|---|
| 2 | |
|---|
| 3 | (aka splitting and joining) |
|---|
| 4 | |
|---|
| 5 | Copyright 2006 John J. Lee <jjl@pobox.com> |
|---|
| 6 | |
|---|
| 7 | This code is free software; you can redistribute it and/or modify it under |
|---|
| 8 | the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt |
|---|
| 9 | included with the distribution). |
|---|
| 10 | |
|---|
| 11 | """ |
|---|
| 12 | |
|---|
| 13 | # XXX Wow, this is ugly. Overly-direct translation of the RFC ATM. |
|---|
| 14 | |
|---|
| 15 | import sys, re, posixpath, urllib |
|---|
| 16 | |
|---|
| 17 | ## def chr_range(a, b): |
|---|
| 18 | ## return "".join(map(chr, range(ord(a), ord(b)+1))) |
|---|
| 19 | |
|---|
| 20 | ## UNRESERVED_URI_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" |
|---|
| 21 | ## "abcdefghijklmnopqrstuvwxyz" |
|---|
| 22 | ## "0123456789" |
|---|
| 23 | ## "-_.~") |
|---|
| 24 | ## RESERVED_URI_CHARS = "!*'();:@&=+$,/?#[]" |
|---|
| 25 | ## URI_CHARS = RESERVED_URI_CHARS+UNRESERVED_URI_CHARS+'%' |
|---|
# Matches any single character that is NOT allowed in a URI, i.e. anything
# outside the unreserved set, the reserved set and "%" (the URI_CHARS of the
# commented-out constants above).
# The pattern is a raw string so that "\-" and "\]" reach the regex engine
# verbatim instead of being (invalid) string-literal escape sequences.
BAD_URI_CHARS_RE = re.compile(r"[^A-Za-z0-9\-_.~!*'();:@&=+$,/?%#[\]]")
|---|
| 28 | |
|---|
| 29 | |
|---|
def clean_url(url, encoding):
    """Percent-encode the characters of *url* that are illegal in a URI.

    url: the URL, either a byte string or a text (unicode) string.  Byte
        strings are first decoded with *encoding*; undecodable bytes are
        replaced rather than raising.
    encoding: name of the codec used to decode byte-string input and to
        encode the stripped text back to bytes before quoting.

    Leading and trailing whitespace is stripped.  Characters that are legal
    in a URI are left untouched; that includes "%", so percent-escapes that
    are already present are not escaped a second time.

    Returns the quoted URL.  UnicodeEncodeError is raised if the stripped
    text cannot be encoded with *encoding*.
    """
    # Trying to come up with test cases for this gave me a headache, revisit
    # when do switch to unicode.
    # Somebody else's comments (lost the attribution):
    ## - IE will return you the url in the encoding you send it
    ## - Mozilla/Firefox will send you latin-1 if there's no non latin-1
    ## characters in your link. It will send you utf-8 however if there are...

    # quote() lives in urllib.parse on Python 3 and in urllib on Python 2.
    try:
        from urllib.parse import quote
    except ImportError:
        from urllib import quote

    # Anything that is not already text is treated as bytes and decoded.
    # isinstance() (rather than an exact type comparison) also covers
    # subclasses, and type(u"") names the text type on Python 2 and 3 alike.
    if not isinstance(url, type(u"")):
        url = url.decode(encoding, "replace")
    url = url.strip()
    # for second param to quote(), we want URI_CHARS, minus the
    # 'always_safe' characters that quote() never percent-encodes
    return quote(url.encode(encoding), "!*'();:@&=+$,/?%#[]~")
|---|
| 44 | |
|---|
def is_clean_uri(uri):
    """Return True if every character of *uri* is allowed in a URI.

    The check is a search for BAD_URI_CHARS_RE: the result is False as soon
    as one disallowed character is present.

    >>> is_clean_uri("ABC!")
    True
    >>> is_clean_uri(u"ABC!")
    True
    >>> is_clean_uri("ABC|")
    False
    >>> is_clean_uri(u"ABC|")
    False
    >>> is_clean_uri("http://example.com/0")
    True
    >>> is_clean_uri(u"http://example.com/0")
    True
    """
    # Note: module re treats bytestrings as though they were decoded as
    # latin-1, so this function accepts both unicode and bytestrings.
    offending = BAD_URI_CHARS_RE.search(uri)
    return offending is None
|---|
| 63 | |
|---|
| 64 | |
|---|
# Bound match() of the generic URI-reference splitting regex.  Groups 2, 4,
# 5, 7 and 9 capture scheme, authority, path, query and fragment.
SPLIT_MATCH = re.compile(
    r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?").match


def urlsplit(absolute_uri):
    """Split a URI (or URI reference) into its five components.

    Returns the tuple (scheme, authority, path, query, fragment).  A
    component that is absent from the input is None, except the path, which
    is "" when absent.  Returns None if the regex does not match.
    """
    found = SPLIT_MATCH(absolute_uri)
    if not found:
        return None
    # group() with several indices returns the captures as one tuple.
    return found.group(2, 4, 5, 7, 9)
|---|
| 73 | |
|---|
def urlunsplit(parts):
    """Assemble a URI string from a 5-tuple of components.

    parts: (scheme, authority, path, query, fragment).  A component that is
        None is omitted together with its delimiter; an empty string is
        kept, so its delimiter is still emitted.  The path is always
        included and must be a string.

    Returns the recomposed URI string.
    """
    scheme, authority, path, query, fragment = parts
    pieces = []
    if scheme is not None:
        pieces.extend((scheme, ":"))
    if authority is not None:
        pieces.extend(("//", authority))
    pieces.append(path)
    if query is not None:
        pieces.extend(("?", query))
    if fragment is not None:
        pieces.extend(("#", fragment))
    return "".join(pieces)
|---|
| 92 | |
|---|
def urljoin(base_uri, uri_reference):
    """Resolve *uri_reference* against *base_uri*; return the result string.

    Both arguments are split with urlsplit(), combined with
    urljoin_parts(), and the result is reassembled with urlunsplit().
    """
    base_parts = urlsplit(base_uri)
    reference_parts = urlsplit(uri_reference)
    target_parts = urljoin_parts(base_parts, reference_parts)
    return urlunsplit(target_parts)
|---|
| 96 | |
|---|
| 97 | # oops, this doesn't do the same thing as the literal translation |
|---|
| 98 | # from the RFC below |
|---|
| 99 | ## def urljoin_parts(base_parts, reference_parts): |
|---|
| 100 | ## scheme, authority, path, query, fragment = base_parts |
|---|
| 101 | ## rscheme, rauthority, rpath, rquery, rfragment = reference_parts |
|---|
| 102 | |
|---|
| 103 | ## # compute target URI path |
|---|
| 104 | ## if rpath == "": |
|---|
| 105 | ## tpath = path |
|---|
| 106 | ## else: |
|---|
| 107 | ## tpath = rpath |
|---|
| 108 | ## if not tpath.startswith("/"): |
|---|
| 109 | ## tpath = merge(authority, path, tpath) |
|---|
| 110 | ## tpath = posixpath.normpath(tpath) |
|---|
| 111 | |
|---|
| 112 | ## if rscheme is not None: |
|---|
| 113 | ## return (rscheme, rauthority, tpath, rquery, rfragment) |
|---|
| 114 | ## elif rauthority is not None: |
|---|
| 115 | ## return (scheme, rauthority, tpath, rquery, rfragment) |
|---|
| 116 | ## elif rpath == "": |
|---|
| 117 | ## if rquery is not None: |
|---|
| 118 | ## tquery = rquery |
|---|
| 119 | ## else: |
|---|
| 120 | ## tquery = query |
|---|
| 121 | ## return (scheme, authority, tpath, tquery, rfragment) |
|---|
| 122 | ## else: |
|---|
| 123 | ## return (scheme, authority, tpath, rquery, rfragment) |
|---|
| 124 | |
|---|
def urljoin_parts(base_parts, reference_parts):
    """Resolve a split reference against a split base URI.

    base_parts, reference_parts: 5-tuples of
        (scheme, authority, path, query, fragment), as from urlsplit().

    Returns the target 5-tuple.  The fragment always comes from the
    reference; the base fragment is ignored.  A reference scheme equal to
    the base scheme is treated as if the reference had no scheme.
    """
    scheme, authority, path, query, _base_fragment = base_parts
    rscheme, rauthority, rpath, rquery, rfragment = reference_parts

    # Reference carries its own (different) scheme: nothing is inherited.
    if rscheme is not None and rscheme != scheme:
        return (rscheme, rauthority, remove_dot_segments(rpath), rquery,
                rfragment)

    # Reference has an authority: only the scheme is inherited.
    if rauthority is not None:
        return (scheme, rauthority, remove_dot_segments(rpath), rquery,
                rfragment)

    # Empty reference path: keep the base path, and the base query too
    # unless the reference supplies one.
    if rpath == "":
        tquery = query if rquery is None else rquery
        return (scheme, authority, path, tquery, rfragment)

    # Absolute reference paths stand alone; relative ones are merged onto
    # the base path first.  Either way dot segments are then removed.
    if rpath.startswith("/"):
        tpath = rpath
    else:
        tpath = merge(authority, path, rpath)
    return (scheme, authority, remove_dot_segments(tpath), rquery, rfragment)
|---|
| 157 | |
|---|
| 158 | # um, something *vaguely* like this is what I want, but I have to generate |
|---|
| 159 | # lots of test cases first, if only to understand what it is that |
|---|
| 160 | # remove_dot_segments really does... |
|---|
| 161 | ## def remove_dot_segments(path): |
|---|
| 162 | ## if path == '': |
|---|
| 163 | ## return '' |
|---|
| 164 | ## comps = path.split('/') |
|---|
| 165 | ## new_comps = [] |
|---|
| 166 | ## for comp in comps: |
|---|
| 167 | ## if comp in ['.', '']: |
|---|
| 168 | ## if not new_comps or new_comps[-1]: |
|---|
| 169 | ## new_comps.append('') |
|---|
| 170 | ## continue |
|---|
| 171 | ## if comp != '..': |
|---|
| 172 | ## new_comps.append(comp) |
|---|
| 173 | ## elif new_comps: |
|---|
| 174 | ## new_comps.pop() |
|---|
| 175 | ## return '/'.join(new_comps) |
|---|
| 176 | |
|---|
| 177 | |
|---|
def remove_dot_segments(path):
    """Return *path* with its "." and ".." segments resolved.

    The input is consumed from the left while an output list of segments is
    built up; the letters in the comments label the rule being applied on
    each pass of the loop.  ".." removes the most recently emitted segment,
    if there is one.
    """
    output = []
    remaining = path
    while remaining:
        if remaining.startswith("../"):
            # A: drop a leading "../"
            remaining = remaining[3:]
        elif remaining.startswith("./"):
            # A: drop a leading "./"
            remaining = remaining[2:]
        elif remaining.startswith("/./"):
            # B: "/./" collapses to "/"
            remaining = remaining[2:]
        elif remaining == "/.":
            # B: a trailing "/." collapses to "/"
            remaining = "/"
        elif remaining.startswith("/../") or remaining == "/..":
            # C: "/../" (or a trailing "/..") collapses to "/" and cancels
            # the previously emitted segment.  The slice is empty only for
            # the exact string "/..".
            remaining = remaining[3:] or "/"
            if output:
                output.pop()
        elif remaining in (".", ".."):
            # D: nothing but a dot segment is left
            remaining = ""
        else:
            # E: move the first segment (with its leading "/", if any) to
            # the output
            search_from = 1 if remaining.startswith("/") else 0
            cut = remaining.find("/", search_from)
            if cut < 0:
                output.append(remaining)
                break
            output.append(remaining[:cut])
            remaining = remaining[cut:]
    return "".join(output)
|---|
| 225 | |
|---|
def merge(base_authority, base_path, ref_path):
    """Merge a relative reference path onto the base path.

    base_authority: accepted but not used (see the note below).
    base_path: path component of the base URI.
    ref_path: relative path component of the reference.

    Returns "/" + ref_path when the base path is empty; otherwise the base
    path up to and including its last "/" followed by ref_path, or just
    ref_path when the base path contains no "/".
    """
    # XXXX Oddly, the sample Perl implementation of this by Roy Fielding
    # doesn't even take base_authority as a parameter, despite the wording in
    # the RFC suggesting otherwise. Perhaps I'm missing some obvious identity.
    # (The RFC-worded test would be:
    #  base_authority is not None and base_path == "")
    if base_path == "":
        return "/" + ref_path
    last_slash = base_path.rfind("/")
    if last_slash < 0:
        return ref_path
    return base_path[:last_slash + 1] + ref_path
|---|
| 237 | |
|---|
if __name__ == "__main__":
    # When executed as a script, run the doctests embedded in this module's
    # docstrings.
    import doctest
    doctest.testmod()
|---|