| 1 | """Stateful programmatic WWW navigation, after Perl's WWW::Mechanize. |
|---|
| 2 | |
|---|
| 3 | Copyright 2003-2006 John J. Lee <jjl@pobox.com> |
|---|
| 4 | Copyright 2003 Andy Lester (original Perl code) |
|---|
| 5 | |
|---|
| 6 | This code is free software; you can redistribute it and/or modify it |
|---|
| 7 | under the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt |
|---|
| 8 | included with the distribution). |
|---|
| 9 | |
|---|
| 10 | """ |
|---|
| 11 | |
|---|
| 12 | import urllib2, sys, copy, re, os, urllib |
|---|
| 13 | |
|---|
| 14 | |
|---|
| 15 | from _useragent import UserAgentBase |
|---|
| 16 | from _html import DefaultFactory |
|---|
| 17 | import _response |
|---|
| 18 | import _request |
|---|
| 19 | import _rfc3986 |
|---|
| 20 | |
|---|
| 21 | __version__ = (0, 1, 8, "b", None) # 0.1.8b |
|---|
| 22 | |
|---|
| 23 | class BrowserStateError(Exception): pass |
|---|
| 24 | class LinkNotFoundError(Exception): pass |
|---|
| 25 | class FormNotFoundError(Exception): pass |
|---|
| 26 | |
|---|
| 27 | |
|---|
def sanepathname2url(path):
    """Convert a local filesystem path into the path part of a URL.

    Thin wrapper around urllib.pathname2url().  When running on Windows
    (os.name == "nt") and the converted path starts with three slashes, the
    first two characters are dropped so that a single leading slash remains.

    """
    converted = urllib.pathname2url(path)
    needs_trim = os.name == "nt" and converted.startswith("///")
    if not needs_trim:
        # XXX don't ask me about the mac...
        return converted
    return converted[2:]
|---|
| 34 | |
|---|
| 35 | |
|---|
class History:
    """Stack of previously visited (request, response) pairs.

    Though this will become public, the implied interface is not yet stable.

    """
    def __init__(self):
        self._history = []  # LIFO: the most recent visit is the last item

    def add(self, request, response):
        """Push a (request, response) pair; response may be None."""
        self._history.append((request, response))

    def back(self, n, _response):
        """Pop entries and return the (request, response) pair reached.

        n: number of steps to go back
        _response: the caller's current response, or None if it has none

        At least n entries are popped.  Popping then continues for as long as
        the response in hand is None, so the pair returned always carries a
        response.

        Raises BrowserStateError if the history is exhausted first.  Raises
        ValueError if n is less than 1 while _response is not None: no entry
        would be popped in that case, so there is no pair to return.

        """
        response = _response  # XXX move Browser._response into this class?
        if n <= 0 and response is not None:
            # Without this check the loop body never runs and the return
            # statement would fail with an obscure UnboundLocalError.
            raise ValueError(
                "can't go back %r steps: n must be at least 1" % (n,))
        while n > 0 or response is None:
            try:
                request, response = self._history.pop()
            except IndexError:
                raise BrowserStateError("already at start of history")
            n -= 1
        return request, response

    def clear(self):
        """Forget all entries without closing their responses."""
        del self._history[:]

    def close(self):
        """Close every stored response that is not None, then forget all."""
        for request, response in self._history:
            if response is not None:
                response.close()
        del self._history[:]
|---|
| 62 | |
|---|
| 63 | |
|---|
class HTTPRefererProcessor(urllib2.BaseHandler):
    """Request processor that delegates Referer handling to its parent.

    Only requests that carry a redirect_dict attribute are passed to
    self.parent._add_referer_header(); all others are returned unchanged.

    """

    def http_request(self, request):
        # See RFC 2616 14.36.  The only times we know the source of the
        # request URI has a URI associated with it are redirect, and
        # Browser.click() / Browser.submit() / Browser.follow_link().
        # Otherwise, it's the user's job to add any Referer header before
        # .open()ing.
        if not hasattr(request, "redirect_dict"):
            return request
        return self.parent._add_referer_header(request, origin_request=False)

    https_request = http_request
|---|
| 77 | |
|---|
| 78 | |
|---|
| 79 | class Browser(UserAgentBase): |
|---|
| 80 | """Browser-like class with support for history, forms and links. |
|---|
| 81 | |
|---|
| 82 | BrowserStateError is raised whenever the browser is in the wrong state to |
|---|
| 83 | complete the requested operation - eg., when .back() is called when the |
|---|
| 84 | browser history is empty, or when .follow_link() is called when the current |
|---|
| 85 | response does not contain HTML data. |
|---|
| 86 | |
|---|
| 87 | Public attributes: |
|---|
| 88 | |
|---|
| 89 | request: current request (mechanize.Request or urllib2.Request) |
|---|
| 90 | form: currently selected form (see .select_form()) |
|---|
| 91 | |
|---|
| 92 | """ |
|---|
| 93 | |
|---|
| 94 | handler_classes = copy.copy(UserAgentBase.handler_classes) |
|---|
| 95 | handler_classes["_referer"] = HTTPRefererProcessor |
|---|
| 96 | default_features = copy.copy(UserAgentBase.default_features) |
|---|
| 97 | default_features.append("_referer") |
|---|
| 98 | |
|---|
| 99 | def __init__(self, |
|---|
| 100 | factory=None, |
|---|
| 101 | history=None, |
|---|
| 102 | request_class=None, |
|---|
| 103 | ): |
|---|
| 104 | """ |
|---|
| 105 | |
|---|
| 106 | Only named arguments should be passed to this constructor. |
|---|
| 107 | |
|---|
| 108 | factory: object implementing the mechanize.Factory interface. |
|---|
| 109 | history: object implementing the mechanize.History interface. Note |
|---|
| 110 | this interface is still experimental and may change in future. |
|---|
| 111 | request_class: Request class to use. Defaults to mechanize.Request |
|---|
| 112 | by default for Pythons older than 2.4, urllib2.Request otherwise. |
|---|
| 113 | |
|---|
| 114 | The Factory and History objects passed in are 'owned' by the Browser, |
|---|
| 115 | so they should not be shared across Browsers. In particular, |
|---|
| 116 | factory.set_response() should not be called except by the owning |
|---|
| 117 | Browser itself. |
|---|
| 118 | |
|---|
| 119 | Note that the supplied factory's request_class is overridden by this |
|---|
| 120 | constructor, to ensure only one Request class is used. |
|---|
| 121 | |
|---|
| 122 | """ |
|---|
| 123 | self._handle_referer = True |
|---|
| 124 | |
|---|
| 125 | if history is None: |
|---|
| 126 | history = History() |
|---|
| 127 | self._history = history |
|---|
| 128 | |
|---|
| 129 | if request_class is None: |
|---|
| 130 | if not hasattr(urllib2.Request, "add_unredirected_header"): |
|---|
| 131 | request_class = _request.Request |
|---|
| 132 | else: |
|---|
| 133 | request_class = urllib2.Request # Python >= 2.4 |
|---|
| 134 | |
|---|
| 135 | if factory is None: |
|---|
| 136 | factory = DefaultFactory() |
|---|
| 137 | factory.set_request_class(request_class) |
|---|
| 138 | self._factory = factory |
|---|
| 139 | self.request_class = request_class |
|---|
| 140 | |
|---|
| 141 | self.request = None |
|---|
| 142 | self._set_response(None, False) |
|---|
| 143 | |
|---|
| 144 | # do this last to avoid __getattr__ problems |
|---|
| 145 | UserAgentBase.__init__(self) |
|---|
| 146 | |
|---|
| 147 | def close(self): |
|---|
| 148 | UserAgentBase.close(self) |
|---|
| 149 | if self._response is not None: |
|---|
| 150 | self._response.close() |
|---|
| 151 | if self._history is not None: |
|---|
| 152 | self._history.close() |
|---|
| 153 | self._history = None |
|---|
| 154 | |
|---|
| 155 | # make use after .close easy to spot |
|---|
| 156 | self.form = None |
|---|
| 157 | self.request = self._response = None |
|---|
| 158 | self.request = self.response = self.set_response = None |
|---|
| 159 | self.geturl = self.reload = self.back = None |
|---|
| 160 | self.clear_history = self.set_cookie = self.links = self.forms = None |
|---|
| 161 | self.viewing_html = self.encoding = self.title = None |
|---|
| 162 | self.select_form = self.click = self.submit = self.click_link = None |
|---|
| 163 | self.follow_link = self.find_link = None |
|---|
| 164 | |
|---|
    def set_handle_referer(self, handle):
        """Set whether to add Referer header to requests.

        handle: true to enable Referer handling, false to disable it.

        The setting is passed to self._set_handler() for the "_referer"
        feature (whose handler class here is HTTPRefererProcessor), and is
        also remembered as a boolean in self._handle_referer, which
        ._add_referer_header() checks before adding the header.

        """
        self._set_handler("_referer", handle)
        self._handle_referer = bool(handle)
|---|
| 175 | |
|---|
| 176 | def _add_referer_header(self, request, origin_request=True): |
|---|
| 177 | if self.request is None: |
|---|
| 178 | return request |
|---|
| 179 | scheme = request.get_type() |
|---|
| 180 | original_scheme = self.request.get_type() |
|---|
| 181 | if scheme not in ["http", "https"]: |
|---|
| 182 | return request |
|---|
| 183 | if not origin_request and not self.request.has_header("Referer"): |
|---|
| 184 | return request |
|---|
| 185 | |
|---|
| 186 | if (self._handle_referer and |
|---|
| 187 | original_scheme in ["http", "https"] and |
|---|
| 188 | not (original_scheme == "https" and scheme != "https")): |
|---|
| 189 | # strip URL fragment (RFC 2616 14.36) |
|---|
| 190 | parts = _rfc3986.urlsplit(self.request.get_full_url()) |
|---|
| 191 | parts = parts[:-1]+(None,) |
|---|
| 192 | referer = _rfc3986.urlunsplit(parts) |
|---|
| 193 | request.add_unredirected_header("Referer", referer) |
|---|
| 194 | return request |
|---|
| 195 | |
|---|
| 196 | def open_novisit(self, url, data=None): |
|---|
| 197 | """Open a URL without visiting it. |
|---|
| 198 | |
|---|
| 199 | The browser state (including .request, .response(), history, forms and |
|---|
| 200 | links) are all left unchanged by calling this function. |
|---|
| 201 | |
|---|
| 202 | The interface is the same as for .open(). |
|---|
| 203 | |
|---|
| 204 | This is useful for things like fetching images. |
|---|
| 205 | |
|---|
| 206 | See also .retrieve(). |
|---|
| 207 | |
|---|
| 208 | """ |
|---|
| 209 | return self._mech_open(url, data, visit=False) |
|---|
| 210 | |
|---|
| 211 | def open(self, url, data=None): |
|---|
| 212 | return self._mech_open(url, data) |
|---|
| 213 | |
|---|
    def _mech_open(self, url, data=None, update_history=True, visit=None):
        """Open url, optionally visiting it, and return the response.

        url: a URL string, or an object with a get_full_url attribute (a
         request object).  A string with no scheme is treated as a relative
         reference and joined to the URL of the current response.
        data: passed on to self._request() and UserAgentBase.open()
        update_history: passed on to ._visit_request() when visiting
        visit: passed on to self._request(); whether to visit is then read
         back from request.visit, with None meaning True

        When visiting, the browser's current request and response are
        replaced and a copy of the new current response is returned.  When
        not visiting, the response is only upgraded and returned.

        Raises BrowserStateError for a relative URL string when no document
        is being viewed.  An urllib2.HTTPError that carries a response body
        (fp is not None) is processed like a normal response and then raised;
        one without is re-raised immediately.

        """
        try:
            # only probing for the attribute: request objects have it
            url.get_full_url
        except AttributeError:
            # string URL -- convert to absolute URL if required
            scheme, authority = _rfc3986.urlsplit(url)[:2]
            if scheme is None:
                # relative URL
                if self._response is None:
                    raise BrowserStateError(
                        "can't fetch relative reference: "
                        "not viewing any document")
                url = _rfc3986.urljoin(self._response.geturl(), url)

        request = self._request(url, data, visit)
        visit = request.visit
        if visit is None:
            visit = True

        if visit:
            # done before opening, so self.request is set even if open fails
            self._visit_request(request, update_history)

        success = True
        try:
            response = UserAgentBase.open(self, request, data)
        except urllib2.HTTPError, error:
            success = False
            if error.fp is None:  # not a response
                raise
            # the error doubles as the response; it is raised further down
            response = error
##         except (IOError, socket.error, OSError), error:
##             # Yes, urllib2 really does raise all these :-((
##             # See test_urllib2.py for examples of socket.gaierror and OSError,
##             # plus note that FTPHandler raises IOError.
##             # XXX I don't seem to have an example of exactly socket.error being
##             #  raised, only socket.gaierror...
##             # I don't want to start fixing these here, though, since this is a
##             # subclass of OpenerDirector, and it would break old code.  Even in
##             # Python core, a fix would need some backwards-compat. hack to be
##             # acceptable.
##             raise

        if visit:
            self._set_response(response, False)
            # hand out a copy; the browser keeps its own current response
            response = copy.copy(self._response)
        elif response is not None:
            response = _response.upgrade_response(response)

        if not success:
            raise response
        return response
|---|
| 265 | |
|---|
| 266 | def __str__(self): |
|---|
| 267 | text = [] |
|---|
| 268 | text.append("<%s " % self.__class__.__name__) |
|---|
| 269 | if self._response: |
|---|
| 270 | text.append("visiting %s" % self._response.geturl()) |
|---|
| 271 | else: |
|---|
| 272 | text.append("(not visiting a URL)") |
|---|
| 273 | if self.form: |
|---|
| 274 | text.append("\n selected form:\n %s\n" % str(self.form)) |
|---|
| 275 | text.append(">") |
|---|
| 276 | return "".join(text) |
|---|
| 277 | |
|---|
| 278 | def response(self): |
|---|
| 279 | """Return a copy of the current response. |
|---|
| 280 | |
|---|
| 281 | The returned object has the same interface as the object returned by |
|---|
| 282 | .open() (or urllib2.urlopen()). |
|---|
| 283 | |
|---|
| 284 | """ |
|---|
| 285 | return copy.copy(self._response) |
|---|
| 286 | |
|---|
| 287 | def open_local_file(self, filename): |
|---|
| 288 | path = sanepathname2url(os.path.abspath(filename)) |
|---|
| 289 | url = 'file://'+path |
|---|
| 290 | return self.open(url) |
|---|
| 291 | |
|---|
| 292 | def set_response(self, response): |
|---|
| 293 | """Replace current response with (a copy of) response. |
|---|
| 294 | |
|---|
| 295 | response may be None. |
|---|
| 296 | |
|---|
| 297 | This is intended mostly for HTML-preprocessing. |
|---|
| 298 | """ |
|---|
| 299 | self._set_response(response, True) |
|---|
| 300 | |
|---|
| 301 | def _set_response(self, response, close_current): |
|---|
| 302 | # sanity check, necessary but far from sufficient |
|---|
| 303 | if not (response is None or |
|---|
| 304 | (hasattr(response, "info") and hasattr(response, "geturl") and |
|---|
| 305 | hasattr(response, "read") |
|---|
| 306 | ) |
|---|
| 307 | ): |
|---|
| 308 | raise ValueError("not a response object") |
|---|
| 309 | |
|---|
| 310 | self.form = None |
|---|
| 311 | if response is not None: |
|---|
| 312 | response = _response.upgrade_response(response) |
|---|
| 313 | if close_current and self._response is not None: |
|---|
| 314 | self._response.close() |
|---|
| 315 | self._response = response |
|---|
| 316 | self._factory.set_response(response) |
|---|
| 317 | |
|---|
| 318 | def visit_response(self, response, request=None): |
|---|
| 319 | """Visit the response, as if it had been .open()ed. |
|---|
| 320 | |
|---|
| 321 | Unlike .set_response(), this updates history rather than replacing the |
|---|
| 322 | current response. |
|---|
| 323 | """ |
|---|
| 324 | if request is None: |
|---|
| 325 | request = _request.Request(response.geturl()) |
|---|
| 326 | self._visit_request(request, True) |
|---|
| 327 | self._set_response(response, False) |
|---|
| 328 | |
|---|
| 329 | def _visit_request(self, request, update_history): |
|---|
| 330 | if self._response is not None: |
|---|
| 331 | self._response.close() |
|---|
| 332 | if self.request is not None and update_history: |
|---|
| 333 | self._history.add(self.request, self._response) |
|---|
| 334 | self._response = None |
|---|
| 335 | # we want self.request to be assigned even if UserAgentBase.open |
|---|
| 336 | # fails |
|---|
| 337 | self.request = request |
|---|
| 338 | |
|---|
| 339 | def geturl(self): |
|---|
| 340 | """Get URL of current document.""" |
|---|
| 341 | if self._response is None: |
|---|
| 342 | raise BrowserStateError("not viewing any document") |
|---|
| 343 | return self._response.geturl() |
|---|
| 344 | |
|---|
| 345 | def reload(self): |
|---|
| 346 | """Reload current document, and return response object.""" |
|---|
| 347 | if self.request is None: |
|---|
| 348 | raise BrowserStateError("no URL has yet been .open()ed") |
|---|
| 349 | if self._response is not None: |
|---|
| 350 | self._response.close() |
|---|
| 351 | return self._mech_open(self.request, update_history=False) |
|---|
| 352 | |
|---|
| 353 | def back(self, n=1): |
|---|
| 354 | """Go back n steps in history, and return response object. |
|---|
| 355 | |
|---|
| 356 | n: go back this number of steps (default 1 step) |
|---|
| 357 | |
|---|
| 358 | """ |
|---|
| 359 | if self._response is not None: |
|---|
| 360 | self._response.close() |
|---|
| 361 | self.request, response = self._history.back(n, self._response) |
|---|
| 362 | self.set_response(response) |
|---|
| 363 | if not response.read_complete: |
|---|
| 364 | return self.reload() |
|---|
| 365 | return copy.copy(response) |
|---|
| 366 | |
|---|
| 367 | def clear_history(self): |
|---|
| 368 | self._history.clear() |
|---|
| 369 | |
|---|
| 370 | def set_cookie(self, cookie_string): |
|---|
| 371 | """Request to set a cookie. |
|---|
| 372 | |
|---|
| 373 | Note that it is NOT necessary to call this method under ordinary |
|---|
| 374 | circumstances: cookie handling is normally entirely automatic. The |
|---|
| 375 | intended use case is rather to simulate the setting of a cookie by |
|---|
| 376 | client script in a web page (e.g. JavaScript). In that case, use of |
|---|
| 377 | this method is necessary because mechanize currently does not support |
|---|
| 378 | JavaScript, VBScript, etc. |
|---|
| 379 | |
|---|
| 380 | The cookie is added in the same way as if it had arrived with the |
|---|
| 381 | current response, as a result of the current request. This means that, |
|---|
| 382 | for example, if it is not appropriate to set the cookie based on the |
|---|
| 383 | current request, no cookie will be set. |
|---|
| 384 | |
|---|
| 385 | The cookie will be returned automatically with subsequent responses |
|---|
| 386 | made by the Browser instance whenever that's appropriate. |
|---|
| 387 | |
|---|
| 388 | cookie_string should be a valid value of the Set-Cookie header. |
|---|
| 389 | |
|---|
| 390 | For example: |
|---|
| 391 | |
|---|
| 392 | browser.set_cookie( |
|---|
| 393 | "sid=abcdef; expires=Wednesday, 09-Nov-06 23:12:40 GMT") |
|---|
| 394 | |
|---|
| 395 | Currently, this method does not allow for adding RFC 2986 cookies. |
|---|
| 396 | This limitation will be lifted if anybody requests it. |
|---|
| 397 | |
|---|
| 398 | """ |
|---|
| 399 | if self._response is None: |
|---|
| 400 | raise BrowserStateError("not viewing any document") |
|---|
| 401 | if self.request.get_type() not in ["http", "https"]: |
|---|
| 402 | raise BrowserStateError("can't set cookie for non-HTTP/HTTPS " |
|---|
| 403 | "transactions") |
|---|
| 404 | cookiejar = self._ua_handlers["_cookies"].cookiejar |
|---|
| 405 | response = self.response() # copy |
|---|
| 406 | headers = response.info() |
|---|
| 407 | headers["Set-cookie"] = cookie_string |
|---|
| 408 | cookiejar.extract_cookies(response, self.request) |
|---|
| 409 | |
|---|
| 410 | def links(self, **kwds): |
|---|
| 411 | """Return iterable over links (mechanize.Link objects).""" |
|---|
| 412 | if not self.viewing_html(): |
|---|
| 413 | raise BrowserStateError("not viewing HTML") |
|---|
| 414 | links = self._factory.links() |
|---|
| 415 | if kwds: |
|---|
| 416 | return self._filter_links(links, **kwds) |
|---|
| 417 | else: |
|---|
| 418 | return links |
|---|
| 419 | |
|---|
| 420 | def forms(self): |
|---|
| 421 | """Return iterable over forms. |
|---|
| 422 | |
|---|
| 423 | The returned form objects implement the ClientForm.HTMLForm interface. |
|---|
| 424 | |
|---|
| 425 | """ |
|---|
| 426 | if not self.viewing_html(): |
|---|
| 427 | raise BrowserStateError("not viewing HTML") |
|---|
| 428 | return self._factory.forms() |
|---|
| 429 | |
|---|
| 430 | def global_form(self): |
|---|
| 431 | """Return the global form object, or None if the factory implementation |
|---|
| 432 | did not supply one. |
|---|
| 433 | |
|---|
| 434 | The "global" form object contains all controls that are not descendants of |
|---|
| 435 | any FORM element. |
|---|
| 436 | |
|---|
| 437 | The returned form object implements the ClientForm.HTMLForm interface. |
|---|
| 438 | |
|---|
| 439 | This is a separate method since the global form is not regarded as part |
|---|
| 440 | of the sequence of forms in the document -- mostly for |
|---|
| 441 | backwards-compatibility. |
|---|
| 442 | |
|---|
| 443 | """ |
|---|
| 444 | if not self.viewing_html(): |
|---|
| 445 | raise BrowserStateError("not viewing HTML") |
|---|
| 446 | return self._factory.global_form |
|---|
| 447 | |
|---|
| 448 | def viewing_html(self): |
|---|
| 449 | """Return whether the current response contains HTML data.""" |
|---|
| 450 | if self._response is None: |
|---|
| 451 | raise BrowserStateError("not viewing any document") |
|---|
| 452 | return self._factory.is_html |
|---|
| 453 | |
|---|
| 454 | def encoding(self): |
|---|
| 455 | if self._response is None: |
|---|
| 456 | raise BrowserStateError("not viewing any document") |
|---|
| 457 | return self._factory.encoding |
|---|
| 458 | |
|---|
| 459 | def title(self): |
|---|
| 460 | """Return title, or None if there is no title element in the document. |
|---|
| 461 | |
|---|
| 462 | Tags are stripped or textified as described in docs for |
|---|
| 463 | PullParser.get_text() method of pullparser module. |
|---|
| 464 | |
|---|
| 465 | """ |
|---|
| 466 | if not self.viewing_html(): |
|---|
| 467 | raise BrowserStateError("not viewing HTML") |
|---|
| 468 | return self._factory.title |
|---|
| 469 | |
|---|
| 470 | def select_form(self, name=None, predicate=None, nr=None): |
|---|
| 471 | """Select an HTML form for input. |
|---|
| 472 | |
|---|
| 473 | This is a bit like giving a form the "input focus" in a browser. |
|---|
| 474 | |
|---|
| 475 | If a form is selected, the Browser object supports the HTMLForm |
|---|
| 476 | interface, so you can call methods like .set_value(), .set(), and |
|---|
| 477 | .click(). |
|---|
| 478 | |
|---|
| 479 | Another way to select a form is to assign to the .form attribute. The |
|---|
| 480 | form assigned should be one of the objects returned by the .forms() |
|---|
| 481 | method. |
|---|
| 482 | |
|---|
| 483 | At least one of the name, predicate and nr arguments must be supplied. |
|---|
| 484 | If no matching form is found, mechanize.FormNotFoundError is raised. |
|---|
| 485 | |
|---|
| 486 | If name is specified, then the form must have the indicated name. |
|---|
| 487 | |
|---|
| 488 | If predicate is specified, then the form must match that function. The |
|---|
| 489 | predicate function is passed the HTMLForm as its single argument, and |
|---|
| 490 | should return a boolean value indicating whether the form matched. |
|---|
| 491 | |
|---|
| 492 | nr, if supplied, is the sequence number of the form (where 0 is the |
|---|
| 493 | first). Note that control 0 is the first form matching all the other |
|---|
| 494 | arguments (if supplied); it is not necessarily the first control in the |
|---|
| 495 | form. The "global form" (consisting of all form controls not contained |
|---|
| 496 | in any FORM element) is considered not to be part of this sequence and |
|---|
| 497 | to have no name, so will not be matched unless both name and nr are |
|---|
| 498 | None. |
|---|
| 499 | |
|---|
| 500 | """ |
|---|
| 501 | if not self.viewing_html(): |
|---|
| 502 | raise BrowserStateError("not viewing HTML") |
|---|
| 503 | if (name is None) and (predicate is None) and (nr is None): |
|---|
| 504 | raise ValueError( |
|---|
| 505 | "at least one argument must be supplied to specify form") |
|---|
| 506 | |
|---|
| 507 | global_form = self._factory.global_form |
|---|
| 508 | if nr is None and name is None and \ |
|---|
| 509 | predicate is not None and predicate(global_form): |
|---|
| 510 | self.form = global_form |
|---|
| 511 | return |
|---|
| 512 | |
|---|
| 513 | orig_nr = nr |
|---|
| 514 | for form in self.forms(): |
|---|
| 515 | if name is not None and name != form.name: |
|---|
| 516 | continue |
|---|
| 517 | if predicate is not None and not predicate(form): |
|---|
| 518 | continue |
|---|
| 519 | if nr: |
|---|
| 520 | nr -= 1 |
|---|
| 521 | continue |
|---|
| 522 | self.form = form |
|---|
| 523 | break # success |
|---|
| 524 | else: |
|---|
| 525 | # failure |
|---|
| 526 | description = [] |
|---|
| 527 | if name is not None: description.append("name '%s'" % name) |
|---|
| 528 | if predicate is not None: |
|---|
| 529 | description.append("predicate %s" % predicate) |
|---|
| 530 | if orig_nr is not None: description.append("nr %d" % orig_nr) |
|---|
| 531 | description = ", ".join(description) |
|---|
| 532 | raise FormNotFoundError("no form matching "+description) |
|---|
| 533 | |
|---|
| 534 | def click(self, *args, **kwds): |
|---|
| 535 | """See ClientForm.HTMLForm.click for documentation.""" |
|---|
| 536 | if not self.viewing_html(): |
|---|
| 537 | raise BrowserStateError("not viewing HTML") |
|---|
| 538 | request = self.form.click(*args, **kwds) |
|---|
| 539 | return self._add_referer_header(request) |
|---|
| 540 | |
|---|
| 541 | def submit(self, *args, **kwds): |
|---|
| 542 | """Submit current form. |
|---|
| 543 | |
|---|
| 544 | Arguments are as for ClientForm.HTMLForm.click(). |
|---|
| 545 | |
|---|
| 546 | Return value is same as for Browser.open(). |
|---|
| 547 | |
|---|
| 548 | """ |
|---|
| 549 | return self.open(self.click(*args, **kwds)) |
|---|
| 550 | |
|---|
| 551 | def click_link(self, link=None, **kwds): |
|---|
| 552 | """Find a link and return a Request object for it. |
|---|
| 553 | |
|---|
| 554 | Arguments are as for .find_link(), except that a link may be supplied |
|---|
| 555 | as the first argument. |
|---|
| 556 | |
|---|
| 557 | """ |
|---|
| 558 | if not self.viewing_html(): |
|---|
| 559 | raise BrowserStateError("not viewing HTML") |
|---|
| 560 | if not link: |
|---|
| 561 | link = self.find_link(**kwds) |
|---|
| 562 | else: |
|---|
| 563 | if kwds: |
|---|
| 564 | raise ValueError( |
|---|
| 565 | "either pass a Link, or keyword arguments, not both") |
|---|
| 566 | request = self.request_class(link.absolute_url) |
|---|
| 567 | return self._add_referer_header(request) |
|---|
| 568 | |
|---|
| 569 | def follow_link(self, link=None, **kwds): |
|---|
| 570 | """Find a link and .open() it. |
|---|
| 571 | |
|---|
| 572 | Arguments are as for .click_link(). |
|---|
| 573 | |
|---|
| 574 | Return value is same as for Browser.open(). |
|---|
| 575 | |
|---|
| 576 | """ |
|---|
| 577 | return self.open(self.click_link(link, **kwds)) |
|---|
| 578 | |
|---|
| 579 | def find_link(self, **kwds): |
|---|
| 580 | """Find a link in current page. |
|---|
| 581 | |
|---|
| 582 | Links are returned as mechanize.Link objects. |
|---|
| 583 | |
|---|
| 584 | # Return third link that .search()-matches the regexp "python" |
|---|
| 585 | # (by ".search()-matches", I mean that the regular expression method |
|---|
| 586 | # .search() is used, rather than .match()). |
|---|
| 587 | find_link(text_regex=re.compile("python"), nr=2) |
|---|
| 588 | |
|---|
| 589 | # Return first http link in the current page that points to somewhere |
|---|
| 590 | # on python.org whose link text (after tags have been removed) is |
|---|
| 591 | # exactly "monty python". |
|---|
| 592 | find_link(text="monty python", |
|---|
| 593 | url_regex=re.compile("http.*python.org")) |
|---|
| 594 | |
|---|
| 595 | # Return first link with exactly three HTML attributes. |
|---|
| 596 | find_link(predicate=lambda link: len(link.attrs) == 3) |
|---|
| 597 | |
|---|
| 598 | Links include anchors (<a>), image maps (<area>), and frames (<frame>, |
|---|
| 599 | <iframe>). |
|---|
| 600 | |
|---|
| 601 | All arguments must be passed by keyword, not position. Zero or more |
|---|
| 602 | arguments may be supplied. In order to find a link, all arguments |
|---|
| 603 | supplied must match. |
|---|
| 604 | |
|---|
| 605 | If a matching link is not found, mechanize.LinkNotFoundError is raised. |
|---|
| 606 | |
|---|
| 607 | text: link text between link tags: eg. <a href="blah">this bit</a> (as |
|---|
| 608 | returned by pullparser.get_compressed_text(), ie. without tags but |
|---|
| 609 | with opening tags "textified" as per the pullparser docs) must compare |
|---|
| 610 | equal to this argument, if supplied |
|---|
| 611 | text_regex: link text between tag (as defined above) must match the |
|---|
| 612 | regular expression object or regular expression string passed as this |
|---|
| 613 | argument, if supplied |
|---|
| 614 | name, name_regex: as for text and text_regex, but matched against the |
|---|
| 615 | name HTML attribute of the link tag |
|---|
| 616 | url, url_regex: as for text and text_regex, but matched against the |
|---|
| 617 | URL of the link tag (note this matches against Link.url, which is a |
|---|
| 618 | relative or absolute URL according to how it was written in the HTML) |
|---|
| 619 | tag: element name of opening tag, eg. "a" |
|---|
| 620 | predicate: a function taking a Link object as its single argument, |
|---|
| 621 | returning a boolean result, indicating whether the links |
|---|
| 622 | nr: matches the nth link that matches all other criteria (default 0) |
|---|
| 623 | |
|---|
| 624 | """ |
|---|
| 625 | try: |
|---|
| 626 | return self._filter_links(self._factory.links(), **kwds).next() |
|---|
| 627 | except StopIteration: |
|---|
| 628 | raise LinkNotFoundError() |
|---|
| 629 | |
|---|
| 630 | def __getattr__(self, name): |
|---|
| 631 | # pass through ClientForm / DOMForm methods and attributes |
|---|
| 632 | form = self.__dict__.get("form") |
|---|
| 633 | if form is None: |
|---|
| 634 | raise AttributeError( |
|---|
| 635 | "%s instance has no attribute %s (perhaps you forgot to " |
|---|
| 636 | ".select_form()?)" % (self.__class__, name)) |
|---|
| 637 | return getattr(form, name) |
|---|
| 638 | |
|---|
| 639 | def _filter_links(self, links, |
|---|
| 640 | text=None, text_regex=None, |
|---|
| 641 | name=None, name_regex=None, |
|---|
| 642 | url=None, url_regex=None, |
|---|
| 643 | tag=None, |
|---|
| 644 | predicate=None, |
|---|
| 645 | nr=0 |
|---|
| 646 | ): |
|---|
| 647 | if not self.viewing_html(): |
|---|
| 648 | raise BrowserStateError("not viewing HTML") |
|---|
| 649 | |
|---|
| 650 | found_links = [] |
|---|
| 651 | orig_nr = nr |
|---|
| 652 | |
|---|
| 653 | for link in links: |
|---|
| 654 | if url is not None and url != link.url: |
|---|
| 655 | continue |
|---|
| 656 | if url_regex is not None and not re.search(url_regex, link.url): |
|---|
| 657 | continue |
|---|
| 658 | if (text is not None and |
|---|
| 659 | (link.text is None or text != link.text)): |
|---|
| 660 | continue |
|---|
| 661 | if (text_regex is not None and |
|---|
| 662 | (link.text is None or not re.search(text_regex, link.text))): |
|---|
| 663 | continue |
|---|
| 664 | if name is not None and name != dict(link.attrs).get("name"): |
|---|
| 665 | continue |
|---|
| 666 | if name_regex is not None: |
|---|
| 667 | link_name = dict(link.attrs).get("name") |
|---|
| 668 | if link_name is None or not re.search(name_regex, link_name): |
|---|
| 669 | continue |
|---|
| 670 | if tag is not None and tag != link.tag: |
|---|
| 671 | continue |
|---|
| 672 | if predicate is not None and not predicate(link): |
|---|
| 673 | continue |
|---|
| 674 | if nr: |
|---|
| 675 | nr -= 1 |
|---|
| 676 | continue |
|---|
| 677 | yield link |
|---|
| 678 | nr = orig_nr |
|---|