""" Implements TwillBrowser, a simple stateful wrapper for mechanize.Browser. See _browser.py for mechanize code. """ OUT=None # Python imports import re # wwwsearch imports import _mechanize_dist as mechanize from _mechanize_dist import BrowserStateError, LinkNotFoundError, ClientForm # twill package imports from _browser import PatchedMechanizeBrowser from utils import print_form, ConfigurableParsingFactory, \ ResultWrapper, unique_match, HistoryStack from errors import TwillException # # TwillBrowser # class TwillBrowser(object): """ Wrap mechanize behavior in a simple stateful way. Public variables: * result -- mechanize-style 'result' object. """ def __init__(self): # # create special link/forms parsing code to run tidy on HTML first. # factory = ConfigurableParsingFactory() # # Create the mechanize browser. # b = PatchedMechanizeBrowser(history=HistoryStack(), factory=factory) self._browser = b self.result = None self.last_submit_button = None # # create & set a cookie jar. # policy = mechanize.DefaultCookiePolicy(rfc2965=True) cj = mechanize.LWPCookieJar(policy=policy) self._browser.set_cookiejar(cj) self.cj = cj # Ask for MIME type 'text/html' by preference. self._browser.addheaders = [("Accept", "text/html; */*")] # ignore robots.txt self._browser.set_handle_robots(None) # create an HTTP auth handler self.creds = mechanize.HTTPPasswordMgr() # do handle HTTP-EQUIV properly. self._browser.set_handle_equiv(True) # callables to be called after each page load. self._post_load_hooks = [] ### get/set HTTP authentication stuff. def _set_creds(self, creds): self._creds = creds self._browser.set_password_manager(creds) def _get_creds(self): return self._creds creds = property(_get_creds, _set_creds) def go(self, url): """ Visit given URL. """ try_urls = [ url, ] # if this is an absolute URL that is just missing the 'http://' at # the beginning, try fixing that. if url.find('://') == -1: full_url = 'http://%s' % (url,) # mimic browser behavior try_urls.append(full_url) # if this is a '?' URL, then assume that we want to tack it onto # the end of the current URL. if url.startswith('?'): current_url = self.get_url() current_url = current_url.split('?')[0] try_urls = [ current_url + url, ] success = False for u in try_urls: try: self._journey('open', u) success = True break except IOError: # @CTB test this! pass if success: print>>OUT, '==> at', self.get_url() else: raise BrowserStateError("cannot go to '%s'" % (url,)) def reload(self): """ Tell the browser to reload the current page. """ self._journey('reload') print>>OUT, '==> reloaded' def back(self): """ Return to previous page, if possible. """ try: self._journey('back') print>>OUT, '==> back to', self.get_url() except BrowserStateError: print>>OUT, '==> back at empty page.' def get_code(self): """ Get the HTTP status code received for the current page. """ if self.result: return self.result.get_http_code() return None def get_html(self): """ Get the HTML for the current page. """ if self.result: return self.result.get_page() return None def get_title(self): """ Get content of the HTML title element for the current page. """ return self._browser.title() def get_url(self): """ Get the URL of the current page. """ if self.result: return self.result.get_url() return None def find_link(self, pattern): """ Find the first link with a URL, link text, or name matching the given pattern. """ # # first, try to find a link matching that regexp. # try: l = self._browser.find_link(url_regex=pattern) except LinkNotFoundError: # # then, look for a text match. # try: l = self._browser.find_link(text_regex=pattern) except LinkNotFoundError: # # finally, look for a name match. # try: l = self._browser.find_link(name_regex=pattern) except LinkNotFoundError: l = None return l def follow_link(self, link): """ Follow the given link. """ self._journey('follow_link', link) print>>OUT, '==> at', self.get_url() def set_agent_string(self, agent): """ Set the agent string to the given value. """ for i in xrange(len(self._browser.addheaders)): if self._browser.addheaders[i][0] == "User-agent": del self._browser.addheaders[i] break self._browser.addheaders += [("User-agent", agent)] def showforms(self): """ Pretty-print all of the forms. Include the global form (form elements outside of