1 | """ |
---|
2 | Extension functions to check all of the links on a page. |
---|
3 | |
---|
4 | Usage: |
---|
5 | |
---|
6 | check_links [ <pattern> ] |
---|
7 | |
---|
8 | Make sure that all of the HTTP links on the current page can be visited |
---|
9 | successfully. If 'pattern' is given, check only URLs that match that |
---|
10 | regular expression. |
---|
11 | |
---|
12 | If option 'check_links.only_collect_bad_links' is on, then all bad |
---|
13 | links are silently collected across all calls to check_links. The |
---|
14 | function 'report_bad_links' can then be used to report all of the links, |
---|
15 | together with their referring pages. |
---|
16 | """ |
---|
17 | |
---|
18 | __all__ = ['check_links', 'report_bad_links'] |
---|
19 | |
---|
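# A minimal twill script using this extension might look like the sketch
# below (illustrative only: the URL is hypothetical; 'extend_with' and 'go'
# are the standard twill commands assumed here):
#
#   extend_with check_links
#   go http://example.com/
#   check_links http://example\.com/.*
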
DEBUG = True

import re
from twill import commands
from twill.errors import TwillAssertionError

### first, set up config options & persistent 'bad links' memory...

if commands._options.get('check_links.only_collect_bad_links') is None:
    commands._options['check_links.only_collect_bad_links'] = False

bad_links_dict = {}
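
# The collection behavior can be switched on from a twill script with
# twill's 'config' command; a sketch, assuming the option name above:
#
#   config check_links.only_collect_bad_links 1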

#
# main function: 'check_links'
#

def check_links(pattern='', visited={}):
    """
    >> check_links [ <pattern> ]

    Make sure that all of the HTTP links on the current page can be visited
    with an HTTP response 200 (success).  If 'pattern' is given, interpret
    it as a regular expression that link URLs must contain in order to be
    tested, e.g.

        check_links http://.*\.google\.com

    would check only links to google URLs.  Note that because 'follow'
    is used to visit the pages, the referrer URL is properly set on the
    visit.

    (The mutable default for 'visited' is intentional: the dictionary
    persists across calls, so each URL is only checked once.)
    """
    from twill import commands

    if DEBUG:
        print 'in check_links'

    OUT = commands.OUT
    browser = commands.browser

    #
    # compile the regexp
    #

    regexp = None
    if pattern:
        regexp = re.compile(pattern)

    #
    # iterate over all links, collecting those that match.
    #
    # note that in the case of duplicate URLs, only one of the
    # links is actually followed!
    #

    collected_urls = {}

    links = list(browser._browser.links())
    if not links:
        if DEBUG:
            print>>OUT, "no links to check!?"
        return

    for link in links:
        url = link.absolute_url
        url = url.split('#', 1)[0]      # get rid of subpage pointers

        if not (url.startswith('http://') or url.startswith('https://')):
            if DEBUG:
                print>>OUT, "url '%s' is not an HTTP link; ignoring" % (url,)
            continue

        if regexp:
            if regexp.search(url):
                collected_urls[url] = link
                if DEBUG:
                    print>>OUT, "Gathered URL %s -- matched regexp" % (url,)
            elif DEBUG:
                print>>OUT, "URL %s doesn't match regexp" % (url,)
        else:
            collected_urls[url] = link
            if DEBUG:
                print>>OUT, "Gathered URL %s." % (url,)

    #
    # now, for each unique URL, follow the link.  Trap ALL exceptions
    # as failures.
    #

    failed = []
    for link in collected_urls.values():
        went = False
        try:
            if DEBUG:
                print>>OUT, "Trying %s" % (link.absolute_url,),

            if link.absolute_url not in visited:
                went = True
                browser.follow_link(link)

                code = browser.get_code()
                assert code == 200

                visited[link.absolute_url] = 1

                if DEBUG:
                    print>>OUT, '...success!'
            else:
                if DEBUG:
                    print>>OUT, ' (already visited successfully)'
        except:
            failed.append(link.absolute_url)
            if DEBUG:
                print>>OUT, '...failure ;('

        if went:
            browser.back()

    if failed:
        if commands._options['check_links.only_collect_bad_links']:
            # remember each bad link, together with the page referring to it
            for l in failed:
                referring_pages = bad_links_dict.get(l, [])
                if DEBUG:
                    print>>OUT, '***', browser.get_url()
                referring_pages.append(browser.get_url())
                bad_links_dict[l] = referring_pages
        else:
            print>>OUT, '\nCould not follow %d links' % (len(failed),)
            print>>OUT, '\t%s\n' % '\n\t'.join(failed)
            raise TwillAssertionError("broken links on page")

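# check_links can also be called directly from Python; a sketch, assuming
# an active twill session (the URL and pattern are hypothetical):
#
#   from twill import commands
#   commands.go('http://example.com/')
#   check_links('http://example\.com/')
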
def report_bad_links(fail_if_exist='+', flush_bad_links='+'):
    """
    >> report_bad_links [<fail-if-exist> [<flush-bad-links>]]

    Report all of the links collected across check_links runs (collected
    if and only if the config option check_links.only_collect_bad_links
    is set).

    If <fail-if-exist> is true (the default), the command will fail
    after reporting any bad links.

    If <flush-bad-links> is true (the default), the list of bad links
    is cleared after reporting; otherwise it is retained across calls.
    """
    global bad_links_dict

    from twill import utils
    fail_if_exist = utils.make_boolean(fail_if_exist)
    flush_bad_links = utils.make_boolean(flush_bad_links)

    from twill import commands
    OUT = commands.OUT

    if not bad_links_dict:
        print>>OUT, '\nNo bad links to report.\n'
    else:
        print>>OUT, '\nCould not follow %d links' % (len(bad_links_dict),)
        for link, referrers in bad_links_dict.items():
            err_msg = "\t link '%s' (occurs on: " % (link,) \
                      + ",".join(referrers) + ')'
            print>>OUT, err_msg

        if flush_bad_links:
            bad_links_dict = {}

        if fail_if_exist:
            raise TwillAssertionError("broken links encountered")
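
# An illustrative collect-then-report workflow in a twill script (URLs are
# hypothetical; assumes twill's 'extend_with' and 'config' commands):
#
#   extend_with check_links
#   config check_links.only_collect_bad_links 1
#   go http://example.com/page1
#   check_links
#   go http://example.com/page2
#   check_links
#   report_bad_links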