1 | #!/usr/bin/env python |
---|
2 | |
---|
3 | """ |
---|
4 | Connects to a UCSC table browser and scrapes chrominfo for every build |
---|
5 | specified by an input file (such as one output by parse_builds.py). |
---|
6 | If no input file is specified, it will connect using parse_builds.py to |
---|
7 | retrieve a list of available builds. |
---|
8 | |
---|
9 | All chromInfo is placed in a path with the convention |
---|
10 | {dbpath}/buildname.len |
---|
11 | |
---|
12 | Usage: |
---|
13 | python build_chrom_db.py dbpath/ [builds_file] |
---|
14 | """ |
---|
15 | |
---|
16 | import sys |
---|
17 | import parse_builds |
---|
18 | import urllib |
---|
19 | import fileinput |
---|
20 | |
---|
def getchrominfo(url, db):
    """Yield the first two columns of a build's chromInfo table.

    Requests the ``chromInfo`` table for ``db`` from a UCSC table browser
    (hgTables CGI) and parses the tab-separated response line by line.

    Args:
        url: Base URL of the hgTables CGI. The encoded query string is
            appended to it; a ``?`` or ``&`` separator is added when the
            URL does not already end with one. If empty or None, the
            genome-test server is used.
        db: Build name sent as both the ``db`` and ``hgta_track`` query
            parameters.

    Yields:
        A two-element list ``[fields[0], fields[1]]`` for every response
        line that is not a ``#`` comment and has at least two
        tab-separated fields (presumably chromosome name and size --
        confirm against the chromInfo table schema).
    """
    default_url = "http://genome-test.cse.ucsc.edu/cgi-bin/hgTables?"
    # Honour the caller-supplied endpoint; fall back to the historical
    # hard-coded server so callers passing an empty value keep working.
    base = url or default_url
    if "?" not in base:
        base += "?"
    elif base[-1] not in "?&":
        base += "&"
    query = urllib.urlencode({
        "clade": "",
        "org": "",
        "db": db,
        "hgta_outputType": "primaryTable",
        "hgta_group": "allTables",
        "hgta_table": "chromInfo",
        "hgta_track": db,
        "hgta_regionType": "",
        "position": "",
        "hgta_doTopSubmit": "get info",
    })
    page = urllib.urlopen(base + query)
    try:
        for line in page:
            line = line.rstrip("\r\n")
            if line.startswith("#"):
                continue
            fields = line.split("\t")
            # Lines without a tab (e.g. blank lines or non-table output)
            # carry no usable record and are skipped.
            if len(fields) > 1:
                yield [fields[0], fields[1]]
    finally:
        # Release the HTTP connection even if the consumer stops early
        # or parsing raises.
        page.close()
41 | |
---|
if __name__ == "__main__":
    # Script entry point: write one "<build>.len" file per build into the
    # directory given as the first argument. Build names come from the
    # optional builds file (second argument) or, failing that, from
    # parse_builds.getbuilds().
    import os  # local import: only the script entry point needs it

    if len(sys.argv) == 1:
        print("Path to place chromInfo tables must be specified.")
        sys.exit(1)
    dbpath = sys.argv[1]
    builds = []
    if len(sys.argv) > 2:
        # The build name is the first tab-separated column of each
        # non-comment line.
        try:
            buildfile = fileinput.FileInput(sys.argv[2])
            try:
                for line in buildfile:
                    if line.startswith("#"):
                        continue
                    # Strip the line ending first so a single-column line
                    # does not leave a newline inside the build name.
                    name = line.rstrip("\r\n").split("\t")[0]
                    if name:  # ignore blank lines
                        builds.append(name)
            finally:
                buildfile.close()
        except (IOError, OSError):
            print("Bad input file.")
            sys.exit(1)
    else:
        try:
            for build in parse_builds.getbuilds("http://genome-test.cse.ucsc.edu/cgi-bin/das/dsn"):
                builds.append(build[0])
        except Exception:
            # Exception (not a bare except) so Ctrl-C and sys.exit still
            # propagate.
            print("Unable to retrieve builds.")
            sys.exit(1)
    for build in builds:
        if build == "?":
            continue  # no lengths for unspecified chrom
        # os.path.join works whether or not dbpath ends with a separator.
        outfile = open(os.path.join(dbpath, build + ".len"), "w")
        try:
            print("Retrieving " + build)
            for chrominfo in getchrominfo("http://genome-test.cse.ucsc.edu/cgi-bin/hgTables?", build):
                outfile.write("\t".join(chrominfo) + "\n")
        finally:
            # Close the output file even if the download fails part-way.
            outfile.close()