[2] | 1 | #!/usr/bin/env python |
---|
| 2 | |
---|
| 3 | """ |
---|
| 4 | Connects to a UCSC table browser and scrapes chrominfo for every build |
---|
| 5 | specified by an input file (such as one output by parse_builds.py). |
---|
| 6 | If no input file is specified, it will connect using parse_builds.py to |
---|
| 7 | retrieve a list of available builds. |
---|
| 8 | |
---|
| 9 | All chromInfo is placed in a path with the convention |
---|
| 10 | {dbpath}/buildname.len |
---|
| 11 | |
---|
| 12 | Usage: |
---|
| 13 | python build_chrom_db.py dbpath/ [builds_file] |
---|
| 14 | """ |
---|
| 15 | |
---|
| 16 | import sys |
---|
| 17 | import parse_builds |
---|
| 18 | import urllib |
---|
| 19 | import fileinput |
---|
| 20 | |
---|
def getchrominfo(url, db):
    """Yield ``[field0, field1]`` pairs from the chromInfo table of a build.

    Requests the ``chromInfo`` table for ``db`` from a UCSC table browser
    (hgTables CGI) and yields the first two tab-separated fields of every
    data line (presumably chromosome name and length).  Lines starting with
    ``#`` and lines with fewer than two fields are skipped.

    Parameters:
        url: base URL of the hgTables CGI.  A falsy value falls back to the
            genome-test server.  A ``?`` or ``&`` separator is appended when
            the URL does not already end with one.
        db: build name, sent as both the ``db`` and ``hgta_track`` fields.

    This is a generator: no request is made until the first item is asked
    for, and the connection is closed when the generator finishes or is
    closed.
    """
    # Local imports keep this block usable on both Python 2 and Python 3
    # without touching the file-level import list.
    try:
        from urllib import urlencode, urlopen  # Python 2
    except ImportError:
        from urllib.parse import urlencode  # Python 3
        from urllib.request import urlopen

    # The original ignored ``url`` and always used this address; it is kept
    # as the fallback so callers passing an empty value still work.
    base = url or "http://genome-test.cse.ucsc.edu/cgi-bin/hgTables?"
    if "?" not in base:
        base += "?"
    elif not base.endswith(("?", "&")):
        base += "&"

    query = urlencode({
        "clade": "",
        "org": "",
        "db": db,
        "hgta_outputType": "primaryTable",
        "hgta_group": "allTables",
        "hgta_table": "chromInfo",
        "hgta_track": db,
        "hgta_regionType": "",
        "position": "",
        "hgta_doTopSubmit": "get info",
    })

    page = urlopen(base + query)
    try:
        for line in page:
            # Python 3 responses yield bytes; Python 2 already yields str.
            if not isinstance(line, str):
                line = line.decode("utf-8", "replace")
            line = line.rstrip("\r\n")
            if line.startswith("#"):
                continue
            fields = line.split("\t")
            if len(fields) > 1:
                yield [fields[0], fields[1]]
    finally:
        # Release the connection even if the consumer stops early.
        page.close()
---|
| 41 | |
---|
def main(argv=None):
    """Write one ``<build>.len`` file of chromInfo rows per requested build.

    Parameters:
        argv: full argument vector, program name first; defaults to
            ``sys.argv``.  ``argv[1]`` is the output directory and the
            optional ``argv[2]`` is a tab-separated builds file whose first
            column is the build name (``#`` lines are comments).

    Without a builds file, the build list is fetched through
    ``parse_builds.getbuilds``.  Prints a message and exits with status 1
    when the output path is missing, the builds file cannot be read, or the
    build list cannot be retrieved.
    """
    import os  # local import: only this entry point needs path joining

    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print("Path to place chromInfo tables must be specified.")
        sys.exit(1)

    dbpath = argv[1]
    builds = []
    if len(argv) > 2:
        try:
            with open(argv[2]) as buildfile:
                for line in buildfile:
                    if line.startswith("#"):
                        continue
                    # Strip the line ending first so a line without a tab
                    # does not leave a newline inside the build name.
                    name = line.rstrip("\r\n").split("\t")[0]
                    if name:  # ignore blank lines
                        builds.append(name)
        except Exception:
            # Deliberately broad, but unlike a bare ``except`` it lets
            # KeyboardInterrupt and SystemExit through.
            print("Bad input file.")
            sys.exit(1)
    else:
        try:
            for build in parse_builds.getbuilds(
                    "http://genome-test.cse.ucsc.edu/cgi-bin/das/dsn"):
                builds.append(build[0])
        except Exception:
            print("Unable to retrieve builds.")
            sys.exit(1)

    for build in builds:
        if build == "?":
            continue  # no lengths for unspecified chrom
        print("Retrieving " + build)
        # os.path.join gives {dbpath}/buildname.len whether or not dbpath
        # was given with a trailing slash.
        outpath = os.path.join(dbpath, build + ".len")
        # ``with`` closes the file even if the fetch raises part-way.
        with open(outpath, "w") as outfile:
            for chrominfo in getchrominfo(
                    "http://genome-test.cse.ucsc.edu/cgi-bin/hgTables?",
                    build):
                outfile.write("\t".join(chrominfo) + "\n")


if __name__ == "__main__":
    main()
---|