1 | #!/usr/bin/python2.6 |
---|
2 | |
---|
3 | """ |
---|
4 | Read a table dump in the UCSC gene table format and print a tab separated |
---|
5 | list of intervals corresponding to requested features of each gene. |
---|
6 | |
---|
7 | usage: ucsc_gene_table_to_intervals.py [options] < gene_table.txt |
---|
8 | |
---|
9 | options: |
---|
10 | -h, --help show this help message and exit |
---|
11 | -rREGION, --region=REGION |
---|
12 | Limit to region: one of coding, utr3, utr5, transcribed [default] |
---|
13 | -e, --exons Only print intervals overlapping an exon |
---|
14 | """ |
---|
15 | |
---|
16 | import optparse, string, sys |
---|
17 | |
---|
18 | def main(): |
---|
19 | # Parse command line |
---|
20 | parser = optparse.OptionParser( usage="%prog [options] < gene_table.txt" ) |
---|
21 | parser.add_option( "-r", "--region", dest="region", default="transcribed", |
---|
22 | help="Limit to region: one of coding, utr3, utr5, transcribed [default]" ) |
---|
23 | parser.add_option( "-e", "--exons", action="store_true", dest="exons", |
---|
24 | help="Only print intervals overlapping an exon" ) |
---|
25 | parser.add_option( "-s", "--strand", action="store_true", dest="strand", |
---|
26 | help="Print strand after interval" ) |
---|
27 | parser.add_option( "-b", "--nobin", action="store_false", dest="discard_first_column", default=True, |
---|
28 | help="file doesn't contain a 'bin' column (use this for pre-hg18 files)" ) |
---|
29 | options, args = parser.parse_args() |
---|
30 | assert options.region in ( 'coding', 'utr3', 'utr5', 'transcribed' ), "Invalid region argument" |
---|
31 | |
---|
32 | # Read table from stdin and handle each gene |
---|
33 | for line in sys.stdin: |
---|
34 | |
---|
35 | # Parse fields from gene tabls |
---|
36 | fields = line.split( '\t' ) |
---|
37 | if (options.discard_first_column): fields.pop(0) |
---|
38 | chrom = fields[1] |
---|
39 | strand = fields[2] |
---|
40 | tx_start = int( fields[3] ) |
---|
41 | tx_end = int( fields[4] ) |
---|
42 | cds_start = int( fields[5] ) |
---|
43 | cds_end = int( fields[6] ) |
---|
44 | |
---|
45 | # Determine the subset of the transcribed region we are interested in |
---|
46 | if options.region == 'utr3': |
---|
47 | if strand == '-': region_start, region_end = tx_start, cds_start |
---|
48 | else: region_start, region_end = cds_end, tx_end |
---|
49 | elif options.region == 'utr5': |
---|
50 | if strand == '-': region_start, region_end = cds_end, tx_end |
---|
51 | else: region_start, region_end = tx_start, cds_start |
---|
52 | elif options.region == 'coding': |
---|
53 | region_start, region_end = cds_start, cds_end |
---|
54 | else: |
---|
55 | region_start, region_end = tx_start, tx_end |
---|
56 | |
---|
57 | # If only interested in exons, print the portion of each exon overlapping |
---|
58 | # the region of interest, otherwise print the span of the region |
---|
59 | if options.exons: |
---|
60 | exon_starts = map( int, fields[8].rstrip( ',\n' ).split( ',' ) ) |
---|
61 | exon_ends = map( int, fields[9].rstrip( ',\n' ).split( ',' ) ) |
---|
62 | for start, end in zip( exon_starts, exon_ends ): |
---|
63 | start = max( start, region_start ) |
---|
64 | end = min( end, region_end ) |
---|
65 | if start < end: |
---|
66 | if strand: print_tab_sep( chrom, start, end, strand ) |
---|
67 | else: print_tab_sep( chrom, start, end ) |
---|
68 | else: |
---|
69 | if strand: print_tab_sep( chrom, region_start, region_end, strand ) |
---|
70 | else: print_tab_sep( chrom, region_start, region_end ) |
---|
71 | |
---|
72 | def print_tab_sep( *args ): |
---|
73 | """Print items in `l` to stdout separated by tabs""" |
---|
74 | print string.join( [ str( f ) for f in args ], '\t' ) |
---|
75 | |
---|
76 | if __name__ == "__main__": main() |
---|