root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/EGG-INFO/scripts/ucsc_gene_table_to_intervals.py @ 3

リビジョン 3, 3.4 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1#!/usr/bin/python2.6
2
3"""
4Read a table dump in the UCSC gene table format and print a tab separated
5list of intervals corresponding to requested features of each gene.
6
7usage: ucsc_gene_table_to_intervals.py [options] < gene_table.txt
8
9options:
10  -h, --help            show this help message and exit
11  -rREGION, --region=REGION
12                        Limit to region: one of coding, utr3, utr5, transcribed [default]
13  -e, --exons           Only print intervals overlapping an exon
14"""
15
16import optparse, string, sys
17
18def main():
19    # Parse command line   
20    parser = optparse.OptionParser( usage="%prog [options] < gene_table.txt" )
21    parser.add_option( "-r", "--region", dest="region", default="transcribed",
22                       help="Limit to region: one of coding, utr3, utr5, transcribed [default]" )
23    parser.add_option( "-e", "--exons",  action="store_true", dest="exons",
24                       help="Only print intervals overlapping an exon" )
25    parser.add_option( "-s", "--strand",  action="store_true", dest="strand",
26                       help="Print strand after interval" )
27    parser.add_option( "-b", "--nobin",  action="store_false", dest="discard_first_column", default=True,
28                       help="file doesn't contain a 'bin' column (use this for pre-hg18 files)" )
29    options, args = parser.parse_args()
30    assert options.region in ( 'coding', 'utr3', 'utr5', 'transcribed' ), "Invalid region argument"
31
32    # Read table from stdin and handle each gene
33    for line in sys.stdin:
34
35        # Parse fields from gene tabls
36        fields = line.split( '\t' )
37        if (options.discard_first_column): fields.pop(0)
38        chrom = fields[1]
39        strand = fields[2]
40        tx_start = int( fields[3] )
41        tx_end = int( fields[4] )
42        cds_start = int( fields[5] )
43        cds_end = int( fields[6] )
44
45        # Determine the subset of the transcribed region we are interested in
46        if options.region == 'utr3':
47            if strand == '-': region_start, region_end = tx_start, cds_start
48            else: region_start, region_end = cds_end, tx_end
49        elif options.region == 'utr5':
50            if strand == '-': region_start, region_end = cds_end, tx_end
51            else: region_start, region_end = tx_start, cds_start
52        elif options.region == 'coding':
53            region_start, region_end = cds_start, cds_end
54        else:
55            region_start, region_end = tx_start, tx_end
56
57        # If only interested in exons, print the portion of each exon overlapping
58        # the region of interest, otherwise print the span of the region
59        if options.exons:
60            exon_starts = map( int, fields[8].rstrip( ',\n' ).split( ',' ) )
61            exon_ends = map( int, fields[9].rstrip( ',\n' ).split( ',' ) )
62            for start, end in zip( exon_starts, exon_ends ):
63                start = max( start, region_start )
64                end = min( end, region_end )
65                if start < end:
66                    if strand: print_tab_sep( chrom, start, end, strand )
67                    else: print_tab_sep( chrom, start, end )
68        else:
69            if strand: print_tab_sep( chrom, region_start, region_end, strand )
70            else: print_tab_sep( chrom, region_start, region_end )
71
72def print_tab_sep( *args ):
73    """Print items in `l` to stdout separated by tabs"""
74    print string.join( [ str( f ) for f in args ], '\t' )
75
76if __name__ == "__main__": main()
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。