| 1 | """ |
|---|
| 2 | Readers extracting gene (exon and intron) information from bed / gtf / gff |
|---|
| 3 | formats. |
|---|
| 4 | |
|---|
| 5 | - GeneReader: yields exons |
|---|
| 6 | - CDSReader: yields cds_exons |
|---|
| 7 | - FeatureReader: yields cds_exons, introns, exons |
|---|
| 8 | |
|---|
| 9 | For gff/gtf, the start_codon stop_codon line types are merged with CDSs. |
|---|
| 10 | """ |
|---|
| 11 | |
|---|
| 12 | import sys |
|---|
| 13 | from bx.bitset import * |
|---|
| 14 | from bx.bitset_utils import * |
|---|
| 15 | from bx.bitset_builders import * |
|---|
| 16 | |
|---|
| 17 | def GeneReader( fh, format='gff' ): |
|---|
| 18 | """ yield chrom, strand, gene_exons, name """ |
|---|
| 19 | |
|---|
| 20 | known_formats = ( 'gff', 'gtf', 'bed') |
|---|
| 21 | if format not in known_formats: |
|---|
| 22 | print >>sys.stderr, '%s format not in %s' % (format, ",".join( known_formats )) |
|---|
| 23 | raise '?' |
|---|
| 24 | |
|---|
| 25 | if format == 'bed': |
|---|
| 26 | for line in fh: |
|---|
| 27 | f = line.strip().split() |
|---|
| 28 | chrom = f[0] |
|---|
| 29 | chrom_start = int(f[1]) |
|---|
| 30 | name = f[4] |
|---|
| 31 | strand = f[5] |
|---|
| 32 | cdsStart = int(f[6]) |
|---|
| 33 | cdsEnd = int(f[7]) |
|---|
| 34 | blockCount = int(f[9]) |
|---|
| 35 | blockSizes = [ int(i) for i in f[10].strip(',').split(',') ] |
|---|
| 36 | blockStarts = [ chrom_start + int(i) for i in f[11].strip(',').split(',') ] |
|---|
| 37 | |
|---|
| 38 | # grab cdsStart - cdsEnd |
|---|
| 39 | gene_exons = [] |
|---|
| 40 | for base,offset in zip( blockStarts, blockSizes ): |
|---|
| 41 | exon_start = base |
|---|
| 42 | exon_end = base+offset |
|---|
| 43 | gene_exons.append( (exon_start, exon_end) ) |
|---|
| 44 | yield chrom, strand, gene_exons, name |
|---|
| 45 | genelist = {} |
|---|
| 46 | grouplist = [] |
|---|
| 47 | if format == 'gff' or format == 'gtf': |
|---|
| 48 | for line in fh: |
|---|
| 49 | if line.startswith('#'): continue |
|---|
| 50 | fields = line.strip().split('\t') |
|---|
| 51 | if len( fields ) < 9: continue |
|---|
| 52 | |
|---|
| 53 | # fields |
|---|
| 54 | |
|---|
| 55 | chrom = fields[0] |
|---|
| 56 | ex_st = int( fields[3] ) - 1 # make zero-centered |
|---|
| 57 | ex_end = int( fields[4] ) #+ 1 # make exclusive |
|---|
| 58 | strand = fields[6] |
|---|
| 59 | |
|---|
| 60 | if format == 'gtf': |
|---|
| 61 | group = fields[8].split(';')[0] |
|---|
| 62 | else: |
|---|
| 63 | group = fields[8] |
|---|
| 64 | |
|---|
| 65 | if group not in grouplist: grouplist.append( group ) |
|---|
| 66 | if group not in genelist: |
|---|
| 67 | genelist[group] = (chrom, strand, []) |
|---|
| 68 | exons_i = 2 |
|---|
| 69 | genelist[group][exons_i].append( ( ex_st, ex_end ) ) |
|---|
| 70 | |
|---|
| 71 | sp = lambda a,b: cmp( a[0], b[0] ) |
|---|
| 72 | |
|---|
| 73 | #for gene in genelist.values(): |
|---|
| 74 | for gene in grouplist: |
|---|
| 75 | chrom, strand, gene_exons = genelist[ gene ] |
|---|
| 76 | gene_exons = bitset_union( gene_exons ) |
|---|
| 77 | yield chrom, strand, gene_exons, gene |
|---|
| 78 | |
|---|
| 79 | def CDSReader( fh, format='gff' ): |
|---|
| 80 | """ yield chrom, strand, cds_exons, name """ |
|---|
| 81 | |
|---|
| 82 | known_formats = ( 'gff', 'gtf', 'bed') |
|---|
| 83 | if format not in known_formats: |
|---|
| 84 | print >>sys.stderr, '%s format not in %s' % (format, ",".join( known_formats )) |
|---|
| 85 | raise '?' |
|---|
| 86 | |
|---|
| 87 | if format == 'bed': |
|---|
| 88 | for line in fh: |
|---|
| 89 | f = line.strip().split() |
|---|
| 90 | chrom = f[0] |
|---|
| 91 | chrom_start = int(f[1]) |
|---|
| 92 | name = f[4] |
|---|
| 93 | strand = f[5] |
|---|
| 94 | cdsStart = int(f[6]) |
|---|
| 95 | cdsEnd = int(f[7]) |
|---|
| 96 | blockCount = int(f[9]) |
|---|
| 97 | blockSizes = [ int(i) for i in f[10].strip(',').split(',') ] |
|---|
| 98 | blockStarts = [ chrom_start + int(i) for i in f[11].strip(',').split(',') ] |
|---|
| 99 | |
|---|
| 100 | # grab cdsStart - cdsEnd |
|---|
| 101 | cds_exons = [] |
|---|
| 102 | cds_seq = '' |
|---|
| 103 | genome_seq_index = [] |
|---|
| 104 | for base,offset in zip( blockStarts, blockSizes ): |
|---|
| 105 | if (base + offset) < cdsStart: continue |
|---|
| 106 | if base > cdsEnd: continue |
|---|
| 107 | exon_start = max( base, cdsStart ) |
|---|
| 108 | exon_end = min( base+offset, cdsEnd ) |
|---|
| 109 | cds_exons.append( (exon_start, exon_end) ) |
|---|
| 110 | yield chrom, strand, cds_exons, name |
|---|
| 111 | |
|---|
| 112 | genelist = {} |
|---|
| 113 | grouplist = [] |
|---|
| 114 | if format == 'gff' or format == 'gtf': |
|---|
| 115 | for line in fh: |
|---|
| 116 | if line.startswith('#'): continue |
|---|
| 117 | fields = line.strip().split('\t') |
|---|
| 118 | if len( fields ) < 9: continue |
|---|
| 119 | if fields[2] not in ('CDS', 'stop_codon', 'start_codon'): continue |
|---|
| 120 | |
|---|
| 121 | # fields |
|---|
| 122 | |
|---|
| 123 | chrom = fields[0] |
|---|
| 124 | ex_st = int( fields[3] ) - 1 # make zero-centered |
|---|
| 125 | ex_end = int( fields[4] ) #+ 1 # make exclusive |
|---|
| 126 | strand = fields[6] |
|---|
| 127 | |
|---|
| 128 | if format == 'gtf': |
|---|
| 129 | group = fields[8].split(';')[0] |
|---|
| 130 | else: |
|---|
| 131 | group = fields[8] |
|---|
| 132 | |
|---|
| 133 | if group not in grouplist: grouplist.append( group ) |
|---|
| 134 | if group not in genelist: |
|---|
| 135 | genelist[group] = (chrom, strand, []) |
|---|
| 136 | |
|---|
| 137 | genelist[group][2].append( ( ex_st, ex_end ) ) |
|---|
| 138 | |
|---|
| 139 | sp = lambda a,b: cmp( a[0], b[0] ) |
|---|
| 140 | |
|---|
| 141 | #for gene in genelist.values(): |
|---|
| 142 | for gene in grouplist: |
|---|
| 143 | chrom, strand, cds_exons = genelist[ gene ] |
|---|
| 144 | seqlen = sum([ a[1]-a[0] for a in cds_exons ]) |
|---|
| 145 | overhang = seqlen % 3 |
|---|
| 146 | if overhang > 0: |
|---|
| 147 | #print >>sys.stderr, "adjusting ", gene |
|---|
| 148 | if strand == '+': |
|---|
| 149 | cds_exons[-1] = ( cds_exons[-1][0], cds_exons[-1][1] - overhang ) |
|---|
| 150 | else: |
|---|
| 151 | cds_exons[0] = ( cds_exons[0][0] + overhang, cds_exons[0][1] ) |
|---|
| 152 | cds_exons = bitset_union( cds_exons ) |
|---|
| 153 | yield chrom, strand, cds_exons, gene |
|---|
| 154 | |
|---|
| 155 | def FeatureReader( fh, format='gff', alt_introns_subtract="exons", gtf_parse=None): |
|---|
| 156 | """ |
|---|
| 157 | yield chrom, strand, cds_exons, introns, exons, name |
|---|
| 158 | |
|---|
| 159 | gtf_parse Example: |
|---|
| 160 | # parse gene_id from transcript_id "AC073130.2-001"; gene_id "TES"; |
|---|
| 161 | gene_name = lambda s: s.split(';')[1].split()[1].strip('"') |
|---|
| 162 | |
|---|
| 163 | for chrom, strand, cds_exons, introns, exons, name in FeatureReader( sys.stdin, format='gtf', gtf_parse=gene_name ) |
|---|
| 164 | """ |
|---|
| 165 | |
|---|
| 166 | known_formats = ( 'gff', 'gtf', 'bed') |
|---|
| 167 | if format not in known_formats: |
|---|
| 168 | print >>sys.stderr, '%s format not in %s' % (format, ",".join( known_formats )) |
|---|
| 169 | raise '?' |
|---|
| 170 | |
|---|
| 171 | if format == 'bed': |
|---|
| 172 | for line in fh: |
|---|
| 173 | f = line.strip().split() |
|---|
| 174 | chrom = f[0] |
|---|
| 175 | chrom_start = int(f[1]) |
|---|
| 176 | name = f[4] |
|---|
| 177 | strand = f[5] |
|---|
| 178 | cdsStart = int(f[6]) |
|---|
| 179 | cdsEnd = int(f[7]) |
|---|
| 180 | blockCount = int(f[9]) |
|---|
| 181 | blockSizes = [ int(i) for i in f[10].strip(',').split(',') ] |
|---|
| 182 | blockStarts = [ chrom_start + int(i) for i in f[11].strip(',').split(',') ] |
|---|
| 183 | |
|---|
| 184 | # grab cdsStart - cdsEnd |
|---|
| 185 | cds_exons = [] |
|---|
| 186 | exons = [] |
|---|
| 187 | |
|---|
| 188 | cds_seq = '' |
|---|
| 189 | genome_seq_index = [] |
|---|
| 190 | for base,offset in zip( blockStarts, blockSizes ): |
|---|
| 191 | if (base + offset) < cdsStart: continue |
|---|
| 192 | if base > cdsEnd: continue |
|---|
| 193 | # exons |
|---|
| 194 | exon_start = base |
|---|
| 195 | exon_end = base+offset |
|---|
| 196 | exons.append( (exon_start, exon_end) ) |
|---|
| 197 | # cds exons |
|---|
| 198 | exon_start = max( base, cdsStart ) |
|---|
| 199 | exon_end = min( base+offset, cdsEnd ) |
|---|
| 200 | cds_exons.append( (exon_start, exon_end) ) |
|---|
| 201 | cds_exons = bitset_union( cds_exons ) |
|---|
| 202 | exons = bitset_union( exons ) |
|---|
| 203 | introns = bitset_complement( exons ) |
|---|
| 204 | yield chrom, strand, cds_exons, introns, exons, name |
|---|
| 205 | |
|---|
| 206 | genelist = {} |
|---|
| 207 | grouplist = [] |
|---|
| 208 | if format == 'gff' or format == 'gtf': |
|---|
| 209 | for line in fh: |
|---|
| 210 | if line.startswith('#'): continue |
|---|
| 211 | fields = line.strip().split('\t') |
|---|
| 212 | if len( fields ) < 9: continue |
|---|
| 213 | |
|---|
| 214 | # fields |
|---|
| 215 | |
|---|
| 216 | chrom = fields[0] |
|---|
| 217 | ex_st = int( fields[3] ) - 1 # make zero-centered |
|---|
| 218 | ex_end = int( fields[4] ) #+ 1 # make exclusive |
|---|
| 219 | strand = fields[6] |
|---|
| 220 | |
|---|
| 221 | if format == 'gtf': |
|---|
| 222 | if not gtf_parse: |
|---|
| 223 | group = fields[8].split(';')[0] |
|---|
| 224 | else: |
|---|
| 225 | group = gtf_parse( fields[8] ) |
|---|
| 226 | else: |
|---|
| 227 | group = fields[8] |
|---|
| 228 | |
|---|
| 229 | # Results are listed in the same order as encountered |
|---|
| 230 | if group not in grouplist: grouplist.append( group ) |
|---|
| 231 | |
|---|
| 232 | if group not in genelist: |
|---|
| 233 | # chrom, strand, cds_exons, introns, exons, cds_start, cds_end |
|---|
| 234 | genelist[group] = [chrom, strand, [], [], [], None, None] |
|---|
| 235 | |
|---|
| 236 | if fields[2] == 'exon': |
|---|
| 237 | genelist[group][4].append( ( ex_st, ex_end ) ) |
|---|
| 238 | |
|---|
| 239 | elif fields[2] in ('CDS', 'stop_codon', 'start_codon'): |
|---|
| 240 | genelist[group][2].append( ( ex_st, ex_end ) ) |
|---|
| 241 | |
|---|
| 242 | if fields[2] == 'start_codon': |
|---|
| 243 | if strand == '+': genelist[group][5] = ex_st |
|---|
| 244 | else: genelist[group][5] = ex_end |
|---|
| 245 | if fields[2] == 'stop_codon': |
|---|
| 246 | if strand == '+': genelist[group][5] = ex_end |
|---|
| 247 | else: genelist[group][5] = ex_st |
|---|
| 248 | |
|---|
| 249 | elif fields[2] == 'intron': |
|---|
| 250 | genelist[group][3].append( ( ex_st, ex_end ) ) |
|---|
| 251 | |
|---|
| 252 | for gene in grouplist: |
|---|
| 253 | chrom, strand, cds_exons, introns, exons, cds_start, cds_end = genelist[ gene ] |
|---|
| 254 | |
|---|
| 255 | cds_exons = bitset_union( cds_exons ) |
|---|
| 256 | exons = bitset_union( exons ) |
|---|
| 257 | |
|---|
| 258 | # assure that cds exons were within the cds range |
|---|
| 259 | if cds_start is not None and cds_end is not None: |
|---|
| 260 | if strand == '+': |
|---|
| 261 | cds_exons = bitset_intersect( cds_exons, [(cds_start,cds_end)] ) |
|---|
| 262 | else: |
|---|
| 263 | cds_exons = bitset_intersect( cds_exons, [(cds_end,cds_start)] ) |
|---|
| 264 | |
|---|
| 265 | # assure that introns are non-overlapping with themselves or exons |
|---|
| 266 | if alt_introns_subtract: |
|---|
| 267 | if alt_introns_subtract == 'exons': |
|---|
| 268 | introns = bitset_subtract( introns, exons ) |
|---|
| 269 | if alt_introns_subtract == 'cds_exons': |
|---|
| 270 | introns = bitset_subtract( introns, cds_exons ) |
|---|
| 271 | else: introns = bitset_union( introns ) |
|---|
| 272 | |
|---|
| 273 | # assure CDS is a multiple of 3, trim from last exon if necessary |
|---|
| 274 | seqlen = sum([ a[1]-a[0] for a in cds_exons ]) |
|---|
| 275 | overhang = seqlen % 3 |
|---|
| 276 | if overhang > 0: |
|---|
| 277 | if strand == '+': |
|---|
| 278 | cds_exons[-1] = ( cds_exons[-1][0], cds_exons[-1][1] - overhang ) |
|---|
| 279 | else: |
|---|
| 280 | cds_exons[0] = ( cds_exons[0][0] + overhang, cds_exons[0][1] ) |
|---|
| 281 | |
|---|
| 282 | yield chrom, strand, cds_exons, introns, exons, gene |
|---|
| 283 | |
|---|