1 | #Dan Blankenberg |
---|
2 | from optparse import OptionParser |
---|
3 | import sys |
---|
4 | import galaxy_utils.sequence.vcf |
---|
5 | |
---|
6 | from galaxy import eggs |
---|
7 | import pkg_resources; pkg_resources.require( "bx-python" ) |
---|
8 | import bx.align.maf |
---|
9 | |
---|
10 | UNKNOWN_NUCLEOTIDE = '*' |
---|
11 | |
---|
12 | class PopulationVCFParser( object ): |
---|
13 | def __init__( self, reader, name ): |
---|
14 | self.reader = reader |
---|
15 | self.name = name |
---|
16 | self.counter = 0 |
---|
17 | def next( self ): |
---|
18 | rval = [] |
---|
19 | vc = self.reader.next() |
---|
20 | for i, allele in enumerate( vc.alt ): |
---|
21 | rval.append( ( '%s_%i.%i' % ( self.name, i + 1, self.counter + 1 ), allele ) ) |
---|
22 | self.counter += 1 |
---|
23 | return ( vc, rval ) |
---|
24 | def __iter__( self ): |
---|
25 | while True: |
---|
26 | yield self.next() |
---|
27 | |
---|
28 | class SampleVCFParser( object ): |
---|
29 | def __init__( self, reader ): |
---|
30 | self.reader = reader |
---|
31 | self.counter = 0 |
---|
32 | def next( self ): |
---|
33 | rval = [] |
---|
34 | vc = self.reader.next() |
---|
35 | alleles = [ vc.ref ] + vc.alt |
---|
36 | |
---|
37 | if 'GT' in vc.format: |
---|
38 | gt_index = vc.format.index( 'GT' ) |
---|
39 | for sample_name, sample_value in zip( vc.sample_names, vc.sample_values ): |
---|
40 | gt_indexes = [] |
---|
41 | for i in sample_value[ gt_index ].replace( '|', '/' ).replace( '\\', '/' ).split( '/' ): #Do we need to consider phase here? |
---|
42 | try: |
---|
43 | gt_indexes.append( int( i ) ) |
---|
44 | except: |
---|
45 | gt_indexes.append( None ) |
---|
46 | for i, allele_i in enumerate( gt_indexes ): |
---|
47 | if allele_i is not None: |
---|
48 | rval.append( ( '%s_%i.%i' % ( sample_name, i + 1, self.counter + 1 ), alleles[ allele_i ] ) ) |
---|
49 | self.counter += 1 |
---|
50 | return ( vc, rval ) |
---|
51 | def __iter__( self ): |
---|
52 | while True: |
---|
53 | yield self.next() |
---|
54 | |
---|
55 | def main(): |
---|
56 | usage = "usage: %prog [options] output_file dbkey inputfile pop_name" |
---|
57 | parser = OptionParser( usage=usage ) |
---|
58 | parser.add_option( "-p", "--population", action="store_true", dest="population", default=False, help="Create MAF on a per population basis") |
---|
59 | parser.add_option( "-s", "--sample", action="store_true", dest="sample", default=False, help="Create MAF on a per sample basis") |
---|
60 | parser.add_option( "-n", "--name", dest="name", default='Unknown Custom Track', help="Name for Custom Track") |
---|
61 | parser.add_option( "-g", "--galaxy", action="store_true", dest="galaxy", default=False, help="Tool is being executed by Galaxy (adds extra error messaging).") |
---|
62 | |
---|
63 | |
---|
64 | ( options, args ) = parser.parse_args() |
---|
65 | |
---|
66 | if len ( args ) < 3: |
---|
67 | if options.galaxy: |
---|
68 | print >>sys.stderr, "It appears that you forgot to specify an input VCF file, click 'Add new VCF...' to add at least input.\n" |
---|
69 | parser.error( "Need to specify an output file, a dbkey and at least one input file" ) |
---|
70 | |
---|
71 | if not ( options.population ^ options.sample ): |
---|
72 | parser.error( 'You must specify either a per population conversion or a per sample conversion, but not both' ) |
---|
73 | |
---|
74 | out = open( args.pop(0), 'wb' ) |
---|
75 | out.write( 'track name="%s" visibility=pack\n' % options.name.replace( "\"", "'" ) ) |
---|
76 | |
---|
77 | maf_writer = bx.align.maf.Writer( out ) |
---|
78 | |
---|
79 | dbkey = args.pop(0) |
---|
80 | |
---|
81 | vcf_files = [] |
---|
82 | if options.population: |
---|
83 | i = 0 |
---|
84 | while args: |
---|
85 | filename = args.pop( 0 ) |
---|
86 | pop_name = args.pop( 0 ).replace( ' ', '_' ) |
---|
87 | if not pop_name: |
---|
88 | pop_name = 'population_%i' % ( i + 1 ) |
---|
89 | vcf_files.append( PopulationVCFParser( galaxy_utils.sequence.vcf.Reader( open( filename ) ), pop_name ) ) |
---|
90 | i += 1 |
---|
91 | else: |
---|
92 | while args: |
---|
93 | filename = args.pop( 0 ) |
---|
94 | vcf_files.append( SampleVCFParser( galaxy_utils.sequence.vcf.Reader( open( filename ) ) ) ) |
---|
95 | |
---|
96 | non_spec_skipped = 0 |
---|
97 | for vcf_file in vcf_files: |
---|
98 | for vc, variants in vcf_file: |
---|
99 | num_ins = 0 |
---|
100 | num_dels = 0 |
---|
101 | for variant_name, variant_text in variants: |
---|
102 | if 'D' in variant_text: |
---|
103 | num_dels = max( num_dels, int( variant_text[1:] ) ) |
---|
104 | elif 'I' in variant_text: |
---|
105 | num_ins = max( num_ins, len( variant_text ) - 1 ) |
---|
106 | |
---|
107 | alignment = bx.align.maf.Alignment() |
---|
108 | ref_text = vc.ref + '-' * num_ins + UNKNOWN_NUCLEOTIDE * ( num_dels - len( vc.ref ) ) |
---|
109 | start_pos = vc.pos - 1 |
---|
110 | if num_dels and start_pos: |
---|
111 | ref_text = UNKNOWN_NUCLEOTIDE + ref_text |
---|
112 | start_pos -= 1 |
---|
113 | alignment.add_component( bx.align.maf.Component( src='%s.%s%s' % ( |
---|
114 | dbkey, ("chr" if not vc.chrom.startswith("chr") else ""), vc.chrom ), |
---|
115 | start = start_pos, size = len( ref_text.replace( '-', '' ) ), |
---|
116 | strand = '+', src_size = start_pos + len( ref_text ), |
---|
117 | text = ref_text ) ) |
---|
118 | for variant_name, variant_text in variants: |
---|
119 | #FIXME: |
---|
120 | ## skip non-spec. compliant data, see: http://1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf3.3 for format spec |
---|
121 | ## this check is due to data having indels not represented in the published format spec, |
---|
122 | ## e.g. 1000 genomes pilot 1 indel data: ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2010_03/pilot1/indels/CEU.SRP000031.2010_03.indels.sites.vcf.gz |
---|
123 | if variant_text and variant_text[0] in [ '-', '+' ]: |
---|
124 | non_spec_skipped += 1 |
---|
125 | continue |
---|
126 | |
---|
127 | #do we need a left padding unknown nucleotide (do we have deletions)? |
---|
128 | if num_dels and start_pos: |
---|
129 | var_text = UNKNOWN_NUCLEOTIDE |
---|
130 | else: |
---|
131 | var_text = '' |
---|
132 | if 'D' in variant_text: |
---|
133 | cur_num_del = int( variant_text[1:] ) |
---|
134 | pre_del = min( len( vc.ref ), cur_num_del ) |
---|
135 | post_del = cur_num_del - pre_del |
---|
136 | var_text = var_text + '-' * pre_del + '-' * num_ins + '-' * post_del |
---|
137 | var_text = var_text + UNKNOWN_NUCLEOTIDE * ( len( ref_text ) - len( var_text ) ) |
---|
138 | elif 'I' in variant_text: |
---|
139 | cur_num_ins = len( variant_text ) - 1 |
---|
140 | var_text = var_text + vc.ref + variant_text[1:] + '-' * ( num_ins - cur_num_ins ) + UNKNOWN_NUCLEOTIDE * max( 0, ( num_dels - 1 ) ) |
---|
141 | else: |
---|
142 | var_text = var_text + variant_text + '-' * num_ins + UNKNOWN_NUCLEOTIDE * ( num_dels - len( vc.ref ) ) |
---|
143 | alignment.add_component( bx.align.maf.Component( src=variant_name, start = 0, size = len( var_text.replace( '-', '' ) ), strand = '+', src_size = len( var_text.replace( '-', '' ) ), text = var_text ) ) |
---|
144 | maf_writer.write( alignment ) |
---|
145 | |
---|
146 | maf_writer.close() |
---|
147 | |
---|
148 | if non_spec_skipped: |
---|
149 | print 'Skipped %i non-specification compliant indels.' % non_spec_skipped |
---|
150 | |
---|
151 | if __name__ == "__main__": main() |
---|