1 | #!/usr/bin/env python |
---|
2 | |
---|
3 | """ |
---|
4 | Reads an interval or gene BED and a MAF Source. |
---|
5 | Produces a FASTA file containing the aligned intervals/gene sequences, based upon the provided coordinates |
---|
6 | |
---|
7 | Alignment blocks are layered ontop of each other based upon score. |
---|
8 | |
---|
9 | usage: %prog maf_file [options] |
---|
10 | -d, --dbkey=d: Database key, ie hg17 |
---|
11 | -c, --chromCol=c: Column of Chr |
---|
12 | -s, --startCol=s: Column of Start |
---|
13 | -e, --endCol=e: Column of End |
---|
14 | -S, --strandCol=S: Column of Strand |
---|
15 | -G, --geneBED: Input is a Gene BED file, process and join exons as one region |
---|
16 | -t, --mafSourceType=t: Type of MAF source to use |
---|
17 | -m, --mafSource=m: Path of source MAF file, if not using cached version |
---|
18 | -I, --mafIndex=I: Path of precomputed source MAF file index, if not using cached version |
---|
19 | -i, --interval_file=i: Input interval file |
---|
20 | -o, --output_file=o: Output MAF file |
---|
21 | -p, --species=p: Species to include in output |
---|
22 | -O, --overwrite_with_gaps=O: Overwrite bases found in a lower-scoring block with gaps interior to the sequence for a species. |
---|
23 | -z, --mafIndexFileDir=z: Directory of local maf_index.loc file |
---|
24 | |
---|
25 | usage: %prog dbkey_of_BED comma_separated_list_of_additional_dbkeys_to_extract comma_separated_list_of_indexed_maf_files input_gene_bed_file output_fasta_file cached|user GALAXY_DATA_INDEX_DIR |
---|
26 | """ |
---|
27 | |
---|
28 | #Dan Blankenberg |
---|
29 | from galaxy import eggs |
---|
30 | from galaxy.tools.util import maf_utilities |
---|
31 | import pkg_resources; pkg_resources.require( "bx-python" ) |
---|
32 | from bx.cookbook import doc_optparse |
---|
33 | import bx.intervals.io |
---|
34 | import sys |
---|
35 | |
---|
36 | assert sys.version_info[:2] >= ( 2, 4 ) |
---|
37 | |
---|
38 | def stop_err( msg ): |
---|
39 | sys.stderr.write( msg ) |
---|
40 | sys.exit() |
---|
41 | |
---|
42 | def __main__(): |
---|
43 | |
---|
44 | #Parse Command Line |
---|
45 | options, args = doc_optparse.parse( __doc__ ) |
---|
46 | mincols = 0 |
---|
47 | strand_col = -1 |
---|
48 | |
---|
49 | if options.dbkey: |
---|
50 | primary_species = options.dbkey |
---|
51 | else: |
---|
52 | primary_species = None |
---|
53 | if primary_species in [None, "?", "None"]: |
---|
54 | stop_err( "You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file." ) |
---|
55 | |
---|
56 | include_primary = True |
---|
57 | secondary_species = maf_utilities.parse_species_option( options.species ) |
---|
58 | if secondary_species: |
---|
59 | species = list( secondary_species ) # make copy of species list |
---|
60 | if primary_species in secondary_species: |
---|
61 | secondary_species.remove( primary_species ) |
---|
62 | else: |
---|
63 | include_primary = False |
---|
64 | else: |
---|
65 | species = None |
---|
66 | |
---|
67 | if options.interval_file: |
---|
68 | interval_file = options.interval_file |
---|
69 | else: |
---|
70 | stop_err( "Input interval file has not been specified." ) |
---|
71 | |
---|
72 | if options.output_file: |
---|
73 | output_file = options.output_file |
---|
74 | else: |
---|
75 | stop_err( "Output file has not been specified." ) |
---|
76 | |
---|
77 | if not options.geneBED: |
---|
78 | if options.chromCol: |
---|
79 | chr_col = int( options.chromCol ) - 1 |
---|
80 | else: |
---|
81 | stop_err( "Chromosome column not set, click the pencil icon in the history item to set the metadata attributes." ) |
---|
82 | |
---|
83 | if options.startCol: |
---|
84 | start_col = int( options.startCol ) - 1 |
---|
85 | else: |
---|
86 | stop_err( "Start column not set, click the pencil icon in the history item to set the metadata attributes." ) |
---|
87 | |
---|
88 | if options.endCol: |
---|
89 | end_col = int( options.endCol ) - 1 |
---|
90 | else: |
---|
91 | stop_err( "End column not set, click the pencil icon in the history item to set the metadata attributes." ) |
---|
92 | |
---|
93 | if options.strandCol: |
---|
94 | strand_col = int( options.strandCol ) - 1 |
---|
95 | |
---|
96 | mafIndexFile = "%s/maf_index.loc" % options.mafIndexFileDir |
---|
97 | |
---|
98 | overwrite_with_gaps = True |
---|
99 | if options.overwrite_with_gaps and options.overwrite_with_gaps.lower() == 'false': |
---|
100 | overwrite_with_gaps = False |
---|
101 | |
---|
102 | #Finish parsing command line |
---|
103 | |
---|
104 | #get index for mafs based on type |
---|
105 | index = index_filename = None |
---|
106 | #using specified uid for locally cached |
---|
107 | if options.mafSourceType.lower() in ["cached"]: |
---|
108 | index = maf_utilities.maf_index_by_uid( options.mafSource, mafIndexFile ) |
---|
109 | if index is None: |
---|
110 | stop_err( "The MAF source specified (%s) appears to be invalid." % ( options.mafSource ) ) |
---|
111 | elif options.mafSourceType.lower() in ["user"]: |
---|
112 | #index maf for use here, need to remove index_file when finished |
---|
113 | index, index_filename = maf_utilities.open_or_build_maf_index( options.mafSource, options.mafIndex, species = [primary_species] ) |
---|
114 | if index is None: |
---|
115 | stop_err( "Your MAF file appears to be malformed." ) |
---|
116 | else: |
---|
117 | stop_err( "Invalid MAF source type specified." ) |
---|
118 | |
---|
119 | #open output file |
---|
120 | output = open( output_file, "w" ) |
---|
121 | |
---|
122 | if options.geneBED: |
---|
123 | region_enumerator = maf_utilities.line_enumerator( open( interval_file, "r" ).readlines() ) |
---|
124 | else: |
---|
125 | region_enumerator = enumerate( bx.intervals.io.NiceReaderWrapper( open( interval_file, 'r' ), chrom_col = chr_col, start_col = start_col, end_col = end_col, strand_col = strand_col, fix_strand = True, return_header = False, return_comments = False ) ) |
---|
126 | |
---|
127 | #Step through intervals |
---|
128 | regions_extracted = 0 |
---|
129 | line_count = 0 |
---|
130 | for line_count, line in region_enumerator: |
---|
131 | try: |
---|
132 | if options.geneBED: #Process as Gene BED |
---|
133 | try: |
---|
134 | starts, ends, fields = maf_utilities.get_starts_ends_fields_from_gene_bed( line ) |
---|
135 | #create spliced alignment object |
---|
136 | alignment = maf_utilities.get_spliced_region_alignment( index, primary_species, fields[0], starts, ends, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps ) |
---|
137 | primary_name = secondary_name = fields[3] |
---|
138 | alignment_strand = fields[5] |
---|
139 | except Exception, e: |
---|
140 | print "Error loading exon positions from input line %i: %s" % ( line_count, e ) |
---|
141 | continue |
---|
142 | else: #Process as standard intervals |
---|
143 | try: |
---|
144 | #create spliced alignment object |
---|
145 | alignment = maf_utilities.get_region_alignment( index, primary_species, line.chrom, line.start, line.end, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps ) |
---|
146 | primary_name = "%s(%s):%s-%s" % ( line.chrom, line.strand, line.start, line.end ) |
---|
147 | secondary_name = "" |
---|
148 | alignment_strand = line.strand |
---|
149 | except Exception, e: |
---|
150 | print "Error loading region positions from input line %i: %s" % ( line_count, e ) |
---|
151 | continue |
---|
152 | |
---|
153 | #Write alignment to output file |
---|
154 | #Output primary species first, if requested |
---|
155 | if include_primary: |
---|
156 | output.write( ">%s.%s\n" %( primary_species, primary_name ) ) |
---|
157 | if alignment_strand == "-": |
---|
158 | output.write( alignment.get_sequence_reverse_complement( primary_species ) ) |
---|
159 | else: |
---|
160 | output.write( alignment.get_sequence( primary_species ) ) |
---|
161 | output.write( "\n" ) |
---|
162 | #Output all remainging species |
---|
163 | for spec in secondary_species or alignment.get_species_names( skip = primary_species ): |
---|
164 | if secondary_name: |
---|
165 | output.write( ">%s.%s\n" % ( spec, secondary_name ) ) |
---|
166 | else: |
---|
167 | output.write( ">%s\n" % ( spec ) ) |
---|
168 | if alignment_strand == "-": |
---|
169 | output.write( alignment.get_sequence_reverse_complement( spec ) ) |
---|
170 | else: |
---|
171 | output.write( alignment.get_sequence( spec ) ) |
---|
172 | output.write( "\n" ) |
---|
173 | |
---|
174 | output.write( "\n" ) |
---|
175 | |
---|
176 | regions_extracted += 1 |
---|
177 | |
---|
178 | except Exception, e: |
---|
179 | print "Unexpected error from input line %i: %s" % ( line_count, e ) |
---|
180 | continue |
---|
181 | |
---|
182 | #close output file |
---|
183 | output.close() |
---|
184 | |
---|
185 | #remove index file if created during run |
---|
186 | maf_utilities.remove_temp_index_file( index_filename ) |
---|
187 | |
---|
188 | #Print message about success for user |
---|
189 | if regions_extracted > 0: |
---|
190 | print "%i regions were processed successfully." % ( regions_extracted ) |
---|
191 | else: |
---|
192 | print "No regions were processed successfully." |
---|
193 | if line_count > 0 and options.geneBED: |
---|
194 | print "This tool requires your input file to conform to the 12 column BED standard." |
---|
195 | |
---|
196 | if __name__ == "__main__": __main__() |
---|