[2] | 1 | #!/usr/bin/env python |
---|
| 2 | |
---|
| 3 | """ |
---|
| 4 | Reads an interval or gene BED and a MAF Source. |
---|
| 5 | Produces a FASTA file containing the aligned intervals/gene sequences, based upon the provided coordinates |
---|
| 6 | |
---|
| 7 | Alignment blocks are layered ontop of each other based upon score. |
---|
| 8 | |
---|
| 9 | usage: %prog maf_file [options] |
---|
| 10 | -d, --dbkey=d: Database key, ie hg17 |
---|
| 11 | -c, --chromCol=c: Column of Chr |
---|
| 12 | -s, --startCol=s: Column of Start |
---|
| 13 | -e, --endCol=e: Column of End |
---|
| 14 | -S, --strandCol=S: Column of Strand |
---|
| 15 | -G, --geneBED: Input is a Gene BED file, process and join exons as one region |
---|
| 16 | -t, --mafSourceType=t: Type of MAF source to use |
---|
| 17 | -m, --mafSource=m: Path of source MAF file, if not using cached version |
---|
| 18 | -I, --mafIndex=I: Path of precomputed source MAF file index, if not using cached version |
---|
| 19 | -i, --interval_file=i: Input interval file |
---|
| 20 | -o, --output_file=o: Output MAF file |
---|
| 21 | -p, --species=p: Species to include in output |
---|
| 22 | -O, --overwrite_with_gaps=O: Overwrite bases found in a lower-scoring block with gaps interior to the sequence for a species. |
---|
| 23 | -z, --mafIndexFileDir=z: Directory of local maf_index.loc file |
---|
| 24 | |
---|
| 25 | usage: %prog dbkey_of_BED comma_separated_list_of_additional_dbkeys_to_extract comma_separated_list_of_indexed_maf_files input_gene_bed_file output_fasta_file cached|user GALAXY_DATA_INDEX_DIR |
---|
| 26 | """ |
---|
| 27 | |
---|
| 28 | #Dan Blankenberg |
---|
| 29 | from galaxy import eggs |
---|
| 30 | from galaxy.tools.util import maf_utilities |
---|
| 31 | import pkg_resources; pkg_resources.require( "bx-python" ) |
---|
| 32 | from bx.cookbook import doc_optparse |
---|
| 33 | import bx.intervals.io |
---|
| 34 | import sys |
---|
| 35 | |
---|
| 36 | assert sys.version_info[:2] >= ( 2, 4 ) |
---|
| 37 | |
---|
| 38 | def stop_err( msg ): |
---|
| 39 | sys.stderr.write( msg ) |
---|
| 40 | sys.exit() |
---|
| 41 | |
---|
| 42 | def __main__(): |
---|
| 43 | |
---|
| 44 | #Parse Command Line |
---|
| 45 | options, args = doc_optparse.parse( __doc__ ) |
---|
| 46 | mincols = 0 |
---|
| 47 | strand_col = -1 |
---|
| 48 | |
---|
| 49 | if options.dbkey: |
---|
| 50 | primary_species = options.dbkey |
---|
| 51 | else: |
---|
| 52 | primary_species = None |
---|
| 53 | if primary_species in [None, "?", "None"]: |
---|
| 54 | stop_err( "You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file." ) |
---|
| 55 | |
---|
| 56 | include_primary = True |
---|
| 57 | secondary_species = maf_utilities.parse_species_option( options.species ) |
---|
| 58 | if secondary_species: |
---|
| 59 | species = list( secondary_species ) # make copy of species list |
---|
| 60 | if primary_species in secondary_species: |
---|
| 61 | secondary_species.remove( primary_species ) |
---|
| 62 | else: |
---|
| 63 | include_primary = False |
---|
| 64 | else: |
---|
| 65 | species = None |
---|
| 66 | |
---|
| 67 | if options.interval_file: |
---|
| 68 | interval_file = options.interval_file |
---|
| 69 | else: |
---|
| 70 | stop_err( "Input interval file has not been specified." ) |
---|
| 71 | |
---|
| 72 | if options.output_file: |
---|
| 73 | output_file = options.output_file |
---|
| 74 | else: |
---|
| 75 | stop_err( "Output file has not been specified." ) |
---|
| 76 | |
---|
| 77 | if not options.geneBED: |
---|
| 78 | if options.chromCol: |
---|
| 79 | chr_col = int( options.chromCol ) - 1 |
---|
| 80 | else: |
---|
| 81 | stop_err( "Chromosome column not set, click the pencil icon in the history item to set the metadata attributes." ) |
---|
| 82 | |
---|
| 83 | if options.startCol: |
---|
| 84 | start_col = int( options.startCol ) - 1 |
---|
| 85 | else: |
---|
| 86 | stop_err( "Start column not set, click the pencil icon in the history item to set the metadata attributes." ) |
---|
| 87 | |
---|
| 88 | if options.endCol: |
---|
| 89 | end_col = int( options.endCol ) - 1 |
---|
| 90 | else: |
---|
| 91 | stop_err( "End column not set, click the pencil icon in the history item to set the metadata attributes." ) |
---|
| 92 | |
---|
| 93 | if options.strandCol: |
---|
| 94 | strand_col = int( options.strandCol ) - 1 |
---|
| 95 | |
---|
| 96 | mafIndexFile = "%s/maf_index.loc" % options.mafIndexFileDir |
---|
| 97 | |
---|
| 98 | overwrite_with_gaps = True |
---|
| 99 | if options.overwrite_with_gaps and options.overwrite_with_gaps.lower() == 'false': |
---|
| 100 | overwrite_with_gaps = False |
---|
| 101 | |
---|
| 102 | #Finish parsing command line |
---|
| 103 | |
---|
| 104 | #get index for mafs based on type |
---|
| 105 | index = index_filename = None |
---|
| 106 | #using specified uid for locally cached |
---|
| 107 | if options.mafSourceType.lower() in ["cached"]: |
---|
| 108 | index = maf_utilities.maf_index_by_uid( options.mafSource, mafIndexFile ) |
---|
| 109 | if index is None: |
---|
| 110 | stop_err( "The MAF source specified (%s) appears to be invalid." % ( options.mafSource ) ) |
---|
| 111 | elif options.mafSourceType.lower() in ["user"]: |
---|
| 112 | #index maf for use here, need to remove index_file when finished |
---|
| 113 | index, index_filename = maf_utilities.open_or_build_maf_index( options.mafSource, options.mafIndex, species = [primary_species] ) |
---|
| 114 | if index is None: |
---|
| 115 | stop_err( "Your MAF file appears to be malformed." ) |
---|
| 116 | else: |
---|
| 117 | stop_err( "Invalid MAF source type specified." ) |
---|
| 118 | |
---|
| 119 | #open output file |
---|
| 120 | output = open( output_file, "w" ) |
---|
| 121 | |
---|
| 122 | if options.geneBED: |
---|
| 123 | region_enumerator = maf_utilities.line_enumerator( open( interval_file, "r" ).readlines() ) |
---|
| 124 | else: |
---|
| 125 | region_enumerator = enumerate( bx.intervals.io.NiceReaderWrapper( open( interval_file, 'r' ), chrom_col = chr_col, start_col = start_col, end_col = end_col, strand_col = strand_col, fix_strand = True, return_header = False, return_comments = False ) ) |
---|
| 126 | |
---|
| 127 | #Step through intervals |
---|
| 128 | regions_extracted = 0 |
---|
| 129 | line_count = 0 |
---|
| 130 | for line_count, line in region_enumerator: |
---|
| 131 | try: |
---|
| 132 | if options.geneBED: #Process as Gene BED |
---|
| 133 | try: |
---|
| 134 | starts, ends, fields = maf_utilities.get_starts_ends_fields_from_gene_bed( line ) |
---|
| 135 | #create spliced alignment object |
---|
| 136 | alignment = maf_utilities.get_spliced_region_alignment( index, primary_species, fields[0], starts, ends, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps ) |
---|
| 137 | primary_name = secondary_name = fields[3] |
---|
| 138 | alignment_strand = fields[5] |
---|
| 139 | except Exception, e: |
---|
| 140 | print "Error loading exon positions from input line %i: %s" % ( line_count, e ) |
---|
| 141 | continue |
---|
| 142 | else: #Process as standard intervals |
---|
| 143 | try: |
---|
| 144 | #create spliced alignment object |
---|
| 145 | alignment = maf_utilities.get_region_alignment( index, primary_species, line.chrom, line.start, line.end, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps ) |
---|
| 146 | primary_name = "%s(%s):%s-%s" % ( line.chrom, line.strand, line.start, line.end ) |
---|
| 147 | secondary_name = "" |
---|
| 148 | alignment_strand = line.strand |
---|
| 149 | except Exception, e: |
---|
| 150 | print "Error loading region positions from input line %i: %s" % ( line_count, e ) |
---|
| 151 | continue |
---|
| 152 | |
---|
| 153 | #Write alignment to output file |
---|
| 154 | #Output primary species first, if requested |
---|
| 155 | if include_primary: |
---|
| 156 | output.write( ">%s.%s\n" %( primary_species, primary_name ) ) |
---|
| 157 | if alignment_strand == "-": |
---|
| 158 | output.write( alignment.get_sequence_reverse_complement( primary_species ) ) |
---|
| 159 | else: |
---|
| 160 | output.write( alignment.get_sequence( primary_species ) ) |
---|
| 161 | output.write( "\n" ) |
---|
| 162 | #Output all remainging species |
---|
| 163 | for spec in secondary_species or alignment.get_species_names( skip = primary_species ): |
---|
| 164 | if secondary_name: |
---|
| 165 | output.write( ">%s.%s\n" % ( spec, secondary_name ) ) |
---|
| 166 | else: |
---|
| 167 | output.write( ">%s\n" % ( spec ) ) |
---|
| 168 | if alignment_strand == "-": |
---|
| 169 | output.write( alignment.get_sequence_reverse_complement( spec ) ) |
---|
| 170 | else: |
---|
| 171 | output.write( alignment.get_sequence( spec ) ) |
---|
| 172 | output.write( "\n" ) |
---|
| 173 | |
---|
| 174 | output.write( "\n" ) |
---|
| 175 | |
---|
| 176 | regions_extracted += 1 |
---|
| 177 | |
---|
| 178 | except Exception, e: |
---|
| 179 | print "Unexpected error from input line %i: %s" % ( line_count, e ) |
---|
| 180 | continue |
---|
| 181 | |
---|
| 182 | #close output file |
---|
| 183 | output.close() |
---|
| 184 | |
---|
| 185 | #remove index file if created during run |
---|
| 186 | maf_utilities.remove_temp_index_file( index_filename ) |
---|
| 187 | |
---|
| 188 | #Print message about success for user |
---|
| 189 | if regions_extracted > 0: |
---|
| 190 | print "%i regions were processed successfully." % ( regions_extracted ) |
---|
| 191 | else: |
---|
| 192 | print "No regions were processed successfully." |
---|
| 193 | if line_count > 0 and options.geneBED: |
---|
| 194 | print "This tool requires your input file to conform to the 12 column BED standard." |
---|
| 195 | |
---|
| 196 | if __name__ == "__main__": __main__() |
---|