root/galaxy-central/tools/maf/interval_maf_to_merged_fasta.py

リビジョン 2, 8.4 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#!/usr/bin/env python
2
3"""
4Reads an interval or gene BED and a MAF Source.
5Produces a FASTA file containing the aligned intervals/gene sequences, based upon the provided coordinates
6
7Alignment blocks are layered ontop of each other based upon score.
8
9usage: %prog maf_file [options]
10   -d, --dbkey=d: Database key, ie hg17
11   -c, --chromCol=c: Column of Chr
12   -s, --startCol=s: Column of Start
13   -e, --endCol=e: Column of End
14   -S, --strandCol=S: Column of Strand
15   -G, --geneBED: Input is a Gene BED file, process and join exons as one region
16   -t, --mafSourceType=t: Type of MAF source to use
17   -m, --mafSource=m: Path of source MAF file, if not using cached version
18   -I, --mafIndex=I: Path of precomputed source MAF file index, if not using cached version
19   -i, --interval_file=i:       Input interval file
20   -o, --output_file=o:      Output MAF file
21   -p, --species=p: Species to include in output
22   -O, --overwrite_with_gaps=O: Overwrite bases found in a lower-scoring block with gaps interior to the sequence for a species.
23   -z, --mafIndexFileDir=z: Directory of local maf_index.loc file
24
25usage: %prog dbkey_of_BED comma_separated_list_of_additional_dbkeys_to_extract comma_separated_list_of_indexed_maf_files input_gene_bed_file output_fasta_file cached|user GALAXY_DATA_INDEX_DIR
26"""
27
28#Dan Blankenberg
29from galaxy import eggs
30from galaxy.tools.util import maf_utilities
31import pkg_resources; pkg_resources.require( "bx-python" )
32from bx.cookbook import doc_optparse
33import bx.intervals.io
34import sys
35
36assert sys.version_info[:2] >= ( 2, 4 )
37
38def stop_err( msg ):
39    sys.stderr.write( msg )
40    sys.exit()
41
42def __main__():
43   
44    #Parse Command Line
45    options, args = doc_optparse.parse( __doc__ )
46    mincols = 0
47    strand_col = -1
48   
49    if options.dbkey:
50        primary_species = options.dbkey
51    else:
52        primary_species = None
53    if primary_species in [None, "?", "None"]:
54        stop_err( "You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file." )
55   
56    include_primary = True
57    secondary_species = maf_utilities.parse_species_option( options.species )
58    if secondary_species:
59        species = list( secondary_species ) # make copy of species list
60        if primary_species in secondary_species:
61            secondary_species.remove( primary_species )
62        else:
63            include_primary = False
64    else:
65        species = None
66   
67    if options.interval_file:
68        interval_file = options.interval_file
69    else:
70        stop_err( "Input interval file has not been specified." )
71   
72    if options.output_file:
73        output_file = options.output_file
74    else:
75        stop_err( "Output file has not been specified." )
76   
77    if not options.geneBED:
78        if options.chromCol:
79            chr_col = int( options.chromCol ) - 1
80        else:
81            stop_err( "Chromosome column not set, click the pencil icon in the history item to set the metadata attributes." )
82       
83        if options.startCol:
84            start_col = int( options.startCol ) - 1
85        else:
86            stop_err( "Start column not set, click the pencil icon in the history item to set the metadata attributes." )
87       
88        if options.endCol:
89            end_col = int( options.endCol ) - 1
90        else:
91            stop_err( "End column not set, click the pencil icon in the history item to set the metadata attributes." )
92       
93        if options.strandCol:
94            strand_col = int( options.strandCol ) - 1
95   
96    mafIndexFile = "%s/maf_index.loc" % options.mafIndexFileDir
97   
98    overwrite_with_gaps = True
99    if options.overwrite_with_gaps and options.overwrite_with_gaps.lower() == 'false':
100        overwrite_with_gaps = False
101   
102    #Finish parsing command line
103       
104    #get index for mafs based on type
105    index = index_filename = None
106    #using specified uid for locally cached
107    if options.mafSourceType.lower() in ["cached"]:
108        index = maf_utilities.maf_index_by_uid( options.mafSource, mafIndexFile )
109        if index is None:
110            stop_err( "The MAF source specified (%s) appears to be invalid." % ( options.mafSource ) )
111    elif options.mafSourceType.lower() in ["user"]:
112        #index maf for use here, need to remove index_file when finished
113        index, index_filename = maf_utilities.open_or_build_maf_index( options.mafSource, options.mafIndex, species = [primary_species] )
114        if index is None:
115            stop_err( "Your MAF file appears to be malformed." )
116    else:
117        stop_err( "Invalid MAF source type specified." )
118   
119    #open output file
120    output = open( output_file, "w" )
121   
122    if options.geneBED:
123        region_enumerator = maf_utilities.line_enumerator( open( interval_file, "r" ).readlines() )
124    else:
125        region_enumerator = enumerate( bx.intervals.io.NiceReaderWrapper( open( interval_file, 'r' ), chrom_col = chr_col, start_col = start_col, end_col = end_col, strand_col = strand_col, fix_strand = True, return_header = False, return_comments = False ) )
126   
127    #Step through intervals
128    regions_extracted = 0
129    line_count = 0
130    for line_count, line in region_enumerator:
131        try:
132            if options.geneBED: #Process as Gene BED
133                try:
134                    starts, ends, fields = maf_utilities.get_starts_ends_fields_from_gene_bed( line )
135                    #create spliced alignment object
136                    alignment = maf_utilities.get_spliced_region_alignment( index, primary_species, fields[0], starts, ends, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps )
137                    primary_name = secondary_name = fields[3]
138                    alignment_strand = fields[5]
139                except Exception, e:
140                    print "Error loading exon positions from input line %i: %s" % ( line_count, e )
141                    continue
142            else: #Process as standard intervals
143                try:
144                    #create spliced alignment object
145                    alignment = maf_utilities.get_region_alignment( index, primary_species, line.chrom, line.start, line.end, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps )
146                    primary_name = "%s(%s):%s-%s" % ( line.chrom, line.strand, line.start, line.end )
147                    secondary_name = ""
148                    alignment_strand = line.strand
149                except Exception, e:
150                    print "Error loading region positions from input line %i: %s" % ( line_count, e )
151                    continue
152           
153            #Write alignment to output file
154            #Output primary species first, if requested
155            if include_primary:
156                output.write( ">%s.%s\n" %( primary_species, primary_name ) )
157                if alignment_strand == "-":
158                    output.write( alignment.get_sequence_reverse_complement( primary_species ) )
159                else:
160                    output.write( alignment.get_sequence( primary_species ) )
161                output.write( "\n" )
162            #Output all remainging species
163            for spec in secondary_species or alignment.get_species_names( skip = primary_species ):
164                if secondary_name:
165                    output.write( ">%s.%s\n" % ( spec, secondary_name ) )
166                else:
167                    output.write( ">%s\n" % ( spec ) )
168                if alignment_strand == "-":
169                    output.write( alignment.get_sequence_reverse_complement( spec ) )
170                else:
171                    output.write( alignment.get_sequence( spec ) )
172                output.write( "\n" )
173           
174            output.write( "\n" )
175           
176            regions_extracted += 1
177       
178        except Exception, e:
179            print "Unexpected error from input line %i: %s" % ( line_count, e )
180            continue
181   
182    #close output file
183    output.close()
184   
185    #remove index file if created during run
186    maf_utilities.remove_temp_index_file( index_filename )
187   
188    #Print message about success for user
189    if regions_extracted > 0:
190        print "%i regions were processed successfully." % ( regions_extracted )
191    else:
192        print "No regions were processed successfully."
193        if line_count > 0 and options.geneBED:
194            print "This tool requires your input file to conform to the 12 column BED standard."
195
196if __name__ == "__main__": __main__()
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。