root/galaxy-central/lib/galaxy/datatypes/converters/interval_to_bedstrict_converter.py @ 2

リビジョン 2, 8.0 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#!/usr/bin/env python
2#Dan Blankenberg
3
4import sys
5from galaxy import eggs
6import pkg_resources; pkg_resources.require( "bx-python" )
7import bx.intervals.io
8
9assert sys.version_info[:2] >= ( 2, 4 )
10
11def stop_err( msg ):
12    sys.stderr.write( msg )
13    sys.exit()
14
15def force_bed_field_count( fields, region_count, force_num_columns ):
16    if force_num_columns >= 4 and len( fields ) < 4:
17        fields.append( 'region_%i' % ( region_count ) )
18    if force_num_columns >= 5 and len( fields ) < 5:
19        fields.append( '0' )
20    if force_num_columns >= 6 and len( fields ) < 6:
21        fields.append( '+' )
22    if force_num_columns >= 7 and len( fields ) < 7:
23        fields.append( fields[1] )
24    if force_num_columns >= 8 and len( fields ) < 8:
25        fields.append( fields[2] )
26    if force_num_columns >= 9 and len( fields ) < 9:
27        fields.append( '0' )
28    if force_num_columns >= 10 and len( fields ) < 10:
29        fields.append( '0' )
30    if force_num_columns >= 11 and len( fields ) < 11:
31        fields.append( ',' )
32    if force_num_columns >= 12 and len( fields ) < 12:
33        fields.append( ',' )
34    return fields[:force_num_columns]
35
36def __main__():
37    output_name = sys.argv[1]
38    input_name = sys.argv[2]
39    try:
40        chromCol = int( sys.argv[3] ) - 1
41    except:
42        stop_err( "'%s' is an invalid chrom column, correct the column settings before attempting to convert the data format." % str( sys.argv[3] ) )
43    try:
44        startCol = int( sys.argv[4] ) - 1
45    except:
46        stop_err( "'%s' is an invalid start column, correct the column settings before attempting to convert the data format." % str( sys.argv[4] ) )
47    try:
48        endCol = int( sys.argv[5] ) - 1
49    except:
50        stop_err( "'%s' is an invalid end column, correct the column settings before attempting to convert the data format." % str( sys.argv[5] ) )
51    try:
52        strandCol = int( sys.argv[6] ) - 1
53    except:
54        strandCol = -1
55    try:
56        nameCol = int( sys.argv[7] ) - 1
57    except:
58        nameCol = -1
59    try:
60        extension = sys.argv[8]
61    except:
62        extension = 'interval' #default extension
63    try:
64        force_num_columns = int( sys.argv[9] )
65    except:
66        force_num_columns = None
67   
68    skipped_lines = 0
69    first_skipped_line = None
70    out = open( output_name,'w' )
71    count = 0
72    #does file already conform to bed strict?
73    #if so, we want to keep extended columns, otherwise we'll create a generic 6 column bed file
74    strict_bed = True
75    if extension in [ 'bed', 'bedstrict', 'bed6', 'bed12' ] and ( chromCol, startCol, endCol) == ( 0, 1, 2) and ( nameCol < 0 or nameCol == 3 ) and ( strandCol < 0 or strandCol == 5 ):
76        for count, line in enumerate( open( input_name ) ):
77            line = line.rstrip( '\n\r' )
78            if line == "" or line.startswith("#"):
79                skipped_lines += 1
80                if first_skipped_line is None:
81                    first_skipped_line = count + 1
82                continue
83            fields = line.split('\t')
84            try:
85                assert len( fields ) >= 3, 'A BED file requires at least 3 columns' #we can't fix this
86                if len(fields) > 12:
87                    strict_bed = False
88                    break
89                #name (fields[3]) can be anything, no verification needed
90                if len( fields ) > 4:
91                    float( fields[4] ) #score - A score between 0 and 1000. If the track line useScore attribute is set to 1 for this annotation data set, the score value will determine the level of gray in which this feature is displayed (higher numbers = darker gray).
92                    if len( fields ) > 5:
93                        assert fields[5] in [ '+', '-' ], 'Invalid strand' #strand - Defines the strand - either '+' or '-'.
94                        if len( fields ) > 6:
95                            int( fields[6] ) #thickStart - The starting position at which the feature is drawn thickly (for example, the start codon in gene displays).
96                            if len( fields ) > 7:
97                                int( fields[7] ) #thickEnd - The ending position at which the feature is drawn thickly (for example, the stop codon in gene displays).
98                                if len( fields ) > 8: 
99                                    if fields[8] != '0': #itemRgb - An RGB value of the form R,G,B (e.g. 255,0,0). If the track line itemRgb attribute is set to "On", this RBG value will determine the display color of the data contained in this BED line. NOTE: It is recommended that a simple color scheme (eight colors or less) be used with this attribute to avoid overwhelming the color resources of the Genome Browser and your Internet browser.
100                                        fields2 = fields[8].split( ',' )
101                                        assert len( fields2 ) == 3, 'RGB value must be 0 or have length of 3'
102                                        for field in fields2:
103                                            int( field ) #rgb values are integers
104                                    if len( fields ) > 9:
105                                        int( fields[9] ) #blockCount - The number of blocks (exons) in the BED line.
106                                        if len( fields ) > 10:
107                                            if fields[10] != ',': #blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount.
108                                                fields2 = fields[10].rstrip( "," ).split( "," ) #remove trailing comma and split on comma
109                                                for field in fields2:
110                                                    int( field )
111                                            if len( fields ) > 11:
112                                                if fields[11] != ',': #blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount.
113                                                    fields2 = fields[11].rstrip( "," ).split( "," ) #remove trailing comma and split on comma
114                                                    for field in fields2:
115                                                        int( field )
116            except:
117                strict_bed = False
118                break
119            if force_num_columns is not None and len( fields ) != force_num_columns:
120                line = '\t'.join( force_bed_field_count( fields, count, force_num_columns ) )
121            out.write( "%s\n" % line )
122    else:
123        strict_bed = False
124    out.close()
125   
126    if not strict_bed:
127        skipped_lines = 0
128        first_skipped_line = None
129        out = open( output_name,'w' )
130        count = 0
131        for count, region in enumerate( bx.intervals.io.NiceReaderWrapper( open( input_name, 'r' ), chrom_col=chromCol, start_col=startCol, end_col=endCol, strand_col=strandCol, fix_strand=True, return_header=False, return_comments=False ) ):
132            try:
133                if nameCol >= 0:
134                    name = region.fields[nameCol]
135                else:
136                    raise IndexError
137            except:
138                name = "region_%i" % count
139            try:
140                fields = map( str, [ region.chrom, region.start, region.end, name, 0, region.strand ] )
141                if force_num_columns is not None and len( fields ) != force_num_columns:
142                    fields = force_bed_field_count( fields, count, force_num_columns )
143                out.write( "%s\n" % '\t'.join( fields ) )
144            except:
145                skipped_lines += 1
146                if first_skipped_line is None:
147                    first_skipped_line = count + 1
148        out.close()
149    print "%i regions converted to BED." % ( count + 1 - skipped_lines )
150    if skipped_lines > 0:
151        print "Skipped %d blank or invalid lines starting with line # %d." % ( skipped_lines, first_skipped_line )
152
153if __name__ == "__main__": __main__()
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。