1 | #!/usr/bin/env python |
---|
2 | #Guruprasad Ananda |
---|
3 | |
---|
4 | import sys, os, zipfile, tempfile |
---|
5 | |
---|
6 | QUAL_UPPER_BOUND = 41 |
---|
7 | QUAL_LOWER_BOUND = 1 |
---|
8 | |
---|
9 | def stop_err( msg ): |
---|
10 | sys.stderr.write( "%s\n" % msg ) |
---|
11 | sys.exit() |
---|
12 | |
---|
13 | def unzip( filename ): |
---|
14 | zip_file = zipfile.ZipFile( filename, 'r' ) |
---|
15 | tmpfilename = tempfile.NamedTemporaryFile().name |
---|
16 | for name in zip_file.namelist(): |
---|
17 | file( tmpfilename, 'a' ).write( zip_file.read( name ) ) |
---|
18 | zip_file.close() |
---|
19 | return tmpfilename |
---|
20 | |
---|
21 | def __main__(): |
---|
22 | |
---|
23 | infile_score_name = sys.argv[1].strip() |
---|
24 | fout = open(sys.argv[2].strip(),'r+w') |
---|
25 | |
---|
26 | infile_is_zipped = False |
---|
27 | if zipfile.is_zipfile( infile_score_name ): |
---|
28 | infile_is_zipped = True |
---|
29 | infile_name = unzip( infile_score_name ) |
---|
30 | else: |
---|
31 | infile_name = infile_score_name |
---|
32 | |
---|
33 | readlen = None |
---|
34 | j = 0 |
---|
35 | for line in file( infile_name ): |
---|
36 | line = line.strip() |
---|
37 | if not(line) or line.startswith("#") or line.startswith(">"): |
---|
38 | continue |
---|
39 | elems = line.split() |
---|
40 | try: |
---|
41 | for item in elems: |
---|
42 | assert int(item) |
---|
43 | if not(readlen): |
---|
44 | readlen = len(elems) |
---|
45 | if len(elems) != readlen: |
---|
46 | print "Note: Reads in the input dataset are of variable lengths." |
---|
47 | j += 1 |
---|
48 | except: |
---|
49 | invalid_lines += 1 |
---|
50 | if j > 10: |
---|
51 | break |
---|
52 | |
---|
53 | invalid_lines = 0 |
---|
54 | position_dict = {} |
---|
55 | print >>fout, "column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW" |
---|
56 | for k,line in enumerate(file( infile_name )): |
---|
57 | line = line.strip() |
---|
58 | if not(line) or line.startswith("#") or line.startswith(">"): |
---|
59 | continue |
---|
60 | elems = line.split() |
---|
61 | if position_dict == {}: |
---|
62 | for pos in range(readlen): |
---|
63 | position_dict[pos] = [0]*QUAL_UPPER_BOUND |
---|
64 | if len(elems) != readlen: |
---|
65 | invalid_lines += 1 |
---|
66 | continue |
---|
67 | for ind,item in enumerate(elems): |
---|
68 | try: |
---|
69 | item = int(item) |
---|
70 | position_dict[ind][item]+=1 |
---|
71 | except: |
---|
72 | pass |
---|
73 | |
---|
74 | invalid_positions = 0 |
---|
75 | for pos in position_dict: |
---|
76 | carr = position_dict[pos] #count array for position pos |
---|
77 | total = sum(carr) #number of bases found in this column. |
---|
78 | med_elem = int(round(total/2.0)) |
---|
79 | lowest = None #Lowest quality score value found in this column. |
---|
80 | highest = None #Highest quality score value found in this column. |
---|
81 | median = None #Median quality score value found in this column. |
---|
82 | qsum = 0.0 #Sum of quality score values for this column. |
---|
83 | q1 = None #1st quartile quality score. |
---|
84 | q3 = None #3rd quartile quality score. |
---|
85 | q1_elem = int(round((total+1)/4.0)) |
---|
86 | q3_elem = int(round((total+1)*3/4.0)) |
---|
87 | |
---|
88 | try: |
---|
89 | for ind,cnt in enumerate(carr): |
---|
90 | qsum += ind*cnt |
---|
91 | |
---|
92 | if cnt!=0: |
---|
93 | highest = ind |
---|
94 | |
---|
95 | if lowest==None and cnt!=0: #first non-zero count |
---|
96 | lowest = ind |
---|
97 | |
---|
98 | if q1==None: |
---|
99 | if sum(carr[:ind+1]) >= q1_elem: |
---|
100 | q1 = ind |
---|
101 | |
---|
102 | if median==None: |
---|
103 | if sum(carr[:ind+1]) < med_elem: |
---|
104 | continue |
---|
105 | median = ind |
---|
106 | if total%2 == 0: #even number of elements |
---|
107 | median2 = median |
---|
108 | if sum(carr[:ind+1]) < med_elem+1: |
---|
109 | for ind2,elem in enumerate(carr[ind+1:]): |
---|
110 | if elem != 0: |
---|
111 | median2 = ind+ind2+1 |
---|
112 | break |
---|
113 | median = (median + median2)/2.0 |
---|
114 | |
---|
115 | |
---|
116 | if q3==None: |
---|
117 | if sum(carr[:ind+1]) >= q3_elem: |
---|
118 | q3 = ind |
---|
119 | |
---|
120 | |
---|
121 | mean = qsum/total #Mean quality score value for this column. |
---|
122 | iqr = q3-q1 |
---|
123 | left_whisker = max(q1 - 1.5*iqr,lowest) |
---|
124 | right_whisker = min(q3 + 1.5*iqr,highest) |
---|
125 | |
---|
126 | print >>fout,"%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %(pos+1,total,lowest,highest,qsum,mean,q1,median,q3,iqr,left_whisker,right_whisker) |
---|
127 | except: |
---|
128 | invalid_positions += 1 |
---|
129 | nullvals = ['NA']*11 |
---|
130 | print >>fout,"%s\t%s" %(pos+1,'\t'.join(nullvals)) |
---|
131 | |
---|
132 | if invalid_lines: |
---|
133 | print "Skipped %d reads as invalid." %invalid_lines |
---|
134 | if invalid_positions: |
---|
135 | print "Skipped stats computation for %d read postions." %invalid_positions |
---|
136 | |
---|
137 | if __name__=="__main__": |
---|
138 | __main__() |
---|
139 | |
---|
140 | |
---|