[2] | 1 | #!/usr/bin/env python |
---|
| 2 | #Guruprasad Ananda |
---|
| 3 | |
---|
| 4 | import sys, os, zipfile, tempfile |
---|
| 5 | |
---|
| 6 | QUAL_UPPER_BOUND = 41 |
---|
| 7 | QUAL_LOWER_BOUND = 1 |
---|
| 8 | |
---|
| 9 | def stop_err( msg ): |
---|
| 10 | sys.stderr.write( "%s\n" % msg ) |
---|
| 11 | sys.exit() |
---|
| 12 | |
---|
| 13 | def unzip( filename ): |
---|
| 14 | zip_file = zipfile.ZipFile( filename, 'r' ) |
---|
| 15 | tmpfilename = tempfile.NamedTemporaryFile().name |
---|
| 16 | for name in zip_file.namelist(): |
---|
| 17 | file( tmpfilename, 'a' ).write( zip_file.read( name ) ) |
---|
| 18 | zip_file.close() |
---|
| 19 | return tmpfilename |
---|
| 20 | |
---|
| 21 | def __main__(): |
---|
| 22 | |
---|
| 23 | infile_score_name = sys.argv[1].strip() |
---|
| 24 | fout = open(sys.argv[2].strip(),'r+w') |
---|
| 25 | |
---|
| 26 | infile_is_zipped = False |
---|
| 27 | if zipfile.is_zipfile( infile_score_name ): |
---|
| 28 | infile_is_zipped = True |
---|
| 29 | infile_name = unzip( infile_score_name ) |
---|
| 30 | else: |
---|
| 31 | infile_name = infile_score_name |
---|
| 32 | |
---|
| 33 | readlen = None |
---|
| 34 | j = 0 |
---|
| 35 | for line in file( infile_name ): |
---|
| 36 | line = line.strip() |
---|
| 37 | if not(line) or line.startswith("#") or line.startswith(">"): |
---|
| 38 | continue |
---|
| 39 | elems = line.split() |
---|
| 40 | try: |
---|
| 41 | for item in elems: |
---|
| 42 | assert int(item) |
---|
| 43 | if not(readlen): |
---|
| 44 | readlen = len(elems) |
---|
| 45 | if len(elems) != readlen: |
---|
| 46 | print "Note: Reads in the input dataset are of variable lengths." |
---|
| 47 | j += 1 |
---|
| 48 | except: |
---|
| 49 | invalid_lines += 1 |
---|
| 50 | if j > 10: |
---|
| 51 | break |
---|
| 52 | |
---|
| 53 | invalid_lines = 0 |
---|
| 54 | position_dict = {} |
---|
| 55 | print >>fout, "column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW" |
---|
| 56 | for k,line in enumerate(file( infile_name )): |
---|
| 57 | line = line.strip() |
---|
| 58 | if not(line) or line.startswith("#") or line.startswith(">"): |
---|
| 59 | continue |
---|
| 60 | elems = line.split() |
---|
| 61 | if position_dict == {}: |
---|
| 62 | for pos in range(readlen): |
---|
| 63 | position_dict[pos] = [0]*QUAL_UPPER_BOUND |
---|
| 64 | if len(elems) != readlen: |
---|
| 65 | invalid_lines += 1 |
---|
| 66 | continue |
---|
| 67 | for ind,item in enumerate(elems): |
---|
| 68 | try: |
---|
| 69 | item = int(item) |
---|
| 70 | position_dict[ind][item]+=1 |
---|
| 71 | except: |
---|
| 72 | pass |
---|
| 73 | |
---|
| 74 | invalid_positions = 0 |
---|
| 75 | for pos in position_dict: |
---|
| 76 | carr = position_dict[pos] #count array for position pos |
---|
| 77 | total = sum(carr) #number of bases found in this column. |
---|
| 78 | med_elem = int(round(total/2.0)) |
---|
| 79 | lowest = None #Lowest quality score value found in this column. |
---|
| 80 | highest = None #Highest quality score value found in this column. |
---|
| 81 | median = None #Median quality score value found in this column. |
---|
| 82 | qsum = 0.0 #Sum of quality score values for this column. |
---|
| 83 | q1 = None #1st quartile quality score. |
---|
| 84 | q3 = None #3rd quartile quality score. |
---|
| 85 | q1_elem = int(round((total+1)/4.0)) |
---|
| 86 | q3_elem = int(round((total+1)*3/4.0)) |
---|
| 87 | |
---|
| 88 | try: |
---|
| 89 | for ind,cnt in enumerate(carr): |
---|
| 90 | qsum += ind*cnt |
---|
| 91 | |
---|
| 92 | if cnt!=0: |
---|
| 93 | highest = ind |
---|
| 94 | |
---|
| 95 | if lowest==None and cnt!=0: #first non-zero count |
---|
| 96 | lowest = ind |
---|
| 97 | |
---|
| 98 | if q1==None: |
---|
| 99 | if sum(carr[:ind+1]) >= q1_elem: |
---|
| 100 | q1 = ind |
---|
| 101 | |
---|
| 102 | if median==None: |
---|
| 103 | if sum(carr[:ind+1]) < med_elem: |
---|
| 104 | continue |
---|
| 105 | median = ind |
---|
| 106 | if total%2 == 0: #even number of elements |
---|
| 107 | median2 = median |
---|
| 108 | if sum(carr[:ind+1]) < med_elem+1: |
---|
| 109 | for ind2,elem in enumerate(carr[ind+1:]): |
---|
| 110 | if elem != 0: |
---|
| 111 | median2 = ind+ind2+1 |
---|
| 112 | break |
---|
| 113 | median = (median + median2)/2.0 |
---|
| 114 | |
---|
| 115 | |
---|
| 116 | if q3==None: |
---|
| 117 | if sum(carr[:ind+1]) >= q3_elem: |
---|
| 118 | q3 = ind |
---|
| 119 | |
---|
| 120 | |
---|
| 121 | mean = qsum/total #Mean quality score value for this column. |
---|
| 122 | iqr = q3-q1 |
---|
| 123 | left_whisker = max(q1 - 1.5*iqr,lowest) |
---|
| 124 | right_whisker = min(q3 + 1.5*iqr,highest) |
---|
| 125 | |
---|
| 126 | print >>fout,"%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %(pos+1,total,lowest,highest,qsum,mean,q1,median,q3,iqr,left_whisker,right_whisker) |
---|
| 127 | except: |
---|
| 128 | invalid_positions += 1 |
---|
| 129 | nullvals = ['NA']*11 |
---|
| 130 | print >>fout,"%s\t%s" %(pos+1,'\t'.join(nullvals)) |
---|
| 131 | |
---|
| 132 | if invalid_lines: |
---|
| 133 | print "Skipped %d reads as invalid." %invalid_lines |
---|
| 134 | if invalid_positions: |
---|
| 135 | print "Skipped stats computation for %d read postions." %invalid_positions |
---|
| 136 | |
---|
| 137 | if __name__=="__main__": |
---|
| 138 | __main__() |
---|
| 139 | |
---|
| 140 | |
---|