#!/usr/bin/env python """ tax_read_grouping.py finds reads that only hit one taxonomic group. For example, consider the folliowing example: read1 mammalia read1 insecta read2 insecta in this case only read2 will be selected becuase it stays within insecta This program takes the following options: file in taxonomy format - dataset that complies with Galaxy's taxonomy format id column - integer specifying the number of column containing seq id (starting with 1) taxonomic ranks - a comma separated list of ranks from this list: superkingdom kingdom subkingdom superphylum phylum subphylum superclass class subclass superorder order suborder superfamily family subfamily tribe subtribe genus subgenus species subspecies output format - reads or counts """ from galaxy import eggs import pkg_resources pkg_resources.require( 'pysqlite' ) from pysqlite2 import dbapi2 as sqlite import string, sys, tempfile # This dictionary maps taxonomic ranks to fields of Taxonomy file taxRank = { 'root' :2, 'superkingdom':3, 'kingdom' :4, 'subkingdom' :5, 'superphylum' :6, 'phylum' :7, 'subphylum' :8, 'superclass' :9, 'class' :10, 'subclass' :11, 'superorder' :12, 'ord' :13, 'suborder' :14, 'superfamily' :15, 'family' :16, 'subfamily' :17, 'tribe' :18, 'subtribe' :19, 'genus' :20, 'subgenus' :21, 'species' :22, 'subspecies' :23, 'order' :13 } def stop_err(msg): sys.stderr.write(msg) sys.exit() db = tempfile.NamedTemporaryFile('w') try: con = sqlite.connect(db.name) cur = con.cursor() except: stop_err('Cannot connect to %s\n') % db.name try: tax_file = open(sys.argv[1], 'r') id_col = int(sys.argv[2]) - 1 taxa = string.split(sys.argv[3].rstrip(),',') if sys.argv[4] == 'reads': out_format = True elif sys.argv[4] == 'counts': out_format = False else: stop_err('Please specify "reads" or "counts" for output format\n') out_file = open(sys.argv[5], 'w') except: stop_err('Check arguments\n') if taxa[0] == 'None': stop_err('Please, use checkboxes to specify taxonomic ranks.\n') sql = "" for i in range(len(taxa)): if taxa[i] == 'order': taxa[i] = 'ord' # SQL does not like fields to be named 'order' sql += '%s text, ' % taxa[i] sql = sql.strip(', ') sql = 'create table tax (name varchar(50) not null, ' + sql + ')' cur.execute(sql) invalid_line_number = 0 try: for line in tax_file: fields = string.split(line.rstrip(), '\t') if len(fields) < 24: invalid_line_number += 1 continue # Skipping malformed taxonomy lines val_string = '"' + fields[id_col] + '", ' for rank in taxa: taxon = fields[taxRank[rank]] val_string += '"%s", ' % taxon val_string = val_string.strip(', ') val_string = "insert into tax values(" + val_string + ")" cur.execute(val_string) except Exception, e: stop_err('%s\n' % e) tax_file.close() try: for rank in taxa: cur.execute('create temporary table %s (name varchar(50), id text, rank text)' % rank ) cur.execute('insert into %s select name, name || %s as id, %s from tax group by id' % ( rank, rank, rank ) ) cur.execute('create temporary table %s_count(name varchar(50), id text, rank text, N int)' % rank) cur.execute('insert into %s_count select name, id, rank, count(*) from %s group by name' % ( rank, rank) ) if rank == 'ord': rankName = 'order' else: rankName = rank if out_format: cur.execute('select name,rank from %s_count where N = 1 and length(rank)>1' % rank) for item in cur.fetchall(): out_string = '%s\t%s\t' % ( item[0], item[1] ) out_string += rankName print >>out_file, out_string else: cur.execute('select rank, count(*) from %s_count where N = 1 and length(rank)>1 group by rank' % rank) for item in cur.fetchall(): out_string = '%s\t%s\t' % ( item[0], item[1] ) out_string += rankName print >>out_file, out_string except Exception, e: stop_err("%s\n" % e)