[2] | 1 | #!/usr/bin/env python |
---|
| 2 | import os, sys, tempfile |
---|
| 3 | |
---|
| 4 | assert sys.version_info[:2] >= ( 2, 4 ) |
---|
| 5 | |
---|
| 6 | def __main__(): |
---|
| 7 | """ |
---|
| 8 | Utility script for analyzing Cufflinks data: uses a tracking file (produced by cuffcompare) to filter a GTF file of transcripts (usually the transcripts |
---|
| 9 | produced by cufflinks). Filtering is done by extracting transcript IDs from tracking file and then filtering the GTF so that the output GTF contains only |
---|
| 10 | transcript found in the tracking file. Because a tracking file has multiple samples, a sample number is used to filter transcripts for |
---|
| 11 | a particular sample. |
---|
| 12 | """ |
---|
| 13 | # Read parms. |
---|
| 14 | tracking_file_name = sys.argv[1] |
---|
| 15 | transcripts_file_name = sys.argv[2] |
---|
| 16 | output_file_name = sys.argv[3] |
---|
| 17 | sample_number = int ( sys.argv[4] ) |
---|
| 18 | |
---|
| 19 | # Open files. |
---|
| 20 | transcripts_file = open( transcripts_file_name, 'r' ) |
---|
| 21 | output_file = open( output_file_name, 'w' ) |
---|
| 22 | |
---|
| 23 | # Read transcript IDs from tracking file. |
---|
| 24 | transcript_ids = {} |
---|
| 25 | for i, line in enumerate( file( tracking_file_name ) ) : |
---|
| 26 | # Split line into elements. Line format is |
---|
| 27 | # [Transfrag ID] [Locus ID] [Ref Gene ID] [Ref Transcript ID] [Class code] [qJ:<gene_id>|<transcript_id>|<FMI>|<FPKM>|<conf_lo>|<conf_hi>] |
---|
| 28 | line = line.rstrip( '\r\n' ) |
---|
| 29 | elems = line.split( '\t' ) |
---|
| 30 | |
---|
| 31 | # Get transcript info. |
---|
| 32 | if sample_number == 1: |
---|
| 33 | transcript_info = elems[4] |
---|
| 34 | elif sample_number == 2: |
---|
| 35 | transcript_info = elems[5] |
---|
| 36 | if not transcript_info.startswith('q'): |
---|
| 37 | # No transcript for this sample. |
---|
| 38 | continue |
---|
| 39 | |
---|
| 40 | # Get and store transcript id. |
---|
| 41 | transcript_id = transcript_info.split('|')[1] |
---|
| 42 | transcript_id = transcript_id.strip('"') |
---|
| 43 | transcript_ids[transcript_id] = "" |
---|
| 44 | |
---|
| 45 | # Filter transcripts file using transcript_ids |
---|
| 46 | for i, line in enumerate( file( transcripts_file_name ) ): |
---|
| 47 | # GTF format: chrom source, name, chromStart, chromEnd, score, strand, frame, attributes. |
---|
| 48 | elems = line.split( '\t' ) |
---|
| 49 | |
---|
| 50 | # Get attributes. |
---|
| 51 | attributes_list = elems[8].split(";") |
---|
| 52 | attributes = {} |
---|
| 53 | for name_value_pair in attributes_list: |
---|
| 54 | pair = name_value_pair.strip().split(" ") |
---|
| 55 | name = pair[0].strip() |
---|
| 56 | if name == '': |
---|
| 57 | continue |
---|
| 58 | # Need to strip double quote from values |
---|
| 59 | value = pair[1].strip(" \"") |
---|
| 60 | attributes[name] = value |
---|
| 61 | |
---|
| 62 | # Get element's transcript id. |
---|
| 63 | transcript_id = attributes['transcript_id'] |
---|
| 64 | if transcript_id in transcript_ids: |
---|
| 65 | output_file.write(line) |
---|
| 66 | |
---|
| 67 | # Clean up. |
---|
| 68 | output_file.close() |
---|
| 69 | |
---|
| 70 | if __name__ == "__main__": __main__() |
---|