1 | #!/usr/bin/env python |
---|
2 | import os, sys, tempfile |
---|
3 | |
---|
4 | assert sys.version_info[:2] >= ( 2, 4 ) |
---|
5 | |
---|
6 | def __main__(): |
---|
7 | """ |
---|
8 | Utility script for analyzing Cufflinks data: uses a tracking file (produced by cuffcompare) to filter a GTF file of transcripts (usually the transcripts |
---|
9 | produced by cufflinks). Filtering is done by extracting transcript IDs from tracking file and then filtering the GTF so that the output GTF contains only |
---|
10 | transcript found in the tracking file. Because a tracking file has multiple samples, a sample number is used to filter transcripts for |
---|
11 | a particular sample. |
---|
12 | """ |
---|
13 | # Read parms. |
---|
14 | tracking_file_name = sys.argv[1] |
---|
15 | transcripts_file_name = sys.argv[2] |
---|
16 | output_file_name = sys.argv[3] |
---|
17 | sample_number = int ( sys.argv[4] ) |
---|
18 | |
---|
19 | # Open files. |
---|
20 | transcripts_file = open( transcripts_file_name, 'r' ) |
---|
21 | output_file = open( output_file_name, 'w' ) |
---|
22 | |
---|
23 | # Read transcript IDs from tracking file. |
---|
24 | transcript_ids = {} |
---|
25 | for i, line in enumerate( file( tracking_file_name ) ) : |
---|
26 | # Split line into elements. Line format is |
---|
27 | # [Transfrag ID] [Locus ID] [Ref Gene ID] [Ref Transcript ID] [Class code] [qJ:<gene_id>|<transcript_id>|<FMI>|<FPKM>|<conf_lo>|<conf_hi>] |
---|
28 | line = line.rstrip( '\r\n' ) |
---|
29 | elems = line.split( '\t' ) |
---|
30 | |
---|
31 | # Get transcript info. |
---|
32 | if sample_number == 1: |
---|
33 | transcript_info = elems[4] |
---|
34 | elif sample_number == 2: |
---|
35 | transcript_info = elems[5] |
---|
36 | if not transcript_info.startswith('q'): |
---|
37 | # No transcript for this sample. |
---|
38 | continue |
---|
39 | |
---|
40 | # Get and store transcript id. |
---|
41 | transcript_id = transcript_info.split('|')[1] |
---|
42 | transcript_id = transcript_id.strip('"') |
---|
43 | transcript_ids[transcript_id] = "" |
---|
44 | |
---|
45 | # Filter transcripts file using transcript_ids |
---|
46 | for i, line in enumerate( file( transcripts_file_name ) ): |
---|
47 | # GTF format: chrom source, name, chromStart, chromEnd, score, strand, frame, attributes. |
---|
48 | elems = line.split( '\t' ) |
---|
49 | |
---|
50 | # Get attributes. |
---|
51 | attributes_list = elems[8].split(";") |
---|
52 | attributes = {} |
---|
53 | for name_value_pair in attributes_list: |
---|
54 | pair = name_value_pair.strip().split(" ") |
---|
55 | name = pair[0].strip() |
---|
56 | if name == '': |
---|
57 | continue |
---|
58 | # Need to strip double quote from values |
---|
59 | value = pair[1].strip(" \"") |
---|
60 | attributes[name] = value |
---|
61 | |
---|
62 | # Get element's transcript id. |
---|
63 | transcript_id = attributes['transcript_id'] |
---|
64 | if transcript_id in transcript_ids: |
---|
65 | output_file.write(line) |
---|
66 | |
---|
67 | # Clean up. |
---|
68 | output_file.close() |
---|
69 | |
---|
70 | if __name__ == "__main__": __main__() |
---|