root/galaxy-central/tools/ngs_rna/tophat_wrapper.xml

リビジョン 2, 27.7 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1<tool id="tophat" name="Tophat" version="1.1.1">
2    <description>Find splice junctions using RNA-seq data</description>
3    <requirements>
4        <requirement type="package">tophat</requirement>
5    </requirements>
6    <command interpreter="python">
7        tophat_wrapper.py
8            ## Change this to accommodate the number of threads you have available.
9            --num-threads="4"
10           
11            ## Provide outputs.
12            --coverage-output=$coverage
13            --junctions-output=$junctions
14            --hits-output=$accepted_hits
15           
16            ## Handle reference file.
17            #if $refGenomeSource.genomeSource == "history":
18                --own-file=$refGenomeSource.ownFile
19                --indexes-path="None"
20            #else:
21                --own-file="None"
22                --indexes-path=$refGenomeSource.index
23            #end if
24           
25            ## Are reads single-end or paired?
26            --single-paired=$singlePaired.sPaired
27           
28            ## First input file always required.
29            --input1=$singlePaired.input1
30           
31            ## Set parms based on whether reads are single-end or paired.
32            #if $singlePaired.sPaired == "single":
33                --input2="None"
34                 -r "None"
35                 --settings=$singlePaired.sParams.sSettingsType
36                 #if $singlePaired.sParams.sSettingsType == "full":
37                   --mate-std-dev="None"
38                   -a $singlePaired.sParams.anchor_length
39                   -m $singlePaired.sParams.splice_mismatches
40                   -i $singlePaired.sParams.min_intron_length
41                   -I $singlePaired.sParams.max_intron_length
42                   -F $singlePaired.sParams.junction_filter
43                   -g $singlePaired.sParams.max_multihits
44                   --min-segment-intron $singlePaired.sParams.min_segment_intron
45                   --max-segment-intron $singlePaired.sParams.max_segment_intron
46                   --seg-mismatches=$singlePaired.sParams.seg_mismatches
47                   --seg-length=$singlePaired.sParams.seg_length
48                   #if $singlePaired.sParams.closure_search.use_search == "Yes":
49                        --closure-search
50                        --min-closure-exon $singlePaired.sParams.closure_search.min_closure_exon
51                        --min-closure-intron $singlePaired.sParams.closure_search.min_closure_intron
52                        --max-closure-intron $singlePaired.sParams.closure_search.max_closure_intron
53                   #else:
54                        --no-closure-search
55                   #end if
56                   #if $singlePaired.sParams.coverage_search.use_search == "Yes":
57                        --coverage-search
58                        --min-coverage-intron $singlePaired.sParams.coverage_search.min_coverage_intron
59                        --max-coverage-intron $singlePaired.sParams.coverage_search.max_coverage_intron
60                   #else:
61                        --no-coverage-search
62                   #end if
63                   ## No idea why the type conversion is necessary, but it seems to be.
64                   #if str ($singlePaired.sParams.microexon_search) == "Yes":
65                        --microexon-search
66                   #end if
67                 #end if
68             #else:
69                 --input2=$singlePaired.input2
70                 -r $singlePaired.mate_inner_distance
71                 --settings=$singlePaired.pParams.pSettingsType
72                 #if $singlePaired.pParams.pSettingsType == "full":
73                   --mate-std-dev=$singlePaired.pParams.mate_std_dev
74                   -a $singlePaired.pParams.anchor_length
75                   -m $singlePaired.pParams.splice_mismatches
76                   -i $singlePaired.pParams.min_intron_length
77                   -I $singlePaired.pParams.max_intron_length
78                   -F $singlePaired.pParams.junction_filter
79                   -g $singlePaired.pParams.max_multihits
80                   --min-segment-intron $singlePaired.pParams.min_segment_intron
81                   --max-segment-intron $singlePaired.pParams.max_segment_intron
82                   --seg-mismatches=$singlePaired.pParams.seg_mismatches
83                   --seg-length=$singlePaired.pParams.seg_length
84                   #if $singlePaired.pParams.closure_search.use_search == "Yes":
85                        --closure-search
86                        --min-closure-exon $singlePaired.pParams.closure_search.min_closure_exon
87                        --min-closure-intron $singlePaired.pParams.closure_search.min_closure_intron
88                        --max-closure-intron $singlePaired.pParams.closure_search.max_closure_intron
89                   #else:
90                        --no-closure-search
91                   #end if
92                   #if $singlePaired.pParams.coverage_search.use_search == "Yes":
93                        --coverage-search
94                        --min-coverage-intron $singlePaired.pParams.coverage_search.min_coverage_intron
95                        --max-coverage-intron $singlePaired.pParams.coverage_search.max_coverage_intron
96                   #else:
97                        --no-coverage-search
98                   #end if
99                   ## No idea why the type conversion is necessary, but it seems to be.
100                   #if str ($singlePaired.pParams.microexon_search) == "Yes":
101                        --microexon-search
102                   #end if
103                 #end if
104             #end if
105    </command>
106    <inputs>
107        <conditional name="refGenomeSource">
108          <param name="genomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options">
109            <option value="indexed">Use a built-in index</option>
110            <option value="history">Use one from the history</option>
111          </param>
112          <when value="indexed">
113            <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team">
114              <options from_file="bowtie_indices.loc">
115                <column name="value" index="1" />
116                <column name="name" index="0" />
117              </options>
118            </param>
119          </when>
120          <when value="history">
121            <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" />
122          </when>  <!-- history -->
123        </conditional>  <!-- refGenomeSource -->
124        <conditional name="singlePaired">
125            <param name="sPaired" type="select" label="Is this library mate-paired?">
126              <option value="single">Single-end</option>
127              <option value="paired">Paired-end</option>
128            </param>
129            <when value="single">
130              <param format="fastqsanger" name="input1" type="data" label="RNA-Seq FASTQ file" help="Must have Sanger-scaled quality values with ASCII offset 33"/>
131              <conditional name="sParams">
132                <param name="sSettingsType" type="select" label="TopHat settings to use" help="You can use the default settings or set custom values for any of Tophat's parameters.">
133                  <option value="preSet">Use Defaults</option>
134                  <option value="full">Full parameter list</option>
135                </param>
136                <when value="preSet" />
137                <!-- Full/advanced parms. -->
138                <when value="full">
139                  <param name="anchor_length" type="integer" value="8" label="Anchor length (at least 3)" help="Report junctions spanned by reads with at least this many bases on each side of the junction." />
140                  <param name="splice_mismatches" type="integer" value="0" label="Maximum number of mismatches that can appear in the anchor region of spliced alignment" />
141                  <param name="min_intron_length" type="integer" value="70" label="The minimum intron length" help="TopHat will ignore donor/acceptor pairs closer than this many bases apart." />
142                  <param name="max_intron_length" type="integer" value="500000" label="The maximum intron length" help="When searching for junctions ab initio, TopHat will ignore donor/acceptor pairs farther than this many bases apart, except when such a pair is supported by a split segment alignment of a long read." />
143                  <param name="junction_filter" type="float" value="0.15" label="Minimum isoform fraction: filter out junctions supported by too few alignments (number of reads divided by average depth of coverage)" help="0.0 to 1.0 (0 to turn off)" />
144                  <param name="max_multihits" type="integer" value="40" label="Maximum number of alignments to be allowed" />
145                  <param name="min_segment_intron" type="integer" value="50" label="Minimum intron length that may be found during split-segment (default) search" />
146                  <param name="max_segment_intron" type="integer" value="500000" label="Maximum intron length that may be found during split-segment (default) search" />
147                  <param name="seg_mismatches" type="integer" value="2" label="Number of mismatches allowed in each segment alignment for reads mapped independently" />
148                  <param name="seg_length" type="integer" value="25" label="Minimum length of read segments" />
149                  <!-- Closure search. -->
150                  <conditional name="closure_search">
151                    <param name="use_search" type="select" label="Use Closure Search">
152                      <option value="No">No</option>
153                      <option value="Yes">Yes</option>
154                    </param>
155                    <when value="Yes">
156                        <param name="min_closure_exon" type="integer" value="50" label="During closure search for paired end reads, exonic hops in the potential splice graph must be at least this long. The default is 50." />
157                        <param name="min_closure_intron" type="integer" value="50" label="Minimum intron length that may be found during closure search" />
158                        <param name="max_closure_intron" type="integer" value="5000" label="Maximum intron length that may be found during closure search" />
159                    </when>
160                    <when value="No" />
161                  </conditional>
162                  <!-- Coverage search. -->
163                  <conditional name="coverage_search">
164                    <param name="use_search" type="select" label="Use Coverage Search">
165                      <option value="No">No</option>
166                      <option value="Yes">Yes</option>
167                    </param>
168                    <when value="Yes">
169                        <param name="min_coverage_intron" type="integer" value="50" label="Minimum intron length that may be found during coverage search" />
170                        <param name="max_coverage_intron" type="integer" value="20000" label="Maximum intron length that may be found during coverage search" />
171                    </when>
172                    <when value="No" />
173                  </conditional>     
174                  <param name="microexon_search" type="select" label="Use Microexon Search" help="With this option, the pipeline will attempt to find alignments incident to microexons. Works only for reads 50bp or longer.">
175                    <option value="No">No</option>
176                    <option value="Yes">Yes</option>
177                  </param>
178                </when>  <!-- full -->
179              </conditional>  <!-- sParams -->
180            </when>
181            <when value="paired">
182              <param format="fastqsanger" name="input1" type="data" label="RNA-Seq FASTQ file, forward reads" help="Must have Sanger-scaled quality values with ASCII offset 33"/>
183              <param format="fastqsanger" name="input2" type="data" label="RNA-Seq FASTQ file, reverse reads" help="Must have Sanger-scaled quality values with ASCII offset 33"/>
184              <param name="mate_inner_distance" type="integer" value="20" label="Mean Inner Distance between Mate Pairs" />
185              <conditional name="pParams">
186                <param name="pSettingsType" type="select" label="TopHat settings to use" help="For most mapping needs use Commonly used settings. If you want full control use Full parameter list">
187                  <option value="preSet">Commonly used</option>
188                  <option value="full">Full parameter list</option>
189                </param>
190                <when value="preSet" />
191                <!-- Full/advanced parms. -->
192                <when value="full">
193                    <param name="mate_std_dev" type="integer" value="20" label="Std. Dev for Distance between Mate Pairs"  help="The standard deviation for the distribution on inner distances between mate pairs."/>
194                  <param name="anchor_length" type="integer" value="8" label="Anchor length (at least 3)" help="Report junctions spanned by reads with at least this many bases on each side of the junction." />
195                  <param name="splice_mismatches" type="integer" value="0" label="Maximum number of mismatches that can appear in the anchor region of spliced alignment" />
196                  <param name="min_intron_length" type="integer" value="70" label="The minimum intron length" help="TopHat will ignore donor/acceptor pairs closer than this many bases apart." />
197                  <param name="max_intron_length" type="integer" value="500000" label="The maximum intron length" help="When searching for junctions ab initio, TopHat will ignore donor/acceptor pairs farther than this many bases apart, except when such a pair is supported by a split segment alignment of a long read." />
198                  <param name="junction_filter" type="float" value="0.15" label="Minimum isoform fraction: filter out junctions supported by too few alignments (number of reads divided by average depth of coverage)" help="0.0 to 1.0 (0 to turn off)" />
199                  <param name="max_multihits" type="integer" value="40" label="Maximum number of alignments to be allowed" />
200                  <param name="min_segment_intron" type="integer" value="50" label="Minimum intron length that may be found during split-segment (default) search" />
201                  <param name="max_segment_intron" type="integer" value="500000" label="Maximum intron length that may be found during split-segment (default) search" />
202                  <param name="seg_mismatches" type="integer" value="2" label="Number of mismatches allowed in each segment alignment for reads mapped independently" />
203                  <param name="seg_length" type="integer" value="25" label="Minimum length of read segments" />
204                  <!-- Closure search. -->
205                  <conditional name="closure_search">
206                    <param name="use_search" type="select" label="Use Closure Search">
207                      <option value="No">No</option>
208                      <option value="Yes">Yes</option>
209                    </param>
210                    <when value="Yes">
211                        <param name="min_closure_exon" type="integer" value="50" label="During closure search for paired end reads, exonic hops in the potential splice graph must be at least this long. The default is 50." />
212                        <param name="min_closure_intron" type="integer" value="50" label="Minimum intron length that may be found during closure search" />
213                        <param name="max_closure_intron" type="integer" value="5000" label="Maximum intron length that may be found during closure search" />
214                    </when>
215                    <when value="No" />
216                  </conditional>
217                  <!-- Coverage search. -->
218                  <conditional name="coverage_search">
219                    <param name="use_search" type="select" label="Use Coverage Search">
220                      <option value="No">No</option>
221                      <option value="Yes">Yes</option>
222                    </param>
223                    <when value="Yes">
224                        <param name="min_coverage_intron" type="integer" value="50" label="Minimum intron length that may be found during coverage search" />
225                        <param name="max_coverage_intron" type="integer" value="20000" label="Maximum intron length that may be found during coverage search" />
226                    </when>
227                    <when value="No" />
228                  </conditional>     
229                  <param name="microexon_search" type="select" label="Use Microexon Search" help="With this option, the pipeline will attempt to find alignments incident to microexons. Works only for reads 50bp or longer.">
230                    <option value="No">No</option>
231                    <option value="Yes">Yes</option>
232                  </param>
233                </when>  <!-- full -->
234              </conditional>  <!-- pParams -->
235            </when>
236        </conditional>
237    </inputs>
238
239    <outputs>
240        <data format="sam" name="accepted_hits" label="${tool.name} on ${on_string}: accepted_hits"/>
241        <data format="bedgraph" name="coverage" label="${tool.name} on ${on_string}: coverage"/>
242        <data format="bed" name="junctions" label="${tool.name} on ${on_string}: splice junctions"/>
243    </outputs>
244
245    <tests>
246<!--        <test>
247            <param name="genomeSource" value="indexed"/>
248            <param name="index" value="equCab2chrM"/>
249            <param name="sPaired" value="single"/>
250            <param name="input1" ftype="fastqsanger" value="tophat_in1.fq"/>
251            <param name="sSettingsType" value="preSet"/>
252-->            <!--
253            Can't test this right now because first lines of file are run-specific.
254            <output name="accepted_hits" file="tophat_out1.sam"/>
255            -->
256<!--            <output name="coverage" file="tophat_out2.wig"/>
257            <output name="junctions" file="tophat_out3.bed"/>
258        </test>
259-->     
260        <!-- Test using test data: paired-end reads, index from history. -->
261        <test>
262            <param name="genomeSource" value="history"/>
263            <param name="ownFile" ftype="fasta" value="tophat_in3.fa"/>
264            <param name="sPaired" value="paired"/>
265            <param name="input1" ftype="fastqsanger" value="tophat_in1.fq"/>
266            <param name="input2" ftype="fastqsanger" value="tophat_in2.fq"/>
267            <param name="mate_inner_distance" value="20"/>
268            <param name="pSettingsType" value="preSet"/>
269            <output name="accepted_hits" file="tophat_out1.sam" sort="True"/>
270            <output name="coverage" file="tophat_out2.wig"/>
271            <output name="junctions" file="tophat_out3.bed"/>
272        </test>
273<!--        <test>
274            <param name="genomeSource" value="history"/>
275            <param name="ownFile" value="phiX.fasta"/>
276            <param name="sPaired" value="single"/>
277            <param name="input1" ftype="fastqsanger" value="tophat_in1.fq"/>
278            <param name="sSettingsType" value="full"/>
279            <param name="anchor_length" value="8"/>
280            <param name="splice_mismatches" value="0"/>
281            <param name="min_intron_length" value="70"/>
282            <param name="max_intron_length" value="500000"/>
283            <param name="quals_scale" value="default"/>
284            <param name="junction_filter" value="0.15"/>
285            <param name="max_multihits" value="40"/>
286            <param name="min_segment_intron" value="50" />
287            <param name="max_segment_intron" value="500000" />
288            <param name="seg_mismatches" value="2"/>
289            <param name="seg_length" value="25"/>
290-->            <!--
291            Can't test this right now because first lines of file are run-specific.
292            <output name="accepted_hits" file="tophat_out1.sam"/>
293            -->
294<!--            <output name="coverage" file="tophat_out2.wig"/>
295            <output name="junctions" file="tophat_out3.bed"/>
296        </test>
297        <test>
298            <param name="genomeSource" value="indexed"/>
299            <param name="index" value="equCab2chrM"/>
300            <param name="sPaired" value="paired"/>
301            <param name="input1" ftype="fastqsanger" value="tophat_in1.fq"/>
302            <param name="input2" ftype="fastqsanger" value="tophat_in2.fq"/>
303            <param name="mate_inner_distance" value="20"/>
304            <param name="pSettingsType" value="full"/>
305            <param name="mate_std_dev" value="20"/>
306            <param name="anchor_length" value="8"/>
307            <param name="splice_mismatches" value="0"/>
308            <param name="min_intron_length" value="70"/>
309            <param name="max_intron_length" value="500000"/>
310            <param name="quals_scale" value="default"/>
311            <param name="junction_filter" value="0.15"/>
312            <param name="max_multihits" value="40"/>
313            <param name="min_coverage_intron" value="50" />
314            <param name="max_coverage_intron" value="20000" />
315            <param name="seg_mismatches" value="2"/>
316            <param name="seg_length" value="25"/>
317-->            <!--
318            Can't test this right now because first lines of file are run-specific.
319            <output name="accepted_hits" file="tophat_out1.sam"/>
320            -->
321<!--            <output name="coverage" file="tophat_out2.wig"/>
322            <output name="junctions" file="tophat_out3.bed"/>
323        </test>
324-->    </tests>
325
326    <help>
327**Tophat Overview**
328
329TopHat_ is a fast splice junction mapper for RNA-Seq reads. It aligns RNA-Seq reads to mammalian-sized genomes using the ultra high-throughput short read aligner Bowtie, and then analyzes the mapping results to identify splice junctions between exons. Please cite: Trapnell, C., Pachter, L. and Salzberg, S.L. TopHat: discovering splice junctions with RNA-Seq. Bioinformatics 25, 1105-1111 (2009).       
330
331.. _Tophat: http://tophat.cbcb.umd.edu/
332       
333------
334
335**Know what you are doing**
336
337.. class:: warningmark
338
339There is no such thing (yet) as an automated gearshift in splice junction identification. It is all like stick-shift driving in San Francisco. In other words, running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy.
340
341.. __: http://tophat.cbcb.umd.edu/manual.html
342
343------
344
345**Input formats**
346
347Tophat accepts files in Sanger FASTQ format. Use the FASTQ Groomer to prepare your files.
348
349------
350
351**Outputs**
352
353Tophat produces three output files:
354
355- coverage.wig -- A UCSC BedGraph_ wigglegram track, showing the depth of coverage at each position, including the spliced read alignments.
356- accepted_hits.sam -- A list of read alignments in SAM_ format.
357- junctions.bed -- A UCSC BED_ track of junctions reported by TopHat. Each junction consists of two connected BED blocks, where each block is as long as the maximal overhang of any read spanning the junction. The score is the number of alignments spanning the junction.
358 
359.. _BedGraph: http://genome.ucsc.edu/goldenPath/help/bedgraph.html
360.. _SAM: http://samtools.sourceforge.net/
361.. _BED: http://genome.ucsc.edu/FAQ/FAQformat.html#format1
362   
363-------
364
365**Tophat settings**
366
367All of the options have a default value. You can change any of them. Some of the options in Tophat have been implemented here.
368
369------
370
371**Tophat parameter list**
372
373This is a list of implemented Tophat options::
374
375  -r                                This is the expected (mean) inner distance between mate pairs. For example, for paired-end runs with fragments
376                                    selected at 300bp, where each end is 50bp, you should set -r to be 200. There is no default, and this parameter
377                                    is required for paired-end runs.
378  --mate-std-dev INT                The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp.
379  -a/--min-anchor-length INT        The "anchor length". TopHat will report junctions spanned by reads with at least this many bases on each side of the junction. Note that individual spliced     
380                                    alignments may span a junction with fewer than this many bases on one side. However, every junction involved in spliced alignments is supported by at least one
381                                    read with this many bases on each side. This must be at least 3 and the default is 8.
382  -m/--splice-mismatches INT        The maximum number of mismatches that may appear in the "anchor" region of a spliced alignment. The default is 0.
383  -i/--min-intron-length INT        The minimum intron length. TopHat will ignore donor/acceptor pairs closer than this many bases apart. The default is 70.
384  -I/--max-intron-length INT        The maximum intron length. When searching for junctions ab initio, TopHat will ignore donor/acceptor pairs farther than this many bases apart, except when such a pair is supported by a split segment alignment of a long read. The default is 500000.
385  -F/--min-isoform-fraction 0.0-1.0 TopHat filters out junctions supported by too few alignments. Suppose a junction spanning two exons A and B is supported by S reads. Let the average depth of coverage of
386                                    exon A be D, and assume that it is higher than that of exon B. If S / D is less than the minimum isoform fraction, the junction is not reported. A value of zero disables the
387                                    filter. The default is 0.15.
388  -g/--max-multihits INT            Instructs TopHat to allow up to this many alignments to the reference for a given read, and suppresses all alignments for reads with more than this many
389                                    alignments. The default is 40.
390  --no-closure-search               Disables the mate pair closure-based search for junctions. Currently, has no effect - closure search is off by default.
391  --closure-search                  Enables the mate pair closure-based search for junctions. Closure-based search should only be used when the expected inner distance between mates is small (about or less than 50bp)
392  --no-coverage-search              Disables the coverage based search for junctions.
393  --coverage-search                 Enables the coverage based search for junctions. Use when coverage search is disabled by default (such as for reads 75bp or longer), for maximum sensitivity.
394  --microexon-search                With this option, the pipeline will attempt to find alignments incident to microexons. Works only for reads 50bp or longer.
395  --butterfly-search                TopHat will use a slower but potentially more sensitive algorithm to find junctions in addition to its standard search. Consider using this if you expect that your experiment produced a lot of reads from pre-mRNA, that fall within the introns of your transcripts.
396  --segment-mismatches              Read segments are mapped independently, allowing up to this many mismatches in each segment alignment. The default is 2.
397  --segment-length                  Each read is cut up into segments, each at least this long. These segments are mapped independently. The default is 25.
398  --min-closure-exon                During closure search for paired end reads, exonic hops in the potential splice graph must be at least this long. The default is 50.
399  --min-closure-intron              The minimum intron length that may be found during closure search. The default is 50.
400  --max-closure-intron              The maximum intron length that may be found during closure search. The default is 5000.
401  --min-coverage-intron             The minimum intron length that may be found during coverage search. The default is 50.
402  --max-coverage-intron             The maximum intron length that may be found during coverage search. The default is 20000.
403  --min-segment-intron              The minimum intron length that may be found during split-segment search. The default is 50.
404  --max-segment-intron              The maximum intron length that may be found during split-segment search. The default is 500000.
405    </help>
406</tool>
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。