1 | <tool id="hgv_lps" name="LPS" version="1.0.0"> |
---|
2 | <description>LASSO-Patternsearch algorithm</description> |
---|
3 | |
---|
4 | <command interpreter="bash"> |
---|
5 | lps_tool_wrapper.sh $lambda_fac $input_file $label_column $output_file $log_file |
---|
6 | Initialization 0 |
---|
7 | #if $advanced.options == "true": |
---|
8 | Sample $advanced.sample |
---|
9 | Verbosity $advanced.verbosity |
---|
10 | Standardize $advanced.standardize |
---|
11 | initialLambda $advanced.initialLambda |
---|
12 | #if $advanced.continuation.continuation == "1": |
---|
13 | Continuation $advanced.continuation.continuation |
---|
14 | continuationSteps $advanced.continuation.continuationSteps |
---|
15 | accurateIntermediates $advanced.continuation.accurateIntermediates |
---|
16 | #end if |
---|
17 | printFreq $advanced.printFreq |
---|
18 | #if $advanced.newton.newton == "1": |
---|
19 | Newton $advanced.newton.newton |
---|
20 | NewtonThreshold $advanced.newton.newtonThreshold |
---|
21 | #end if |
---|
22 | HessianSampleFraction $advanced.hessianSampleFraction |
---|
23 | BB 0 |
---|
24 | Monotone 0 |
---|
25 | FullGradient $advanced.fullGradient |
---|
26 | GradientFraction $advanced.gradientFraction |
---|
27 | InitialAlpha $advanced.initialAlpha |
---|
28 | AlphaIncrease $advanced.alphaIncrease |
---|
29 | AlphaDecrease $advanced.alphaDecrease |
---|
30 | AlphaMax $advanced.alphaMax |
---|
31 | c1 $advanced.c1 |
---|
32 | MaxIter $advanced.maxIter |
---|
33 | StopTol $advanced.stopTol |
---|
34 | IntermediateTol $advanced.intermediateTol |
---|
35 | FinalOnly $advanced.finalOnly |
---|
36 | #end if |
---|
37 | </command> |
---|
38 | |
---|
39 | <inputs> |
---|
40 | <param name="input_file" type="data" format="tabular" label="Dataset"/> |
---|
41 | <param name="label_column" type="data_column" data_ref="input_file" numerical="true" label="Label column" help="Column containing outcome labels: +1 or -1."/> |
---|
42 | <param name="lambda_fac" label="Lambda_fac" type="float" value="0.03" help="Target value of the regularization parameter, expressed as a fraction of the calculated lambda_max."> |
---|
43 | <validator type="in_range" message="0.00 < lambda_fac <= 1.00" min="0.00" max="1.00"/> |
---|
44 | </param> |
---|
45 | <conditional name="advanced"> |
---|
46 | <param name="options" type="select" label="Advanced Options"> |
---|
47 | <option value="false" selected="true">Hide advanced options</option> |
---|
48 | <option value="true">Show advanced options</option> |
---|
49 | </param> |
---|
50 | <when value="false"> |
---|
51 | <!-- no options --> |
---|
52 | </when> |
---|
53 | <when value="true"> |
---|
54 | <!-- HARDCODED: 'Sample' we don't support passing an array --> |
---|
55 | <param name="sample" type="float" value="1.0" label="Sample fraction" help="Sample this fraction of the data set."> |
---|
56 | <validator type="in_range" message="0.0 <= sample <= 1.0" min="0.0" max="1.0"/> |
---|
57 | </param> |
---|
58 | <!-- HARDCODED: 'Initialization' = 0 :: Initialize at beta=0 --> |
---|
59 | <param name="verbosity" type="select" format="integer" label="Verbosity"> |
---|
60 | <option value="0" selected="true">Little output</option> |
---|
61 | <option value="1">More output</option> |
---|
62 | <option value="2">Still more output</option> |
---|
63 | </param> |
---|
64 | <param name="standardize" type="select" format="integer" label="Standardize" help="Scales and shifts each column so that it has mean zero and variance 1."> |
---|
65 | <option value="0" selected="true">Don't standardize</option> |
---|
66 | <option value="1">Standardize</option> |
---|
67 | </param> |
---|
68 | <param name="initialLambda" type="float" value="0.8" label="Initial lambda" help="First value of lambda to be used in the continuation scheme, expressed as a fraction of lambda_max."> |
---|
69 | <validator type="in_range" message="0.0 < initialLambda < 1.0" min="0.0" max="1.0"/> |
---|
70 | </param> |
---|
71 | <conditional name="continuation"> |
---|
72 | <param name="continuation" type="select" format="integer" label="Continuation" help="Use continuation strategy to start with a larger value of lambda, decreasing it successively to lambda_fac."> |
---|
73 | <option value="0" selected="true">Don't use continuation</option> |
---|
74 | <option value="1">Use continuation</option> |
---|
75 | </param> |
---|
76 | <when value="0"> |
---|
77 | <!-- no options --> |
---|
78 | </when> |
---|
79 | <when value="1"> |
---|
80 | <param name="continuationSteps" type="integer" value="5" label="Continuation steps" help="Number of lambda values to use in continuation <em>prior</em> to target value lambda_fac."/> |
---|
81 | |
---|
82 | <param name="accurateIntermediates" type="select" format="integer" label="Accurate intermediates" help="Indicates whether accurate solutions are required for lambda values other than the target value lambda_fac."> |
---|
83 | <option value="0" selected="true">Don't need accurate intermediates</option> |
---|
84 | <option value="1">Calculate accurate intermediates</option> |
---|
85 | </param> |
---|
86 | </when> |
---|
87 | </conditional> <!-- name="continuation" --> |
---|
88 | <param name="printFreq" type="integer" value="1" label="Print frequency" help="Print a progress report every NI iterations, where NI is the supplied value of this parameter."> |
---|
89 | <validator type="in_range" message="printFreq >= 1" min="1"/> |
---|
90 | </param> |
---|
91 | <conditional name="newton"> |
---|
92 | <param name="newton" type="select" format="integer" label="Projected Newton steps"> |
---|
93 | <option value="0" selected="true">No Newton steps</option> |
---|
94 | <option value="1">Try projected Newton steps</option> |
---|
95 | </param> |
---|
96 | <when value="0"> |
---|
97 | <!-- no options --> |
---|
98 | </when> |
---|
99 | <when value="1"> |
---|
100 | <param name="newtonThreshold" type="integer" value="500" label="Newton threshold" help="Maximum size of free variable subvector for Newton."/> |
---|
101 | </when> |
---|
102 | </conditional> |
---|
103 | <param name="hessianSampleFraction" type="float" value="1.0" label="Hessian sample fraction" help="Fraction of terms to use in approximate Hessian calculation."> |
---|
104 | <validator type="in_range" message="0.01 < hessianSampleFraction <= 1.00" min="0.01" max="1.00"/> |
---|
105 | </param> |
---|
106 | <!-- HARDCODED: 'BB' = 0 :: don't use Barzilai-Borwein steps --> |
---|
107 | <!-- HARDCODED: 'Monotone' = 0 :: don't force monotonicity --> |
---|
108 | <param name="fullGradient" type="select" format="integer" label="Partial gradient vector selection"> |
---|
109 | <option value="0">Use randomly selected partial gradient, including current active components ("biased")</option> |
---|
110 | <option value="1">Use full gradient vector at every step</option> |
---|
111 | <option value="2">Randomly selected partial gradient, without regard to current active set ("unbiased")</option> |
---|
112 | </param> |
---|
113 | <param name="gradientFraction" type="float" value="0.1" label="Gradient fraction" help="Fraction of inactive gradient vector to evaluate."> |
---|
114 | <validator type="in_range" message="0.0 < gradientFraction <= 1" min="0.0" max="1.0"/> |
---|
115 | </param> |
---|
116 | <param name="initialAlpha" type="float" value="1.0" label="Initial value of alpha"/> |
---|
117 | <param name="alphaIncrease" type="float" value="2.0" label="Alpha increase" help="Factor by which to increase alpha after descent not obtained."/> |
---|
118 | <param name="alphaDecrease" type="float" value="0.8" label="Alpha decrease" help="Factor by which to decrease alpha after successful first-order step."/> |
---|
119 | <param name="alphaMax" type="float" value="1e12" label="Alpha max" help="Maximum value of alpha; terminate with error if we exceed this."/> |
---|
120 | <param name="c1" type="float" value="1e-3" help="Parameter defining the margin by which the first-order step is required to decrease before being taken."> |
---|
121 | <validator type="in_range" message="0.0 < c1 < 1.0" min="0.0" max="1.0"/> |
---|
122 | </param> |
---|
123 | <param name="maxIter" type="integer" value="10000" label="Maximum number of iterations" help="Terminate with error if we exceed this."/> |
---|
124 | <param name="stopTol" type="float" value="1e-6" label="Stop tolerance" help="Convergence tolerance for target value of lambda."/> |
---|
125 | <param name="intermediateTol" type="float" value="1e-4" label="Intermediate tolerance" help="Convergence tolerance for intermediate values of lambda."/> |
---|
126 | <param name="finalOnly" type="select" format="integer" label="Final only"> |
---|
127 | <option value="0" selected="true">Return information for all intermediate values</option> |
---|
128 | <option value="1">Just return information at the last lambda</option> |
---|
129 | </param> |
---|
130 | </when> <!-- value="true" --> |
---|
131 | </conditional> <!-- name="advanced" --> |
---|
132 | </inputs> |
---|
133 | |
---|
134 | <outputs> |
---|
135 | <data name="output_file" format="tabular" label="${tool.name} on ${on_string}: results"/> |
---|
136 | <data name="log_file" format="txt" label="${tool.name} on ${on_string}: log"/> |
---|
137 | </outputs> |
---|
138 | |
---|
139 | <requirements> |
---|
140 | <requirement type="binary">lps_tool</requirement> |
---|
141 | </requirements> |
---|
142 | |
---|
143 | <tests> |
---|
144 | <test> |
---|
145 | <param name="input_file" value="lps_arrhythmia.tabular"/> |
---|
146 | <param name="label_column" value="280"/> |
---|
147 | <param name="lambda_fac" value="0.03"/> |
---|
148 | <param name="options" value="true"/> |
---|
149 | <param name="sample" value="1.0"/> |
---|
150 | <param name="verbosity" value="1"/> |
---|
151 | <param name="standardize" value="0"/> |
---|
152 | <param name="initialLambda" value="0.9"/> |
---|
153 | <param name="continuation" value="1"/> |
---|
154 | <param name="continuationSteps" value="10"/> |
---|
155 | <param name="accurateIntermediates" value="0"/> |
---|
156 | <param name="printFreq" value="1"/> |
---|
157 | <param name="newton" value="1"/> |
---|
158 | <param name="newtonThreshold" value="500"/> |
---|
159 | <param name="hessianSampleFraction" value="1.0"/> |
---|
160 | <param name="fullGradient" value="1"/> |
---|
161 | <param name="gradientFraction" value="0.5"/> |
---|
162 | <param name="initialAlpha" value="1.0"/> |
---|
163 | <param name="alphaIncrease" value="2.0"/> |
---|
164 | <param name="alphaDecrease" value="0.8"/> |
---|
165 | <param name="alphaMax" value="1e12"/> |
---|
166 | <param name="c1" value="1e-3"/> |
---|
167 | <param name="maxIter" value="2500"/> |
---|
168 | <param name="stopTol" value="1e-6"/> |
---|
169 | <param name="intermediateTol" value="1e-6"/> |
---|
170 | <param name="finalOnly" value="0"/> |
---|
171 | <output name="output_file" file="lps_arrhythmia_beta.tabular"/> |
---|
172 | <output name="log_file" file="lps_arrhythmia_log.txt"/> |
---|
173 | </test> |
---|
174 | </tests> |
---|
175 | |
---|
176 | <help> |
---|
177 | **Dataset formats** |
---|
178 | |
---|
179 | The input and output datasets are tabular_. The columns are described below. |
---|
180 | There is a second output dataset (a log) that is in text_ format. |
---|
181 | (`Dataset missing?`_) |
---|
182 | |
---|
183 | .. _tabular: ./static/formatHelp.html#tab |
---|
184 | .. _text: ./static/formatHelp.html#text |
---|
185 | .. _Dataset missing?: ./static/formatHelp.html |
---|
186 | |
---|
187 | ----- |
---|
188 | |
---|
189 | **What it does** |
---|
190 | |
---|
191 | The LASSO-Patternsearch algorithm fits your dataset to an L1-regularized |
---|
192 | logistic regression model. A benefit of using L1-regularization is |
---|
193 | that it typically yields a weight vector with relatively few non-zero |
---|
194 | coefficients. |
---|
195 | |
---|
196 | For example, say you have a dataset containing M rows (subjects) |
---|
197 | and N columns (attributes) where one of these N attributes is binary, |
---|
198 | indicating whether or not the subject has some property of interest P. |
---|
199 | In simple terms, LPS calculates a weight for each of the other attributes |
---|
200 | in your dataset. This weight indicates how "relevant" that attribute |
---|
201 | is for predicting whether or not a given subject has property P. |
---|
202 | The L1-regularization causes most of these weights to be equal to zero, |
---|
203 | which means LPS will find a "small" subset of the remaining N-1 attributes |
---|
204 | in your dataset that can be used to predict P. |
---|
205 | |
---|
206 | In other words, LPS can be used for feature selection. |
---|
207 | |
---|
208 | The input dataset is tabular, and must contain a label column which |
---|
209 | indicates whether or not a given row has property P. In the current |
---|
210 | version of this tool, P must be encoded using +1 and -1. The Lambda_fac |
---|
211 | parameter ranges from 0 to 1, and controls how sparse the weight |
---|
212 | vector will be. At the low end, when Lambda_fac = 0, there will be |
---|
213 | no regularization. At the high end, when Lambda_fac = 1, there will be |
---|
214 | "too much" regularization, and all of the weights will equal zero. |
---|
215 | |
---|
216 | The LPS tool creates two output datasets. The first, called the results |
---|
217 | file, is a tabular dataset containing one column of weights for each |
---|
218 | value of the regularization parameter lambda that was tried. The weight |
---|
219 | columns are in order from left to right by decreasing values of lambda. |
---|
220 | The first N-1 rows in each column are the weights for the N-1 attributes |
---|
221 | in your input dataset. The final row is a constant, the intercept. |
---|
222 | |
---|
223 | Let **x** be a row from your input dataset and let **b** be a column |
---|
224 | from the results file. To compute the probability that row **x** has |
---|
225 | a label value of +1: |
---|
226 | |
---|
227 | Probability(row **x** has label value = +1) = 1 / [1 + exp{**x** \* **b**\[1..N-1\] + **b**\[N\]}] |
---|
228 | |
---|
229 | where **x** \* **b**\[1..N-1\] represents matrix multiplication. |
---|
230 | |
---|
231 | The second output dataset, called the log file, is a text file which |
---|
232 | contains additional data about the fitted L1-regularized logistic |
---|
233 | regression model. These data include the number of features, the |
---|
234 | computed value of lambda_max, the actual values of lambda used, the |
---|
235 | optimal values of the log-likelihood and regularized log-likelihood |
---|
236 | functions, the number of non-zeros, and the number of iterations. |
---|
237 | |
---|
238 | Website: http://pages.cs.wisc.edu/~swright/LPS/ |
---|
239 | |
---|
240 | ----- |
---|
241 | |
---|
242 | **Example** |
---|
243 | |
---|
244 | - input file:: |
---|
245 | |
---|
246 | +1 1 0 0 0 0 1 0 1 1 ... |
---|
247 | +1 1 1 1 0 0 1 0 1 1 ... |
---|
248 | +1 1 0 1 0 1 0 1 0 1 ... |
---|
249 | etc. |
---|
250 | |
---|
251 | - output results file:: |
---|
252 | |
---|
253 | 0 |
---|
254 | 0 |
---|
255 | 0 |
---|
256 | 0 |
---|
257 | 0.025541 |
---|
258 | etc. |
---|
259 | |
---|
260 | - output log file:: |
---|
261 | |
---|
262 | Data set has 100 vectors with 50 features. |
---|
263 | calculateLambdaMax: n=50, m=100, m+=50, m-=50 |
---|
264 | computed value of lambda_max: 5.0000e-01 |
---|
265 | |
---|
266 | lambda=2.96e-02 solution: |
---|
267 | optimal log-likelihood function value: 6.46e-01 |
---|
268 | optimal *regularized* log-likelihood function value: 6.79e-01 |
---|
269 | number of nonzeros at the optimum: 5 |
---|
270 | number of iterations required: 43 |
---|
271 | etc. |
---|
272 | |
---|
273 | ----- |
---|
274 | |
---|
275 | **References** |
---|
276 | |
---|
277 | Koh K, Kim S-J, Boyd S. (2007) |
---|
278 | An interior-point method for large-scale l1-regularized logistic regression. |
---|
279 | Journal of Machine Learning Research. 8:1519-1555. |
---|
280 | |
---|
281 | Shi W, Wahba G, Wright S, Lee K, Klein R, Klein B. (2008) |
---|
282 | LASSO-Patternsearch algorithm with application to ophthalmology and genomic data. |
---|
283 | Stat Interface. 1(1):137-153. |
---|
284 | |
---|
285 | <!-- |
---|
286 | Wright S, Nowak R, Figueiredo M. (2009) |
---|
287 | Sparse reconstruction via separable approximation. |
---|
288 | IEEE Transactions on Signal Processing. 57:2479-2493. |
---|
289 | |
---|
290 | Shi J, Yin W, Osher S, Sajda P. (2010) |
---|
291 | A fast hybrid algorithm for large scale l1-regularized logistic regression. |
---|
292 | Journal of Machine Learning Research. 11:713-741. |
---|
293 | |
---|
294 | Byrd R, Chin G, Neveitt W, Nocedal J. (2010) |
---|
295 | On the use of stochastic Hessian information in unconstrained optimization. |
---|
296 | Technical Report. Northwestern University. June 16, 2010. |
---|
297 | |
---|
298 | Wright S. (2010) |
---|
299 | Accelerated block-coordinate relaxation for regularized optimization. |
---|
300 | Technical Report. University of Wisconsin. August 10, 2010. |
---|
301 | --> |
---|
302 | |
---|
303 | </help> |
---|
304 | </tool> |
---|