[2] | 1 | <tool id="hgv_lps" name="LPS" version="1.0.0"> |
---|
| 2 | <description>LASSO-Patternsearch algorithm</description> |
---|
| 3 | |
---|
| 4 | <command interpreter="bash"> |
---|
| 5 | lps_tool_wrapper.sh $lambda_fac $input_file $label_column $output_file $log_file |
---|
| 6 | Initialization 0 |
---|
| 7 | #if $advanced.options == "true": |
---|
| 8 | Sample $advanced.sample |
---|
| 9 | Verbosity $advanced.verbosity |
---|
| 10 | Standardize $advanced.standardize |
---|
| 11 | initialLambda $advanced.initialLambda |
---|
| 12 | #if $advanced.continuation.continuation == "1": |
---|
| 13 | Continuation $advanced.continuation.continuation |
---|
| 14 | continuationSteps $advanced.continuation.continuationSteps |
---|
| 15 | accurateIntermediates $advanced.continuation.accurateIntermediates |
---|
| 16 | #end if |
---|
| 17 | printFreq $advanced.printFreq |
---|
| 18 | #if $advanced.newton.newton == "1": |
---|
| 19 | Newton $advanced.newton.newton |
---|
| 20 | NewtonThreshold $advanced.newton.newtonThreshold |
---|
| 21 | #end if |
---|
| 22 | HessianSampleFraction $advanced.hessianSampleFraction |
---|
| 23 | BB 0 |
---|
| 24 | Monotone 0 |
---|
| 25 | FullGradient $advanced.fullGradient |
---|
| 26 | GradientFraction $advanced.gradientFraction |
---|
| 27 | InitialAlpha $advanced.initialAlpha |
---|
| 28 | AlphaIncrease $advanced.alphaIncrease |
---|
| 29 | AlphaDecrease $advanced.alphaDecrease |
---|
| 30 | AlphaMax $advanced.alphaMax |
---|
| 31 | c1 $advanced.c1 |
---|
| 32 | MaxIter $advanced.maxIter |
---|
| 33 | StopTol $advanced.stopTol |
---|
| 34 | IntermediateTol $advanced.intermediateTol |
---|
| 35 | FinalOnly $advanced.finalOnly |
---|
| 36 | #end if |
---|
| 37 | </command> |
---|
| 38 | |
---|
| 39 | <inputs> |
---|
| 40 | <param name="input_file" type="data" format="tabular" label="Dataset"/> |
---|
| 41 | <param name="label_column" type="data_column" data_ref="input_file" numerical="true" label="Label column" help="Column containing outcome labels: +1 or -1."/> |
---|
| 42 | <param name="lambda_fac" label="Lambda_fac" type="float" value="0.03" help="Target value of the regularization parameter, expressed as a fraction of the calculated lambda_max."> |
---|
| 43 | <validator type="in_range" message="0.00 &lt; lambda_fac &lt;= 1.00" min="0.00" max="1.00"/> |
---|
| 44 | </param> |
---|
| 45 | <conditional name="advanced"> |
---|
| 46 | <param name="options" type="select" label="Advanced Options"> |
---|
| 47 | <option value="false" selected="true">Hide advanced options</option> |
---|
| 48 | <option value="true">Show advanced options</option> |
---|
| 49 | </param> |
---|
| 50 | <when value="false"> |
---|
| 51 | <!-- no options --> |
---|
| 52 | </when> |
---|
| 53 | <when value="true"> |
---|
| 54 | <!-- HARDCODED: 'Sample' we don't support passing an array --> |
---|
| 55 | <param name="sample" type="float" value="1.0" label="Sample fraction" help="Sample this fraction of the data set."> |
---|
| 56 | <validator type="in_range" message="0.0 &lt;= sample &lt;= 1.0" min="0.0" max="1.0"/> |
---|
| 57 | </param> |
---|
| 58 | <!-- HARDCODED: 'Initialization' = 0 :: Initialize at beta=0 --> |
---|
| 59 | <param name="verbosity" type="select" format="integer" label="Verbosity"> |
---|
| 60 | <option value="0" selected="true">Little output</option> |
---|
| 61 | <option value="1">More output</option> |
---|
| 62 | <option value="2">Still more output</option> |
---|
| 63 | </param> |
---|
| 64 | <param name="standardize" type="select" format="integer" label="Standardize" help="Scales and shifts each column so that it has mean zero and variance 1."> |
---|
| 65 | <option value="0" selected="true">Don't standardize</option> |
---|
| 66 | <option value="1">Standardize</option> |
---|
| 67 | </param> |
---|
| 68 | <param name="initialLambda" type="float" value="0.8" label="Initial lambda" help="First value of lambda to be used in the continuation scheme, expressed as a fraction of lambda_max."> |
---|
| 69 | <validator type="in_range" message="0.0 &lt; initialLambda &lt; 1.0" min="0.0" max="1.0"/> |
---|
| 70 | </param> |
---|
| 71 | <conditional name="continuation"> |
---|
| 72 | <param name="continuation" type="select" format="integer" label="Continuation" help="Use continuation strategy to start with a larger value of lambda, decreasing it successively to lambda_fac."> |
---|
| 73 | <option value="0" selected="true">Don't use continuation</option> |
---|
| 74 | <option value="1">Use continuation</option> |
---|
| 75 | </param> |
---|
| 76 | <when value="0"> |
---|
| 77 | <!-- no options --> |
---|
| 78 | </when> |
---|
| 79 | <when value="1"> |
---|
| 80 | <param name="continuationSteps" type="integer" value="5" label="Continuation steps" help="Number of lambda values to use in continuation &lt;em&gt;prior&lt;/em&gt; to target value lambda_fac."/> |
---|
| 81 | |
---|
| 82 | <param name="accurateIntermediates" type="select" format="integer" label="Accurate intermediates" help="Indicates whether accurate solutions are required for lambda values other than the target value lambda_fac."> |
---|
| 83 | <option value="0" selected="true">Don't need accurate intermediates</option> |
---|
| 84 | <option value="1">Calculate accurate intermediates</option> |
---|
| 85 | </param> |
---|
| 86 | </when> |
---|
| 87 | </conditional> <!-- name="continuation" --> |
---|
| 88 | <param name="printFreq" type="integer" value="1" label="Print frequency" help="Print a progress report every NI iterations, where NI is the supplied value of this parameter."> |
---|
| 89 | <validator type="in_range" message="printFreq >= 1" min="1"/> |
---|
| 90 | </param> |
---|
| 91 | <conditional name="newton"> |
---|
| 92 | <param name="newton" type="select" format="integer" label="Projected Newton steps"> |
---|
| 93 | <option value="0" selected="true">No Newton steps</option> |
---|
| 94 | <option value="1">Try projected Newton steps</option> |
---|
| 95 | </param> |
---|
| 96 | <when value="0"> |
---|
| 97 | <!-- no options --> |
---|
| 98 | </when> |
---|
| 99 | <when value="1"> |
---|
| 100 | <param name="newtonThreshold" type="integer" value="500" label="Newton threshold" help="Maximum size of free variable subvector for Newton."/> |
---|
| 101 | </when> |
---|
| 102 | </conditional> |
---|
| 103 | <param name="hessianSampleFraction" type="float" value="1.0" label="Hessian sample fraction" help="Fraction of terms to use in approximate Hessian calculation."> |
---|
| 104 | <validator type="in_range" message="0.01 &lt; hessianSampleFraction &lt;= 1.00" min="0.01" max="1.00"/> |
---|
| 105 | </param> |
---|
| 106 | <!-- HARDCODED: 'BB' = 0 :: don't use Barzilai-Borwein steps --> |
---|
| 107 | <!-- HARDCODED: 'Monotone' = 0 :: don't force monotonicity --> |
---|
| 108 | <param name="fullGradient" type="select" format="integer" label="Partial gradient vector selection"> |
---|
| 109 | <option value="0">Use randomly selected partial gradient, including current active components ("biased")</option> |
---|
| 110 | <option value="1">Use full gradient vector at every step</option> |
---|
| 111 | <option value="2">Randomly selected partial gradient, without regard to current active set ("unbiased")</option> |
---|
| 112 | </param> |
---|
| 113 | <param name="gradientFraction" type="float" value="0.1" label="Gradient fraction" help="Fraction of inactive gradient vector to evaluate."> |
---|
| 114 | <validator type="in_range" message="0.0 &lt; gradientFraction &lt;= 1" min="0.0" max="1.0"/> |
---|
| 115 | </param> |
---|
| 116 | <param name="initialAlpha" type="float" value="1.0" label="Initial value of alpha"/> |
---|
| 117 | <param name="alphaIncrease" type="float" value="2.0" label="Alpha increase" help="Factor by which to increase alpha after descent not obtained."/> |
---|
| 118 | <param name="alphaDecrease" type="float" value="0.8" label="Alpha decrease" help="Factor by which to decrease alpha after successful first-order step."/> |
---|
| 119 | <param name="alphaMax" type="float" value="1e12" label="Alpha max" help="Maximum value of alpha; terminate with error if we exceed this."/> |
---|
| 120 | <param name="c1" type="float" value="1e-3" help="Parameter defining the margin by which the first-order step is required to decrease before being taken."> |
---|
| 121 | <validator type="in_range" message="0.0 &lt; c1 &lt; 1.0" min="0.0" max="1.0"/> |
---|
| 122 | </param> |
---|
| 123 | <param name="maxIter" type="integer" value="10000" label="Maximum number of iterations" help="Terminate with error if we exceed this."/> |
---|
| 124 | <param name="stopTol" type="float" value="1e-6" label="Stop tolerance" help="Convergence tolerance for target value of lambda."/> |
---|
| 125 | <param name="intermediateTol" type="float" value="1e-4" label="Intermediate tolerance" help="Convergence tolerance for intermediate values of lambda."/> |
---|
| 126 | <param name="finalOnly" type="select" format="integer" label="Final only"> |
---|
| 127 | <option value="0" selected="true">Return information for all intermediate values</option> |
---|
| 128 | <option value="1">Just return information at the last lambda</option> |
---|
| 129 | </param> |
---|
| 130 | </when> <!-- value="true" --> |
---|
| 131 | </conditional> <!-- name="advanced" --> |
---|
| 132 | </inputs> |
---|
| 133 | |
---|
| 134 | <outputs> |
---|
| 135 | <data name="output_file" format="tabular" label="${tool.name} on ${on_string}: results"/> |
---|
| 136 | <data name="log_file" format="txt" label="${tool.name} on ${on_string}: log"/> |
---|
| 137 | </outputs> |
---|
| 138 | |
---|
| 139 | <requirements> |
---|
| 140 | <requirement type="binary">lps_tool</requirement> |
---|
| 141 | </requirements> |
---|
| 142 | |
---|
| 143 | <tests> |
---|
| 144 | <test> <!-- Functional test of the advanced-options path with continuation and projected Newton steps enabled --> |
---|
| 145 | <param name="input_file" value="lps_arrhythmia.tabular"/> |
---|
| 146 | <param name="label_column" value="280"/> |
---|
| 147 | <param name="lambda_fac" value="0.03"/> |
---|
| 148 | <param name="options" value="true"/> |
---|
| 149 | <param name="sample" value="1.0"/> |
---|
| 150 | <param name="verbosity" value="1"/> |
---|
| 151 | <param name="standardize" value="0"/> |
---|
| 152 | <param name="initialLambda" value="0.9"/> |
---|
| 153 | <param name="continuation" value="1"/> |
---|
| 154 | <param name="continuationSteps" value="10"/> |
---|
| 155 | <param name="accurateIntermediates" value="0"/> |
---|
| 156 | <param name="printFreq" value="1"/> |
---|
| 157 | <param name="newton" value="1"/> |
---|
| 158 | <param name="newtonThreshold" value="500"/> |
---|
| 159 | <param name="hessianSampleFraction" value="1.0"/> |
---|
| 160 | <param name="fullGradient" value="1"/> |
---|
| 161 | <param name="gradientFraction" value="0.5"/> |
---|
| 162 | <param name="initialAlpha" value="1.0"/> |
---|
| 163 | <param name="alphaIncrease" value="2.0"/> |
---|
| 164 | <param name="alphaDecrease" value="0.8"/> |
---|
| 165 | <param name="alphaMax" value="1e12"/> |
---|
| 166 | <param name="c1" value="1e-3"/> |
---|
| 167 | <param name="maxIter" value="2500"/> |
---|
| 168 | <param name="stopTol" value="1e-6"/> |
---|
| 169 | <param name="intermediateTol" value="1e-6"/> |
---|
| 170 | <param name="finalOnly" value="0"/> |
---|
| 171 | <output name="output_file" file="lps_arrhythmia_beta.tabular"/> <!-- name must match the <data name="output_file"> declared in <outputs> --> |
---|
| 172 | <output name="log_file" file="lps_arrhythmia_log.txt"/> |
---|
| 173 | </test> |
---|
| 174 | </tests> |
---|
| 175 | |
---|
| 176 | <help> |
---|
| 177 | **Dataset formats** |
---|
| 178 | |
---|
| 179 | The input and output datasets are tabular_. The columns are described below. |
---|
| 180 | There is a second output dataset (a log) that is in text_ format. |
---|
| 181 | (`Dataset missing?`_) |
---|
| 182 | |
---|
| 183 | .. _tabular: ./static/formatHelp.html#tab |
---|
| 184 | .. _text: ./static/formatHelp.html#text |
---|
| 185 | .. _Dataset missing?: ./static/formatHelp.html |
---|
| 186 | |
---|
| 187 | ----- |
---|
| 188 | |
---|
| 189 | **What it does** |
---|
| 190 | |
---|
| 191 | The LASSO-Patternsearch algorithm fits your dataset to an L1-regularized |
---|
| 192 | logistic regression model. A benefit of using L1-regularization is |
---|
| 193 | that it typically yields a weight vector with relatively few non-zero |
---|
| 194 | coefficients. |
---|
| 195 | |
---|
| 196 | For example, say you have a dataset containing M rows (subjects) |
---|
| 197 | and N columns (attributes) where one of these N attributes is binary, |
---|
| 198 | indicating whether or not the subject has some property of interest P. |
---|
| 199 | In simple terms, LPS calculates a weight for each of the other attributes |
---|
| 200 | in your dataset. This weight indicates how "relevant" that attribute |
---|
| 201 | is for predicting whether or not a given subject has property P. |
---|
| 202 | The L1-regularization causes most of these weights to be equal to zero, |
---|
| 203 | which means LPS will find a "small" subset of the remaining N-1 attributes |
---|
| 204 | in your dataset that can be used to predict P. |
---|
| 205 | |
---|
| 206 | In other words, LPS can be used for feature selection. |
---|
| 207 | |
---|
| 208 | The input dataset is tabular, and must contain a label column which |
---|
| 209 | indicates whether or not a given row has property P. In the current |
---|
| 210 | version of this tool, P must be encoded using +1 and -1. The Lambda_fac |
---|
| 211 | parameter ranges from 0 to 1, and controls how sparse the weight |
---|
| 212 | vector will be. At the low end, when Lambda_fac = 0, there will be |
---|
| 213 | no regularization. At the high end, when Lambda_fac = 1, there will be |
---|
| 214 | "too much" regularization, and all of the weights will equal zero. |
---|
| 215 | |
---|
| 216 | The LPS tool creates two output datasets. The first, called the results |
---|
| 217 | file, is a tabular dataset containing one column of weights for each |
---|
| 218 | value of the regularization parameter lambda that was tried. The weight |
---|
| 219 | columns are in order from left to right by decreasing values of lambda. |
---|
| 220 | The first N-1 rows in each column are the weights for the N-1 attributes |
---|
| 221 | in your input dataset. The final row is a constant, the intercept. |
---|
| 222 | |
---|
| 223 | Let **x** be a row from your input dataset and let **b** be a column |
---|
| 224 | from the results file. To compute the probability that row **x** has |
---|
| 225 | a label value of +1: |
---|
| 226 | |
---|
| 227 | Probability(row **x** has label value = +1) = 1 / [1 + exp{**x** \* **b**\[1..N-1\] + **b**\[N\]}] |
---|
| 228 | |
---|
| 229 | where **x** \* **b**\[1..N-1\] represents matrix multiplication. |
---|
| 230 | |
---|
| 231 | The second output dataset, called the log file, is a text file which |
---|
| 232 | contains additional data about the fitted L1-regularized logistic |
---|
| 233 | regression model. These data include the number of features, the |
---|
| 234 | computed value of lambda_max, the actual values of lambda used, the |
---|
| 235 | optimal values of the log-likelihood and regularized log-likelihood |
---|
| 236 | functions, the number of non-zeros, and the number of iterations. |
---|
| 237 | |
---|
| 238 | Website: http://pages.cs.wisc.edu/~swright/LPS/ |
---|
| 239 | |
---|
| 240 | ----- |
---|
| 241 | |
---|
| 242 | **Example** |
---|
| 243 | |
---|
| 244 | - input file:: |
---|
| 245 | |
---|
| 246 | +1 1 0 0 0 0 1 0 1 1 ... |
---|
| 247 | +1 1 1 1 0 0 1 0 1 1 ... |
---|
| 248 | +1 1 0 1 0 1 0 1 0 1 ... |
---|
| 249 | etc. |
---|
| 250 | |
---|
| 251 | - output results file:: |
---|
| 252 | |
---|
| 253 | 0 |
---|
| 254 | 0 |
---|
| 255 | 0 |
---|
| 256 | 0 |
---|
| 257 | 0.025541 |
---|
| 258 | etc. |
---|
| 259 | |
---|
| 260 | - output log file:: |
---|
| 261 | |
---|
| 262 | Data set has 100 vectors with 50 features. |
---|
| 263 | calculateLambdaMax: n=50, m=100, m+=50, m-=50 |
---|
| 264 | computed value of lambda_max: 5.0000e-01 |
---|
| 265 | |
---|
| 266 | lambda=2.96e-02 solution: |
---|
| 267 | optimal log-likelihood function value: 6.46e-01 |
---|
| 268 | optimal *regularized* log-likelihood function value: 6.79e-01 |
---|
| 269 | number of nonzeros at the optimum: 5 |
---|
| 270 | number of iterations required: 43 |
---|
| 271 | etc. |
---|
| 272 | |
---|
| 273 | ----- |
---|
| 274 | |
---|
| 275 | **References** |
---|
| 276 | |
---|
| 277 | Koh K, Kim S-J, Boyd S. (2007) |
---|
| 278 | An interior-point method for large-scale l1-regularized logistic regression. |
---|
| 279 | Journal of Machine Learning Research. 8:1519-1555. |
---|
| 280 | |
---|
| 281 | Shi W, Wahba G, Wright S, Lee K, Klein R, Klein B. (2008) |
---|
| 282 | LASSO-Patternsearch algorithm with application to ophthalmology and genomic data. |
---|
| 283 | Stat Interface. 1(1):137-153. |
---|
| 284 | |
---|
| 285 | <!-- |
---|
| 286 | Wright S, Nowak R, Figueiredo M. (2009) |
---|
| 287 | Sparse reconstruction by separable approximation. |
---|
| 288 | IEEE Transactions on Signal Processing. 57:2479-2493. |
---|
| 289 | |
---|
| 290 | Shi J, Yin W, Osher S, Sajda P. (2010) |
---|
| 291 | A fast hybrid algorithm for large scale l1-regularized logistic regression. |
---|
| 292 | Journal of Machine Learning Research. 11:713-741. |
---|
| 293 | |
---|
| 294 | Byrd R, Chin G, Neveitt W, Nocedal J. (2010) |
---|
| 295 | On the use of stochastic Hessian information in unconstrained optimization. |
---|
| 296 | Technical Report. Northwestern University. June 16, 2010. |
---|
| 297 | |
---|
| 298 | Wright S. (2010) |
---|
| 299 | Accelerated block-coordinate relaxation for regularized optimization. |
---|
| 300 | Technical Report. University of Wisconsin. August 10, 2010. |
---|
| 301 | --> |
---|
| 302 | |
---|
| 303 | </help> |
---|
| 304 | </tool> |
---|