root/trunk/regress.d

Revision 288, 77.9 kB (checked in by dsimcha, 9 months ago)

Minor updates for 2.053.

Line 
1 /**A module for performing linear regression.  This module has an unusual
2  * interface, as it is range-based instead of matrix based. Values for
3  * independent variables are provided as either a tuple or a range of ranges.
4  * This means that one can use, for example, map, to fit high order models and
5  * lazily evaluate certain values.  (For details, see examples below.)
6  *
7  * Author:  David Simcha*/
8  /*
9  * License:
10  * Boost Software License - Version 1.0 - August 17th, 2003
11  *
12  * Permission is hereby granted, free of charge, to any person or organization
13  * obtaining a copy of the software and accompanying documentation covered by
14  * this license (the "Software") to use, reproduce, display, distribute,
15  * execute, and transmit the Software, and to prepare derivative works of the
16  * Software, and to permit third-parties to whom the Software is furnished to
17  * do so, all subject to the following:
18  *
19  * The copyright notices in the Software and this entire statement, including
20  * the above license grant, this restriction and the following disclaimer,
21  * must be included in all copies of the Software, in whole or in part, and
22  * all derivative works of the Software, unless such copies or derivative
23  * works are solely in the form of machine-executable object code generated by
24  * a source language processor.
25  *
26  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
27  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
28  * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
29  * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
30  * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
31  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
32  * DEALINGS IN THE SOFTWARE.
33  */
34 module dstats.regress;
35
36 import std.math, std.algorithm, std.traits, std.array, std.traits, std.exception,
37     std.typetuple, std.typecons, std.numeric;
38
39 import dstats.alloc, std.range, std.conv, dstats.distrib, dstats.cor,
40     dstats.base, dstats.summary, dstats.sort;
41
42 version(unittest) {
43     import std.stdio, dstats.random, std.functional;
44     void main(){}
45 }
46
47 ///
48 struct PowMap(ExpType, T)
49 if(isForwardRange!(T)) {
50     Unqual!T range;
51     Unqual!ExpType exponent;
52     double cache;
53
54     this(T range, ExpType exponent) {
55         this.range = range;
56         this.exponent = exponent;
57
58         static if(isIntegral!ExpType) {
59             cache = pow(cast(double) range.front, exponent);
60         } else {
61             cache = pow(cast(ExpType) range.front, exponent);
62         }
63     }
64
65     @property double front() const pure nothrow {
66         return cache;
67     }
68
69     void popFront() {
70         range.popFront;
71         if(!range.empty) {
72             cache = pow(cast(double) range.front, exponent);
73         }
74     }
75
76     @property typeof(this) save() {
77         return this;
78     }
79
80     @property bool empty() {
81         return range.empty;
82     }
83 }
84
85 /**Maps a forward range to a power determined at runtime.  ExpType is the type
86  * of the exponent.  Using an int is faster than using a double, but obviously
87  * less flexible.*/
88 PowMap!(ExpType, T) powMap(ExpType, T)(T range, ExpType exponent) {
89     alias PowMap!(ExpType, T) RT;
90     return RT(range, exponent);
91 }
92
93 // Very ad-hoc, does a bunch of matrix ops for linearRegress and
94 // linearRegressBeta.  Specifically, computes xTx and xTy.
95 // Written specifically to be efficient in the context used here.
96 private void rangeMatrixMulTrans(U, T...)
97 (ref double[] xTy, out double[][] xTx, U y, ref T matIn) {
98     static if(isArray!(T[0]) &&
99         isInputRange!(typeof(matIn[0][0])) && matIn.length == 1) {
100         alias matIn[0] mat;
101     } else {
102         alias matIn mat;
103     }
104
105     bool someEmpty() {
106         if(y.empty) {
107             return true;
108         }
109         foreach(range; mat) {
110             if(range.empty) {
111                 return true;
112             }
113         }
114         return false;
115     }
116
117     void popAll() {
118         foreach(ti, range; mat) {
119             mat[ti].popFront;
120         }
121         y.popFront;
122     }
123
124     xTy[] = 0;
125
126     xTx = newStack!(double[])(mat.length);
127     foreach(ref elem; xTx) {
128         elem = newStack!double(mat.length);
129     }
130
131     foreach(row; xTx) {
132         row[] = 0;
133     }
134
135     mixin(newFrame);
136     auto deltas = newStack!double(mat.length);
137
138     // Using an algorithm similar to the one for Pearson cor to improve
139     // numerical stability.  Calculate means and covariances, then
140     // combine them:  Sum of squares = mean1 * N * mean2 + N * cov.
141     double k = 0;
142     double[] means = newStack!(double)(mat.length);
143     means[] = 0;
144     double yMean = 0;
145
146     while(!someEmpty) {
147         foreach(i, elem; mat) {
148             deltas[i] = cast(double) elem.front;
149         }
150
151         immutable kMinus1 = k;
152         immutable kNeg1 = 1 / ++k;
153         deltas[] -= means[];
154         means[] += deltas[] * kNeg1;
155
156         immutable yDelta = cast(double) y.front - yMean;
157         yMean += yDelta * kNeg1;
158
159         foreach(i, delta1; deltas) {
160             xTy[i] += kMinus1 * delta1 * kNeg1 * yDelta;
161             xTx[i][i] += kMinus1 * delta1 * kNeg1 * delta1;
162             auto xTxi = xTx[i];
163
164             foreach(j, delta2; deltas[0..i]) {
165                 xTxi[j] += kMinus1 * delta1 * kNeg1 * delta2;
166             }
167         }
168         popAll();
169     }
170
171     // k is n now that we're done looping over the data.
172     alias k n;
173
174     // mat now consists of covariance * n.    Divide by n and add mean1 * mean2
175     // to get sum of products.
176     foreach(i; 0..xTx.length) foreach(j; 0..i + 1) {
177         xTx[i][j] /= n;
178         xTx[i][j] += means[i] * means[j];
179     }
180
181     // Similarly for the xTy vector
182     foreach(i, ref elem; xTy) {
183         xTy[i] /= n;
184         elem += yMean * means[i];
185     }
186     symmetrize(xTx);
187 }
188
189 // Copies values from lower triangle to upper triangle.
190 private void symmetrize(double[][] mat) pure nothrow {
191     foreach(i; 1..mat.length) {
192         foreach(j; 0..i) {
193             mat[j][i] = mat[i][j];
194         }
195     }
196 }
197
198 // Uses Gauss-Jordan elim. w/ row pivoting to invert from.  Stores the results
199 // in to and leaves from in an undefined state.
200 package void invert(double[][] from, double[][] to) {
201     // Normalize.
202     foreach(i, row; from) {
203         double absMax = 1.0 / reduce!(max)(map!(abs)(row[0..from.length]));
204         row[] *= absMax;
205         to[i][] = 0;
206         to[i][i] = absMax;
207     }
208
209     foreach(col; 0..from.length) {
210         size_t bestRow;
211         double biggest = 0;
212         foreach(row; col..from.length) {
213             if(abs(from[row][col]) > biggest) {
214                 bestRow = row;
215                 biggest = abs(from[row][col]);
216             }
217         }
218
219         swap(from[col], from[bestRow]);
220         swap(to[col], to[bestRow]);
221         immutable pivotFactor = from[col][col];
222
223         foreach(row; 0..from.length) if(row != col) {
224             immutable ratio = from[row][col] / pivotFactor;
225
226             // If you're ever looking to optimize this code, good luck.  The
227             // bottleneck is almost ENTIRELY this one line:
228             from[row][] -= from[col][] * ratio;
229             to[row][] -= to[col][] * ratio;
230         }
231     }
232
233     foreach(i; 0..from.length) {
234         immutable diagVal = from[i][i];
235         from[i][] /= diagVal;
236         to[i][] /= diagVal;
237     }
238 }
239
240 unittest {
241     auto mat = [[1.0, 2, 3], [4.0, 7, 6], [7.0, 8, 9]];
242     auto toMat = [new double[3], new double[3], new double[3]];
243     invert(mat, toMat);
244     assert(approxEqual(toMat[0], [-0.625, -0.25, 0.375]));
245     assert(approxEqual(toMat[1], [-0.25, 0.5, -0.25]));
246     assert(approxEqual(toMat[2], [0.708333, -0.25, 0.041667]));
247 }
248
249 // Solve a system of linear equations mat * b = vec for b.  The result is
250 // stored in vec, and mat is left in an undefined state. Uses Gaussian
251 // elimination w/ row pivoting.  Roughly 4x faster than using inversion to
252 // solve the system, and uses roughly half the memory.
253 private void solve(double[][] mat, double[] vec) {
254     // Normalize.
255     foreach(i, row; mat) {
256         double absMax = 1.0 / reduce!(max)(map!(abs)(row[0..mat.length]));
257         row[] *= absMax;
258         vec[i] *= absMax;
259     }
260
261     foreach(col; 0..mat.length - 1) {
262         size_t bestRow;
263         double biggest = 0;
264         foreach(row; col..mat.length) {
265             if(abs(mat[row][col]) > biggest) {
266                 bestRow = row;
267                 biggest = abs(mat[row][col]);
268             }
269         }
270
271         swap(mat[col], mat[bestRow]);
272         swap(vec[col], vec[bestRow]);
273         immutable pivotFactor = mat[col][col];
274
275         foreach(row; col + 1..mat.length) {
276             immutable ratio = mat[row][col] / pivotFactor;
277
278             // If you're ever looking to optimize this code, good luck.  The
279             // bottleneck is almost ENTIRELY this one line:
280             mat[row][col..$] -= mat[col][col..$] * ratio;
281             vec[row] -= vec[col] * ratio;
282         }
283     }
284
285     foreach(i; 0..mat.length) {
286         double diagVal = mat[i][i];
287         mat[i][] /= diagVal;
288         vec[i] /= diagVal;
289     }
290
291     // Do back substitution.
292     for(size_t row = mat.length - 1; row != size_t.max; row--) {
293         auto v1 = vec[row + 1..$];
294         auto v2 = mat[row][$ - v1.length..$];
295         vec[row] -= dotProduct(v1, v2);
296     }
297 }
298
299 unittest {
300     auto mat = [[2.0, 1, -1], [-3.0, -1, 2], [-2.0, 1, 2]];
301     auto vec = [8.0, -11, -3];
302     solve(mat, vec);
303     assert(approxEqual(vec, [2, 3, -1]));
304
305     auto mat2 = [[1.0, 2, 3], [4.0, 7, 6], [7.0, 8, 9]];
306     auto vec2 = [8.0, 6, 7];
307     solve(mat2, vec2);
308     assert(approxEqual(vec2, [-3.875, -0.75, 4.45833]));
309 }
310
311 // Cholesky decomposition functions adapted from Don Clugston's MathExtra
312 // lib, used w/ permission.
313 private void choleskyDecompose(double[][] a, double[] diag) {
314     immutable N = diag.length;
315
316     foreach(i; 0..N) {
317         const ai = a[i];
318         double sum = ai[i];
319
320         for(sizediff_t k = i - 1; k >= 0; --k) {
321             sum -= ai[k] * ai[k];
322         }
323
324         if (sum > 0.0) {
325             diag[i] = sqrt(sum);
326
327             foreach(j; i + 1..N) {
328                 sum = ai[j] - dotProduct(ai[0..i], a[j][0..i]);
329                 a[j][i] = sum / diag[i];
330             }
331         } else {
332             // not positive definite (could be caused by rounding errors)
333             diag[i] = 0;
334             // make this whole row zero so they have no further effect
335             foreach(j; i + 1..N) a[j][i] = 0;
336         }
337     }
338 }
339
340 private void choleskySolve(double[][] a, double[] diag, double[] b, double[] x) {
341     immutable N = x.length;
342
343     foreach(i; 0..N) {
344         const ai = a[i];
345
346         if(diag[i] > 0)  {
347             double sum = b[i];
348             sum -= dotProduct(ai[0..i], x[0..i]);
349             x[i] = sum / diag[i];
350         } else x[i] = 0; // skip pos definite rows
351     }
352
353     for(sizediff_t i = N - 1; i >= 0; --i) {
354         if (diag[i] > 0) {
355             double sum = x[i];
356             for(sizediff_t k = i + 1; k < N; ++k) sum -= a[k][i] * x[k];
357             x[i] = sum / diag[i];
358         } else x[i] = 0; // skip pos definite rows
359     }
360     // Convert failed columns in solution to NANs if required.
361     foreach(i; 0..N) {
362         if(diag[i] !> 0) x[i] = double.nan;
363     }
364 }
365
366 private void choleskySolve(double[][] a, double[] b, double[] x) {
367     mixin(newFrame);
368     auto diag = newStack!double(x.length);
369     choleskyDecompose(a, diag);
370     choleskySolve(a, diag, b, x);
371 }
372
373 /**Struct that holds the results of a linear regression.  It's a plain old
374  * data struct.*/
375 struct RegressRes {
376     /**The coefficients, one for each range in X.  These will be in the order
377      * that the X ranges were passed in.*/
378     double[] betas;
379
380     /**The standard error terms of the X ranges passed in.*/
381     double[] stdErr;
382
383     /**The lower confidence bounds of the beta terms, at the confidence level
384      * specificied.  (Default 0.95).*/
385     double[] lowerBound;
386
387     /**The upper confidence bounds of the beta terms, at the confidence level
388      * specificied.  (Default 0.95).*/
389     double[] upperBound;
390
391     /**The P-value for the alternative that the corresponding beta value is
392      * different from zero against the null that it is equal to zero.*/
393     double[] p;
394
395     /**The coefficient of determination.*/
396     double R2;
397
398     /**The adjusted coefficient of determination.*/
399     double adjustedR2;
400
401     /**The root mean square of the residuals.*/
402     double residualError;
403
404     /**The P-value for the model as a whole.  Based on an F-statistic.  The
405      * null here is that the model has no predictive value, the alternative
406      * is that it does.*/
407     double overallP;
408
409     // Just used internally.
410     private static string arrStr(T)(T arr) {
411         return text(arr)[1..$ - 1];
412     }
413
414     /**Print out the results in the default format.*/
415     string toString() {
416         return "Betas:  " ~ arrStr(betas) ~ "\nLower Conf. Int.:  " ~
417             arrStr(lowerBound) ~ "\nUpper Conf. Int.:  " ~ arrStr(upperBound) ~
418             "\nStd. Err:  " ~ arrStr(stdErr) ~ "\nP Values:  " ~ arrStr(p) ~
419             "\nR^2:  " ~ text(R2) ~
420             "\nAdjusted R^2:  " ~ text(adjustedR2) ~
421             "\nStd. Residual Error:  " ~ text(residualError)
422             ~ "\nOverall P:  " ~ text(overallP);
423     }
424 }
425
426 /**Forward Range for holding the residuals from a regression analysis.*/
427 struct Residuals(F, U, T...) {
428     static if(T.length == 1 && isForwardRange!(typeof(T[0].front()))) {
429         alias T[0] R;
430         alias typeof(array(R.init)) XType;
431         enum bool needDup = true;
432     } else {
433         alias T R;
434         alias staticMap!(Unqual, R) XType;
435         enum bool needDup = false;
436     }
437
438     Unqual!U Y;
439     XType X;
440     F[] betas;
441     double residual;
442     bool _empty;
443
444     void nextResidual() {
445         double sum = 0;
446         size_t i = 0;
447         foreach(elem; X) {
448             double frnt = elem.front;
449             sum += frnt * betas[i];
450             i++;
451         }
452         residual = Y.front - sum;
453     }
454
455     this(F[] betas, U Y, R X) {
456         static if(is(typeof(X.length))) {
457             dstatsEnforce(X.length == betas.length,
458                 "Betas and X must have same length for residuals.");
459         } else {
460             dstatsEnforce(walkLength(X) == betas.length,
461                 "Betas and X must have same length for residuals.");
462         }
463
464         static if(needDup) {
465             this.X = array(X);
466         } else {
467             this.X = X;
468         }
469
470         foreach(i, elem; this.X) {
471             static if(isForwardRange!(typeof(elem))) {
472                 this.X[i] = this.X[i].save;
473             }
474         }
475
476         this.Y = Y;
477         this.betas = betas;
478         if(Y.empty) {
479             _empty = true;
480             return;
481         }
482         foreach(elem; X) {
483             if(elem.empty) {
484                 _empty = true;
485                 return;
486             }
487         }
488         nextResidual;
489     }
490
491     @property double front() const pure nothrow {
492         return residual;
493     }
494
495     void popFront() {
496         Y.popFront;
497         if(Y.empty) {
498             _empty = true;
499             return;
500         }
501         foreach(ti, elem; X) {
502             X[ti].popFront;
503             if(X[ti].empty) {
504                 _empty = true;
505                 return;
506             }
507         }
508         nextResidual;
509     }
510
511     @property bool empty() const pure nothrow {
512         return _empty;
513     }
514
515     @property typeof(this) save() {
516         auto ret = this;
517         ret.Y = ret.Y.save;
518         foreach(ti, xElem; ret.X) {
519             ret.X[ti] = ret.X[ti].save;
520         }
521
522         return ret;
523     }
524 }
525
526 /**Given the beta coefficients from a linear regression, and X and Y values,
527  * returns a range that lazily computes the residuals.
528  */
529 Residuals!(F, U, T) residuals(F, U, T...)(F[] betas, U Y, T X)
530 if(isFloatingPoint!F && isForwardRange!U && allSatisfy!(isForwardRange, T)) {
531     alias Residuals!(F, U, T) RT;
532     return RT(betas, Y, X);
533 }
534
535 // Compiles summary statistics while iterating, to allow ridge regression over
536 // input ranges.
537 private struct SummaryIter(R) {
538     R range;
539     MeanSD summ;
540
541     this(R range) {
542         this.range = range;
543     }
544
545     double front() @property {
546         return range.front;
547     }
548
549     void popFront() {
550         summ.put(range.front);
551         range.popFront();
552     }
553
554     bool empty() @property {
555         return range.empty;
556     }
557
558     double mse() @property const pure nothrow { return summ.mse; }
559
560     double N() @property const pure nothrow { return summ.N; }
561 }
562
563 private template SummaryType(R) {
564     alias SummaryIter!R SummaryType;
565 }
566
567 /**
568 Perform a linear regression and return just the beta values.  The advantages
569 to just returning the beta values are that it's faster and that each range
570 needs to be iterated over only once, and thus can be just an input range.
571 The beta values are returned such that the smallest index corresponds to
572 the leftmost element of X.  X can be either a tuple or a range of input
573 ranges.  Y must be an input range.
574
575 If, after all X variables are passed in, a numeric type is passed as the last
576 parameter, this is treated as a ridge parameter and ridge regression is
577 performed.  Ridge regression is a form of regression that penalizes the L2 norm
578 of the beta vector and therefore results in more parsimonious models.
579 However, it makes statistical inference such as that supported by
580 linearRegress() difficult to impossible.  Therefore, linearRegress() doesn't
581 support ridges.
582
583 If no ridge parameter is passed, or equivalently if the ridge parameter is
584 zero, then ordinary least squares regression is performed.
585
586 Notes:  The X ranges are traversed in lockstep, but the traversal is stopped
587 at the end of the shortest one.  Therefore, using infinite ranges is safe.
588 For example, using repeat(1) to get an intercept term works.
589
590 References:
591
592 http://www.mathworks.com/help/toolbox/stats/ridge.html
593
594 Venables, W. N. & Ripley, B. D. (2002) Modern Applied Statistics with S.
595 Fourth Edition. Springer, New York. ISBN 0-387-95457-0
596 (This is the citation for the MASS R package.)
597
598 Examples:
599 ---
600 int[] nBeers = [8,6,7,5,3,0,9];
601 int[] nCoffees = [3,6,2,4,3,6,8];
602 int[] musicVolume = [3,1,4,1,5,9,2];
603 int[] programmingSkill = [2,7,1,8,2,8,1];
604 double[] betas = linearRegressBeta(programmingSkill, repeat(1), nBeers, nCoffees,
605     musicVolume, map!"a * a"(musicVolume));
606
607 // Now throw in a ridge parameter of 2.5.
608 double[] ridgeBetas = linearRegressBeta(programmingSkill, repeat(1), nBeers,
609     nCoffees, musicVolume, map!"a * a"(musicVolume), 2.5);
610 ---
611  */
612 double[] linearRegressBeta(U, T...)(U Y, T XIn)
613 if(doubleInput!(U)) {
614     double[] dummy;
615     return linearRegressBetaBuf!(U, T)(dummy, Y, XIn);
616 }
617
618 /**
619 Same as linearRegressBeta, but allows the user to specify a buffer for
620 the beta terms.  If the buffer is too short, a new one is allocated.
621 Otherwise, the results are returned in the user-provided buffer.
622  */
623 double[] linearRegressBetaBuf(U, TRidge...)(double[] buf, U Y, TRidge XRidge)
624 if(doubleInput!(U)) {
625     mixin(newFrame);
626
627     static if(isFloatingPoint!(TRidge[$ - 1]) || isIntegral!(TRidge[$ - 1])) {
628         // ridge param.
629         alias XRidge[$ - 1] ridgeParam;
630         alias TRidge[0..$ - 1] T;
631         alias XRidge[0..$ - 1] XIn;
632         enum bool ridge = true;
633         dstatsEnforce(ridgeParam >= 0,
634             "Cannot do ridge regerssion with ridge param <= 0.");
635
636         static SummaryIter!R summaryIter(R)(R range) {
637             return typeof(return)(range);
638         }
639
640     } else {
641         enum bool ridge = false;
642         enum ridgeParam = 0;
643         alias TRidge T;
644         alias XRidge XIn;
645
646         static R summaryIter(R)(R range) {
647             return range;
648         }
649     }
650
651     static if(isArray!(T[0]) && isInputRange!(typeof(XIn[0][0])) &&
652         T.length == 1) {
653         auto X = tempdup(map!(summaryIter)(XIn[0]));
654         alias typeof(X[0]) E;
655     } else {
656         static if(ridge) {
657             alias staticMap!(SummaryType, T) XType;
658             XType X;
659
660             foreach(ti, elem; XIn) {
661                 X[ti] = summaryIter(elem);
662             }
663         } else {
664             alias XIn X;
665         }
666     }
667
668     double[][] xTx;
669     double[] xTy = newStack!double(X.length);
670     double[] ret;
671     if(buf.length < X.length) {
672         ret = new double[X.length];
673     } else {
674         ret = buf[0..X.length];
675     }
676
677     rangeMatrixMulTrans(xTy, xTx, Y, X);
678
679     static if(ridge) {
680         if(ridgeParam > 0) {
681             foreach(i, range; X) {
682                 // The whole matrix is scaled by N, as well as the xTy vector,
683                 // so scale the ridge param similarly.
684                 xTx[i][i] += ridgeParam * range.mse / range.N;
685             }
686         }
687     }
688
689     choleskySolve(xTx, xTy, ret);
690     return ret;
691 }
692
693 /**
694 Perform a linear regression as in linearRegressBeta, but return a
695 RegressRes with useful stuff for statistical inference.  If the last element
696 of input is a real, this is used to specify the confidence intervals to
697 be calculated.  Otherwise, the default of 0.95 is used.  The rest of input
698 should be the elements of X.
699
700 When using this function, which provides several useful statistics useful
701 for inference, each range must be traversed twice.  This means:
702
703 1.  They have to be forward ranges, not input ranges.
704
705 2.  If you have a large amount of data and you're mapping it to some
706     expensive function, you may want to do this eagerly instead of lazily.
707
708 Notes:
709
710 The X ranges are traversed in lockstep, but the traversal is stopped
711 at the end of the shortest one.  Therefore, using infinite ranges is safe.
712 For example, using repeat(1) to get an intercept term works.
713
714 If the confidence interval specified is exactly 0, this is treated as a
715 special case and confidence interval calculation is skipped.  This can speed
716 things up significantly and therefore can be useful in monte carlo and possibly
717 data mining contexts.
718
719 Bugs:  The statistical tests performed in this function assume that an
720 intercept term is included in your regression model.  If no intercept term
721 is included, the P-values, confidence intervals and adjusted R^2 values
722 calculated by this function will be wrong.
723
724 Examples:
725 ---
726 int[] nBeers = [8,6,7,5,3,0,9];
727 int[] nCoffees = [3,6,2,4,3,6,8];
728 int[] musicVolume = [3,1,4,1,5,9,2];
729 int[] programmingSkill = [2,7,1,8,2,8,1];
730
731 // Using default confidence interval:
732 auto results = linearRegress(programmingSkill, repeat(1), nBeers, nCoffees,
733     musicVolume, map!"a * a"(musicVolume));
734
735 // Using user-specified confidence interval:
736 auto results = linearRegress(programmingSkill, repeat(1), nBeers, nCoffees,
737     musicVolume, map!"a * a"(musicVolume), 0.8675309);
738 ---
739 */
740 RegressRes linearRegress(U, TC...)(U Y, TC input) {
741     static if(is(TC[$ - 1] : double)) {
742         double confLvl = input[$ - 1];
743         enforceConfidence(confLvl);
744         alias TC[0..$ - 1] T;
745         alias input[0..$ - 1] XIn;
746     } else {
747         double confLvl = 0.95; // Default;
748         alias TC T;
749         alias input XIn;
750     }
751
752     mixin(newFrame);
753     static if(isForwardRange!(T[0]) && isForwardRange!(typeof(XIn[0].front())) &&
754         T.length == 1) {
755
756         enum bool arrayX = true;
757         alias typeof(XIn[0].front) E;
758         E[] X = tempdup(XIn[0]);
759     } else static if(allSatisfy!(isForwardRange, T)) {
760         enum bool arrayX = false;
761         alias XIn X;
762     } else {
763         static assert(0, "Linear regression can only be performed with " ~
764             "tuples of forward ranges or ranges of forward ranges.");
765     }
766
767     double[][] xTx = newStack!(double[])(X.length),
768         xTxNeg1 = newStack!(double[])(X.length);
769
770     foreach(i; 0..X.length) {
771         xTx[i] = newStack!double(X.length);
772     }
773
774     double[] xTy = newStack!double(X.length);
775
776     foreach(i; 0..X.length) {
777         xTxNeg1[i] = newStack!double(X.length);
778     }
779
780     static if(arrayX) {
781         auto xSaved = X.tempdup;
782         foreach(ref elem; xSaved) {
783             elem = elem.save;
784         }
785     } else {
786         auto xSaved = X;
787         foreach(ti, Type; X) {
788             xSaved[ti] = X[ti].save;
789         }
790     }
791
792     rangeMatrixMulTrans(xTy, xTx, Y.save, X);
793     invert(xTx, xTxNeg1);
794     double[] betas = new double[X.length];
795     foreach(i; 0..betas.length) {
796         betas[i] = 0;
797         foreach(j; 0..betas.length) {
798             betas[i] += xTxNeg1[i][j] * xTy[j];
799         }
800     }
801
802     X = xSaved;
803     auto residuals = residuals(betas, Y, X);
804     double S = 0;
805     ulong n = 0;
806     PearsonCor R2Calc;
807     for(; !residuals.empty; residuals.popFront) {
808         double residual = residuals.front;
809         S += residual * residual;
810         double Yfront = residuals.Y.front;
811         double predicted = Yfront - residual;
812         R2Calc.put(predicted, Yfront);
813         n++;
814     }
815     immutable ulong df =  n - X.length;
816     immutable double R2 = R2Calc.cor ^^ 2;
817     immutable double adjustedR2 = 1.0L - (1.0L - R2) * ((n - 1.0L) / df);
818
819     immutable double sigma2 = S / (n - X.length);
820
821     double[] stdErr = new double[betas.length];
822     foreach(i, ref elem; stdErr) {
823         elem = sqrt( S * xTxNeg1[i][i] / df / n);
824     }
825
826     double[] lowerBound, upperBound;
827     if(confLvl == 0) {
828         // Then we're going to skip the computation to save time.  (See below.)
829         lowerBound = betas;
830         upperBound = betas;
831     } else {
832         lowerBound = new double[betas.length];
833         upperBound = new double[betas.length];
834     }
835     auto p = new double[betas.length];
836
837     foreach(i, beta; betas) {
838         try {
839             p[i] = 2 * min(studentsTCDF(beta / stdErr[i], df),
840                            studentsTCDFR(beta / stdErr[i], df));
841         } catch(DstatsArgumentException) {
842             // Leave it as a NaN.
843         }
844
845         if(confLvl > 0) {
846             // Skip confidence level computation if level is zero, to save
847             // computation time.  This is important in monte carlo and possibly
848             // data mining contexts.
849             try {
850                 double delta = invStudentsTCDF(0.5 * (1 - confLvl), df) *
851                      stdErr[i];
852                 upperBound[i] = beta - delta;
853                 lowerBound[i] = beta + delta;
854             } catch(DstatsArgumentException) {
855                 // Leave confidence bounds as NaNs.
856             }
857         }
858     }
859
860     double F = (R2 / (X.length - 1)) / ((1 - R2) / (n - X.length));
861     double overallP;
862     try {
863         overallP = fisherCDFR(F, X.length - 1, n - X.length);
864     } catch(DstatsArgumentException) {
865         // Leave it as a NaN.
866     }
867
868     return RegressRes(betas, stdErr, lowerBound, upperBound, p, R2,
869         adjustedR2, sqrt(sigma2), overallP);
870 }
871
872
873 /**Struct returned by polyFit.*/
874 struct PolyFitRes(T) {
875
876     /**The array of PowMap ranges created by polyFit.*/
877     T X;
878
879     /**The rest of the results.  This is alias this'd.*/
880     RegressRes regressRes;
881     alias regressRes this;
882 }
883
884 /**Convenience function that takes a forward range X and a forward range Y,
885  * creates an array of PowMap structs for integer powers from 0 through N,
886  * and calls linearRegressBeta.
887  *
888  * Returns:  An array of doubles.  The index of each element corresponds to
889  * the exponent.  For example, the X<sup>2</sup> term will have an index of
890  * 2.
891  */
892 double[] polyFitBeta(T, U)(U Y, T X, uint N, double ridge = 0) {
893     double[] dummy;
894     return polyFitBetaBuf!(T, U)(dummy, Y, X, N);
895 }
896
897 /**Same as polyFitBeta, but allows the caller to provide an explicit buffer
898  * to return the coefficients in.  If it's too short, a new one will be
899  * allocated.  Otherwise, results will be returned in the user-provided buffer.
900  */
901 double[] polyFitBetaBuf(T, U)(double[] buf, U Y, T X, uint N, double ridge = 0) {
902     mixin(newFrame);
903     auto pows = newStack!(PowMap!(uint, T))(N + 1);
904     foreach(exponent; 0..N + 1) {
905         pows[exponent] = powMap(X, exponent);
906     }
907
908     if(ridge == 0) {
909         return linearRegressBetaBuf(buf, Y, pows);
910     } else {
911         return linearRegressBetaBuf(buf, Y, pows, ridge);
912     }
913 }
914
915 /**Convenience function that takes a forward range X and a forward range Y,
916  * creates an array of PowMap structs for integer powers 0 through N,
917  * and calls linearRegress.
918  *
919  * Returns:  A PolyFitRes containing the array of PowMap structs created and
920  * a RegressRes.  The PolyFitRes is alias this'd to the RegressRes.*/
921 PolyFitRes!(PowMap!(uint, T)[])
922 polyFit(T, U)(U Y, T X, uint N, double confInt = 0.95) {
923     enforceConfidence(confInt);
924     auto pows = new PowMap!(uint, T)[N + 1];
925     foreach(exponent; 0..N + 1) {
926         pows[exponent] = powMap(X, exponent);
927     }
928     alias PolyFitRes!(typeof(pows)) RT;
929     RT ret;
930     ret.X = pows;
931     ret.regressRes = linearRegress(Y, pows, confInt);
932     return ret;
933 }
934
935 unittest {
936     // These are a bunch of values gleaned from various examples on the Web.
937     double[] heights = [1.47,1.5,1.52,1.55,1.57,1.60,1.63,1.65,1.68,1.7,1.73,1.75,
938         1.78,1.8,1.83];
939     double[] weights = [52.21,53.12,54.48,55.84,57.2,58.57,59.93,61.29,63.11,64.47,
940         66.28,68.1,69.92,72.19,74.46];
941     float[] diseaseSev = [1.9, 3.1, 3.3, 4.8, 5.3, 6.1, 6.4, 7.6, 9.8, 12.4];
942     ubyte[] temperature = [2,1,5,5,20,20,23,10,30,25];
943
944     // Values from R.
945     auto res1 = polyFit(diseaseSev, temperature, 1);
946     assert(approxEqual(res1.betas[0], 2.6623));
947     assert(approxEqual(res1.betas[1], 0.2417));
948     assert(approxEqual(res1.stdErr[0], 1.1008));
949     assert(approxEqual(res1.stdErr[1], 0.0635));
950     assert(approxEqual(res1.p[0], 0.0419));
951     assert(approxEqual(res1.p[1], 0.0052));
952     assert(approxEqual(res1.R2, 0.644));
953     assert(approxEqual(res1.adjustedR2, 0.6001));
954     assert(approxEqual(res1.residualError, 2.03));
955     assert(approxEqual(res1.overallP, 0.00518));
956
957
958     auto res2 = polyFit(weights, heights, 2);
959     assert(approxEqual(res2.betas[0], 128.813));
960     assert(approxEqual(res2.betas[1], -143.162));
961     assert(approxEqual(res2.betas[2], 61.960));
962
963     assert(approxEqual(res2.stdErr[0], 16.308));
964     assert(approxEqual(res2.stdErr[1], 19.833));
965     assert(approxEqual(res2.stdErr[2], 6.008));
966
967     assert(approxEqual(res2.p[0], 4.28e-6));
968     assert(approxEqual(res2.p[1], 1.06e-5));
969     assert(approxEqual(res2.p[2], 2.57e-7));
970
971     assert(approxEqual(res2.R2, 0.9989, 0.0001));
972     assert(approxEqual(res2.adjustedR2, 0.9987, 0.0001));
973
974     assert(approxEqual(res2.lowerBound[0], 92.9, 0.01));
975     assert(approxEqual(res2.lowerBound[1], -186.8, 0.01));
976     assert(approxEqual(res2.lowerBound[2], 48.7, 0.01));
977     assert(approxEqual(res2.upperBound[0], 164.7, 0.01));
978     assert(approxEqual(res2.upperBound[1], -99.5, 0.01));
979     assert(approxEqual(res2.upperBound[2], 75.2, 0.01));
980
981     auto res3 = linearRegress(weights, repeat(1), heights, map!"a * a"(heights));
982     assert(res2.betas == res3.betas);
983
984     double[2] beta1Buf;
985     auto beta1 = linearRegressBetaBuf
986         (beta1Buf[], diseaseSev, repeat(1), temperature);
987     assert(beta1Buf.ptr == beta1.ptr);
988     assert(beta1Buf[] == beta1[]);
989     assert(approxEqual(beta1, res1.betas));
990     auto beta2 = polyFitBeta(weights, heights, 2);
991     assert(approxEqual(beta2, res2.betas));
992
993     auto res4 = linearRegress(weights, repeat(1), heights);
994     assert(approxEqual(res4.p, 3.604e-14));
995     assert(approxEqual(res4.betas, [-39.062, 61.272]));
996     assert(approxEqual(res4.p, [6.05e-9, 3.60e-14]));
997     assert(approxEqual(res4.R2, 0.9892));
998     assert(approxEqual(res4.adjustedR2, 0.9884));
999     assert(approxEqual(res4.residualError, 0.7591));
1000     assert(approxEqual(res4.lowerBound, [-45.40912, 57.43554]));
1001     assert(approxEqual(res4.upperBound, [-32.71479, 65.10883]));
1002
1003     // Test residuals.
1004     assert(approxEqual(residuals(res4.betas, weights, repeat(1), heights),
1005         [1.20184170, 0.27367611,  0.40823237, -0.06993322,  0.06462305,
1006          -0.40354255, -0.88170814,  -0.74715188, -0.76531747, -0.63076120,
1007          -0.65892680, -0.06437053, -0.08253613,  0.96202014,  1.39385455]));
1008
1009     // Test nonzero ridge parameters.
1010         // Values from R's MASS package.
1011     auto a = [1, 2, 3, 4, 5, 6, 7];
1012     auto b = [8, 6, 7, 5, 3, 0, 9];
1013     auto c = [2, 7, 1, 8, 2, 8, 1];
1014
1015     // With a ridge param. of zero, ridge regression reduces to regular
1016     // OLS regression.
1017     assert(approxEqual(linearRegressBeta(a, repeat(1), b, c, 0),
1018         linearRegressBeta(a, repeat(1), b, c)));
1019
1020     // Test the ridge regression. Values from R MASS package.
1021     auto ridge1 = linearRegressBeta(a, repeat(1), b, c, 1);
1022     auto ridge2 = linearRegressBeta(a, repeat(1), b, c, 2);
1023     auto ridge3 = linearRegressBeta(c, [[1,1,1,1,1,1,1], a, b], 10);
1024     assert(approxEqual(ridge1, [6.0357757, -0.2729671, -0.1337131]));
1025     assert(approxEqual(ridge2, [5.62367784, -0.22449854, -0.09775174]));
1026     assert(approxEqual(ridge3, [5.82653624, -0.05197246, -0.27185592 ]));
1027 }
1028
1029 private MeanSD[] calculateSummaries(X...)(X xIn) {
1030     // This is slightly wasteful because it sticks this shallow dup in
1031     // an unfreeable pos on TempAlloc.
1032     static if(X.length == 1 && isRoR!(X[0])) {
1033         auto ret = newStack!MeanSD(xIn[0].length);
1034         mixin(newFrame);
1035         auto x = tempdup(xIn[0]);
1036
1037         foreach(ref range; x) {
1038             range = range.save;
1039         }
1040     } else {
1041         auto ret = newStack!MeanSD(xIn.length);
1042         alias xIn x;
1043
1044         foreach(ti, R; X) {
1045             x[ti] = x[ti].save;
1046         }
1047     }
1048
1049     ret[] = MeanSD.init;
1050
1051     bool someEmpty() {
1052         foreach(range; x) {
1053             if(range.empty) return true;
1054         }
1055
1056         return false;
1057     }
1058
1059     void popAll() {
1060         foreach(ti, elem; x) {
1061             x[ti].popFront();
1062         }
1063     }
1064
1065     while(!someEmpty) {
1066         foreach(i, range; x) {
1067             ret[i].put(range.front);
1068         }
1069         popAll();
1070     }
1071
1072     return ret;
1073 }
1074
1075 private double softThresh(double z, double gamma) {
1076     if(gamma >= abs(z)) {
1077         return 0;
1078     } else if(z > 0) {
1079         return z - gamma;
1080     } else {
1081         return z + gamma;
1082     }
1083 }
1084
1085 private struct PreprocessedData {
1086     MeanSD[] xSumm;
1087     MeanSD ySumm;
1088     double[] y;
1089     double[][] x;
1090 }
1091
1092 private PreprocessedData preprocessStandardize(Y, X...)
1093 (Y yIn, X xIn) {
1094     static if(X.length == 1 && isRoR!(X[0])) {
1095         auto xRaw = tempdup(xIn[0]);
1096     } else {
1097         alias xIn xRaw;
1098     }
1099
1100     auto summaries = calculateSummaries(xRaw);
1101     immutable minLen = to!size_t(
1102         reduce!min(
1103             map!"a.N"(summaries)
1104         )
1105     );
1106
1107     auto x = newStack!(double[])(summaries.length);
1108     foreach(i, range; xRaw) {
1109         x[i] = tempdup(map!(to!double)(take(range, minLen)));
1110         x[i][] -= summaries[i].mean;
1111         x[i][] /= sqrt(summaries[i].mse);
1112     }
1113
1114     double[] y;
1115     MeanSD ySumm;
1116     if(yIn.length) {
1117         y = tempdup(map!(to!double)(take(yIn, minLen)));
1118         ySumm = meanStdev(y);
1119         y[] -= ySumm.mean;
1120     }
1121
1122     return PreprocessedData(summaries, ySumm, y, x);
1123 }
1124
1125 /**
1126 Performs lasso (L1) and/or ridge (L2) penalized linear regression.  Due to the
1127 way the data is standardized, no intercept term should be included in x
1128 (unlike linearRegress and linearRegressBeta).  The intercept coefficient is
1129 implicitly included and returned in the first element of the returned array.
1130 Usage is otherwise identical.
1131
1132 Note:  Setting lasso equal to zero is equivalent to performing ridge regression.
1133        This can also be done with linearRegressBeta.  However, the
1134        linearRegressBeta algorithm is optimized for memory efficiency and
1135        large samples.  This algorithm is optimized for large feature sets.
1136
1137 Returns:  The beta coefficients for the regression model.
1138
1139 References:
1140
1141 Friedman J, et al Pathwise coordinate optimization. Ann. Appl. Stat.
1142 2007;2:302-332.
1143
1144 Goeman, J. J., L1 penalized estimation in the Cox proportional hazards model.
1145 Biometrical Journal 52(1), 70{84.
1146
1147 Eilers, P., Boer, J., Van Ommen, G., Van Houwelingen, H. 2001 Classification of
1148 microarray data with penalized logistic regression. Proceedings of SPIE.
1149 Progress in Biomedical Optics and Images vol. 4266, pp. 187-198
1150 */
1151 double[] linearRegressPenalized(Y, X...)
1152 (Y yIn, X xIn, double lasso, double ridge) {
1153     mixin(newFrame);
1154
1155     auto preproc = preprocessStandardize(yIn, xIn);
1156
1157     auto summaries = preproc.xSumm;
1158     auto ySumm = preproc.ySumm;
1159     auto x = preproc.x;
1160     auto y = preproc.y;
1161
1162     auto betasFull = new double[x.length + 1];
1163     betasFull[] = 0;
1164     auto betas = betasFull[1..$];
1165
1166     if(lasso > 0) {
1167         coordDescent(y, x, betas, lasso, ridge, null);
1168     } else if(y.length > x.length) {
1169         // Correct for different )#*$# scaling conventions.
1170         foreach(i, feature; x) {
1171             feature[] /= sqrt(summaries[i].mse);
1172         }
1173
1174         linearRegressBetaBuf(betas, y, x, ridge);
1175
1176         // More correction for diff. scaling factors.
1177         foreach(i, ref b; betas) {
1178             b /= sqrt(summaries[i].mse);
1179         }
1180
1181     } else {
1182         ridgeLargeP(y, x, ridge, betas, null);
1183     }
1184
1185     foreach(i, ref elem; betas) {
1186         elem /= sqrt(summaries[i].mse);
1187     }
1188
1189     betasFull[0] = ySumm.mean;
1190     foreach(i, beta; betas) {
1191         betasFull[0] -= beta * summaries[i].mean;
1192     }
1193
1194     return betasFull;
1195 }
1196
1197 private void coordDescent
1198 (double[] y, double[][] x, double[] betas, double lasso, double ridge, double[] w) {
1199     mixin(newFrame);
1200
1201     auto predictions = newStack!double(y.length);
1202     predictions[] = 0;
1203
1204     void makePredictions() {
1205         foreach(j, beta; betas) {
1206             predictions[] += x[j][] * beta;
1207         }
1208     }
1209
1210     if(reduce!max(0.0, map!abs(betas)) > 0) {
1211         makePredictions();
1212     }
1213
1214     auto residuals = newStack!double(y.length);
1215
1216     uint iter = 0;
1217     enum maxIter = 10_000;
1218     enum relEpsilon = 1e-5;
1219     enum absEpsilon = 1e-10;
1220     immutable n = cast(double) y.length;
1221     auto perm = tempdup(iota(0U, x.length));
1222
1223     auto weightDots = newStack!double(x.length);
1224     if(w.length == 0) {
1225         ridge /= n;
1226         lasso /= n;
1227         weightDots[] = 1;
1228     } else {
1229         foreach(j, col; x) {
1230             weightDots[j] = dotProduct(w, map!"a * a"(col));
1231         }
1232     }
1233
1234     double doIter(double[] betas, double[][] x, double mul) {
1235 //        stderr.writeln("ITER:  ", betas, '\t', predictions);
1236         double maxRelError = 0;
1237         foreach(j, ref b; betas) {
1238             if(b == 0) {
1239                 residuals[] = (-predictions[] + y[]);
1240             } else {
1241                 residuals[] = (-predictions[] + x[j][] * b + y[]);
1242                 predictions[] -= x[j][] * b;
1243             }
1244
1245             double z;
1246             if(w.length) {
1247                 z = 0;
1248                 foreach(i, weight; w) {
1249                     z += weight * x[j][i] * residuals[i];
1250                 }
1251             } else {
1252                 residuals[] /= n;
1253                 z = dotProduct(residuals, x[j]);
1254             }
1255
1256             auto newB = softThresh(z, lasso * mul) /
1257                 (weightDots[j] + ridge * mul);
1258
1259             immutable absErr = abs(b - newB);
1260             immutable err = abs(b - newB) / max(abs(b), abs(newB));
1261
1262             if(absErr > absEpsilon) {
1263                 maxRelError = max(maxRelError, err);
1264             }
1265
1266             b = newB;
1267             if(b != 0) {
1268                 predictions[] += x[j][] * b;
1269             }
1270         }
1271
1272         return maxRelError;
1273     }
1274
1275     void toConvergence(double mul) {
1276         double maxRelErr = doIter(betas, x, mul);
1277         iter++;
1278         if(maxRelErr < relEpsilon) return;
1279
1280         static bool absGreater(double x, double y) { return abs(x) > abs(y); }
1281
1282         while(iter < maxIter) {
1283             size_t split = 0;
1284             while(split < betas.length && abs(betas[split]) > 0) {
1285                 split++;
1286             }
1287
1288             try {
1289                 qsort!absGreater(betas, x, perm, weightDots);
1290             } catch(SortException) {
1291                 betas[] = double.nan;
1292                 break;
1293             }
1294
1295             maxRelErr = double.infinity;
1296             for(; !(maxRelErr < relEpsilon) && split < betas.length
1297             && iter < maxIter; iter++) {
1298                 maxRelErr = doIter(betas[0..split], x[0..split], mul);
1299             }
1300
1301
1302             maxRelErr = doIter(betas, x, mul);
1303             iter++;
1304             if(maxRelErr < relEpsilon) break;
1305         }
1306
1307         //stderr.writefln("Converged in %s iterations for %s mult.", iter, mul);
1308     }
1309
1310     toConvergence(1);
1311     try {
1312         qsort(perm, x, betas);
1313     } catch(SortException) {
1314         return;
1315     }
1316 }
1317
1318 // Compute(X X')' = C.
1319 double[][] makeC(double[][] x) {
1320     if(x.length == 0) return null;
1321     immutable n = x[0].length;
1322     auto c = newStack!(double[])(n);
1323
1324     // Only need lower half.
1325     foreach(i, ref elem; c) {
1326         elem = newStack!double(i + 1);
1327         elem[] = 0;
1328     }
1329
1330     foreach(col; x) {
1331         foreach(i; 0..n) {
1332             foreach(j; i..n) {
1333                 c[j][i] += col[i] * col[j];
1334             }
1335         }
1336     }
1337
1338     return c;
1339 }
1340
1341 private double[][] doCTWC(double[][] c, double[] w = null) {
1342     auto ret = newStack!(double[])(c.length);
1343     foreach(i, ref elem; ret) elem = newStack!double(c.length);
1344
1345     double getElem(size_t i, size_t j) {
1346         return (i >= j) ? c[i][j] : c[j][i];
1347     }
1348
1349     foreach(i; 0..c.length) foreach(j; 0..i + 1) {
1350         ret[i][j] = 0;
1351
1352         foreach(k; 0..c.length) {
1353             auto toAdd = getElem(i, k) * getElem(j, k);
1354             if(w.length) toAdd *= w[k];
1355             ret[i][j] += toAdd;
1356         }
1357     }
1358
1359     symmetrize(ret);
1360     return ret;
1361 }
1362
1363 // An implementation of ridge regression for large dimension.
1364 private void ridgeLargeP
1365 (double[] yIn, double[][] x, double lambdaIn, double[] betas, double[] w = null) {
1366     {
1367         mixin(newFrame);
1368         auto y = tempdup(yIn);
1369         auto c = makeC(x);
1370
1371         // This scaling preserves numerical stability when the distrib. of
1372         // x has a long tail.
1373         auto maxElem = reduce!max(0.0, map!abs(joiner(c)));
1374         foreach(col; c) col[] /= maxElem;
1375         y[] /= (maxElem);
1376         auto lambda = lambdaIn / maxElem;
1377         auto cwc = doCTWC(c, w);
1378
1379         foreach(i, row; c) foreach(j, elem; row) {
1380             cwc[j][i] += lambda * elem;
1381             if(i != j) cwc[i][j] += lambda * elem;
1382         }
1383
1384         // Multiply c' * y.  c is symmetric, so it doesn't matter that I'm really
1385         // multiplying the transpose.
1386         if(w.length) y[] *= w[];
1387         auto cTy = newStack!double(y.length);
1388         cTy[] = 0;
1389
1390         foreach(i; 0..c.length) foreach(j; 0..c.length) {
1391             if(i >= j) {
1392                 cTy[j] += y[i] * c[i][j];
1393             } else {
1394                 cTy[j] += y[i] * c[j][i];
1395             }
1396         }
1397
1398         solve(cwc, cTy);
1399
1400         // Multiply X' * csi to get answer.
1401         betas[] = 0;
1402         foreach(i, col; x) {
1403             betas[i] = dotProduct(cTy, col);
1404         }
1405     }
1406
1407     static bool notFinite(double num) { return !isFinite(num); }
1408
1409     // In a few pathological cases this algo ends up singular to machine
1410     // precision, and NaNs or infs up in betas.  Fall back to solving the
1411     // normal equations directly.
1412     if(filter!notFinite(betas).empty) return;
1413     version(unittest) stderr.writeln("SINGULAR");
1414     ridgeFallback(yIn, x, lambdaIn, betas, w);
1415 }
1416
1417 // Used if we end up singular in ridgeLargeP.
1418 private void ridgeFallback
1419 (double[] yIn, double[][] x, double lambda, double[] betas, double[] w) {
1420     mixin(newFrame);
1421
1422     // Compute xTx or xT * w * x depending on whether w is null.
1423     auto xTx = newStack!(double[])(x.length);
1424     foreach(ref row; xTx) row = newStack!double(x.length);
1425
1426     foreach(i, xi; x) foreach(j, xj; x[0..i + 1]) {
1427         xTx[i][j] = 0;
1428         foreach(k; 0..xi.length) {
1429             immutable weight = (w.length) ? w[k] : 1.0;
1430             xTx[i][j] += xi[k] * weight * xj[k];
1431         }
1432     }
1433
1434     symmetrize(xTx);
1435
1436     double[] y;
1437     if(w.length) {
1438         y = tempdup(yIn);
1439         y[] *= w[];
1440     } else {
1441         y = yIn;
1442     }
1443
1444     auto xTy = newStack!double(x.length);
1445     foreach(i, ref val; xTy) {
1446         val = dotProduct(y, x[i]);
1447     }
1448
1449     // Add in the penalty.
1450     foreach(i; 0..xTx.length) {
1451         xTx[i][i] += lambda;
1452     }
1453
1454     choleskySolve(xTx, xTy, betas);
1455 }
1456
1457 unittest {
1458     // Test ridge regression.  We have three impls for all kinds of diff.
1459     // scenarios.  See if they all agree.  Note that the ridiculously small but
1460     // nonzero lasso param is to force the use of the coord descent algo.
1461     auto y = new double[12];
1462     auto x = new double[][16];
1463     foreach(ref elem; x) elem = new double[12];
1464     x[0][] = 1;
1465     auto gen = Random(31415);  // For random but repeatable results.
1466
1467     foreach(iter; 0..1000) {
1468         foreach(col; x[1..$]) foreach(ref elem; col) elem = rNorm(0, 1, gen);
1469         foreach(ref elem; y) elem = rNorm(0, 1, gen);
1470         immutable ridge = uniform(0.1, 10.0, gen);
1471
1472         auto normalEq = linearRegressBeta(y, x, ridge);
1473         auto coordDescent = linearRegressPenalized(
1474             y, x[1..$], double.min_normal, ridge);
1475         auto linalgTrick = linearRegressPenalized(y, x[1..$], 0, ridge);
1476
1477         // Every once in a blue moon coordinate descent doesn't converge that
1478         // well.  These small errors are of no practical significance, hence
1479         // the wide tolerance.  However, if the direct normal equations
1480         // and linalg trick don't agree extremely closely, then something's
1481         // fundamentally wrong.
1482         assert(approxEqual(normalEq, coordDescent, 0.02, 1e-4), text(
1483             normalEq, coordDescent));
1484         assert(approxEqual(linalgTrick, coordDescent, 0.02, 1e-4), text(
1485             linalgTrick, coordDescent));
1486         assert(approxEqual(normalEq, linalgTrick, 1e-6, 1e-8), text(
1487             normalEq, linalgTrick));
1488     }
1489
1490     // Make sure fallback and largeP algo get same answers.
1491     auto fb = new double[x.length];
1492     auto lp = new double[x.length];
1493     ridgeFallback(y, x, 1, fb, null);
1494     ridgeLargeP(y, x, 1, lp);
1495     assert(approxEqual(fb, lp));
1496
1497     // With weights.
1498     auto w = randArray!uniform(y.length, 0.1, 1, gen);
1499     ridgeFallback(y, x, 1, fb, w);
1500     ridgeLargeP(y, x, 1, lp, w);
1501     assert(approxEqual(fb, lp));
1502
1503     // Test stuff that's got some lasso in it.  Values from R's Penalized
1504     // package.
1505     y = [1.0, 2.0, 3, 4, 5, 6, 7];
1506     x = [[8.0, 6, 7, 5, 3, 0, 9],
1507          [3.0, 6, 2, 4, 3, 6, 8],
1508          [3.0, 1, 4, 1, 5, 9, 2],
1509          [2.0, 7, 1, 8, 2, 8, 1]];
1510
1511     assert(approxEqual(linearRegressPenalized(y, x, 1, 0),
1512         [4.16316, -0.3603197, 0.6308278, 0, -0.2633263]));
1513     assert(approxEqual(linearRegressPenalized(y, x, 1, 3),
1514         [2.519590, -0.09116883, 0.38067757, 0.13122413, -0.05637939]));
1515     assert(approxEqual(linearRegressPenalized(y, x, 2, 0.1),
1516         [1.247235, 0, 0.4440735, 0.2023602, 0]));
1517     assert(approxEqual(linearRegressPenalized(y, x, 5, 7),
1518         [3.453787, 0, 0.10968736, 0.01253992, 0]));
1519 }
1520
1521 /**
1522 Computes a logistic regression using a maximum likelihood estimator
1523 and returns the beta coefficients.  This is a generalized linear model with
1524 the link function f(XB) = 1 / (1 + exp(XB)). This is generally used to model
1525 the probability that a binary Y variable is 1 given a set of X variables.
1526
1527 For the purpose of this function, Y variables are interpreted as Booleans,
1528 regardless of their type.  X may be either a range of ranges or a tuple of
1529 ranges.  However, note that unlike in linearRegress, they are copied to an
1530 array if they are not random access ranges.  Note that each value is accessed
1531 several times, so if your range is a map to something expensive, you may
1532 want to evaluate it eagerly.
1533
1534 If the last parameter passed in is a numeric value instead of a range,
1535 it is interpreted as a ridge parameter and ridge regression is performed.  This
1536 penalizes the L2 norm of the beta vector (in a scaled space) and results
1537 in more parsimonious models.  It limits the usefulness of inference techniques
1538 (p-values, confidence intervals), however, and is therefore not offered
1539 in logisticRegres().
1540
1541 If no ridge parameter is passed, or equivalenty if the ridge parameter is
1542 zero, then ordinary maximum likelihood regression is performed.
1543
1544 Note that, while this implementation of ridge regression was tested against
1545 the R Design Package implementation, it uses slightly different conventions
1546 that make the results not comparable without transformation.  dstats uses a
1547 biased estimate of the variance to scale the beta vector penalties, while
1548 Design uses an unbiased estimate.  Furthermore, Design penalizes by 1/2 of the
1549 L2 norm, whereas dstats penalizes by the L2 norm.  Therefore, if n is the
1550 sample size, and lambda is the penalty used with dstats, the proper penalty
1551 to use in Design to get the same results is 2 * (n - 1) * lambda / n.
1552
1553 Also note that, as in linearRegress, repeat(1) can be used for the intercept
1554 term.
1555
1556 Returns:  The beta coefficients for the regression model.
1557
1558 References:
1559
1560 http://en.wikipedia.org/wiki/Logistic_regression
1561
1562 http://socserv.mcmaster.ca/jfox/Courses/UCLA/logistic-regression-notes.pdf
1563
1564 S. Le Cessie and J. C. Van Houwelingen.  Ridge Estimators in Logistic
1565 Regression.  Journal of the Royal Statistical Society. Series C
1566 (Applied Statistics), Vol. 41, No. 1(1992), pp. 191-201
1567
1568 Frank E Harrell Jr (2009). Design: Design Package. R package version 2.3-0.
1569 http://CRAN.R-project.org/package=Design
1570  */
1571 double[] logisticRegressBeta(T, U...)(T yIn, U xRidge) {
1572     static if(isFloatingPoint!(U[$ - 1]) || isIntegral!(U[$ - 1])) {
1573         alias xRidge[$ - 1] ridge;
1574         alias xRidge[0..$ - 1] xIn;
1575     } else {
1576         enum double ridge = 0.0;
1577         alias xRidge xIn;
1578     }
1579
1580     return logisticRegressImpl(false, ridge, yIn, xIn).betas;
1581 }
1582
1583 /**
1584 Plain old data struct to hold the results of a logistic regression.
1585 */
1586 struct LogisticRes {
1587     /**The coefficients, one for each range in X.  These will be in the order
1588      * that the X ranges were passed in.*/
1589     double[] betas;
1590
1591     /**The standard error terms of the X ranges passed in.*/
1592     double[] stdErr;
1593
1594     /**
1595     The Wald lower confidence bounds of the beta terms, at the confidence level
1596     specificied.  (Default 0.95).*/
1597     double[] lowerBound;
1598
1599     /**
1600     The Wald upper confidence bounds of the beta terms, at the confidence level
1601     specificied.  (Default 0.95).*/
1602     double[] upperBound;
1603
1604     /**
1605     The P-value for the alternative that the corresponding beta value is
1606     different from zero against the null that it is equal to zero.  These
1607     are calculated using the Wald Test.*/
1608     double[] p;
1609
1610     /**
1611     The log likelihood for the null model.
1612     */
1613     double nullLogLikelihood;
1614
1615     /**
1616     The log likelihood for the model fit.
1617     */
1618     double logLikelihood;
1619
1620     /**
1621     Akaike Information Criterion, which is a complexity-penalized goodness-
1622     of-fit score, equal to 2 * k - 2 log(L) where L is the log likelihood and
1623     k is the number of parameters.
1624     */
1625     double aic() const pure nothrow @property @safe {
1626         return 2 * (betas.length - logLikelihood);
1627     }
1628
1629     /**
1630     The P-value for the model as a whole, based on the likelihood ratio test.
1631     The null here is that the model has no predictive value, the alternative
1632     is that it does have predictive value.*/
1633     double overallP;
1634
1635     // Just used internally.
1636     private static string arrStr(T)(T arr) {
1637         return text(arr)[1..$ - 1];
1638     }
1639
1640     /**Print out the results in the default format.*/
1641     string toString() {
1642         return "Betas:  " ~ arrStr(betas) ~ "\nLower Conf. Int.:  " ~
1643             arrStr(lowerBound) ~ "\nUpper Conf. Int.:  " ~ arrStr(upperBound) ~
1644             "\nStd. Err:  " ~ arrStr(stdErr) ~ "\nP Values:  " ~ arrStr(p) ~
1645             "\nNull Log Likelihood:  " ~ text(nullLogLikelihood) ~
1646             "\nLog Likelihood:  " ~ text(logLikelihood) ~
1647             "\nAIC:  " ~ text(aic) ~
1648             "\nOverall P:  " ~ text(overallP);
1649     }
1650 }
1651
1652 /**
1653 Similar to logisticRegressBeta, but returns a LogisticRes with useful stuff for
1654 statistical inference.  If the last element of input is a floating point
1655 number instead of a range, it is used to specify the confidence interval
1656 calculated.  Otherwise, the default of 0.95 is used.
1657
1658 References:
1659
1660 http://en.wikipedia.org/wiki/Wald_test
1661 http://en.wikipedia.org/wiki/Akaike_information_criterion
1662 */
1663 LogisticRes logisticRegress(T, V...)(T yIn, V input) {
1664     return logisticRegressImpl!(T, V)(true, 0, yIn, input);
1665 }
1666
1667 unittest {
1668     // Values from R.  Confidence intervals from confint.default().
1669     // R doesn't automatically calculate likelihood ratio P-value, and reports
1670     // deviations instead of log likelihoods.  Deviations are just
1671     // -2 * likelihood.
1672     alias approxEqual ae;  // Save typing.
1673
1674     // Start with the basics, with X as a ror.
1675     auto y1 =  [1,   0, 0, 0, 1, 0, 0];
1676     auto x1 = [[1.0, 1 ,1 ,1 ,1 ,1 ,1],
1677               [8.0, 6, 7, 5, 3, 0, 9]];
1678     auto res1 = logisticRegress(y1, x1);
1679     assert(ae(res1.betas[0], -0.98273));
1680     assert(ae(res1.betas[1], 0.01219));
1681     assert(ae(res1.stdErr[0], 1.80803));
1682     assert(ae(res1.stdErr[1], 0.29291));
1683     assert(ae(res1.p[0], 0.587));
1684     assert(ae(res1.p[1], 0.967));
1685     assert(ae(res1.aic, 12.374));
1686     assert(ae(res1.logLikelihood, -0.5 * 8.3758));
1687     assert(ae(res1.nullLogLikelihood, -0.5 * 8.3740));
1688     assert(ae(res1.lowerBound[0], -4.5264052));
1689     assert(ae(res1.lowerBound[1], -0.5618933));
1690     assert(ae(res1.upperBound[0], 2.560939));
1691     assert(ae(res1.upperBound[1], 0.586275));
1692
1693     // Use tuple.
1694     auto y2   = [1,0,1,1,0,1,0,0,0,1,0,1];
1695     auto x2_1 = [3,1,4,1,5,9,2,6,5,3,5,8];
1696     auto x2_2 = [2,7,1,8,2,8,1,8,2,8,4,5];
1697     auto res2 = logisticRegress(y2, repeat(1), x2_1, x2_2);
1698     assert(ae(res2.betas[0], -1.1875));
1699     assert(ae(res2.betas[1], 0.1021));
1700     assert(ae(res2.betas[2], 0.1603));
1701     assert(ae(res2.stdErr[0], 1.5430));
1702     assert(ae(res2.stdErr[1], 0.2507));
1703     assert(ae(res2.stdErr[2], 0.2108));
1704     assert(ae(res2.p[0], 0.442));
1705     assert(ae(res2.p[1], 0.684));
1706     assert(ae(res2.p[2], 0.447));
1707     assert(ae(res2.aic, 21.81));
1708     assert(ae(res2.nullLogLikelihood, -0.5 * 16.636));
1709     assert(ae(res2.logLikelihood, -0.5 * 15.810));
1710     assert(ae(res2.lowerBound[0], -4.2116584));
1711     assert(ae(res2.lowerBound[1], -0.3892603));
1712     assert(ae(res2.lowerBound[2], -0.2528110));
1713     assert(ae(res2.upperBound[0], 1.8366823));
1714     assert(ae(res2.upperBound[1], 0.5934631));
1715     assert(ae(res2.upperBound[2], 0.5733693));
1716
1717     auto x2Intercept = [1,1,1,1,1,1,1,1,1,1,1,1];
1718     auto res2a = logisticRegress(y2,
1719         filter!"a.length"([x2Intercept, x2_1, x2_2]));
1720     foreach(ti, elem; res2a.tupleof) {
1721         assert(ae(elem, res2.tupleof[ti]));
1722     }
1723
1724     // Use a huge range of values to test numerical stability.
1725
1726     // The filter is to make y3 a non-random access range.
1727     auto y3 = filter!"a < 2"([1,1,1,1,0,0,0,0]);
1728     auto x3_1 = filter!"a > 0"([1, 1e10, 2, 2e10, 3, 3e15, 4, 4e7]);
1729     auto x3_2 = [1e8, 1e6, 1e7, 1e5, 1e3, 1e0, 1e9, 1e11];
1730     auto x3_3 = [-5e12, 5e2, 6e5, 4e3, -999999, -666, -3e10, -2e10];
1731     auto res3 = logisticRegress(y3, repeat(1), x3_1, x3_2, x3_3, 0.99);
1732     assert(ae(res3.betas[0], 1.115e0));
1733     assert(ae(res3.betas[1], -4.674e-15));
1734     assert(ae(res3.betas[2], -7.026e-9));
1735     assert(ae(res3.betas[3], -2.109e-12));
1736     assert(ae(res3.stdErr[0], 1.158));
1737     assert(ae(res3.stdErr[1], 2.098e-13));
1738     assert(ae(res3.stdErr[2], 1.878e-8));
1739     assert(ae(res3.stdErr[3], 4.789e-11));
1740     assert(ae(res3.p[0], 0.336));
1741     assert(ae(res3.p[1], 0.982));
1742     assert(ae(res3.p[2], 0.708));
1743     assert(ae(res3.p[3], 0.965));
1744     assert(ae(res3.aic, 12.544));
1745     assert(ae(res3.nullLogLikelihood, -0.5 * 11.0904));
1746     assert(ae(res3.logLikelihood, -0.5 * 4.5442));
1747     // Not testing confidence intervals b/c they'd be so buried in numerical
1748     // fuzz.
1749
1750
1751     // Test with a just plain huge dataset that R chokes for several minutes
1752     // on.  If you think this unittest is slow, try getting the reference
1753     // values from R.
1754     auto y4 = chain(
1755                 take(cycle([0,0,0,0,1]), 500_000),
1756                 take(cycle([1,1,1,1,0]), 500_000));
1757     auto x4_1 = iota(0, 1_000_000);
1758     auto x4_2 = array(map!(compose!(exp, "a / 1_000_000.0"))(x4_1));
1759     auto x4_3 = take(cycle([1,2,3,4,5]), 1_000_000);
1760     auto x4_4 = take(cycle([8,6,7,5,3,0,9]), 1_000_000);
1761     auto res4 = logisticRegress(y4, repeat(1), x4_1, x4_2, x4_3, x4_4, 0.99);
1762     assert(ae(res4.betas[0], -1.574));
1763     assert(ae(res4.betas[1], 5.625e-6));
1764     assert(ae(res4.betas[2], -7.282e-1));
1765     assert(ae(res4.betas[3], -4.381e-6));
1766     assert(ae(res4.betas[4], -8.343e-6));
1767     assert(ae(res4.stdErr[0], 3.693e-2));
1768     assert(ae(res4.stdErr[1], 7.188e-8));
1769     assert(ae(res4.stdErr[2], 4.208e-2));
1770     assert(ae(res4.stdErr[3], 1.658e-3));
1771     assert(ae(res4.stdErr[4], 8.164e-4));
1772     assert(ae(res4.p[0], 0));
1773     assert(ae(res4.p[1], 0));
1774     assert(ae(res4.p[2], 0));
1775     assert(ae(res4.p[3], 0.998));
1776     assert(ae(res4.p[4], 0.992));
1777     assert(ae(res4.aic, 1089339));
1778     assert(ae(res4.nullLogLikelihood, -0.5 * 1386294));
1779     assert(ae(res4.logLikelihood, -0.5 * 1089329));
1780     assert(ae(res4.lowerBound[0], -1.668899));
1781     assert(ae(res4.lowerBound[1], 5.439787e-6));
1782     assert(ae(res4.lowerBound[2], -0.8366273));
1783     assert(ae(res4.lowerBound[3], -4.27406e-3));
1784     assert(ae(res4.lowerBound[4], -2.111240e-3));
1785     assert(ae(res4.upperBound[0], -1.478623));
1786     assert(ae(res4.upperBound[1], 5.810089e-6));
1787     assert(ae(res4.upperBound[2], -6.198418e-1));
1788     assert(ae(res4.upperBound[3], 4.265302e-3));
1789     assert(ae(res4.upperBound[4], 2.084554e-3));
1790
1791     // Test ridge stuff.
1792     auto ridge2 = logisticRegressBeta(y2, repeat(1), x2_1, x2_2, 3);
1793     assert(ae(ridge2[0], -0.40279319));
1794     assert(ae(ridge2[1], 0.03575638));
1795     assert(ae(ridge2[2], 0.05313875));
1796
1797     auto ridge2_2 = logisticRegressBeta(y2, repeat(1), x2_1, x2_2, 2);
1798     assert(ae(ridge2_2[0], -0.51411490));
1799     assert(ae(ridge2_2[1], 0.04536590));
1800     assert(ae(ridge2_2[2], 0.06809964));
1801 }
1802
1803 /// The logistic function used in logistic regression.
1804 double logistic(double xb) pure nothrow @safe {
1805     return 1.0 / (1 + exp(-xb));
1806 }
1807
1808 /**
1809 Performs lasso (L1) and/or ridge (L2) penalized logistic regression.  Due to the
1810 way the data is standardized, no intercept term should be included in x
1811 (unlike logisticRegress and logisticRegressBeta).  The intercept coefficient is
1812 implicitly included and returned in the first element of the returned array.
1813 Usage is otherwise identical.
1814
1815 Note:  Setting lasso equal to zero is equivalent to performing ridge regression.
1816        This can also be done with logisticRegressBeta.  However, the
1817        logisticRegressBeta algorithm is optimized for memory efficiency and
1818        large samples.  This algorithm is optimized for large feature sets.
1819
1820 Returns:  The beta coefficients for the regression model.
1821
1822 References:
1823
1824 Friedman J, et al Pathwise coordinate optimization. Ann. Appl. Stat.
1825 2007;2:302-332.
1826
1827 Goeman, J. J., L1 penalized estimation in the Cox proportional hazards model.
1828 Biometrical Journal 52(1), 70{84.
1829
1830 Eilers, P., Boer, J., Van Ommen, G., Van Houwelingen, H. 2001 Classification of
1831 microarray data with penalized logistic regression. Proceedings of SPIE.
1832 Progress in Biomedical Optics and Images vol. 4266, pp. 187-198
1833 */
1834 double[] logisticRegressPenalized(Y, X...)
1835 (Y yIn, X xIn, double lasso, double ridge) {
1836     mixin(newFrame);
1837
1838     static assert(!isInfinite!Y, "Can't do regression with infinite # of Y's.");
1839     static if(isRandomAccessRange!Y) {
1840         alias yIn y;
1841     } else {
1842         auto y = toBools(yIn);
1843     }
1844
1845     static if(X.length == 1 && isRoR!X) {
1846         enum bool tupleMode = false;
1847         static if(isForwardRange!X) {
1848             auto x = toRandomAccessRoR(y.length, xIn);
1849         } else {
1850             auto x = toRandomAccessRoR(y.length, tempdup(xIn));
1851         }
1852     } else {
1853         enum bool tupleMode = true;
1854         auto x = toRandomAccessTuple(xIn).expand;
1855     }
1856
1857     auto betas = new double[x.length + 1];
1858     if(y.length >= x.length && lasso == 0) {
1859         // Add intercept term.
1860         static if(tupleMode) {
1861             doMLENewton(betas, (double[]).init, ridge, y, repeat(1), x);
1862         } else static if(is(x == double[][])) {
1863             auto xInt = newStack!(double[])(betas.length);
1864             xInt[1..$] = x[];
1865             xInt[0] = tempdup(replicate(1.0, y.length));
1866             doMLENewton(betas, (double[]).init, ridge, y, xInt);
1867         } else {
1868             // No choice but to dup the whole thing.
1869             auto xInt = newStack!(double[])(betas.length);
1870             xInt[0] = tempdup(replicate(1.0, y.length));
1871
1872             foreach(i, ref arr; xInt[1..$]) {
1873                 arr = tempdup(
1874                     map!(to!double)(take(x[i], y.length))
1875                 );
1876             }
1877
1878             doMLENewton(betas, (double[]).init, ridge, y, xInt);
1879         }
1880
1881     } else {
1882         logisticRegressPenalizedImpl(betas, lasso, ridge, y, x);
1883     }
1884
1885     return betas;
1886 }
1887
1888 unittest {
1889     // Test ridge regression.  We have three impls for all kinds of diff.
1890     // scenarios.  See if they all agree.  Note that the ridiculously small but
1891     // nonzero lasso param is to force the use of the coord descent algo.
1892     auto y = new bool[12];
1893     auto x = new double[][16];
1894     foreach(ref elem; x) elem = new double[12];
1895     x[0][] = 1;
1896     auto gen = Random(31415);  // For random but repeatable results.
1897
1898     foreach(iter; 0..1000) {
1899         foreach(col; x[1..$]) foreach(ref elem; col) elem = rNorm(0, 1, gen);
1900
1901         // Nothing will converge if y is all true's or all false's.
1902         size_t trueCount;
1903         do {
1904             foreach(ref elem; y) elem = cast(bool) rBernoulli(0.5, gen);
1905             trueCount = count!"a"(y);
1906         } while(trueCount == 0 || trueCount == y.length);
1907
1908         immutable ridge = uniform(0.1, 10.0, gen);
1909
1910         auto normalEq = logisticRegressBeta(y, x, ridge);
1911         auto coordDescent = logisticRegressPenalized(
1912             y, x[1..$], double.min_normal, ridge);
1913         auto linalgTrick = logisticRegressPenalized(y, x[1..$], 0, ridge);
1914
1915         // Every once in a blue moon coordinate descent doesn't converge that
1916         // well.  These small errors are of no practical significance, hence
1917         // the wide tolerance.  However, if the direct normal equations
1918         // and linalg trick don't agree extremely closely, then something's
1919         // fundamentally wrong.
1920         assert(approxEqual(normalEq, coordDescent, 0.02, 1e-4), text(
1921             normalEq, coordDescent));
1922         assert(approxEqual(linalgTrick, coordDescent, 0.02, 1e-4), text(
1923             linalgTrick, coordDescent));
1924         assert(approxEqual(normalEq, linalgTrick, 1e-6, 1e-8), text(
1925             normalEq, linalgTrick));
1926     }
1927
1928     assert(approxEqual(logisticRegressBeta(y, x[0], x[1], x[2]),
1929         logisticRegressPenalized(y, x[1], x[2], 0, 0)));
1930     assert(approxEqual(logisticRegressBeta(y, [x[0], x[1], x[2]]),
1931         logisticRegressPenalized(y, [x[1], x[2]], 0, 0)));
1932     assert(approxEqual(logisticRegressBeta(y, [x[0], x[1], x[2]]),
1933         logisticRegressPenalized(y,
1934         [to!(float[])(x[1]), to!(float[])(x[2])], 0, 0)));
1935
1936     // Make sure the adding intercept stuff is right for the Newton path.
1937     //assert(logisticRegressBeta(x[0], x[1], x[2]) ==
1938
1939     // Test stuff that's got some lasso in it.  Values from R's Penalized
1940     // package.
1941     y = [1, 0, 0, 1, 1, 1, 0];
1942     x = [[8.0, 6, 7, 5, 3, 0, 9],
1943          [3.0, 6, 2, 4, 3, 6, 8],
1944          [3.0, 1, 4, 1, 5, 9, 2],
1945          [2.0, 7, 1, 8, 2, 8, 1]];
1946
1947     // Values from R's Penalized package.  Note that it uses a convention for
1948     // the ridge parameter such that Penalized ridge = 2 * dstats ridge.
1949     assert(approxEqual(logisticRegressPenalized(y, x, 1, 0),
1950         [1.642080, -0.22086515, -0.02587546,  0.00000000, 0.00000000 ]));
1951     assert(approxEqual(logisticRegressPenalized(y, x, 1, 3),
1952         [0.5153373, -0.04278257, -0.00888014,  0.01316831,  0.00000000]));
1953     assert(approxEqual(logisticRegressPenalized(y, x, 2, 0.1),
1954         [0.2876821, 0, 0., 0., 0]));
1955     assert(approxEqual(logisticRegressPenalized(y, x, 1.2, 7),
1956         [0.367613 , -0.017227631, 0.000000000, 0.003875104, 0.000000000]));
1957 }
1958
1959 // Scheduled for deprecation.  This was a terrble name choice.
1960 alias logistic inverseLogit;
1961
1962 private:
1963 double absMax(double a, double b) {
1964     return max(abs(a), abs(b));
1965 }
1966
1967 LogisticRes logisticRegressImpl(T, V...)
1968 (bool inference, double ridge, T yIn, V input) {
1969     mixin(newFrame);
1970
1971     static if(isFloatingPoint!(V[$ - 1])) {
1972         alias input[$ - 1] conf;
1973         alias V[0..$ - 1] U;
1974         alias input[0..$ - 1] xIn;
1975         enforceConfidence(conf);
1976     } else {
1977         alias V U;
1978         alias input xIn;
1979         enum conf = 0.95;
1980     }
1981
1982     static assert(!isInfinite!T, "Can't do regression with infinite # of Y's.");
1983     static if(isRandomAccessRange!T) {
1984         alias yIn y;
1985     } else {
1986         auto y = toBools(yIn);
1987     }
1988
1989     static if(U.length == 1 && isRoR!U) {
1990         static if(isForwardRange!U) {
1991             auto x = toRandomAccessRoR(y.length, xIn);
1992         } else {
1993             auto x = toRandomAccessRoR(y.length, tempdup(xIn));
1994         }
1995     } else {
1996         auto x = toRandomAccessTuple(xIn).expand;
1997     }
1998
1999     typeof(return) ret;
2000     ret.betas.length = x.length;
2001     if(inference) ret.stdErr.length = x.length;
2002     ret.logLikelihood = doMLENewton(ret.betas, ret.stdErr, ridge, y, x);
2003
2004     if(!inference) return ret;
2005
2006     static bool hasNaNs(R)(R range) {
2007         return !filter!isNaN(range).empty;
2008     }
2009
2010     if(isNaN(ret.logLikelihood) || hasNaNs(ret.betas) || hasNaNs(ret.stdErr)) {
2011         // Then we didn't converge or our data was defective.
2012         return ret;
2013     }
2014
2015     ret.nullLogLikelihood = .priorLikelihood(y);
2016     double lratio = ret.logLikelihood - ret.nullLogLikelihood;
2017
2018     // Compensate for numerical fuzz.
2019     if(lratio < 0 && lratio > -1e-5) lratio = 0;
2020     if(lratio > 0) {
2021         ret.overallP = chiSquareCDFR(2 * lratio, x.length - 1);
2022     }
2023
2024     ret.p.length = x.length;
2025     ret.lowerBound.length = x.length;
2026     ret.upperBound.length = x.length;
2027     immutable nDev = -invNormalCDF((1 - conf) / 2);
2028     foreach(i; 0..x.length) {
2029         ret.p[i] = 2 * normalCDF(-abs(ret.betas[i]) / ret.stdErr[i]);
2030         ret.lowerBound[i] = ret.betas[i] - nDev * ret.stdErr[i];
2031         ret.upperBound[i] = ret.betas[i] + nDev * ret.stdErr[i];
2032     }
2033
2034     return ret;
2035 }
2036
2037 private void logisticRegressPenalizedImpl(Y, X...)
2038 (double[] betas, double lasso, double ridge, Y y, X xIn) {
2039     static if(isRoR!(X[0]) && X.length == 1) {
2040         alias xIn[0] x;
2041     } else {
2042         alias xIn x;
2043     }
2044
2045     auto ps = newStack!double(y.length);
2046     betas[] = 0;
2047
2048     mixin(newFrame);
2049     auto betasRaw = tempdup(betas[1..$]);
2050
2051     double oldLikelihood = -double.infinity;
2052     double oldPenalty2 = double.infinity;
2053     double oldPenalty1 = double.infinity;
2054     enum eps = 1e-6;
2055     enum maxIter = 1000;
2056
2057     auto weights = newStack!double(y.length);
2058     auto z = newStack!double(y.length);
2059     auto xMeans = newStack!double(x.length);
2060     auto xSds = newStack!double(x.length);
2061     double zMean = 0;
2062     auto xCenterScale = newStack!(double[])(x.length);
2063     foreach(ref col; xCenterScale) col = newStack!double(y.length);
2064
2065     // Puts x in xCenterScale, with weighted mean subtracted and weighted
2066     // biased stdev divided.  Also standardizes z similarly.
2067     //
2068     // Returns:  true if successful, false if weightSum is so small that the
2069     //           algorithm has converged for all practical purposes.
2070     bool doCenterScale() {
2071         immutable weightSum = sum(weights);
2072         if(weightSum < eps) return false;
2073
2074         xMeans[] = 0;
2075         zMean = 0;
2076         xSds[] = 0;
2077
2078         // Do copying.
2079         foreach(i, col; x) {
2080             copy(take(col, y.length), xCenterScale[i]);
2081         }
2082
2083         // Compute weighted means.
2084         foreach(j, col; xCenterScale) {
2085             foreach(i, w; weights) {
2086                 xMeans[j] += w * col[i];
2087             }
2088         }
2089
2090         foreach(i, w; weights) {
2091             zMean += w * z[i];
2092         }
2093
2094         xMeans[] /= weightSum;
2095         zMean /= weightSum;
2096         z[] -= zMean;
2097         foreach(i, col; xCenterScale) col[] -= xMeans[i];
2098
2099         // Compute biased stdevs.
2100         foreach(j, ref sd; xSds) {
2101             sd = sqrt(meanStdev(xCenterScale[j]).mse);
2102         }
2103
2104         foreach(i, col; xCenterScale) col[] /= xSds[i];
2105         return true;
2106     }
2107
2108     // Rescales the beta coefficients to undo the effects of standardizing.
2109     void rescaleBetas() {
2110         betas[0] = zMean;
2111         foreach(i, b; betasRaw) {
2112             betas[i + 1] = b / xSds[i];
2113             betas[0] -= betas[i + 1] * xMeans[i];
2114         }
2115     }
2116
2117     foreach(iter; 0..maxIter) {
2118         evalPs(betas[0], ps, betas[1..$], x);
2119         immutable lh = logLikelihood(ps, y);
2120         immutable penalty2 = ridge * reduce!"a + b * b"(0.0, betas);
2121         immutable penalty1 = lasso * reduce!"a + (b < 0) ? -b : b"(0.0, betas);
2122
2123         if((lh - oldLikelihood) / (absMax(lh, oldLikelihood) + 0.1) < eps
2124         && (oldPenalty2 - penalty2) / (absMax(penalty2, oldPenalty2) + 0.1) < eps
2125         && (oldPenalty1 - penalty1) / (absMax(penalty1, oldPenalty1) + 0.1) < eps) {
2126             return;
2127         } else if(isNaN(lh) || isNaN(penalty2) || isNaN(penalty1)) {
2128             betas[] = double.nan;
2129             return;
2130         }
2131
2132         oldPenalty2 = penalty2;
2133         oldPenalty1 = penalty1;
2134         oldLikelihood = lh;
2135
2136         foreach(i, ref w; weights) {
2137             w = (ps[i] * (1 - ps[i]));
2138         }
2139
2140         z[] = betas[0];
2141         foreach(i, col; x) {
2142             static if(is(typeof(col) : const(double)[])) {
2143                 z[] += col[] * betas[i + 1];
2144             } else {
2145                 foreach(j, ref elem; z) {
2146                     elem += col[j] + betas[i + 1];
2147                 }
2148             }
2149         }
2150
2151         foreach(i, w; weights) {
2152             if(w == 0){
2153                 z[i] = 0;
2154             } else {
2155                 immutable double yi = (y[i] == 0) ? 0.0 : 1.0;
2156                 z[i] += (yi - ps[i]) / w;
2157             }
2158         }
2159
2160         immutable centerScaleRes = doCenterScale();
2161
2162         // If this is false then weightSum is so small that all probabilities
2163         // are for all practical purposes either 0 or 1.  We can declare
2164         // convergence and go home.
2165         if(!centerScaleRes) return;
2166
2167         if(lasso > 0) {
2168             // Correct for different conventions in defining ridge params
2169             // so all functions get the same answer.
2170             immutable ridgeCorrected = ridge * 2.0;
2171             coordDescent(z, xCenterScale, betasRaw,
2172                 lasso, ridgeCorrected, weights);
2173         } else {
2174             // Correct for different conventions in defining ridge params
2175             // so all functions get the same answer.
2176             immutable ridgeCorrected = ridge * 2.0;
2177             ridgeLargeP(z, xCenterScale, ridgeCorrected, betasRaw, weights);
2178         }
2179
2180         rescaleBetas();
2181     }
2182
2183     immutable lh = logLikelihood(ps, y);
2184     immutable penalty2 = ridge * reduce!"a + b * b"(0.0, betas);
2185     immutable penalty1 = lasso * reduce!"a + (b < 0) ? -b : b"(0.0, betas);
2186
2187     if((lh - oldLikelihood) / (absMax(lh, oldLikelihood) + 0.1) < eps
2188     && (oldPenalty2 - penalty2) / (absMax(penalty2, oldPenalty2) + 0.1) < eps
2189     && (oldPenalty1 - penalty1) / (absMax(penalty1, oldPenalty1) + 0.1) < eps) {
2190         return;
2191     } else {
2192         // If we got here, we haven't converged.  Return NaNs instead of bogus
2193         // values.
2194