root/trunk/distrib.d

Revision 292, 65.3 kB (checked in by dsimcha, 1 year ago)

Add dirichlet distribution.

Line 
1 /**Probability distribution CDFs, PDFs/PMFs, and a few inverse CDFs.
2  *
3  * Authors:  David Simcha, Don Clugston*/
4  /*
5  * Acknowledgements:  Some of this module was borrowed the mathstat module
6  * of Don Clugston's MathExtra library.  This was done to create a
7  * coherent, complete library without massive dependencies, and without
8  * reinventing the wheel.  These functions have been renamed to
9  * fit the naming conventions of this library, and are noted below.
10  * The code from Don Clugston's MathExtra library was based on the Cephes
11  * library by Stephen Moshier.
12  *
13  * Conventions:
14  * Cumulative distribution functions are named <distribution>CDF.  For
15  * discrete distributions, are the P(X <= x) where X is the random variable,
16  * NOT P(X < x).
17  *
18  * All CDFs have a complement, named <distribution>CDFR, which stands for
19  * "Cumulative Distribution Function Right".  For discrete distributions,
20  * this is P(X >= x), NOT P(X > x) and is therefore NOT equal to
21  * 1 - <distribution>CDF.  Also, even for continuous distributions, the
22  * numerical accuracy is higher for small p-values if the CDFR is used than
23  * if 1 - CDF is used.
24  *
25  * If a PDF/PMF function is included for a distribution, it is named
26  * <distribution>PMF or <distribution>PDF (PMF for discrete, PDF for
27  * continuous distributions).
28  *
29  * If an inverse CDF is included, it is named inv<Distribution>CDF.
30  *
31  * For all distributions, the test statistic is the first function parameter
32  * and the distribution parameters are further down the function parameter
33  * list.  This is important for certain generic code, such as  tests and
34  * the parametrize template.
35  *
36  * The following functions are identical or functionally equivalent to
37  * functions found in MathExtra/Tango.Math.probability.  This information
38  * might be useful if someone is trying to integrate this library into other code:
39  *
40  * normalCDF <=> normalDistribution
41  *
42  * normalCDFR <=> normalDistributionCompl
43  *
44  * invNormalCDF <=> normalDistributionComplInv
45  *
46  * studentsTCDF <=> studentsTDistribution  (Note reversal in argument order)
47  *
48  * invStudentsTCDF <=> studentsTDistributionInv (Again, arg order reversed)
49  *
50  * binomialCDF <=> binomialDistribution
51  *
52  * negBinomCDF <=> negativeBinomialDistribution
53  *
54  * poissonCDF <=> poissonDistribution
55  *
56  * chiSqrCDF <=> chiSqrDistribution (Note reversed arg order)
57  *
58  * chiSqrCDFR <=> chiSqrDistributionCompl (Note reversed arg order)
59  *
60  * invChiSqCDFR <=> chiSqrDistributionComplInv
61  *
62  * fisherCDF <=> fDistribution (Note reversed arg order)
63  *
64  * fisherCDFR <=> fDistributionCompl (Note reversed arg order)
65  *
66  * invFisherCDFR <=> fDistributionComplInv
67  *
68  * gammaCDF <=> gammaDistribution  (Note arg reversal)
69  *
70  * gammaCDFR <=> gammaDistributionCompl  (Note arg reversal)
71  *
72  * Note that CDFRs/Compls of continuous distributions are not equivalent,
73  * because in Tango/MathExtra they represent P(X > x) while in dstats they
74  * represent P(X >= x).
75  *
76  *
77  * Copyright (c) 2008-2009, David Simcha and Don Clugston
78  * All rights reserved.
79  *
80  * Redistribution and use in source and binary forms, with or without
81  * modification, are permitted provided that the following conditions are met:
82  *
83  *     * Redistributions of source code must retain the above copyright
84  *       notice, this list of conditions and the following disclaimer.
85  *
86  *     * Redistributions in binary form must reproduce the above copyright
87  *       notice, this list of conditions and the following disclaimer in the
88  *       documentation and/or other materials provided with the distribution.
89  *
90  *     * Neither the name of the authors nor the
91  *       names of its contributors may be used to endorse or promote products
92  *       derived from this software without specific prior written permission.
93  *
94  * THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY
95  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
96  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
97  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
98  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
99  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
100  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
101  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
102  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
103  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.*/
104
105 module dstats.distrib;
106
107 import std.algorithm, std.conv, std.exception, std.math, std.traits,
108     std.mathspecial, std.range;
109
110 alias std.mathspecial.erfc erfc;
111 alias std.mathspecial.erf erf;
112
113 import dstats.base;
114
115 // CTFE doesn't work yet for sqrt() in GDC.  This value is sqrt(2 * PI).
116 enum SQ2PI = 2.50662827463100050241576528481104525300698674060993831662992;
117
118 version(unittest) {
119     import std.stdio, std.random;
120
121     alias std.math.approxEqual ae;
122
123     void main(){
124     }
125 }
126
127 /**Takes a distribution function (CDF or PDF/PMF) as a template argument, and
128  * parameters as function arguments in the order that they appear in the
129  * function declaration and returns a delegate that binds the supplied
130  * parameters to the distribution function.  Assumes the non-parameter
131  * argument is the first argument to the distribution function.
132  *
133  * Examples:
134  * ---
135  * auto stdNormal = parametrize!(normalCDF)(0.0L, 1.0L);
136  * ---
137  *
138  * stdNormal is now a delegate for the normal(0, 1) distribution.*/
139 double delegate(ParameterTypeTuple!(distrib)[0])
140               parametrize(alias distrib)(ParameterTypeTuple!(distrib)[1..$]
141               parameters) {
142
143     double calculate(ParameterTypeTuple!(distrib)[0] arg) {
144         return distrib(arg, parameters);
145     }
146
147     return &calculate;
148 }
149
150 unittest {
151     // Just basically see if this compiles.
152     auto stdNormal = parametrize!normalCDF(0, 1);
153     assert(approxEqual(stdNormal(2.5), normalCDF(2.5, 0, 1)));
154 }
155
156 ///
157 struct ParamFunctor(alias distrib) {
158     ParameterTypeTuple!(distrib)[1..$] parameters;
159
160     double opCall(ParameterTypeTuple!(distrib)[0] arg) {
161         return distrib(arg, parameters);
162     }
163 }
164
165 /**Takes a distribution function (CDF or PDF/PMF) as a template argument, and
166  * parameters as function arguments in the order that they appear in the
167  * function declaration and returns a functor that binds the supplied
168  * parameters to the distribution function.  Assumes the non-parameter
169  * argument is the first argument to the distribution function.
170  *
171  * Examples:
172  * ---
173  * auto stdNormal = paramFunctor!(normalCDF)(0.0L, 1.0L);
174  * ---
175  *
176  * stdNormal is now a functor for the normal(0, 1) distribution.*/
177 ParamFunctor!(distrib) paramFunctor(alias distrib)
178                        (ParameterTypeTuple!(distrib)[1..$] parameters) {
179     ParamFunctor!(distrib) ret;
180     foreach(ti, elem; parameters) {
181         ret.tupleof[ti] = elem;
182     }
183     return ret;
184 }
185
186 unittest {
187     // Just basically see if this compiles.
188     auto stdNormal = paramFunctor!normalCDF(0, 1);
189     assert(approxEqual(stdNormal(2.5), normalCDF(2.5, 0, 1)));
190 }
191
192 ///
193 double uniformPDF(double X, double lower, double upper) {
194     dstatsEnforce(X >= lower, "Can't have X < lower bound in uniform distribution.");
195     dstatsEnforce(X <= upper, "Can't have X > upper bound in uniform distribution.");
196     return 1.0L / (upper - lower);
197 }
198
199 ///
200 double uniformCDF(double X, double lower, double upper) {
201     dstatsEnforce(X >= lower, "Can't have X < lower bound in uniform distribution.");
202     dstatsEnforce(X <= upper, "Can't have X > upper bound in uniform distribution.");
203
204     return (X - lower) / (upper - lower);
205 }
206
207 ///
208 double uniformCDFR(double X, double lower, double upper) {
209     dstatsEnforce(X >= lower, "Can't have X < lower bound in uniform distribution.");
210     dstatsEnforce(X <= upper, "Can't have X > upper bound in uniform distribution.");
211
212     return (upper - X) / (upper - lower);
213 }
214
215 ///
216 double poissonPMF(ulong k, double lambda) {
217     dstatsEnforce(lambda > 0, "Cannot have a Poisson with lambda <= 0 or nan.");
218
219     return exp(cast(double) k * log(lambda) -
220             (lambda + logFactorial(k)));  //Grouped for best precision.
221 }
222
223 unittest {
224     assert(approxEqual(poissonPMF(1, .1), .0904837));
225 }
226
227 enum POISSON_NORMAL = 1UL << 12;  // Where to switch to normal approx.
228
229 // The gamma incomplete function is too unstable and the distribution
230 // is for all practical purposes normal anyhow.
231 private double normApproxPoisCDF(ulong k, double lambda)
232 in {
233     assert(lambda > 0);
234 } body {
235     double sd = sqrt(lambda);
236     // mean == lambda.
237     return normalCDF(k + 0.5L, lambda, sd);
238 }
239
240 /**P(K <= k) where K is r.v.*/
241 double poissonCDF(ulong k, double lambda) {
242     dstatsEnforce(lambda > 0, "Cannot have a poisson with lambda <= 0 or nan.");
243
244     return (max(k, lambda) >= POISSON_NORMAL) ?
245            normApproxPoisCDF(k, lambda) :
246            gammaIncompleteCompl(k + 1, lambda);
247 }
248
249 unittest {
250     // Make sure this jives with adding up PMF elements, since this is a
251     // discrete distribution.
252     static double pmfSum(uint k, double lambda) {
253         double ret = 0;
254         foreach(i; 0..k + 1) {
255             ret += poissonPMF(i, lambda);
256         }
257         return ret;
258     }
259
260     assert(approxEqual(poissonCDF(1, 0.5), pmfSum(1, 0.5)));
261     assert(approxEqual(poissonCDF(3, 0.7), pmfSum(3, 0.7)));
262
263     // Absurdly huge values:  Test normal approximation.
264     // Values from R.
265     double ans = poissonCDF( (1UL << 50) - 10_000_000, 1UL << 50);
266     assert(approxEqual(ans, 0.3828427));
267
268     // Make sure cutoff is reasonable, i.e. make sure gamma incomplete branch
269     // and normal branch get roughly the same answer near the cutoff.
270     for(double lambda = POISSON_NORMAL / 2; lambda <= POISSON_NORMAL * 2; lambda += 100) {
271         for(ulong k = POISSON_NORMAL / 2; k <= POISSON_NORMAL * 2; k += 100) {
272             double normAns = normApproxPoisCDF(k, lambda);
273             double gammaAns = gammaIncompleteCompl(k + 1, lambda);
274             assert(abs(normAns - gammaAns) < 0.01, text(normAns, '\t', gammaAns));
275         }
276     }
277 }
278
279 // The gamma incomplete function is too unstable and the distribution
280 // is for all practical purposes normal anyhow.
281 private double normApproxPoisCDFR(ulong k, double lambda)
282 in {
283     assert(lambda > 0);
284 } body {
285     double sd = sqrt(lambda);
286     // mean == lambda.
287     return normalCDFR(k - 0.5L, lambda, sd);
288 }
289
290 /**P(K >= k) where K is r.v.*/
291 double poissonCDFR(ulong k, double lambda) {
292     dstatsEnforce(lambda > 0, "Can't have a poisson with lambda <= 0 or nan.");
293
294     return (max(k, lambda) >= POISSON_NORMAL) ?
295             normApproxPoisCDFR(k, lambda) :
296             gammaIncomplete(k, lambda);
297 }
298
299 unittest {
300     // Make sure this jives with adding up PMF elements, since this is a
301     // discrete distribution.
302     static double pmfSum(uint k, double lambda) {
303         double ret = 0;
304         foreach(i; 0..k + 1)  {
305             ret += poissonPMF(i, lambda);
306         }
307         return ret;
308     }
309
310     assert(approxEqual(poissonCDFR(1, 0.5), 1 - pmfSum(0, 0.5)));
311     assert(approxEqual(poissonCDFR(3, 0.7), 1 - pmfSum(2, 0.7)));
312
313     // Absurdly huge value to test normal approximation.
314     // Values from R.
315     double ans = poissonCDFR( (1UL << 50) - 10_000_000, 1UL << 50);
316     assert(approxEqual(ans, 0.6171573));
317
318     // Make sure cutoff is reasonable, i.e. make sure gamma incomplete branch
319     // and normal branch get roughly the same answer near the cutoff.
320     for(double lambda = POISSON_NORMAL / 2; lambda <= POISSON_NORMAL * 2; lambda += 100) {
321         for(ulong k = POISSON_NORMAL / 2; k <= POISSON_NORMAL * 2; k += 100) {
322             double normAns = normApproxPoisCDFR(k, lambda);
323             double gammaAns = gammaIncomplete(k, lambda);
324             assert(abs(normAns - gammaAns) < 0.01, text(normAns, '\t', gammaAns));
325         }
326     }
327 }
328
329 /**Returns the value of k for the given p-value and lambda.  If p-val
330  * doesn't exactly map to a value of k, the k for which poissonCDF(k, lambda)
331  * is closest to pVal is used.*/
332 uint invPoissonCDF(double pVal, double lambda) {
333     dstatsEnforce(lambda > 0, "Cannot have a poisson with lambda <= 0 or nan.");
334     dstatsEnforce(pVal >= 0 && pVal <= 1, "P-values must be between 0, 1.");
335
336     // Use normal approximation to get approx answer, then brute force search.
337     // This works better than you think because for small n, there's not much
338     // search space and for large n, the normal approx. is doublely good.
339     uint guess = cast(uint) max(round(
340           invNormalCDF(pVal, lambda, sqrt(lambda)) + 0.5), 0.0L);
341     double guessP = poissonCDF(guess, lambda);
342
343     if(guessP == pVal) {
344         return guess;
345     } else if(guessP < pVal) {
346         for(uint k = guess + 1; ; k++) {
347             double newP = guessP + poissonPMF(k, lambda);
348             if(newP >= 1)
349                 return k;
350             if(abs(newP - pVal) > abs(guessP - pVal)) {
351                 return k - 1;
352             } else {
353                 guessP = newP;
354             }
355         }
356     } else {
357         for(uint k = guess - 1; k != uint.max; k--) {
358             double newP = guessP - poissonPMF(k + 1, lambda);
359             if(abs(newP - pVal) > abs(guessP - pVal)) {
360                 return k + 1;
361             } else {
362                 guessP = newP;
363             }
364         }
365         return 0;
366     }
367 }
368
369 unittest {
370    foreach(i; 0..1_000) {
371        // Restricted variable ranges are because, in the tails, more than one
372        // value of k can map to the same p-value at machine precision.
373        // Obviously, this is one of those corner cases that nothing can be
374        // done about.
375        double lambda = uniform(.05L, 8.0L);
376        uint k = uniform(0U, cast(uint) ceil(3.0L * lambda));
377        double pVal = poissonCDF(k, lambda);
378        assert(invPoissonCDF(pVal, lambda) == k);
379    }
380 }
381
382 ///
383 double binomialPMF(ulong k, ulong n, double p) {
384     dstatsEnforce(k <= n, "k cannot be > n in binomial distribution.");
385     dstatsEnforce(p >= 0 && p <= 1, "p must be between 0, 1 in binomial distribution.");
386     return exp(logNcomb(n, k) + k * log(p) + (n - k) * log(1 - p));
387 }
388
389 unittest {
390     assert(approxEqual(binomialPMF(0, 10, .5), cast(double) 1/1024));
391     assert(approxEqual(binomialPMF(100, 1000, .11), .024856));
392 }
393
394 // Determines what value of n we switch to normal approximation at b/c
395 // betaIncomplete becomes unstable.
396 private enum BINOM_APPROX = 1UL << 24;
397
398 // Cutoff value of n * p for deciding whether to go w/ normal or poisson approx
399 // when betaIncomplete becomes unstable.
400 private enum BINOM_POISSON = 1_024;
401
402 // betaIncomplete is numerically unstable for huge values of n.
403 // Luckily this is exactly when the normal approximation becomes
404 // for all practical purposes exact.
405 private double normApproxBinomCDF(double k, double n, double p)
406 in {
407     assert(k <= n);
408     assert(p >= 0 && p <= 1);
409 } body {
410     double mu = p * n;
411     double sd = sqrt( to!double(n) ) * sqrt(p) * sqrt(1 - p);
412     double xCC = k + 0.5L;
413     return normalCDF(xCC, mu, sd);
414 }
415
416 ///P(K <= k) where K is random variable.
417 double binomialCDF(ulong k, ulong n, double p) {
418     dstatsEnforce(k <= n, "k cannot be > n in binomial distribution.");
419     dstatsEnforce(p >= 0 && p <= 1, "p must be between 0, 1 in binomial distribution.");
420
421     if(k == n) {
422         return 1;
423     } else if(k == 0) {
424         return pow(1.0 - p,  cast(double) n);
425     }
426
427     if(n > BINOM_APPROX) {
428         if(n * p < BINOM_POISSON) {
429             return poissonCDF(k, n * p);
430         } else if(n * (1 - p) < BINOM_POISSON) {
431             return poissonCDFR(n - k, n * (1 - p));
432         } else {
433             return normApproxBinomCDF(k, n, p);
434         }
435     }
436
437     return betaIncomplete(n - k, k + 1, 1.0 - p);
438 }
439
440 unittest {
441     assert(approxEqual(binomialCDF(10, 100, .11), 0.4528744401));
442     assert(approxEqual(binomialCDF(15, 100, .12), 0.8585510507));
443     assert(approxEqual(binomialCDF(50, 1000, .04), 0.95093595));
444     assert(approxEqual(binomialCDF(7600, 15000, .5), .9496193045414));
445     assert(approxEqual(binomialCDF(0, 10, 0.2), 0.1073742));
446
447     // Absurdly huge numbers:
448     {
449         ulong k = (1UL << 60) - 100_000_000;
450         ulong n = 1UL << 61;
451         assert(approxEqual(binomialCDF(k, n, 0.5L), 0.4476073));
452     }
453
454     // Test Poisson branch.
455     double poisAns = binomialCDF(85, 1UL << 26, 1.49e-6);
456     assert(approxEqual(poisAns, 0.07085327));
457
458     // Test poissonCDFR branch.
459     poisAns = binomialCDF( (1UL << 25) - 100, 1UL << 25, 0.9999975L);
460     assert(approxEqual(poisAns, 0.04713316));
461
462     // Make sure cutoff is reasonable:  Just below it, we should get similar
463     // results for normal, exact.
464     for(ulong n = BINOM_APPROX / 2; n < BINOM_APPROX; n += 200_000) {
465         for(double p = 0.01; p <= 0.99; p += 0.05) {
466
467             long lowerK = roundTo!long( n * p * 0.99);
468             long upperK = roundTo!long( n * p / 0.99);
469
470             for(ulong k = lowerK; k <= min(n, upperK); k += 1_000) {
471                 double normRes = normApproxBinomCDF(k, n, p);
472                 double exactRes = binomialCDF(k, n, p);
473                 assert(abs(normRes - exactRes) < 0.001,
474                     text(normRes, '\t', exactRes));
475             }
476         }
477     }
478
479 }
480
481 // betaIncomplete is numerically unstable for huge values of n.
482 // Luckily this is exactly when the normal approximation becomes
483 // for all practical purposes exact.
484 private double normApproxBinomCDFR(ulong k, ulong n, double p)
485 in {
486     assert(k <= n);
487     assert(p >= 0 && p <= 1);
488 } body {
489     double mu = p * n;
490     double sd = sqrt( to!double(n) ) * sqrt(p)  * sqrt(1 - p);
491     double xCC = k - 0.5L;
492     return normalCDFR(xCC, mu, sd);
493 }
494
495 ///P(K >= k) where K is random variable.
496 double binomialCDFR(ulong k, ulong n, double p) {
497     dstatsEnforce(k <= n, "k cannot be > n in binomial distribution.");
498     dstatsEnforce(p >= 0 && p <= 1, "p must be between 0, 1 in binomial distribution.");
499
500     if(k == 0) {
501         return 1;
502     } else if(k == n) {
503         return pow(p, cast(double) n);
504     }
505
506     if(n > BINOM_APPROX) {
507         if(n * p < BINOM_POISSON) {
508             return poissonCDFR(k, n * p);
509         } else if(n * (1 - p) < BINOM_POISSON) {
510             return poissonCDF(n - k, n * (1 - p));
511         } else {
512             return normApproxBinomCDFR(k, n, p);
513         }
514     }
515
516     return betaIncomplete(k, n - k + 1, p);
517 }
518
519 unittest {
520     // Values from R, Maxima.
521     assert(approxEqual(binomialCDF(10, 100, .11), 1 -
522                       binomialCDFR(11, 100, .11)));
523     assert(approxEqual(binomialCDF(15, 100, .12), 1 -
524                        binomialCDFR(16, 100, .12)));
525     assert(approxEqual(binomialCDF(50, 1000, .04), 1 -
526                        binomialCDFR(51, 1000, .04)));
527     assert(approxEqual(binomialCDF(7600, 15000, .5), 1 -
528                        binomialCDFR(7601, 15000, .5)));
529     assert(approxEqual(binomialCDF(9, 10, 0.3), 1 -
530                        binomialCDFR(10, 10, 0.3)));
531
532     // Absurdly huge numbers, test normal branch.
533     {
534         ulong k = (1UL << 60) - 100_000_000;
535         ulong n = 1UL << 61;
536         assert(approxEqual(binomialCDFR(k, n, 0.5L), 0.5523927));
537     }
538
539     // Test Poisson inversion branch.
540     double poisRes = binomialCDFR((1UL << 25) - 70, 1UL << 25, 0.9999975L);
541     assert(approxEqual(poisRes, 0.06883905));
542
543     // Test Poisson branch.
544     poisRes = binomialCDFR(350, 1UL << 25, 1e-5);
545     assert(approxEqual(poisRes, 0.2219235));
546
547     // Make sure cutoff is reasonable:  Just below it, we should get similar
548     // results for normal, exact.
549     for(ulong n = BINOM_APPROX / 2; n < BINOM_APPROX; n += 200_000) {
550         for(double p = 0.01; p <= 0.99; p += 0.05) {
551
552             long lowerK = roundTo!long( n * p * 0.99);
553             long upperK = roundTo!long( n * p / 0.99);
554
555             for(ulong k = lowerK; k <= min(n, upperK); k += 1_000) {
556                 double normRes = normApproxBinomCDFR(k, n, p);
557                 double exactRes = binomialCDFR(k, n, p);
558                 assert(abs(normRes - exactRes) < 0.001,
559                     text(normRes, '\t', exactRes));
560             }
561         }
562     }
563 }
564
565 /**Returns the value of k for the given p-value, n and p.  If p-value does
566  * not exactly map to a value of k, the value for which binomialCDF(k, n, p)
567  * is closest to pVal is used.*/
568 uint invBinomialCDF(double pVal, uint n, double p) {
569     dstatsEnforce(pVal >= 0 && pVal <= 1, "p-values must be between 0, 1.");
570     dstatsEnforce(p >= 0 && p <= 1, "p must be between 0, 1 in binomial distribution.");
571
572     // Use normal approximation to get approx answer, then brute force search.
573     // This works better than you think because for small n, there's not much
574     // search space and for large n, the normal approx. is doublely good.
575     uint guess = cast(uint) max(round(
576           invNormalCDF(pVal, n * p, sqrt(n * p * (1 - p)))) + 0.5, 0);
577     if(guess > n) {
578         if(pVal < 0.5)  // Numerical issues/overflow.
579             guess = 0;
580         else guess = n;
581     }
582     double guessP = binomialCDF(guess, n, p);
583
584     if(guessP == pVal) {
585         return guess;
586     } else if(guessP < pVal) {
587         for(uint k = guess + 1; k <= n; k++) {
588             double newP = guessP + binomialPMF(k, n, p);
589             if(abs(newP - pVal) > abs(guessP - pVal)) {
590                 return k - 1;
591             } else {
592                 guessP = newP;
593             }
594         }
595         return n;
596     } else {
597         for(uint k = guess - 1; k != uint.max; k--) {
598             double newP = guessP - binomialPMF(k + 1, n, p);
599             if(abs(newP - pVal) > abs(guessP - pVal)) {
600                 return k + 1;
601             } else {
602                 guessP = newP;
603             }
604         }
605         return 0;
606     }
607 }
608
609 unittest {
610    Random gen = Random(unpredictableSeed);
611    foreach(i; 0..1_000) {
612        // Restricted variable ranges are because, in the tails, more than one
613        // value of k can map to the same p-value at machine precision.
614        // Obviously, this is one of those corner cases that nothing can be
615        // done about.  Using small n's, moderate p's prevents this.
616        uint n = uniform(5U, 10U);
617        uint k = uniform(0U, n);
618        double p = uniform(0.1L, 0.9L);
619        double pVal = binomialCDF(k, n, p);
620        assert(invBinomialCDF(pVal, n, p) == k);
621    }
622 }
623
624 ///
625 double hypergeometricPMF(long x, long n1, long n2, long n)
626 in {
627     assert(x <= n);
628 } body {
629     if(x > n1 || x < (n - n2)) {
630         return 0;
631     }
632     double result = logNcomb(n1, x) + logNcomb(n2, n - x) - logNcomb(n1 + n2, n);
633     return exp(result);
634 }
635
636 unittest {
637     assert(approxEqual(hypergeometricPMF(5, 10, 10, 10), .3437182));
638     assert(approxEqual(hypergeometricPMF(9, 12, 10, 15), .27089783));
639     assert(approxEqual(hypergeometricPMF(9, 100, 100, 15), .15500003));
640 }
641
642 /**P(X <= x), where X is random variable.  Uses either direct summation,
643  * normal or binomial approximation depending on parameters.*/
644 // If anyone knows a better algorithm for this, feel free...
645 // I've read a decent amount about it, though, and getting hypergeometric
646 // CDFs that are both accurate and fast is just plain hard.  This
647 // implementation attempts to strike a balance between the two, so that
648 // both speed and accuracy are "good enough" for most practical purposes.
649 double hypergeometricCDF(long x, long n1, long n2, long n) {
650     dstatsEnforce(x <= n, "x must be <= n in hypergeometric distribution.");
651     dstatsEnforce(n <= n1 + n2, "n must be <= n1 + n2 in hypergeometric distribution.");
652     dstatsEnforce(x >= 0, "x must be >= 0 in hypergeometric distribution.");
653
654     ulong expec = (n1 * n) / (n1 + n2);
655     long nComp = n1 + n2 - n, xComp = n2 + x - n;
656
657     // Try to reduce number of calculations using identities.
658     if(x >= n1 || x == n) {
659         return 1;
660     } else if(x > expec && x > n / 2) {
661         return 1 - hypergeometricCDF(n - x - 1, n2, n1, n);
662     } else if(xComp < x && xComp > 0) {
663         return hypergeometricCDF(xComp, n2, n1, nComp);
664     }
665
666     // Speed depends on x mostly, so always use exact for small x.
667     if(x <= 100) {
668         return hyperExact(x, n1, n2, n);
669     }
670
671     // Determine whether to use exact, normal approx or binomial approx.
672     // Using obviously arbitrary but relatively stringent standards
673     // for determining whether to approximate.
674     enum NEXACT = 50L;
675
676     double p = cast(double) n1 / (n1 + n2);
677     double pComp = cast(double) n2 / (n1 + n2);
678     double pMin = min(p, pComp);
679     if(min(n, nComp) * pMin >= 100) {
680         // Since high relative error in the lower tail is a significant problem,
681         // this is a hack to improve the normal approximation:  Use the normal
682         // approximation, except calculate the last NEXACT elements exactly,
683         // since elements around the e.v. are where absolute error is highest.
684         // For large x, gives most of the accuracy of an exact calculation in
685         // only a small fraction of the time.
686         if(x <= expec + NEXACT / 2) {
687             return min(1, normApproxHyper(x - NEXACT, n1, n2, n) +
688                 hyperExact(x, n1, n2, n, x - NEXACT + 1));
689         } else {
690             // Just use plain old normal approx.  Since P is large, the
691             // relative error won't be so bad anyhow.
692             return normApproxHyper(x, n1, n2, n);
693         }
694     }
695     // Try to make n as small as possible by applying mathematically equivalent
696     // transformations so that binomial approx. works as well as possible.
697     ulong bSc1 = (n1 + n2) / n, bSc2 = (n1 + n2) / n1;
698
699     if(bSc1 >= 50 && bSc1 > bSc2) {
700         // Same hack as normal approximation for rel. acc. in lower tail.
701         if(x <= expec + NEXACT / 2) {
702             return min(1, binomialCDF(cast(uint) (x - NEXACT), cast(uint) n, p) +
703                 hyperExact(x, n1, n2, n, x - NEXACT + 1));
704         } else {
705             return binomialCDF(cast(uint) x, cast(uint)  n, p);
706         }
707     } else if(bSc2 >= 50 && bSc2 > bSc1) {
708         double p2 = cast(double) n / (n1 + n2);
709         if(x <= expec + NEXACT / 2) {
710             return min(1, binomialCDF(cast(uint) (x - NEXACT), cast(uint)  n1, p2) +
711                 hyperExact(x, n1, n2, n, x - NEXACT + 1));
712         } else {
713             return binomialCDF(cast(uint) x, cast(uint) n1, p2);
714         }
715     } else {
716         return hyperExact(x, n1, n2, n);
717     }
718 }
719
720 unittest {
721     // Values from R and the Maxima CAS.
722     // Test exact branch, including reversing, complementing.
723     assert(approxEqual(hypergeometricCDF(5, 10, 10, 10), 0.6718591));
724     assert(approxEqual(hypergeometricCDF(3, 11, 15, 10), 0.27745322));
725     assert(approxEqual(hypergeometricCDF(18, 27, 31, 35), 0.88271714));
726     assert(approxEqual(hypergeometricCDF(21, 29, 31, 35), 0.99229253));
727
728     // Normal branch.
729     assert(approxEqual(hypergeometricCDF(501, 2000, 1000, 800), 0.002767073));
730     assert(approxEqual(hypergeometricCDF(565, 2000, 1000, 800), 0.9977068));
731     assert(approxEqual(hypergeometricCDF(2700, 10000, 20000, 8000), 0.825652));
732
733     // Binomial branch.  One for each transformation.
734     assert(approxEqual(hypergeometricCDF(110, 5000, 7000, 239), 0.9255627));
735     assert(approxEqual(hypergeometricCDF(19840, 2950998, 12624, 19933), 0.2020618));
736     assert(approxEqual(hypergeometricCDF(130, 24195, 52354, 295), 0.9999973));
737     assert(approxEqual(hypergeometricCDF(103, 901, 49014, 3522), 0.999999));
738 }
739
740 ///P(X >= x), where X is random variable.
741 double hypergeometricCDFR(ulong x, ulong n1, ulong n2, ulong n) {
742     dstatsEnforce(x <= n, "x must be <= n in hypergeometric distribution.");
743     dstatsEnforce(n <= n1 + n2, "n must be <= n1 + n2 in hypergeometric distribution.");
744     dstatsEnforce(x >= 0, "x must be >= 0 in hypergeometric distribution.");
745
746     return hypergeometricCDF(n - x, n2, n1, n);
747 }
748
749 unittest {
750     //Reverses n1, n2 and subtracts x from n to get mirror image.
751     assert(approxEqual(hypergeometricCDF(5,10,10,10),
752                        hypergeometricCDFR(5,10,10,10)));
753     assert(approxEqual(hypergeometricCDF(3, 11, 15, 10),
754                        hypergeometricCDFR(7, 15, 11, 10)));
755     assert(approxEqual(hypergeometricCDF(18, 27, 31, 35),
756                        hypergeometricCDFR(17, 31, 27, 35)));
757     assert(approxEqual(hypergeometricCDF(21, 29, 31, 35),
758                        hypergeometricCDFR(14, 31, 29, 35)));
759 }
760
761 double hyperExact(ulong x, ulong n1, ulong n2, ulong n, ulong startAt = 0) {
762     dstatsEnforce(x <= n, "x must be <= n in hypergeometric distribution.");
763     dstatsEnforce(n <= n1 + n2, "n must be <= n1 + n2 in hypergeometric distribution.");
764     dstatsEnforce(x >= 0, "x must be >= 0 in hypergeometric distribution.");
765
766     immutable double constPart = logFactorial(n1) + logFactorial(n2) +
767         logFactorial(n) + logFactorial(n1 + n2 - n) - logFactorial(n1 + n2);
768     double sum = 0;
769     for(ulong i = x; i != startAt - 1; i--) {
770         double oldSum = sum;
771         sum += exp(constPart - logFactorial(i) - logFactorial(n1 - i) -
772                logFactorial(n2 + i - n) - logFactorial(n - i));
773         if(isIdentical(sum, oldSum)) { // At full machine precision.
774             break;
775         }
776     }
777     return sum;
778 }
779
780 double normApproxHyper(ulong x, ulong n1, ulong n2, ulong n) {
781     double p1 = cast(double) n1 / (n1 + n2);
782     double p2 = cast(double) n2 / (n1 + n2);
783     double numer = x + 0.5L - n * p1;
784     double denom = sqrt(n * p1 * p2 * (n1 + n2 - n) / (n1 + n2 - 1));
785     return normalCDF(numer / denom);
786 }
787
788 // Aliases for old names.  Not documented because new names should be used.
789 alias chiSquareCDF chiSqrCDF;
790 alias chiSquareCDFR chiSqrCDFR;
791 alias invChiSquareCDFR invChiSqCDFR;
792
793 ///
794 double chiSquarePDF(double x, double v) {
795     dstatsEnforce(x >= 0, "x must be >= 0 in chi-square distribution.");
796     dstatsEnforce(v >= 1.0, "Must have at least 1 degree of freedom for chi-square.");
797
798     // Calculate in log space for stability.
799     immutable logX = log(x);
800     immutable numerator = logX * (0.5 * v - 1) - 0.5 * x;
801     immutable denominator = LN2 * (0.5 * v) + lgamma(0.5 * v);
802     return exp(numerator - denominator);
803 }
804
805 unittest {
806     assert( approxEqual(chiSquarePDF(1, 2), 0.3032653));
807     assert( approxEqual(chiSquarePDF(2, 1), 0.1037769));
808 }
809
810 /**
811  *  $(POWER &chi;,2) distribution function and its complement.
812  *
813  * Returns the area under the left hand tail (from 0 to x)
814  * of the Chi square probability density function with
815  * v degrees of freedom. The complement returns the area under
816  * the right hand tail (from x to &infin;).
817  *
818  *  chiSquareCDF(x | v) = ($(INTEGRATE 0, x)
819  *          $(POWER t, v/2-1) $(POWER e, -t/2) dt )
820  *             / $(POWER 2, v/2) $(GAMMA)(v/2)
821  *
822  *  chiSquareCDFR(x | v) = ($(INTEGRATE x, &infin;)
823  *          $(POWER t, v/2-1) $(POWER e, -t/2) dt )
824  *             / $(POWER 2, v/2) $(GAMMA)(v/2)
825  *
826  * Params:
827  *  v  = degrees of freedom. Must be positive.
828  *  x  = the $(POWER &chi;,2) variable. Must be positive.
829  *
830  */
831 double chiSquareCDF(double x, double v) {
832     dstatsEnforce(x >= 0, "x must be >= 0 in chi-square distribution.");
833     dstatsEnforce(v >= 1.0, "Must have at least 1 degree of freedom for chi-square.");
834
835     // These are very common special cases where we can make the calculation
836     // a lot faster and/or more accurate.
837     if(v == 1) {
838         // Then it's the square of a normal(0, 1).
839         return 1.0L - erfc(sqrt(x) * SQRT1_2);
840     } else if(v == 2) {
841         // Then it's an exponential w/ lambda == 1/2.
842         return 1.0L - exp(-0.5 * x);
843     } else {
844         return gammaIncomplete(0.5 * v, 0.5 * x);
845     }
846 }
847
848 ///
849 double chiSquareCDFR(double x, double v) {
850     dstatsEnforce(x >= 0, "x must be >= 0 in chi-square distribution.");
851     dstatsEnforce(v >= 1.0, "Must have at least 1 degree of freedom for chi-square.");
852
853     // These are very common special cases where we can make the calculation
854     // a lot faster and/or more accurate.
855     if(v == 1) {
856         // Then it's the square of a normal(0, 1).
857         return erfc(sqrt(x) * SQRT1_2);
858     } else if(v == 2) {
859         // Then it's an exponential w/ lambda == 1/2.
860         return exp(-0.5 * x);
861     } else {
862         return gammaIncompleteCompl(0.5 * v, 0.5 * x);
863     }
864 }
865
866 /**
867  *  Inverse of complemented $(POWER &chi;, 2) distribution
868  *
869  * Finds the $(POWER &chi;, 2) argument x such that the integral
870  * from x to &infin; of the $(POWER &chi;, 2) density is equal
871  * to the given cumulative probability p.
872  *
873  * Params:
874  * p = Cumulative probability. 0<= p <=1.
875  * v = Degrees of freedom. Must be positive.
876  *
877  */
878 double invChiSquareCDFR(double v, double p) {
879     dstatsEnforce(v >= 1.0, "Must have at least 1 degree of freedom for chi-square.");
880     dstatsEnforce(p >= 0 && p <= 1, "P-values must be between 0, 1.");
881     return  2.0 * gammaIncompleteComplInverse( 0.5*v, p);
882 }
883
884 unittest {
885     assert(feqrel(chiSqrCDFR(invChiSqCDFR(3.5, 0.1), 3.5), 0.1)>=double.mant_dig-3);
886     assert(approxEqual(
887         chiSqrCDF(0.4L, 19.02L) + chiSqrCDFR(0.4L, 19.02L), 1.0L));
888     assert(ae( invChiSqCDFR( 3, chiSqrCDFR(1, 3)), 1));
889
890     assert(ae(chiSquareCDFR(0.2, 1), 0.6547208));
891     assert(ae(chiSquareCDFR(0.2, 2), 0.9048374));
892     assert(ae(chiSquareCDFR(0.8, 1), 0.3710934));
893     assert(ae(chiSquareCDFR(0.8, 2), 0.67032));
894
895     assert(ae(chiSquareCDF(0.2, 1), 0.3452792));
896     assert(ae(chiSquareCDF(0.2, 2), 0.09516258));
897     assert(ae(chiSquareCDF(0.8, 1), 0.6289066));
898     assert(ae(chiSquareCDF(0.8, 2), 0.3296800));
899 }
900
901 ///
902 double normalPDF(double x, double mean = 0, double sd = 1) {
903     dstatsEnforce(sd > 0, "Standard deviation must be > 0 for normal distribution.");
904     double dev = x - mean;
905     return exp(-(dev * dev) / (2 * sd * sd)) / (sd * SQ2PI);
906 }
907
908 unittest {
909     assert(approxEqual(normalPDF(3, 1, 2), 0.1209854));
910 }
911
912 ///P(X < x) for normal distribution where X is random var.
913 double normalCDF(double x, double mean = 0, double stdev = 1) {
914     dstatsEnforce(stdev > 0, "Standard deviation must be > 0 for normal distribution.");
915
916     // Using a slightly non-obvious implementation in terms of erfc because
917     // it seems more accurate than erf for very small values of Z.
918
919     double Z = (-x + mean) / stdev;
920     return erfc(Z*SQRT1_2)/2;
921 }
922
923 unittest {
924     assert(approxEqual(normalCDF(2), .9772498));
925     assert(approxEqual(normalCDF(-2), .02275013));
926     assert(approxEqual(normalCDF(1.3), .90319951));
927 }
928
929 ///P(X > x) for normal distribution where X is random var.
930 double normalCDFR(double x, double mean = 0, double stdev = 1) {
931     dstatsEnforce(stdev > 0, "Standard deviation must be > 0 for normal distribution.");
932
933     double Z = (x - mean) / stdev;
934     return erfc(Z * SQRT1_2) / 2;
935 }
936
937 unittest {
938     //Should be essentially a mirror image of normalCDF.
939     for(double i = -8; i < 8; i += .1) {
940         assert(approxEqual(normalCDF(i), normalCDFR(-i)));
941     }
942 }
943
944 enum real SQRT2PI =   0x1.40d931ff62705966p+1;    // 2.5066282746310005024
945 enum real EXP_2  = 0.13533528323661269189L; /* exp(-2) */
946
947 private {
948 immutable real P0[8] = [
949    -0x1.758f4d969484bfdcp-7,    // -0.011400139698853582732
950    0x1.53cee17a59259dd2p-3, // 0.16592193750979583221
951    -0x1.ea01e4400a9427a2p-1,    // -0.95704568177942689081
952    0x1.61f7504a0105341ap+1, // 2.7653599130008302859
953    -0x1.09475a594d0399f6p+2,    // -4.1449800369337538286
954    0x1.7c59e7a0df99e3e2p+1, // 2.971493676711545292
955    -0x1.87a81da52edcdf14p-1,    // -0.76495449677843806914
956    0x1.1fb149fd3f83600cp-7  // 0.0087796794200550691607
957 ];
958
959 immutable real Q0[8] = [
960    -0x1.64b92ae791e64bb2p-7,    // -0.010886331510064192632
961    0x1.7585c7d597298286p-3, // 0.1823840725000038842
962    -0x1.40011be4f7591ce6p+0,    // -1.2500169214248199725
963    0x1.1fc067d8430a425ep+2, // 4.4961185085232139506
964    -0x1.21008ffb1e7ccdf2p+3,    // -9.0313186554593813887
965    0x1.3d1581cf9bc12fccp+3, // 9.9088753752567182205
966    -0x1.53723a89fd8f083cp+2,    // -5.3038469646037218604
967    0x1p+0   // 1
968 ];
969
970 immutable real P1[10] = [
971    0x1.20ceea49ea142f12p-13,    // 0.00013771451113809605662
972    0x1.cbe8a7267aea80bp-7,  // 0.014035302749980729871
973    0x1.79fea765aa787c48p-2, // 0.36913549001712241224
974    0x1.d1f59faa1f4c4864p+1, // 3.6403083401370131097
975    0x1.1c22e426a013bb96p+4, // 17.75851836288460008
976    0x1.a8675a0c51ef3202p+5, // 53.050464721918523919
977    0x1.75782c4f83614164p+6, // 93.367356531518738722
978    0x1.7a2f3d90948f1666p+6, // 94.546133288447683183
979    0x1.5cd116ee4c088c3ap+5, // 43.602094518370966827
980    0x1.1361e3eb6e3cc20ap+2  // 4.3028497504355521807
981 ];
982
983 immutable real Q1[10] = [
984    0x1.3a4ce1406cea98fap-13,    // 0.00014987006762866754669
985    0x1.f45332623335cda2p-7, // 0.015268706895221911913
986    0x1.98f28bbd4b98db1p-2,  // 0.39936273901812389627
987    0x1.ec3b24f9c698091cp+1, // 3.8455549449546995474
988    0x1.1cc56ecda7cf58e4p+4, // 17.79820137342627204
989    0x1.92c6f7376bf8c058p+5, // 50.347151215536627131
990    0x1.4154c25aa47519b4p+6, // 80.332772651946720635
991    0x1.1b321d3b927849eap+6, // 70.798939638914882544
992    0x1.403a5f5a4ce7b202p+4, // 20.014251091705301368
993    0x1p+0   // 1
994 ];
995
996 immutable real P2[8] = [
997    0x1.8c124a850116a6d8p-21,    // 7.3774056430545041787e-07
998    0x1.534abda3c2fb90bap-13,    // 0.0001617870121822776094
999    0x1.29a055ec93a4718cp-7, // 0.0090828342009931074419
1000    0x1.6468e98aad6dd474p-3, // 0.17402822927913678347
1001    0x1.3dab2ef4c67a601cp+0, // 1.2408933017345389353
1002    0x1.e1fb3a1e70c67464p+1, // 3.7654793404231444828
1003    0x1.b6cce8035ff57b02p+2, // 6.8562564881284157607
1004    0x1.9f4c9e749ff35f62p+1  // 3.2445257253129069325
1005 ];
1006
1007 immutable real Q2[8] = [
1008    0x1.af03f4fc0655e006p-21,    // 8.0282885006885383316e-07
1009    0x1.713192048d11fb2p-13, // 0.00017604524340842589303
1010    0x1.4357e5bbf5fef536p-7, // 0.0098676559208996361084
1011    0x1.7fdac8749985d43cp-3, // 0.18742901426157036096
1012    0x1.4a080c813a2d8e84p+0, // 1.2891853156563028786
1013    0x1.c3a4b423cdb41bdap+1, // 3.528463857156936774
1014    0x1.8160694e24b5557ap+2, // 6.0215094817275106307
1015    0x1p+0   // 1
1016 ];
1017
1018 immutable real P3[8] = [
1019    -0x1.55da447ae3806168p-34,   // -7.7728283809481633868e-11
1020    -0x1.145635641f8778a6p-24,   // -6.4339663876133447143e-08
1021    -0x1.abf46d6b48040128p-17,   // -1.2754046756102807876e-05
1022    -0x1.7da550945da790fcp-11,   // -0.00072793152007373443093
1023    -0x1.aa0b2a31157775fap-8,    // -0.0065009096152460679857
1024    0x1.b11d97522eed26bcp-3, // 0.21148222178987070632
1025    0x1.1106d22f9ae89238p+1, // 2.1330206615874130532
1026    0x1.029a358e1e630f64p+1  // 2.0203310913027725356
1027 ];
1028
1029 immutable real Q3[8] = [
1030    -0x1.74022dd5523e6f84p-34,   // -8.4584942637876803775e-11
1031    -0x1.2cb60d61e29ee836p-24,   // -7.0014768675591937804e-08
1032    -0x1.d19e6ec03a85e556p-17,   // -1.3876523894802171788e-05
1033    -0x1.9ea2a7b4422f6502p-11,   // -0.00079085420887378582886
1034    -0x1.c54b1e852f107162p-8,    // -0.0069167088997199649828
1035    0x1.e05268dd3c07989ep-3, // 0.23453218388704381964
1036    0x1.239c6aff14afbf82p+1, // 2.2782109971534491995
1037    0x1p+0   // 1
1038 ];
1039
1040 }
1041
1042 /******************************
1043  * Inverse of Normal distribution function
1044  *
1045  * Returns the argument, x, for which the area under the
1046  * Normal probability density function (integrated from
1047  * minus infinity to x) is equal to p.
1048  */
1049 double invNormalCDF(double p, double mean = 0, double sd = 1) {
1050     dstatsEnforce(p >= 0 && p <= 1, "P-values must be between 0, 1.");
1051     dstatsEnforce(sd > 0, "Standard deviation must be > 0 for normal distribution.");
1052
1053     if (p == 0.0L) {
1054         return -double.infinity;
1055     }
1056     if( p == 1.0L ) {
1057         return double.infinity;
1058     }
1059     double x, z, y2, x0, x1;
1060     int code = 1;
1061     double y = p;
1062     if( y > (1.0L - EXP_2) ) {
1063         y = 1.0L - y;
1064         code = 0;
1065     }
1066
1067     if ( y > EXP_2 ) {
1068         y = y - 0.5L;
1069         y2 = y * y;
1070         x = y + y * (y2 * poly( y2, P0)/poly( y2, Q0));
1071         x = x * SQRT2PI;
1072         return x * sd + mean;
1073     }
1074
1075     x = sqrt( -2.0L * log(y) );
1076     x0 = x - log(x)/x;
1077     z = 1.0L/x;
1078     if( x < 8.0L ) {
1079         x1 = z * poly( z, P1)/poly( z, Q1);
1080     } else if( x < 32.0L ) {
1081         x1 = z * poly( z, P2)/poly( z, Q2);
1082     } else {
1083 //  assert(0);
1084         x1 = z * poly( z, P3)/poly( z, Q3);
1085     }
1086     x = x0 - x1;
1087     if( code != 0 ) {
1088         x = -x;
1089     }
1090     return x * sd + mean;
1091 }
1092
1093
1094 unittest {
1095     // The values below are from Excel 2003.
1096     assert(fabs(invNormalCDF(0.001) - (-3.09023230616779))< 0.00000000000005);
1097     assert(fabs(invNormalCDF(1e-50) - (-14.9333375347885))< 0.00000000000005);
1098     assert(feqrel(invNormalCDF(0.999), -invNormalCDF(0.001))>double.mant_dig-6);
1099
1100     // Excel 2003 gets all the following values wrong!
1101     assert(invNormalCDF(0.0)==-double.infinity);
1102     assert(invNormalCDF(1.0)==double.infinity);
1103     assert(invNormalCDF(0.5)==0);
1104
1105     // I don't know the correct result for low values
1106     // (Excel 2003 returns norminv(p) = -30 for all p < 1e-200).
1107     // The value tested here is the one the function returned in Jan 2006.
1108     double unknown1 = invNormalCDF(1e-250L);
1109     assert( fabs(unknown1 -(-33.79958617269L) ) < 0.00000005);
1110
1111     Random gen;
1112     gen.seed(unpredictableSeed);
1113     // normalCDF function trivial given ERF, unlikely to contain subtle bugs.
1114     // Just make sure invNormalCDF works like it should as the inverse.
1115     foreach(i; 0..1000) {
1116         double x = uniform(0.0L, 1.0L);
1117         double mean = uniform(0.0L, 100.0L);
1118         double sd = uniform(1.0L, 3.0L);
1119         double inv = invNormalCDF(x, mean, sd);
1120         double rec = normalCDF(inv, mean, sd);
1121         assert(approxEqual(x, rec));
1122     }
1123 }
1124
1125 ///
1126 double logNormalPDF(double x, double mu = 0, double sigma = 1) {
1127     dstatsEnforce(sigma > 0, "sigma must be > 0 for log-normal distribution.");
1128
1129     immutable mulTerm = 1.0L / (x * sigma * SQ2PI);
1130     double expTerm = log(x) - mu;
1131     expTerm *= expTerm;
1132     expTerm /= 2 * sigma * sigma;
1133     return mulTerm * exp(-expTerm);
1134 }
1135
1136 unittest {
1137     // Values from R.
1138     assert(approxEqual(logNormalPDF(1, 0, 1), 0.3989423));
1139     assert(approxEqual(logNormalPDF(2, 2, 3), 0.06047173));
1140 }
1141
1142 ///
1143 double logNormalCDF(double x, double mu = 0, double sigma = 1) {
1144     dstatsEnforce(sigma > 0, "sigma must be > 0 for log-normal distribution.");
1145
1146     return 0.5L + 0.5L * erf((log(x) - mu) / (sigma * SQRT2));
1147 }
1148
1149 unittest {
1150     assert(approxEqual(logNormalCDF(4), 0.9171715));
1151     assert(approxEqual(logNormalCDF(1, -2, 3), 0.7475075));
1152 }
1153
1154 ///
1155 double logNormalCDFR(double x, double mu = 0, double sigma = 1) {
1156     dstatsEnforce(sigma > 0, "sigma must be > 0 for log-normal distribution.");
1157
1158     return 0.5L - 0.5L * erf((log(x) - mu) / (sigma * SQRT2));
1159 }
1160
1161 unittest {
1162     assert(approxEqual(logNormalCDF(4) + logNormalCDFR(4), 1));
1163     assert(approxEqual(logNormalCDF(1, -2, 3) + logNormalCDFR(1, -2, 3), 1));
1164 }
1165
1166 ///
1167 double weibullPDF(double x, double shape, double scale = 1) {
1168     dstatsEnforce(shape > 0, "shape must be > 0 for weibull distribution.");
1169     dstatsEnforce(scale > 0, "scale must be > 0 for weibull distribution.");
1170
1171     if(x < 0) {
1172         return 0;
1173     }
1174     double ret = pow(x / scale, shape - 1) * exp( -pow(x / scale, shape));
1175     return ret * (shape / scale);
1176 }
1177
1178 unittest {
1179     assert(approxEqual(weibullPDF(2,1,3), 0.1711390));
1180 }
1181
1182 ///
1183 double weibullCDF(double x, double shape, double scale = 1) {
1184     dstatsEnforce(shape > 0, "shape must be > 0 for weibull distribution.");
1185     dstatsEnforce(scale > 0, "scale must be > 0 for weibull distribution.");
1186
1187     double exponent = pow(x / scale, shape);
1188     return 1 - exp(-exponent);
1189 }
1190
1191 unittest {
1192     assert(approxEqual(weibullCDF(2, 3, 4), 0.1175031));
1193 }
1194
1195 ///
1196 double weibullCDFR(double x, double shape, double scale = 1) {
1197     dstatsEnforce(shape > 0, "shape must be > 0 for weibull distribution.");
1198     dstatsEnforce(scale > 0, "scale must be > 0 for weibull distribution.");
1199
1200     double exponent = pow(x / scale, shape);
1201     return exp(-exponent);
1202 }
1203
1204 unittest {
1205     assert(approxEqual(weibullCDF(2, 3, 4) + weibullCDFR(2, 3, 4), 1));
1206 }
1207
1208 // For K-S tests in dstats.random.  Todo:  Flesh out.
1209 double waldCDF(double x, double mu, double lambda) {
1210     double sqr = sqrt(lambda / (2 * x));
1211     double term1 = 1 + erf(sqr * (x / mu - 1));
1212     double term2 = exp(2 * lambda / mu);
1213     double term3 = 1 - erf(sqr * (x / mu + 1));
1214     return 0.5L * term1 + 0.5L * term2 * term3;
1215 }
1216
1217 // ditto.
1218 double rayleighCDF(double x, double mode) {
1219     return 1.0L - exp(-x * x / (2 * mode * mode));
1220 }
1221
1222 ///
1223 double studentsTPDF(double t, double df) {
1224     dstatsEnforce(df > 0, "Student's T must have >0 degrees of freedom.");
1225
1226     immutable logPart = lgamma(0.5 * df + 0.5) - lgamma(0.5 * df);
1227     immutable term1 = exp(logPart) / sqrt(df * PI);
1228     immutable term2 = (1.0 + t / df * t) ^^ (-0.5 * df - 0.5);
1229     return term1 * term2;
1230 }
1231
1232 ///
1233 double studentsTCDF(double t, double df) {
1234     dstatsEnforce(df > 0, "Student's T must have >0 degrees of freedom.");
1235
1236     double x = (t + sqrt(t * t + df)) / (2 * sqrt(t * t + df));
1237     return betaIncomplete(df * 0.5L, df * 0.5L, x);
1238 }
1239
1240 ///
1241 double studentsTCDFR(double t, double df)   {
1242     return studentsTCDF(-t, df);
1243 }
1244
1245 unittest {
1246     assert(approxEqual(studentsTPDF(1, 1), 0.1591549));
1247     assert(approxEqual(studentsTPDF(3, 10), 0.0114055));
1248     assert(approxEqual(studentsTPDF(-4, 5), 0.005123727));
1249
1250     assert(approxEqual(studentsTCDF(1, 1), 0.75));
1251     assert(approxEqual(studentsTCDF(1.061, 2), 0.8));
1252     assert(approxEqual(studentsTCDF(5.959, 5), 0.9995));
1253     assert(approxEqual(studentsTCDF(.667, 20), 0.75));
1254     assert(approxEqual(studentsTCDF(2.353, 3), 0.95));
1255 }
1256
1257 /******************************************
1258 *   Inverse of Student's t distribution
1259 *
1260 * Given probability p and degrees of freedom df,
1261 * finds the argument t such that the one-sided
1262 * studentsDistribution(nu,t) is equal to p.
1263 * Used to test whether two distributions have the same
1264 * standard deviation.
1265 *
1266 * Params:
1267 * df = degrees of freedom. Must be >1
1268 * p  = probability. 0 < p < 1
1269 */
1270 double invStudentsTCDF(double p, double df) {
1271     dstatsEnforce(p >= 0 && p <= 1, "P-values must be between 0, 1.");
1272     dstatsEnforce(df > 0, "Student's T must have >0 degrees of freedom.");
1273
1274     if (p==0) return -double.infinity;
1275     if (p==1) return double.infinity;
1276
1277     double rk, z;
1278     rk =  df;
1279
1280     if ( p > 0.25L && p < 0.75L ) {
1281         if ( p == 0.5L ) return 0;
1282         z = 1.0L - 2.0L * p;
1283         z = betaIncompleteInverse( 0.5L, 0.5L*rk, fabs(z) );
1284         double t = sqrt( rk*z/(1.0L-z) );
1285         if( p < 0.5L )
1286             t = -t;
1287         return t;
1288     }
1289     int rflg = -1; // sign of the result
1290     if (p >= 0.5L) {
1291         p = 1.0L - p;
1292         rflg = 1;
1293     }
1294     z = betaIncompleteInverse( 0.5L*rk, 0.5L, 2.0L*p );
1295
1296     if (z<0) return rflg * double.infinity;
1297     return rflg * sqrt( rk/z - rk );
1298 }
1299
1300 unittest {
1301     // The remaining values listed here are from Excel, and are unlikely to be accurate
1302     // in the last decimal places. However, they are helpful as a sanity check.
1303
1304     //  Microsoft Excel 2003 gives TINV(2*(1-0.995), 10) == 3.16927267160917
1305     assert(approxEqual(invStudentsTCDF(0.995, 10), 3.169_272_67L));
1306     assert(approxEqual(invStudentsTCDF(0.6, 8), 0.261_921_096_769_043L));
1307     assert(approxEqual(invStudentsTCDF(0.4, 18), -0.257_123_042_655_869L));
1308     assert(approxEqual(studentsTCDF(invStudentsTCDF(0.4L, 18), 18), .4L));
1309     assert(approxEqual(studentsTCDF( invStudentsTCDF(0.9L, 11), 11), 0.9L));
1310 }
1311
1312 /**
1313  * The Fisher distribution, its complement, and inverse.
1314  *
1315  * The F density function (also known as Snedcor's density or the
1316  * variance ratio density) is the density
1317  * of x = (u1/df1)/(u2/df2), where u1 and u2 are random
1318  * variables having $(POWER &chi;,2) distributions with df1
1319  * and df2 degrees of freedom, respectively.
1320  *
1321  * fisherCDF returns the area from zero to x under the F density
1322  * function.   The complementary function,
1323  * fisherCDFR, returns the area from x to &infin; under the F density function.
1324  *
1325  * The inverse of the complemented Fisher distribution,
1326  * invFisherCDFR, finds the argument x such that the integral
1327  * from x to infinity of the F density is equal to the given probability y.
1328
1329  * Params:
1330  *  df1 = Degrees of freedom of the first variable. Must be >= 1
1331  *  df2 = Degrees of freedom of the second variable. Must be >= 1
1332  *  x  = Must be >= 0
1333  */
1334 double fisherCDF(double x, double df1, double df2) {
1335     dstatsEnforce(df1 > 0 && df2 > 0,
1336         "Fisher distribution must have >0 degrees of freedom.");
1337     dstatsEnforce(x >= 0, "x must be >=0 for Fisher distribution.");
1338
1339     double a = cast(double)(df1);
1340     double b = cast(double)(df2);
1341     double w = a * x;
1342     w = w/(b + w);
1343     return betaIncomplete(0.5L*a, 0.5L*b, w);
1344 }
1345
1346 /** ditto */
1347 double fisherCDFR(double x, double df1, double df2) {
1348     dstatsEnforce(df1 > 0 && df2 > 0,
1349         "Fisher distribution must have >0 degrees of freedom.");
1350     dstatsEnforce(x >= 0, "x must be >=0 for Fisher distribution.");
1351
1352     double a = cast(double)(df1);
1353     double b = cast(double)(df2);
1354     double w = b / (b + a * x);
1355     return betaIncomplete( 0.5L*b, 0.5L*a, w );
1356 }
1357
1358 /**
1359  * Inverse of complemented Fisher distribution
1360  *
1361  * Finds the F density argument x such that the integral
1362  * from x to infinity of the F density is equal to the
1363  * given probability p.
1364  *
1365  * This is accomplished using the inverse beta integral
1366  * function and the relations
1367  *
1368  *      z = betaIncompleteInverse( df2/2, df1/2, p ),
1369  *      x = df2 (1-z) / (df1 z).
1370  *
1371  * Note that the following relations hold for the inverse of
1372  * the uncomplemented F distribution:
1373  *
1374  *      z = betaIncompleteInverse( df1/2, df2/2, p ),
1375  *      x = df2 z / (df1 (1-z)).
1376 */
1377 double invFisherCDFR(double df1, double df2, double p ) {
1378     dstatsEnforce(df1 > 0 && df2 > 0,
1379         "Fisher distribution must have >0 degrees of freedom.");
1380     dstatsEnforce(p >= 0 && p <= 1, "P-values must be between 0, 1.");
1381
1382     double a = df1;
1383     double b = df2;
1384     /* Compute probability for x = 0.5.  */
1385     double w = betaIncomplete( 0.5L*b, 0.5L*a, 0.5L );
1386     /* If that is greater than p, then the solution w < .5.
1387        Otherwise, solve at 1-p to remove cancellation in (b - b*w).  */
1388     if ( w > p || p < 0.001L) {
1389         w = betaIncompleteInverse( 0.5L*b, 0.5L*a, p );
1390         return (b - b*w)/(a*w);
1391     } else {
1392         w = betaIncompleteInverse( 0.5L*a, 0.5L*b, 1.0L - p );
1393         return b*w/(a*(1.0L-w));
1394     }
1395 }
1396
1397 unittest {
1398     // fDistCompl(df1, df2, x) = Excel's FDIST(x, df1, df2)
1399       assert(fabs(fisherCDFR(16.5, 6, 4) - 0.00858719177897249L)< 0.0000000000005L);
1400       assert(fabs((1-fisherCDF(0.1, 12, 23)) - 0.99990562845505L)< 0.0000000000005L);
1401       assert(fabs(invFisherCDFR(8, 34, 0.2) - 1.48267037661408L)< 0.0000000005L);
1402       assert(fabs(invFisherCDFR(4, 16, 0.008) - 5.043_537_593_48596L)< 0.0000000005L);
1403       // This one used to fail because of a bug in the definition of MINLOG.
1404       assert(approxEqual(fisherCDFR(invFisherCDFR(4,16, 0.008), 4, 16), 0.008));
1405 }
1406
1407 ///
1408 double negBinomPMF(ulong k, ulong n, double p) {
1409     dstatsEnforce(p >= 0 && p <= 1,
1410         "p must be between 0, 1 for negative binomial distribution.");
1411
1412     return exp(logNcomb(k - 1 + n, k) +  n * log(p) + k * log(1 - p));
1413 }
1414
1415 unittest {
1416     // Values from R.
1417     assert(approxEqual(negBinomPMF(1, 8, 0.7), 0.1383552));
1418     assert(approxEqual(negBinomPMF(3, 2, 0.5), 0.125));
1419 }
1420
1421
1422 /**********************
1423  * Negative binomial distribution.
1424  *
1425  * Returns the sum of the terms 0 through k of the negative
1426  * binomial distribution:
1427  *
1428  * $(BIGSUM j=0, k) $(CHOOSE n+j-1, j) $(POWER p, n) $(POWER (1-p), j)
1429  * ???? In mathworld, it is
1430  * $(BIGSUM j=0, k) $(CHOOSE n+j-1, j-1) $(POWER p, j) $(POWER (1-p), n)
1431  *
1432  * In a sequence of Bernoulli trials, this is the probability
1433  * that k or fewer failures precede the n-th success.
1434  *
1435  * The arguments must be positive, with 0 < p < 1 and r>0.
1436  *
1437  * The Geometric Distribution is a special case of the negative binomial
1438  * distribution.
1439  * -----------------------
1440  * geometricDistribution(k, p) = negativeBinomialDistribution(k, 1, p);
1441  * -----------------------
1442  * References:
1443  * $(LINK http://mathworld.wolfram.com/NegativeBinomialDistribution.html)
1444  */
1445 double negBinomCDF(ulong k, ulong n, double p ) {
1446     dstatsEnforce(p >= 0 && p <= 1,
1447         "p must be between 0, 1 for negative binomial distribution.");
1448     if ( k == 0 ) return pow(p, cast(double) n);
1449     return  betaIncomplete( n, k + 1, p );
1450 }
1451
1452 unittest {
1453     // Values from R.
1454     assert(approxEqual(negBinomCDF(50, 50, 0.5), 0.5397946));
1455     assert(approxEqual(negBinomCDF(2, 1, 0.5), 0.875));
1456 }
1457
1458 /**Probability that k or more failures precede the nth success.*/
1459 double negBinomCDFR(ulong k, ulong n, double p) {
1460     dstatsEnforce(p >= 0 && p <= 1,
1461         "p must be between 0, 1 for negative binomial distribution.");
1462
1463     if(k == 0)
1464         return 1;
1465     return betaIncomplete(k, n, 1.0L - p);
1466 }
1467
1468 unittest {
1469     assert(approxEqual(negBinomCDFR(10, 20, 0.5), 1 - negBinomCDF(9, 20, 0.5)));
1470 }
1471
1472 ///
1473 ulong invNegBinomCDF(double pVal, ulong n, double p) {
1474     dstatsEnforce(p >= 0 && p <= 1,
1475         "p must be between 0, 1 for negative binomial distribution.");
1476     dstatsEnforce(pVal >= 0 && pVal <= 1,
1477         "P-values must be between 0, 1.");
1478
1479     // Normal or gamma approx, then adjust.
1480     double mean = n * (1 - p) / p;
1481     double var = n * (1 - p) / (p * p);
1482     double skew = (2 - p) / sqrt(n * (1 - p));
1483     double kk = 4.0L / (skew * skew);
1484     double theta = sqrt(var / kk);
1485     double offset = (kk * theta) - mean + 0.5L;
1486     ulong guess;
1487
1488     // invGammaCDFR is very expensive, but worth it in cases where normal approx
1489     // would be the worst.  Otherwise, use normal b/c it's *MUCH* cheaper to
1490     // calculate.
1491     if(skew > 1.5 && var > 1_048_576)
1492         guess = cast(long) max(round(
1493          invGammaCDFR(1 - pVal, 1 / theta, kk) - offset), 0.0L);
1494     else
1495         guess = cast(long) max(round(
1496            invNormalCDF(pVal, mean, sqrt(var)) + 0.5), 0.0L);
1497
1498     // This is pretty arbitrary behavior, but I don't want to use exceptions
1499     // and it has to be handled as a special case.
1500     if(pVal > 1 - double.epsilon)
1501         return ulong.max;
1502     double guessP = negBinomCDF(guess, n, p);
1503
1504     if(guessP == pVal) {
1505         return guess;
1506     } else if(guessP < pVal) {
1507         for(ulong k = guess + 1; ; k++) {
1508             double newP = guessP + negBinomPMF(k, n, p);
1509             // Test for aliasing.
1510             if(newP == guessP)
1511                 return k - 1;
1512             if(abs(pVal - newP) > abs(guessP - pVal)) {
1513                 return k - 1;
1514             } else if(newP >= 1) {
1515                 return k;
1516             } else {
1517                 guessP = newP;
1518             }
1519         }
1520     } else {
1521         for(ulong k = guess - 1; guess != ulong.max; k--) {
1522             double newP = guessP - negBinomPMF(k + 1, n, p);
1523             // Test for aliasing.
1524             if(newP == guessP)
1525                 return k + 1;
1526             if(abs(newP - pVal) > abs(guessP - pVal)) {
1527                 return k + 1;
1528             } else {
1529                 guessP = newP;
1530             }
1531         }
1532         return 0;
1533     }
1534 }
1535
1536 unittest {
1537     Random gen = Random(unpredictableSeed);
1538     uint nSkipped;
1539     foreach(i; 0..1000) {
1540         uint n = uniform(1u, 10u);
1541         double p = uniform(0.0L, 1L);
1542         uint k = uniform(0, 20);
1543         double pVal = negBinomCDF(k, n, p);
1544
1545         // In extreme tails, p-values can alias, giving incorrect results.
1546         // This is a corner case that nothing can be done about.  Just skip
1547         // these.
1548         if(pVal >= 1 - 10 * double.epsilon) {
1549             nSkipped++;
1550             continue;
1551         }
1552         assert(invNegBinomCDF(pVal, n, p) == k);
1553     }
1554 }
1555
1556 ///
1557 double exponentialPDF(double x, double lambda) {
1558     dstatsEnforce(x >= 0, "x must be >0 in exponential distribution");
1559     dstatsEnforce(lambda > 0, "lambda must be >0 in exponential distribution");
1560
1561     return lambda * exp(-lambda * x);
1562 }
1563
1564 ///
1565 double exponentialCDF(double x, double lambda) {
1566     dstatsEnforce(x >= 0, "x must be >0 in exponential distribution");
1567     dstatsEnforce(lambda > 0, "lambda must be >0 in exponential distribution");
1568
1569     return 1.0 - exp(-lambda * x);
1570 }
1571
1572 ///
1573 double exponentialCDFR(double x, double lambda) {
1574     dstatsEnforce(x >= 0, "x must be >0 in exponential distribution");
1575     dstatsEnforce(lambda > 0, "lambda must be >0 in exponential distribution");
1576
1577     return exp(-lambda * x);
1578 }
1579
1580 ///
1581 double invExponentialCDF(double p, double lambda) {
1582     dstatsEnforce(p >= 0 && p <= 1, "p must be between 0, 1 in exponential distribution");
1583     dstatsEnforce(lambda > 0, "lambda must be >0 in exponential distribution");
1584     return log(-1.0 / (p - 1.0)) / lambda;
1585 }
1586
1587 unittest {
1588     // Values from R.
1589     assert(approxEqual(exponentialPDF(0.75, 3), 0.3161977));
1590     assert(approxEqual(exponentialCDF(0.75, 3), 0.8946008));
1591     assert(approxEqual(exponentialCDFR(0.75, 3), 0.1053992));
1592
1593     assert(approxEqual(invExponentialCDF(0.8, 2), 0.804719));
1594     assert(approxEqual(invExponentialCDF(0.2, 7), 0.03187765));
1595 }
1596
1597 ///
1598 double gammaPDF(double x, double rate, double shape) {
1599     dstatsEnforce(x > 0, "x must be >0 in gamma distribution.");
1600     dstatsEnforce(rate > 0, "rate must be >0 in gamma distribution.");
1601     dstatsEnforce(shape > 0, "shape must be >0 in gamma distribution.");
1602
1603     immutable scale = 1.0 / rate;
1604     immutable firstPart = x ^^ (shape - 1);
1605     immutable logNumer = -x / scale;
1606     immutable logDenom = lgamma(shape) + shape * log(scale);
1607     return firstPart * exp(logNumer - logDenom);
1608 }
1609
1610 ///
1611 double gammaCDF(double x, double rate, double shape) {
1612     dstatsEnforce(x > 0, "x must be >0 in gamma distribution.");
1613     dstatsEnforce(rate > 0, "rate must be >0 in gamma distribution.");
1614     dstatsEnforce(shape > 0, "shape must be >0 in gamma distribution.");
1615
1616     return gammaIncomplete(shape, rate * x);
1617 }
1618
1619 ///
1620 double gammaCDFR(double x, double rate, double shape) {
1621     dstatsEnforce(x > 0, "x must be >0 in gamma distribution.");
1622     dstatsEnforce(rate > 0, "rate must be >0 in gamma distribution.");
1623     dstatsEnforce(shape > 0, "shape must be >0 in gamma distribution.");
1624
1625     return gammaIncompleteCompl(shape, rate * x);
1626 }
1627
1628 /**This just calls invGammaCDFR w/ 1 - p b/c invGammaCDFR is more accurate,
1629  * but this function is necessary for consistency.
1630  */
1631 double invGammaCDF(double p, double rate, double shape) {
1632     return invGammaCDFR(1.0 - p, rate, shape);
1633 }
1634
1635 ///
1636 double invGammaCDFR(double p, double rate, double shape) {
1637     dstatsEnforce(p >= 0 && p <= 1, "p must be between 0, 1 in gamma distribution.");
1638     dstatsEnforce(rate > 0, "rate must be >0 in gamma distribution.");
1639     dstatsEnforce(shape > 0, "shape must be >0 in gamma distribution.");
1640
1641     double ratex = gammaIncompleteComplInverse(shape, p);
1642     return ratex / rate;
1643 }
1644
1645 unittest {
1646     assert(approxEqual(gammaPDF(1, 2, 5), 0.1804470));
1647     assert(approxEqual(gammaPDF(0.5, 8, 4), 1.562935));
1648     assert(approxEqual(gammaPDF(3, 2, 7), 0.3212463));
1649     assert(approxEqual(gammaCDF(1, 2, 5), 0.05265302));
1650     assert(approxEqual(gammaCDFR(1, 2, 5), 0.947347));
1651
1652     double inv = invGammaCDFR(0.78, 2, 1);
1653     assert(approxEqual(gammaCDFR(inv, 2, 1), 0.78));
1654
1655     double inv2 = invGammaCDF(0.78, 2, 1);
1656     assert(approxEqual(gammaCDF(inv2, 2, 1), 0.78));
1657 }
1658
1659 ///
1660 double betaPDF(double x, double alpha, double beta) {
1661     dstatsEnforce(alpha > 0, "Alpha must be >0 for beta distribution.");
1662     dstatsEnforce(beta > 0, "Beta must be >0 for beta distribution.");
1663     dstatsEnforce(x >= 0 && x <= 1, "x must be between 0, 1 for beta distribution.");
1664
1665     return x ^^ (alpha - 1) * (1 - x) ^^ (beta - 1) /
1666         std.mathspecial.beta(alpha, beta);
1667 }
1668
1669 ///
1670 double betaCDF(double x, double alpha, double beta) {
1671     dstatsEnforce(alpha > 0, "Alpha must be >0 for beta distribution.");
1672     dstatsEnforce(beta > 0, "Beta must be >0 for beta distribution.");
1673     dstatsEnforce(x >= 0 && x <= 1, "x must be between 0, 1 for beta distribution.");
1674
1675     return std.mathspecial.betaIncomplete(alpha, beta, x);
1676 }
1677
1678 ///
1679 double betaCDFR(double x, double alpha, double beta) {
1680     dstatsEnforce(alpha > 0, "Alpha must be >0 for beta distribution.");
1681     dstatsEnforce(beta > 0, "Beta must be >0 for beta distribution.");
1682     dstatsEnforce(x >= 0 && x <= 1, "x must be between 0, 1 for beta distribution.");
1683
1684     return std.mathspecial.betaIncomplete(beta, alpha, 1 - x);
1685 }
1686
1687 ///
1688 double invBetaCDF(double p, double alpha, double beta) {
1689     dstatsEnforce(alpha > 0, "Alpha must be >0 for beta distribution.");
1690     dstatsEnforce(beta > 0, "Beta must be >0 for beta distribution.");
1691     dstatsEnforce(p >= 0 && p <= 1, "p must be between 0, 1 for beta distribution.");
1692
1693     return std.mathspecial.betaIncompleteInverse(alpha, beta, p);
1694 }
1695
1696 unittest {
1697     // Values from R.
1698     assert(approxEqual(betaPDF(0.3, 2, 3), 1.764));
1699     assert(approxEqual(betaPDF(0.78, 0.9, 4), 0.03518569));
1700
1701     assert(approxEqual(betaCDF(0.3, 2, 3), 0.3483));
1702     assert(approxEqual(betaCDF(0.78, 0.9, 4), 0.9980752));
1703
1704     assert(approxEqual(betaCDFR(0.3, 2, 3), 0.6517));
1705     assert(approxEqual(betaCDFR(0.78, 0.9, 4), 0.001924818));
1706
1707     assert(approxEqual(invBetaCDF(0.3483, 2, 3), 0.3));
1708     assert(approxEqual(invBetaCDF(0.9980752, 0.9, 4), 0.78));
1709 }
1710
1711 /**
1712 The Dirichlet probability density.
1713
1714 Params:
1715
1716 x = An input range of observed values.  All must be between [0, 1].  They
1717 must also sum to 1, though this is not checked because small deviations from
1718 this may result due to numerical error.
1719
1720 alpha = A forward range of parameters.  This must have the same length as
1721 x.
1722 */
1723 double dirichletPDF(X, A)(X x, A alpha)
1724 if(isInputRange!X && isForwardRange!A && is(ElementType!X : double) &&
1725 is(ElementType!A : double)) {
1726
1727     // Evaluating the multinomial beta function = product(gamma(alpha_1)) over
1728     // gamma(sum(alpha)), in log space.
1729     double logNormalizer = 0;
1730     double sumAlpha = 0;
1731
1732     foreach(a; alpha.save) {
1733         dstatsEnforce(a > 0, "All alpha values must be > 0 for Dirichlet distribution.");
1734         logNormalizer += lgamma(a);
1735         sumAlpha += a;
1736     }
1737
1738     logNormalizer -= lgamma(sumAlpha);
1739     double sum = 0;
1740     foreach(xElem, a; lockstep(x, alpha)) {
1741         dstatsEnforce(xElem > 0, "All x values must be > 0 for Dirichlet distribution.");
1742         sum += log(xElem) * (a - 1);
1743     }
1744
1745     sum -= logNormalizer;
1746     return exp(sum);
1747 }
1748
1749 unittest {
1750     // Test against beta
1751     assert(approxEqual(dirichletPDF([0.1, 0.9], [2, 3]), betaPDF(0.1, 2, 3)));
1752
1753     // A few values from R's gregmisc package
1754     assert(approxEqual(dirichletPDF([0.1, 0.2, 0.7], [4, 5, 6]), 1.356672));
1755     assert(approxEqual(dirichletPDF([0.8, 0.05, 0.15], [8, 5, 6]), 0.04390199));
1756 }
1757
1758 ///
1759 double cauchyPDF(double X, double X0 = 0, double gamma = 1) {
1760     dstatsEnforce(gamma > 0, "gamma must be > 0 for Cauchy distribution.");
1761
1762     double toSquare = (X - X0) / gamma;
1763     return 1.0L / (
1764         PI * gamma * (1 + toSquare * toSquare));
1765 }
1766
1767 unittest {
1768     assert(approxEqual(cauchyPDF(5), 0.01224269));
1769     assert(approxEqual(cauchyPDF(2), 0.06366198));
1770 }
1771
1772
1773 ///
1774 double cauchyCDF(double X, double X0 = 0, double gamma = 1) {
1775     dstatsEnforce(gamma > 0, "gamma must be > 0 for Cauchy distribution.");
1776
1777     return M_1_PI * atan((X - X0) / gamma) + 0.5L;
1778 }
1779
1780 unittest {
1781     // Values from R
1782     assert(approxEqual(cauchyCDF(-10), 0.03172552));
1783     assert(approxEqual(cauchyCDF(1), 0.75));
1784 }
1785
1786 ///
1787 double cauchyCDFR(double X, double X0 = 0, double gamma = 1) {
1788     dstatsEnforce(gamma > 0, "gamma must be > 0 for Cauchy distribution.");
1789
1790     return M_1_PI * atan((X0 - X) / gamma) + 0.5L;
1791 }
1792
1793 unittest {
1794     // Values from R
1795     assert(approxEqual(1 - cauchyCDFR(-10), 0.03172552));
1796     assert(approxEqual(1 - cauchyCDFR(1), 0.75));
1797 }
1798
1799 ///
1800 double invCauchyCDF(double p, double X0 = 0, double gamma = 1) {
1801     dstatsEnforce(gamma > 0, "gamma must be > 0 for Cauchy distribution.");
1802     dstatsEnforce(p >= 0 && p <= 1, "P-values must be between 0, 1.");
1803
1804     return X0 + gamma * tan(PI * (p - 0.5L));
1805 }
1806
1807 unittest {
1808     // cauchyCDF already tested.  Just make sure this is the inverse.
1809     assert(approxEqual(invCauchyCDF(cauchyCDF(.5)), .5));
1810     assert(approxEqual(invCauchyCDF(cauchyCDF(.99)), .99));
1811     assert(approxEqual(invCauchyCDF(cauchyCDF(.03)), .03));
1812 }
1813
1814 // For K-S tests in dstats.random.  To be fleshed out later.  Intentionally
1815 // lacking ddoc.
1816 double logisticCDF(double x, double loc, double shape) {
1817     return 1.0L / (1 + exp(-(x - loc) / shape));
1818 }
1819
1820 ///
1821 double laplacePDF(double x, double mu = 0, double b = 1) {
1822     dstatsEnforce(b > 0, "b must be > 0 for laplace distribution.");
1823
1824     return (exp(-abs(x - mu) / b)) / (2 * b);
1825 }
1826
1827 unittest {
1828     // Values from Maxima.
1829     assert(approxEqual(laplacePDF(3, 2, 1), 0.18393972058572));
1830     assert(approxEqual(laplacePDF(-8, 6, 7), 0.0096668059454723));
1831 }
1832
1833 ///
1834 double laplaceCDF(double X, double mu = 0, double b = 1) {
1835     dstatsEnforce(b > 0, "b must be > 0 for laplace distribution.");
1836
1837     double diff = (X - mu);
1838     double sign = (diff > 0) ? 1 : -1;
1839     return 0.5L *(1 + sign * (1 - exp(-abs(diff) / b)));
1840 }
1841
1842 unittest {
1843     // Values from Octave.
1844     assert(approxEqual(laplaceCDF(5), 0.9963));
1845     assert(approxEqual(laplaceCDF(-3.14), .021641));
1846     assert(approxEqual(laplaceCDF(0.012), 0.50596));
1847 }
1848
1849 ///
1850 double laplaceCDFR(double X, double mu = 0, double b = 1) {
1851     dstatsEnforce(b > 0, "b must be > 0 for laplace distribution.");
1852
1853     double diff = (mu - X);
1854     double sign = (diff > 0) ? 1 : -1;
1855     return 0.5L *(1 + sign * (1 - exp(-abs(diff) / b)));
1856 }
1857
1858 unittest {
1859     // Values from Octave.
1860     assert(approxEqual(1 - laplaceCDFR(5), 0.9963));
1861     assert(approxEqual(1 - laplaceCDFR(-3.14), .021641));
1862     assert(approxEqual(1 - laplaceCDFR(0.012), 0.50596));
1863 }
1864
1865 ///
1866 double invLaplaceCDF(double p, double mu = 0, double b = 1) {
1867     dstatsEnforce(p >= 0 && p <= 1, "P-values must be between 0, 1.");
1868     dstatsEnforce(b > 0, "b must be > 0 for laplace distribution.");
1869
1870     double p05 = p - 0.5L;
1871     double sign = (p05 < 0) ? -1.0L : 1.0L;
1872     return mu - b * sign * log(1.0L - 2 * abs(p05));
1873 }
1874
1875 unittest {
1876     assert(approxEqual(invLaplaceCDF(0.012), -3.7297));
1877     assert(approxEqual(invLaplaceCDF(0.82), 1.0217));
1878 }
1879
1880
1881 /**Kolmogorov distribution.  Used in Kolmogorov-Smirnov testing.
1882  *
1883  * References: http://en.wikipedia.org/wiki/Kolmogorov-Smirnov
1884  */
1885 double kolmDist(double X) {
1886     dstatsEnforce(X >= 0, "X must be >= 0 for Kolmogorov distribution.");
1887
1888     if(X == 0) {
1889         //Handle as a special case.  Otherwise, get NAN b/c of divide by zero.
1890         return 0;
1891     }
1892     double result = 0;
1893     double i = 1;
1894     while(true) {
1895         immutable delta = exp(-(2 * i - 1) * (2 * i - 1) * PI * PI  / (8 * X * X));
1896         i++;
1897
1898         immutable oldResult = result;
1899         result += delta;
1900         if(isNaN(result) || oldResult == result) {
1901             break;
1902         }
1903     }
1904     result *= (sqrt(2 * PI) / X);
1905     return result;
1906 }
1907
1908 unittest {
1909     assert(approxEqual(1 - kolmDist(.75), 0.627167));
1910     assert(approxEqual(1 - kolmDist(.5), 0.9639452436));
1911     assert(approxEqual(1 - kolmDist(.9), 0.39273070));
1912     assert(approxEqual(1 - kolmDist(1.2), 0.112249666));
1913 }
Note: See TracBrowser for help on using the browser.