root/trunk/kerneldensity.d

Revision 226, 18.9 kB (checked in by dsimcha, 2 years ago)

Fix 64-bit compilation errors. Code only passes semantic analysis, hasn't been unit tested in 64-bit environment yet.

Line 
1 /**This module contains a small but growing library for performing kernel
2  * density estimation.
3  *
4  * Author:  David Simcha
5  */
6 /*
7  * License:
8  * Boost Software License - Version 1.0 - August 17th, 2003
9  *
10  * Permission is hereby granted, free of charge, to any person or organization
11  * obtaining a copy of the software and accompanying documentation covered by
12  * this license (the "Software") to use, reproduce, display, distribute,
13  * execute, and transmit the Software, and to prepare derivative works of the
14  * Software, and to permit third-parties to whom the Software is furnished to
15  * do so, all subject to the following:
16  *
17  * The copyright notices in the Software and this entire statement, including
18  * the above license grant, this restriction and the following disclaimer,
19  * must be included in all copies of the Software, in whole or in part, and
20  * all derivative works of the Software, unless such copies or derivative
21  * works are solely in the form of machine-executable object code generated by
22  * a source language processor.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26  * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
27  * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
28  * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
29  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
30  * DEALINGS IN THE SOFTWARE.
31  */
32 module dstats.kerneldensity;
33
34 import std.conv, std.math, std.algorithm, std.exception, std.traits, std.range,
35     std.array, std.typetuple, dstats.distrib;
36
37 import  dstats.alloc, dstats.base, dstats.summary;
38
39 version(unittest) {
40
41     import dstats.random, std.stdio;
42
43     void main() {}
44 }
45
46 /**Estimates densities in the 1-dimensional case.  The 1-D case is special
47  * enough to be treated as a special case, since it's very common and enables
48  * some significant optimizations that are otherwise not feasible.
49  *
50  * Under the hood, this works by binning the data into a large number of bins
51  * (currently 1,000), convolving it with the kernel function to smooth it, and
52  * then using linear interpolation to evaluate the density estimates.  This
53  * will produce results that are different from the textbook definition of
54  * kernel density estimation, but to an extent that's negligible in most cases.
55  * It also means that constructing this object is relatively expensive, but
56  * evaluating a density estimate can be done in O(1) time complexity afterwords.
57  */
58 class KernelDensity1D {
59 private:
60     immutable double[] bins;
61     immutable double[] cumulative;
62     immutable double minElem;
63     immutable double maxElem;
64     immutable double diffNeg1Nbin;
65
66
67     this(immutable double[] bins, immutable double[] cumulative,
68          double minElem, double maxElem) {
69         this.bins = bins;
70         this.cumulative = cumulative;
71         this.minElem = minElem;
72         this.maxElem = maxElem;
73         this.diffNeg1Nbin = bins.length / (maxElem - minElem);
74     }
75
76     private static double findEdgeBuffer(C)(C kernel) {
77         // Search for the approx. point where the kernel's density is 0.001 *
78         // what it is at zero.
79         immutable zeroVal = kernel(0);
80         double ret = 1;
81
82         double factor = 4;
83         double kernelVal;
84
85         do {
86             while(kernel(ret) / zeroVal > 1e-3) {
87                 ret *= factor;
88             }
89
90             factor = (factor - 1) / 2 + 1;
91             while(kernel(ret) / zeroVal < 1e-4) {
92                 ret /= factor;
93             }
94
95             kernelVal = kernel(ret) / zeroVal;
96         } while((kernelVal > 1e-3 || kernelVal < 1e-4) && factor > 1);
97
98         return ret;
99     }
100
101 public:
102     /**Construct a kernel density estimation object from a callable object.
103      * R must be a range of numeric types.  C must be a kernel function,
104      * delegate, or class or struct with overloaded opCall.  The kernel
105      * function is assumed to be symmetric about zero, to take its maximum
106      * value at zero and to be unimodal.
107      *
108      * edgeBuffer determines how much space below and above the smallest and
109      * largest observed value will be allocated when doing the binning.
110      * Values less than reduce!min(range) - edgeBuffer or greater than
111      * reduce!max(range) + edgeBuffer will be assigned a density of zero.
112      * If this value is left at its default, it will be set to a value at which
113      * the kernel is somewhere between 1e-3 and 1e-4 times its value at zero.
114      *
115      * The bandwidth of the kernel is indirectly selected by parametrizing the
116      * kernel function.
117      *
118      * Examples:
119      * ---
120      * auto randNums = randArray!rNorm(1_000, 0, 1);
121      * auto kernel = parametrize!normalPDF(0, 0.01);
122      * auto density = KernelDensity1D(kernel, randNums);
123      * writeln(normalPDF(1, 0, 1), "  ", density(1)).  // Should be about the same.
124      * ---
125      */
126     static KernelDensity1D fromCallable(C, R)
127     (scope C kernel, R range, double edgeBuffer = double.nan)
128     if(isForwardRange!R && is(typeof(kernel(2.0)) : double)) {
129         enum nBin = 1000;
130         mixin(newFrame);
131
132         uint N = 0;
133         double minElem = double.infinity;
134         double maxElem = -double.infinity;
135         foreach(elem; range) {
136             minElem = min(minElem, elem);
137             maxElem = max(maxElem, elem);
138             N++;
139         }
140
141         if(isNaN(edgeBuffer)) {
142             edgeBuffer = findEdgeBuffer(kernel);
143         }
144         minElem -= edgeBuffer;
145         maxElem += edgeBuffer;
146
147         // Using ints here because they convert faster to floats than uints do.
148         auto binsRaw = newStack!int(nBin);
149         binsRaw[] = 0;
150
151         foreach(elemRaw; range) {
152             double elem = elemRaw - minElem;
153             elem /= (maxElem - minElem);
154             elem *= nBin;
155             auto bin = to!uint(elem);
156             if(bin == nBin) {
157                 bin--;
158             }
159
160             binsRaw[bin]++;
161         }
162
163         // Convolve the binned data with our kernel.  Since N is fairly small
164         // we'll use a simple O(N^2) algorithm.  According to my measurements,
165         // this is actually comparable in speed to using an FFT (and a lot
166         // simplier and more space efficient) because:
167         //
168         // 1.  We can take advantage of kernel symmetry.
169         //
170         // 2.  We can take advantage of the sparsity of binsRaw.  (We don't
171         //     need to convolve the zero count bins.)
172         //
173         // 3.  We don't need to do any zero padding to get a non-cyclic
174         //     convolution.
175         //
176         // 4.  We don't need to convolve the tails of the kernel function,
177         //     where the contribution to the final density estimate would be
178         //     negligible.
179         auto binsCooked = newVoid!double(nBin);
180         binsCooked[] = 0;
181
182         auto kernelPoints = newStack!double(nBin);
183         immutable stepSize = (maxElem - minElem) / nBin;
184
185         kernelPoints[0] = kernel(0);
186         immutable stopAt = kernelPoints[0] * 1e-10;
187         foreach(ptrdiff_t i; 1..kernelPoints.length) {
188             kernelPoints[i] = kernel(stepSize * i);
189
190             // Don't bother convolving stuff that contributes negligibly.
191             if(kernelPoints[i] < stopAt) {
192                 kernelPoints = kernelPoints[0..i];
193                 break;
194             }
195         }
196
197         foreach(i, count; binsRaw) if(count > 0) {
198             binsCooked[i] += kernelPoints[0] * count;
199
200             foreach(offset; 1..min(kernelPoints.length, max(i + 1, nBin - i))) {
201                 immutable kernelVal = kernelPoints[offset];
202
203                 if(i >= offset) {
204                     binsCooked[i - offset] += kernelVal * count;
205                 }
206
207                 if(i + offset < nBin) {
208                     binsCooked[i + offset] += kernelVal * count;
209                 }
210             }
211         }
212
213         binsCooked[] /= sum(binsCooked);
214         binsCooked[] *= nBin / (maxElem - minElem);  // Make it a density.
215
216         auto cumulative = newVoid!double(nBin);
217         cumulative[0] = binsCooked[0];
218         foreach(i; 1..nBin) {
219             cumulative[i] = cumulative[i - 1] + binsCooked[i];
220         }
221         cumulative[] /= cumulative[$ - 1];
222
223         return new typeof(this)(
224             assumeUnique(binsCooked), assumeUnique(cumulative),
225             minElem, maxElem);
226     }
227
228     /**Construct a kernel density estimator from an alias.*/
229     static KernelDensity1D fromAlias(alias kernel, R)
230     (R range, double edgeBuffer = double.nan)
231     if(isForwardRange!R && is(typeof(kernel(2.0)) : double)) {
232         static double kernelFun(double x) {
233             return kernel(x);
234         }
235
236         return fromCallable(&kernelFun, range, edgeBuffer);
237     }
238
239     /**Construct a kernel density estimator using the default kernel, which is
240      * a Gaussian kernel with the Scott bandwidth.
241      */
242     static KernelDensity1D fromDefaultKernel(R)
243     (R range, double edgeBuffer = double.nan)
244     if(isForwardRange!R && is(ElementType!R : double)) {
245         immutable bandwidth = scottBandwidth(range.save);
246
247         double kernel(double x) {
248             return normalPDF(x, 0, bandwidth);
249         }
250
251         return fromCallable(&kernel, range, edgeBuffer);
252     }
253
254     /**Compute the probability density at a given point.*/
255     double opCall(double x) const {
256         if(x < minElem || x > maxElem) {
257             return 0;
258         }
259
260         x -= minElem;
261         x *= diffNeg1Nbin;
262
263         immutable fract = x - floor(x);
264         immutable upper = to!size_t(ceil(x));
265         immutable lower = to!size_t(floor(x));
266
267         if(upper == bins.length) {
268             return bins[$ - 1];
269         }
270
271         immutable ret = fract * bins[upper] + (1 - fract) * bins[lower];
272         return max(0, ret);  // Compensate for roundoff
273     }
274
275     /**Compute the cumulative density, i.e. the integral from -infinity to x.*/
276     double cdf(double x) const {
277         if(x <= minElem) {
278             return 0;
279         } else if(x >= maxElem) {
280             return 1;
281         }
282
283         x -= minElem;
284         x *= diffNeg1Nbin;
285
286         immutable fract = x - floor(x);
287         immutable upper = to!size_t(ceil(x));
288         immutable lower = to!size_t(floor(x));
289
290         if(upper == cumulative.length) {
291             return 1;
292         }
293
294         return fract * cumulative[upper] + (1 - fract) * cumulative[lower];
295     }
296
297     /**Compute the cumulative density from the rhs, i.e. the integral from
298      * x to infinity.
299      */
300     double cdfr(double x) const {
301         // Here, we can get away with just returning 1 - cdf b/c
302         // there are inaccuracies several orders of magnitude bigger than
303         // the rounding error.
304         return 1.0 - cdf(x);
305     }
306 }
307
308 unittest {
309     auto kde = KernelDensity1D.fromCallable(parametrize!normalPDF(0, 1), [0]);
310     assert(approxEqual(kde(1), normalPDF(1)));
311     assert(approxEqual(kde.cdf(1), normalCDF(1)));
312     assert(approxEqual(kde.cdfr(1), normalCDFR(1)));
313
314     // This is purely to see if fromAlias works.
315     auto cosKde = KernelDensity1D.fromAlias!cos([0], 1);
316
317     // Make sure fromDefaultKernel at least instantiates.
318     auto defaultKde = KernelDensity1D.fromDefaultKernel([1, 2, 3]);
319 }
320
321 /**Uses Scott's Rule to select the bandwidth of the Gaussian kernel density
322  * estimator.  This is 1.06 * min(stdev(data), interquartileRange(data) / 1.34)
323  * N ^^ -0.2.  R must be a forward range of numeric types.
324  *
325  * Examples:
326  * ---
327  * immutable bandwidth = scottBandwidth(data);
328  * auto kernel = parametrize!normalPDF(0, bandwidth);
329  * auto kde = KernelDensity1D(data, kernel);
330  * ---
331  *
332  * References:
333  * Scott, D. W. (1992) Multivariate Density Estimation: Theory, Practice,
334  * and Visualization. Wiley.
335  */
336 double scottBandwidth(R)(R data)
337 if(isForwardRange!R && is(ElementType!R : double)) {
338
339     immutable summary = meanStdev(data.save);
340     immutable interquartile = interquantileRange(data.save, 0.25) / 1.34;
341     immutable sigmaHat = min(summary.stdev, interquartile);
342
343     return 1.06 * sigmaHat * (summary.N ^^ -0.2);
344 }
345
346 unittest {
347     // Values from R.
348     assert(approxEqual(scottBandwidth([1,2,3,4,5]), 1.14666));
349     assert(approxEqual(scottBandwidth([1,2,2,2,2,8,8,8,8]), 2.242446));
350 }
351
352 /**Construct an N-dimensional kernel density estimator.  This is done using
353  * the textbook definition of kernel density estimation, since the binning
354  * and convolving method used in the 1-D case would rapidly become
355  * unfeasible w.r.t. memory usage as dimensionality increased.
356  *
357  * Eventually, a 2-D estimator might be added as another special case, but
358  * beyond 2-D, bin-and-convolute clearly isn't feasible.
359  *
360  * This class can be used for 1-D estimation instead of KernelDensity1D, and
361  * will work properly.  This is useful if:
362  *
363  * 1.  You can't accept even the slightest deviation from the results that the
364  *     textbook definition of kernel density estimation would produce.
365  *
366  * 2.  You are only going to evaluate at a few points and want to avoid the
367  *     up-front cost of the convolution used in the 1-D case.
368  *
369  * 3.  You're using some weird kernel that doesn't meet the assumptions
370  *     required for KernelDensity1D.
371  */
372 class KernelDensity {
373     private immutable double[][] points;
374     private double delegate(double[]...) kernel;
375
376     private this(immutable double[][] points) {
377         this.points = points;
378     }
379
380     /**Returns the number of dimensions in the estimator.*/
381     uint nDimensions() const @property {
382         // More than uint.max dimensions is absolutely implausible.
383         assert(points.length <= uint.max);
384         return cast(uint) points.length;
385     }
386
387     /**Construct a kernel density estimator from a kernel provided as a callable
388      * object (such as a function pointer, delegate, or class with overloaded
389      * opCall).  R must be either a range of ranges, multiple ranges passed in
390      * as variadic arguments, or a single range for the 1D case.  Each range
391      * represents the values of one variable in the joint distribution.
392      * kernel must accept either an array of doubles or the same number of
393      * arguments as the number of dimensions, and must return a floating point
394      * number.
395      *
396      * Examples:
397      * ---
398      * // Create an estimate of the density of the joint distribution of
399      * // hours sleep and programming skill.
400      * auto programmingSkill = [8,6,7,5,3,0,9];
401      * auto hoursSleep = [3,6,2,4,3,5,8];
402      *
403      * // Make a 2D Gaussian kernel function with bandwidth 0.5 in both
404      * // dimensions and covariance zero.
405      * static double myKernel(double x1, double x2) {
406      *    return normalPDF(x1, 0, 0.5) * normalPDF(x2, 0, 0.5);
407      * }
408      *
409      * auto estimator = KernelDensity.fromCallable
410      *     (&myKernel, programmingSkill, hoursSleep);
411      *
412      * // Estimate the density at programming skill 1, 2 hours sleep.
413      * auto density = estimator(1, 2);
414      * ---
415      */
416     static KernelDensity fromCallable(C, R...)(C kernel, R ranges)
417     if(allSatisfy!(isInputRange, R)) {
418         auto kernelWrapped = wrapToArrayVariadic(kernel);
419
420         static if(R.length == 1 && isInputRange!(typeof(ranges[0].front))) {
421             alias ranges[0] data;
422         } else {
423             alias ranges data;
424         }
425
426         double[][] points;
427         foreach(range; data) {
428             double[] asDoubles;
429
430             static if(dstats.base.hasLength!(typeof(range))) {
431                 asDoubles = newVoid!double(range.length);
432
433                 size_t i = 0;
434                 foreach(elem; range) {
435                     asDoubles[i++] = elem;
436                 }
437             } else {
438                 auto app = appender(&asDoubles);
439                 foreach(elem; range) {
440                     app.put(elem);
441                 }
442             }
443
444             points ~= asDoubles;
445         }
446
447         dstatsEnforce(points.length,
448             "Can't construct a zero dimensional kernel density estimator.");
449
450         foreach(arr; points[1..$]) {
451             dstatsEnforce(arr.length == points[0].length,
452                 "All ranges must be the same length to construct a KernelDensity.");
453         }
454
455         auto ret = new KernelDensity(assumeUnique(points));
456         ret.kernel = kernelWrapped;
457
458         return ret;
459     }
460
461     /**Estimate the density at the point given by x.  The variables in X are
462      * provided in the same order as the ranges were provided for estimation.
463      */
464     double opCall(double[] x...) const {
465         dstatsEnforce(x.length == points.length,
466             "Dimension mismatch when evaluating kernel density.");
467         double sum = 0;
468
469         mixin(newFrame);
470         auto dataPoint = newStack!double(points.length);
471         foreach(i; 0..points[0].length) {
472             foreach(j; 0..points.length) {
473                 dataPoint[j] = x[j] - points[j][i];
474             }
475
476             sum += kernel(dataPoint);
477         }
478
479         sum /= points[0].length;
480         return sum;
481     }
482 }
483
484 unittest {
485     auto data = randArray!rNorm(100, 0, 1);
486     auto kernel = parametrize!normalPDF(0, scottBandwidth(data));
487     auto kde = KernelDensity.fromCallable(kernel, data);
488     auto kde1 = KernelDensity1D.fromCallable(kernel, data);
489     foreach(i; 0..5) {
490         assert(abs(kde(i) - kde1(i)) < 0.01);
491     }
492
493     // Make sure example compiles.
494     auto programmingSkill = [8,6,7,5,3,0,9];
495     auto hoursSleep = [3,6,2,4,3,5,8];
496
497     // Make a 2D Gaussian kernel function with bandwidth 0.5 in both
498     // dimensions and covariance zero.
499     static double myKernel(double x1, double x2) {
500         return normalPDF(x1, 0, 0.5) * normalPDF(x2, 0, 0.5);
501     }
502
503     auto estimator = KernelDensity.fromCallable
504         (&myKernel, programmingSkill, hoursSleep);
505
506     // Estimate the density at programming skill 1, 2 hours sleep.
507     auto density = estimator(1, 2);
508
509     // Test instantiating from functor.
510     auto foo = KernelDensity.fromCallable(estimator, hoursSleep);
511 }
512
513
514 private:
515
516 double delegate(double[]...) wrapToArrayVariadic(C)(C callable) {
517     static if(is(C == delegate) || isFunctionPointer!C) {
518         alias callable fun;
519     } else {  // It's a functor.
520         alias callable.opCall fun;
521     }
522
523     alias ParameterTypeTuple!fun params;
524     static if(params.length == 1 && is(params[0] == double[])) {
525         // Already in the right form.
526         static if(is(C == delegate) && is(ReturnType!C == double)) {
527             return callable;
528         } else static if(is(ReturnType!(callable.opCall) == double)) {
529             return &callable.opCall;
530         } else {  // Need to forward.
531             double forward(double[] args...) {
532                 return fun(args);
533             }
534
535             return &forward;
536         }
537     } else {  // Need to convert to single arguments and forward.
538         static assert(allSatisfy!(isFloatingPoint, params));
539
540         double doCall(double[] args...) {
541             assert(args.length == params.length);
542             mixin("return fun(" ~ makeCallList(params.length) ~ ");");
543         }
544
545         return &doCall;
546     }
547 }
548
549 // CTFE function for forwarding elements of an array as single function
550 // arguments.
551 string makeCallList(uint N) {
552     string ret;
553     foreach(i; 0..N - 1) {
554         ret ~= "args[" ~ to!string(i) ~ "], ";
555     }
556
557     ret ~= "args[" ~ to!string(N - 1) ~ "]";
558     return ret;
559 }
Note: See TracBrowser for help on using the browser.