root/trunk/tempAlloc/tempAlloc.d

Revision 543, 17.1 kB (checked in by dsimcha, 3 years ago)

Fix false ptr issues. Add stackCat.

Line 
1 /**A struct to allocate memory in a strictly first-in last-out order for
2  * things like scratch space.  Technically, memory can safely escape the
3  * scope in which it was allocated.  However, this is a very bad idea
4  * unless being done within the private API of a class, struct or nested
5  * function, where it can be guaranteed that LIFO will not be violated.
6  *
7  * Under the hood, this works by allocating large blocks (currently 4 MB)
8  * from the GC, and sub-allocating these as a stack.  Very large allocations
9  * (currently > 4MB) are simply performed on the heap.  There are two ways to
10  * free memory:  Calling TempAlloc.free() frees the last allocated block.
11  * Calling TempAlloc.frameFree() frees all memory allocated since the last
12  * call to TempAlloc.frameInit().
13  *
14  * All allocations are aligned on 16-byte boundaries using padding, since on x86,
15  * 16-byte alignment is necessary to make SSE2 work.  Note, however, that this
16  * is implemented based on the assumption that the GC allocates using 16-byte
17  * alignment (which appears to be true in druntime.)
18  *
19  * Author:  David Simcha
20  * License: Use and redistribution in both binary and source form, both
21  *          modified and unmodified, for both commercian and non-
22  *          commercial purposes is hereby permitted subject to:
23  *
24  *          1. The author(s) hereby disclaim all warranties, both express
25  *             and implied.
26  *          2. A statement acknowledging the contributions of the authors
27  *             of this module would be appreciated but is not required.
28  *
29  * Bugs:
30  *     No data stored in memory allocated by TempAlloc is scanned by the
31  *     GC.  This reflects its intended use as a scratch space to store
32  *     things like simple primitives.  Due to false pointer issues, etc.
33  *     making this memory be scanned by the GC would just plain be a bad
34  *     idea.  DO NOT store references to reference types in TempAlloc-
35  *     allocated memory unless you also store a reference in memory that is
36  *     scanned by the GC.
37  */
38
39 module tempalloc;
40
41 import core.memory, core.thread, std.traits, std.c.stdio : stderr;
42
43 // C functions, marked w/ nothrow.
44 extern(C) nothrow int fprintf(void*, in char *,...);
45 extern(C) nothrow void exit(int);
46
47 ///
48 struct TempAlloc {
49 private:
50     struct Stack(T) {  // Simple, fast stack w/o error checking.
51         private size_t capacity;
52         private size_t index;
53         private T* data;
54         private enum sz = T.sizeof;
55
56         private static size_t max(size_t lhs, size_t rhs) pure nothrow {
57             return (rhs > lhs) ? rhs : lhs;
58         }
59
60         void push(T elem) nothrow {
61             if (capacity == index) {
62                 capacity = max(16, capacity * 2);
63                 data = cast(T*) ntRealloc(data, capacity * sz, cast(GC.BlkAttr) 0);
64                 data[index..capacity] = T.init;  // Prevent false ptrs.
65             }
66             data[index++] = elem;
67         }
68
69         T pop() nothrow {
70             index--;
71             auto ret = data[index];
72             data[index] = T.init;  // Prevent false ptrs.
73             return ret;
74         }
75     }
76
77     struct Block {
78         size_t used = 0;
79         void* space = null;
80     }
81
82     final class State {
83         size_t used;
84         void* space;
85         size_t totalAllocs;
86         void*[] lastAlloc;
87         uint nblocks;
88         uint nfree;
89         size_t frameIndex;
90
91         // inUse holds info for all blocks except the one currently being
92         // allocated from.  freelist holds space ptrs for all free blocks.
93         Stack!(Block) inUse;
94         Stack!(void*) freelist;
95
96         ~this() {  // Blocks are pretty large.  Prevent false ptrs.
97             ntFree(lastAlloc.ptr);
98             while(nblocks > 1) {
99                 ntFree((inUse.pop()).space);
100                 nblocks--;
101             }
102             ntFree(space);
103             while(nfree > 0) {
104                 ntFree(freelist.pop);
105                 nfree--;
106             }
107         }
108     }
109
110     // core.thread.Thread.thread_needLock() is nothrow (read the code if you
111     // don't believe me) but not marked as such because nothrow is such a new
112     // feature in D.  This is a workaround until that gets fixed.
113     static enum tnl = cast(bool function() nothrow) &thread_needLock;
114
115     enum blockSize = 4U * 1024U * 1024U;
116     enum nBookKeep = 1_048_576;  // How many bytes to allocate upfront for bookkeeping.
117     enum alignBytes = 16U;
118     static __thread State state;
119     static State mainThreadState;
120
121     static void die() nothrow {
122         fprintf(stderr, "TempAlloc error: Out of memory.\0".ptr);
123         exit(1);
124     }
125
126     static void doubleSize(ref void*[] lastAlloc) nothrow {
127         size_t newSize = lastAlloc.length * 2;
128         void** ptr = cast(void**)
129         ntRealloc(lastAlloc.ptr, newSize * (void*).sizeof, GC.BlkAttr.NO_SCAN);
130
131         if (lastAlloc.ptr != ptr) {
132             ntFree(lastAlloc.ptr);
133         }
134
135         lastAlloc = ptr[0..newSize];
136     }
137
138     static void* ntMalloc(size_t size, GC.BlkAttr attr) nothrow {
139         try { return GC.malloc(size, attr); } catch { die(); }
140     }
141
142     static void* ntRealloc(void* ptr, size_t size, GC.BlkAttr attr) nothrow {
143         try { return GC.realloc(ptr, size, attr); } catch { die(); }
144     }
145
146     static void ntFree(void* ptr) nothrow {
147         try { GC.free(ptr); } catch {}
148     }
149
150     static size_t getAligned(size_t nbytes) pure nothrow {
151         size_t rem = nbytes % alignBytes;
152         return (rem == 0) ? nbytes : nbytes - rem + alignBytes;
153     }
154
155     static State stateInit() nothrow {
156         State stateCopy;
157         try { stateCopy = new State; } catch { die(); }
158
159         with(stateCopy) {
160             space = ntMalloc(blockSize, GC.BlkAttr.NO_SCAN);
161             lastAlloc = (cast(void**) ntMalloc(nBookKeep, GC.BlkAttr.NO_SCAN))
162                         [0..nBookKeep / (void*).sizeof];
163             nblocks++;
164         }
165
166         state = stateCopy;
167         if (!tnl())
168             mainThreadState = stateCopy;
169         return stateCopy;
170     }
171
172 public:
173     /**Allows caller to cache the state class on the stack and pass it in as a
174      * parameter.  This is ugly, but results in a speed boost that can be
175      * significant in some cases because it avoids a thread-local storage
176      * lookup.  Also used internally.*/
177     static State getState() nothrow {
178         // Believe it or not, even with builtin TLS, the thread_needLock()
179         // is worth it to avoid the TLS lookup.
180         State stateCopy = (tnl()) ? state : mainThreadState;
181         return (stateCopy is null) ? stateInit : stateCopy;
182     }
183
184     /**Initializes a frame, i.e. marks the current allocation position.
185      * Memory past the position at which this was last called will be
186      * freed when frameFree() is called.  Returns a reference to the
187      * State class in case the caller wants to cache it for speed.*/
188     static State frameInit() nothrow {
189         return frameInit(getState);
190     }
191
192     /**Same as frameInit() but uses stateCopy cached on stack by caller
193      * to avoid a thread-local storage lookup.  Strictly a speed hack.*/
194     static State frameInit(State stateCopy) nothrow {
195         with(stateCopy) {
196             if (totalAllocs == lastAlloc.length) // Should happen very infrequently.
197                 doubleSize(lastAlloc);
198             lastAlloc[totalAllocs] = cast(void*) frameIndex;
199             frameIndex = totalAllocs;
200             totalAllocs++;
201         }
202         return stateCopy;
203     }
204
205     /**Frees all memory allocated by TempAlloc since the last call to
206      * frameInit().*/
207     static void frameFree() nothrow {
208         frameFree(getState);
209     }
210
211     /**Same as frameFree() but uses stateCopy cached on stack by caller
212     * to avoid a thread-local storage lookup.  Strictly a speed hack.*/
213     static void frameFree(State stateCopy) nothrow {
214         with(stateCopy) {
215             while (totalAllocs > frameIndex + 1) {
216                 free(stateCopy);
217             }
218             frameIndex = cast(size_t) lastAlloc[--totalAllocs];
219         }
220     }
221
222     /**Purely a convenience overload, forwards arguments to TempAlloc.malloc().*/
223     static void* opCall(T...)(T args) nothrow {
224         return TempAlloc.malloc(args);
225     }
226
227     /**Allocates nbytes bytes on the TempAlloc stack.  NOT safe for real-time
228      * programming, since if there's not enough space on the current block,
229      * a new one will automatically be created.  Also, very large objects
230      * (currently over 4MB) will simply be heap-allocated.
231      *
232      * Bugs:  Memory allocated by TempAlloc is not scanned by the GC.
233      * This is necessary for performance and to avoid false pointer issues.
234      * Do not store the only reference to a GC-allocated object in
235      * TempAlloc-allocated memory.*/
236     static void* malloc(size_t nbytes) nothrow {
237         return malloc(nbytes, getState);
238     }
239
240     /**Same as malloc() but uses stateCopy cached on stack by caller
241     * to avoid a thread-local storage lookup.  Strictly a speed hack.*/
242     static void* malloc(size_t nbytes, State stateCopy) nothrow {
243         nbytes = getAligned(nbytes);
244         with(stateCopy) {
245             void* ret;
246             if (blockSize - used >= nbytes) {
247                 ret = space + used;
248                 used += nbytes;
249             } else if (nbytes > blockSize) {
250                 ret = ntMalloc(nbytes, GC.BlkAttr.NO_SCAN);
251             } else if (nfree > 0) {
252                 inUse.push(Block(used, space));
253                 space = freelist.pop;
254                 used = nbytes;
255                 nfree--;
256                 nblocks++;
257                 ret = space;
258             } else { // Allocate more space.
259                 inUse.push(Block(used, space));
260                 space = ntMalloc(blockSize, GC.BlkAttr.NO_SCAN);
261                 nblocks++;
262                 used = nbytes;
263                 ret = space;
264             }
265             if (totalAllocs == lastAlloc.length) {
266                 doubleSize(lastAlloc);
267             }
268             lastAlloc[totalAllocs++] = ret;
269             return ret;
270         }
271     }
272
273     /**Frees the last piece of memory allocated by TempAlloc.  Since
274      * all memory must be allocated and freed in strict LIFO order,
275      * there's no need to pass a pointer in.  All bookkeeping for figuring
276      * out what to free is done internally.*/
277     static void free() nothrow {
278         free(getState);
279     }
280
281     /**Same as free() but uses stateCopy cached on stack by caller
282     * to avoid a thread-local storage lookup.  Strictly a speed hack.*/
283     static void free(State stateCopy) nothrow {
284         with(stateCopy) {
285             void* lastPos = lastAlloc[--totalAllocs];
286
287             // Handle large blocks.
288             if (lastPos > space + blockSize || lastPos < space) {
289                 ntFree(lastPos);
290                 return;
291             }
292
293             used = (cast(size_t) lastPos) - (cast(size_t) space);
294             if (nblocks > 1 && used == 0) {
295                 freelist.push(space);
296                 Block newHead = inUse.pop;
297                 space = newHead.space;
298                 used = newHead.used;
299                 nblocks--;
300                 nfree++;
301
302                 if (nfree >= nblocks * 2) {
303                     foreach(i; 0..nfree / 2) {
304                         ntFree(freelist.pop);
305                         nfree--;
306                     }
307                 }
308             }
309         }
310     }
311 }
312
313 /**Allocates an array of type T and size size using TempAlloc.
314  * Note that appending to this array using the ~= operator,
315  * or enlarging it using the .length property, will result in
316  * undefined behavior.  This is because, if the array is located
317  * at the beginning of a TempAlloc block, the GC will think the
318  * capacity is as large as a TempAlloc block, and will overwrite
319  * adjacent TempAlloc-allocated data, instead of reallocating it.
320  *
321  * Bugs: Do not store the only reference to a GC-allocated reference object
322  * in an array allocated by newStack because this memory is not
323  * scanned by the GC.*/
324 T[] newStack(T)(size_t size) nothrow {
325     size_t bytes = size * T.sizeof;
326     T* ptr = cast(T*) TempAlloc.malloc(bytes);
327     return ptr[0..size];
328 }
329
330 /**Same as newStack(size_t) but uses stateCopy cached on stack by caller
331 * to avoid a thread-local storage lookup.  Strictly a speed hack.*/
332 T[] newStack(T)(size_t size, TempAlloc.State state) nothrow {
333     size_t bytes = size * T.sizeof;
334     T* ptr = cast(T*) TempAlloc.malloc(bytes, state);
335     return ptr[0..size];
336 }
337
338 /**Concatenate any number of arrays of the same type, placing results on
339  * the TempAlloc stack.*/
340 T[0] stackCat(T...)(T data) {
341     foreach(array; data) {
342         static assert(is(typeof(array) == typeof(data[0])));
343     }
344
345     size_t totalLen = 0;
346     foreach(array; data) {
347         totalLen += array.length;
348     }
349     auto ret = newStack!(Mutable!(typeof(T[0][0])))(totalLen);
350
351     size_t offset = 0;
352     foreach(array; data) {
353         ret[offset..offset + array.length] = array[0..$];
354         offset += array.length;
355     }
356     return cast(T[0]) ret;
357 }
358
359 /**Creates a duplicate of an array on the TempAlloc stack.*/
360 auto tempdup(T)(T[] data) nothrow {
361     alias Mutable!(T) U;
362     U[] ret = newStack!(U)(data.length);
363     ret[] = data[];
364     return ret;
365 }
366
367 /**Same as tempdup(T[]) but uses stateCopy cached on stack by caller
368  * to avoid a thread-local storage lookup.  Strictly a speed hack.*/
369 auto tempdup(T)(T[] data, TempAlloc.State state) nothrow {
370     alias Mutable!(T) U;
371     U[] ret = newStack!(U)(data.length, state);
372     ret[] = data;
373     return ret;
374 }
375
376 /**A string to mixin at the beginning of a scope, purely for
377  * convenience.  Initializes a TempAlloc frame using frameInit(),
378  * and inserts a scope statement to delete this frame at the end
379  * of the current scope.
380  *
381  * Slower than calling free() manually when only a few pieces
382  * of memory will be allocated in the current scope, due to the
383  * extra bookkeeping involved.  Can be faster, however, when
384  * large amounts of allocations, such as arrays of arrays,
385  * are allocated, due to caching of data stored in thread-local
386  * storage.*/
387 invariant char[] newFrame =
388     "TempAlloc.frameInit; scope(exit) TempAlloc.frameFree;";
389
390 unittest {
391     /* Not a particularly good unittest in that it depends on knowing the
392      * internals of TempAlloc, but it's the best I could come up w/.  This
393      * is really more of a stress test/sanity check than a normal unittest.*/
394
395      // First test to make sure a large number of allocations does what it's
396      // supposed to in terms of reallocing lastAlloc[], etc.
397      enum nIter =  TempAlloc.blockSize * 5 / TempAlloc.alignBytes;
398      foreach(i; 0..nIter) {
399          TempAlloc(TempAlloc.alignBytes);
400      }
401      assert(TempAlloc.getState.nblocks == 5);
402      assert(TempAlloc.getState.nfree == 0);
403      foreach(i; 0..nIter) {
404         TempAlloc.free;
405     }
406     assert(TempAlloc.getState.nblocks == 1);
407     assert(TempAlloc.getState.nfree == 2);
408
409     // Make sure logic for freeing excess blocks works.  If it doesn't this
410     // test will run out of memory.
411     enum allocSize = TempAlloc.blockSize / 2;
412     void*[] oldStates;
413     foreach(i; 0..50) {
414         foreach(j; 0..50) {
415             TempAlloc(allocSize);
416         }
417         foreach(j; 0..50) {
418             TempAlloc.free;
419         }
420         oldStates ~= cast(void*) TempAlloc.state;
421         oldStates ~= cast(void*) TempAlloc.mainThreadState;
422         TempAlloc.state = null;
423         TempAlloc.mainThreadState = null;
424     }
425     oldStates = null;
426
427     // Make sure data is stored properly.
428     foreach(i; 0..10) {
429         TempAlloc(allocSize);
430     }
431     foreach(i; 0..5) {
432         TempAlloc.free;
433     }
434     GC.collect;  // Make sure nothing that shouldn't is getting GC'd.
435     void* space = TempAlloc.mainThreadState.space;
436     size_t used = TempAlloc.mainThreadState.used;
437
438     TempAlloc.frameInit;
439     // This array of arrays should not be scanned by the GC because otherwise
440     // bugs caused th not having the GC scan certain internal things in
441     // TempAlloc that it should would not be exposed.
442     uint[][] arrays = (cast(uint[]*) GC.malloc((uint[]).sizeof * 10,
443                        GC.BlkAttr.NO_SCAN))[0..10];
444     foreach(i; 0..10) {
445         uint[] data = newStack!(uint)(250_000);
446         foreach(j, ref e; data) {
447             e = j * (i + 1);  // Arbitrary values that can be read back later.
448         }
449         arrays[i] = data;
450     }
451
452     // Make stuff get overwrriten if blocks are getting GC'd when they're not
453     // supposed to.
454     GC.minimize;  // Free up all excess pools.
455     uint[][] foo;
456     foreach(i; 0..40) {
457         foo ~= new uint[1_048_576];
458     }
459     foo = null;
460
461     for(size_t i = 9; i != size_t.max; i--) {
462         foreach(j, e; arrays[i]) {
463             assert(e == j * (i + 1));
464         }
465     }
466     TempAlloc.frameFree;
467     assert(space == TempAlloc.mainThreadState.space);
468     assert(used == TempAlloc.mainThreadState.used);
469     while(TempAlloc.state.nblocks > 1 || TempAlloc.state.used > 0) {
470         TempAlloc.free;
471     }
472     fprintf(stderr, "Passed TempAlloc test.\n\0".ptr);
473 }
Note: See TracBrowser for help on using the browser.