| 1 |
/* |
|---|
| 2 |
* jidctflt.d |
|---|
| 3 |
* |
|---|
| 4 |
* Copyright (C) 1994-1998, Thomas G. Lane. |
|---|
| 5 |
* This file is part of the Independent JPEG Group's software. |
|---|
| 6 |
* |
|---|
| 7 |
* The authors make NO WARRANTY or representation, either express or implied, |
|---|
| 8 |
* with respect to this software, its quality, accuracy, merchantability, or |
|---|
| 9 |
* fitness for a particular purpose. This software is provided "AS IS", and you, |
|---|
| 10 |
* its user, assume the entire risk as to its quality and accuracy. |
|---|
| 11 |
* |
|---|
| 12 |
* This software is copyright (C) 1991-1998, Thomas G. Lane. |
|---|
| 13 |
* All Rights Reserved except as specified below. |
|---|
| 14 |
* |
|---|
| 15 |
* Permission is hereby granted to use, copy, modify, and distribute this |
|---|
| 16 |
* software (or portions thereof) for any purpose, without fee, subject to these |
|---|
| 17 |
* conditions: |
|---|
| 18 |
* (1) If any part of the source code for this software is distributed, then this |
|---|
| 19 |
* README file must be included, with this copyright and no-warranty notice |
|---|
| 20 |
* unaltered; and any additions, deletions, or changes to the original files |
|---|
| 21 |
* must be clearly indicated in accompanying documentation. |
|---|
| 22 |
* (2) If only executable code is distributed, then the accompanying |
|---|
| 23 |
* documentation must state that "this software is based in part on the work of |
|---|
| 24 |
* the Independent JPEG Group". |
|---|
| 25 |
* (3) Permission for use of this software is granted only if the user accepts |
|---|
| 26 |
* full responsibility for any undesirable consequences; the authors accept |
|---|
| 27 |
* NO LIABILITY for damages of any kind. |
|---|
| 28 |
* |
|---|
| 29 |
* These conditions apply to any software derived from or based on the IJG code, |
|---|
| 30 |
* not just to the unmodified library. If you use our work, you ought to |
|---|
| 31 |
* acknowledge us. |
|---|
| 32 |
* |
|---|
| 33 |
* Permission is NOT granted for the use of any IJG author's name or company name |
|---|
| 34 |
* in advertising or publicity relating to this software or products derived from |
|---|
| 35 |
* it. This software may be referred to only as "the Independent JPEG Group's |
|---|
| 36 |
* software". |
|---|
| 37 |
* |
|---|
| 38 |
* We specifically permit and encourage the use of this software as the basis of |
|---|
| 39 |
* commercial products, provided that all warranty or liability claims are |
|---|
| 40 |
* assumed by the product vendor. |
|---|
| 41 |
* |
|---|
| 42 |
* |
|---|
| 43 |
* This file contains a floating-point implementation of the |
|---|
| 44 |
* inverse DCT (Discrete Cosine Transform). In the IJG code, this routine |
|---|
| 45 |
* must also perform dequantization of the input coefficients. |
|---|
| 46 |
* |
|---|
| 47 |
* This implementation should be more accurate than either of the integer |
|---|
| 48 |
* IDCT implementations. However, it may not give the same results on all |
|---|
| 49 |
* machines because of differences in roundoff behavior. Speed will depend |
|---|
| 50 |
* on the hardware's floating point capacity. |
|---|
| 51 |
* |
|---|
| 52 |
* A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT |
|---|
| 53 |
* on each row (or vice versa, but it's more convenient to emit a row at |
|---|
| 54 |
* a time). Direct algorithms are also available, but they are much more |
|---|
| 55 |
* complex and seem not to be any faster when reduced to code. |
|---|
| 56 |
* |
|---|
| 57 |
* This implementation is based on Arai, Agui, and Nakajima's algorithm for |
|---|
| 58 |
* scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in |
|---|
| 59 |
* Japanese, but the algorithm is described in the Pennebaker & Mitchell |
|---|
| 60 |
* JPEG textbook (see REFERENCES section in file README). The following code |
|---|
| 61 |
* is based directly on figure 4-8 in P&M. |
|---|
| 62 |
* While an 8-point DCT cannot be done in less than 11 multiplies, it is |
|---|
| 63 |
* possible to arrange the computation so that many of the multiplies are |
|---|
| 64 |
* simple scalings of the final outputs. These multiplies can then be |
|---|
| 65 |
* folded into the multiplications or divisions by the JPEG quantization |
|---|
| 66 |
* table entries. The AA&N method leaves only 5 multiplies and 29 adds |
|---|
| 67 |
* to be done in the DCT itself. |
|---|
| 68 |
* The primary disadvantage of this method is that with a fixed-point |
|---|
| 69 |
* implementation, accuracy is lost due to imprecise representation of the |
|---|
| 70 |
* scaled quantization values. However, that problem does not arise if |
|---|
| 71 |
* we use floating point arithmetic. |
|---|
| 72 |
*/ |
|---|
| 73 |
|
|---|
| 74 |
/* |
|---|
| 75 |
* Ported to the D programming language by |
|---|
| 76 |
* Tomas Lindquist Olsen <tomas@famolsen.dk> |
|---|
| 77 |
*/ |
|---|
| 78 |
|
|---|
| 79 |
module tinyjpeg.jidctflt; |
|---|
| 80 |
|
|---|
| 81 |
version(Tango) |
|---|
| 82 |
import tango.stdc.stdint; |
|---|
| 83 |
else |
|---|
| 84 |
import std.stdint; |
|---|
| 85 |
|
|---|
| 86 |
import tinyjpeg.internal; |
|---|
| 87 |
|
|---|
| 88 |
alias float FAST_FLOAT; |
|---|
| 89 |
|
|---|
| 90 |
enum |
|---|
| 91 |
{ |
|---|
| 92 |
DCTSIZE = 8, |
|---|
| 93 |
DCTSIZE2 = (DCTSIZE*DCTSIZE) |
|---|
| 94 |
} |
|---|
| 95 |
|
|---|
| 96 |
FAST_FLOAT DEQUANTIZE(int16_t coef, FAST_FLOAT quantval) |
|---|
| 97 |
{ |
|---|
| 98 |
return (cast(FAST_FLOAT)coef)*quantval; |
|---|
| 99 |
} |
|---|
| 100 |
|
|---|
| 101 |
/* |
|---|
| 102 |
#if defined(__GNUC__) && (defined(__i686__) || defined(__x86_64__)) |
|---|
| 103 |
static inline unsigned char descale_and_clamp(int x, int shift) |
|---|
| 104 |
{ |
|---|
| 105 |
__asm__ ( |
|---|
| 106 |
"add %3,%1\n" |
|---|
| 107 |
"\tsar %2,%1\n" |
|---|
| 108 |
"\tsub $-128,%1\n" |
|---|
| 109 |
"\tcmovl %5,%1\n" // Use the sub to compare to 0 |
|---|
| 110 |
"\tcmpl %4,%1\n" |
|---|
| 111 |
"\tcmovg %4,%1\n" |
|---|
| 112 |
: "=r"(x) |
|---|
| 113 |
: "0"(x), "Ir"(shift), "ir"(1UL<<(shift-1)), "r" (0xff), "r" (0) |
|---|
| 114 |
); |
|---|
| 115 |
return x; |
|---|
| 116 |
} |
|---|
| 117 |
*/ |
|---|
| 118 |
|
|---|
| 119 |
|
|---|
| 120 |
private ubyte descale_and_clamp(int x, int shift) |
|---|
| 121 |
{ |
|---|
| 122 |
x += (1U<<(shift-1)); |
|---|
| 123 |
if (x<0) |
|---|
| 124 |
x = (x >>> shift) | ((~(0U)) << (32-(shift))); |
|---|
| 125 |
else |
|---|
| 126 |
x >>>= shift; |
|---|
| 127 |
x += 128; |
|---|
| 128 |
if (x>255) |
|---|
| 129 |
return 255; |
|---|
| 130 |
else if (x<0) |
|---|
| 131 |
return 0; |
|---|
| 132 |
else |
|---|
| 133 |
return x; |
|---|
| 134 |
} |
|---|
| 135 |
|
|---|
| 136 |
/* |
|---|
| 137 |
* Perform dequantization and inverse DCT on one block of coefficients. |
|---|
| 138 |
*/ |
|---|
| 139 |
|
|---|
| 140 |
void |
|---|
| 141 |
tinyjpeg_idct_float (component *compptr, uint8_t *output_buf, int stride) |
|---|
| 142 |
{ |
|---|
| 143 |
FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
|---|
| 144 |
FAST_FLOAT tmp10, tmp11, tmp12, tmp13; |
|---|
| 145 |
FAST_FLOAT z5, z10, z11, z12, z13; |
|---|
| 146 |
int16_t *inptr; |
|---|
| 147 |
FAST_FLOAT *quantptr; |
|---|
| 148 |
FAST_FLOAT *wsptr; |
|---|
| 149 |
uint8_t *outptr; |
|---|
| 150 |
int ctr; |
|---|
| 151 |
FAST_FLOAT workspace[DCTSIZE2]; /* buffers data between passes */ |
|---|
| 152 |
|
|---|
| 153 |
/* Pass 1: process columns from input, store into work array. */ |
|---|
| 154 |
|
|---|
| 155 |
inptr = compptr.DCT.ptr; |
|---|
| 156 |
quantptr = compptr.Q_table; |
|---|
| 157 |
wsptr = workspace.ptr; |
|---|
| 158 |
for (ctr = DCTSIZE; ctr > 0; ctr--) { |
|---|
| 159 |
/* Due to quantization, we will usually find that many of the input |
|---|
| 160 |
* coefficients are zero, especially the AC terms. We can exploit this |
|---|
| 161 |
* by short-circuiting the IDCT calculation for any column in which all |
|---|
| 162 |
* the AC terms are zero. In that case each output is equal to the |
|---|
| 163 |
* DC coefficient (with scale factor as needed). |
|---|
| 164 |
* With typical images and quantization tables, half or more of the |
|---|
| 165 |
* column DCT calculations can be simplified this way. |
|---|
| 166 |
*/ |
|---|
| 167 |
|
|---|
| 168 |
if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 && |
|---|
| 169 |
inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 && |
|---|
| 170 |
inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 && |
|---|
| 171 |
inptr[DCTSIZE*7] == 0) { |
|---|
| 172 |
/* AC terms all zero */ |
|---|
| 173 |
FAST_FLOAT dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); |
|---|
| 174 |
|
|---|
| 175 |
wsptr[DCTSIZE*0] = dcval; |
|---|
| 176 |
wsptr[DCTSIZE*1] = dcval; |
|---|
| 177 |
wsptr[DCTSIZE*2] = dcval; |
|---|
| 178 |
wsptr[DCTSIZE*3] = dcval; |
|---|
| 179 |
wsptr[DCTSIZE*4] = dcval; |
|---|
| 180 |
wsptr[DCTSIZE*5] = dcval; |
|---|
| 181 |
wsptr[DCTSIZE*6] = dcval; |
|---|
| 182 |
wsptr[DCTSIZE*7] = dcval; |
|---|
| 183 |
|
|---|
| 184 |
inptr++; /* advance pointers to next column */ |
|---|
| 185 |
quantptr++; |
|---|
| 186 |
wsptr++; |
|---|
| 187 |
continue; |
|---|
| 188 |
} |
|---|
| 189 |
|
|---|
| 190 |
/* Even part */ |
|---|
| 191 |
|
|---|
| 192 |
tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); |
|---|
| 193 |
tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); |
|---|
| 194 |
tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); |
|---|
| 195 |
tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); |
|---|
| 196 |
|
|---|
| 197 |
tmp10 = tmp0 + tmp2; /* phase 3 */ |
|---|
| 198 |
tmp11 = tmp0 - tmp2; |
|---|
| 199 |
|
|---|
| 200 |
tmp13 = tmp1 + tmp3; /* phases 5-3 */ |
|---|
| 201 |
tmp12 = (tmp1 - tmp3) * (cast(FAST_FLOAT) 1.414213562) - tmp13; /* 2*c4 */ |
|---|
| 202 |
|
|---|
| 203 |
tmp0 = tmp10 + tmp13; /* phase 2 */ |
|---|
| 204 |
tmp3 = tmp10 - tmp13; |
|---|
| 205 |
tmp1 = tmp11 + tmp12; |
|---|
| 206 |
tmp2 = tmp11 - tmp12; |
|---|
| 207 |
|
|---|
| 208 |
/* Odd part */ |
|---|
| 209 |
|
|---|
| 210 |
tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); |
|---|
| 211 |
tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); |
|---|
| 212 |
tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); |
|---|
| 213 |
tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); |
|---|
| 214 |
|
|---|
| 215 |
z13 = tmp6 + tmp5; /* phase 6 */ |
|---|
| 216 |
z10 = tmp6 - tmp5; |
|---|
| 217 |
z11 = tmp4 + tmp7; |
|---|
| 218 |
z12 = tmp4 - tmp7; |
|---|
| 219 |
|
|---|
| 220 |
tmp7 = z11 + z13; /* phase 5 */ |
|---|
| 221 |
tmp11 = (z11 - z13) * (cast(FAST_FLOAT) 1.414213562); /* 2*c4 */ |
|---|
| 222 |
|
|---|
| 223 |
z5 = (z10 + z12) * (cast(FAST_FLOAT) 1.847759065); /* 2*c2 */ |
|---|
| 224 |
tmp10 = (cast(FAST_FLOAT) 1.082392200) * z12 - z5; /* 2*(c2-c6) */ |
|---|
| 225 |
tmp12 = (cast(FAST_FLOAT) -2.613125930) * z10 + z5; /* -2*(c2+c6) */ |
|---|
| 226 |
|
|---|
| 227 |
tmp6 = tmp12 - tmp7; /* phase 2 */ |
|---|
| 228 |
tmp5 = tmp11 - tmp6; |
|---|
| 229 |
tmp4 = tmp10 + tmp5; |
|---|
| 230 |
|
|---|
| 231 |
wsptr[DCTSIZE*0] = tmp0 + tmp7; |
|---|
| 232 |
wsptr[DCTSIZE*7] = tmp0 - tmp7; |
|---|
| 233 |
wsptr[DCTSIZE*1] = tmp1 + tmp6; |
|---|
| 234 |
wsptr[DCTSIZE*6] = tmp1 - tmp6; |
|---|
| 235 |
wsptr[DCTSIZE*2] = tmp2 + tmp5; |
|---|
| 236 |
wsptr[DCTSIZE*5] = tmp2 - tmp5; |
|---|
| 237 |
wsptr[DCTSIZE*4] = tmp3 + tmp4; |
|---|
| 238 |
wsptr[DCTSIZE*3] = tmp3 - tmp4; |
|---|
| 239 |
|
|---|
| 240 |
inptr++; /* advance pointers to next column */ |
|---|
| 241 |
quantptr++; |
|---|
| 242 |
wsptr++; |
|---|
| 243 |
} |
|---|
| 244 |
|
|---|
| 245 |
/* Pass 2: process rows from work array, store into output array. */ |
|---|
| 246 |
/* Note that we must descale the results by a factor of 8 == 2**3. */ |
|---|
| 247 |
|
|---|
| 248 |
wsptr = workspace.ptr; |
|---|
| 249 |
outptr = output_buf; |
|---|
| 250 |
for (ctr = 0; ctr < DCTSIZE; ctr++) { |
|---|
| 251 |
/* Rows of zeroes can be exploited in the same way as we did with columns. |
|---|
| 252 |
* However, the column calculation has created many nonzero AC terms, so |
|---|
| 253 |
* the simplification applies less often (typically 5% to 10% of the time). |
|---|
| 254 |
* And testing floats for zero is relatively expensive, so we don't bother. |
|---|
| 255 |
*/ |
|---|
| 256 |
|
|---|
| 257 |
/* Even part */ |
|---|
| 258 |
|
|---|
| 259 |
tmp10 = wsptr[0] + wsptr[4]; |
|---|
| 260 |
tmp11 = wsptr[0] - wsptr[4]; |
|---|
| 261 |
|
|---|
| 262 |
tmp13 = wsptr[2] + wsptr[6]; |
|---|
| 263 |
tmp12 = (wsptr[2] - wsptr[6]) * (cast(FAST_FLOAT) 1.414213562) - tmp13; |
|---|
| 264 |
|
|---|
| 265 |
tmp0 = tmp10 + tmp13; |
|---|
| 266 |
tmp3 = tmp10 - tmp13; |
|---|
| 267 |
tmp1 = tmp11 + tmp12; |
|---|
| 268 |
tmp2 = tmp11 - tmp12; |
|---|
| 269 |
|
|---|
| 270 |
/* Odd part */ |
|---|
| 271 |
|
|---|
| 272 |
z13 = wsptr[5] + wsptr[3]; |
|---|
| 273 |
z10 = wsptr[5] - wsptr[3]; |
|---|
| 274 |
z11 = wsptr[1] + wsptr[7]; |
|---|
| 275 |
z12 = wsptr[1] - wsptr[7]; |
|---|
| 276 |
|
|---|
| 277 |
tmp7 = z11 + z13; |
|---|
| 278 |
tmp11 = (z11 - z13) * (cast(FAST_FLOAT) 1.414213562); |
|---|
| 279 |
|
|---|
| 280 |
z5 = (z10 + z12) * (cast(FAST_FLOAT) 1.847759065); /* 2*c2 */ |
|---|
| 281 |
tmp10 = (cast(FAST_FLOAT) 1.082392200) * z12 - z5; /* 2*(c2-c6) */ |
|---|
| 282 |
tmp12 = (cast(FAST_FLOAT) -2.613125930) * z10 + z5; /* -2*(c2+c6) */ |
|---|
| 283 |
|
|---|
| 284 |
tmp6 = tmp12 - tmp7; |
|---|
| 285 |
tmp5 = tmp11 - tmp6; |
|---|
| 286 |
tmp4 = tmp10 + tmp5; |
|---|
| 287 |
|
|---|
| 288 |
/* Final output stage: scale down by a factor of 8 and range-limit */ |
|---|
| 289 |
|
|---|
| 290 |
outptr[0] = descale_and_clamp(cast(int)(tmp0 + tmp7), 3); |
|---|
| 291 |
outptr[7] = descale_and_clamp(cast(int)(tmp0 - tmp7), 3); |
|---|
| 292 |
outptr[1] = descale_and_clamp(cast(int)(tmp1 + tmp6), 3); |
|---|
| 293 |
outptr[6] = descale_and_clamp(cast(int)(tmp1 - tmp6), 3); |
|---|
| 294 |
outptr[2] = descale_and_clamp(cast(int)(tmp2 + tmp5), 3); |
|---|
| 295 |
outptr[5] = descale_and_clamp(cast(int)(tmp2 - tmp5), 3); |
|---|
| 296 |
outptr[4] = descale_and_clamp(cast(int)(tmp3 + tmp4), 3); |
|---|
| 297 |
outptr[3] = descale_and_clamp(cast(int)(tmp3 - tmp4), 3); |
|---|
| 298 |
|
|---|
| 299 |
|
|---|
| 300 |
wsptr += DCTSIZE; /* advance pointer to next row */ |
|---|
| 301 |
outptr += stride; |
|---|
| 302 |
} |
|---|
| 303 |
} |
|---|
| 304 |
|
|---|
| 305 |
alias tinyjpeg_idct_float IDCT; |
|---|