Cell Applications & Solutions Histogram Equalization with Cell Broadband Engine™ IBM Confidential 6/21/2016 © 2006 IBM Research Cell Applications and Solutions Content Overview: Histogram Equalization Definitions Assumptions, Highlights Approach: Histogram Computation Approach: Transform Image Performance Results 2 IBM Confidential 6/21/2016 © 2006 IBM Research Cell Applications and Solutions Overview: Histogram Equalization One of the most significant parts of image processing Improves contrast by redistributing the intensity distribution Compute a uniform histogram Three stages: 1. Compute 2. Normalize 3. Transform 3 IBM Confidential 6/21/2016 © 2006 IBM Research Cell Applications and Solutions Definitions First Stage: Computing the Histogram Parse the input image Count each distinct pixel value in the image Ex.: for 8-bit pixels, the maximum pixel value is 255 and the histogram array size is 256. Second Stage: Computing the normalized sum of the histogram Store the running (cumulative) sum of the histogram values; normalize by multiplying each element by (maximum pixel value / number of pixels). Third Stage: Transforming the input image into the output image Use the normalized array as a lookup table for mapping each input image pixel value to the new set of values from stage 2 4 IBM Confidential 6/21/2016 © 2006 IBM Research Cell Applications and Solutions Assumptions, Highlights Assumptions for demo: 8-bit color scale Approach Highlights: Parallelize Reduce dependencies Loop unroll SIMDize the code using vectors and SPE intrinsics 5 IBM Confidential 6/21/2016 © 2006 IBM Research Cell Applications and Solutions Scalar Code Flow #define ROUND(v) (int)((v) + 0.5) //!-- Round it to the closest integer #define __min(a,b) ( ((a) < (b)) ? (a) : (b) ) #define __max(a,b) ( ((a) > (b)) ? 
(a) : (b) ) #define BOUND(v) (unsigned char)(__min(255, __max((v), 0))) // 0-255 { int size = PIXEL_DATA_SIZE; unsigned char map[size]; unsigned char src[size]; unsigned char dest[size]; unsigned int counts[256]; double sc; long v; int i, index; unsigned int sum=0; for(i=0; i < size; i++) { counts[i] = 0; src[i] = random() & 0xFF; } 6 IBM Confidential for (i=0; i<size; i++) { Compute counts[src[i]]++; Histogram } sc = PIXEL_MAX_VALUE / (double) IMAGE_SIZE; for (i = 0; i < size; i++) { sum += counts[i]; Normalized v = ROUND(sc * sum); sum of Histogram map[i] = BOUND(v); } for (i = 0; i < size; i++) { dest[i] = map[src[i]]; Transform Histogram } } 6/21/2016 © 2006 IBM Research Cell Applications and Solutions Histogram Computation Vector unsigned char - load 16 bytes at a time to use the 128 bit register boundary Data Array Byte 0 Byte F 2B 1B 3B 1B 0 4B 1 2 3 4 5 6 7 For ex. 110000 10 These 6 bits determine which of the 64 element array index it should go to 64 64 00 01 10 11 Counter 0 vector unsigned int 00 01 Counter 1 vector unsigned int Slots containing 32 bit counter value 7 IBM Confidential 64 64 10 11 00 rd Slot ’10’ – 3 slot Counter0[48] These two bits decide which slot to go into 01 10 11 Counter 2 vector unsigned int 00 01 10 11 Counter 3 vector unsigned int 64 element vector(128 bits) arrays – each containing 4 32 bit counters 4 of them are created to enable parallel computation and loop unrolling 6/21/2016 © 2006 IBM Research Cell Applications and Solutions Code sections for Histogram computation unsigned int idx_0, idx_1, idx_2, idx_3; int slot_0, slot_1, slot_2, slot_3; vector unsigned char in; vector unsigned char *vdata; vector unsigned int *vcounts; vector unsigned int in_0, in_1, in_2, in_3; vector unsigned int cnts_0[64]; vector unsigned int cnts_1[64]; vector unsigned int cnts_2[64]; vector unsigned int cnts_3[64]; /* Roll the counters into the overall (external) count array. 
*/ for (i=0; i<64; i+=4) { vector unsigned int sum0, sum1, sum2, sum3; vdata = (vector unsigned char *)(data); for (i=15; i<size; i+=16) { in = *vdata++; //!-- Loop Unroll 1: //!-- Handle the first 16 bytes from the input string in_0 = spu_and((vector unsigned int)(in), 0xFF); in_1 = spu_and(spu_rlmask((vector unsigned int)(in), -8), 0xFF); in_2 = spu_and(spu_rlmask((vector unsigned int)(in), -16), 0xFF); in_3 = spu_rlmask((vector unsigned int)(in), -24); idx_0 idx_1 idx_2 idx_3 = spu_extract(in_0, = spu_extract(in_1, = spu_extract(in_2, = spu_extract(in_3, slot_0 = (0 - idx_0) slot_1 = (0 - idx_1) slot_2 = (0 - idx_2) slot_3 = (0 - idx_3) sum0 sum1 sum2 sum3 = spu_add(cnts_0[i], cnts_1[i]); = spu_add(cnts_0[i+1], cnts_1[i+1]); = spu_add(cnts_0[i+2], cnts_1[i+2]); = spu_add(cnts_0[i+3], cnts_1[i+3]); sum0 sum1 sum2 sum3 = spu_add(sum0, = spu_add(sum1, = spu_add(sum2, = spu_add(sum3, cnts_2[i]); cnts_2[i+1]); cnts_2[i+2]); cnts_2[i+3]); vcounts[i] = spu_add(sum0, cnts_3[i]); vcounts[i+1] = spu_add(sum1, cnts_3[i+1]); vcounts[i+2] = spu_add(sum2, cnts_3[i+2]); vcounts[i+3] = spu_add(sum3, cnts_3[i+3]); } 0); 0); 0); 0); << 2; << 2; << 2; << 2; This is repeated four times idx_0 idx_1 idx_2 idx_3 >>= 2; >>= 2; >>= 2; >>= 2; cnts_0[idx_0] cnts_1[idx_1] cnts_2[idx_2] cnts_3[idx_3] = spu_add(cnts_0[idx_0], = spu_add(cnts_1[idx_1], = spu_add(cnts_2[idx_2], = spu_add(cnts_3[idx_3], spu_rlqwbyte(one, spu_rlqwbyte(one, spu_rlqwbyte(one, spu_rlqwbyte(one, slot_0)); slot_1)); slot_2)); slot_3)); The above code section rolls the 4 counters into one counter //!– Repeat for 1, 2, 3, //!– Loop Unroll 2: --} 8 IBM Confidential 6/21/2016 © 2006 IBM Research Cell Applications and Solutions Normalized Sum v = count[i] v0 v0 v0 v0 + v = count[i] X v1 v1 v1 + 1. Compute the sum for the 64 vector entries 2. Multiply with the normalization constant 3. Clamp it to be 0-255 4. 
Store in a character map LUT v = count[i] X X v2 v2 + v = count[i] X 9 X X v3 IBM Confidential float sc = PIXEL_MAX_VALUE/ (float) IMAGE_SIZE; vector float vc = spu_splats((float)sc); float scr = 0.5; vector float vr = spu_splats((float) scr); vector float vf1, vf2; vector unsigned char splat0 = (vector unsigned char) {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; vector unsigned char splat1 = (vector unsigned char) {128,128,128,128, 4,5,6,7, 4,5,6,7, 4,5,6,7}; vector unsigned char splat2 = (vector unsigned char){128,128,128,128, 128,128,128,128, 8,9,10,11, 8,9,10,11}; vector unsigned char splat3 = (vector unsigned char){12,13,14,15, 12,13,14,15, 12,13,14,15, 12,13,14,15}; vector unsigned int mask3 = (vector unsigned int){0,0,0,-1} //!-- TODO: Convert it so the computation is pipelined. TRACE("Print the final character map: \n"); for(i=0; i<size; i++) { v = counts[i]; sum = spu_shuffle(sum, sum, splat3); v0 = spu_shuffle(v, v, splat0); v1 = spu_shuffle(v, v, splat1); v2 = spu_shuffle(v, v, splat2); v3 = spu_and(v, mask3); sum = spu_add(spu_add(spu_add(sum, v3), v2), spu_add(v1, v0)); //!-- Normalize, round it vf2 = spu_convtf(sum, 0); vf1 = spu_madd(vf2, vc, vr); mapvi[i] = spu_convtu(vf1, 0); for(j=0; j<4; j++) { var = spu_extract(mapvi[i], j); map[k] = BOUND(var); //!-- TODO vectorize this TRACE("%d ", map[k]); k++; } 6/21/2016 © 2006 IBM Research Cell Applications and Solutions Transform the image 0 - 15 16 - 31 32 - 47 48 - 63 64 - 79 80 - 95 96 - 111 112 - 127 224 - 239 240 - 255 Byte Shuffle using the MSB 5 bits Select using index bit 2 Select using index bit 1 Select using index bit 0 0 10 IBM Confidential 1 2 3 4 5 6 7 6/21/2016 © 2006 IBM Research Cell Applications and Solutions Performance Results Environment: The benchmark was written in C and compiled with the xlc compiler. IBM Systemsim and a Cell Blade were used to collect performance numbers. Sample grayscale image (pieh2.pgm) Configuration: The Cell Blade runs at 3.2 GHz. DMA operations are not counted in the calculation. 
Performance numbers are derived from the cycle counts collected on a single SPE. Performance numbers: Histogram computation & image mapping (stages 1, 2, 3) combined at 0.50 Gigapixels/second for 100K 11 IBM Confidential 6/21/2016 © 2006 IBM Research