Histogram Equalization with Cell Broadband Engine™ Cell Applications & Solutions

advertisement
Cell Applications & Solutions
Histogram Equalization
with
Cell Broadband Engine™
IBM Confidential
6/21/2016
© 2006 IBM Research
Cell Applications and Solutions
Content
Overview: Histogram Equalization
Definitions
Assumptions, Highlights
Approach: Histogram Computation
Approach: Transform Image
Performance Results
2
IBM Confidential
6/21/2016
© 2006 IBM Research
Cell Applications and Solutions
Overview: Histogram Equalization
 One of the most significant parts of image processing
 Improves contrast by redistributing the image's intensity
distribution
 Compute a uniform histogram
Three stages:
1. Compute
2. Normalize
3. Transform
3
IBM Confidential
6/21/2016
© 2006 IBM Research
Cell Applications and Solutions
Definitions
First Stage: Computing the Histogram
 Parse the input image
 Count each distinct pixel value in the image
 E.g., for 8-bit pixels the maximum pixel value is 255, so the
histogram array has 256 entries.
Second Stage: Computing the normalized sum of histogram
 Store the running (cumulative) sum of the histogram values
 Normalize by multiplying each cumulative sum by (maximum pixel value / number of pixels).
Third Stage: Transforming input image into output image
 Use the normalized array as a lookup table for mapping each
input image pixel value to its new value from the second stage
4
IBM Confidential
6/21/2016
© 2006 IBM Research
Cell Applications and Solutions
Assumptions, Highlights
Assumptions for demo:
 8-bit color scale
Approach Highlights:
 Parallelize
 Reduce dependencies
 Loop unroll
 SIMDize the code using vectors and SPE intrinsics
5
IBM Confidential
6/21/2016
© 2006 IBM Research
Cell Applications and Solutions
Scalar Code Flow
//!-- Round to the closest integer by adding 0.5 and truncating.
//!-- Truncation is toward zero, so this is only "nearest" for v >= -0.5.
#define ROUND(v) ((int)((v) + 0.5))
//!-- Smaller / larger of two values. Each argument may be evaluated twice,
//!-- so do not pass expressions with side effects.
#define __min(a,b) ( ((a) < (b)) ? (a) : (b) )
#define __max(a,b) ( ((a) > (b)) ? (a) : (b) )
//!-- Clamp v to 0-255 and narrow the result to unsigned char.
#define BOUND(v) ((unsigned char)(__min(255, __max((v), 0))))
{
    /*
     * Scalar reference flow for histogram equalization of `size` 8-bit
     * pixels: fill a test image, count pixel values, build a normalized
     * cumulative-sum lookup table, then map every input pixel through it.
     */
    enum { NUM_BINS = 256 };          /* one histogram bin per 8-bit pixel value */
    int size = PIXEL_DATA_SIZE;
    unsigned char map[NUM_BINS];      /* lookup table, indexed by pixel value */
    unsigned char src[size];
    unsigned char dest[size];
    unsigned int counts[NUM_BINS];
    double sc;
    long v;
    int i;
    unsigned int sum = 0;

    /* Clear the histogram. The bound is the bin count, not the pixel count,
       so the 256-entry array is never written out of range. */
    for (i = 0; i < NUM_BINS; i++)
    {
        counts[i] = 0;
    }

    /* Fill the input image with random 8-bit pixel values. */
    for (i = 0; i < size; i++)
    {
        src[i] = random() & 0xFF;
    }

    /* Compute Histogram: count the occurrences of each pixel value. */
    for (i = 0; i < size; i++)
    {
        counts[src[i]]++;
    }

    /* Normalized sum of Histogram: scale the running sum of the bins and
       clamp it to 0-255 to form the lookup table.
       NOTE(review): sc divides by IMAGE_SIZE while the histogram covers
       `size` (PIXEL_DATA_SIZE) pixels; presumably these are equal -- confirm. */
    sc = PIXEL_MAX_VALUE / (double) IMAGE_SIZE;
    for (i = 0; i < NUM_BINS; i++)
    {
        sum += counts[i];
        v = ROUND(sc * sum);
        map[i] = BOUND(v);
    }

    /* Transform Histogram: map each input pixel through the lookup table. */
    for (i = 0; i < size; i++)
    {
        dest[i] = map[src[i]];
    }
}
6/21/2016
© 2006 IBM Research
Cell Applications and Solutions
Histogram Computation
Vector unsigned char - load 16 bytes at a time to use the 128 bit register boundary
Data
Array
Byte 0
Byte F
2B
1B
3B
1B
0
4B
1
2
3
4
5
6
7
For ex.
110000 10
These 6 bits determine which of the 64 array
elements the count goes to
64
64
00
01
10
11
Counter 0
vector unsigned int
00
01
Counter 1
vector unsigned int
Slots containing 32 bit counter value
7
IBM Confidential
64
64
10
11
00
Slot ’10’ – 3rd slot
Counter0[48]
These two bits decide
which slot to go into
01
10
11
Counter 2
vector unsigned int
00
01
10
11
Counter 3
vector unsigned int
64-element arrays of 128-bit vectors – each vector containing four 32-bit counters
Four such arrays are created to enable parallel computation and loop unrolling
6/21/2016
© 2006 IBM Research
Cell Applications and Solutions
Code sections for Histogram computation
unsigned int idx_0, idx_1, idx_2, idx_3;
int slot_0, slot_1, slot_2, slot_3;
vector unsigned char in;
vector unsigned char *vdata;
vector unsigned int *vcounts;
vector unsigned int in_0, in_1, in_2, in_3;
vector unsigned int cnts_0[64];
vector unsigned int cnts_1[64];
vector unsigned int cnts_2[64];
vector unsigned int cnts_3[64];
/* Roll the counters into the overall (external) count array.
*/
for (i=0; i<64; i+=4) {
vector unsigned int sum0, sum1, sum2, sum3;
vdata = (vector unsigned char *)(data);
for (i=15; i<size; i+=16)
{
in = *vdata++;
//!-- Loop Unroll 1:
//!-- Handle the first 16 bytes from the input string
in_0 = spu_and((vector unsigned int)(in), 0xFF);
in_1 = spu_and(spu_rlmask((vector unsigned int)(in), -8), 0xFF);
in_2 = spu_and(spu_rlmask((vector unsigned int)(in), -16), 0xFF);
in_3 = spu_rlmask((vector unsigned int)(in), -24);
idx_0
idx_1
idx_2
idx_3
= spu_extract(in_0,
= spu_extract(in_1,
= spu_extract(in_2,
= spu_extract(in_3,
slot_0 = (0 - idx_0)
slot_1 = (0 - idx_1)
slot_2 = (0 - idx_2)
slot_3 = (0 - idx_3)
sum0
sum1
sum2
sum3
= spu_add(cnts_0[i], cnts_1[i]);
= spu_add(cnts_0[i+1], cnts_1[i+1]);
= spu_add(cnts_0[i+2], cnts_1[i+2]);
= spu_add(cnts_0[i+3], cnts_1[i+3]);
sum0
sum1
sum2
sum3
= spu_add(sum0,
= spu_add(sum1,
= spu_add(sum2,
= spu_add(sum3,
cnts_2[i]);
cnts_2[i+1]);
cnts_2[i+2]);
cnts_2[i+3]);
vcounts[i] = spu_add(sum0, cnts_3[i]);
vcounts[i+1] = spu_add(sum1, cnts_3[i+1]);
vcounts[i+2] = spu_add(sum2, cnts_3[i+2]);
vcounts[i+3] = spu_add(sum3, cnts_3[i+3]);
}
0);
0);
0);
0);
<< 2;
<< 2;
<< 2;
<< 2;
This is repeated four times
idx_0
idx_1
idx_2
idx_3
>>= 2;
>>= 2;
>>= 2;
>>= 2;
cnts_0[idx_0]
cnts_1[idx_1]
cnts_2[idx_2]
cnts_3[idx_3]
= spu_add(cnts_0[idx_0],
= spu_add(cnts_1[idx_1],
= spu_add(cnts_2[idx_2],
= spu_add(cnts_3[idx_3],
spu_rlqwbyte(one,
spu_rlqwbyte(one,
spu_rlqwbyte(one,
spu_rlqwbyte(one,
slot_0));
slot_1));
slot_2));
slot_3));
The above code section rolls the four counter arrays
into one overall count array
//!– Repeat for 1, 2, 3,
//!– Loop Unroll 2:
--}
8
IBM Confidential
6/21/2016
© 2006 IBM Research
Cell Applications and Solutions
Normalized Sum
v = count[i]
v0
v0
v0
v0
+
v = count[i]
X
v1
v1
v1
+
1. Compute the sum for the 64 vector
entries
2. Multiply with the normalization
constant
3. Clamp it to the range 0-255
4. Store in a character map LUT
v = count[i]
X
X
v2
v2
+
v = count[i]
X
9
X
X
v3
IBM Confidential
float sc = PIXEL_MAX_VALUE/ (float) IMAGE_SIZE;
vector float vc = spu_splats((float)sc);
float scr = 0.5;
vector float vr = spu_splats((float) scr);
vector float vf1, vf2;
vector unsigned char splat0 = (vector unsigned char) {0,1,2,3, 0,1,2,3,
0,1,2,3, 0,1,2,3};
vector unsigned char splat1 = (vector unsigned char)
{128,128,128,128, 4,5,6,7, 4,5,6,7, 4,5,6,7};
vector unsigned char splat2 = (vector unsigned char){128,128,128,128,
128,128,128,128, 8,9,10,11, 8,9,10,11};
vector unsigned char splat3 = (vector unsigned char){12,13,14,15,
12,13,14,15, 12,13,14,15, 12,13,14,15};
vector unsigned int mask3 = (vector unsigned int){0,0,0,-1}
//!-- TODO: Convert it so the computation is pipelined.
TRACE("Print the final character map: \n");
for(i=0; i<size; i++)
{
v = counts[i];
sum = spu_shuffle(sum, sum, splat3);
v0 = spu_shuffle(v, v, splat0);
v1 = spu_shuffle(v, v, splat1);
v2 = spu_shuffle(v, v, splat2);
v3 = spu_and(v, mask3);
sum = spu_add(spu_add(spu_add(sum, v3), v2), spu_add(v1, v0));
//!-- Normalize, round it
vf2 = spu_convtf(sum, 0);
vf1 = spu_madd(vf2, vc, vr);
mapvi[i] = spu_convtu(vf1, 0);
for(j=0; j<4; j++)
{
var = spu_extract(mapvi[i], j);
map[k] = BOUND(var); //!-- TODO vectorize this
TRACE("%d ", map[k]);
k++;
}
6/21/2016
© 2006 IBM Research
Cell Applications and Solutions
Transform the image
0 - 15
16 - 31
32 - 47
48 - 63
64 - 79
80 - 95
96 - 111
112 - 127
224 - 239 240 - 255
Byte Shuffle using the MSB 5 bits
Select using index bit 2
Select using index bit 1
Select using index bit 0
0
10
IBM Confidential
1
2
3
4
5
6
7
6/21/2016
© 2006 IBM Research
Cell Applications and Solutions
Performance Results
Environment:
 Benchmark was written in C and compiled with the xlc compiler.
 IBM Systemsim and a Cell Blade were used to collect performance
numbers.
 Sample grayscale image (pieh2.pgm)
 Configuration:
 Cell blade is running at 3.2GHz.
 DMA operations are not counted in the calculation.
 Performance numbers are derived from the cycle counts collected on
a single SPE.
Performance numbers:
 Histogram computation and image mapping (stages 1, 2 and 3) combined run at
0.50 Gigapixels/second for 100K
11
IBM Confidential
6/21/2016
© 2006 IBM Research
Download