Optimizing C63 for x86 – Group 9

Outline
- Bird's-eye view: gprof of the reference encoder
- Optimizing SAD
- Results

gprof: reference, Foreman

Each sample counts as 0.01 seconds.
  %   cumulative     self                 self     total
 time    seconds   seconds       calls   us/call  us/call  name
93.05      37.36     37.36   500214528      0.07     0.07  sad_block_8x8
 3.04      38.58      1.22      705672      1.73    54.67  me_block_8x8
 1.77      39.29      0.71      712800      1.00     1.00  dequant_idct_block_8x8
 1.25      39.79      0.50      712800      0.70     0.70  dct_quant_block_8x8
 0.35      39.93      0.14      713100      0.20     0.29  flush_bits
 0.17      40.00      0.07    16398942      0.00     0.00  put_bits

granularity: each sample hit covers 2 byte(s) for 0.02% of 40.16 seconds

Optimizing SAD

SAD SSE2 (PSADBW)

#include <stdint.h>
#include <emmintrin.h>  /* SSE2 intrinsics */

void sad_block_8x8(uint8_t *block1, uint8_t *block2, int stride, int *result)
{
  int v;
  __m128i r = _mm_setzero_si128();

  /* Process two rows per iteration: pack rows v and v+1 of each block
     into one 128-bit register. */
  for (v = 0; v < 8; v += 2)
  {
    const __m128i b1 = _mm_set_epi64(*(__m64 *) &block1[(v+0)*stride],
                                     *(__m64 *) &block1[(v+1)*stride]);
    const __m128i b2 = _mm_set_epi64(*(__m64 *) &block2[(v+0)*stride],
                                     *(__m64 *) &block2[(v+1)*stride]);

    /* PSADBW: one partial SAD per 64-bit half, landing in word lanes 0 and 4. */
    r = _mm_add_epi16(r, _mm_sad_epu8(b2, b1));
  }

  /* Combine the two partial sums. */
  *result = _mm_extract_epi16(r, 0) + _mm_extract_epi16(r, 4);
}

How well does this perform?
- gprof: the improved SSE2 SAD uses 3.69 s vs. 38.50 s*
- Cachegrind: lots of branch-prediction misses in me_block_8x8

*) Foreman sequence on gpu-7

SAD SSE4.1 (MPSADBW + PHMINPOSUW)

#include <stdint.h>
#include <smmintrin.h>  /* SSE4.1 intrinsics */

void sad_block_8x8x8(uint8_t *block1, uint8_t *block2, int stride, int *best, int *result)
{
  int v;
  __m128i r = _mm_setzero_si128();

  /* PHMINPOSUW stores the minimum word in bits 0-15 and its index in bits 16-18. */
  union
  {
    __m128i v;
    struct
    {
      uint16_t sad;
      unsigned int index : 3;
    } minpos;
  } mp;

  for (v = 0; v < 8; v += 2)
  {
    /* Two source rows packed into one register; two full 16-byte reference rows. */
    const __m128i b1 = _mm_set_epi64(*(__m64 *) &block1[(v+1)*stride],
                                     *(__m64 *) &block1[(v+0)*stride]);
    const __m128i b2 = _mm_loadu_si128((__m128i *) &block2[(v+0)*stride]);
    const __m128i b3 = _mm_loadu_si128((__m128i *) &block2[(v+1)*stride]);

    /* MPSADBW: SAD of a 4-byte source group against eight consecutive reference
       offsets. The four calls cover both 4-byte halves of both rows, so word
       lane x of r accumulates the 8x8 SAD for horizontal offset x. */
    r = _mm_add_epi16(r, _mm_mpsadbw_epu8(b2, b1, 0b000));
    r = _mm_add_epi16(r, _mm_mpsadbw_epu8(b2, b1, 0b101));
    r = _mm_add_epi16(r, _mm_mpsadbw_epu8(b3, b1, 0b010));
    r = _mm_add_epi16(r, _mm_mpsadbw_epu8(b3, b1, 0b111));
  }

  /* PHMINPOSUW: pick the smallest of the eight SADs and its offset. */
  mp.v = _mm_minpos_epu16(r);
  *result = mp.minpos.sad;
  *best = mp.minpos.index;
}

How well does this perform?
- gprof: the improved SSE4.1 SAD uses 0.90 s vs. 3.69 s
- Cachegrind: branch-prediction misses reduced by a factor of 8
- Intel's IACA tool: the CPU pipeline appears to be filled!
- It appears that both the source- and reference-block loads compete with (V)MPSADBW for the CPU's execution ports
- Assembly:
  - Better utilize AVX's non-destructive instructions (fewer register copies)
  - Better reuse loaded data for SAD computations that share the same source block

SAD 8x8x8x8: fewer source loads, fewer branches

Load the source block once:

.macro .load_src
    vmovq   (%rdi),          %xmm0          # src[0]
    vpinsrq $1, (%rdi,%rdx), %xmm0, %xmm0   # src[1]
    vmovq   (%rdi,%rdx,2),   %xmm1          # src[2]
    vmovhps (%rdi,%r8),      %xmm1, %xmm1   # src[3]
    vmovq   (%rdi,%rdx,4),   %xmm2          # src[4]
    vmovhps (%rdi,%r9),      %xmm2, %xmm2   # src[5]
    vmovq   (%rdi,%r8,2),    %xmm3          # src[6]
    vmovhps (%rdi,%rax),     %xmm3, %xmm3   # src[7]

Do SAD for 8x8 × 8x8 blocks (relative y = 0…8, x = 0…8):

.macro .8x8x8x8
    vmovdqu (%rsi),      %xmm12             # ref[0]
    .8x8x1  0, 0, 12, 4 0
    vmovdqu (%rsi,%rdx), %xmm13             # ref[1]
    .8x8x1  0, 1, 13, 4 0
    .8x8x1  1, 0, 13, 5 0

How well does this perform?
- gprof: the improved SAD 8x8x8x8 uses 0.53 s vs. 0.90 s
- Valgrind: even less branching
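Before moving on to the full assembly kernel, it may help to spell out what these widened kernels compute: for a fixed 8x8 source block, sad_block_8x8x8 evaluates the SAD at eight consecutive horizontal reference offsets and returns the smallest SAD together with its offset. The following scalar sketch is purely illustrative (it mirrors the signature of the intrinsics version above, but it is not the group's code):

#include <limits.h>
#include <stdint.h>
#include <stdlib.h>

/* Scalar equivalent of the 8-candidate SAD kernel (illustrative sketch,
 * not the optimized code shown above): for each horizontal offset
 * x = 0..7 into the reference frame, compute the 8x8 SAD against the
 * source block and report the minimum together with its offset. */
static void sad_block_8x8x8_scalar(const uint8_t *block1, const uint8_t *block2,
                                   int stride, int *best, int *result)
{
  int min_sad = INT_MAX;
  int min_x = 0;

  for (int x = 0; x < 8; x++)          /* candidate horizontal offsets */
  {
    int sad = 0;
    for (int v = 0; v < 8; v++)        /* rows */
      for (int u = 0; u < 8; u++)      /* columns */
        sad += abs(block2[v*stride + x + u] - block1[v*stride + u]);

    if (sad < min_sad)
    {
      min_sad = sad;
      min_x = x;
    }
  }

  *result = min_sad;
  *best = min_x;
}

The SSE4.1 version collapses the two inner loops and the eight candidate offsets into four MPSADBW instructions per row pair, and replaces the branchy minimum search with a single PHMINPOSUW.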
SAD 4x8x8x8x8: branchless UV-plane ME

sad_4x8x8x8x8:
    .load_src                      # Load source block from %rdi
    mov     %rsi, %rdi
    .8x8x8x8                       # Reference block x, y
    lea     8(%rdi), %rsi          # Reference block x+8, y
    .8x8x8x8
    lea     (%rdi,%rdx,8), %rsi    # Reference block x, y+8
    .8x8x8x8
    lea     8(%rdi,%rdx,8), %rsi   # Reference block x+8, y+8
    .8x8x8x8
    …
    vphminposuw …
    …
    ret

How well does this perform?
- gprof: the improved 4x8x8x8x8 SAD uses 0.47 s vs. 0.53 s
- Valgrind: even less branching!
- Total runtime for the Foreman sequence reduced from ~40.1 s to ~1.6 s (a factor of 25)

SAD assembly code: iterative improvement

Instruction   Function variant   Cycle mean, adjusted for a single 8x8 block
MPSADBW       SAD 8x8x8          7.5
VMPSADBW      SAD 2x8x8x8        3.6
              SAD 4x8x8x8        4.3
              SAD 8x8x8x8 v1     2.9
              SAD 8x8x8x8 v2     2.8
              SAD 4x8x8x8x8      2.7
              SAD 16x8x8x8x8     2.6?  (future research: even fewer branches?)

Image quality: comparable

Sequence             PSNR mean   PSNR 95%   SSIM mean   SSIM 95%
Tractor, reference   33.13045    34.46549   0.96144     0.97672
Tractor              33.12339    34.46587   0.96146     0.97672
Foreman, reference   33.46215    35.36280   0.95153     0.96476
Foreman              33.44806    35.36217   0.95139     0.96436

gprof: reference, Tractor, 50 frames (-O3) – NOT ON NDLAB!

Each sample counts as 0.01 seconds.
  %   cumulative     self                  self    total
 time    seconds   seconds        calls   s/call   s/call  name
92.90     124.66    124.66   1809584896     0.00     0.00  sad_block_8x8
 3.06     128.76      4.10           49     0.08     2.63  c63_motion_estimate
 1.58     130.89      2.13      2448000     0.00     0.00  dct_quant_block_8x8
 1.58     133.01      2.13      2448000     0.00     0.00  dequant_idct_block_8x8
 0.26     133.36      0.35      2448000     0.00     0.00  write_block
 0.18     133.60      0.24          150     0.00     0.02  dequantize_idct
 0.16     133.82      0.22           49     0.00     0.00  c63_motion_compensate
 0.13     133.99      0.17          150     0.00     0.02  dct_quantize
 0.11     134.14      0.15     55939437     0.00     0.00  put_bits

granularity: each sample hit covers 2 byte(s) for 0.01% of 134.21 seconds

gprof: improved, Tractor, 50 frames (-O3) – NOT ON NDLAB!

Each sample counts as 0.01 seconds.
  %   cumulative     self                 self     total
 time    seconds   seconds       calls  ms/call  ms/call  name
31.15       1.83      1.83     6869016     0.00     0.00  sad_4x8x8x8x8
24.75       3.28      1.45     2448000     0.00     0.00  dct_quant_block_8x8
23.81       4.67      1.40     2448000     0.00     0.00  dequant_idct_block_8x8
 7.17       5.09      0.42          50     8.40    12.20  write_frame
 4.44       5.35      0.26         150     1.73    11.34  dequantize_idct
 3.24       5.54      0.19    55937692     0.00     0.00  put_bits
 1.71       5.64      0.10         150     0.67    10.64  dct_quantize
 1.54       5.73      0.09     9792000     0.00     0.00  transpose_block_avx
 0.85       5.78      0.05      798700     0.00     0.00  sad_8x8x8x8
 0.68       5.82      0.04          49     0.82    39.09  c63_motion_estimate
 0.51       5.85      0.03          49     0.61     0.61  c63_motion_compensate

granularity: each sample hit covers 2 byte(s) for 0.17% of 5.86 seconds

134.2 s / 5.86 s ≈ 24× speedup
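The slides do not show how the per-block cycle means in the "iterative improvement" table were collected; a TSC-based harness along the following lines could produce such numbers. This is only a sketch under stated assumptions: the function-pointer type and helper name are hypothetical, one kernel call is assumed to evaluate a known number of candidate 8x8 blocks, and frequency scaling and serialization are ignored for brevity.

#include <stdint.h>
#include <x86intrin.h>   /* __rdtsc() */

/* Hypothetical pointer type matching the 8-candidate SAD kernels shown earlier. */
typedef void (*sad8x8x8_fn)(uint8_t *block1, uint8_t *block2, int stride,
                            int *best, int *result);

/* Rough TSC cycles per evaluated 8x8 block for one kernel variant:
 * call the kernel repeatedly and divide the elapsed tick count by the
 * total number of candidate blocks it evaluated. */
static double cycles_per_block(sad8x8x8_fn sad, uint8_t *src, uint8_t *ref,
                               int stride, long iterations, int blocks_per_call)
{
  int best = 0, result = 0;
  uint64_t start = __rdtsc();

  for (long i = 0; i < iterations; i++)
    sad(src, ref, stride, &best, &result);

  uint64_t end = __rdtsc();
  return (double)(end - start) / ((double)iterations * (double)blocks_per_call);
}

For example, cycles_per_block(sad_block_8x8x8, src, ref, stride, 1 << 20, 8) would report a per-block mean for the SSE4.1 kernel, since each call evaluates eight candidate positions; wider variants would use correspondingly larger blocks_per_call values.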