CS 105 Tour of Black Holes of Computing Machine-Level Programming: X86-64 Topics Registers Stack Function Calls Local Storage X86-64.ppt Interesting Features of x86-64 Pointers and long integers are 64 bits Arithmetic ops support 8, 16, 32, and 64-bit ints Set of general purpose registers is 16 Full compatibility with IA32 (32-bit x86) Conditional ops are implemented using conditional move instructions (when possible) – faster than branch Much of pgm state held in registers Up to 6 arguments passed via registers (not stack) Some procedures do not need to access the stack at all?? Floating-point ops implemented using register-oriented instructions rather than stack-based With move to x86-64, gcc updated to take advantage of many architecture changes –2– 105 Data Representations: IA32 + x86-64 Sizes of C Objects (in Bytes) C Data Type | Typical 32-bit | Intel IA32 | x86-64: unsigned | 4 | 4 | 4; int | 4 | 4 | 4; long int | 4 | 4 | 8; char | 1 | 1 | 1; short | 2 | 2 | 2; float | 4 | 4 | 4; double | 8 | 8 | 8; long double | 8 | 10/12 | 16; char * (or any other pointer) | 4 | 4 | 8 –3– x86-64: 64-bit long ints and 64-bit pointers 105 x86-64 Integer Registers - 16 %rax %eax %r8 %r8d %rbx %ebx %r9 %r9d %rcx %ecx %r10 %r10d %rdx %edx %r11 %r11d %rsi %esi %r12 %r12d %rdi %edi %r13 %r13d %rsp %esp %r14 %r14d %rbp %ebp %r15 %r15d Extend existing registers. Add 8 new ones, word (16 bit) and byte (8 bit) registers still available, first 2 bytes available (rax, rbx, rcx, rdx) 105 –4– Make %ebp/%rbp general purpose, note 32 bit names Instructions Long word l (4 Bytes) ↔ Quad word q (8 Bytes) New instructions: movl → movq addl → addq sall → salq etc. 
32-bit instructions that generate 32-bit results –5– Set higher order bits of destination register to 0 Example: addl 105 Swap in 32-bit Mode void swap(int *xp, int *yp) { int t0 = *xp; int t1 = *yp; *xp = t1; *yp = t0; } swap: [Setup] pushl %ebp movl %esp,%ebp pushl %ebx [Body] movl 12(%ebp),%ecx movl 8(%ebp),%edx movl (%ecx),%eax movl (%edx),%ebx movl %eax,(%edx) movl %ebx,(%ecx) [Finish] movl -4(%ebp),%ebx movl %ebp,%esp popl %ebp ret –6– 105 Swap in 64-bit Mode void swap(int *xp, int *yp) { int t0 = *xp; int t1 = *yp; *xp = t1; *yp = t0; } swap: movl (%rdi), %edx movl (%rsi), %eax movl %eax, (%rdi) movl %edx, (%rsi) retq Operands passed in registers (why useful?) First (xp) in %rdi, second (yp) in %rsi 64-bit pointers No stack operations required, actually no stack frame 32-bit data –7– Data held in registers %eax and %edx movl operation 105 x86-64 Integer Registers, 6 used for arguments –8– %rax Return value %r8 Argument #5 %rbx Callee saved %r9 Argument #6 %rcx Argument #4 %r10 Caller saved %rdx Argument #3 %r11 Used for linking %rsi Argument #2 %r12 C: Callee saved %rdi Argument #1 %r13 Callee saved %rsp Stack pointer %r14 Callee saved %rbp Callee saved %r15 Callee saved 105 Interesting Features of Stack Frame Allocate entire frame at once All stack accesses can be relative to %rsp Do so by decrementing stack pointer Can delay allocation, since safe to temporarily use red zone (128 bytes available below the stack pointer) Simple deallocation –9– Increment stack pointer No base/frame pointer needed 105 x86-64 Registers Arguments passed to functions via registers If more than 6 integral parameters, then pass rest on stack These registers can be used as caller-saved as well All references to stack frame via stack pointer Eliminates need to update %ebp/%rbp Other Registers R12-R15, Rbx, Rbp, callee saved 2 or 3 have special uses Rax holds return value for a function Rsp is the stack pointer – 10 – 105 Reading Condition Codes: x86-64 SetX Instructions: Set single byte based on 
combination of condition codes Does not alter remaining 7 bytes int gt (long x, long y) { return x > y; } long lgt (long x, long y) { return x > y; } Body (same for both) xorl %eax, %eax # eax = 0 cmpq %rsi, %rdi # Compare x and y setg %al # al = x > y (Will disappear – Blackboard?) – 11 – 105 Reading Condition Codes: x86-64 SetX Instructions: Set single byte based on combination of condition codes Does not alter remaining 7 bytes int gt (long x, long y) { return x > y; } long lgt (long x, long y) { return x > y; } Body (same for both) xorl %eax, %eax # eax = 0 cmpq %rsi, %rdi # Compare x and y setg %al # al = x > y Is %rax zero? Yes: 32-bit instructions set high order 32 bits to 0! – 12 – 105 Conditionals: x86-64 int absdiff( int x, int y) { int result; if (x > y) { result = x-y; } else { result = y-x; } return result; } – 13 – absdiff: # x in %edi, y in %esi movl %edi, %eax # eax = x movl %esi, %edx # edx = y subl %esi, %eax # eax = x-y subl %edi, %edx # edx = y-x cmpl %esi, %edi # x:y cmovle %edx, %eax # eax=edx if <= ret (Will disappear – Blackboard?) 
105 Conditionals: x86-64 int absdiff( int x, int y) { int result; if (x > y) { result = x-y; } else { result = y-x; } return result; } absdiff: # x in %edi, y in %esi movl %edi, %eax # eax = x movl %esi, %edx # edx = y subl %esi, %eax # eax = x-y subl %edi, %edx # edx = y-x cmpl %esi, %edi # x:y cmovle %edx, %eax # eax=edx if <= ret Conditional move instruction – 14 – cmovC src, dest Move value from src to dest if condition C holds More efficient than conditional branching (simple control flow) But overhead: both branches are evaluated 105 x86-64 Procedure Summary Heavy use of registers Parameter passing More temporaries since more registers Minimal use of stack Sometimes none Allocate/deallocate entire block Many tricky optimizations – 15 – What kind of stack frame to use Calling with jump Various allocation techniques 105 IA32 Example Addresses not drawn to scale FF Stack address range ~2^32 $esp 0xffffbcd0 p3 0x65586008 p1 0x55585008 p4 0x1904a110 p2 0x1904a008 &p2 0x18049760 beyond 0x08049744 big_array 0x18049780 huge_array 0x08049760 main() 0x080483c6 useless() 0x08049744 final malloc() 0x006be166 malloc() is dynamically linked; address determined at runtime – 16 – 80 Heap 08 00 Data Text 105 x86-64 Example Addresses not drawn to scale 00007F Stack address range ~2^47 $rsp 0x7ffffff8d1f8 p3 0x2aaabaadd010 p1 0x2aaaaaadc010 p4 0x000011501120 p2 0x000011501010 &p2 0x000010500a60 beyond 0x000000500a44 big_array 0x000010500a80 huge_array 0x000000500a50 main() 0x000000400510 useless() 0x000000400500 final malloc() 0x00386ae6a170 malloc() is dynamically linked; address determined at runtime – 17 – 000030 Heap Data Text 000000 105 x86-64 Summary Pointers and long integers are 64 bit 16 Registers, with 64-, 32-, 16-, and 8-bit uses Minimal use of stack Parameters passed in registers Allocate/deallocate entire block for a procedure Can use 128 bytes past (beyond) the stack pointer (red zone): can use stack without messing with stack 
pointer No Frame Pointer Many tricky optimizations – 18 – What kind of stack frame to use Calling with jump Various allocation techniques Compilers need to be smarter 105 x86-64 Locals in the Red Zone /* Swap, using local array */ void swap_a(long *xp, long *yp) { volatile long loc[2]; loc[0] = *xp; loc[1] = *yp; *xp = loc[1]; *yp = loc[0]; } swap_a: movq (%rdi), %rax movq %rax, -24(%rsp) movq (%rsi), %rax movq %rax, -16(%rsp) movq -16(%rsp), %rax movq %rax, (%rdi) movq -24(%rsp), %rax movq %rax, (%rsi) ret Avoiding Stack Pointer Change – 19 – Can hold all information within small window beyond stack pointer Stack layout: %rsp → rtn Ptr; −8: unused; −16: loc[1]; −24: loc[0] 105 x86-64 Stack Frame Example long sum = 0; /* Swap a[i] & a[i+1] */ void swap_ele_su (long a[], int i) { swap(&a[i], &a[i+1]); sum += a[i]; } Keeps values of a and i in callee save registers Must set up stack frame to save these registers – 20 – swap_ele_su: movq %rbx, -16(%rsp) movslq %esi,%rbx movq %r12, -8(%rsp) movq %rdi, %r12 leaq (%rdi,%rbx,8), %rdi subq $16, %rsp leaq 8(%rdi), %rsi call swap movq (%r12,%rbx,8), %rax addq %rax, sum(%rip) movq (%rsp), %rbx movq 8(%rsp), %r12 addq $16, %rsp ret Blackboard? 
105 Understanding x86-64 Stack Frame swap_ele_su: movq %rbx, -16(%rsp) # Save %rbx movslq %esi,%rbx # Extend & save i movq %r12, -8(%rsp) # Save %r12 movq %rdi, %r12 # Save a – array pointer leaq (%rdi,%rbx,8), %rdi # Create &a[i] subq $16, %rsp # Allocate stack frame leaq 8(%rdi), %rsi # &a[i+1] call swap # swap() movq (%r12,%rbx,8), %rax # a[i] addq %rax, sum(%rip) # sum += a[i] movq (%rsp), %rbx # Restore %rbx movq 8(%rsp), %r12 # Restore %r12 addq $16, %rsp # Deallocate stack frame ret – 21 – 105 Understanding x86-64 Stack Frame swap_ele_su: movq %rbx, -16(%rsp) # Save %rbx movslq %esi,%rbx # Extend & save i movq %r12, -8(%rsp) # Save %r12 movq %rdi, %r12 # Save a leaq (%rdi,%rbx,8), %rdi # &a[i] subq $16, %rsp # Allocate stack frame leaq 8(%rdi), %rsi # &a[i+1] call swap # swap() movq (%r12,%rbx,8), %rax # a[i] addq %rax, sum(%rip) # sum += a[i] movq (%rsp), %rbx # Restore %rbx movq 8(%rsp), %r12 # Restore %r12 addq $16, %rsp # Deallocate stack frame ret – 22 – Stack before frame allocation: %rsp → rtn addr; −8: %r12; −16: %rbx Stack after frame allocation: rtn addr; +8: %r12; %rsp → %rbx 105 Interesting Features of Stack Frame Allocate entire frame at once All stack accesses can be relative to %rsp Do so by decrementing stack pointer Can delay allocation, since safe to temporarily use red zone Simple deallocation – 23 – Increment stack pointer No base/frame pointer needed 105