Chapter 1: Introduction to Compiling CSE4100 Prof. Steven A. Demurjian Computer Science & Engineering Department The University of Connecticut 371 Fairfield Way, Unit 2155 Storrs, CT 06269-3155 steve@engr.uconn.edu http://www.engr.uconn.edu/~steve (860) 486 - 4818 Material for course thanks to: Laurent Michel Aggelos Kiayias Robert LeBarre CH1.1 Introduction to Compilers CSE4100 As a Discipline, Involves Multiple CSE Areas Programming Languages and Algorithms Software Engineering & Theory / Foundations Computer Architecture & Operating Systems But, Has Surprisingly Simplistic Intent: Source program Compiler Target Program Error messages Diverse & Varied CH1.2 What is the Compilation Process? Consider the main.c C Program: CSE4100 #include <stdio.h> #include <ctype.h> #include <string.h> main() { char day[10]; int m, d, y; m = 9; d = 7; y = 2011; strcpy(day, "Wednesday"); printf("Today is: "); printf("%s %d/%d/%d\n", day, m, d, y); } CH1.3 Viewing C from Compiler Theory CSE4100 Tokens or Patterns for the “Words” of the Programming Language Specified in Flex Grammar Rules that Describe the Allowable Sentences of the Programming Language Specified in Bison Other Stages of Compilation Semantic and Type Checking Analysis Intermediate Code Generation Code Optimization Final (Relocatable) Code Generation CH1.4 Viewing C from Usage Side CSE4100 ANSI C Specification of Lexical Structure in Flex ANSI C Specification of Grammar Rules in Bison Multi-Stage Compilation Process from Source Code to Preprocessed Code to Assembly Code to Object Code to Executable Preprocess: gcc -E main.c > main.e Assembly: gcc -S main.c Generates main.s Object: gcc -c main.c Generates main.o Executable: gcc main.c Generates a.out CH1.5 What are Tokens?
Smallest Individual Units that are Recognized CSE4100 #include <stdio.h> #include <ctype.h> #include <string.h> main() { char day[10]; int m, d, y; m = 9; d = 7; y = 2011; strcpy(day, "Wednesday"); printf("Today is: "); printf("%s %d/%d/%d\n", day, m, d, y); } CH1.6 C Lexical Specification in Flex CSE4100 http://www.lysator.liu.se/c/ANSI-C-grammar-l.html D [0-9] L [a-zA-Z_] H [a-fA-F0-9] E [Ee][+-]?{D}+ FS (f|F|l|L) IS (u|U|l|L)* %{ #include <stdio.h> #include "y.tab.h" void count(); %} %% "/*" { comment(); } CH1.7 C Flex Continued … CSE4100 "auto" "break" "case" "char" "const" "continue" "default" "do" "double" "else" "enum" "extern" "float" "for" "goto" "if" "int" { { { { { { { { { { { { { { { { { count(); count(); count(); count(); count(); count(); count(); count(); count(); count(); count(); count(); count(); count(); count(); count(); count(); return(AUTO); } return(BREAK); } return(CASE); } return(CHAR); } return(CONST); } return(CONTINUE); } return(DEFAULT); } return(DO); } return(DOUBLE); } return(ELSE); } return(ENUM); } return(EXTERN); } return(FLOAT); } return(FOR); } return(GOTO); } return(IF); } return(INT); } CH1.8 C Flex Continued … CSE4100 "long" "register" "return" "short" "signed" "sizeof" "static" "struct" "switch" "typedef" "union" "unsigned" "void" "volatile" "while" { { { { { { { { { { { { { { { count(); count(); count(); count(); count(); count(); count(); count(); count(); count(); count(); count(); count(); count(); count(); return(LONG); } return(REGISTER); } return(RETURN); } return(SHORT); } return(SIGNED); } return(SIZEOF); } return(STATIC); } return(STRUCT); } return(SWITCH); } return(TYPEDEF); } return(UNION); } return(UNSIGNED); } return(VOID); } return(VOLATILE); } return(WHILE); } What TOKEN is Missing? CH1.9 C Flex Continued … CSE4100 {L}({L}|{D})* { count(); return(check_type()); } 0[xX]{H}+{IS}? { count(); return(CONSTANT); } 0{D}+{IS}? { count(); return(CONSTANT); } {D}+{IS}? 
{ count(); return(CONSTANT); } L?'(\\.|[^\\'])+' { count(); return(CONSTANT); } {D}+{E}{FS}? { count(); return(CONSTANT); } {D}*"."{D}+({E})?{FS}? { count(); return(CONSTANT); } {D}+"."{D}*({E})?{FS}? { count(); return(CONSTANT); } L?\"(\\.|[^\\"])*\" { count(); return(STRING_LITERAL); } What Do These Represent? CH1.10 C Flex Continued … "..." ">>=" CSE4100 "<<=" "+=" "-=" "*=" "/=" "%=" "&=" "^=" "|=" ">>" "<<" "++" "--" "->" { { { { { { { { { { { { { { { { count(); count(); count(); count(); count(); count(); count(); count(); count(); count(); count(); count(); count(); count(); count(); count(); return(ELLIPSIS); } return(RIGHT_ASSIGN); } return(LEFT_ASSIGN); } return(ADD_ASSIGN); } return(SUB_ASSIGN); } return(MUL_ASSIGN); } return(DIV_ASSIGN); } return(MOD_ASSIGN); } return(AND_ASSIGN); } return(XOR_ASSIGN); } return(OR_ASSIGN); } return(RIGHT_OP); } return(LEFT_OP); } return(INC_OP); } return(DEC_OP); } return(PTR_OP); } CH1.11 C Flex Continued … MISSING LOTS OF FLEX CSE4100 "<" ">" "^" "|" "?" [ \t\v\n\f] . 
%% yywrap() { return(1); } { { { { { { { count(); return('<'); } count(); return('>'); } count(); return('^'); } count(); return('|'); } count(); return('?'); } count(); } /* ignore bad characters */ } MISSING OTHER CODE CH1.12 C Bison Specification http://www.lysator.liu.se/c/ANSI-C-grammar-y.html %token IDENTIFIER CONSTANT STRING_LITERAL SIZEOF CSE4100 %token PTR_OP INC_OP DEC_OP LEFT_OP RIGHT_OP LE_OP %token GE_OP EQ_OP NE_OP %token AND_OP OR_OP MUL_ASSIGN DIV_ASSIGN MOD_ASSIGN %token SUB_ASSIGN LEFT_ASSIGN RIGHT_ASSIGN AND_ASSIGN %token XOR_ASSIGN OR_ASSIGN TYPE_NAME ADD_ASSIGN %token %token %token %token TYPEDEF EXTERN STATIC AUTO REGISTER CHAR SHORT INT LONG SIGNED UNSIGNED FLOAT DOUBLE CONST VOLATILE VOID STRUCT UNION ENUM ELLIPSIS %token CASE DEFAULT IF ELSE SWITCH WHILE DO FOR GOTO CONTINUE %token BREAK RETURN %start translation_unit %% CH1.13 Bison Spec Continued … primary_expression : IDENTIFIER | CONSTANT | STRING_LITERAL CSE4100 | '(' expression ')' ; postfix_expression : primary_expression | postfix_expression | postfix_expression | postfix_expression | postfix_expression | postfix_expression | postfix_expression | postfix_expression ; These are Grammar Rules '[' expression ']' '(' ')' '(' argument_expression_list ')' '.' 
IDENTIFIER PTR_OP IDENTIFIER INC_OP DEC_OP argument_expression_list : assignment_expression | argument_expression_list ',' assignment_expression CH1.14 Bison Spec Continued … These Rules Declare Variables CSE4100 declaration_list : declaration | declaration_list declaration declaration : declaration_specifiers ';' | declaration_specifiers init_declarator_list ';' ; declaration_specifiers : storage_class_specifier | storage_class_specifier declaration_specifiers | type_specifier | type_specifier declaration_specifiers | type_qualifier | type_qualifier declaration_specifiers CH1.15 Bison Spec Continued … statement_list : statement | statement_list statement ; CSE4100 statement : labeled_statement | compound_statement | expression_statement | selection_statement | iteration_statement | jump_statement ; List of Statements and Different Statements labeled_statement : IDENTIFIER ':' statement | CASE constant_expression ':' statement | DEFAULT ':' statement ; CH1.16 Bison Spec Continued … expression_statement : ';' | expression ';' CSE4100 ; selection_statement : IF '(' expression ')' statement | IF '(' expression ')' statement ELSE statement | SWITCH '(' expression ')' statement ; iteration_statement : WHILE '(' expression ')' statement | DO statement WHILE '(' expression ')' ';' | FOR '(' expression_statement expression_statement ')' statement | FOR '(' expression_statement expression_statement expression ')' statement ; CH1.17 Reviewing Compilation Process in C CSE4100 Current Programmers Used to State of the Art IDEs (Eclipse, etc.) IDEs focus on Higher Level Concepts Syntax Correction Capabilities Matching Parens and Brackets Sophisticated Compilation Error Messages Debugging Environment Process Hides Details – In C: Multi-Stage Compilation Process from Source Code to Preprocessed Code to Assembly Code to Object Code to Executable CH1.18 From Source to Preprocessing: What Does main.e Contain? 
CSE4100 # # # # # # # # # # # # # # # 1 "main.c" 1 "<built-in>" 1 "<command-line>" 1 "main.c" 1 "/usr/include/stdio.h" 1 3 4 28 "/usr/include/stdio.h" 3 4 1 "/usr/include/features.h" 1 3 4 330 "/usr/include/features.h" 3 4 1 "/usr/include/sys/cdefs.h" 1 3 4 348 "/usr/include/sys/cdefs.h" 3 4 1 "/usr/include/bits/wordsize.h" 1 3 4 349 "/usr/include/sys/cdefs.h" 2 3 4 331 "/usr/include/features.h" 2 3 4 354 "/usr/include/features.h" 3 4 1 "/usr/include/gnu/stubs.h" 1 3 4 … etc …. CH1.19 From Source to Preprocessing: What Does main.e Contain? … missing almost 900 lines of code … CSE4100 # 4 "main.c" 2 main() { char day[10]; int m, d, y; m = 9; d = 7; y = 2011; strcpy(day, "Wednesday"); printf("Today is: "); printf("%s %d/%d/%d\n", day, m, d, y); } CH1.20 From Preprocessing to Assembly: What Does main.s Contain? CSE4100 CH1.21 What are Final Two Steps? CSE4100 Generation of “.o” files Linking of “.o” files to Create a.out In Olden Days (no IDEs): Compilation of 250K code take 2-3 Hours Utilization of Makefiles for Smart Compilation Creation of Directory Structure for Code Makefiles to Establish Dependencies Among “.h” and “.c” and “.o” Files Change in Time Stamp Causes Recompile Try to Recompile “Least” Amount of Code Then – Relink all “.o” Files into a.out CH1.22 Programming Languages Offer … Abstractions At different levels From low CSE4100 Good for machines…. To high Good for humans…. Three Approaches Interpreted Compiled Mixed CH1.23 Interpreter Motivation… Easiest to implement! Upside ? Downside ? Phases Lexical analysis Parsing Semantic checking Interpretation Interpreted Languages? CSE4100 CH1.24 Compiler Motivation It is natural! Upside? Downside? Phases [Interpreter] Code Generation Code Optimization Link & load Compiled Languages? CSE4100 CH1.25 Mixed Motivation The best of two breeds… Upside ? Downside? Mixed Languages? 
CSE4100 CH1.26 Classifications of Compilers CSE4100 Compilers Viewed from Many Perspectives Single Pass Multiple Pass Construction Load & Go Debugging Optimizing Functional However, All utilize the same basic tasks to accomplish their actions CH1.27 Compiler Structure Two Fundamental Sets of Issues Analysis: Decompose Source into an intermediate representation CSE4100 Text, Syntactic, and Structural Analysis Synthesis: Target program generation from representation which includes optimization We Will Discuss Each Category in This Class CH1.28 Important Notes CSE4100 In Today’s Technology, Analysis Is Often Performed by Software Tools - This Wasn’t the Case in Early CSE Days Structure / Syntax directed editors: Force “syntactically” correct code to be entered Pretty Printers: Standardized version for program structure (i.e., blank space, indenting, etc.) Static Checkers: A “quick” compilation to detect rudimentary errors Interpreters: “real” time execution of code a “line-at-a-time” CH1.29 Important Notes CSE4100 Compilation Is Not Limited to Programming Language Applications Text Formatters LATEX & TROFF Are Languages Whose Commands Format Text Silicon Compilers Textual / Graphical: Take Input and Generate Circuit Design Database Query Processors Database Query Languages Are Also Programming Languages Input Is “compiled” Into a Set of Operations for Accessing the Database CH1.30 Summary of Various Languages Compiled Programming Languages [C,C#,ML,LISP,…] CSE4100 Communication Languages [XML,HTML,…] Presentation Languages [CSS,SGML,…] Hardware Languages [VHDL,…] Formatting Languages [Postscript,troff,LaTeX,…] Query Languages [SQL & friends] CH1.31 The Many Phases of a Compiler Source Program 1 CSE4100 2 3 Symbol-table Manager Syntax Analyzer Semantic Analyzer Error Handler 4 5 6 1, 2, 3 : Analysis - Our Focus 4, 5, 6 : Synthesis Lexical Analyzer Intermediate Code Generator Code Optimizer Code Generator Target Program CH1.32 What Does Relocatable Mean?
Source Program 1 CSE4100 2 3 PreProcessor Compiler Assembler 4 Relocatable Machine Code 5 Loader Link/Editor Library, relocatable object files Executable CH1.33 The Analysis Task For Compilation Three Phases: Linear / Lexical Analysis: Left-to-right Scan to Identify Tokens CSE4100 Hierarchical Analysis: Grouping of Tokens Into Meaningful Collections Semantic Analysis: Checking to Ensure Correctness of Components Source Program Language Analysis Phases 1 2 3 Lexical Analyzer Syntax Analyzer Semantic Analyzer CH1.34 Phase 1. Lexical Analysis CSE4100 Easiest Analysis - Identify tokens which are building blocks For Example: Position := initial + rate * 60 ; _______ __ _____ _ ___ _ __ _ All are tokens Blanks, Line breaks, etc. are scanned out CH1.35 Lexical Analysis Purpose Slice the sequence of symbols into tokens CSE4100 Date x := new Date ( System.today( ) + 30 ) ; Id Date Date new Id Id x Symbol Keyword := Symbol Id System Symbol . Today Symbol ( Id ( Symbol + Symbol ) Integer Symbol 30 ) Symbol ; CH1.36 Phase 2. Hierarchical Analysis aka Parsing or Syntax Analysis assignment statement CSE4100 identifier position := expression identifier initial For 1st example, we would have Parse Tree: expression + expression * expression expression identifier rate number 60 Nodes of tree are constructed using a grammar for the language CH1.37 Syntax Analysis (parsing) CSE4100 For Second example, we would have: Organize tokens in sentences based on grammar Symbol Symbol ; Symbol ) Symbol Symbol new . Keyword Symbol := Symbol + Symbol ) ( ( CH1.38 What is a Grammar? Grammar is a Set of Rules Which Govern the Interdependencies & Structure Among the Tokens CSE4100 statement is an assignment statement, or while statement, or if statement, or ... assignment statement is an identifier := expression ; expression is an (expression), or expression + expression, or expression * expression, or number, or identifier, or ...
CH1.39 Summary so far… Turn a symbol stream into a parse tree CSE4100 CH1.40 Semantic Analysis CSE4100 Lexical Analysis - Scans Input to Identify “words” that are the Tokens of the Language Syntactic Analysis uses Recursion to Identify Structure as Indicated in Parse Tree What is Semantic Analysis? Purpose Determine Unique / Unambiguous Interpretation Catch errors related to program meaning Determine Whether the Sentences have One and Only One Unambiguous Interpretation “John Took a Picture of Mary Out on the Patio” For a PL – Wrong Types, Missing Declaration, Missing Methods, Ambiguous Statements, etc. CH1.41 Phase 3. Semantic Analysis CSE4100 Find More Complicated Semantic Errors and Support Code Generation Parse Tree Is Augmented With Semantic Actions := := position position + initial initial * rate + 60 * rate inttoreal 60 Compressed Tree Conversion Action CH1.42 Phase 3. Semantic Analysis CSE4100 Most Important Activity in This Phase: Type Checking - Legality of Operands Many Different Situations: Real := int + char ; A[int] := A[real] + int ; while char <> int do …. Etc. Primary Tool is Symbol Table CH1.43 Analysis in Text Formatting Simple Commands : LATEX CSE4100 \begin{single} \end{single} \noindent \section{Introduction} Embedded in a stream of text, i.e., a FILE begin single Language noindent Commands section $A_i$ $A_{i_j}$ \ and $ serve as signals to LATEX What are tokens? What is hierarchical structure? What kind of semantic analysis is required? CH1.44 Supporting Phases/ Activities for Analysis Symbol Table Creation / Maintenance Contains Info on Each “Meaningful” Token, Typically Identifiers Data Structure Created / Initialized During Lexical Analysis Utilized / Updated During Later Analysis & Synthesis Error Handling Detection of Different Errors Which Correspond to All Phases What Kinds of Errors Are Found During the Analysis Phase? What Happens When an Error Is Found? CSE4100 CH1.45 Summary so far...
Turned a symbol stream into an annotated parse tree CSE4100 CH1.46 From Analysis to Synthesis Source Program 1 CSE4100 2 3 Symbol-table Manager Syntax Analyzer Semantic Analyzer Error Handler 4 5 6 1, 2, 3 : Analysis - Our Focus 4, 5, 6 : Synthesis Phase Lexical Analyzer Intermediate Code Generator Code Optimizer Code Generator Target Program CH1.47 The Synthesis Task For Compilation Intermediate Code Generation CSE4100 Abstract Machine Version of Code Independent of Architecture Easy to Produce and Do Final, Machine Dependent Code Generation Code Optimization Find More Efficient Ways to Execute Code Replace Code With More Optimal Statements Two approaches: High-level Language & “Peephole” Optimization Final Code Generation Generate Relocatable Machine Dependent Code CH1.48 Intermediate Code CSE4100 What is intermediate code? A low level representation A simple representation Easy to reason about Easy to manipulate Programming Language independent Hardware independent CH1.49 IR Code example CSE4100 Quadruples (x,y,op,z) to represent x := y op z Infinitely many temporaries Implicit call stack management Example Date x := new Date ( System.today( ) + 30 ) ; push System t0 := call today t1 := 30 t2 := t0 + t1 push t2 t3 := call DateFactory x := t3 CH1.50 Code Optimization Purpose Improve the intermediate code CSE4100 Get rid of redundant / dead code Get rid of redundant computation Reorganize code Schedule instructions Factor out code Improve the machine code Register allocation Instruction scheduling [for specific hardware] CH1.51 Machine Code Generation CSE4100 Purpose Generate code for the target architecture Code is relocatable Accounts for platform specifics Registers – IA32: 6 GP registers (int) – MIPS: 32 GP registers (int) Calling convention – IA32: stack-based – MIPS: register based – Sparc: register sliding window based CH1.52 Machine code generation CSE4100 Pragmatics Generate assembly Let an assembler produce the binary code.
CH1.53 Assemblers CSE4100 Assembly code: names are used for instructions, and names are used for memory addresses. MOV a, R1 ADD #2, R1 MOV R1, b Two-pass Assembly: First Pass: all identifiers are assigned to memory addresses (0-offset) e.g. substitute 0 for a, and 4 for b Second Pass: produce relocatable machine code: 0001 01 00 00000000 * 0011 01 10 00000010 0010 01 00 00000100 * relocation bit CH1.54 Linker & Loader Loader Input Relocatable code in a file CSE4100 Output Bring executable into virtual address space. Relocate “shared” libs. Linker Input Many relocatable machine code files (object files) Output A single executable file with all the object files glued together. – Need to relocate each object file as needed Want to learn more? CSE258 (Operating Systems) CH1.55 Reviewing the Entire Process position := initial + rate * 60 lexical analyzer id1 := id2 + id3 * 60 CSE4100 syntax analyzer := + id1 id2 * id3 60 semantic analyzer := Symbol Table position .... + id1 id2 * id3 inttoreal 60 initial …. rate…. intermediate code generator Errors CH1.56 Reviewing the Entire Process Symbol Table CSE4100 position .... initial …. rate…. intermediate code generator Errors temp1 := inttoreal(60) temp2 := id3 * temp1 temp3 := id2 + temp2 id1 := temp3 code optimizer temp1 := id3 * 60.0 id1 := id2 + temp1 final code generator movf id3, r2 mulf #60.0, r2 movf id2, r1 addf r2, r1 movf r1, id1 CH1.57 Compiler Cousins: Preprocessors 1. Macro Preprocessor CSE4100 Purpose: Beautify Code Performs text replacement (editing) Performs file concatenation (#include) Example In the C language proper, #define does not exist In C, #define is handled by a preprocessor Simple Text Substitution #define X 3 #define Y A*B+C #define Z getchar() CH1.58 2. File Inclusion #include in C - bring in another file before compiling CSE4100 defs.h ////// ////// ////// main.c #include “defs.h” …---…---…--…---…---…--…---…---…--- ////// ////// ////// …---…---…--…---…---…--…---…---…--- CH1.59 3.
Rational Preprocessors CSE4100 Augment “Old” Languages With Modern Constructs Add Macros for If - Then, While, Etc. #define Can Make C Code More Pascal-like #define begin { #define end } #define then CH1.60 4. Language Extensions for a Database System EQUEL - Database query language embedded in C CSE4100 ## Retrieve (DN=Department.Dnum) where ## Department.Dname = ‘Research’ is Preprocessed into: ingres_system(“Retr…..Research’”,____,____); a procedure call in a programming language. CH1.61 The Grouping of Phases CSE4100 Front End : Analysis + Intermediate Code Generation vs. Back End : Code Generation + Optimization Number of Passes: Single - Preferred Multiple - Easier, but less efficient Tradeoffs …….. CH1.62 Compiler Construction Tools CSE4100 Parser Generators : Produce Syntax Analyzers Scanner Generators : Produce Lexical Analyzers Syntax-directed Translation Engines : Generate Intermediate Code Automatic Code Generators : Generate Actual Code Data-Flow Engines : Support Optimization CH1.63 Modern Compilers CSE4100 Motto Re-targetable compilers Apply Software Engineering Principles re. Design and Reuse Organization Front-end – Reuse The analysis The generation of intermediate code Back-end – Multiple Outputs The generation of machine code The architecture-specific optimization CH1.64 Modern Compiler Example GCC Front-ends for CSE4100 C,C++,FORTRAN,Objective-C,Java Back-ends for IA32, PPC, Ultra, MIPS, ARM, Itanium, DEC, Alpha, VMS, … CH1.65