Python for Rapid Engineering Solutions Steve Millman Data Visualization Welcome! Today’s Objectives: • Learn various ways to present data • Use pandas and seaborn Create a Dataframe from pandas import DataFrame, read_csv import pandas as pd import matplotlib.pyplot as plt import numpy as np import seaborn as sn # bring in the packages majors = ['CS','CSE','EE','Physics','Chemistry'] # majors students = [15, 12, 35, 3, 2] # count of students a_grade = [5,5,20,1,0] # students with As b_grade = [8,4,10,1,1] # students with Bs c_grade = [1,2,2,1,0] # students with Cs d_grade = [1,1,2,0,1] # students with Ds e_grade = [0,0,1,0,0] # students with Es # column headings cols = [ 'major', 'count', 'a_grade', 'b_grade', 'c_grade', 'd_grade', 'e_grade'] grades = cols[2:] # zip them together to create a list of tuples enrolled = list(zip(majors,students,a_grade,b_grade,c_grade,d_grade,e_grade)) student_df = DataFrame( data = enrolled, columns = cols ) student_df.set_index('major',inplace=True,drop=False) print(student_df) Look at the Dataframe ➢ python major count a_grade b_grade c_grade d_grade e_grade major CS CS 15 5 8 1 1 0 CSE CSE 12 5 4 2 1 0 EE EE 35 20 10 2 2 1 Physics Physics 3 1 1 1 0 0 Chemistry Chemistry 2 0 1 0 1 0 Pie Chart (continued): # generate plots - a pie chart student_df['count'].plot(kind='pie',y='major', labels=student_df['major'],autopct='%1.1f%%') plt.title('number of students per major') plt.axis('off') Exploding Pie Chart (continued): # generate plots - an exploding pie chart with a shadow explode_list = np.zeros(len(student_df),float) explode_list[-3:] = [.1,.2,.3] student_df['count'].plot(kind='pie',y='major',shadow=True,explode=explode_list, startangle=0, labels=student_df['major'],autopct='%1.1f%%') plt.title('students percentage per major: exploding slices') plt.axis('off') Pie Chart Problems Pie charts aren't always clear! 
Slices too close together in size Some slices too thin Labeling gets confusing Bar charts are replacing them Histogram (continued): # generate plots - a histogram student_df.plot(kind='hist',y='count', bins=np.arange(0,student_df['count'].max()+5,5)) plt.xlabel('number of students') plt.title('histogram of students per major') Line Chart (continued): # generate plots - legend automatically added if > 1 line! student_df.plot(kind='line') plt.xlabel('major') plt.ylabel('number of students') plt.title('students per major') Line Chart (continued): # generate a line chart for EE major ee_grades = student_df.loc['EE',grades] ee_grades.plot(kind='line') plt.title('grade distribution for EE students') plt.ylabel('number of students') plt.xlabel('grades') Area Chart - Stacked (continued): # create an area plot for grades per major # and transpose so axes are swapped! grades_df = student_df[grades].transpose() # extract just the grades grades_df.plot(kind='area') # create the plot plt.title('grade distribution - stacked') plt.ylabel('number of students') plt.xlabel('grade') Area Chart - Unstacked (continued): # create an area chart that is not stacked grades_df.plot(kind='area',stacked=False,alpha=0.75) plt.ylabel('number of students') plt.xlabel('grade') plt.title('grade distribution - unstacked') A Data Frame Tracking Gifts (continued): # read in a dataframe for use with bar charts gifts_df = read_csv('gifts.csv',index_col=0) print("\n\nthe new dataframe") plt.title('grade distribution') print(gifts_df) the new dataframe year1 year2 year3 year4 year5 year6 year7 year8 toys 4 8 6 7 2 5 6 7 games 3 4 2 5 4 2 5 11 books 1 3 2 5 4 3 5 1 puzzles 1 2 2 1 2 3 0 2 Bar Chart – One Series (continued): # create a bar chart for one of the items gifts_df.loc['toys'].plot(kind='bar',rot=0) # rot rotates x-axis labels plt.ylabel('count') plt.title('toys per year') Bar Chart – All the Data (continued): # create a bar chart for all the items at once gifts_df.plot(kind='bar',rot=0) 
plt.ylabel('count') plt.title('gifts per year') Bar Chart – Stacked (continued): # create a bar chart for all the items at once and stack them gifts_df.plot(kind='bar',rot=0,stacked=True) plt.ylabel('count') plt.title('gifts per year - stacked') Bar Chart – Stacked Horizontally (continued): # create a horizontal bar chart for all the items at once and stack them gifts_df.plot(kind='barh',rot=0,stacked=True) plt.xlabel('count') plt.title('gifts per year - horizontal stack') Confusing Box and Whiskers (continued): # create a box and whiskers plot gifts_df.plot(kind='box') plt.ylabel('gifts per year') plt.title('box chart for gifts per year') Proper Box and Whiskers (continued): # swap the axes so it makes more sense gifts_tp_df = gifts_df.transpose() gifts_tp_df.plot(kind='box') plt.title('box chart for gift types per year') plt.ylabel('gifts per year') Horizontal Box and Whiskers (continued): # show the graph horizontally gifts_tp_df.plot(kind='box',vert=False) plt.xlabel('gifts per year') plt.title('box chart for gift types per year horizontally') Multiple Panda Plots in One Plot (continued): # create subplots for the graphs! 
fig = plt.figure() ax0 = fig.add_subplot(1,2,1) # add to first position of 1x2 display ax1 = fig.add_subplot(1,2,2) # add to second position of 1x2 display gifts_tp_df.plot(kind='box',ax=ax0,figsize=(10,4)) gifts_tp_df.plot(kind='box',vert=False,ax=ax1) ax0.set_ylabel('gifts per year') ax0.set_title('gift types per year') ax1.set_xlabel('gifts per year') ax1.set_title('gift types per year') plt.suptitle('vertical vs horizontal box charts') Show Multiple Plots at Once Add a Row and Transpose (continued): # add a row that sums the others; transpose so it's a column print("\n\nadd a total row, transpose, and add a column") gifts_df.loc['Yearly Total'] = gifts_df.sum() gifts_tp_df = gifts_df.transpose() gifts_tp_df['Year Num'] = list(range(1,len(gifts_tp_df)+1)) print(gifts_tp_df) add a total row, transpose, and add a column toys games books puzzles Yearly Total Year Num year1 4 3 1 1 9 1 year2 8 4 3 2 17 2 year3 6 2 2 2 12 3 year4 7 5 5 1 18 4 year5 2 4 4 2 12 5 year6 5 2 3 3 13 6 year7 6 5 5 0 16 7 year8 7 11 1 2 21 8 Scatter Plot (continued): # create a scatter plot gifts_tp_df.plot(kind='scatter',x='Year Num',y='Yearly Total') plt.title('scatter plot of gifts vs year') Modify the Marker Size (continued): # create a bubble plot after creating normalized weights weights = gifts_tp_df['Yearly Total'] - gifts_tp_df['Yearly Total'].min() weights /= ( gifts_tp_df['Yearly Total'].max() - gifts_tp_df['Yearly Total'].min() ) weights += 1 # in case of 0! 
weights *= weights # get larger differences weights *= 100 # make visible gifts_tp_df.plot(kind='scatter',x='Year Num',y='Yearly Total',s=weights) plt.title('scatter plot of gifts vs year') Bubble Plot Use Seaborn For Regression (continued): # use seaborn with pandas - create a new data frame with sample data clean_data = np.arange(0.0,50.0) noise = 5 * np.random.randn(len(clean_data)) data = clean_data + noise sample_num = range(0,50) sample_df = DataFrame(data = zip(sample_num,data),columns=['sample','value']) ax = sn.regplot(x=sample_df.index,y='value',data=sample_df) plt.xlabel('sample number') plt.title('regression line with 95% confidence level') Regression Line