Python for Rapid Engineering Solutions Steve Millman Data Visualization Welcome! Today’s Objectives: • Learn various ways to present data • Use pandas and seaborn Create a Dataframe panda_viz.py: from pandas import DataFrame, read_csv import pandas as pd import matplotlib.pyplot as plt import numpy as np import seaborn as sn # bring in the packages majors = ['CS','CSE','EE','Physics','Chemistry'] # majors students = [15, 12, 35, 3, 2] # count of students a_grade = [5,5,20,1,0] # students with As b_grade = [8,4,10,1,1] # students with Bs c_grade = [1,2,2,1,0] # students with Cs d_grade = [1,1,2,0,1] # students with Ds e_grade = [0,0,1,0,0] # students with Es # column headings cols = [ 'major', 'count', 'a_grade', 'b_grade', 'c_grade', 'd_grade', 'e_grade'] grades = cols[2:] # zip them together to create a list of tuples enrolled = list(zip(majors,students,a_grade,b_grade,c_grade,d_grade,e_grade)) student_df = DataFrame( data = enrolled, columns = cols ) student_df.set_index('major',inplace=True,drop=False) print(student_df) Look at the Dataframe > python panda_viz.py major count a_grade b_grade c_grade d_grade e_grade major CS CS 15 5 8 1 1 0 CSE CSE 12 5 4 2 1 0 EE EE 35 20 10 2 2 1 Physics Physics 3 1 1 1 0 0 Chemistry Chemistry 2 0 1 0 1 0 Pie Chart panda_viz.py (continued): # generate plots - a pie chart student_df['count'].plot(kind='pie',y='major', labels=student_df['major'],autopct='%1.1f%%') plt.title('number of students per major') plt.axis('off') plt.show() Exploding Pie Chart panda_viz.py (continued): # generate plots - an exploding pie chart with a shadow explode_list = np.zeros(len(student_df),float) explode_list[-3:] = [.1,.2,.3] student_df['count'].plot(kind='pie',y='major',shadow=True,explode=explode_list, startangle=0, labels=student_df['major'],autopct='%1.1f%%') plt.title('students percentage per major: exploding slices') plt.axis('off') plt.show() Pie Chart Problems Pie charts aren't always clear! 
Slices too close together in size Some slices too thin Labeling gets confusing Bar charts are replacing them Histogram panda_viz.py (continued): # generate plots - a histogram student_df.plot(kind='hist',y='count', bins=np.arange(0,student_df['count'].max()+5,5)) plt.xlabel('number of students') plt.title('histogram of students per major') plt.show() Line Chart panda_viz.py (continued): # generate plots - legend automatically added if > 1 line! student_df.plot(kind='line') plt.xlabel('major') plt.ylabel('number of students') plt.title('students per major') plt.show() Line Chart panda_viz.py (continued): # generate a line chart for EE major ee_grades = student_df.loc['EE',grades] ee_grades.plot(kind='line') plt.title('grade distribution for EE students') plt.ylabel('number of students') plt.xlabel('grades') plt.show() Area Chart - Stacked panda_viz.py (continued): # create an area plot for grades per major # and transpose so axes are swapped! grades_df = student_df[grades].transpose() # extract just the grades grades_df.plot(kind='area') # create the plot plt.title('grade distribution - stacked') plt.ylabel('number of students') plt.xlabel('grade') plt.show() Area Chart - Unstacked panda_viz.py (continued): # create an area chart that is not stacked grades_df.plot(kind='area',stacked=False,alpha=0.75) plt.ylabel('number of students') plt.xlabel('grade') plt.title('grade distribution - unstacked') plt.show() A Data Frame Tracking Gifts panda_viz.py (continued): # read in a dataframe for use with bar charts gifts_df = read_csv('gifts.csv',index_col=0) print("\n\nthe new dataframe") plt.title('grade distribution') print(gifts_df) the new dataframe year1 year2 year3 year4 year5 year6 year7 year8 toys 4 8 6 7 2 5 6 7 games 3 4 2 5 4 2 5 11 books 1 3 2 5 4 3 5 1 puzzles 1 2 2 1 2 3 0 2 Bar Chart – One Series panda_viz.py (continued): # create a bar chart for one of the items gifts_df.loc['toys'].plot(kind='bar',rot=0) plt.ylabel('count') plt.title('toys per year') 
plt.show() # rot rotates x-axis labels Bar Chart – All the Data panda_viz.py (continued): # create a bar chart for all the items at once gifts_df.plot(kind='bar',rot=0) plt.ylabel('count') plt.title('gifts per year') plt.show() Bar Chart – Stacked panda_viz.py (continued): # create a bar chart for all the items at once and stack them gifts_df.plot(kind='bar',rot=0,stacked=True) plt.ylabel('count') plt.title('gifts per year - stacked') plt.show() Bar Chart – Stacked Horizontally panda_viz.py (continued): # create a horizontal bar chart for all the items at once and stack them gifts_df.plot(kind='barh',rot=0,stacked=True) plt.xlabel('count') plt.title('gifts per year - horizontal stack') plt.show() Confusing Box and Whiskers panda_viz.py (continued): # create a box and whiskers plot gifts_df.plot(kind='box') plt.ylabel('gifts per year') plt.title('box chart for gifts per year') plt.show() Proper Box and Whiskers panda_viz.py (continued): # swap the axes so it makes more sense gifts_tp_df = gifts_df.transpose() gifts_tp_df.plot(kind='box') plt.title('box chart for gift types per year') plt.ylabel('gifts per year') plt.show() Horizontal Box and Whiskers panda_viz.py (continued): # show the graph horizontally gifts_tp_df.plot(kind='box',vert=False) plt.xlabel('gifts per year') plt.title('box chart for gift types per year horizontally') plt.show() Multiple Panda Plots in One Plot panda_viz.py (continued): # create subplots for the graphs! 
fig = plt.figure() ax0 = fig.add_subplot(1,2,1) # add to first position of 1x2 display ax1 = fig.add_subplot(1,2,2) # add to second position of 1x2 display gifts_tp_df.plot(kind='box',ax=ax0,figsize=(10,4)) gifts_tp_df.plot(kind='box',vert=False,ax=ax1) ax0.set_ylabel('gifts per year') ax0.set_title('gift types per year') ax1.set_xlabel('gifts per year') ax1.set_title('gift types per year') plt.suptitle('vertical vs horizontal box charts') plt.show() Show Multiple Plots at Once Add a Row and Transpose panda_viz.py (continued): # add a row that sums the others; transpose so it's a column print("\n\nadd a total row, transpose, and add a column") gifts_df.loc['Yearly Total'] = gifts_df.sum() gifts_tp_df = gifts_df.transpose() gifts_tp_df['Year Num'] = list(range(1,len(gifts_tp_df)+1)) print(gifts_tp_df) add a total row, transpose, and add a column toys games books puzzles Yearly Total Year Num year1 4 3 1 1 9 1 year2 8 4 3 2 17 2 year3 6 2 2 2 12 3 year4 7 5 5 1 18 4 year5 2 4 4 2 12 5 year6 5 2 3 3 13 6 year7 6 5 5 0 16 7 year8 7 11 1 2 21 8 Scatter Plot panda_viz.py (continued): # create a scatter plot gifts_tp_df.plot(kind='scatter',x='Year Num',y='Yearly Total') plt.title('scatter plot of gifts vs year') plt.show() Modify the Marker Size panda_viz.py (continued): # create a bubble plot after creating normalized weights weights = gifts_tp_df['Yearly Total'] - gifts_tp_df['Yearly Total'].min() weights /= ( gifts_tp_df['Yearly Total'].max() - gifts_tp_df['Yearly Total'].min() ) weights += 1 # in case of 0! 
weights *= weights # get larger differences weights *= 100 # make visible gifts_tp_df.plot(kind='scatter',x='Year Num',y='Yearly Total',s=weights) plt.title('scatter plot of gifts vs year') plt.show() Bubble Plot Use Seaborn For Regression panda_viz.py (continued): # use seaborn with pandas - create a new data frame with sample data clean_data = np.arange(0.0,50.0) noise = 5 * np.random.randn(len(clean_data)) data = clean_data + noise sample_num = range(0,50) sample_df = DataFrame(data = zip(sample_num,data),columns=['sample','value']) ax = sn.regplot(x=sample_df.index,y='value',data=sample_df) plt.xlabel('sample number') plt.title('regression line with 95% confidence level') plt.show() Regression Line