Uploaded by p H

15.8 Data Visualization

advertisement
Python for Rapid Engineering Solutions
Steve Millman
Data Visualization
Welcome!
Today’s Objectives:
• Learn various ways to present data
• Use pandas and seaborn
Create a Dataframe
panda_viz.py:
from pandas import DataFrame, read_csv
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
# bring in the packages
# raw data: parallel lists with one entry per major
majors = ['CS', 'CSE', 'EE', 'Physics', 'Chemistry']  # majors
students = [15, 12, 35, 3, 2]                         # count of students
a_grade = [5, 5, 20, 1, 0]                            # students with As
b_grade = [8, 4, 10, 1, 1]                            # students with Bs
c_grade = [1, 2, 2, 1, 0]                             # students with Cs
d_grade = [1, 1, 2, 0, 1]                             # students with Ds
e_grade = [0, 0, 1, 0, 0]                             # students with Es

# column headings
cols = ['major', 'count', 'a_grade', 'b_grade', 'c_grade',
        'd_grade', 'e_grade']
grades = cols[2:]  # only the grade column names

# zip them together to create a list of tuples (one tuple per major)
enrolled = list(zip(majors, students, a_grade, b_grade, c_grade, d_grade, e_grade))
student_df = DataFrame(data=enrolled, columns=cols)
# index the rows by major; drop=False keeps 'major' as a regular column too
student_df.set_index('major', inplace=True, drop=False)
print(student_df)
Look at the Dataframe
> python panda_viz.py
               major  count  a_grade  b_grade  c_grade  d_grade  e_grade
major
CS                CS     15        5        8        1        1        0
CSE              CSE     12        5        4        2        1        0
EE                EE     35       20       10        2        2        1
Physics      Physics      3        1        1        1        0        0
Chemistry  Chemistry      2        0        1        0        1        0
Pie Chart
panda_viz.py (continued):
# generate plots - a pie chart of the student count per major
major_counts = student_df['count']
major_labels = student_df['major']
major_counts.plot.pie(y='major', labels=major_labels, autopct='%1.1f%%')
plt.axis('off')  # turn off the axis lines and labels
plt.title('number of students per major')
plt.show()
Exploding Pie Chart
panda_viz.py (continued):
# generate plots - an exploding pie chart with a shadow
# one explode offset per slice: only the last three slices are pulled out,
# all the others stay at 0
explode_list = np.zeros(len(student_df), dtype=float)
explode_list[-3:] = [.1, .2, .3]
student_df['count'].plot.pie(
    y='major',
    labels=student_df['major'],
    autopct='%1.1f%%',
    explode=explode_list,
    shadow=True,
    startangle=0,
)
plt.axis('off')
plt.title('students percentage per major: exploding slices')
plt.show()
Pie Chart Problems
Pie charts aren't always clear!
Slices too close together in size
Some slices too thin
Labeling gets confusing
Bar charts are replacing them
Histogram
panda_viz.py (continued):
# generate plots - a histogram of the per-major student counts
# bin edges every 5 students, starting at 0 and reaching at least the largest count
count_bins = np.arange(0, student_df['count'].max() + 5, 5)
student_df.plot.hist(y='count', bins=count_bins)
plt.title('histogram of students per major')
plt.xlabel('number of students')
plt.show()
Line Chart
panda_viz.py (continued):
# generate plots - line chart of the whole dataframe
# (legend automatically added if > 1 line!)
student_df.plot.line()
plt.title('students per major')
plt.ylabel('number of students')
plt.xlabel('major')
plt.show()
Line Chart
panda_viz.py (continued):
# generate a line chart for the EE major only
ee_grades = student_df.loc['EE', grades]  # the EE row, grade columns only
ee_grades.plot.line()
plt.xlabel('grades')
plt.ylabel('number of students')
plt.title('grade distribution for EE students')
plt.show()
Area Chart - Stacked
panda_viz.py (continued):
# create a stacked area plot for grades per major
# extract just the grades, and transpose so axes are swapped!
grades_df = student_df[grades].T
grades_df.plot.area()  # create the plot
plt.xlabel('grade')
plt.ylabel('number of students')
plt.title('grade distribution - stacked')
plt.show()
Area Chart - Unstacked
panda_viz.py (continued):
# create an area chart that is not stacked: the areas overlap,
# drawn with alpha=0.75
grades_df.plot.area(stacked=False, alpha=0.75)
plt.title('grade distribution - unstacked')
plt.xlabel('grade')
plt.ylabel('number of students')
plt.show()
A Data Frame Tracking Gifts
panda_viz.py (continued):
# read in a dataframe for use with bar charts
# index_col=0 uses the first CSV column as the row labels
gifts_df = read_csv('gifts.csv', index_col=0)
print("\n\nthe new dataframe")
print(gifts_df)
the new dataframe
         year1  year2  year3  year4  year5  year6  year7  year8
toys         4      8      6      7      2      5      6      7
games        3      4      2      5      4      2      5     11
books        1      3      2      5      4      3      5      1
puzzles      1      2      2      1      2      3      0      2
Bar Chart – One Series
panda_viz.py (continued):
# create a bar chart for one of the items: the 'toys' row
toys_row = gifts_df.loc['toys']
toys_row.plot.bar(rot=0)  # rot rotates x-axis labels
plt.title('toys per year')
plt.ylabel('count')
plt.show()
Bar Chart – All the Data
panda_viz.py (continued):
# bar chart of the whole dataframe: all the items at once
gifts_df.plot.bar(rot=0)
plt.title('gifts per year')
plt.ylabel('count')
plt.show()
Bar Chart – Stacked
panda_viz.py (continued):
# bar chart of all the items at once, with the bars stacked
gifts_df.plot.bar(stacked=True, rot=0)
plt.title('gifts per year - stacked')
plt.ylabel('count')
plt.show()
Bar Chart – Stacked Horizontally
panda_viz.py (continued):
# stacked bar chart of all the items at once, drawn horizontally
gifts_df.plot.barh(stacked=True, rot=0)
plt.title('gifts per year - horizontal stack')
plt.xlabel('count')  # the label moves to the x-axis for horizontal bars
plt.show()
Confusing Box and Whiskers
panda_viz.py (continued):
# box and whiskers plot taken straight from the dataframe
gifts_df.plot.box()
plt.title('box chart for gifts per year')
plt.ylabel('gifts per year')
plt.show()
Proper Box and Whiskers
panda_viz.py (continued):
# transpose first (swap the axes) so the box plot makes more sense
gifts_tp_df = gifts_df.T
gifts_tp_df.plot.box()
plt.ylabel('gifts per year')
plt.title('box chart for gift types per year')
plt.show()
Horizontal Box and Whiskers
panda_viz.py (continued):
# same box plot of the transposed frame, shown horizontally
gifts_tp_df.plot.box(vert=False)
plt.title('box chart for gift types per year horizontally')
plt.xlabel('gifts per year')
plt.show()
Multiple Panda Plots in One Plot
panda_viz.py (continued):
# create subplots for the graphs!
fig = plt.figure()
ax0 = fig.add_subplot(1,2,1)
# add to first position of 1x2 display
ax1 = fig.add_subplot(1,2,2)
# add to second position of 1x2 display
gifts_tp_df.plot(kind='box',ax=ax0,figsize=(10,4))
gifts_tp_df.plot(kind='box',vert=False,ax=ax1)
ax0.set_ylabel('gifts per year')
ax0.set_title('gift types per year')
ax1.set_xlabel('gifts per year')
ax1.set_title('gift types per year')
plt.suptitle('veritcal vs horizontal box charts')
plt.show()
Show Multiple Plots at Once
Add a Row and Transpose
panda_viz.py (continued):
# add a row that sums the others, then transpose so that row becomes a
# column, and number the resulting rows 1..N in a new 'Year Num' column
print("\n\nadd a total row, transpose, and add a column")
yearly_totals = gifts_df.sum()
gifts_df.loc['Yearly Total'] = yearly_totals
gifts_tp_df = gifts_df.T
gifts_tp_df['Year Num'] = list(range(1, len(gifts_tp_df) + 1))
print(gifts_tp_df)
add a total row, transpose, and add a column
       toys  games  books  puzzles  Yearly Total  Year Num
year1     4      3      1        1             9         1
year2     8      4      3        2            17         2
year3     6      2      2        2            12         3
year4     7      5      5        1            18         4
year5     2      4      4        2            12         5
year6     5      2      3        3            13         6
year7     6      5      5        0            16         7
year8     7     11      1        2            21         8
Scatter Plot
panda_viz.py (continued):
# scatter plot of the yearly total against the year number
gifts_tp_df.plot.scatter(x='Year Num', y='Yearly Total')
plt.title('scatter plot of gifts vs year')
plt.show()
Modify the Marker Size
panda_viz.py (continued):
# create a bubble plot after creating normalized weights
# shift so the smallest yearly total maps to 0 ...
weights = gifts_tp_df['Yearly Total'] - gifts_tp_df['Yearly Total'].min()
# ... and divide by the range (max - min) so the largest maps to 1
weights /= (gifts_tp_df['Yearly Total'].max() - gifts_tp_df['Yearly Total'].min())
weights += 1        # in case of 0!
weights *= weights  # get larger differences
weights *= 100      # make visible
# s= sets the marker size of each point from its weight
gifts_tp_df.plot(kind='scatter', x='Year Num', y='Yearly Total', s=weights)
plt.title('scatter plot of gifts vs year')
plt.show()
Bubble Plot
Use Seaborn For Regression
panda_viz.py (continued):
# use seaborn with pandas - create a new data frame with sample data:
# the values 0.0 .. 49.0 with random noise (scaled by 5) added
clean_data = np.arange(0.0, 50.0)
noise = np.random.randn(clean_data.size) * 5
data = clean_data + noise
sample_num = range(50)
sample_rows = zip(sample_num, data)
sample_df = DataFrame(data=sample_rows, columns=['sample', 'value'])
# regress the 'value' column against the frame's row index
ax = sn.regplot(x=sample_df.index, y='value', data=sample_df)
plt.title('regression line with 95% confidence level')
plt.xlabel('sample number')
plt.show()
Regression Line
Download