Uploaded by maymyat26804

MIS formula sheet

advertisement
Query (1) - Selecting columns:
SELECT payment_id, client_id, date, amount
FROM payments;

Query (2) - Filtering rows 1: Query (1) + payments made between 2019-01-01 and 2019-01-15
SELECT payment_id, client_id, date, amount, payment_method
FROM payments
WHERE date between '2019-01-01' and '2019-01-15';

Query (3) - Filtering rows 2: Query (2) + payments made by credit card
SELECT payment_id, client_id, date, amount, payment_method
FROM payments
WHERE date between '2019-01-01' and '2019-01-15'
and payment_method = 1;
Importing required python libraries and
Reading datasets
# To import required python libraries
import pandas as pd
import numpy as np
# To read 'CARS.csv' dataset and save it to a dataframe
df = pd.read_csv('CARS.csv') # The .csv files should be
stored at the same folder where this .ipynb file is
stored
# to get a concise summary of the dataset
df.info()
Note: In Python, the term “object” is quite the catchall; including numbers, strings of characters, lists, and
functions - a Python object is essentially anything that
you can assign to a variable.
# Methods to examine dataframes
# df.describe()
# df.columns # df.dtypes
# df.dtypes.value_counts()
# df.shape # df.size
Selecting Columns and Slicing Rows
Selecting columns
# To select one column in a vector (series in Python).
For example, df['Model']
# To see a more DataFrame-like format you can use
the `to_frame` method df['Model'].to_frame()
# OR, using double square brackets df[['Model']]
# To select multiple columns in a dataframe
df[['Make', 'Model','EngineSize']]
Slicing/Filtering rows
# To slice the first row from a dataframe with numeric
indexes (to get the first row)
df.loc[0] # or df.iloc[0]. loc = 'index' location, iloc =
'numeric' location ('position') of the index
# To slice the last row(to get the last row)
df.iloc[-1]
# To slice rows from 0 to 4(to get first five rows)
df.loc[0:4]
# To slice rows using a (filtering) condition
df.loc[df['EngineSize']> 4] # engine size in the df is
greater than 4 (meaning of the code above)
# slice rows using multiple conditions # &: 'and'
df.loc[(df['EngineSize']>4) & (df['Type']=='Sedan')]
# To slice rows using multiple conditions # |: 'or'
df.loc[(df['EngineSize']>4) | (df['Type']=='Sedan')]
# To slice rows using conditions and select columns
df.loc[df['EngineSize']> 4, ['Make', 'Model',
'EngineSize']]
Transforming Datasets
Renaming Column Names
# To rename column names
df.rename(columns={'Model': 'Model Name',
'EngineSize': 'Engine Size'})
# To check the column names of 'df' again: you will
see the names remain the same
df #it's because the change is not permanent
# To permanently rename columns, save into another
dataframe including itself
df_t = df.rename(columns={'Model': 'Model
Name','EngineSize': 'Engine Size'})
df_t
# Or, use 'inplace=True' save into another dataframe
including itself
df.rename(columns={'Model': 'Model
Name','EngineSize': 'Engine Size'}, inplace=True)
# inplace=True: to commit the changes to the
dataframe (making the change permanent) df
Creating/Adding Columns
df.info()
# To create columns from other columns
df['MPG_Highway/MPG_City'] =
df['MPG_Highway']/df['MPG_City']
df
Dropping Columns
# To drop columns permanently
df.drop(columns= 'MPG_Highway/MPG_City',
inplace=True) # inplace=True: to commit the changes
to the dataframe (to make changes permanent) df
table1_df_report.to_csv('In-Class
Exercise4_Table1.csv', index=None)
Importing pandas library
Importing pandas library
import pandas as pd
import numpy as np
Reading data
# Orderdetails table
od_df = pd.read_csv("data/orderdetails.csv")
# Products table
p_df = pd.read_csv("data/products.csv")
# Orders table
o_df = pd.read_csv("data/orders.csv")
# Customers table
cus_df = pd.read_csv("data/customers.csv")
# Employees table
emp_df = pd.read_csv("data/employees.csv")
# Offices table
off_df = pd.read_csv("data/offices.csv")
import pandas as pd
import numpy as np
# Orderdetails table
od_df = pd.read_csv("data/orderdetails.csv")
#.read_csv : reading data from csv file named
orderdetails.csv, storing it in a dataframe called od_df
od_df.head() #return the first 5 rows by default
# Products table
prod_df = pd.read_csv("data/products.csv")
prod_df.head() prod_df.info()
Grouping and Aggregating
#To get the sum of quantityOrdered, we need to group
'od_df' by 'productCode' and aggregate the sum of
'quantityOrdered' of each group, i.e. each product
table1_df_grp = od_df.groupby('productCode',
as_index=False).aggregate({'quantityOrdered':'sum'})
# if you don't have 'as_index=False', 'productCode'
becomes an index, not a column
table1_df_grp
# If you want to aggregate multiple columns, for
example, to get the number of orders for each
'productCode' as well,
od_df.groupby('productCode',
as_index=False).aggregate({'quantityOrdered': 'sum',
'orderNumber': 'count'})
# Or, use '.nunique' to count the number of unique
values.
Tidying up the result table
(We need to add two additional columns, 'productName' and
'productVendor,' and to do so, we need to merge 'table1_df_grp'
and 'prod_df' dataframes. Before merging dataframes, first we
need to check if there are common columns in the dataframes
to be merged, and also the common columns have the same
name and the same data type)
table1_df_grp.info() #checking the common columns
prod_df.info() # & check they have the same name & data type
(Here, both dataframes have a common field 'productCode' &
the data types are the same in 'object'. So, now we can just go
ahead and merge the two dataframes on 'productCode.')
# to merge two dataframes
table1_df_merge = table1_df_grp.merge(prod_df,
on="productCode")
table1_df_merge
table1_df_merge.info()
table1_df_merge.shape → shows (row, column)
# to select columns to report
table1_df_report = table1_df_merge[['productName',
'productCode', 'productVendor', 'quantityOrdered']]
table1_df_report #below is the output
Reading data
Grouping and Aggregating
To get all the aggregate values, we need to join the
following dataframes:
od_df, p_df, o_df, cus_df, emp_df, off_df
Merging/Joining dataframes
To check and make sure that the column names and data types
of common fields of the dataframes to be merged are the same
# Changing 'salesRepEmployeeNumber' in cus_df to
'employeeNumber' as it is in emp_df
cus_df = cus_df.rename(columns =
{'salesRepEmployeeNumber': 'employeeNumber'})
# Changing the data type of 'employeeNumber' in
both dataframes to 'object'
emp_df['employeeNumber'] =
emp_df['employeeNumber'].astype('object')
cus_df['employeeNumber'] =
cus_df['employeeNumber'].astype('object')
cus_df['customerNumber'] =
cus_df['customerNumber'].astype('object')
o_df['customerNumber'] =
o_df['customerNumber'].astype('object')
o_df['orderNumber'] =
o_df['orderNumber'].astype('object')
od_df['orderNumber'] =
od_df['orderNumber'].astype('object')
# productCode is already 'object',
# Now, merge all the dataframes
table_m = od_df.merge(p_df,
on="productCode").merge(o_df,
on="orderNumber").merge(cus_df,
on="customerNumber").merge(emp_df,
on="employeeNumber").merge(off_df,
on="officeCode")
table_m
(Alterna0vely, we can merge dataframes without changing
the column name from 'salesRepEmployeeNumber' in
cus_df dataframe to 'employeeNumber')
# First, let's import 'customer.csv' file again.
#Since we changed the column name from
'salesRepEmployeeNumber' in cus_df dataframe to
'employeeNumber' earlier, # we need to have a fresh
'cus_df'.
cus_df=pd.read_csv("_data/MySql_classicmodels/cust
omers.csv")
# Then, let's change the data types.
emp_df['employeeNumber'] =
emp_df['employeeNumber'].astype('object')
cus_df['salesRepEmployeeNumber'] =
cus_df['salesRepEmployeeNumber'].astype('object')
cus_df['customerNumber'] =
cus_df['customerNumber'].astype('object')
o_df['customerNumber'] =
o_df['customerNumber'].astype('object')
What if column names and data types are diff?
**Renaming column names
# df = df.rename(columns = {'current column name':
'new column name'})
**Changing data types
# df['column name'] = df['column name'].astype('object')
Alternatively, without changing you can merge
dataframes like below:
# df_merged = pd.merge(df1, df2,
left_on='left_column_name',
right_on='right_column_name')

o_df['orderNumber'] =
o_df['orderNumber'].astype('object')
od_df['orderNumber'] =
od_df['orderNumber'].astype('object')
# productCode is already 'object',
# Now, merge 'emp_df' and 'cus_df' with different
column names, 'employeeNumber' and
'salesRepEmployeeNumber,' and continue to merge
other dataframes.
table_m = pd.merge(emp_df, cus_df,
left_on="employeeNumber",
right_on="salesRepEmployeeNumber").merge(o_df,
on="customerNumber").merge(od_df,

Calculating Basic Statistics
cyl = df['Cylinders'] #Cylinders is one of columns
cyl.min() # cyl.max()
cyl.sum() cyl.mean() cyl.std() cyl.median()
cyl.describe()
#count,mean,std,min,25%,50%,75%,max, name, dtype
cyl.quantile(.25)
cyl.quantile([.2, .4, .6, .8, 1]) #also show, name,dtype

Exporting/saving the result table as CSV file
# to save the result table in a csv file
Query (4): Query (3) + presenting payment method name
SELECT payment_id, client_id, date, amount,
name AS 'Payment Method Name'
FROM payments p
INNER JOIN payment_methods pm on
p.payment_method =
pm.payment_method_id
WHERE date between '2019-01-01' and '2019-01-15'
and name = 'Credit Card';

Query (5): Query (4) + adding client name
SELECT payment_id, c.client_id, c.name AS 'Client Name',
date, amount, pm.name AS 'Payment Method Name'
FROM payments p
INNER JOIN payment_methods pm on p.payment_method =
pm.payment_method_id
INNER JOIN clients c ON c.client_id = p.client_id
WHERE date between '2019-01-01' and '2019-01-15'
and pm.name = 'Credit Card';

Query (6): Query (5) + adding invoice number and invoice total
SELECT
payment_id,
c.client_id,
c.name AS 'Client Name',
date, amount, pm.name AS 'Payment Method Name',
i.number AS 'Invoice Number',
invoice_total AS 'Invoice Total'
FROM payments p
INNER JOIN payment_methods pm on p.payment_method =
pm.payment_method_id
INNER JOIN clients c ON c.client_id = p.client_id
INNER JOIN invoices i ON i.invoice_id = p.invoice_id
WHERE date between '2019-01-01' and '2019-01-15'
and pm.name = 'Credit Card';
# to sort rows in the descending order of
'quantityOrdered'
table1_df_report = table1_df_report.sort_values('quantityOrdered', ascending=False)
table1_df_report
# to tidy up column names and report the first 10 rows
table1_df_report = table1_df_report.rename(columns={'quantityOrdered':'SumOfQuantityOrdered'}).head(10)
table1_df_report
Query (7): Query (6) +
grouping by client
SELECT c.client_id,
c.name AS 'Client
Name',
pm.name AS 'Payment
Method Name',
sum(invoice_total) AS
'Sum of Invoice total',
sum(amount) AS 'Sum
of Payment total'
FROM payments p
INNER JOIN
payment_methods pm
on p.payment_method
=
pm.payment_method_i
d
INNER JOIN clients c
ON c.client_id =
p.client_id
INNER JOIN invoices i
ON i.invoice_id =
p.invoice_id
WHERE date between
'2019-01-01' and
'2019-01-15'
and pm.name = 'Credit
Card'
GROUP BY c.client_id
ORDER BY
sum(amount);
on="orderNumber").merge(p_df,
on="productCode")
Note that the script for merging 'emp_df' and 'cus_df' is:
pd.merge(emp_df, cus_df, left_on="employeeNumber",
right_on="salesRepEmployeeNumber")
Grouping and Aggregating
table_m.info() #46columns,2996rows
# Before grouping, add aggregate columns
table_m['Total Quantity Ordered'] =
table_m['quantityOrdered']
table_m['Total Sales Revenue'] =
table_m['quantityOrdered'] * table_m['priceEach']
table_m['Total Costs of Sales'] =
table_m['quantityOrdered'] * table_m['buyPrice']
table_m['Total Sales Margin'] =
table_m['quantityOrdered'] * (table_m['priceEach'] - table_m['buyPrice'])
table_m.info() #50columns,2996rows (columns are
added at the bottom of the list)
# Grouping by 'officeCode' and aggregating different
columns
table_m_grp = table_m.groupby('officeCode',
as_index=False).aggregate({'employeeNumber':'nunique',
'customerNumber':'nunique','orderNumber':'nunique',
'productCode':'nunique','Total Quantity Ordered': 'sum',
'Total Sales Revenue':'sum', 'Total Costs of
Sales':'sum', 'Total Sales Margin':'sum'})
# The .csv files should be stored at the same
folder where this .ipynb file is stored
# to get a concise summary of the dataset
df.info()
Exploring Single Variable
2.1 Single Numeric Variable
2.1.1 Calculating Basic Descriptive Statistics
sm = df['SalesMargin']
sm.info()
sm.describe()
(We need to add two additional columns, 'city' and 'country'
from 'offices', and to do so, we need to merge
'table_m_grp' and 'off_df' dataframes.
Before merging dataframes, first we need to check if there
are common columns in the dataframes to be merged,
and also the common columns have the same name and
the same data type.)
table_m_grp.info()
off_df.info() (Both dataframes have a common field 'officeCode'
and the data types are the same in 'int64'. So, now we can just
go ahead and merge the two dataframes on 'officeCode.')
# Merging off_df to add more info about offices
table_rpt = table_m_grp.merge(off_df,
on="officeCode")
table_rpt.info()
# Selecting columns to report
table_rpt_f = table_rpt[['officeCode', 'city', 'country',
'employeeNumber','customerNumber','orderNumber'
,'productCode', 'Total Quantity Ordered','Total Sales
Revenue', 'Total Costs of Sales', 'Total Sales Margin']]
table_rpt_f
# Tidying up column names and reporting the
finalized table
table_rpt_f=table_rpt_f.rename(columns={'officeCode
': 'Office Code', 'city':'City', 'country':'Country',
'employeeNumber':'Number of Sales Rep',
'customerNumber':'Number of Customers Assigned',
'orderNumber':'Number of
Orders','productCode':'Number of Distinct Product
Items Sold'})
table_rpt_f
Expor8ng/Saving the result table as CSV file
# Saving the result table in a csv file
table_rpt_f.to_csv('MIS2023_HW4_SuggestedAns
wers_table.csv', index=None)
In-Class Exercise 5: Exploratory Data
Analysis (EDA) & Data Visualization
3 Popular Visualization Tools in Python
* pd.Dataframe.plot: "df.plot.xxx"
* matplotlib.pylab [as plt]: "plt.xxx"
* seaborn [as sns]: "sns.xxx"
Importing required Python Libraries
# To import required python libraries
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt
Reading data
# To read 'CARS.csv' dataset and save it to a
dataframe
df = pd.read_csv('In-Class Exercise5_data.csv')
(Density is an alternative to histograms that can provide
more insight into the distribution of the data points.)
#Density plot using pandas.DataFrame.plot [as pd]
sm.plot.density(figsize=(3, 4), xlim=[0, 130000])
plt.title("Density Plot of Sales Margin", fontsize=10)
plt.xlabel('Sales Margin')
plt.tight_layout(); plt.show()
2.1.2 Percentiles and Boxplots
sm.quantile([0.05, 0.25, 0.5, 0.75, 0.95])
# Boxplot using pandas.DataFrame.plot [as pd]
sm.plot.box(figsize=(3, 4)) # or
df.SalesMargin.plot.box(figsize=(3,4))
plt.title("Boxplot of Sales Margin", fontsize=10)
plt.ylabel("Amounts")
plt.tight_layout()
plt.show()
# Density plot using seaborn [as sns]
plt.figure(figsize=(3,4))
plt.title("Density Plot of Sales Margin", fontsize=10)
sns.kdeplot(sm)
plt.xlabel('Sales Margin')
plt.tight_layout(); plt.show()
# Histogram with density plot using
pandas.DataFrame.plot [as pd]
table_m_grp
Tidying up the result table
2.1.4 Density Plots
# Boxplot using matplotlib.pylab [as plt]
plt.figure(figsize=(3,4))
plt.title("Boxplot of Sales Margin", fontsize=10)
plt.boxplot(sm) # or plt.boxplot(df.SalesMargin)
plt.ylabel('Amount')
plt.xlabel('Sales Margin')
plt.tight_layout()
plt.show()
2.1.3 Frequency Tables and Histograms
# Looking at a column
df['SalesMargin'] or df.SalesMargin
Frequency Table
- The `cut` method for _pandas_ data splits the
dataset into bins.
- There are a number of arguments for the method.
- The following code creates equal sized bins.
- The method `value_counts` returns a frequency
table.
#Getting a frequency table
binnedSalesMargin = pd.cut(sm, 10)
print(binnedSalesMargin.value_counts())
# Looking at the bin which each row belongs to
binnedSalesMargin
binnedSalesMargin.info()
binnedSalesMargin
sm # df['SalesMargin'] ←
# Combining two series (df.SalesMargin,
binnedSalesMargin) together into a dataframe
# Renaming the 'binnedSalesMargin' series name
binnedSalesMargin=binnedSalesMargin.rename("Bin")
binned=pd.concat([sm,binnedSalesMargin],axis=1)
binned
binned.info() ←
Histogram
# Histogram using pandas.DataFrame.plot [as pd]
sm.plot.hist(figsize=(3, 4), bins=10) # alternatively,
df.SalesMargin
plt.title("Histogram of Sales Margin", fontsize=10)
plt.ylabel("Counts"); plt.xlabel("Sales Margin")
plt.tight_layout(); plt.show()
# Histogram using matplotlib.pylab [as plt]
plt.figure(figsize=(3,4))
plt.title("Histogram of Sales Margin", fontsize=10)
plt.hist(sm, bins=10) # or plt.hist(df.SalesMargin)
plt.ylabel('Counts'); plt.xlabel('Sales Margin')
plt.tight_layout(); plt.show()
sm.plot.hist(density=True, xlim=[6000, 130000],
bins=10, figsize=(3, 4))
sm.plot.density()
plt.title("Histogram with Density Plot of Sales Margin",
fontsize=10)
# plt.ylabel('Proportion')
plt.xlabel('Sales Margin')
plt.tight_layout(); plt.show()
2.2 A Single Categorical Variable
2.2.1 Count Table
# We need to prepare a count table first
df_type = df['productLine'].value_counts().to_frame()
df_type.info()
df_type # Notice that 'productLine' is an index, not a
column
#.loc: you can slice a row directly using a value in the
index
df_type.loc['Ships'] → output: count, name, dtype
#.iloc: you can slice a row directly using the location
df_type.iloc[5] → same output
2.2.2 Bar Plot
# Bar Plot using pandas.DataFrame.plot [as pd]
df_type.plot.bar(figsize=(4, 4), legend=False,
width=0.7)
plt.title("Bar Plot of Product Lines", fontsize=12)
plt.ylabel("Counts")
plt.xlabel("Product Line")
plt.xticks(rotation=45) # rotate the x-axis values
plt.tight_layout(); plt.show()
# Bar plot using matplotlib.pylab [as plt]
# First, convert index ('productLine') to column
df_type_i=df_type.reset_index()
df_type_i # Notice that now 'productLine' is a column.
# Draw a bar plot
plt.figure(figsize=(4,5))
plt.title("Bar Plot of Product Lines", fontsize=10)
plt.bar(df_type_i['productLine'], df_type_i['count'],
color=['r', 'g', 'b', 'y', 'dodgerblue', 'C2', '#e35f62'],
width=0.7)
plt.ylabel('Counts')
plt.xlabel('Product Lines')
plt.xticks(rotation=45) # rotate the x-axis values
plt.tight_layout(); plt.show()
3. Exploring Multiple Variables
3.1 Two Numeric Variables -- Scatter Plots
df.info()
# Scatter plot using pandas.DataFrame.plot [as pd]
df.plot.scatter(x='SumOfQuantityOrdered',
y='SalesMargin', figsize=(6, 4), marker='$\u25EF$')
# $\u25EF$ uses an open circle for each point
plt.title("Scatter Plot of Sum of Quantity Ordered and
Sales Margin", fontsize=10)
plt.ylabel('Sales Margin')
plt.xlabel('Sum of Quantity Ordered')
plt.xticks(rotation=45) # rotate the x-axis values
plt.tight_layout(); plt.show()
# Scatter plot using matplotlib.pylab [as plt]
plt.figure(figsize=(6, 4))
plt.title("Scatter Plot of Sum of Quantity Ordered and
Sales Margin", fontsize=12)
plt.scatter(x= df.SumOfQuantityOrdered,
y=df.SalesMargin, c ="blue")
plt.ylabel('Sales Margin')
plt.xlabel('Sum of Quantity Ordered')
plt.xticks(rotation=45) # rotate the x-axis values
plt.tight_layout(); plt.show()
df['SumOfQuantityOrdered']
(Scatter plot using seaborn [as sns]
Since Seaborn uses Matplotlib's plotting functions internally, we
can use functions like `plt.figure` and `plt.title` to modify the
figure. -- Notice how the points in the above plot seem to form
distinct clusters with some outliers. -- We can color the dots using
the 'productLine' as a `hue`. -- We can also make the points larger
using the `s` argument.)
# Scatter plot using seaborn [as sns]
plt.figure(figsize=(6, 4))
plt.title("Scatter Plot of Sum of Quantity Ordered and
Sales Margin", fontsize=10)
sns.scatterplot(x=df.SumOfQuantityOrdered,
y=df.SalesMargin, hue=df.productLine, s=70)
plt.ylabel('Sales Margin')
plt.xlabel('Sum of Quantity Ordered')
plt.xticks(rotation=45) # rotate the x-axis values
3.2 Categorical and Numeric Variables
3.2.1 Grouped Boxplot
# Pandas boxplots of a column grouped by a different
column. # Grouped boxplot using
pandas.DataFrame.plot [as pd]
df.boxplot(by='productLine', column='SalesMargin',
figsize=(6, 5))
plt.title("Boxplot of Sales Margin over Product Lines",
fontsize=10)
plt.ylabel('Sales Margin')
plt.xlabel('Product Lines')
plt.xticks(rotation=45) # rotate the x-axis values
plt.tight_layout(); plt.show()
Extra: Drawing a boxplot of multiple columns
df.boxplot(column=['SalesRevenue','CostofSales',
'SalesMargin'], figsize=(6, 5))
plt.title("Boxplot of Sales Measures", fontsize=10)
plt.ylabel('Amount')
plt.xlabel('Sales Measures')
plt.xticks(rotation=45) # rotate the x-axis values
plt.tight_layout(); plt.show()
3.2.2 Stacked Histograms over Categories
# Create subsets based on the values of categorical
variable
classic_df = df[df.productLine == 'Classic Cars']
motor_df = df[df.productLine == 'Motorcycles']
vintage_df = df[df.productLine == 'Vintage Cars']
planes_df = df[df.productLine == 'Planes']
truck_bus_df = df[df.productLine == 'Trucks and
Buses']
ship_df = df[df.productLine == 'Ships']
train_df = df[df.productLine == 'Trains']
# Stacked histogram using matplotlib.pylab [as plt]
plt.title('Distribution of Sales Margin (Stacked)')
plt.hist([classic_df.SalesMargin,
motor_df.SalesMargin, vintage_df.SalesMargin,
planes_df.SalesMargin, truck_bus_df.SalesMargin,
ship_df.SalesMargin,
train_df.SalesMargin],stacked=True);
plt.legend(['Classic Cars', 'Motorcycles', 'Vintage
Cars','Planes','Trucks and Buses','Ships', 'Trains']);
plt.tight_layout(); plt.show()
# Unstacked histogram using matplotlib.pylab [as plt]
plt.title('Distribution of Sales Margin (Unstacked)',
fontsize=10)
plt.hist([classic_df.SalesMargin,
motor_df.SalesMargin, vintage_df.SalesMargin,
planes_df.SalesMargin, truck_bus_df.SalesMargin,
ship_df.SalesMargin, train_df.SalesMargin],
stacked=False);
plt.legend(['Classic Cars', 'Motorcycles', 'Vintage
Cars','Planes','Trucks and Buses','Ships', 'Trains']);
plt.tight_layout(); plt.show()
3.2.3 Density Plot with Multiple Product Lines
# converting data into wide-format
data_wide = df.pivot(columns='productLine',
values='SalesMargin')
data_wide
# Multiple Density Plots using pandas.DataFrame.plot
[as pd] # calling density() to make multiple density plots
data_wide.plot.density(figsize = (7, 7), linewidth = 4)
plt.title('Density Plot with Multiple Product Lines',
fontsize=13)
plt.legend(prop={'size': 10}, title = 'Product Line')
plt.xlabel('Sales Margin', fontsize=12)
plt.ylabel('Density', fontsize=12)
plt.tight_layout(); plt.show()
3.3 Two Categorical Variables
3.3.1 Crosstab/Contingency Table
df.info()
# There is only one categorical variable in the 'df'
object # Let's add a new categorical variable based on
'SumOfQuantityOrdered'
df.SumOfQuantityOrdered.describe()
# Creating a new variable, 'quantityOrdered_levels',
depending on the following value:
# lower than or equal to 25th percentile: 'low'
# higher than 25th percentile & lower than or equal to
75th percentile: 'medium'
# higher than 75th percentile: 'high'
# creating a list of our conditions
conditions = [ (df['SumOfQuantityOrdered'] <= 917),
(df['SumOfQuantityOrdered'] > 917) &
(df['SumOfQuantityOrdered'] <= 998),
(df['SumOfQuantityOrdered'] > 998)]
# creating a list of the values we want to assign for
each condition
values = ['low', 'medium', 'high']
# creating a new column and use np.select to assign
values to it using our lists as arguments
df['quantityOrdered_levels'] = np.select(conditions,
values)
# display updated DataFrame
df ← this part done here
df['quantityOrdered_levels'].value_counts() →
output: medium, low, high, name, dtype ←
contingency = pd.crosstab(df['productLine'],
df['quantityOrdered_levels'])
contingency → output:
contingency.info() ←
3.3.2 Grouped Bar Chart
# Grouped barchart using pandas.DataFrame.plot [as
pd]
contingency.plot.bar(figsize=(8,6))
plt.title("Bar chart of Quantity Ordered Levels over
Product Lines", fontsize=10)
plt.ylabel('Counts')
plt.xlabel('Product Lines')
plt.xticks(rotation=45) # rotate the x-axis values
plt.tight_layout(); plt.show()
Download