NAME : Durvang Vijay Kulkarni
CLASS : TY CSE C
ROLL NO : C32
PRN : 2324000622
COURSE : MACHINE LEARNING LAB
import pandas as pd
import numpy as np
df = pd.read_csv('Housing.csv')
df.head()
      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no
1  12250000  8960         4          4        4      yes        no       no
2  12250000  9960         3          2        2      yes        no      yes
3  12215000  7500         4          2        2      yes        no      yes
4  11410000  7420         4          1        2      yes       yes      yes

  hotwaterheating airconditioning  parking prefarea furnishingstatus
0              no             yes        2      yes        furnished
1              no             yes        3       no        furnished
2              no              no        2      yes    semifurnished
3              no             yes        3      yes        furnished
4              no             yes        2       no        furnished
df.isnull().sum()
price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64
# plotting graph
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(14,6))
# Plot 1
plt.subplot(1,2,1)
sns.boxplot(x = df['price'])
plt.title('Price')
plt.subplot(1,2,2)
sns.boxplot(x = df['area'])
plt.title('Area')
Text(0.5, 1.0, 'Area')
plt.figure(figsize=(14,6))
# Plot 1
plt.subplot(1,2,1)
sns.boxplot(x = df['price'])
plt.title('Price')
plt.subplot(1,2,2)
sns.boxplot(x = df['area'])
plt.title('Area')
Text(0.5, 1.0, 'Area')
#for area
Q1 = df['area'].quantile(0.25)
Q3 = df['area'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = df[(df['area'] < lower_bound) | (df['area'] > upper_bound)]
# Imputing outliers with upper_bounds and lower_bounds
# median_area = df['area'].median()
df['area'] = df['area'].apply(lambda x : lower_bound if x < lower_bound else x)
df['area'] = df['area'].apply(lambda x : upper_bound if x > upper_bound else x)
df['area'] = np.log(df['area'])
df['price'] = np.log(df['price'])
plt.figure(figsize=(14,6))
# Plot 1
plt.subplot(1,2,2)
sns.boxplot(x = df['area'])
plt.title('Area')
Text(0.5, 1.0, 'Area')
plt.scatter(df['area'], df['price'])
plt.xlabel('Area')
plt.ylabel('Price')
Text(0, 0.5, 'Price')
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
x = df[['area']]
y = df[['price']]
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.2, random_state = 42)
model = LinearRegression()
model.fit(xtrain, ytrain)
y_pred = model.predict(xtest)
print('MSE : ', mean_squared_error(ytest, y_pred))
print('R square score : ', r2_score(ytest, y_pred))
MSE :  0.12869183466229298
R square score :  0.333509351961115
## Simple Linear Regression
plt.figure(figsize=(8,6))
plt.scatter(xtest, ytest, color='blue', label='Actual Prices') # Original test data
plt.plot(xtest, y_pred, color='red', linewidth=2, label='Regression Line') # Predicted line
plt.xlabel('Area')
plt.ylabel('Price')
plt.title('Linear Regression: Area vs Price')
plt.show()
price = model.predict(pd.DataFrame([[8900]], columns=['area']))
print(price)
[[4737.94812806]]