
Regression - Naresh Kumar Reddy Chinna Subbannagari

# -*- coding: utf-8 -*-
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
# configure a Spark context
conf = SparkConf().setMaster("local").setAppName("RegressionExample")  # "local" runs Spark in a single local thread; use "local[*]" to treat every core of your desktop as an executor
# create a SparkContext object named "SpContext"
SpContext = SparkContext(conf=conf)
# Create a SparkSession object named "SpSession" (the config bit is only for Windows!)
SpSession = SparkSession.builder.config("spark.sql.warehouse.dir", "temp").getOrCreate()
# Load the CSV file into an RDD
autoData = SpContext.textFile("7-cars-miles-per-gallon.csv")
autoData.cache() # cache the RDD in memory
# autoData.take(5)
# Remove the first line (contains headers)
dataLines = autoData.filter(lambda x: "CYLINDERS" not in x)
dataLines.count()
# Use default for average HP; broadcast it as a shared variable
avgHP = SpContext.broadcast(80.0)
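# A hedged alternative (not in the original flow): derive the default from the data itself
# instead of hardcoding 80.0. The snippet below only computes the value; the broadcast of
# 80.0 above is kept as-is. The names knownHP/computedAvgHP are illustrative.
knownHP = dataLines.map(lambda x: x.split(",")[3]).filter(lambda v: v != "?").map(float)
computedAvgHP = knownHP.mean()  # mean of the non-missing HORSEPOWER values
# avgHP = SpContext.broadcast(computedAvgHP)  # uncomment to broadcast the computed mean instead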
# Function to clean up the data
def CleanupData(inputStr):
    global avgHP
    attList = inputStr.split(",")
    # Replace missing "?" values with the default value [the HORSEPOWER column has a "?" at row 128]
    hpValue = attList[3]  # get the horsepower feature, indexed 3 in attList
    if hpValue == "?":  # check if it is missing
        hpValue = avgHP.value  # use the broadcast default value to fill in the missing entry
    # Create a Row with the cleaned up and converted data
    values = Row(MPG=float(attList[0]),
                 CYLINDERS=float(attList[1]),
                 DISPLACEMENT=float(attList[2]),
                 HORSEPOWER=float(hpValue),
                 WEIGHT=float(attList[4]),
                 ACCELERATION=float(attList[5]),
                 MODELYEAR=float(attList[6]),
                 NAME=attList[7])
    return values
# Run map with the CleanupData method for data cleanup, and create a Row RDD
autoMap = dataLines.map(CleanupData)
autoMap.cache()
# autoMap.take(5)
# Create a DataFrame from the Row RDD.
autoDf = SpSession.createDataFrame(autoMap)
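# Optional sanity check (not in the original): inspect the inferred schema and a few rows
# before running the analytics below.
autoDf.printSchema()
autoDf.show(5)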
"""--------------------------------------------------------------------------
Perform Descriptive Data Analytics
-------------------------------------------------------------------------"""
print("=== Perform Descriptive Data Analytics ===")
# See descriptive analytics.
# use describe() to show basic statistics for the columns: count, mean, stddev, min, max
autoDf.select("MPG", "CYLINDERS", "HORSEPOWER", "WEIGHT", "DISPLACEMENT", "ACCELERATION", "MODELYEAR").describe().show()  # describe all numeric columns
# Find the correlation between each predictor and the target (highly correlated columns normally have higher predictive power)
print("=== Perform Correlation Analytics ===")
for i in autoDf.columns:  # loop through the columns
    # if not (isinstance(autoDf.select(i).take(1)[0][0], str)):  # this is for Python 3.0+
    if not isinstance(autoDf.select(i).take(1)[0][0], str):  # exclude the "NAME" column, which is a string
        print("Correlation to MPG for ", i,
              autoDf.stat.corr('MPG', i))  # correlation between column i and the target MPG
        # df.stat.corr() supports Pearson correlation by default
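# A hedged sketch (assumes Spark 2.2+, where pyspark.ml.stat.Correlation is available):
# the full correlation matrix can also be computed in one pass by first assembling the
# numeric columns into a single vector column. The variable names here are illustrative.
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
numericCols = [c for c in autoDf.columns if c != "NAME"]  # NAME is a string column
corrInput = VectorAssembler(inputCols=numericCols, outputCol="corr_features").transform(autoDf)
corrMatrix = Correlation.corr(corrInput, "corr_features").head()[0]  # Pearson by default
print(corrMatrix.toArray())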
"""--------------------------------------------------------------------------
Prepare data for ML
-------------------------------------------------------------------------"""
print("=== Prepare data for ML and show the 10 examples labeledPoint ===")
# Transform to a DataFrame for input to Machine Learning
# Drop low-correlation columns and keep the higher-correlation ones as features (here ACCELERATION, DISPLACEMENT, WEIGHT, CYLINDERS, HORSEPOWER and MODELYEAR are used; NAME is dropped)
from pyspark.ml.linalg import Vectors
def transformToLabeledPoint(row):  # transform a Row into a (label, feature vector) pair
    lp = (row["MPG"],
          Vectors.dense([row["ACCELERATION"],
                         row["DISPLACEMENT"],
                         row["WEIGHT"],
                         row["CYLINDERS"],
                         row["HORSEPOWER"],
                         row["MODELYEAR"]]))
    return lp
autoLp = autoMap.map(transformToLabeledPoint)  # apply the transformation to every cleaned row
autoDF = SpSession.createDataFrame(autoLp, ["label", "features"])  # ML input: the label (MPG) and the assembled feature vector
autoDF.select("label", "features").show(10)
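# A hedged alternative (not in the original): the same label/features DataFrame could be
# built directly from autoDf with a VectorAssembler instead of mapping the RDD by hand.
# autoDF_alt is an illustrative name; the pipeline below keeps using autoDF.
from pyspark.ml.feature import VectorAssembler
featureCols = ["ACCELERATION", "DISPLACEMENT", "WEIGHT", "CYLINDERS", "HORSEPOWER", "MODELYEAR"]
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")
autoDF_alt = assembler.transform(autoDf).withColumnRenamed("MPG", "label").select("label", "features")
autoDF_alt.show(10)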
"""--------------------------------------------------------------------------
Perform Machine Learning using training data
-------------------------------------------------------------------------"""
print("=== PPerform Machine Learning ===")
# Split into training and testing data
# a DataFrame has a function to split the data randomly into training and test sets
# it returns two DataFrames: one for training, one for testing
(trainingData, testData) = autoDF.randomSplit([0.8, 0.2])
print("Total training data count: " + str(trainingData.count()))
print("Total testing data count" + str(testData.count()))
# Build the model on training data
from pyspark.ml.regression import LinearRegression  # we use the DataFrame-based ML package (pyspark.ml)
lr = LinearRegression(maxIter=10)  # maximum number of iterations
lrModel = lr.fit(trainingData)  # fit() builds the model by training on the training data; lrModel is the fitted model object
# Print the learned model parameters
print("=== Learned model parameters ===")
print("Coefficients: " + str(lrModel.coefficients)) # print out trained coefficients
print("Intercept: " + str(lrModel.intercept)) # print the trained intercept
# Predict on the test data
predictions = lrModel.transform(testData) # use the model to predict the testing data
predictions.select("prediction", "label", "features").show() # show the results of the testing data
"""--------------------------------------------------------------------------
Perform Model Evaluation using test data
-------------------------------------------------------------------------"""
print("=== PPerform Machine Learning ===")
# Find R2 for the linear regression to evaluate prediction performance: R2 ranges from 0 to 1, and the closer to 1 the better
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label",
metricName="r2") # set evaluation parameters
r2 = evaluator.evaluate(
predictions) # start evaluation (compare prediction outcome labels with the real labelled outcomes)
print("The R2 (coefficient of determination) evaluation result is: " + str(r2))