# -*- coding: utf-8 -*-

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row

# Configure a Spark context
conf = SparkConf().setMaster("local").setAppName("RegressionExample")  # treat every core of your desktop as an executor

# Create a SparkContext object named "SpContext"
SpContext = SparkContext(conf=conf)

# Create a SparkSession object named "SpSession" (the config bit is only for Windows!)
SpSession = SparkSession.builder.config("spark.sql.warehouse.dir", "temp").getOrCreate()

# Load the CSV file into an RDD
autoData = SpContext.textFile("7-cars-miles-per-gallon.csv")
autoData.cache()  # cache the RDD in memory
# autoData.take(5)

# Remove the first line (it contains the headers)
dataLines = autoData.filter(lambda x: "CYLINDERS" not in x)
dataLines.count()

# Use a default value for the average horsepower; broadcast it as a shared variable
avgHP = SpContext.broadcast(80.0)

# Function to clean up the data
def CleanupData(inputStr):
    global avgHP
    attList = inputStr.split(",")

    # Replace missing "?" values with a normal value (the HORSEPOWER column; e.g. row 128)
    hpValue = attList[3]  # get the horsepower feature, indexed 3 in attList
    if hpValue == "?":  # check whether it is missing
        hpValue = avgHP.value  # use the broadcast default value to fill in the missing entry

    # Create a Row with cleaned-up and converted data
    values = Row(MPG=float(attList[0]),
                 CYLINDERS=float(attList[1]),
                 DISPLACEMENT=float(attList[2]),
                 HORSEPOWER=float(hpValue),
                 WEIGHT=float(attList[4]),
                 ACCELERATION=float(attList[5]),
                 MODELYEAR=float(attList[6]),
                 NAME=attList[7])
    return values

# Run map with the CleanupData function to clean the data and create an RDD of Rows
autoMap = dataLines.map(CleanupData)
autoMap.cache()
# autoMap.take(5)

# Create a DataFrame from the Row RDD
autoDf = SpSession.createDataFrame(autoMap)

"""--------------------------------------------------------------------------
Perform Descriptive Data Analytics
-------------------------------------------------------------------------"""
print("=== Perform Descriptive Data Analytics ===")

# See descriptive analytics.
# Use describe() to show basic statistics for the columns, such as count, mean, std, min and max
# (compare, for example, the statistics of MPG and CYLINDERS)
autoDf.select("MPG", "CYLINDERS", "HORSEPOWER", "WEIGHT", "DISPLACEMENT",
              "ACCELERATION", "MODELYEAR").describe().show()

# Find the correlation between the predictors and the target
# (columns with high correlation normally have higher predictive power)
print("=== Perform Correlation Analytics ===")
for i in autoDf.columns:  # loop through the columns
    if not isinstance(autoDf.select(i).take(1)[0][0], str):  # exclude the NAME column, which is a string
        # correlation between column i and the target MPG;
        # df.stat.corr() uses Pearson correlation by default
        print("Correlation to MPG for ", i, autoDf.stat.corr('MPG', i))
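# --------------------------------------------------------------------------
# Optional sketch (an addition, not part of the original script): the pairwise
# loop above could also be expressed as a single correlation matrix computed
# with pyspark.ml.stat.Correlation (Spark 2.2+) on an assembled feature vector.
# The column list and the "corr_features" column name are illustrative
# assumptions; uncomment to try it.
# from pyspark.ml.feature import VectorAssembler
# from pyspark.ml.stat import Correlation
#
# numericCols = ["MPG", "CYLINDERS", "DISPLACEMENT", "HORSEPOWER",
#                "WEIGHT", "ACCELERATION", "MODELYEAR"]
# assembler = VectorAssembler(inputCols=numericCols, outputCol="corr_features")
# corrVectorDf = assembler.transform(autoDf).select("corr_features")
# corrMatrix = Correlation.corr(corrVectorDf, "corr_features").head()[0]  # Pearson by default
# print(corrMatrix.toArray())  # rows/columns follow the order of numericCols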
"""--------------------------------------------------------------------------
Prepare data for ML
-------------------------------------------------------------------------"""
print("=== Prepare data for ML and show the first 10 labeled points ===")

# Transform the data into a DataFrame suitable for machine learning.
# Drop columns with low correlation to the target and keep the high-correlation
# columns, for example ACCELERATION, DISPLACEMENT and WEIGHT.
from pyspark.ml.linalg import Vectors

def transformToLabeledPoint(row):
    # transform a Row into a LabeledPoint-style (label, features) tuple
    lp = (row["MPG"],
          Vectors.dense([row["ACCELERATION"],
                         row["DISPLACEMENT"],
                         row["WEIGHT"],
                         row["CYLINDERS"],
                         row["HORSEPOWER"],
                         row["MODELYEAR"]]))
    return lp

autoLp = autoMap.map(transformToLabeledPoint)  # use the function to transform rows into labeled points
autoDF = SpSession.createDataFrame(autoLp, ["label", "features"])  # the label (MPG) and the feature vector
autoDF.select("label", "features").show(10)

"""--------------------------------------------------------------------------
Perform Machine Learning using training data
-------------------------------------------------------------------------"""
print("=== Perform Machine Learning ===")

# Split into training and testing data.
# A DataFrame has a function to split the data randomly into a training and a test dataset;
# it returns two datasets: one for training, one for testing
(trainingData, testData) = autoDF.randomSplit([0.8, 0.2])
print("Total training data count: " + str(trainingData.count()))
print("Total testing data count: " + str(testData.count()))

# Build the model on the training data
from pyspark.ml.regression import LinearRegression  # we use the DataFrame-based machine learning package
lr = LinearRegression(maxIter=10)  # maximum number of iterations
lrModel = lr.fit(trainingData)  # algorithm.fit() starts model building (training); lrModel is the trained model object

# Print the learned model parameters
print("=== Learned model parameters ===")
print("Coefficients: " + str(lrModel.coefficients))  # print the trained coefficients
print("Intercept: " + str(lrModel.intercept))  # print the trained intercept

# Predict on the test data
predictions = lrModel.transform(testData)  # use the model to predict on the testing data
predictions.select("prediction", "label", "features").show()  # show the results on the testing data

"""--------------------------------------------------------------------------
Perform Model Evaluation using test data
-------------------------------------------------------------------------"""
print("=== Perform Model Evaluation ===")

# Find R2 for the linear regression to evaluate prediction performance:
# R2 ranges from 0 to 1, and the closer to 1 the better
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label",
                                metricName="r2")  # set the evaluation parameters
r2 = evaluator.evaluate(predictions)  # evaluate by comparing the predictions with the real labels
print("The R2 (coefficient of determination) evaluation result is: " + str(r2))
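# --------------------------------------------------------------------------
# Optional sketch (an addition, not part of the original script): R2 can be
# complemented with an error metric such as RMSE, using the same
# RegressionEvaluator with a different metricName ("rmse" is one of the
# supported metric names). The variable names below are illustrative.
rmseEvaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label",
                                    metricName="rmse")
rmse = rmseEvaluator.evaluate(predictions)  # root mean squared error on the test set
print("The RMSE (root mean squared error) evaluation result is: " + str(rmse))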