Goal: predict the sentiment of each post.
import findspark
findspark.init()
import pyspark.sql.functions as f
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.ml.feature import OneHotEncoder, StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, LogisticRegression, LinearSVC, MultilayerPerceptronClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.ml import Pipeline, Model
from pyspark.ml.pipeline import PipelineModel
spark = SparkSession.builder.appName("MLTask2").getOrCreate()
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/04 00:35:07 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
22/05/04 00:35:15 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Attempted to request executors before the AM has registered!
Make sure your SparkSession is active:
spark
SparkSession - in-memory
Create a DataFrame called df_in, which holds the results of the NLP project.
df_in = spark.read.parquet('s3://yl1269-labdata5/nlp_df/')
df_in.printSchema()
root
 |-- parent_id: string (nullable = true)
 |-- author: string (nullable = true)
 |-- id: string (nullable = true)
 |-- author_premium: boolean (nullable = true)
 |-- author_flair_richtext: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- author_flair_text_color: string (nullable = true)
 |-- body: string (nullable = true)
 |-- collapsed: boolean (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- no_follow: boolean (nullable = true)
 |-- score: long (nullable = true)
 |-- send_replies: boolean (nullable = true)
 |-- total_awards_received: long (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- author_created_time: string (nullable = true)
 |-- created_time: string (nullable = true)
 |-- post_date: string (nullable = true)
 |-- is_PtoP: integer (nullable = true)
 |-- is_BuckleUp: integer (nullable = true)
 |-- is_voted: integer (nullable = true)
 |-- author_postcnt: long (nullable = true)
 |-- is_popular_commented: integer (nullable = true)
 |-- is_weekend: integer (nullable = true)
 |-- author_age: integer (nullable = true)
 |-- gme_mentioned: integer (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- blackrock_mentioned: boolean (nullable = true)
 |-- share_mentioned: boolean (nullable = true)
df_stock = spark.read.csv('s3://yl1269-labdata5/GME.csv',header=True)
df_stock.show(10)
+----------+----------+----------+----------+----------+----------+--------+
|      Date|      Open|      High|       Low|     Close| Adj Close|  Volume|
+----------+----------+----------+----------+----------+----------+--------+
|2021-03-01|104.540001|133.990005| 99.970001|120.400002|120.400002|49794000|
|2021-03-02|116.930000|133.199997|112.199997|118.180000|118.180000|33783000|
|2021-03-03|122.510002|127.750000|113.120003|124.180000|124.180000|19273900|
|2021-03-04|125.000000|147.869995|115.300003|132.350006|132.350006|32606900|
|2021-03-05|128.169998|151.529999|127.500000|137.740005|137.740005|30733700|
|2021-03-08|154.889999|210.869995|146.100006|194.500000|194.500000|63565600|
|2021-03-09|217.710007|249.850006|208.509995|246.899994|246.899994|39099300|
|2021-03-10|269.429993|348.500000|172.000000|265.000000|265.000000|71570600|
|2021-03-11|241.639999|281.500000|232.600006|260.000000|260.000000|28312500|
|2021-03-12|275.000000|295.500000|262.269989|264.500000|264.500000|25845900|
+----------+----------+----------+----------+----------+----------+--------+
only showing top 10 rows
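Note that with header=True alone every column is read as a string, which is why the price columns are cast to double further down. A sketch of the alternative, assuming the same CSV layout, is to let Spark infer the types at read time:
# Alternative (sketch): infer numeric column types while reading the CSV
df_stock = spark.read.csv('s3://yl1269-labdata5/GME.csv', header=True, inferSchema=True)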
# There is no stock data on weekends, so keep only the weekday rows of df_in
df = df_in.filter(col("is_weekend")== 0).dropna()
df = df.drop("is_weekend")
#Add usr activity column
df_act = df.groupby("post_date").agg(f.count(col("id")).alias("user_activity"))
df = df.join(df_act, "post_date","left")
# Examine the relationships between columns
import numpy as np
import seaborn as sns
# adjust figure size and font size
sns.set(rc={"figure.figsize": (20, 12)})
sns.set(font_scale=1)
# compute the correlation matrix on a 100,000-row sample using the Kendall method
corr = df.limit(100000).toPandas().corr(method='kendall')
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr,annot=True, fmt='.2f', square=True, mask = mask, cmap="Blues");
/mnt/miniconda/lib/python3.7/site-packages/scipy/stats/stats.py:4812: RuntimeWarning: overflow encountered in long_scalars
  (2 * xtie * ytie) / m + x0 * y0 / (9 * m * (size - 2)))
/mnt/miniconda/lib/python3.7/site-packages/scipy/stats/stats.py:4814: RuntimeWarning: invalid value encountered in sqrt
  np.sqrt(var) / np.sqrt(2)))
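The overflow warnings come from Kendall tie handling on large integer columns in this pandas/SciPy version; pandas also silently drops non-numeric columns. A sketch that makes the numeric selection explicit before converting to pandas (the type list is assumed from the schema above):
# Keep only numeric columns before sampling and computing correlations
num_cols = [fld.name for fld in df.schema.fields
            if isinstance(fld.dataType, (IntegerType, LongType, DoubleType))]
corr = df.select(num_cols).limit(100000).toPandas().corr(method='kendall')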
df = df.dropna()
df.groupby('sentiment').count().show()
+---------+-------+
|sentiment|  count|
+---------+-------+
| positive|3209651|
|  neutral| 394656|
| negative| 961150|
+---------+-------+
# Compute class weights; neutral and negative are grouped together as the non-positive class
n = df.count()
n_sen_p = df.filter(col('sentiment')=='positive').count()
n_sen_n = n - n_sen_p
c = 2  # number of classes after grouping
weight_sen_p = n/(c*n_sen_p)
weight_sen_n = n/(c*n_sen_n)
# Assign the weights to a new column
df = df.withColumn('weight_sen', f.when(col('sentiment')=='positive', weight_sen_p).otherwise(weight_sen_n))
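As a quick sanity check against the counts above: n = 4,565,457 and n_sen_p = 3,209,651, so weight_sen_p = n/(2 * 3,209,651) ≈ 0.71, while the remaining 1,355,806 neutral and negative rows get weight_sen_n = n/(2 * 1,355,806) ≈ 1.68. Minority-class rows therefore count roughly 2.4 times as much during training, which balances the two classes.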
#join stock data
df = df.join(df_stock,df.post_date == df_stock.Date,"left")
# Cast the stock columns from string to double, and no_follow from boolean to integer
df = df.withColumn("Open", col("Open").cast(DoubleType()))\
       .withColumn("High", col("High").cast(DoubleType()))\
       .withColumn("Low", col("Low").cast(DoubleType()))\
       .withColumn("Close", col("Close").cast(DoubleType()))\
       .withColumn("Volume", col("Volume").cast(DoubleType()))\
       .withColumn("no_follow", col("no_follow").cast(IntegerType()))
df.count()
4170801
train_data, test_data, predict_data = df.randomSplit([0.8, 0.18, 0.02], 24)
After splitting into three datasets, report the number of rows for each split.
print("Number of training records: " + str(train_data.count()))
print("Number of testing records : " + str(test_data.count()))
print("Number of prediction records : " + str(predict_data.count()))
Number of training records: 3336530
Number of testing records : 751268
Number of prediction records : 83003
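One caveat: randomSplit is only reproducible across actions if the input's lineage is stable, because Spark may recompute the DataFrame for each count. A common safeguard (a sketch, optional if each split is materialized only once):
# Cache before splitting so repeated actions see identical splits
df = df.cache()
train_data, test_data, predict_data = df.randomSplit([0.8, 0.18, 0.02], seed=24)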
train_data.printSchema()
root
 |-- post_date: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- author: string (nullable = true)
 |-- id: string (nullable = true)
 |-- author_premium: boolean (nullable = true)
 |-- author_flair_richtext: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- author_flair_text_color: string (nullable = true)
 |-- body: string (nullable = true)
 |-- collapsed: boolean (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- no_follow: integer (nullable = true)
 |-- score: long (nullable = true)
 |-- send_replies: boolean (nullable = true)
 |-- total_awards_received: long (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- author_created_time: string (nullable = true)
 |-- created_time: string (nullable = true)
 |-- is_PtoP: integer (nullable = true)
 |-- is_BuckleUp: integer (nullable = true)
 |-- is_voted: integer (nullable = true)
 |-- author_postcnt: long (nullable = true)
 |-- is_popular_commented: integer (nullable = true)
 |-- author_age: integer (nullable = true)
 |-- gme_mentioned: integer (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- blackrock_mentioned: boolean (nullable = true)
 |-- share_mentioned: boolean (nullable = true)
 |-- user_activity: long (nullable = true)
 |-- weight_sen: double (nullable = false)
 |-- Date: string (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Adj Close: string (nullable = true)
 |-- Volume: double (nullable = true)
stringIndexer_sentiment = StringIndexer(inputCol="sentiment", outputCol="sentiment_ix")
stringIndexer_author = StringIndexer(inputCol="author", outputCol="author_ix")
stringIndexer_post_date = StringIndexer(inputCol="post_date", outputCol="post_date_ix")
grade_label_fit = stringIndexer_sentiment.fit(df)
grade_label_fit.labels
['positive', 'negative']
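StringIndexer orders labels by descending frequency, so 'positive', the majority class, maps to index 0.0. A minimal toy sketch (hypothetical data) that confirms this behavior:
toy = spark.createDataFrame([("positive",), ("positive",), ("negative",)], ["sentiment"])
StringIndexer(inputCol="sentiment", outputCol="ix").fit(toy).transform(toy).show()
# 'positive' -> 0.0 (most frequent), 'negative' -> 1.0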
onehot_sentiment= OneHotEncoder(inputCol="sentiment_ix", outputCol="sentiment_vec")
onehot_author= OneHotEncoder(inputCol="author_ix", outputCol="author_vec")
onehot_post_date= OneHotEncoder(inputCol="post_date_ix", outputCol="post_date_vec")
vectorAssembler_features = VectorAssembler(
inputCols=[ 'collapsed',
'no_follow',
'score',
'total_awards_received',
'stickied',
'is_PtoP',
'is_voted',
'author_postcnt',
'author_age',
'gme_mentioned',
'share_mentioned',
'Open',
'Close'],
outputCol= "features")
rf = RandomForestClassifier(labelCol="sentiment_ix", featuresCol="features", numTrees=50, weightCol='weight_sen')
labelConverter = IndexToString(inputCol="prediction",
outputCol="predictedsentiment",
labels= stringIndexer_sentiment.fit(df).labels)
pipeline_rf = Pipeline(stages=[stringIndexer_sentiment,
stringIndexer_author,
stringIndexer_post_date,
onehot_sentiment,
onehot_author,
onehot_post_date,
vectorAssembler_features,
rf,labelConverter])
model_rf = pipeline_rf.fit(train_data)
model_rf.transform(train_data)
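The fitted pipeline exposes its trained stages, so the forest's feature importances can be inspected directly. A sketch, assuming the RF model is the second-to-last stage as defined above:
# Pair each assembled input column with its learned importance
rf_model = model_rf.stages[-2]  # the fitted RandomForestClassificationModel
for name, imp in zip(vectorAssembler_features.getInputCols(), rf_model.featureImportances.toArray()):
    print("%-22s %.4f" % (name, imp))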
To evaluate the model, use test data.
predictions_rf = model_rf.transform(test_data)
LABEL = "sentiment_ix"
evaluatorRF = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="accuracy")
accuracy = evaluatorRF.evaluate(predictions_rf)
mse = RegressionEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="mse").evaluate(predictions_rf)
var = RegressionEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="var").evaluate(predictions_rf)
recall = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedRecall").evaluate(predictions_rf)
tp = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedTruePositiveRate").evaluate(predictions_rf)
fp = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedFalsePositiveRate").evaluate(predictions_rf)
# Derive precision and F1 from the weighted TP/FP rates (an approximation; see the direct metrics sketch below)
precision = tp / (tp + fp)
f1 = 2 * (precision*recall) / (precision + recall)
print("Accuracy = %g" % accuracy)
print("Test Error = %g" % (1.0 - accuracy))
print("Recall = %g" % recall)
print("Precision = %g" % precision)
print("F1 = %g" % f1)
print("MSE = %g" % mse)
print("Var = %g" % var)
Accuracy = 0.703594
Test Error = 0.296406
Recall = 0.703594
Precision = 0.507044
F1 = 0.589364
MSE = 0.557026
Var = 0.189046
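MulticlassClassificationEvaluator can also report weighted precision and F1 directly, avoiding the hand derivation from TP/FP rates above (a sketch using the same evaluator API):
# Direct metric alternatives to the derived precision/F1
precision_w = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedPrecision").evaluate(predictions_rf)
f1_w = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="f1").evaluate(predictions_rf)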
from sklearn.metrics import confusion_matrix
y_pred = predictions_rf.select("prediction").collect()
y_orig = predictions_rf.select("sentiment_ix").collect()
cm = confusion_matrix(y_orig, y_pred)
print("Confusion Matrix:")
print(cm)
Confusion Matrix:
[[686786      0   8602]
 [204406      0   2195]
 [ 77056      0   6965]]
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
sn.heatmap(cm, annot=True,cmap="Blues")
plt.show()
In this section, we evaluate the model's area under the ROC curve. This requires the BinaryClassificationEvaluator (see the Spark ML documentation). To evaluate the model, use the test data.
evaluatorRF = BinaryClassificationEvaluator(labelCol="sentiment_ix", rawPredictionCol="prediction", metricName="areaUnderROC")
roc_result = evaluatorRF.evaluate(predictions_rf)
roc_result
0.5082585803262698
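An AUC near 0.5 here partly reflects scoring on the hard prediction column, which has only two distinct values and hence a single ROC operating point. A sketch using the classifier's continuous scores instead (the rawPrediction column produced by the RF stage):
evaluator_auc = BinaryClassificationEvaluator(labelCol="sentiment_ix", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
evaluator_auc.evaluate(predictions_rf)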
stringIndexer_sentiment = StringIndexer(inputCol="sentiment", outputCol="sentiment_ix")
stringIndexer_author = StringIndexer(inputCol="author", outputCol="author_ix")
stringIndexer_post_date = StringIndexer(inputCol="post_date", outputCol="post_date_ix")
grade_label_fit = stringIndexer_sentiment.fit(df)
grade_label_fit.labels
['positive', 'negative']
onehot_sentiment= OneHotEncoder(inputCol="sentiment_ix", outputCol="sentiment_vec")
onehot_author= OneHotEncoder(inputCol="author_ix", outputCol="author_vec")
onehot_post_date= OneHotEncoder(inputCol="post_date_ix", outputCol="post_date_vec")
vectorAssembler_features = VectorAssembler(
inputCols=[ 'collapsed',
'no_follow',
'score',
'total_awards_received',
'stickied',
'is_PtoP',
'is_voted',
'author_postcnt',
'author_age',
'gme_mentioned',
'share_mentioned',
'Open',
'High',
'Low',
'Close'],
outputCol= "features")
rf1 = RandomForestClassifier(labelCol="sentiment_ix", featuresCol="features", numTrees=100, weightCol='weight_sen',maxDepth=10, maxBins=64)
labelConverter = IndexToString(inputCol="prediction",
outputCol="predictedSentiment",
labels= stringIndexer_sentiment.fit(df).labels)
pipeline_rf1 = Pipeline(stages=[stringIndexer_sentiment,
stringIndexer_author,
stringIndexer_post_date,
onehot_sentiment,
onehot_author,
onehot_post_date,
vectorAssembler_features,
rf1,labelConverter])
model_rf1 = pipeline_rf1.fit(train_data)
model_rf1.transform(train_data)
DataFrame[parent_id: string, author: string, id: string, author_premium: boolean, author_flair_richtext: string, author_flair_text: string, author_flair_text_color: string, body: string, collapsed: boolean, controversiality: bigint, no_follow: boolean, score: bigint, send_replies: boolean, total_awards_received: bigint, stickied: boolean, author_created_time: string, created_time: string, post_date: string, is_PtoP: int, is_BuckleUp: int, is_voted: int, author_postcnt: bigint, is_popular_commented: int, is_weekend: int, author_age: int, gme_mentioned: int, sentiment: string, blackrock_mentioned: boolean, share_mentioned: boolean, weight_sen: double, sentiment_ix: double, author_ix: double, post_date_ix: double, sentiment_vec: vector, author_vec: vector, post_date_vec: vector, features: vector, rawPrediction: vector, probability: vector, prediction: double, predictedSentiment: string]
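Rather than refitting hand-picked variants, the hyperparameter search could be automated with Spark's CrossValidator. A sketch, assuming the pipeline above; the grid values are illustrative and the fit is expensive at this data size:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
grid = (ParamGridBuilder()
        .addGrid(rf1.numTrees, [50, 100])
        .addGrid(rf1.maxDepth, [5, 10])
        .build())
cv = CrossValidator(estimator=pipeline_rf1, estimatorParamMaps=grid,
                    evaluator=MulticlassClassificationEvaluator(labelCol="sentiment_ix", metricName="f1"),
                    numFolds=3)
# cv_model = cv.fit(train_data)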
To evaluate the model, use test data.
predictions_rf1 = model_rf1.transform(test_data)
LABEL = "sentiment_ix"
evaluatorRF = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="accuracy")
accuracy = evaluatorRF.evaluate(predictions_rf1)
mse = RegressionEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="mse").evaluate(predictions_rf1)
var = RegressionEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="var").evaluate(predictions_rf1)
recall = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedRecall").evaluate(predictions_rf1)
tp = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedTruePositiveRate").evaluate(predictions_rf1)
fp = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedFalsePositiveRate").evaluate(predictions_rf1)
precision = tp / (tp + fp)
f1 = 2 * (precision*recall) / (precision + recall)
print("Accuracy = %g" % accuracy)
#print("Test Error = %g" % (1.0 - accuracy))
print("Recall = %g" % recall)
print("Precision = %g" % precision)
print("F1 = %g" % f1)
print("MSE = %g" % mse)
print("Var = %g" % var)
Accuracy = 0.765896
Recall = 0.765896
Precision = 0.499846
F1 = 0.60491
MSE = 0.249464
Var = 0.0724037
from sklearn.metrics import confusion_matrix
y_pred=predictions_rf1.select("prediction").collect()
y_orig=predictions_rf1.select("sentiment_ix").collect()
22/04/30 10:28:07 WARN DAGScheduler: Broadcasting large task binary with size 6.1 MiB
cm = confusion_matrix(y_orig, y_pred)
print("Confusion Matrix:")
print(cm)
Confusion Matrix:
[[690930     14   4619]
 [205353     29   1213]
 [     0      0      0]]
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
sn.heatmap(cm, annot=True,cmap="Blues")
plt.show()
evaluatorRF = BinaryClassificationEvaluator(labelCol="sentiment_ix", rawPredictionCol="prediction", metricName="areaUnderROC")
roc_result = evaluatorRF.evaluate(predictions_rf1)
roc_result
22/04/30 10:28:01 WARN DAGScheduler: Broadcasting large task binary with size 6.1 MiB
0.49967507841585135
stringIndexer_sentiment = StringIndexer(inputCol="sentiment", outputCol="sentiment_ix")
stringIndexer_author = StringIndexer(inputCol="author", outputCol="author_ix")
stringIndexer_post_date = StringIndexer(inputCol="post_date", outputCol="post_date_ix")
grade_label_fit = stringIndexer_sentiment.fit(df)
grade_label_fit.labels
['positive', 'negative']
onehot_sentiment= OneHotEncoder(inputCol="sentiment_ix", outputCol="sentiment_vec")
onehot_author= OneHotEncoder(inputCol="author_ix", outputCol="author_vec")
onehot_post_date= OneHotEncoder(inputCol="post_date_ix", outputCol="post_date_vec")
vectorAssembler_features = VectorAssembler(
inputCols=[ 'collapsed',
'no_follow',
'score',
'total_awards_received',
'stickied',
'is_PtoP',
'is_voted',
'author_postcnt',
'author_age',
'gme_mentioned',
'share_mentioned' ],
outputCol= "features")
#lr = LogisticRegression(labelCol="sentiment_ix", featuresCol="features", weightCol='weight_sen')
# layers: the input size must equal the feature vector length (11 here); the output size is the number of classes
mlp = MultilayerPerceptronClassifier(labelCol="sentiment_ix", featuresCol="features", layers=[11, 2, 2], seed=123)
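The input width can also be derived from the assembler rather than hard-coded, which keeps the network in sync with the feature list (a sketch; this works here because every input column is a scalar):
# Equivalent definition with the input width derived from the assembler
n_features = len(vectorAssembler_features.getInputCols())  # 11 here
mlp = MultilayerPerceptronClassifier(labelCol="sentiment_ix", featuresCol="features",
                                     layers=[n_features, 2, 2], seed=123)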
labelConverter = IndexToString(inputCol="prediction",
outputCol="predictedSentiment",
labels= stringIndexer_sentiment.fit(df).labels)
pipeline_mlp = Pipeline(stages=[stringIndexer_sentiment,
stringIndexer_author,
stringIndexer_post_date,
onehot_sentiment,
onehot_author,
onehot_post_date,
vectorAssembler_features,
mlp,labelConverter])
model_mlp = pipeline_mlp.fit(train_data)
model_mlp.transform(train_data)
DataFrame[parent_id: string, author: string, id: string, author_premium: boolean, author_flair_richtext: string, author_flair_text: string, author_flair_text_color: string, body: string, collapsed: boolean, controversiality: bigint, no_follow: boolean, score: bigint, send_replies: boolean, total_awards_received: bigint, stickied: boolean, author_created_time: string, created_time: string, post_date: string, is_PtoP: int, is_BuckleUp: int, is_voted: int, author_postcnt: bigint, is_popular_commented: int, is_weekend: int, author_age: int, gme_mentioned: int, sentiment: string, blackrock_mentioned: boolean, share_mentioned: boolean, weight_sen: double, sentiment_ix: double, author_ix: double, post_date_ix: double, sentiment_vec: vector, author_vec: vector, post_date_vec: vector, features: vector, rawPrediction: vector, probability: vector, prediction: double, predictedSentiment: string]
To evaluate the model, use test data.
predictions_mlp = model_mlp.transform(test_data)
LABEL = "sentiment_ix"
evaluatorRF = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="accuracy")
accuracy = evaluatorRF.evaluate(predictions_mlp)
mse = RegressionEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="mse").evaluate(predictions_mlp)
var = RegressionEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="var").evaluate(predictions_mlp)
recall = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedRecall").evaluate(predictions_mlp)
tp = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedTruePositiveRate").evaluate(predictions_mlp)
fp = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedFalsePositiveRate").evaluate(predictions_mlp)
precision = tp / (tp + fp)
f1 = 2 * (precision*recall) / (precision + recall)
print("Accuracy = %g" % accuracy)
print("Recall = %g" % recall)
print("Precision = %g" % precision)
print("F1 = %g" % f1)
print("MSE = %g" % mse)
print("Var = %g" % var)
Accuracy = 0.500303
Recall = 0.500303
Precision = 0.519612
F1 = 0.509774
MSE = 0.499697
Var = 0.336474
from sklearn.metrics import confusion_matrix
y_pred=predictions_mlp.select("prediction").collect()
y_orig=predictions_mlp.select("sentiment_ix").collect()
cm = confusion_matrix(y_orig, y_pred)
print("Confusion Matrix:")
print(cm)
Confusion Matrix:
[[337071 358492]
 [ 92314 114281]]
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
sn.heatmap(cm, annot=True,cmap="Blues")
plt.show()
evaluatorRF = BinaryClassificationEvaluator(labelCol="sentiment_ix", rawPredictionCol="prediction", metricName="areaUnderROC")
roc_result = evaluatorRF.evaluate(predictions_mlp)
roc_result
0.5188830430816718
#lr = LogisticRegression(labelCol="sentiment_ix", featuresCol="features", weightCol='weight_sen')
# the input layer must still match the 11-column feature vector; only the hidden layer is varied here
mlp1 = MultilayerPerceptronClassifier(labelCol="sentiment_ix", featuresCol="features", layers=[11, 5, 2], seed=123)
labelConverter = IndexToString(inputCol="prediction",
outputCol="predictedSentiment",
labels= stringIndexer_sentiment.fit(df).labels)
pipeline_mlp1 = Pipeline(stages=[stringIndexer_sentiment,
stringIndexer_author,
stringIndexer_post_date,
onehot_sentiment,
onehot_author,
onehot_post_date,
vectorAssembler_features,
mlp1,labelConverter])
model_mlp1 = pipeline_mlp1.fit(train_data)
model_mlp1.transform(train_data)
DataFrame[parent_id: string, author: string, id: string, author_premium: boolean, author_flair_richtext: string, author_flair_text: string, author_flair_text_color: string, body: string, collapsed: boolean, controversiality: bigint, no_follow: boolean, score: bigint, send_replies: boolean, total_awards_received: bigint, stickied: boolean, author_created_time: string, created_time: string, post_date: string, is_PtoP: int, is_BuckleUp: int, is_voted: int, author_postcnt: bigint, is_popular_commented: int, is_weekend: int, author_age: int, gme_mentioned: int, sentiment: string, blackrock_mentioned: boolean, share_mentioned: boolean, weight_sen: double, sentiment_ix: double, author_ix: double, post_date_ix: double, sentiment_vec: vector, author_vec: vector, post_date_vec: vector, features: vector, rawPrediction: vector, probability: vector, prediction: double, predictedSentiment: string]
To evaluate the model, use test data.
predictions_mlp1 = model_mlp1.transform(test_data)
LABEL = "sentiment_ix"
evaluatorRF = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="accuracy")
accuracy = evaluatorRF.evaluate(predictions_mlp1)
mse = RegressionEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="mse").evaluate(predictions_mlp1)
var = RegressionEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="var").evaluate(predictions_mlp1)
recall = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedRecall").evaluate(predictions_mlp1)
tp = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedTruePositiveRate").evaluate(predictions_mlp1)
fp = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedFalsePositiveRate").evaluate(predictions_mlp1)
precision = tp / (tp + fp)
f1 = 2 * (precision*recall) / (precision + recall)
print("Accuracy = %g" % accuracy)
print("Recall = %g" % recall)
print("Precision = %g" % precision)
print("F1 = %g" % f1)
print("MSE = %g" % mse)
print("Var = %g" % var)
Accuracy = 0.500303
Recall = 0.500303
Precision = 0.519612
F1 = 0.509774
MSE = 0.499697
Var = 0.336474
from sklearn.metrics import confusion_matrix
y_pred=predictions_mlp1.select("prediction").collect()
y_orig=predictions_mlp1.select("sentiment_ix").collect()
cm = confusion_matrix(y_orig, y_pred)
print("Confusion Matrix:")
print(cm)
Confusion Matrix:
[[690930     14   4619]
 [205353     29   1213]
 [     0      0      0]]
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
sn.heatmap(cm, annot=True,cmap="Blues")
plt.show()
evaluatorRF = BinaryClassificationEvaluator(labelCol="sentiment_ix", rawPredictionCol="prediction", metricName="areaUnderROC")
roc_result = evaluatorRF.evaluate(predictions_mlp1)
roc_result
0.5188830430816718
model_rf1.write().save('s3://yl1269-labdata5/model_rf1')
22/04/30 10:40:38 WARN TaskSetManager: Stage 1214 contains a task of very large size (1177 KiB). The maximum recommended task size is 1000 KiB.
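The saved pipeline can be restored in a later session with PipelineModel.load, which is already imported above (a sketch):
# Reload the persisted pipeline and score the held-out prediction split
model_reloaded = PipelineModel.load('s3://yl1269-labdata5/model_rf1')
predictions_loaded = model_reloaded.transform(predict_data)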
spark.stop()