Goal: predict the sentiment of each post.
import findspark
findspark.init()
import pyspark.sql.functions as f
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.ml.feature import OneHotEncoder, StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, LogisticRegression, LinearSVC, MultilayerPerceptronClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.ml import Pipeline, Model
from pyspark.ml.pipeline import PipelineModel
spark = SparkSession.builder.appName("MLTask2").getOrCreate()
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/04 00:35:07 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
22/05/04 00:35:15 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Attempted to request executors before the AM has registered!
Make sure your SparkSession is active:
spark
SparkSession - in-memory
Create a DataFrame called df_in, which holds the results of the NLP project.
df_in = spark.read.parquet('s3://yl1269-labdata5/nlp_df/')
df_in.printSchema()
root
 |-- parent_id: string (nullable = true)
 |-- author: string (nullable = true)
 |-- id: string (nullable = true)
 |-- author_premium: boolean (nullable = true)
 |-- author_flair_richtext: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- author_flair_text_color: string (nullable = true)
 |-- body: string (nullable = true)
 |-- collapsed: boolean (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- no_follow: boolean (nullable = true)
 |-- score: long (nullable = true)
 |-- send_replies: boolean (nullable = true)
 |-- total_awards_received: long (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- author_created_time: string (nullable = true)
 |-- created_time: string (nullable = true)
 |-- post_date: string (nullable = true)
 |-- is_PtoP: integer (nullable = true)
 |-- is_BuckleUp: integer (nullable = true)
 |-- is_voted: integer (nullable = true)
 |-- author_postcnt: long (nullable = true)
 |-- is_popular_commented: integer (nullable = true)
 |-- is_weekend: integer (nullable = true)
 |-- author_age: integer (nullable = true)
 |-- gme_mentioned: integer (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- blackrock_mentioned: boolean (nullable = true)
 |-- share_mentioned: boolean (nullable = true)
df_stock = spark.read.csv('s3://yl1269-labdata5/GME.csv',header=True)
df_stock.show(10)
+----------+----------+----------+----------+----------+----------+--------+
|      Date|      Open|      High|       Low|     Close| Adj Close|  Volume|
+----------+----------+----------+----------+----------+----------+--------+
|2021-03-01|104.540001|133.990005| 99.970001|120.400002|120.400002|49794000|
|2021-03-02|116.930000|133.199997|112.199997|118.180000|118.180000|33783000|
|2021-03-03|122.510002|127.750000|113.120003|124.180000|124.180000|19273900|
|2021-03-04|125.000000|147.869995|115.300003|132.350006|132.350006|32606900|
|2021-03-05|128.169998|151.529999|127.500000|137.740005|137.740005|30733700|
|2021-03-08|154.889999|210.869995|146.100006|194.500000|194.500000|63565600|
|2021-03-09|217.710007|249.850006|208.509995|246.899994|246.899994|39099300|
|2021-03-10|269.429993|348.500000|172.000000|265.000000|265.000000|71570600|
|2021-03-11|241.639999|281.500000|232.600006|260.000000|260.000000|28312500|
|2021-03-12|275.000000|295.500000|262.269989|264.500000|264.500000|25845900|
+----------+----------+----------+----------+----------+----------+--------+
only showing top 10 rows
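Note that with header=True alone every column is read as a string, which is why the price columns are cast to double further down. A sketch of the alternative, assuming the same CSV layout, is to let Spark infer the types at read time:
# Alternative (sketch): infer numeric column types while reading the CSV
df_stock = spark.read.csv('s3://yl1269-labdata5/GME.csv', header=True, inferSchema=True)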
# There is no stock data on weekends, so keep only the weekday rows of df_in
df = df_in.filter(col("is_weekend")== 0).dropna()
df = df.drop("is_weekend")
#Add usr activity column
df_act = df.groupby("post_date").agg(f.count(col("id")).alias("user_activity"))
df = df.join(df_act, "post_date","left")
# Examine the relationships between columns
import numpy as np
import seaborn as sns
# adjust figure size and font size
sns.set(rc={"figure.figsize": (20, 12)})
sns.set(font_scale=1)
# compute the correlation matrix on a 100,000-row sample using the Kendall method
corr = df.limit(100000).toPandas().corr(method='kendall')
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr,annot=True, fmt='.2f', square=True, mask = mask, cmap="Blues");
/mnt/miniconda/lib/python3.7/site-packages/scipy/stats/stats.py:4812: RuntimeWarning: overflow encountered in long_scalars
  (2 * xtie * ytie) / m + x0 * y0 / (9 * m * (size - 2)))
/mnt/miniconda/lib/python3.7/site-packages/scipy/stats/stats.py:4814: RuntimeWarning: invalid value encountered in sqrt
  np.sqrt(var) / np.sqrt(2)))
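The overflow warnings come from Kendall tie handling on large integer columns in this pandas/SciPy version; pandas also silently drops non-numeric columns. A sketch that makes the numeric selection explicit before converting to pandas (the type list is assumed from the schema above):
# Keep only numeric columns before sampling and computing correlations
num_cols = [fld.name for fld in df.schema.fields
            if isinstance(fld.dataType, (IntegerType, LongType, DoubleType))]
corr = df.select(num_cols).limit(100000).toPandas().corr(method='kendall')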
df = df.dropna()
df.groupby('sentiment').count().show()
+---------+-------+
|sentiment|  count|
+---------+-------+
| positive|3209651|
|  neutral| 394656|
| negative| 961150|
+---------+-------+
# Compute class weights; neutral and negative are grouped together as the non-positive class
n = df.count()
n_sen_p = df.filter(col('sentiment')=='positive').count()
n_sen_n = n - n_sen_p
c = 2  # number of classes after grouping
weight_sen_p = n/(c*n_sen_p)
weight_sen_n = n/(c*n_sen_n)
# Assign the weights to a new column
df = df.withColumn('weight_sen', f.when(col('sentiment')=='positive', weight_sen_p).otherwise(weight_sen_n))
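As a quick sanity check against the counts above: n = 4,565,457 and n_sen_p = 3,209,651, so weight_sen_p = n/(2 * 3,209,651) ≈ 0.71, while the remaining 1,355,806 neutral and negative rows get weight_sen_n = n/(2 * 1,355,806) ≈ 1.68. Minority-class rows therefore count roughly 2.4 times as much during training, which balances the two classes.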
#join stock data
df = df.join(df_stock,df.post_date == df_stock.Date,"left")
# Cast the stock columns from string to double, and no_follow from boolean to integer
df = df.withColumn("Open", col("Open").cast(DoubleType()))\
       .withColumn("High", col("High").cast(DoubleType()))\
       .withColumn("Low", col("Low").cast(DoubleType()))\
       .withColumn("Close", col("Close").cast(DoubleType()))\
       .withColumn("Volume", col("Volume").cast(DoubleType()))\
       .withColumn("no_follow", col("no_follow").cast(IntegerType()))
df.count()
4170801
train_data, test_data, predict_data = df.randomSplit([0.8, 0.18, 0.02], 24)
After splitting into three datasets, report the number of rows for each split.
print("Number of training records: " + str(train_data.count()))
print("Number of testing records : " + str(test_data.count()))
print("Number of prediction records : " + str(predict_data.count()))
Number of training records: 3336530
Number of testing records : 751268
Number of prediction records : 83003
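One caveat: randomSplit is only reproducible across actions if the input's lineage is stable, because Spark may recompute the DataFrame for each count. A common safeguard (a sketch, optional if each split is materialized only once):
# Cache before splitting so repeated actions see identical splits
df = df.cache()
train_data, test_data, predict_data = df.randomSplit([0.8, 0.18, 0.02], seed=24)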
train_data.printSchema()
root
 |-- post_date: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- author: string (nullable = true)
 |-- id: string (nullable = true)
 |-- author_premium: boolean (nullable = true)
 |-- author_flair_richtext: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- author_flair_text_color: string (nullable = true)
 |-- body: string (nullable = true)
 |-- collapsed: boolean (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- no_follow: integer (nullable = true)
 |-- score: long (nullable = true)
 |-- send_replies: boolean (nullable = true)
 |-- total_awards_received: long (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- author_created_time: string (nullable = true)
 |-- created_time: string (nullable = true)
 |-- is_PtoP: integer (nullable = true)
 |-- is_BuckleUp: integer (nullable = true)
 |-- is_voted: integer (nullable = true)
 |-- author_postcnt: long (nullable = true)
 |-- is_popular_commented: integer (nullable = true)
 |-- author_age: integer (nullable = true)
 |-- gme_mentioned: integer (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- blackrock_mentioned: boolean (nullable = true)
 |-- share_mentioned: boolean (nullable = true)
 |-- user_activity: long (nullable = true)
 |-- weight_sen: double (nullable = false)
 |-- Date: string (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Adj Close: string (nullable = true)
 |-- Volume: double (nullable = true)
stringIndexer_sentiment = StringIndexer(inputCol="sentiment", outputCol="sentiment_ix")
stringIndexer_author = StringIndexer(inputCol="author", outputCol="author_ix")
stringIndexer_post_date = StringIndexer(inputCol="post_date", outputCol="post_date_ix")
grade_label_fit = stringIndexer_sentiment.fit(df)
grade_label_fit.labels
['positive', 'negative']
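StringIndexer orders labels by descending frequency, so 'positive', the majority class, maps to index 0.0. A minimal toy sketch (hypothetical data) that confirms this behavior:
toy = spark.createDataFrame([("positive",), ("positive",), ("negative",)], ["sentiment"])
StringIndexer(inputCol="sentiment", outputCol="ix").fit(toy).transform(toy).show()
# 'positive' -> 0.0 (most frequent), 'negative' -> 1.0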
onehot_sentiment= OneHotEncoder(inputCol="sentiment_ix", outputCol="sentiment_vec")
onehot_author= OneHotEncoder(inputCol="author_ix", outputCol="author_vec")
onehot_post_date= OneHotEncoder(inputCol="post_date_ix", outputCol="post_date_vec")
vectorAssembler_features = VectorAssembler(
inputCols=[ 'collapsed',
'no_follow',
'score',
'total_awards_received',
'stickied',
'is_PtoP',
'is_voted',
'author_postcnt',
'author_age',
'gme_mentioned',
'share_mentioned',
'Open',
'Close'],
outputCol= "features")
rf = RandomForestClassifier(labelCol="sentiment_ix", featuresCol="features", numTrees=50, weightCol='weight_sen')
labelConverter = IndexToString(inputCol="prediction",
outputCol="predictedsentiment",
labels= stringIndexer_sentiment.fit(df).labels)
pipeline_rf = Pipeline(stages=[stringIndexer_sentiment,
stringIndexer_author,
stringIndexer_post_date,
onehot_sentiment,
onehot_author,
onehot_post_date,
vectorAssembler_features,
rf,labelConverter])
model_rf = pipeline_rf.fit(train_data)
model_rf.transform(train_data)
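The fitted pipeline exposes its trained stages, so the forest's feature importances can be inspected directly. A sketch, assuming the RF model is the second-to-last stage as defined above:
# Pair each assembled input column with its learned importance
rf_model = model_rf.stages[-2]  # the fitted RandomForestClassificationModel
for name, imp in zip(vectorAssembler_features.getInputCols(), rf_model.featureImportances.toArray()):
    print("%-22s %.4f" % (name, imp))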
To evaluate the model, use test data.
predictions_rf = model_rf.transform(test_data)
LABEL = "sentiment_ix"
evaluatorRF = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="accuracy")
accuracy = evaluatorRF.evaluate(predictions_rf)
mse = RegressionEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="mse").evaluate(predictions_rf)
var = RegressionEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="var").evaluate(predictions_rf)
recall = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedRecall").evaluate(predictions_rf)
tp = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedTruePositiveRate").evaluate(predictions_rf)
fp = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedFalsePositiveRate").evaluate(predictions_rf)
# Derive precision and F1 from the weighted TP/FP rates (an approximation; see the direct metrics sketch below)
precision = tp / (tp + fp)
f1 = 2 * (precision*recall) / (precision + recall)
print("Accuracy = %g" % accuracy)
print("Test Error = %g" % (1.0 - accuracy))
print("Recall = %g" % recall)
print("Precision = %g" % precision)
print("F1 = %g" % f1)
print("MSE = %g" % mse)
print("Var = %g" % var)
Accuracy = 0.703594
Test Error = 0.296406
Recall = 0.703594
Precision = 0.507044
F1 = 0.589364
MSE = 0.557026
Var = 0.189046
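MulticlassClassificationEvaluator can also report weighted precision and F1 directly, avoiding the hand derivation from TP/FP rates above (a sketch using the same evaluator API):
# Direct metric alternatives to the derived precision/F1
precision_w = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedPrecision").evaluate(predictions_rf)
f1_w = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="f1").evaluate(predictions_rf)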
from sklearn.metrics import confusion_matrix
y_pred = predictions_rf.select("prediction").collect()
y_orig = predictions_rf.select("sentiment_ix").collect()
cm = confusion_matrix(y_orig, y_pred)
print("Confusion Matrix:")
print(cm)
Confusion Matrix:
[[686786      0   8602]
 [204406      0   2195]
 [ 77056      0   6965]]
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
sn.heatmap(cm, annot=True,cmap="Blues")
plt.show()
In this section, we evaluate the model's area under the ROC curve. This requires the BinaryClassificationEvaluator (see the Spark ML documentation). To evaluate the model, use the test data.
evaluatorRF = BinaryClassificationEvaluator(labelCol="sentiment_ix", rawPredictionCol="prediction", metricName="areaUnderROC")
roc_result = evaluatorRF.evaluate(predictions_rf)
roc_result
0.5082585803262698
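An AUC near 0.5 here partly reflects scoring on the hard prediction column, which has only two distinct values and hence a single ROC operating point. A sketch using the classifier's continuous scores instead (the rawPrediction column produced by the RF stage):
evaluator_auc = BinaryClassificationEvaluator(labelCol="sentiment_ix", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
evaluator_auc.evaluate(predictions_rf)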
stringIndexer_sentiment = StringIndexer(inputCol="sentiment", outputCol="sentiment_ix")
stringIndexer_author = StringIndexer(inputCol="author", outputCol="author_ix")
stringIndexer_post_date = StringIndexer(inputCol="post_date", outputCol="post_date_ix")
grade_label_fit = stringIndexer_sentiment.fit(df)
grade_label_fit.labels
['positive', 'negative']
onehot_sentiment= OneHotEncoder(inputCol="sentiment_ix", outputCol="sentiment_vec")
onehot_author= OneHotEncoder(inputCol="author_ix", outputCol="author_vec")
onehot_post_date= OneHotEncoder(inputCol="post_date_ix", outputCol="post_date_vec")
vectorAssembler_features = VectorAssembler(
inputCols=[ 'collapsed',
'no_follow',
'score',
'total_awards_received',
'stickied',
'is_PtoP',
'is_voted',
'author_postcnt',
'author_age',
'gme_mentioned',
'share_mentioned',
'Open',
'High',
'Low',
'Close'],
outputCol= "features")
rf1 = RandomForestClassifier(labelCol="sentiment_ix", featuresCol="features", numTrees=100, weightCol='weight_sen',maxDepth=10, maxBins=64)
labelConverter = IndexToString(inputCol="prediction",
outputCol="predictedSentiment",
labels= stringIndexer_sentiment.fit(df).labels)
pipeline_rf1 = Pipeline(stages=[stringIndexer_sentiment,
stringIndexer_author,
stringIndexer_post_date,
onehot_sentiment,
onehot_author,
onehot_post_date,
vectorAssembler_features,
rf1,labelConverter])
model_rf1 = pipeline_rf1.fit(train_data)
model_rf1.transform(train_data)
DataFrame[parent_id: string, author: string, id: string, author_premium: boolean, author_flair_richtext: string, author_flair_text: string, author_flair_text_color: string, body: string, collapsed: boolean, controversiality: bigint, no_follow: boolean, score: bigint, send_replies: boolean, total_awards_received: bigint, stickied: boolean, author_created_time: string, created_time: string, post_date: string, is_PtoP: int, is_BuckleUp: int, is_voted: int, author_postcnt: bigint, is_popular_commented: int, is_weekend: int, author_age: int, gme_mentioned: int, sentiment: string, blackrock_mentioned: boolean, share_mentioned: boolean, weight_sen: double, sentiment_ix: double, author_ix: double, post_date_ix: double, sentiment_vec: vector, author_vec: vector, post_date_vec: vector, features: vector, rawPrediction: vector, probability: vector, prediction: double, predictedSentiment: string]
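Rather than refitting hand-picked variants, the hyperparameter search could be automated with Spark's CrossValidator. A sketch, assuming the pipeline above; the grid values are illustrative and the fit is expensive at this data size:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
grid = (ParamGridBuilder()
        .addGrid(rf1.numTrees, [50, 100])
        .addGrid(rf1.maxDepth, [5, 10])
        .build())
cv = CrossValidator(estimator=pipeline_rf1, estimatorParamMaps=grid,
                    evaluator=MulticlassClassificationEvaluator(labelCol="sentiment_ix", metricName="f1"),
                    numFolds=3)
# cv_model = cv.fit(train_data)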
To evaluate the model, use test data.
predictions_rf1 = model_rf1.transform(test_data)
LABEL = "sentiment_ix"
evaluatorRF = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="accuracy")
accuracy = evaluatorRF.evaluate(predictions_rf1)
mse = RegressionEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="mse").evaluate(predictions_rf1)
var = RegressionEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="var").evaluate(predictions_rf1)
recall = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedRecall").evaluate(predictions_rf1)
tp = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedTruePositiveRate").evaluate(predictions_rf1)
fp = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedFalsePositiveRate").evaluate(predictions_rf1)
precision = tp / (tp + fp)
f1 = 2 * (precision*recall) / (precision + recall)
print("Accuracy = %g" % accuracy)
#print("Test Error = %g" % (1.0 - accuracy))
print("Recall = %g" % recall)
print("Precision = %g" % precision)
print("F1 = %g" % f1)
print("MSE = %g" % mse)
print("Var = %g" % var)
Accuracy = 0.765896
Recall = 0.765896
Precision = 0.499846
F1 = 0.60491
MSE = 0.249464
Var = 0.0724037
from sklearn.metrics import confusion_matrix
y_pred=predictions_rf1.select("prediction").collect()
y_orig=predictions_rf1.select("sentiment_ix").collect()
22/04/30 10:28:07 WARN DAGScheduler: Broadcasting large task binary with size 6.1 MiB
cm = confusion_matrix(y_orig, y_pred)
print("Confusion Matrix:")
print(cm)
Confusion Matrix:
[[690930     14   4619]
 [205353     29   1213]
 [     0      0      0]]
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
sn.heatmap(cm, annot=True,cmap="Blues")
plt.show()
evaluatorRF = BinaryClassificationEvaluator(labelCol="sentiment_ix", rawPredictionCol="prediction", metricName="areaUnderROC")
roc_result = evaluatorRF.evaluate(predictions_rf1)
roc_result
22/04/30 10:28:01 WARN DAGScheduler: Broadcasting large task binary with size 6.1 MiB
0.49967507841585135
stringIndexer_sentiment = StringIndexer(inputCol="sentiment", outputCol="sentiment_ix")
stringIndexer_author = StringIndexer(inputCol="author", outputCol="author_ix")
stringIndexer_post_date = StringIndexer(inputCol="post_date", outputCol="post_date_ix")
grade_label_fit = stringIndexer_sentiment.fit(df)
grade_label_fit.labels
['positive', 'negative']
onehot_sentiment= OneHotEncoder(inputCol="sentiment_ix", outputCol="sentiment_vec")
onehot_author= OneHotEncoder(inputCol="author_ix", outputCol="author_vec")
onehot_post_date= OneHotEncoder(inputCol="post_date_ix", outputCol="post_date_vec")
vectorAssembler_features = VectorAssembler(
inputCols=[ 'collapsed',
'no_follow',
'score',
'total_awards_received',
'stickied',
'is_PtoP',
'is_voted',
'author_postcnt',
'author_age',
'gme_mentioned',
'share_mentioned' ],
outputCol= "features")
#lr = LogisticRegression(labelCol="sentiment_ix", featuresCol="features", weightCol='weight_sen')
# layers: the input size must equal the feature vector length (11 here); the output size is the number of classes
mlp = MultilayerPerceptronClassifier(labelCol="sentiment_ix", featuresCol="features", layers=[11, 2, 2], seed=123)
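The input width can also be derived from the assembler rather than hard-coded, which keeps the network in sync with the feature list (a sketch; this works here because every input column is a scalar):
# Equivalent definition with the input width derived from the assembler
n_features = len(vectorAssembler_features.getInputCols())  # 11 here
mlp = MultilayerPerceptronClassifier(labelCol="sentiment_ix", featuresCol="features",
                                     layers=[n_features, 2, 2], seed=123)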
labelConverter = IndexToString(inputCol="prediction",
outputCol="predictedSentiment",
labels= stringIndexer_sentiment.fit(df).labels)
pipeline_mlp = Pipeline(stages=[stringIndexer_sentiment,
stringIndexer_author,
stringIndexer_post_date,
onehot_sentiment,
onehot_author,
onehot_post_date,
vectorAssembler_features,
mlp,labelConverter])
model_mlp = pipeline_mlp.fit(train_data)
model_mlp.transform(train_data)
DataFrame[parent_id: string, author: string, id: string, author_premium: boolean, author_flair_richtext: string, author_flair_text: string, author_flair_text_color: string, body: string, collapsed: boolean, controversiality: bigint, no_follow: boolean, score: bigint, send_replies: boolean, total_awards_received: bigint, stickied: boolean, author_created_time: string, created_time: string, post_date: string, is_PtoP: int, is_BuckleUp: int, is_voted: int, author_postcnt: bigint, is_popular_commented: int, is_weekend: int, author_age: int, gme_mentioned: int, sentiment: string, blackrock_mentioned: boolean, share_mentioned: boolean, weight_sen: double, sentiment_ix: double, author_ix: double, post_date_ix: double, sentiment_vec: vector, author_vec: vector, post_date_vec: vector, features: vector, rawPrediction: vector, probability: vector, prediction: double, predictedSentiment: string]
To evaluate the model, use test data.
predictions_mlp = model_mlp.transform(test_data)
LABEL = "sentiment_ix"
evaluatorRF = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="accuracy")
accuracy = evaluatorRF.evaluate(predictions_mlp)
mse = RegressionEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="mse").evaluate(predictions_mlp)
var = RegressionEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="var").evaluate(predictions_mlp)
recall = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedRecall").evaluate(predictions_mlp)
tp = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedTruePositiveRate").evaluate(predictions_mlp)
fp = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedFalsePositiveRate").evaluate(predictions_mlp)
precision = tp / (tp + fp)
f1 = 2 * (precision*recall) / (precision + recall)
print("Accuracy = %g" % accuracy)
print("Recall = %g" % recall)
print("Precision = %g" % precision)
print("F1 = %g" % f1)
print("MSE = %g" % mse)
print("Var = %g" % var)
Accuracy = 0.500303
Recall = 0.500303
Precision = 0.519612
F1 = 0.509774
MSE = 0.499697
Var = 0.336474
from sklearn.metrics import confusion_matrix
y_pred=predictions_mlp.select("prediction").collect()
y_orig=predictions_mlp.select("sentiment_ix").collect()
cm = confusion_matrix(y_orig, y_pred)
print("Confusion Matrix:")
print(cm)
Confusion Matrix:
[[337071 358492]
 [ 92314 114281]]
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
sn.heatmap(cm, annot=True,cmap="Blues")
plt.show()
evaluatorRF = BinaryClassificationEvaluator(labelCol="sentiment_ix", rawPredictionCol="prediction", metricName="areaUnderROC")
roc_result = evaluatorRF.evaluate(predictions_mlp)
roc_result
0.5188830430816718
#lr = LogisticRegression(labelCol="sentiment_ix", featuresCol="features", weightCol='weight_sen')
# the input layer must still match the 11-column feature vector; only the hidden layer is varied here
mlp1 = MultilayerPerceptronClassifier(labelCol="sentiment_ix", featuresCol="features", layers=[11, 5, 2], seed=123)
labelConverter = IndexToString(inputCol="prediction",
outputCol="predictedSentiment",
labels= stringIndexer_sentiment.fit(df).labels)
pipeline_mlp1 = Pipeline(stages=[stringIndexer_sentiment,
stringIndexer_author,
stringIndexer_post_date,
onehot_sentiment,
onehot_author,
onehot_post_date,
vectorAssembler_features,
mlp1,labelConverter])
model_mlp1 = pipeline_mlp1.fit(train_data)
model_mlp1.transform(train_data)
DataFrame[parent_id: string, author: string, id: string, author_premium: boolean, author_flair_richtext: string, author_flair_text: string, author_flair_text_color: string, body: string, collapsed: boolean, controversiality: bigint, no_follow: boolean, score: bigint, send_replies: boolean, total_awards_received: bigint, stickied: boolean, author_created_time: string, created_time: string, post_date: string, is_PtoP: int, is_BuckleUp: int, is_voted: int, author_postcnt: bigint, is_popular_commented: int, is_weekend: int, author_age: int, gme_mentioned: int, sentiment: string, blackrock_mentioned: boolean, share_mentioned: boolean, weight_sen: double, sentiment_ix: double, author_ix: double, post_date_ix: double, sentiment_vec: vector, author_vec: vector, post_date_vec: vector, features: vector, rawPrediction: vector, probability: vector, prediction: double, predictedSentiment: string]
To evaluate the model, use test data.
predictions_mlp1 = model_mlp1.transform(test_data)
LABEL = "sentiment_ix"
evaluatorRF = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="accuracy")
accuracy = evaluatorRF.evaluate(predictions_mlp1)
mse = RegressionEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="mse").evaluate(predictions_mlp1)
var = RegressionEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="var").evaluate(predictions_mlp1)
recall = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedRecall").evaluate(predictions_mlp1)
tp = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedTruePositiveRate").evaluate(predictions_mlp1)
fp = MulticlassClassificationEvaluator(labelCol=LABEL, predictionCol="prediction", metricName="weightedFalsePositiveRate").evaluate(predictions_mlp1)
precision = tp / (tp + fp)
f1 = 2 * (precision*recall) / (precision + recall)
print("Accuracy = %g" % accuracy)
print("Recall = %g" % recall)
print("Precision = %g" % precision)
print("F1 = %g" % f1)
print("MSE = %g" % mse)
print("Var = %g" % var)
Accuracy = 0.500303
Recall = 0.500303
Precision = 0.519612
F1 = 0.509774
MSE = 0.499697
Var = 0.336474
from sklearn.metrics import confusion_matrix
y_pred=predictions_mlp1.select("prediction").collect()
y_orig=predictions_mlp1.select("sentiment_ix").collect()
cm = confusion_matrix(y_orig, y_pred)
print("Confusion Matrix:")
print(cm)
Confusion Matrix:
[[690930     14   4619]
 [205353     29   1213]
 [     0      0      0]]
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
sn.heatmap(cm, annot=True,cmap="Blues")
plt.show()
evaluatorRF = BinaryClassificationEvaluator(labelCol="sentiment_ix", rawPredictionCol="prediction", metricName="areaUnderROC")
roc_result = evaluatorRF.evaluate(predictions_mlp1)
roc_result
0.5188830430816718
model_rf1.write().save('s3://yl1269-labdata5/model_rf1')
22/04/30 10:40:38 WARN TaskSetManager: Stage 1214 contains a task of very large size (1177 KiB). The maximum recommended task size is 1000 KiB.
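The saved pipeline can be restored in a later session with PipelineModel.load, which is already imported above (a sketch):
# Reload the persisted pipeline and score the held-out prediction split
model_reloaded = PipelineModel.load('s3://yl1269-labdata5/model_rf1')
predictions_loaded = model_reloaded.transform(predict_data)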
spark.stop()