import findspark
findspark.init()


import pyspark.sql.functions as f
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.types import *


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Reuse the active Spark session if there is one; otherwise start a new
# session whose application name is "reddit".
spark = (
    SparkSession.builder
    .appName("reddit")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/04 01:55:42 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
22/05/04 01:55:51 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Attempted to request executors before the AM has registered!


spark


# GME price history from CSV. The first row holds the column names; no schema
# is supplied or inferred, so every column is loaded as a string.
df_stock = spark.read.option("header", True).csv("s3://yl1269-labdata5/GME.csv")


# Posts dataset stored as parquet (it provides the post_date column used below).
df = spark.read.format("parquet").load("s3://yl1269-labdata5/nlp_df/")


# Number of posts per post_date, ordered by date, with the aggregate column
# renamed from Spark's default "count" to "post_cnt".
df_day = (
    df.groupBy("post_date")
    .count()
    .orderBy("post_date")
    .withColumnRenamed("count", "post_cnt")
)
df_day.show()

[Stage 118:==============================================>        (46 + 8) / 54]

+----------+--------+
| post_date|post_cnt|
+----------+--------+
|2021-03-16|       1|
|2021-03-19|       3|
|2021-03-20|       2|
|2021-03-22|       3|
|2021-03-23|       4|
|2021-03-24|      23|
|2021-03-25|       7|
|2021-03-27|       9|
|2021-03-28|      21|
|2021-03-29|      55|
|2021-03-30|      22|
|2021-03-31|      11|
|2021-04-01|       7|
|2021-04-02|       8|
|2021-04-03|       3|
|2021-04-04|      16|
|2021-04-05|   65462|
|2021-04-06|   68277|
|2021-04-07|   71902|
|2021-04-08|   82796|
+----------+--------+
only showing top 20 rows


# Attach the stock row whose Date equals each post_date. This is an inner
# join, so dates missing from either table are dropped from the result.
# NOTE: `df` is rebound here from the raw posts to the joined daily table.
df = (
    df_day.join(df_stock, on=df_day.post_date == df_stock.Date, how="inner")
    .orderBy("post_date")
)


df.printSchema()

root
 |-- post_date: string (nullable = true)
 |-- post_cnt: long (nullable = true)
 |-- Date: string (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)
 |-- Close: string (nullable = true)
 |-- Adj Close: string (nullable = true)
 |-- Volume: string (nullable = true)


# The stock columns were read from CSV as strings; convert each one to double
# in place (withColumn on an existing name keeps the column's position).
numeric_columns = ["Open", "High", "Low", "Close", "Volume", "Adj Close"]
for column_name in numeric_columns:
    df = df.withColumn(column_name, col(column_name).cast(DoubleType()))

# "Date" already carries the same value as the join key, so drop the duplicate
# and give the post count a more descriptive name.
df = df.withColumnRenamed("post_cnt", "User_Activity").drop("post_date")


df.show()

+-------------+----------+----------+----------+----------+----------+----------+---------+
|User_Activity|      Date|      Open|      High|       Low|     Close| Adj Close|   Volume|
+-------------+----------+----------+----------+----------+----------+----------+---------+
|            1|2021-03-16|203.160004|220.699997|172.350006|208.169998|208.169998|3.54229E7|
|            3|2021-03-19|195.729996|     227.0|182.660004|200.270004|200.270004|2.46773E7|
|            3|2021-03-22|205.259995|210.360001|186.199997|194.490005|194.490005|1.00615E7|
|            4|2021-03-23|     197.5|    201.75|177.550003|    181.75|    181.75|1.44291E7|
|           23|2021-03-24|157.979996|166.970001|118.620003|120.339996|120.339996|2.41779E7|
|            7|2021-03-25|123.489998|     187.5|116.900002|    183.75|    183.75|5.09623E7|
|           55|2021-03-29|    180.75|193.919998|173.509995|181.300003|181.300003|1.00422E7|
|           22|2021-03-30|     187.5|204.300003|     182.0|194.460007|194.460007|1.70949E7|
|           11|2021-03-31|     197.5|199.460007|187.110001|189.820007|189.820007|8393800.0|
|            7|2021-04-01|193.360001|196.970001|183.600006|191.449997|191.449997|9334300.0|
|        65462|2021-04-05|     171.0|     195.0|164.809998|186.949997|186.949997|1.40705E7|
|        68277|2021-04-06|185.210007|     192.0|183.559998|     184.5|     184.5|6218300.0|
|        71902|2021-04-07|183.220001|     184.5|176.110001|177.970001|177.970001|4768300.0|
|        82796|2021-04-08|185.880005|185.880005|164.300003|170.259995|170.259995|1.00474E7|
|        78806|2021-04-09|169.699997|171.580002|     153.0|158.360001|158.360001|9462400.0|
|        70783|2021-04-12|158.110001|163.899994|135.009995|141.089996|141.089996|1.66836E7|
|        73701|2021-04-13|141.880005|145.380005|     132.0|140.990005|140.990005|6806900.0|
|        82113|2021-04-14|143.570007|174.089996|     143.0|166.529999|166.529999|2.11381E7|
|        81740|2021-04-15|     163.0|    166.25|152.800003|156.440002|156.440002|7856800.0|
|        95353|2021-04-16|     156.0|160.199997|    151.25|154.690002|154.690002|5214700.0|
+-------------+----------+----------+----------+----------+----------+----------+---------+
only showing top 20 rows


# Notebook shell escape (IPython `!` syntax, not plain Python): install plotly
# using the pip executable that lives under /mnt/miniconda.
!/mnt/miniconda/bin/pip install plotly

Collecting plotly
  Downloading plotly-5.7.0-py2.py3-none-any.whl (28.8 MB)
     |████████████████████████████████| 28.8 MB 15.4 MB/s eta 0:00:01
Requirement already satisfied: six in /mnt/miniconda/lib/python3.7/site-packages (from plotly) (1.16.0)
Collecting tenacity>=6.2.0
  Downloading tenacity-8.0.1-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.7.0 tenacity-8.0.1


import pandas as pd
import datetime as dt
import plotly.express as px
import plotly.graph_objects as go


# Convert the joined Spark DataFrame into a pandas DataFrame for plotting.
# NOTE: `df` is rebound from a Spark DataFrame to a pandas DataFrame here, so
# Spark-only methods are no longer available on it after this cell.
df = df.toPandas()
# Bare expression: renders the pandas DataFrame as the cell output.
df


import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Plot daily user activity (bars, primary y-axis) against the closing price
# (line, secondary y-axis). The two series are on very different scales
# (tens of thousands of posts vs. prices in the low hundreds), so each gets
# its own y-axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Scatter(x=df["Date"], y=df["Close"], name="Close Price"),
    secondary_y=True,
)
fig.add_trace(
    go.Bar(x=df["Date"], y=df["User_Activity"], name="User_Activity"),
    secondary_y=False,
)

# Label each y-axis with the series it carries.
fig.update_yaxes(title_text="User_Activity", secondary_y=False)
fig.update_yaxes(title_text="Close Price", secondary_y=True)

# Fixed canvas size so the figure renders the same wherever it is embedded.
fig.update_layout(
    autosize=False,
    width=1000,
    height=500,
    title="Stock price and User Activity",
)

fig.show()


# Save the figure as an HTML file in the working directory.
fig.write_html("stock_activity.html")

	User_Activity	Date	Open	High	Low	Close	Adj Close	Volume
0	1	2021-03-16	203.160004	220.699997	172.350006	208.169998	208.169998	35422900.0
1	3	2021-03-19	195.729996	227.000000	182.660004	200.270004	200.270004	24677300.0
2	3	2021-03-22	205.259995	210.360001	186.199997	194.490005	194.490005	10061500.0
3	4	2021-03-23	197.500000	201.750000	177.550003	181.750000	181.750000	14429100.0
4	23	2021-03-24	157.979996	166.970001	118.620003	120.339996	120.339996	24177900.0
...	...	...	...	...	...	...	...	...
66	91922	2021-06-23	221.449997	222.570007	214.119995	219.339996	219.339996	3555100.0
67	90743	2021-06-24	221.160004	227.449997	211.600006	212.309998	212.309998	3863300.0
68	88709	2021-06-25	214.000000	214.199997	198.500000	209.509995	209.509995	12692700.0
69	73250	2021-06-28	211.250000	224.449997	210.199997	213.250000	213.250000	4879400.0
70	75512	2021-06-29	213.589996	215.220001	208.009995	210.880005	210.880005	2480000.0

EDA Improvement