import findspark
findspark.init()
import pyspark.sql.functions as f
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("reddit")
    .getOrCreate()
)
Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 22/05/04 01:55:42 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME. 22/05/04 01:55:51 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Attempted to request executors before the AM has registered!
Make sure your SparkSession is active:
spark
SparkSession - in-memory
# Inputs: the GME price history (CSV with a header row, so every column is
# read as a string) and the Reddit post data stored as parquet.
df_stock = spark.read.csv('s3://yl1269-labdata5/GME.csv', header=True)
df = spark.read.parquet('s3://yl1269-labdata5/nlp_df/')

# Number of posts per day, in ascending post_date order.
df_day = (
    df.groupBy("post_date")
    .count()
    .withColumnRenamed("count", "post_cnt")
    .orderBy("post_date")
)
df_day.show()
[Stage 118:==============================================> (46 + 8) / 54]
+----------+--------+ | post_date|post_cnt| +----------+--------+ |2021-03-16| 1| |2021-03-19| 3| |2021-03-20| 2| |2021-03-22| 3| |2021-03-23| 4| |2021-03-24| 23| |2021-03-25| 7| |2021-03-27| 9| |2021-03-28| 21| |2021-03-29| 55| |2021-03-30| 22| |2021-03-31| 11| |2021-04-01| 7| |2021-04-02| 8| |2021-04-03| 3| |2021-04-04| 16| |2021-04-05| 65462| |2021-04-06| 68277| |2021-04-07| 71902| |2021-04-08| 82796| +----------+--------+ only showing top 20 rows
# Attach each day's stock quote to that day's post count. The inner join
# keeps only dates present in both frames.
df = (
    df_day
    .join(df_stock, on=df_day["post_date"] == df_stock["Date"], how="inner")
    .orderBy("post_date")
)
df.printSchema()
root |-- post_date: string (nullable = true) |-- post_cnt: long (nullable = true) |-- Date: string (nullable = true) |-- Open: string (nullable = true) |-- High: string (nullable = true) |-- Low: string (nullable = true) |-- Close: string (nullable = true) |-- Adj Close: string (nullable = true) |-- Volume: string (nullable = true)
# The stock columns were read from CSV as strings; convert them to double.
# withColumn on an existing name replaces it in place, so column order is kept.
numeric_columns = ["Open", "High", "Low", "Close", "Volume", "Adj Close"]
for column_name in numeric_columns:
    df = df.withColumn(column_name, col(column_name).cast("double"))

# "Date" already carries the day, so the duplicate join key is dropped.
df = df.withColumnRenamed("post_cnt", "User_Activity").drop("post_date")
df.show()
+-------------+----------+----------+----------+----------+----------+----------+---------+ |User_Activity| Date| Open| High| Low| Close| Adj Close| Volume| +-------------+----------+----------+----------+----------+----------+----------+---------+ | 1|2021-03-16|203.160004|220.699997|172.350006|208.169998|208.169998|3.54229E7| | 3|2021-03-19|195.729996| 227.0|182.660004|200.270004|200.270004|2.46773E7| | 3|2021-03-22|205.259995|210.360001|186.199997|194.490005|194.490005|1.00615E7| | 4|2021-03-23| 197.5| 201.75|177.550003| 181.75| 181.75|1.44291E7| | 23|2021-03-24|157.979996|166.970001|118.620003|120.339996|120.339996|2.41779E7| | 7|2021-03-25|123.489998| 187.5|116.900002| 183.75| 183.75|5.09623E7| | 55|2021-03-29| 180.75|193.919998|173.509995|181.300003|181.300003|1.00422E7| | 22|2021-03-30| 187.5|204.300003| 182.0|194.460007|194.460007|1.70949E7| | 11|2021-03-31| 197.5|199.460007|187.110001|189.820007|189.820007|8393800.0| | 7|2021-04-01|193.360001|196.970001|183.600006|191.449997|191.449997|9334300.0| | 65462|2021-04-05| 171.0| 195.0|164.809998|186.949997|186.949997|1.40705E7| | 68277|2021-04-06|185.210007| 192.0|183.559998| 184.5| 184.5|6218300.0| | 71902|2021-04-07|183.220001| 184.5|176.110001|177.970001|177.970001|4768300.0| | 82796|2021-04-08|185.880005|185.880005|164.300003|170.259995|170.259995|1.00474E7| | 78806|2021-04-09|169.699997|171.580002| 153.0|158.360001|158.360001|9462400.0| | 70783|2021-04-12|158.110001|163.899994|135.009995|141.089996|141.089996|1.66836E7| | 73701|2021-04-13|141.880005|145.380005| 132.0|140.990005|140.990005|6806900.0| | 82113|2021-04-14|143.570007|174.089996| 143.0|166.529999|166.529999|2.11381E7| | 81740|2021-04-15| 163.0| 166.25|152.800003|156.440002|156.440002|7856800.0| | 95353|2021-04-16| 156.0|160.199997| 151.25|154.690002|154.690002|5214700.0| +-------------+----------+----------+----------+----------+----------+----------+---------+ only showing top 20 rows
!/mnt/miniconda/bin/pip install plotly
Collecting plotly Downloading plotly-5.7.0-py2.py3-none-any.whl (28.8 MB) |████████████████████████████████| 28.8 MB 15.4 MB/s eta 0:00:01 Requirement already satisfied: six in /mnt/miniconda/lib/python3.7/site-packages (from plotly) (1.16.0) Collecting tenacity>=6.2.0 Downloading tenacity-8.0.1-py3-none-any.whl (24 kB) Installing collected packages: tenacity, plotly Successfully installed plotly-5.7.0 tenacity-8.0.1
import pandas as pd
import datetime as dt
import plotly.express as px
import plotly.graph_objects as go
# Convert the joined Spark DataFrame into a pandas DataFrame; the plotting
# code further down reads its columns directly. Note that `df` is rebound
# from a Spark DataFrame to a pandas one here.
df = df.toPandas()
# Bare expression: displays the frame when run as a notebook cell.
df
User_Activity | Date | Open | High | Low | Close | Adj Close | Volume | |
---|---|---|---|---|---|---|---|---|
0 | 1 | 2021-03-16 | 203.160004 | 220.699997 | 172.350006 | 208.169998 | 208.169998 | 35422900.0 |
1 | 3 | 2021-03-19 | 195.729996 | 227.000000 | 182.660004 | 200.270004 | 200.270004 | 24677300.0 |
2 | 3 | 2021-03-22 | 205.259995 | 210.360001 | 186.199997 | 194.490005 | 194.490005 | 10061500.0 |
3 | 4 | 2021-03-23 | 197.500000 | 201.750000 | 177.550003 | 181.750000 | 181.750000 | 14429100.0 |
4 | 23 | 2021-03-24 | 157.979996 | 166.970001 | 118.620003 | 120.339996 | 120.339996 | 24177900.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
66 | 91922 | 2021-06-23 | 221.449997 | 222.570007 | 214.119995 | 219.339996 | 219.339996 | 3555100.0 |
67 | 90743 | 2021-06-24 | 221.160004 | 227.449997 | 211.600006 | 212.309998 | 212.309998 | 3863300.0 |
68 | 88709 | 2021-06-25 | 214.000000 | 214.199997 | 198.500000 | 209.509995 | 209.509995 | 12692700.0 |
69 | 73250 | 2021-06-28 | 211.250000 | 224.449997 | 210.199997 | 213.250000 | 213.250000 | 4879400.0 |
70 | 75512 | 2021-06-29 | 213.589996 | 215.220001 | 208.009995 | 210.880005 | 210.880005 | 2480000.0 |
71 rows × 8 columns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Plot daily Reddit activity (bars, primary y-axis) together with the GME
# closing price (line, secondary y-axis) against the same date axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Closing price goes on the secondary y-axis so its scale is independent
# of the much larger post counts.
fig.add_trace(
    go.Scatter(
        x=df['Date'],
        y=df['Close'],
        name="Close Price",
    ),
    secondary_y=True,
)

# Post counts go on the primary y-axis.
fig.add_trace(
    go.Bar(
        x=df['Date'],
        y=df['User_Activity'],
        name="User_Activity",
    ),
    secondary_y=False,
)

# Set each y-axis title once (the original repeated these two calls).
fig.update_yaxes(title_text="User_Activity", secondary_y=False)
fig.update_yaxes(title_text="Close Price", secondary_y=True)

# Use a fixed figure size instead of letting plotly autosize it.
fig.update_layout(
    autosize=False,
    width=1000,
    height=500,
    title="Stock price and User Activity",
)

fig.show()
# Also save the interactive chart as an HTML file.
fig.write_html("stock_activity.html")