Locate your Spark installation and its environment variables, and make pyspark importable, using the findspark.init()
function:
import findspark
findspark.init()
Create your SparkSession. You are only going to create a SparkSession, not a SparkContext.
import pyspark.sql.functions as f
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
# import python libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
spark = SparkSession.builder.appName("redditML").getOrCreate()
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/29 19:38:11 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
22/04/29 19:38:22 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Attempted to request executors before the AM has registered!
Make sure your SparkSession is active:
spark
SparkSession - in-memory
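If you ever need the underlying SparkContext (for example, to adjust the log level mentioned in the startup warning above), it is available from the session object. A minimal check, using the spark session just created:
# The SparkContext is created for you behind the scenes and exposed on the session.
sc = spark.sparkContext
sc.setLogLevel("WARN")   # the same call the startup message refers to
print(spark.version)     # confirms the session is live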
!aws s3 cp s3://bigdatateaching/reddit/subreddits/Superstonk/ s3://yl1269-labdata5/reddit --recursive
copy: s3://bigdatateaching/reddit/subreddits/Superstonk/_SUCCESS to s3://yl1269-labdata5/reddit/_SUCCESS
copy: s3://bigdatateaching/reddit/subreddits/Superstonk/ym_partition=202103/part-00009-73ae9ccc-31f2-43ad-9230-6657a9d61dfa.c000.snappy.parquet to s3://yl1269-labdata5/reddit/ym_partition=202103/part-00009-73ae9ccc-31f2-43ad-9230-6657a9d61dfa.c000.snappy.parquet
copy: s3://bigdatateaching/reddit/subreddits/Superstonk/ym_partition=202105/part-00016-73ae9ccc-31f2-43ad-9230-6657a9d61dfa.c000.snappy.parquet to s3://yl1269-labdata5/reddit/ym_partition=202105/part-00016-73ae9ccc-31f2-43ad-9230-6657a9d61dfa.c000.snappy.parquet
copy: s3://bigdatateaching/reddit/subreddits/Superstonk/ym_partition=202104/part-00010-73ae9ccc-31f2-43ad-9230-6657a9d61dfa.c000.snappy.parquet to s3://yl1269-labdata5/reddit/ym_partition=202104/part-00010-73ae9ccc-31f2-43ad-9230-6657a9d61dfa.c000.snappy.parquet
copy: s3://bigdatateaching/reddit/subreddits/Superstonk/ym_partition=202106/part-00002-73ae9ccc-31f2-43ad-9230-6657a9d61dfa.c000.snappy.parquet to s3://yl1269-labdata5/reddit/ym_partition=202106/part-00002-73ae9ccc-31f2-43ad-9230-6657a9d61dfa.c000.snappy.parquet
!aws s3 ls s3://yl1269-labdata5/reddit/
                           PRE ym_partition=202103/
                           PRE ym_partition=202104/
                           PRE ym_partition=202105/
                           PRE ym_partition=202106/
2022-04-28 08:55:35          0 _SUCCESS
df_in = spark.read.parquet('s3://yl1269-labdata5/reddit/')
df_in.count()
22/04/28 05:12:52 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
7403473
df_in.printSchema()
root
 |-- all_awardings: string (nullable = true)
 |-- associated_award: string (nullable = true)
 |-- author: string (nullable = true)
 |-- author_created_utc: double (nullable = true)
 |-- author_flair_background_color: string (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_richtext: string (nullable = true)
 |-- author_flair_template_id: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- author_flair_text_color: string (nullable = true)
 |-- author_flair_type: string (nullable = true)
 |-- author_fullname: string (nullable = true)
 |-- author_patreon_flair: boolean (nullable = true)
 |-- author_premium: boolean (nullable = true)
 |-- awarders: string (nullable = true)
 |-- body: string (nullable = true)
 |-- can_gild: boolean (nullable = true)
 |-- can_mod_post: boolean (nullable = true)
 |-- collapsed: boolean (nullable = true)
 |-- collapsed_because_crowd_control: string (nullable = true)
 |-- collapsed_reason: string (nullable = true)
 |-- comment_type: string (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- edited: double (nullable = true)
 |-- gilded: long (nullable = true)
 |-- gildings: string (nullable = true)
 |-- id: string (nullable = true)
 |-- is_submitter: boolean (nullable = true)
 |-- link_id: string (nullable = true)
 |-- locked: boolean (nullable = true)
 |-- no_follow: boolean (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- quarantined: boolean (nullable = true)
 |-- removal_reason: string (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- send_replies: boolean (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- subreddit_name_prefixed: string (nullable = true)
 |-- subreddit_type: string (nullable = true)
 |-- top_awarded_type: string (nullable = true)
 |-- total_awards_received: long (nullable = true)
 |-- treatment_tags: string (nullable = true)
 |-- author_cakeday: boolean (nullable = true)
 |-- editable: boolean (nullable = true)
 |-- media_metadata: string (nullable = true)
 |-- ym_partition: integer (nullable = true)
# Let's take a look at where the missing values are located.
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(18,6))
sns.heatmap(df_in.limit(10000).toPandas().isnull(),cbar=False, cmap="Blues")
plt.xticks(rotation=45)
<AxesSubplot:>
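The heatmap above only inspects a 10,000-row sample. For exact null counts over all 7.4 million rows, a Spark-side aggregation avoids pulling data into pandas; a short sketch against df_in:
# Count nulls per column without converting to pandas.
null_counts = df_in.select(
    [f.count(f.when(f.col(c).isNull(), c)).alias(c) for c in df_in.columns]
)
null_counts.show(vertical=True, truncate=False)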
df = df_in.select('id',
'author',
'author_created_utc',
'author_premium',
'author_flair_richtext',
'author_flair_text',
'author_flair_text_color',
'body',
'collapsed',
'controversiality',
'created_utc',
'locked',
'parent_id',
'no_follow',
'score',
'send_replies',
'total_awards_received',
'quarantined',
'stickied')
# See where the missing values are in the reduced DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(18,6))
sns.heatmap(df.limit(10000).toPandas().isnull(),cbar=False, cmap="Blues")
plt.xticks(rotation=45)
<AxesSubplot:>
df = df.withColumn('author_created_time', from_unixtime(df.author_created_utc, "yyyy-MM-dd"))
df = df.withColumn('created_time', from_unixtime(df.created_utc, "yyyy-MM-dd HH:mm:ss"))
df = df.withColumn('post_date', from_unixtime(df.created_utc, "yyyy-MM-dd"))
df = df.drop("quarantined", "author_created_utc", "created_utc")
df.printSchema()
root
 |-- id: string (nullable = true)
 |-- author: string (nullable = true)
 |-- author_premium: boolean (nullable = true)
 |-- author_flair_richtext: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- author_flair_text_color: string (nullable = true)
 |-- body: string (nullable = true)
 |-- collapsed: boolean (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- locked: boolean (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- no_follow: boolean (nullable = true)
 |-- score: long (nullable = true)
 |-- send_replies: boolean (nullable = true)
 |-- total_awards_received: long (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- author_created_time: string (nullable = true)
 |-- created_time: string (nullable = true)
 |-- post_date: string (nullable = true)
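Note that from_unixtime returns formatted strings, which is why author_created_time, created_time, and post_date appear as string in the schema. Strings in yyyy-MM-dd form still work with datediff and dayofweek later on, but if you prefer proper temporal types, here is a sketch of the alternative (derived from the raw epoch seconds in df_in; not used downstream):
# Optional: cast the epoch seconds to TimestampType / DateType instead of strings.
df_typed = df_in.select(
    "id",
    f.col("created_utc").cast("timestamp").alias("created_ts"),             # TimestampType
    f.to_date(f.col("created_utc").cast("timestamp")).alias("post_date_d")  # DateType
)
df_typed.printSchema()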
df.select("author_flair_richtext").show()
+---------------------+
|author_flair_richtext|
+---------------------+
|                   []|
|                   []|
|                   []|
|                  nan|
|                   []|
|                  nan|
|                   []|
|                   []|
|                   []|
|                  nan|
|                   []|
|                   []|
|                   []|
|                   []|
|                   []|
|                  nan|
|                  nan|
|                   []|
|                   []|
|                  nan|
+---------------------+
only showing top 20 rows
Count posts by author flair text:
df.groupby("author_flair_text").count().orderBy(desc('count')).show()
+--------------------+-------+
|   author_flair_text|  count|
+--------------------+-------+
|🎮 Power to the P...|1930786|
|                null|1813934|
|     🦍 Buckle Up 🚀|1350262|
|            🦍Voted✅|1237145|
| 🦍 Attempt Vote 💯|  55496|
|     still hodl 💎🙌|  29453|
|🚀🚀 JACKED to th...|  23531|
|🦍BERSERKER Twerk...|  18800|
|just likes the st...|  16186|
|        Not a cat 🦍|  14022|
|     let's go 🚀🚀🚀|  13651|
|Custom Flair - Te...|  13198|
| ape want believe 🛸|   9100|
|         is a cat 🐈|   8913|
|⚔Knights of New🛡...|   8472|
| Crazy cat lady 🐱🐱|   6688|
|Only thing short ...|   5309|
|Custom Flair - Te...|   5304|
|This is the way -...|   5038|
|🌳Hodling for a B...|   4986|
+--------------------+-------+
only showing top 20 rows
df = df.withColumn("is_PtoP", f.when(f.col("author_flair_text") == "🎮 Power to the Players 🛑" ,f.lit(1)).otherwise(0))
df = df.withColumn("is_BuckleUp", f.when(f.col("author_flair_text") == "🦍 Buckle Up 🚀" ,f.lit(1)).otherwise(0))
df = df.withColumn("is_voted", f.when(f.col("author_flair_text") == "🦍Voted✅" ,f.lit(1)).otherwise(0))
df.groupby(['is_PtoP', 'is_BuckleUp', 'is_voted']).count().show()
+-------+-----------+--------+-------+
|is_PtoP|is_BuckleUp|is_voted|  count|
+-------+-----------+--------+-------+
|      0|          0|       1|1237145|
|      1|          0|       0|1930786|
|      0|          1|       0|1350262|
|      0|          0|       0|2885280|
+-------+-----------+--------+-------+
df.groupby('no_follow').count().show()
+---------+-------+
|no_follow|  count|
+---------+-------+
|     true|4315521|
|    false|3087952|
+---------+-------+
df.groupby('locked').count().show()
+------+-------+
|locked|  count|
+------+-------+
|  true|     10|
| false|7403463|
+------+-------+
# Show the 10 locked comments
df.filter(df.locked == True).show()
+---------------+--------------+--------------------+---------+----------------+------+----------+-------+---------+-----+---------------------+--------+-------------------+-------------------+----------+
|         author|author_premium|                body|collapsed|controversiality|locked| parent_id|     id|no_follow|score|total_awards_received|stickied|author_created_time|       created_time| post_date|
+---------------+--------------+--------------------+---------+----------------+------+----------+-------+---------+-----+---------------------+--------+-------------------+-------------------+----------+
|      [deleted]|          null|           [removed]|     true|               0|  true| t3_ml6lty|gtksqh8|     true|  -37|                    0|   false|               null|2021-04-06 15:11:28|2021-04-06|
|      [deleted]|          null|           [removed]|     true|               0|  true| t3_mkizxx|gtg3z3e|     true|  -23|                    0|   false|               null|2021-04-05 12:35:03|2021-04-05|
|      [deleted]|          null|Already ahead of ...|    false|               0|  true|t1_gtwcq2k|gtweeaf|     true|    3|                    0|   false|               null|2021-04-09 05:26:03|2021-04-09|
|      [deleted]|          null|Locking this Post...|    false|               0|  true| t3_mr2tph|gujzxhx|     true|    1|                    0|    true|               null|2021-04-15 01:09:08|2021-04-15|
|      [deleted]|          null|           [removed]|     true|               0|  true| t3_mvn2l0|gvdg8hx|     true|    2|                    0|   false|               null|2021-04-21 21:37:45|2021-04-21|
|      [deleted]|          null|           [removed]|     true|               0|  true|t1_gtfp175|gtftnps|     true|   21|                    0|   false|               null|2021-04-05 10:29:11|2021-04-05|
|      [deleted]|          null|           [removed]|     true|               0|  true|t1_guzqdmu|gv12fan|     true|   47|                    0|   false|               null|2021-04-19 00:44:40|2021-04-19|
|EnriqueShockwav|         false|Holy shit, YOU HA...|    false|               0|  true|t1_h1g0gfj|h1g0nn0|    false|   33|                    1|   false|         2019-09-14|2021-06-11 20:29:38|2021-06-11|
|      [deleted]|          null|           [removed]|     true|               0|  true| t3_nvqcsk|h168g8d|     true|  -43|                    0|   false|               null|2021-06-09 16:50:13|2021-06-09|
|      [deleted]|          null|           [removed]|     true|               0|  true|t1_h04mngn|h04o7o0|     true|  -25|                    0|   false|               null|2021-05-31 22:30:27|2021-05-31|
+---------------+--------------+--------------------+---------+----------------+------+----------+-------+---------+-----+---------------------+--------+-------------------+-------------------+----------+
df = df.drop("locked")
df.groupby('total_awards_received').count().orderBy(desc('count')).show()
+---------------------+-------+
|total_awards_received|  count|
+---------------------+-------+
|                    0|7291217|
|                    1|  95141|
|                    2|  10062|
|                    3|   2993|
|                    4|   1296|
|                    5|    766|
|                    6|    473|
|                    7|    289|
|                    8|    234|
|                    9|    161|
|                   10|    149|
|                   11|    125|
|                   12|     78|
|                   13|     59|
|                   14|     56|
|                   16|     40|
|                   15|     35|
|                   17|     30|
|                   19|     26|
|                   18|     22|
+---------------------+-------+
only showing top 20 rows
df.groupby('stickied').count().show()
+--------+-------+
|stickied|  count|
+--------+-------+
|    true|    977|
|   false|7402496|
+--------+-------+
df.groupby('post_date').count().orderBy('post_date').show()
+----------+-----+
| post_date|count|
+----------+-----+
|03-16-2021|    1|
|03-19-2021|    3|
|03-20-2021|    2|
|03-22-2021|    3|
|03-23-2021|    4|
|03-24-2021|   23|
|03-25-2021|    7|
|03-27-2021|    9|
|03-28-2021|   21|
|03-29-2021|   55|
|03-30-2021|   22|
|03-31-2021|   11|
|04-01-2021|    7|
|04-02-2021|    8|
|04-03-2021|    3|
|04-04-2021|   16|
|04-05-2021|65462|
|04-06-2021|68277|
|04-07-2021|71902|
|04-08-2021|82796|
+----------+-----+
only showing top 20 rows
author_postsCt = df.groupby('author').count().orderBy(desc('count'))
author_postsCt.show()
+----------------+------+
|          author| count|
+----------------+------+
|       [deleted]|777484|
|   Scrollwheeler| 23277|
|       half_dane| 11928|
|Terrible-Ad-4536|  9596|
|         Itz_Ape|  8668|
|  MouthPipetting|  7282|
|        AdNo8854|  6871|
|          Sm0515|  6777|
| fed_smoker69420|  6356|
|     ___alexa___|  5249|
|    MOSfriedeggs|  5076|
|     loves_abyss|  5038|
| An-Onymous-Name|  4986|
|   Mellow_Velo33|  4927|
|       Patarokun|  4868|
|  TheKingOfLimbs|  4858|
|    Berrybunny00|  4492|
|     BeansMostly|  4486|
|      Kharadin92|  4394|
| moronthisatnine|  4165|
+----------------+------+
only showing top 20 rows
df = df.join(author_postsCt, ["author"],"left")
df = df.withColumnRenamed("count","author_postcnt")
df.printSchema()
root
 |-- author: string (nullable = true)
 |-- id: string (nullable = true)
 |-- author_premium: boolean (nullable = true)
 |-- author_flair_richtext: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- author_flair_text_color: string (nullable = true)
 |-- body: string (nullable = true)
 |-- collapsed: boolean (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- locked: boolean (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- no_follow: boolean (nullable = true)
 |-- score: long (nullable = true)
 |-- send_replies: boolean (nullable = true)
 |-- total_awards_received: long (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- author_created_time: string (nullable = true)
 |-- created_time: string (nullable = true)
 |-- post_date: string (nullable = true)
 |-- is_PtoP: integer (nullable = false)
 |-- is_BuckleUp: integer (nullable = false)
 |-- is_voted: integer (nullable = false)
 |-- author_postcnt: long (nullable = true)
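As an aside, the per-author comment count can also be computed with a window function, which avoids the explicit join back onto df. A sketch of that alternative (the column name author_postcnt_w is just illustrative and is not used downstream):
from pyspark.sql import Window

# Window aggregation: count rows per author without a groupBy + join.
w = Window.partitionBy("author")
df_alt = df.withColumn("author_postcnt_w", f.count(f.lit(1)).over(w))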
most_commented = df.groupby("parent_id").count().sort(col("count").desc())
most_commented.show(20)
+---------+-----+
|parent_id|count|
+---------+-----+
|t3_nvqcsk|31649|
|t3_nuzig8|31104|
|t3_nkkbt6|28603|
|t3_nlbgzn|23822|
|t3_nm2gyp|22245|
|t3_nwi5fg|21948|
|t3_nqgjuw|21759|
|t3_nmsrqc|21035|
|t3_mysjss|19529|
|t3_n6bjj3|18358|
|t3_npoqy4|17299|
|t3_nbbfbp|17229|
|t3_nea5dr|16953|
|t3_nr819f|15949|
|t3_nf4qiq|14909|
|t3_n626av|14384|
|t3_nu72kv|14293|
|t3_n9rsxg|13923|
|t3_njslvh|13666|
|t3_n8z26t|13238|
+---------+-----+
only showing top 20 rows
df.filter(col("parent_id") == "t3_nvqcsk").show(truncate = False)
+---------+--------------------+-------+--------------+---------------------+-----------------------------------+-----------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+----------------+------+---------+-----+------------+---------------------+--------+-------------------+-------------------+----------+-------+-----------+--------+--------------+--------------------+----------+----------+ |parent_id|author |id |author_premium|author_flair_richtext|author_flair_text |author_flair_text_color|body |collapsed|controversiality|locked|no_follow|score|send_replies|total_awards_received|stickied|author_created_time|created_time |post_date |is_PtoP|is_BuckleUp|is_voted|author_postcnt|is_popular_commented|is_weekend|author_age| +---------+--------------------+-------+--------------+---------------------+-----------------------------------+-----------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+----------------+------+---------+-----+------------+---------------------+--------+-------------------+-------------------+----------+-------+-----------+--------+--------------+--------------------+----------+----------+ |t3_nvqcsk|-DeadKiller- |h16tuwp|false |[] |🦍 Buckle Up 🚀 |light |!Buckleup! |false |0 |false |true |1 |true |0 |false |2021-01-20 |2021-06-09 19:14:23|2021-06-09|0 |1 |0 |2 |1 |0 |140 | |t3_nvqcsk|-Dprimo- |h173c3t|false |[] |🦍 Buckle Up 🚀 |light |!Buckleup! |false |0 |false |true |3 |true |0 |false |2017-04-02 |2021-06-09 20:18:00|2021-06-09|0 |1 |0 |1 |1 |0 |1529 | |t3_nvqcsk|-WilliamMButtlicker_|h15yawm|false |[] |null |null |Nobody thought to just take A PEN |false |0 |false |false |32 |true |0 |false |2020-04-05 |2021-06-09 15:43:23|2021-06-09|0 |0 |0 |13 |1 |0 |430 | |t3_nvqcsk|1313169 |h172ada|false |[] |🎮 Power to the Players 🛑 |light |$20 dollar minute candles, "probably authentic" -SEC |false |0 |false |false |7 |true |0 |false |2021-01-02 |2021-06-09 20:11:00|2021-06-09|1 |0 |0 |770 |1 |0 |158 | |t3_nvqcsk|1313169 |h171zm0|false |[] |🎮 Power to the Players 🛑 |light |Kenny must be so mad rn lmao what a day to be alive |false |0 |false |false |3 |true |0 |false |2021-01-02 |2021-06-09 20:08:59|2021-06-09|1 |0 |0 |770 |1 |0 |158 | |t3_nvqcsk|1313169 |h1716wp|false |[] |🎮 Power to the Players 🛑 |light |Already with the $2+ bid asks 🙄🙄🙄🙄 |false |0 |false |false |8 |true |0 |false |2021-01-02 |2021-06-09 20:03:35|2021-06-09|1 |0 |0 |770 |1 |0 |158 | |t3_nvqcsk|1313169 |h16wnxj|false |[] |🎮 Power to the Players 🛑 |light |Beautiful day, nice shorts Ken! Enjoy them while you can ya sorry bastard. 
I'm strapped in and ready to ride |false |0 |false |false |4 |true |0 |false |2021-01-02 |2021-06-09 19:33:22|2021-06-09|1 |0 |0 |770 |1 |0 |158 | |t3_nvqcsk|1313169 |h1708t3|false |[] |🎮 Power to the Players 🛑 |light |Lol such a gross dip, SEC allegedly watching, let's see if they ever address these dips |false |0 |false |false |4 |true |0 |false |2021-01-02 |2021-06-09 19:57:21|2021-06-09|1 |0 |0 |770 |1 |0 |158 | |t3_nvqcsk|1313169 |h172ycx|false |[] |🎮 Power to the Players 🛑 |light |Might as well still be Gary's third week |false |0 |false |false |7 |true |0 |false |2021-01-02 |2021-06-09 20:15:28|2021-06-09|1 |0 |0 |770 |1 |0 |158 | |t3_nvqcsk|1313169 |h171o3o|false |[] |🎮 Power to the Players 🛑 |light |Yeah HOW BOUT THAT $8 DIP SEC??? |false |0 |false |false |4 |true |0 |false |2021-01-02 |2021-06-09 20:06:49|2021-06-09|1 |0 |0 |770 |1 |0 |158 | |t3_nvqcsk|18birdeyes |h171ore|false |[] |🦍 Buckle Up 🚀 |light |!Buckleup! |false |0 |false |true |1 |true |0 |false |2021-01-28 |2021-06-09 20:06:56|2021-06-09|0 |1 |0 |3 |1 |0 |132 | |t3_nvqcsk|1970Roadrunner |h1854uh|false |[] |🦍 I Am Definitely Not Uncertain 🚀|dark |!buckleup! |false |0 |false |true |3 |true |0 |false |2017-10-08 |2021-06-10 01:04:50|2021-06-10|0 |0 |0 |115 |1 |0 |1341 | |t3_nvqcsk|1HORST |h15me3a|false |[] |🎮 Power to the Players 🛑 |light |I love the talking heads format! Great work mods. ♥ Looking forwards to the shareholder meeting. |false |0 |false |false |3 |true |0 |false |2021-02-15 |2021-06-09 14:21:20|2021-06-09|1 |0 |0 |7 |1 |0 |114 | |t3_nvqcsk|22khz |h16enxx|false |[] |🎮 Power to the Players 🛑 |light |Yep yep yep. It’s 1:30 pm EST and it’s a normal trading day, the meeting happened, let’s move the fuck on 🚀 earnings at 5 |false |0 |false |false |8 |true |0 |false |2016-06-27 |2021-06-09 17:31:32|2021-06-09|1 |0 |0 |449 |1 |0 |1808 | |t3_nvqcsk|22khz |h1703sd|false |[] |🎮 Power to the Players 🛑 |light |Oh the desperation of Ken and Kyle 🙄 |false |0 |false |false |2 |true |0 |false |2016-06-27 |2021-06-09 19:56:26|2021-06-09|1 |0 |0 |449 |1 |0 |1808 | |t3_nvqcsk|22khz |h16245x|false |[] |🎮 Power to the Players 🛑 |light |So many people complaining about the livestream 🤦♀️ it was a completely informal discussion about a meeting a vast majority of shareholders weren’t even privy to attend. |false |0 |false |false |25 |true |0 |false |2016-06-27 |2021-06-09 16:08:25|2021-06-09|1 |0 |0 |449 |1 |0 |1808 | |t3_nvqcsk|22khz |h156pdj|false |[] |🎮 Power to the Players 🛑 |light |Apes, the superstonk front page has it covered: but I’m going to once again say please temper your expectations today. Nothing may come out of the meeting, the price can tank through manipulation, same old story. But the music still goes on and the MOASS will happen. 
|false |0 |false |false |26 |true |0 |false |2016-06-27 |2021-06-09 12:13:08|2021-06-09|1 |0 |0 |449 |1 |0 |1808 | |t3_nvqcsk|22khz |h16c4d3|false |[] |🎮 Power to the Players 🛑 |light |Yeah, um...pinkcats didn’t actually do an interview with CNBC for all those who jumped the gun and started firing: https://www.reddit.com/r/Superstonk/comments/nw0aoe/update_lower_the_pitchforks_a_little_bit/?utm_source=share&utm_medium=ios_app&utm_name=iossmf |false |0 |false |false |30 |true |0 |false |2016-06-27 |2021-06-09 17:14:44|2021-06-09|1 |0 |0 |449 |1 |0 |1808 | |t3_nvqcsk|3WordPosts |h17ie90|false |[] |🎮 Power to the Players 🛑 |light |Cramer shilling us to just take profits |false |0 |false |false |5 |true |0 |false |2012-04-14 |2021-06-09 22:02:14|2021-06-09|1 |0 |0 |365 |1 |0 |3343 | |t3_nvqcsk|3WordPosts |h189jjt|false |[] |🎮 Power to the Players 🛑 |light |My poor naive brother is excited for me but thinks I’m crazy. He tried to tell me today That gme squeeze was going to be $1500 max but also had no explanation why besides “it can’t go higher”. Told him I’m not selling till 25,000,000 and he said it’s never getting that high. Told him it’ll be in my will then cause it’s never getting sold.|false |0 |false |false |24 |true |0 |false |2012-04-14 |2021-06-10 01:41:03|2021-06-10|1 |0 |0 |365 |1 |0 |3344 | +---------+--------------------+-------+--------------+---------------------+-----------------------------------+-----------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+----------------+------+---------+-----+------------+---------------------+--------+-------------------+-------------------+----------+-------+-----------+--------+--------------+--------------------+----------+----------+ only showing top 20 rows
most_commented.describe("count").show()
+-------+-----------------+
|summary|            count|
+-------+-----------------+
|  count|          2728265|
|   mean|2.713619461452608|
| stddev|67.48269667758078|
|    min|                1|
|    max|            31649|
+-------+-----------------+
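The popularity cutoff used next (count > 3) sits just above the mean of about 2.7 comments per parent. If you would rather pick a percentile-based threshold, approxQuantile gives one cheaply; a sketch:
# Inspect the count distribution before choosing a cutoff (0.01 is the relative error).
q75, q90, q99 = most_commented.approxQuantile("count", [0.75, 0.90, 0.99], 0.01)
print(q75, q90, q99)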
commented_join = df.join(most_commented,["parent_id"], how= "left")
df = commented_join.withColumn("is_popular_commented", f.when(f.col("count")>3,f.lit(1)).otherwise(0))
df = df.drop("count")
df = df.withColumn("gme_mentioned",f.when(df["body"].rlike("(?i)gme"),1).when(df["body"].rlike("(?i)gme"),1).otherwise(0))
df.groupby("gme_mentioned").count().show()
+-------------+-------+
|gme_mentioned|  count|
+-------------+-------+
|            1| 407005|
|            0|6996468|
+-------------+-------+
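One caveat: the (?i)gme pattern also matches "gme" inside longer words. If you only want standalone mentions of the ticker, a word-boundary version of the same flag looks like this (a sketch that will give slightly different counts; the column name is illustrative):
# Stricter flag: require "gme" as a whole word, case-insensitive.
df_strict = df.withColumn(
    "gme_mentioned_strict",
    f.when(f.col("body").rlike(r"(?i)\bgme\b"), 1).otherwise(0)
)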
df = df.withColumn("is_weekend", dayofweek("post_date").isin([1,7]).cast('int'))
df.groupby("is_weekend").count().show()
+----------+-------+
|is_weekend|  count|
+----------+-------+
|         1|1253177|
|         0|6150296|
+----------+-------+
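Spark's dayofweek numbers Sunday as 1 and Saturday as 7, which is why isin([1, 7]) marks weekends. A quick sanity check of the day-of-week distribution, if you want to confirm the coding:
# 1 = Sunday ... 7 = Saturday in Spark's dayofweek()
df.groupBy(dayofweek("post_date").alias("dow")).count().orderBy("dow").show()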
df = df.withColumn("author_age", datediff(df.post_date, df.author_created_time))
df = df.drop("gme")
df.printSchema()
root
 |-- parent_id: string (nullable = true)
 |-- author: string (nullable = true)
 |-- id: string (nullable = true)
 |-- author_premium: boolean (nullable = true)
 |-- author_flair_richtext: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- author_flair_text_color: string (nullable = true)
 |-- body: string (nullable = true)
 |-- collapsed: boolean (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- locked: boolean (nullable = true)
 |-- no_follow: boolean (nullable = true)
 |-- score: long (nullable = true)
 |-- send_replies: boolean (nullable = true)
 |-- total_awards_received: long (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- author_created_time: string (nullable = true)
 |-- created_time: string (nullable = true)
 |-- post_date: string (nullable = true)
 |-- is_PtoP: integer (nullable = false)
 |-- is_BuckleUp: integer (nullable = false)
 |-- is_voted: integer (nullable = false)
 |-- author_postcnt: long (nullable = true)
 |-- is_popular_commented: integer (nullable = false)
 |-- is_weekend: integer (nullable = true)
 |-- author_age: integer (nullable = true)
 |-- gme_mentioned: integer (nullable = false)
df.count()
7403473
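The row count still matches the original 7,403,473, so the two left joins did not duplicate any comments. If you want an explicit check, comparing distinct comment ids is a simple (if somewhat expensive) sanity test:
# Every comment id should still appear exactly once after the joins.
assert df.select("id").distinct().count() == df.count()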
# adjust figure size and font size
sns.set(rc = {"figure.figsize":(20, 12)})
sns.set(font_scale=1)
# compute the correlation matrix using
corr = df.limit(1000000).toPandas().corr(method='kendall')
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr,annot=True, fmt='.2f', square=True, mask = mask, cmap="Blues");
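Computing the correlation on a 1,000,000-row pandas sample is convenient, but it can also be done in Spark itself (Pearson or Spearman; Kendall is not available there). A sketch using pyspark.ml.stat.Correlation, with an assumed list of numeric columns:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation

# Assumed numeric feature list; adjust to taste.
num_cols = ["score", "controversiality", "total_awards_received", "author_postcnt",
            "is_PtoP", "is_BuckleUp", "is_voted", "is_popular_commented",
            "is_weekend", "author_age", "gme_mentioned"]

# Assemble the features into a vector column, skipping rows with nulls.
assembler = VectorAssembler(inputCols=num_cols, outputCol="features", handleInvalid="skip")
features = assembler.transform(df.select(num_cols))
corr_mat = Correlation.corr(features, "features", method="spearman").head()[0].toArray()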
df_day = df.groupby("post_date").count().sort(col("post_date"))
df_day = df_day.withColumnRenamed("count","post_cnt")
df_day.show()
+----------+--------+
| post_date|post_cnt|
+----------+--------+
|2021-03-16|       1|
|2021-03-19|       3|
|2021-03-20|       2|
|2021-03-22|       3|
|2021-03-23|       4|
|2021-03-24|      23|
|2021-03-25|       7|
|2021-03-27|       9|
|2021-03-28|      21|
|2021-03-29|      55|
|2021-03-30|      22|
|2021-03-31|      11|
|2021-04-01|       7|
|2021-04-02|       8|
|2021-04-03|       3|
|2021-04-04|      16|
|2021-04-05|   65462|
|2021-04-06|   68277|
|2021-04-07|   71902|
|2021-04-08|   82796|
+----------+--------+
only showing top 20 rows
import matplotlib.dates as mdates
# Filter on the renamed column (the count column is now post_cnt) to drop the sparse early dates
daily_posts = df_day.filter(col("post_cnt") > 20).toPandas()
ax = daily_posts.plot.line(x="post_date", y="post_cnt")
ax.figure.set_size_inches(8, 6)
plt.title("Daily Activity on Superstonk",fontsize=18)
plt.xticks(rotation=45)
#plt.savefig("Daily_Activity.png",dpi=100,bbox_inches='tight')
plt.show()
df.write.save("s3://yl1269-labdata5/prepare_df")
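save() with no arguments writes Parquet (the default data source) and errors out if the target prefix already exists. For repeatable runs it is safer to be explicit about the format and write mode; a sketch using the same output path:
# Explicit format and overwrite semantics, so re-running the notebook does not fail.
df.write.mode("overwrite").parquet("s3://yl1269-labdata5/prepare_df")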
spark.stop()