#' ---
#' title: "Trump Twitter analysis using the `tidyverse`"
#' author: "Adam Spannbauer and Jennifer Chunn"
#' date: "`r Sys.Date()`"
#' output:
#'   rmarkdown::html_vignette:
#'     df_print: kable
#' vignette: |
#'   %\VignetteIndexEntry{Trump Twitter tidyverse analysis}
#'   %\VignetteEngine{knitr::rmarkdown}
#'   %\VignetteEncoding{UTF-8}
#' ---
#'
#' This vignette is based on data collected for the 538 story "The World's Favorite Donald Trump Tweets" by Leah Libresco, available [here](https://fivethirtyeight.com/features/the-worlds-favorite-donald-trump-tweets/).
#'
#' Load the packages required to reproduce the analysis.
#'
## ---- message=FALSE, warning=FALSE---------------------------------------
library(fivethirtyeight)
library(ggplot2)
library(dplyr)
library(readr)
library(tidytext)
library(textdata)
library(stringr)
library(lubridate)
library(knitr)
library(hunspell)

# Turn off scientific notation
options(scipen = 99)

#' ## Check date range of tweets
#'
## ----date_range-----------------------------------------------------------
## check out structure and date range ---------------------------------------
(minDate <- min(date(trump_twitter$created_at)))
(maxDate <- max(date(trump_twitter$created_at)))

#' # Create vectorised stemming function using hunspell
## ----hunspell-------------------------------------------------------------
my_hunspell_stem <- function(token) {
  stem_token <- hunspell_stem(token)[[1]]
  if (length(stem_token) == 0) return(token) else return(stem_token[1])
}
vec_hunspell_stem <- Vectorize(my_hunspell_stem, "token")

#' # Clean text by tokenizing & removing URLs/stopwords
#' We first remove URLs, then tokenize the text and drop the stopwords (common English words) listed in the `tidytext` package. Each remaining token is also stemmed with the `hunspell` spellchecker.
## ----tokens---------------------------------------------------------------
trump_tokens <- trump_twitter %>%
  mutate(text = str_replace_all(text,
                                pattern = regex("(www|https?[^\\s]+)"),
                                replacement = "")) %>% #rm urls
  mutate(text = str_replace_all(text, pattern = "[[:digit:]]", replacement = "")) %>%
  unnest_tokens(tokens, text) %>% #tokenize
  mutate(tokens = vec_hunspell_stem(tokens)) %>%
  filter(!(tokens %in% stop_words$word)) #rm stopwords

#' # Sentiment analysis
#' To measure the sentiment of a tweet, we look up the AFINN lexicon score for each (non-stop) word in the tweet. Scores run from -5 (most negative) to 5 (most positive). We then sum the scores across all words in a tweet to get its total sentiment score.
## ----sentiment------------------------------------------------------------
afinn_sentiment <- system.file("extdata", "afinn.csv", package = "fivethirtyeight") %>%
  read_csv()

trump_sentiment <- trump_tokens %>%
  inner_join(afinn_sentiment, by = c("tokens" = "word"))

trump_full_text_sent <- trump_sentiment %>%
  group_by(id) %>%
  summarise(score = sum(value, na.rm = TRUE)) %>%
  ungroup() %>%
  right_join(trump_twitter, by = "id") %>%
  mutate(score_factor = ifelse(is.na(score), "Missing score",
                        ifelse(score < 0, "-.Negative",
                        ifelse(score == 0, "0", "+.Pos"))))

#' ## Distribution of sentiment scores
#'
## -------------------------------------------------------------------------
trump_full_text_sent %>%
  count(score_factor) %>%
  mutate(prop = prop.table(n))

#' 46.4% of tweets did not have sentiment scores. 15.4% were net negative and 36.6% were net positive.
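#'
#' The chunk below is a quick illustrative check of the scoring logic described
#' above (a sketch added for exposition, not part of the original analysis, and
#' not evaluated when the vignette is built): for a few example tweets it lists
#' each matched AFINN word next to the tweet's summed score, so the per-word
#' values can be compared against the total.
## ----score_check, eval=FALSE-----------------------------------------------
trump_sentiment %>%
  filter(id %in% head(unique(id), 3)) %>%           # a handful of example tweets
  select(id, tokens, value) %>%
  group_by(id) %>%
  mutate(tweet_total = sum(value, na.rm = TRUE)) %>% # same sum used for `score`
  ungroup()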
#'
## ----sentiment_hist, fig.width=7, warning=FALSE----------------------------
ggplot(data = trump_full_text_sent, aes(score)) +
  geom_histogram(bins = 10)

#' # Plot sentiment over time
## ----plot_time, fig.width=7------------------------------------------------
sentOverTimeGraph <- ggplot(data = filter(trump_full_text_sent, !is.na(score)),
                            aes(x = created_at, y = score)) +
  geom_line() +
  geom_point() +
  xlab("Date") +
  ylab("Sentiment (afinn)") +
  ggtitle(paste0("Trump Tweet Sentiment (", minDate, " to ", maxDate, ")"))
sentOverTimeGraph

#' # Examine top 5 most positive tweets
## ----pos_tweets------------------------------------------------------------
most_pos_trump <- trump_full_text_sent %>%
  arrange(desc(score)) %>%
  head(n = 5) %>%
  .[["text"]]
kable(most_pos_trump, format = "html")

#' # Examine top 5 most negative tweets
## ----neg_tweets------------------------------------------------------------
most_neg_trump <- trump_full_text_sent %>%
  arrange(score) %>%
  head(n = 5) %>%
  .[["text"]]
kable(most_neg_trump, format = "html")

#' # When is Trump's favorite time to tweet?
#' Total number of tweets and average sentiment (when available) by hour of the day, day of the week, and month.
## ----tweet_time------------------------------------------------------------
trump_tweet_times <- trump_full_text_sent %>%
  mutate(weekday = wday(created_at, label = TRUE),
         month = month(created_at, label = TRUE),
         hour = hour(created_at),
         month_over_time = round_date(created_at, "month"))

plotSentByTime <- function(trump_tweet_times, timeGroupVar) {
  timeVar <- substitute(timeGroupVar)
  timeVarLabel <- str_to_title(timeVar)

  trump_tweet_time_sent <- trump_tweet_times %>%
    rename(timeGroup = !!timeVar) %>%
    group_by(timeGroup) %>%
    summarise(score = mean(score, na.rm = TRUE),
              Count = n()) %>%
    ungroup()

  ggplot(trump_tweet_time_sent, aes(x = timeGroup, y = Count, fill = score)) +
    geom_bar(stat = "identity") +
    xlab(timeVarLabel) +
    ggtitle(paste("Trump Tweet Count & Sentiment by", timeVarLabel))
}

#'
## ----plot_hour, fig.width=7, warning=FALSE----------------------------------
plotSentByTime(trump_tweet_times, "hour")

#' * Trump tweets the least between 4 and 10 am.
#' * Trump's tweets are most positive during the 10 am hour.
#'
## ----plot_weekday, fig.width=7, warning=FALSE-------------------------------
plotSentByTime(trump_tweet_times, "weekday")

#' * Trump tweeted the most on Tuesday and Wednesday.
#' * Trump was most positive in the second half of the work week (Wednesday through Friday).
#'
## ----plot_month, fig.width=7, warning=FALSE---------------------------------
plotSentByTime(trump_tweet_times, "month_over_time")

#' * In this dataset, the number of tweets decreased after November 2015 and dropped off drastically after March 2016. It is unclear whether this reflects an actual decrease in tweeting frequency or an artifact of the data collection process.
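#'
#' As a rough, illustrative follow-up to the last point (a sketch added for
#' exposition, not part of the original analysis, and not evaluated when the
#' vignette is built), the monthly tweet counts behind that plot can be
#' tabulated directly to see where the drop-off occurs:
## ----monthly_counts, eval=FALSE----------------------------------------------
trump_tweet_times %>%
  count(month_over_time) %>%   # tweets per calendar month
  arrange(month_over_time)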