Preliminaries: Load Libraries and Log into Twitter

library(rtweet)
library(httpuv)
library(tidyverse)
library(tidytext)

Make sure you’re logged into Twitter in a web browser

Extracting Data from the Twitter API via rtweet Package

Extracting data by hashtag

# Query the search API for the 1000 most recent English-language tweets
# tagged #ValentinesDay; retweets and replies are excluded, timestamps GMT
valentine_tweets <- search_tweets(
  q = "#ValentinesDay",
  n = 1000,
  include_rts = FALSE,
  `-filter` = "replies",
  lang = "en"
)
# Print "valentine_tweets"
valentine_tweets
## # A tibble: 991 × 90
##    user_id   status_id           created_at          screen_name text     source
##    <chr>     <chr>               <dttm>              <chr>       <chr>    <chr> 
##  1 266143492 1495089702940073987 2022-02-19 17:35:37 InnatLB     "Ignite… Later…
##  2 542918259 1495089698045321223 2022-02-19 17:35:36 ruby_redsky "silver… Twitt…
##  3 542918259 1494795575006568450 2022-02-18 22:06:51 ruby_redsky "silver… Twitt…
##  4 542918259 1494795421138534411 2022-02-18 22:06:15 ruby_redsky "Obsidi… Twitt…
##  5 542918259 1494865769708064772 2022-02-19 02:45:47 ruby_redsky "Wolf t… Twitt…
##  6 542918259 1495043965619515394 2022-02-19 14:33:52 ruby_redsky "Black … Twitt…
##  7 542918259 1494731181904048128 2022-02-18 17:50:59 ruby_redsky "Shooti… Twitt…
##  8 542918259 1495064426482970633 2022-02-19 15:55:10 ruby_redsky "Anklet… Twitt…
##  9 542918259 1494867826338840576 2022-02-19 02:53:57 ruby_redsky "dragon… Twitt…
## 10 542918259 1494756928421044226 2022-02-18 19:33:17 ruby_redsky "twig h… Twitt…
## # … with 981 more rows, and 84 more variables: display_text_width <dbl>,
## #   reply_to_status_id <chr>, reply_to_user_id <chr>,
## #   reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## #   favorite_count <int>, retweet_count <int>, quote_count <int>,
## #   reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## #   urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## #   media_t.co <list>, media_expanded_url <list>, media_type <list>, …

Extracting by multiple hashtags or keywords

“And” conditions

# Query tweets carrying BOTH #ValentinesDay and #SinglesAwareness
# (a space between terms means AND in Twitter search syntax)
valentinesday_AND_singlesawareness <- search_tweets(
  q = "#ValentinesDay #SinglesAwareness",
  n = 1000,
  include_rts = FALSE,
  `-filter` = "replies",
  lang = "en"
)
# Print "valentinesday_AND_singlesawareness"
valentinesday_AND_singlesawareness
## # A tibble: 3 × 90
##   user_id             status_id     created_at          screen_name text  source
##   <chr>               <chr>         <dttm>              <chr>       <chr> <chr> 
## 1 2934459647          149342042238… 2022-02-15 03:02:29 DM_UrFutur… "It’… Twitt…
## 2 1469049394783686662 149323890876… 2022-02-14 15:01:13 heynicehat  "#Ha… Buffer
## 3 1469049394783686662 149223200196… 2022-02-11 20:20:08 heynicehat  "#Ni… Buffer
## # … with 84 more variables: display_text_width <dbl>, reply_to_status_id <lgl>,
## #   reply_to_user_id <lgl>, reply_to_screen_name <lgl>, is_quote <lgl>,
## #   is_retweet <lgl>, favorite_count <int>, retweet_count <int>,
## #   quote_count <int>, reply_count <int>, hashtags <list>, symbols <list>,
## #   urls_url <list>, urls_t.co <list>, urls_expanded_url <list>,
## #   media_url <list>, media_t.co <list>, media_expanded_url <list>,
## #   media_type <list>, ext_media_url <list>, ext_media_t.co <list>, …

“Or” conditions

# Query tweets matching #ValentinesDay OR the keyword singlesday
# NOTE(review): the query uses the bare keyword "singlesday" (no hashtag)
# while the object name says "singlesawareness" — confirm which was intended
valentinesday_OR_singlesawareness <- search_tweets(
  q = "#ValentinesDay OR singlesday",
  n = 1000,
  include_rts = FALSE,
  `-filter` = "replies",
  lang = "en"
)
# Print "valentinesday_OR_singlesawareness"
valentinesday_OR_singlesawareness
## # A tibble: 991 × 90
##    user_id   status_id           created_at          screen_name text     source
##    <chr>     <chr>               <dttm>              <chr>       <chr>    <chr> 
##  1 266143492 1495089702940073987 2022-02-19 17:35:37 InnatLB     "Ignite… Later…
##  2 542918259 1495089698045321223 2022-02-19 17:35:36 ruby_redsky "silver… Twitt…
##  3 542918259 1494838053839552514 2022-02-19 00:55:39 ruby_redsky "sexy d… Twitt…
##  4 542918259 1494795575006568450 2022-02-18 22:06:51 ruby_redsky "silver… Twitt…
##  5 542918259 1494795421138534411 2022-02-18 22:06:15 ruby_redsky "Obsidi… Twitt…
##  6 542918259 1495043965619515394 2022-02-19 14:33:52 ruby_redsky "Black … Twitt…
##  7 542918259 1495017548185849860 2022-02-19 12:48:54 ruby_redsky "Frog e… Twitt…
##  8 542918259 1494731181904048128 2022-02-18 17:50:59 ruby_redsky "Shooti… Twitt…
##  9 542918259 1494865769708064772 2022-02-19 02:45:47 ruby_redsky "Wolf t… Twitt…
## 10 542918259 1495056043801460740 2022-02-19 15:21:52 ruby_redsky "pizza … Twitt…
## # … with 981 more rows, and 84 more variables: display_text_width <dbl>,
## #   reply_to_status_id <chr>, reply_to_user_id <chr>,
## #   reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## #   favorite_count <int>, retweet_count <int>, quote_count <int>,
## #   reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## #   urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## #   media_t.co <list>, media_expanded_url <list>, media_type <list>, …

Searching twitter handles and extracting tweet history

# Pull the tweet history of a single account; timelines are not subject to
# the recency constraints that apply to search_tweets().
# @VDay is a global activist movement to end violence against women that is
# associated with Valentine's Day. Note the API sometimes returns fewer
# tweets than requested due to deletions.
vday_tweets <- get_timeline("@VDay", n = 500)
# Print "vday_tweets"
vday_tweets
## # A tibble: 600 × 90
##    user_id  status_id           created_at          screen_name text      source
##    <chr>    <chr>               <dttm>              <chr>       <chr>     <chr> 
##  1 27031487 1495076927194890243 2022-02-19 16:44:51 VDay        "#Afghan… Twitt…
##  2 27031487 1494963941428006914 2022-02-19 09:15:53 VDay        "\"Lider… Twitt…
##  3 27031487 1494924854990020608 2022-02-19 06:40:34 VDay        "\"Vario… Twitt…
##  4 27031487 1494924798161424385 2022-02-19 06:40:20 VDay        "\"Vario… Twitt…
##  5 27031487 1494924739898269696 2022-02-19 06:40:07 VDay        "\"Vario… Twitt…
##  6 27031487 1494909659416653833 2022-02-19 05:40:11 VDay        "\"#WeOn… Twitt…
##  7 27031487 1494884200922505216 2022-02-19 03:59:01 VDay        "\"Atten… Twitt…
##  8 27031487 1494843840171167746 2022-02-19 01:18:39 VDay        "\"CARE … Twitt…
##  9 27031487 1494826837452607493 2022-02-19 00:11:05 VDay        "This #B… Twitt…
## 10 27031487 1494794626737184769 2022-02-18 22:03:05 VDay        "New Mob… Twitt…
## # … with 590 more rows, and 84 more variables: display_text_width <dbl>,
## #   reply_to_status_id <chr>, reply_to_user_id <chr>,
## #   reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## #   favorite_count <int>, retweet_count <int>, quote_count <int>,
## #   reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## #   urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## #   media_t.co <list>, media_expanded_url <list>, media_type <list>, …

Querying twitter datasets

Querying vday_tweets to find the 10 tweets with most favorites

# Extract the 10 most-favorited tweets from "vday_tweets".
# with_ties = FALSE caps the result at exactly n rows; the default keeps
# tied values and can return more than 10 (the original run returned 11).
vday_tweets_most_favorites <- vday_tweets %>%
  slice_max(favorite_count, n = 10, with_ties = FALSE)
# Print "vday_tweets_most_favorites"
vday_tweets_most_favorites
## # A tibble: 11 × 90
##    user_id  status_id           created_at          screen_name text      source
##    <chr>    <chr>               <dttm>              <chr>       <chr>     <chr> 
##  1 27031487 1493244612063891456 2022-02-14 15:23:53 VDay        "\"On 14… Twitt…
##  2 27031487 1493251071208161287 2022-02-14 15:49:33 VDay        "TODAY a… Twitt…
##  3 27031487 1491631604346753025 2022-02-10 04:34:22 VDay        "\"At 8:… Twitt…
##  4 27031487 1467250501678813189 2021-12-04 21:52:34 VDay        "\"Gabri… Twitt…
##  5 27031487 1493135547686010881 2022-02-14 08:10:30 VDay        "#1Billi… Twitt…
##  6 27031487 1452073970006593543 2021-10-24 00:46:27 VDay        "\"Brave… Twitt…
##  7 27031487 1484337581567332352 2022-01-21 01:30:31 VDay        "\"Despi… Twitt…
##  8 27031487 1493017894866669568 2022-02-14 00:22:59 VDay        "#1Billi… Twitt…
##  9 27031487 1493079847509258241 2022-02-14 04:29:10 VDay        "\"Are y… Twitt…
## 10 27031487 1463190977049333763 2021-11-23 17:01:28 VDay        "\"What … Twitt…
## 11 27031487 1449101796325031936 2021-10-15 19:56:05 VDay        "\"We ex… Twitt…
## # … with 84 more variables: display_text_width <dbl>, reply_to_status_id <chr>,
## #   reply_to_user_id <chr>, reply_to_screen_name <chr>, is_quote <lgl>,
## #   is_retweet <lgl>, favorite_count <int>, retweet_count <int>,
## #   quote_count <int>, reply_count <int>, hashtags <list>, symbols <list>,
## #   urls_url <list>, urls_t.co <list>, urls_expanded_url <list>,
## #   media_url <list>, media_t.co <list>, media_expanded_url <list>,
## #   media_type <list>, ext_media_url <list>, ext_media_t.co <list>, …

Querying vday_tweets to find the 10 tweets with most retweets

# Extract the 10 most-retweeted tweets from "vday_tweets", keeping only a
# few readable columns. with_ties = FALSE caps the result at exactly n rows;
# the default keeps tied values, which is why the original run printed 11.
vday_tweets_most_retweeted <- vday_tweets %>%
  slice_max(retweet_count, n = 10, with_ties = FALSE) %>%
  select(created_at, screen_name, text, retweet_count)
# Print "vday_tweets_most_retweeted"
vday_tweets_most_retweeted
## # A tibble: 11 × 4
##    created_at          screen_name text                            retweet_count
##    <dttm>              <chr>       <chr>                                   <int>
##  1 2022-02-14 08:10:30 VDay        "#1BillionRising activists eve…            14
##  2 2022-01-21 01:30:31 VDay        "\"Despite the achievements in…            13
##  3 2022-02-10 04:34:22 VDay        "\"At 8:30am SLT, 11 Feb, at t…            11
##  4 2021-12-13 01:20:42 VDay        "\"Major press conference to a…            11
##  5 2021-12-27 21:59:40 VDay        "\"Thank you Berenice Leila fo…            10
##  6 2021-11-25 19:22:43 VDay        "Reasons why you shouldn't sto…            10
##  7 2022-02-14 15:23:53 VDay        "\"On 14 Feb, we dance &amp; R…             9
##  8 2022-02-09 06:50:18 VDay        "\"I feel happy &amp; grateful…             9
##  9 2022-02-02 02:24:41 VDay        "CALL TO ACTION: Call For Inte…             9
## 10 2022-01-30 21:13:05 VDay        "\"#MexicoCity will Rise for t…             9
## 11 2021-11-23 17:01:28 VDay        "\"What a great day w the offi…             9

Visualizing Twitter data

Using ggplot to visualize twitter data

# Prefix each hashtag with "#" so axis labels read as hashtags.
# NOTE(review): the intermediate was originally named
# "CancelStudentDebt_coinciding_hashtags" — an apparent copy-paste from a
# different tutorial; renamed to match the #ValentinesDay data it holds.
valentinesday_hashtag_counts <- ValentinesDay_coinciding_hashtags %>%
  mutate(hashtag = paste0("#", hashtags))
# Horizontal bar chart of hashtags most often used with #ValentinesDay;
# geom_col() is the idiomatic equivalent of geom_bar(stat = "identity")
coincident_hashtags_plot <-
  ggplot(valentinesday_hashtag_counts, aes(x = reorder(hashtag, n), y = n)) +
  geom_col() +
  coord_flip() +
  xlab("") +
  ylab("Frequency") +
  ggtitle("Hashtags Most Frequently Used Along With #ValentinesDay") +
  labs(caption = "Data Collected from Twitter REST API via rtweet")
# Print "coincident_hashtags_plot"
coincident_hashtags_plot

Using rtweet’s visualization functions: time series

# Hourly time series of the collected #ValentinesDay tweets; the subtitle
# is built from the actual date range present in the data (not a fixed day)
ts_plot(valentine_tweets, by = "hours") +
  labs(
    x = NULL, y = NULL,
    title = "Frequency of tweets with #ValentinesDay",
    subtitle = paste0(
      format(min(valentine_tweets$created_at), "%d %B %Y"),
      " to ",
      format(max(valentine_tweets$created_at), "%d %B %Y")
    ),
    caption = "Data collected from Twitter's REST API via rtweet"
  ) +
  theme_minimal()

Student Exercise: Select a hashtag, and make a visualization of the 15 hashtags most frequently used along with your chosen hashtag