---
title: "WHO-2022"
output: html_document
---
# How to get the tweets posted by the followers of WHO on Twitter
```{r}
# First, get the user IDs of accounts following WHO
library(rtweet)
who_flw <- get_followers("WHO", n = 8000000, retryonratelimit = TRUE)
```
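If the pull is interrupted, it can be resumed from rtweet's pagination cursor instead of starting over. A minimal sketch, assuming rtweet 0.7-style `next_cursor()`/`page` semantics:
```{r}
# Resume a partial follower pull from the stored pagination cursor
who_flw_more <- get_followers("WHO", n = 75000,
                              page = next_cursor(who_flw),
                              retryonratelimit = TRUE)
who_flw <- rbind(who_flw, who_flw_more)
```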
```{r}
# Second, look up the screen names and profile data of the users following WHO.
# lookup_users() accepts at most 90,000 IDs per 15-minute rate-limit window,
# so we process the IDs in batches and wait between batches.
lookup_many_users <- function(users, retry_limit = 5){
  require(rtweet)
  ids <- users$user_id
  breaks <- seq(1, length(ids), 89999)
  if(breaks[length(breaks)] != length(ids)){
    breaks <- c(breaks, length(ids))
  }
  user_details <- NULL
  all_user_details <- NULL
  for(i in 1:(length(breaks) - 1)){
    attempt <- 0
    while(is.null(user_details) && attempt <= retry_limit){
      attempt <- attempt + 1
      try({
        user_details <- lookup_users(ids[breaks[i]:breaks[i + 1]])
        Sys.sleep(15 * 60) # wait 15 minutes for the rate limit to reset before proceeding
      })
    }
    if(is.null(user_details)){
      stop("failed to get users")
    }
    if(i == 1){
      all_user_details <- user_details
    } else {
      all_user_details <- rbind(all_user_details, user_details)
    }
    user_details <- NULL
  }
  return(all_user_details)
}
```
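A minimal usage sketch for the function above (assumes a Twitter token has already been registered with rtweet, e.g. via `create_token()`):
```{r}
# who_flw_all holds one row of profile data per follower ID
who_flw_all <- lookup_many_users(who_flw)
```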
```{r}
# Save the results in Rds or CSV format
saveRDS(who_flw_all, file = "~/who_flw/who_flw9.Rds")
write_as_csv(who_flw_all, file_name = "~/who_flw/who_flw9.csv",
             prepend_ids = TRUE, na = "", fileEncoding = "UTF-8")
```
# Now we use the descriptions in the profiles of accounts following WHO to classify them into six Twitter user categories
```{r}
# Third, get the profile data (including the descriptions) for each follower
who_flw_user_data <- users_data(who_flw_all)
# Save the results in Rds format
saveRDS(who_flw_user_data, file = "~/who_flw/who_user_data.Rds")
```
```{r}
# Next, clean the strings in the descriptions
# Load the required libraries
library(dplyr)
library(reshape2)
library(stringr)
library(ggplot2)
library(vegan)
library(readxl)
library(quanteda)
library(tidyr)
library(tidytext)
library(tidyverse)
library(UpSetR)
# Remove duplicated users in the dataset, if any
users <- who_flw_user_data
users <- users[!duplicated(users$user_id), ]
# Delete numbers
users$description <- gsub("[0-9]+", "", users$description)
# Convert to lowercase
users$description <- tolower(users$description)
# Delete emojis and other non-ASCII characters
users$description <- gsub("[^\x01-\x7F]", "", users$description)
# Delete URLs
users$description <- gsub(" ?(f|ht)(tp)(s?)(://)(.*)[.|/](.*)", "", users$description)
# Delete punctuation
punct <- '[]\\?!\"\'$%&(){}+*/:;,._`|~\\[<=>\\^-]'
users$description <- gsub(punct, "", users$description)
# Remove rows whose description is the literal string "NULL" or is missing
users <- users %>%
  filter(!grepl('NULL', description)) %>%
  filter(!is.na(description))
```
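To see what the cleaning pipeline does, here is a toy before/after example (the sample description is invented for illustration):
```{r}
# Toy illustration of the cleaning steps above
demo <- "Prof. of Epidemiology, PhD 2010 | https://example.org"
demo <- gsub("[0-9]+", "", demo)                              # drop numbers
demo <- tolower(demo)                                         # lowercase
demo <- gsub(" ?(f|ht)(tp)(s?)(://)(.*)[.|/](.*)", "", demo)  # drop URLs
demo <- gsub(punct, "", demo)                                 # drop punctuation
demo # roughly "prof of epidemiology phd"
```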
```{r}
# Now that the profile descriptions are cleaned, we classify the Twitter accounts following WHO into six categories. The following code is adapted from Toupin, R., Millerand, F., & Larivière, V. (2019). Scholarly communication or public communication of science? Assessing who engage with climate change research on Twitter. In 17th International Conference on Scientometrics and Informetrics (ISSI 2019) with a special STI conference track, Rome, Italy.
## =================
# Academia (except healthcare)
## =================
# Build each pattern with paste0() so that no literal newlines or indentation end up inside the regex
acad <- paste0(
  "\\blecturer\\b|\\bprofessor\\b|\\bPhD\\b|\\bstudent\\b|\\bPh.D.\\b|\\bpostdoc\\b|",
  "\\bpostdoctoral\\b|\\bdoctoral\\b|\\bmsc\\b|\\bmaster\\b|\\bMS\\b|\\bBS\\b|",
  "\\bbachelor\\b|\\bundergrad\\b|\\bgrad\\b|\\bgraduate\\b|\\bundergraduate\\b|",
  "\\bscientist\\b|\\bpostgrad\\b|\\bfaculty\\b|\\bchancellor\\b|\\buniversity\\b|",
  "\\bcollege\\b|\\bprovost\\b|\\bvice-provost\\b")
## =================
# Politics (power)
## =================
polit <- paste0(
  "\\bcampaign\\b|\\bcampaigner\\b|\\brepublican\\b|\\bdemocrat\\b|",
  "\\bsenator\\b|\\bmayor\\b|\\bgovernor\\b|\\bminister\\b|\\bministry\\b|",
  "\\bcounselor\\b|\\bparliament\\b|\\bpolitician\\b|\\brepresentative\\b|\\bstatesman\\b|",
  "\\bstateswoman\\b|\\bcongressman\\b|\\bcongresswoman\\b|\\bspokesman\\b|\\bspokeswoman\\b|",
  "\\bsenate\\b|\\bchairwoman\\b|\\bchairman\\b|\\brepresenting\\b|",
  "\\bcongressional\\b|\\bcommissioner\\b|\\bambassador\\b|\\bcongress\\b|\\bcouncilmember\\b|",
  "\\bassemblymember\\b|\\bassemblywoman\\b|\\bassemblyman\\b|\\bassembly\\b|\\bcouncilman\\b|",
  "\\bcouncilwoman\\b|\\b@WhiteHouse\\b")
## =================
# Print and Electronic Media
## =================
medi <- paste0(
  "\\bwriter\\b|\\bauthor\\b|\\bjournalist\\b|\\bblogger\\b|\\bcolumnist\\b|",
  "\\bcommunicator\\b|\\breporter\\b|\\bproducer\\b|\\bstoryteller\\b|\\bpodcaster\\b|",
  "\\byoutuber\\b|\\bnews\\b|\\bnewsman\\b|\\bnewswoman\\b|\\bpressman\\b|\\bpresswoman\\b|",
  "\\bjournal\\b|\\bpublish\\b|\\bpublisher\\b|\\bpublishing\\b|\\bpeer-reviewed\\b|\\breviewed\\b|",
  "\\bpress\\b|\\bfrancis\\b|\\belsevier\\b|\\bwiley\\b|\\bemerald\\b|\\bwolters\\b|\\breuters\\b|",
  "\\bblackwell\\b|\\bmdpi\\b|\\bpalgrave\\b|\\broutledge\\b|\\bspringer\\b|\\bsage\\b|\\bclarivate\\b|",
  "\\bbmc\\b|\\bbmj\\b|\\bplos\\b|\\bieee\\b|\\bcnn\\b|\\bfox\\b|\\bmsnbc\\b|\\bbbc\\b|\\bcbs\\b|",
  "\\bnbc\\b|\\babc\\b|\\bwion\\b|\\bcnbc\\b|\\bbroadcaster\\b|\\bcgtn\\b|\\bjazeera\\b")
## =================
# Healthcare professionals (academia and industry)
## =================
heal <- paste0(
  "\\bphysician\\b|\\bdoctor\\b|\\bnurse\\b|\\bmidwife\\b|\\bdentist\\b|\\bpharmacist\\b|",
  "\\bmicrobiologist\\b|\\bvirologist\\b|\\boncologist\\b|\\bhematologist\\b|\\bepidemiologist\\b|",
  "\\bphysiologist\\b|\\bpathologist\\b|\\bendocrinologist\\b|",
  "\\bnephrologist\\b|\\bneurologist\\b|\\bgeneticist\\b|\\bgeriatrician\\b|\\bsurgeon\\b|",
  "\\boptometrist\\b|\\bophthalmologist\\b|\\btherapist\\b|\\bpodiatrist\\b|",
  "\\bchiropodist\\b|\\bpedorthist\\b|\\bpediatrician\\b|\\bimmunologist\\b|",
  "\\banesthesiologist\\b|\\bcardiologist\\b|\\bdermatologist\\b|",
  "\\bgastroenterologist\\b|\\bobstetrician\\b|\\bgynecologist\\b|",
  "\\bosteopath\\b|\\botolaryngologist\\b|\\bphysiatrist\\b|\\bpsychiatrist\\b|",
  "\\bpulmonologist\\b|\\bradiologist\\b|\\brheumatologist\\b|\\burologist\\b|",
  "\\binfectious disease physician\\b|\\binfectious disease specialist\\b|\\bjohnson & johnson\\b|\\broche\\b|",
  "\\bpfizer\\b|\\bnovartis\\b|\\bmerck\\b|\\bglaxosmithkline\\b|\\bsanofi\\b|\\babbvie\\b|\\btakeda\\b|",
  "\\bbayer\\b|\\bbristol-myers\\b|\\bpharmaceutical\\b|",
  "\\bastrazeneca\\b|\\bamgen\\b|\\bgilead\\b|\\bbiogen\\b|\\bmoderna\\b|\\bsinovac\\b|",
  "\\bnovavax\\b|\\bcfra\\b|\\ballergan\\b|\\babbott\\b|",
  "\\bstryker\\b|\\bregeneron\\b")
## =================
# Legal professionals (courts, justice, law, attorney) (Legal structure)
## =================
leg <- paste0(
  "\\blawyer\\b|\\battorney\\b|\\bcourt\\b|\\bjustice\\b|\\blawmaker\\b|\\bjudge\\b|",
  "\\bsolicitor\\b|\\bbarrister\\b|\\bjurist\\b|\\bjury\\b|\\bparalegal\\b|\\bcourtroom\\b|",
  "\\barbitrator\\b|\\blegislator\\b")
## =================
# Private sector (corporations, Inc., company, CEO, for-profits) (Except drug and pharma companies)
## =================
priv <- paste0(
  "\\bcorporation\\b|\\binc\\b|\\bceo\\b|\\bcorporate\\b|\\bcompany\\b|\\benterprise\\b|",
  "\\bcfo\\b|\\bcio\\b|\\bcoo\\b|\\bcmo\\b|\\bcto\\b|\\bcco\\b|\\bfirm\\b")
## =================
#integration
## =================
# ignore.case = TRUE because the descriptions were lowercased above while some patterns contain capitals
users$acad  <- ifelse(grepl(acad,  users$description, ignore.case = TRUE), 1, 0)
users$polit <- ifelse(grepl(polit, users$description, ignore.case = TRUE), 1, 0)
users$medi  <- ifelse(grepl(medi,  users$description, ignore.case = TRUE), 1, 0)
users$heal  <- ifelse(grepl(heal,  users$description, ignore.case = TRUE), 1, 0)
users$leg   <- ifelse(grepl(leg,   users$description, ignore.case = TRUE), 1, 0)
users$priv  <- ifelse(grepl(priv,  users$description, ignore.case = TRUE), 1, 0)
academia <- filter(users, acad == 1)
politics <- filter(users, polit == 1)
media    <- filter(users, medi == 1)
health   <- filter(users, heal == 1)
legal    <- filter(users, leg == 1)
private  <- filter(users, priv == 1)
notassigned <- filter(users, acad == 0, polit == 0, medi == 0,
                      heal == 0, leg == 0, priv == 0)
academialist <- as.list(academia$user_id)
politicslist <- as.list(politics$user_id)
medialist    <- as.list(media$user_id)
healthlist   <- as.list(health$user_id)
legallist    <- as.list(legal$user_id)
privatelist  <- as.list(private$user_id)
fullist <- list(Academia = c(academialist), Politicians = c(politicslist),
  EP_Media = c(medialist), Health_Professionals = c(healthlist),
  Legal_Professionals = c(legallist), Private_Sector = c(privatelist))
# Save the data
saveRDS(fullist, file = "~/who_main_data/fullist.Rds")
```
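UpSetR was loaded above but not yet used; one way to visualize how many accounts fall into more than one category is an UpSet plot. A sketch (fromList() expects plain vectors, so the list elements are unlisted first):
```{r}
# Overlap between the six categories as an UpSet plot
upset(fromList(lapply(fullist, unlist)), nsets = 6, order.by = "freq")
```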
```{r}
# Now it's time to get the most recent 200 tweets posted by each Twitter user category, starting with academia
# First, remove duplicated users (user IDs) within the category
tweets_dup <- academia[!duplicated(academia$user_id), ]
users <- tweets_dup
get_timeline_unlimited <- function(users, n){
  require(glue) # needed for the progress messages below
  if (length(users) == 0){
    return(NULL)
  }
rl <- rate_limit(query = "get_timeline")
if (length(users) <= rl$remaining){
print(glue("Getting data for {length(users)} users"))
tweets <- get_timeline(users, n, check = FALSE)
}else{
if (rl$remaining > 0){
users_first <- users[1:rl$remaining]
users_rest <- users[-(1:rl$remaining)]
print(glue("Getting data for {length(users_first)} users"))
tweets_first <- get_timeline(users_first, n, check = FALSE)
rl <- rate_limit(query = "get_timeline")
}else{
tweets_first <- NULL
users_rest <- users
}
wait <- rl$reset + 0.1
print(glue("Waiting for {round(wait,2)} minutes"))
Sys.sleep(wait * 60)
tweets_rest <- get_timeline_unlimited(users_rest, n)
tweets <- bind_rows(tweets_first, tweets_rest)
}
return(tweets)
}
# Get the most recent 200 tweets per user
academia_200 <- get_timeline_unlimited(users$screen_name, n = 200)
# Results can be saved
#saveRDS(academia_200, file = "~/newdata/academia_200.Rds")
```
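A quick sanity check on the pull: count how many tweets came back per user (at most 200 each):
```{r}
academia_200 %>% count(screen_name, sort = TRUE)
```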
```{r}
# Keep only the tweets posted from January 1 to July 31, 2020 for academia and the other categories
library(lubridate)
academia_200 %>%
  dplyr::select(created_at) %>%
  count(date = floor_date(created_at, "month"), sort = TRUE) %>%
  filter(date >= as.Date("2020-01-01"))
get_limit_date <- academia_200 %>%
  filter(created_at >= as.Date("2020-01-01") & created_at <= as.Date("2020-07-31"))
saveRDS(get_limit_date, file = "~/newdata/academia_date.Rds")
```
```{r}
# Now keep only the English-language tweets for academia and the other categories
get_limit_lang <- get_limit_date %>%
  filter(lang == "en")
# Save the results for academia and the other categories
saveRDS(get_limit_lang, file = "~/newdata/academia_eng.Rds")
# Let's look at the top 100 hashtags in academia and the other categories
tophashtags <- get_limit_lang %>%
  unnest_tokens(word, text, "tweets") %>%
  filter(str_detect(word, "^#")) %>%
  count(word, sort = TRUE) %>%
  head(100)
# Let's look at the top 100 words in academia and the other categories
topwords <- get_limit_lang %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE) %>%
  head(100)
# Another way to get the top 100 words, also filtering a custom stop list
extrawords <- c("the", "can", "get", "got", "one",
                "dont", "even", "may", "but", "will",
                "much", "first", "see", "new",
                "many", "less", "now", "well", "like",
                "often", "every", "said", "two", "amp", "day", "dr")
# Clean the corpus and count
get_limit_lang %>%
  unnest_tokens(word, text, token = "tweets") %>%
  filter(!word %in% stop_words$word,
         !word %in% str_remove_all(stop_words$word, "'"),
         !word %in% extrawords,
         str_detect(word, "[a-z]"),
         !str_detect(word, "^#"),
         !str_detect(word, "@\\S+")) %>%
  count(word, sort = TRUE) %>%
  top_n(100)
# Let's look at the top 100 mentions in academia and the other categories
get_limit_lang %>%
unnest_tokens(mentions, text, "tweets", to_lower = FALSE) %>%
filter(str_detect(mentions, "^@")) %>%
count(mentions, sort = TRUE) %>%
top_n(100)
```
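Since ggplot2 is already loaded, the top words can be plotted directly; a sketch using the `topwords` object created above:
```{r}
# Plot the 20 most frequent words as a horizontal bar chart
topwords %>%
  head(20) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "frequency", title = "Top 20 words in academia tweets")
```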
```{r}
# Select only the variables needed for this project in each Twitter user category
get_limit <- get_limit_lang %>%
  select(user_id, status_id, screen_name, text, favorite_count, retweet_count, reply_to_screen_name,
         is_quote, is_retweet, quoted_text, retweet_text, location, description,
         followers_count, friends_count, favourites_count, urls_t.co)
saveRDS(get_limit, file = "~/newdata/academia_variables.Rds")
```
# Performing the Granger Causality Tests
```{r}
library(tidyverse)
library(dplyr)
library(lubridate)
library(stringi)
library(tidytext)
library(readxl)
library(rtweet)
library(dynamac)
library(forecast)
library(tseries)
library(urca)
library(TSstudio)
library(vars)
library(knitr)
WHO_freq <- read_excel("~/Documents/Dissertation_Dataset/1-Dissertation-Final-Files/who_agent_freq/WHO_freq.xls")
media_freq <- read_excel("~/Documents/Dissertation_Dataset/1-Dissertation-Final-Files/who_agent_freq/media_freq.xlsx")
agent <- media_freq
# Convert the daily data to weekly data
# (assuming the weekly series is the sum of the daily frequencies)
who_sub_week <- WHO_freq %>% group_by(week = week(Date)) %>%
  summarise(ill_effect = sum(ill_effect))
age_sub_week <- agent %>% group_by(week = week(Date)) %>%
  summarise(ill_effect = sum(ill_effect))
```
```{r}
# Remove the zeros at the start of the series by dropping the first two weeks
who_sub_week <- who_sub_week[3:26, ]
age_sub_week <- age_sub_week[3:26, ]
```
```{r}
who_1 <- who_sub_week$ill_effect
age_1 <- age_sub_week$ill_effect
# Declare the time series and run ADF (Augmented Dickey-Fuller) stationarity tests
who_1 <- ts(who_1)
adfwho <- adf.test(who_1)
print(adfwho)
age_1 <- ts(age_1)
adfage <- adf.test(age_1)
print(adfage)
```
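The ADF results can be cross-checked with a KPSS test from the already-loaded tseries package; note that the two tests have opposite null hypotheses (for KPSS, the null is stationarity):
```{r}
# KPSS cross-check: here a large p-value supports stationarity
kpss.test(who_1, null = "Level")
kpss.test(age_1, null = "Level")
```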
```{r}
# When necessary, apply a Box-Cox transformation (normality issues)
who_1 <- BoxCox(who_sub_week$ill_effect, lambda = "auto")
who_1 <- ts(who_1)
adfwho <- adf.test(who_1)
print(adfwho)
age_1 <- BoxCox(age_sub_week$ill_effect, lambda = "auto")
age_1 <- ts(age_1)
adfage <- adf.test(age_1)
print(adfage)
```
```{r}
#make it stationary using first differences
who_1 <- diff(who_1)
who_1 <- ts(who_1)
adfwho <- adf.test(who_1)
print(adfwho)
age_1 <- diff(age_1)
age_1 <- ts(age_1)
adfage <- adf.test(age_1)
print(adfage)
# Differencing costs degrees of freedom; rerun the ADF tests after each difference
# The series should be covariance stationary before fitting the VAR
```
```{r}
# Make it stationary using second (or higher) differences if it is not yet stationary
who_1 <- diff(who_1, differences = 2)
who_1 <- ts(who_1)
adfwho <- adf.test(who_1)
print(adfwho)
age_1 <- diff(age_1, differences = 2)
age_1 <- ts(age_1)
adfage <- adf.test(age_1)
print(adfage)
# Drop the first observation so the two differenced series align (both start March 29)
who_1 <- who_1[-1]
who_1 <- ts(who_1)
age_1 <- age_1[-1]
age_1 <- ts(age_1)
```
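Before fitting the VAR, it helps to eyeball the differenced series; a base-R sketch:
```{r}
# Visual check: both differenced series should fluctuate around a constant mean
ts.plot(who_1, age_1, gpars = list(col = c("blue", "red"),
                                   ylab = "differenced weekly frequency"))
legend("topright", legend = c("WHO", "agent"), col = c("blue", "red"), lty = 1)
```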
```{r}
# Combine the two series and omit NAs
combined <- na.omit(cbind(who_1, age_1))
# Choose the optimal lag length
lag <- VARselect(combined, lag.max = 10, type = "const")
lag$selection
```
```{r}
# Fit the VAR with the lag order suggested by VARselect()
model <- VAR(combined, p = 5, type = "const", season = NULL)
summary(model)
#stargazer(model["varresult"], type = "html", out = "test.text")
#acf(residuals(model)[,1])
# An excellent explanation of VAR models: https://online.stat.psu.edu/stat510/lesson/11/11.2
```
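A quick stability check on the fitted model: the vars package's roots() returns the moduli of the companion-matrix eigenvalues, which should all be below 1 for a stable VAR:
```{r}
# All moduli should be < 1 for a stable VAR
roots(model)
```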
```{r}
# Save the results to a text file
sink("organization_freq_credibility.text")
print(summary(model))
sink()
```
```{r}
# Serial correlation (we don't want serial correlation inside the VAR model)
serial <- serial.test(model, lags.pt = 12, type = "PT.asymptotic")
serial # if the p-value is greater than 0.05, the model passes the test, which is good
# Heteroscedasticity
Arch1 <- arch.test(model, lags.multi = 10, multivariate.only = TRUE)
Arch1
# Normal distribution of the residuals (they should be normal)
Norm1 <- normality.test(model, multivariate.only = TRUE)
Norm1 # if the p-value is larger than 0.05, the residuals pass the normality test
# Structural breaks in the residuals (model stability)
stability1 <- stability(model, type = "OLS-CUSUM")
plot(stability1) # the system is stable if the curve does not cross the red lines
# Granger causality tests in both directions
causality(model, cause = "who_1")$Granger # does WHO Granger-cause the agent?
causality(model, cause = "age_1")$Granger # does the agent Granger-cause WHO?
```