---
title: "WHO-2022"
output: html_document
---

# How to get the tweets posted by the followers of WHO on Twitter

```{r}
# first, get the user IDs of the accounts following WHO
library(rtweet)
who_flw <- get_followers("WHO", n = 8000000, retryonratelimit = TRUE)
```

```{r}
# second, find the screen names and profile data of the users following WHO;
# lookup_users() is rate limited, so the IDs are processed in batches
lookup_many_users <- function(users, twitter_token = NULL, retry_limit = 5){
  require(rtweet)
  breaks <- seq(1, length(users$user_id), 89999)
  if(breaks[length(breaks)] != length(users$user_id)){
    breaks <- c(breaks, length(users$user_id))
  }
  user_details <- NULL
  for(i in 1:(length(breaks) - 1)){
    attempt <- 0
    while(is.null(user_details) && attempt <= retry_limit){
      attempt <- attempt + 1
      try({
        user_details <- lookup_users(users$user_id[breaks[i]:breaks[i + 1]])
        Sys.sleep(15 * 60) # wait 15 minutes for the rate limit to reset before proceeding
      })
    }
    if(is.null(user_details)){
      stop("failed to get users")
    }
    if(i == 1){
      all_user_details <- user_details
    } else {
      all_user_details <- rbind(all_user_details, user_details)
    }
    user_details <- NULL
  }
  return(all_user_details)
}
```

```{r}
# run the lookup and save the results in the Rds and csv formats
all_user_details <- lookup_many_users(who_flw)
saveRDS(all_user_details, file = "~/who_flw/who_flw9.Rds")
write_as_csv(all_user_details, file_name = "~/who_flw/who_flw9.csv",
             prepend_ids = TRUE, na = "", fileEncoding = "UTF-8")
```

# Now we use the descriptions in the profiles of accounts following WHO to classify them into six Twitter user categories

```{r}
# third, get the descriptions in user profiles
who_flw_user_data <- users_data(who_flw_all)
# save the results in Rds format
saveRDS(who_flw_user_data, file = "~/who_flw/who_user_data.Rds")
```

```{r}
# then, clean the strings in the descriptions
# load the libraries we need
library(dplyr)
library(reshape2)
library(stringr)
library(ggplot2)
library(vegan)
library(readxl)
library(quanteda)
library(tidyr)
library(tidytext)
library(tidyverse)
library(UpSetR)

# remove duplicated users in the dataset, if any
users <- who_flw_user_data
users <- users[!duplicated(users$user_id), ]
# delete numbers
users$description <- gsub("[0-9]+", "", users$description)
# convert to lower case
users$description <- tolower(users$description)
# delete emojis and other non-ASCII characters
users$description <- gsub("[^\x01-\x7F]", "", users$description)
# delete urls
users$description <- gsub(" ?(f|ht)(tp)(s?)(://)(.*)[.|/](.*)", "", users$description)
# delete punctuation
punct <- '[]\\?!\"\'$%&(){}+*/:;,._`|~\\[<=>\\^-]'
users$description <- gsub(punct, "", users$description)
# drop NULL and missing descriptions
users <- users %>% filter(!grepl('NULL', description))
users <- users %>% filter(!is.na(description))
```
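Before running the classification, it can help to eyeball the cleaning rules on a couple of made-up descriptions. The sketch below (the two example strings are hypothetical, not from the dataset) repeats the same substitutions as above, reusing the `punct` pattern, so you can confirm that numbers, URLs, non-ASCII characters, and punctuation really are stripped.

```{r}
# hypothetical example descriptions, used only to check the cleaning rules above
demo <- data.frame(
  user_id = c("1", "2"),
  description = c("Professor of Epidemiology, PhD (2005) 🌍 https://example.org",
                  "Reporter @ BBC News | covering global health since 2020!"),
  stringsAsFactors = FALSE
)
demo$description <- gsub("[0-9]+", "", demo$description)                            # numbers
demo$description <- tolower(demo$description)                                       # case
demo$description <- gsub("[^\x01-\x7F]", "", demo$description)                      # emojis / non-ASCII
demo$description <- gsub(" ?(f|ht)(tp)(s?)(://)(.*)[.|/](.*)", "", demo$description) # urls
demo$description <- gsub(punct, "", demo$description)                               # punctuation
demo$description
```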
```{r}
# now that the descriptions in the profiles are cleaned, it is time to classify the Twitter
# accounts following WHO into six categories. The following code is adapted and modified from
# Toupin, R., Millerand, F., & Larivière, V. (2019). Scholarly communication or public
# communication of science? Assessing who engage with climate change research on Twitter.
# In 17th International Conference on Scientometrics and Informetrics (ISSI 2019) with a
# special STI conference track, Rome, Italy.

## =================
# Academia (except healthcare)
## =================
acad <- paste0(
  "\\blecturer\\b|\\bprofessor\\b|\\bPhD\\b|\\bstudent\\b|\\bPh.D.\\b|\\bpostdoc\\b|",
  "\\bpostdoctoral\\b|\\bdoctoral\\b|\\bmsc\\b|\\bmaster\\b|\\bMS\\b|\\bBS\\b|",
  "\\bbachelor\\b|\\bundergrad\\b|\\bgrad\\b|\\bgraduate\\b|\\bundergraduate\\b|",
  "\\bscientist\\b|\\bpostgrad\\b|\\bfaculty\\b|\\bchancellor\\b|\\buniversity\\b|",
  "\\bcollege\\b|\\bprovost\\b|\\bvice-provost\\b"
)

## =================
# Politics (power)
## =================
polit <- paste0(
  "\\bcampaign\\b|\\bcampaigner\\b|\\brepublican\\b|\\bdemocrat\\b|\\bsenator\\b|",
  "\\bmayor\\b|\\bgovernor\\b|\\bminister\\b|\\bministry\\b|\\bcounselor\\b|",
  "\\bparliament\\b|\\bpolitician\\b|\\brepresentative\\b|\\bstatesman\\b|",
  "\\bstateswoman\\b|\\bcongressman\\b|\\bcongresswoman\\b|\\bspokesman\\b|\\bspokeswoman\\b|",
  "\\bsenate\\b|\\bchairwoman\\b|\\bchairman\\b|\\brepresenting\\b|\\bcongressional\\b|",
  "\\bcommissioner\\b|\\bambassador\\b|\\bcongress\\b|\\bcouncilmember\\b|",
  "\\bassemblymember\\b|\\bassemblywoman\\b|\\bassemblyman\\b|\\bassembly\\b|",
  "\\bcouncilman\\b|\\bcouncilwoman\\b|\\b@WhiteHouse\\b"
)

## =================
# Print and Electronic Media
## =================
medi <- paste0(
  "\\bwriter\\b|\\bauthor\\b|\\bjournalist\\b|\\bblogger\\b|\\bcolumnist\\b|",
  "\\bcommunicator\\b|\\breporter\\b|\\bproducer\\b|\\bstoryteller\\b|\\bpodcaster\\b|",
  "\\byoutuber\\b|\\bnews\\b|\\bnewsman\\b|\\bnewswoman\\b|\\bpressman\\b|\\bpresswoman\\b|",
  "\\bjournal\\b|\\bpublish\\b|\\bpublisher\\b|\\bpublishing\\b|\\bpeer-reviewed\\b|\\breviewed\\b|",
  "\\bpress\\b|\\bfrancis\\b|\\belsevier\\b|\\bwiley\\b|\\bemerald\\b|\\bwolters\\b|\\breuters\\b|",
  "\\bblackwell\\b|\\bmdpi\\b|\\bpalgrave\\b|\\broutledge\\b|\\bspringer\\b|\\bsage\\b|\\bclarivate\\b|",
  "\\bbmc\\b|\\bbmj\\b|\\bplos\\b|\\bieee\\b|\\bcnn\\b|\\bfox\\b|\\bmsnbc\\b|\\bbbc\\b|\\bcbs\\b|",
  "\\bnbc\\b|\\babc\\b|\\bwion\\b|\\bcnbc\\b|\\bbroadcaster\\b|\\bcgtn\\b|\\bjazeera\\b"
)

## =================
# Healthcare professionals (academia and industry)
## =================
heal <- paste0(
  "\\bphysician\\b|\\bdoctor\\b|\\bnurse\\b|\\bmidwife\\b|\\bdentist\\b|\\bpharmacist\\b|",
  "\\bmicrobiologist\\b|\\bvirologist\\b|\\boncologist\\b|\\bhematologist\\b|\\bepidemiologist\\b|",
  "\\bphysiologist\\b|\\bpathologist\\b|\\bendocrinologist\\b|\\bnephrologist\\b|\\bneurologist\\b|",
  "\\bgeneticist\\b|\\bgeriatrician\\b|\\bsurgeon\\b|\\boptometrist\\b|\\bophthalmologist\\b|",
  "\\btherapist\\b|\\bpodiatrist\\b|\\bchiropodist\\b|\\bpedorthist\\b|\\bpediatrician\\b|",
  "\\bimmunologist\\b|\\banesthesiologist\\b|\\bcardiologist\\b|\\bdermatologist\\b|",
  "\\bgastroenterologist\\b|\\bobstetrician\\b|\\bgynecologists\\b|\\bosteopath\\b|",
  "\\botolaryngologist\\b|\\bphysiatrist\\b|\\bpsychiatrist\\b|\\bpulmonologist\\b|",
  "\\bradiologist\\b|\\brheumatologist\\b|\\burologist\\b|",
  "\\binfectious disease physician\\b|\\binfectious disease specialist\\b|",
  "\\bjohnson & johnson\\b|\\broche\\b|\\bpfizer\\b|\\bnovartis\\b|\\bmerck\\b|",
  "\\bglaxosmithkline\\b|\\bsanofi\\b|\\babbvie\\b|\\btakeda\\b|\\bbayer\\b|",
  "\\bbristol-myers\\b|\\bpharmaceutical\\b|\\bastrazeneca\\b|\\bamgen\\b|\\bgilead\\b|",
  "\\bbiogen\\b|\\bmoderna\\b|\\bsinovac\\b|\\bnovavax\\b|\\bcfra\\b|\\ballergan\\b|",
  "\\babbott\\b|\\bstryker\\b|\\bregeneron\\b"
)
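# Optional sanity check (not part of the original script): confirm the patterns defined so
# far compile and catch an obvious example each before defining the remaining categories.
stopifnot(grepl(acad,  "associate professor of economics",  ignore.case = TRUE))
stopifnot(grepl(polit, "member of parliament",              ignore.case = TRUE))
stopifnot(grepl(medi,  "freelance journalist and blogger",  ignore.case = TRUE))
stopifnot(grepl(heal,  "icu nurse and midwife",             ignore.case = TRUE))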
## =================
# Legal professionals (courts, justice, law, attorney) (legal structure)
## =================
leg <- paste0(
  "\\blawyer\\b|\\battorney\\b|\\bcourt\\b|\\bjustice\\b|\\blawmaker\\b|\\bjudge\\b|",
  "\\bsolicitor\\b|\\bbarrister\\b|\\bjurist\\b|\\bjury\\b|\\bparalegal\\b|\\bcourtroom\\b|",
  "\\barbitrator\\b|\\blegislator\\b"
)

## =================
# Private sector (corporations, Inc., company, CEO, for-profits) (except drug and pharma companies)
## =================
priv <- paste0(
  "\\bcorporation\\b|\\bInc.\\b|\\bceo\\b|\\bcorporate\\b|\\bcompany\\b|\\benterprise\\b|",
  "\\bcfo\\b|\\bcio\\b|\\bcoo\\b|\\bcmo\\b|\\bcto\\b|\\bcco\\b|\\bfirm\\b"
)

## =================
# integration
## =================
# ignore.case = TRUE so that mixed-case terms in the patterns (e.g. PhD, MS) still match
# the lowercased descriptions
users$acad  <- ifelse(grepl(acad,  users$description, ignore.case = TRUE), 1, 0)
users$polit <- ifelse(grepl(polit, users$description, ignore.case = TRUE), 1, 0)
users$medi  <- ifelse(grepl(medi,  users$description, ignore.case = TRUE), 1, 0)
users$heal  <- ifelse(grepl(heal,  users$description, ignore.case = TRUE), 1, 0)
users$leg   <- ifelse(grepl(leg,   users$description, ignore.case = TRUE), 1, 0)
users$priv  <- ifelse(grepl(priv,  users$description, ignore.case = TRUE), 1, 0)

academia <- filter(users, users$acad == 1)
politics <- filter(users, users$polit == 1)
media    <- filter(users, users$medi == 1)
health   <- filter(users, users$heal == 1)
legal    <- filter(users, users$leg == 1)
private  <- filter(users, users$priv == 1)
notassigned <- filter(users, users$acad == 0, users$polit == 0, users$medi == 0,
                      users$heal == 0, users$leg == 0, users$priv == 0)

academialist <- as.list(academia$user_id)
politicslist <- as.list(politics$user_id)
medialist    <- as.list(media$user_id)
healthlist   <- as.list(health$user_id)
legallist    <- as.list(legal$user_id)
privatelist  <- as.list(private$user_id)

fullist <- list(Academia             = c(academialist),
                Politicians          = c(politicslist),
                EP_Media             = c(medialist),
                Health_Professionals = c(healthlist),
                Legal_Professionals  = c(legallist),
                Private_Sector       = c(privatelist))

# save the data
saveRDS(fullist, file = "~/who_main_data/fullist.Rds")
```

```{r}
# Now it's time to get the most recent 200 tweets posted by each Twitter user category,
# starting with academia
# First, remove duplicated users in the category, if any
tweets_dup <- academia[!duplicated(academia$user_id), ]
users <- tweets_dup

# get_timeline() is rate limited, so request timelines in batches and wait for the
# limit to reset between batches (requires the glue package for the progress messages)
library(glue)
get_timeline_unlimited <- function(users, n){
  if (length(users) == 0){
    return(NULL)
  }
  rl <- rate_limit(query = "get_timeline")
  if (length(users) <= rl$remaining){
    print(glue("Getting data for {length(users)} users"))
    tweets <- get_timeline(users, n, check = FALSE)
  } else {
    if (rl$remaining > 0){
      users_first <- users[1:rl$remaining]
      users_rest <- users[-(1:rl$remaining)]
      print(glue("Getting data for {length(users_first)} users"))
      tweets_first <- get_timeline(users_first, n, check = FALSE)
      rl <- rate_limit(query = "get_timeline")
    } else {
      tweets_first <- NULL
      users_rest <- users
    }
    wait <- rl$reset + 0.1
    print(glue("Waiting for {round(wait, 2)} minutes"))
    Sys.sleep(wait * 60)
    tweets_rest <- get_timeline_unlimited(users_rest, n)
    tweets <- bind_rows(tweets_first, tweets_rest)
  }
  return(tweets)
}

# get the most recent 200 tweets
get <- get_timeline_unlimited(users$screen_name, n = 200)
# results can be saved
# saveRDS(get, file = "~/newdata/academia_200.Rds")
```

```{r}
# only keep the tweets posted from January 1 to July 31, 2020 for academia
# (and, later, for the other categories)
library(lubridate)

# quick look at the monthly distribution of tweets
get %>%
  dplyr::select(created_at) %>%
  count(date = floor_date(created_at, "month"), sort = TRUE) %>%
  filter(date >= as.Date("2020-01-01"))

get_limit_date <- get %>%
  filter(created_at >= as.Date("2020-01-01") & created_at <= as.Date("2020-07-31"))
saveRDS(get_limit_date, file = "~/newdata/academia_date.Rds")
```
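The chunks above handle one category (academia). Since identical timeline and date-filter steps are repeated for the other five categories, a loop over `fullist` is one way to avoid copying the code six times. The sketch below is illustrative only: it reuses `get_timeline_unlimited()` from above, and the output file names are assumptions rather than the paths used in the original workflow.

```{r}
# A sketch (not part of the original workflow): run the same timeline + date-filter
# steps for every category in fullist. Output file names are illustrative.
for (cat_name in names(fullist)) {
  ids    <- unique(unlist(fullist[[cat_name]]))   # de-duplicated user IDs for this category
  tl     <- get_timeline_unlimited(ids, n = 200)  # most recent 200 tweets per user
  tl_sub <- tl %>%
    filter(created_at >= as.Date("2020-01-01"),
           created_at <= as.Date("2020-07-31"))
  saveRDS(tl_sub, file = file.path("~/newdata", paste0(cat_name, "_date.Rds")))
}
```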
```{r}
# now only keep the English-language tweets for academia (and, later, the other categories)
get_limit_lang <- get_limit_date %>% filter(lang == "en")
# save the results for academia and the other categories
saveRDS(get_limit_lang, file = "~/newdata/academia_eng.Rds")

# let's take a look at the top 100 hashtags in academia and the other categories
tophashtags <- get_limit_lang %>%
  unnest_tokens(word, text, token = "tweets") %>%
  filter(str_detect(word, "^#")) %>%
  select("status_id", "word") %>%
  rename(tweet = "status_id", name = "word") %>%
  head(100)

# let's take a look at the top 100 words in academia and the other categories
topwords <- get_limit_lang %>%
  unnest_tokens(word, description) %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE) %>%
  head(100)

# another way to get the top 100 words: also drop a few extra high-frequency words
extrawords <- c("the", "can", "get", "got", "one", "dont", "even", "may", "but", "will",
                "much", "first", "see", "new", "many", "less", "now", "well", "like",
                "often", "every", "said", "two", "amp", "day", "dr")

# clean the corpus and count
get_limit_lang %>%
  unnest_tokens(word, text, token = "tweets") %>%
  filter(!word %in% stop_words$word,
         !word %in% str_remove_all(stop_words$word, "'"),
         !word %in% extrawords,
         str_detect(word, "[a-z]"),
         !str_detect(word, "^#"),
         !str_detect(word, "@\\S+")) %>%
  count(word, sort = TRUE) %>%
  top_n(100)

# let's take a look at the top 100 mentions in academia and the other categories
get_limit_lang %>%
  unnest_tokens(mentions, text, token = "tweets", to_lower = FALSE) %>%
  filter(str_detect(mentions, "^@")) %>%
  count(mentions, sort = TRUE) %>%
  top_n(100)
```

```{r}
# select only the variables needed for this project in each Twitter user category
get_limit <- get_limit_lang %>%
  select(user_id, status_id, screen_name, text, favorite_count, retweet_count,
         reply_to_screen_name, is_quote, is_retweet, quoted_text, retweet_text,
         location, description, followers_count, friends_count, favourites_count,
         urls_t.co)
saveRDS(get_limit, file = "~/newdata/academia_variables.Rds")
```

# Performing the Granger causality tests

```{r}
library(tidyverse)
library(dplyr)
library(lubridate)
library(stringi)
library(tidytext)
library(readxl)
library(rtweet)
library(dynamac)
library(forecast)
library(tseries)
library(urca)
library(TSstudio)
library(vars)
library(knitr)

WHO_freq   <- read_excel("~/Documents/Dissertation_Dataset/1-Dissertation-Final-Files/who_agent_freq/WHO_freq.xls")
media_freq <- read_excel("~/Documents/Dissertation_Dataset/1-Dissertation-Final-Files/who_agent_freq/media_freq.xlsx")
agent <- media_freq

# aggregate the daily frequencies into weekly totals
who_sub_week <- WHO_freq %>%
  group_by(week = week(Date)) %>%
  summarise(ill_effect = sum(ill_effect))
age_sub_week <- agent %>%
  group_by(week = week(Date)) %>%
  summarise(ill_effect = sum(ill_effect))
```

```{r}
# remove the zeros at the start of the dataset by dropping the first two weeks
who_sub_week <- who_sub_week[3:26, ]
age_sub_week <- age_sub_week[3:26, ]
```

```{r}
who_1 <- who_sub_week$ill_effect
age_1 <- age_sub_week$ill_effect

# declare the time series and run ADF (Augmented Dickey-Fuller) stationarity tests
who_1 <- ts(who_1)
adfwho <- adf.test(who_1)
print(adfwho)

age_1 <- ts(age_1)
adfage <- adf.test(age_1)
print(adfage)
```

```{r}
# when necessary, apply a Box-Cox transformation (normality issue) and re-test
who_1 <- BoxCox(who_sub_week$ill_effect, lambda = "auto")
who_1 <- ts(who_1)
adfwho <- adf.test(who_1)
print(adfwho)

age_1 <- BoxCox(age_sub_week$ill_effect, lambda = "auto")
age_1 <- ts(age_1)
adfage <- adf.test(age_1)
print(adfage)
```
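The next two chunks difference the series by hand and re-run the ADF test after each pass. The same decision rule can be written as a small helper, and `ndiffs()` from the forecast package (already loaded above) offers a quick cross-check of how many differences should be needed; this is a convenience sketch, not part of the original workflow.

```{r}
# Sketch: difference a series until the ADF test rejects the unit-root null at alpha,
# mirroring the manual first/second differencing in the chunks below.
make_stationary <- function(x, alpha = 0.05, max_d = 2) {
  d <- 0
  while (adf.test(x)$p.value > alpha && d < max_d) {
    x <- diff(x)
    d <- d + 1
  }
  list(series = ts(x), n_diffs = d)
}

# cross-check: how many differences does forecast::ndiffs() suggest for the weekly series?
ndiffs(ts(who_sub_week$ill_effect))
```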
```{r}
# make the series stationary using first differences, then re-run the ADF tests
# (differencing costs degrees of freedom; the series should be covariance stationary
# before fitting the VAR)
who_1 <- diff(who_1)
who_1 <- ts(who_1)
adfwho <- adf.test(who_1)
print(adfwho)

age_1 <- diff(age_1)
age_1 <- ts(age_1)
adfage <- adf.test(age_1)
print(adfage)
```

```{r}
# if the series are still not stationary, use second (or higher) differences
who_1 <- diff(who_1, differences = 2)
who_1 <- ts(who_1)
adfwho <- adf.test(who_1)
print(adfwho)

age_1 <- diff(age_1, differences = 2)
age_1 <- ts(age_1)
adfage <- adf.test(age_1)
print(adfage)

# drop the first observation of each series so the two series stay aligned
who_1 <- ts(who_1[-1])
age_1 <- ts(age_1[-1])
```

```{r}
# combine the two series and omit NAs
combined <- na.omit(cbind(who_1, age_1))

# optimal lag length (the VAR below uses p = 5)
lag <- VARselect(combined, lag.max = 10, type = "const")
lag$selection
```

```{r}
model <- VAR(combined, p = 5, type = "const", season = NULL)
summary(model)
# stargazer(model["varresult"], type = "html", out = "test.text")
# acf(residuals(model)[, 1])
# explanation of VAR models: https://online.stat.psu.edu/stat510/lesson/11/11.2
```

```{r}
# save the results to a text file if needed
# sink("organization_freq_credibility.text")
print(summary(model))
# sink()
```

```{r}
# serial correlation (we do not want serial correlation inside the VAR model);
# if the p-value is greater than 0.05, the model passes the test
serial <- serial.test(model, lags.pt = 12, type = "PT.asymptotic")
serial

# heteroscedasticity
Arch1 <- arch.test(model, lags.multi = 10, multivariate.only = TRUE)
Arch1

# normal distribution of the residuals (they should be normal);
# if the p-value is larger than 0.05, the model passes the normality test
Norm1 <- normality.test(model, multivariate.only = TRUE)
Norm1

# structural breaks in the residuals (model stability);
# the system is stable if the curve does not cross the red lines
stability1 <- stability(model, type = "OLS-CUSUM")
plot(stability1)

# Granger causality tests
causality(model, cause = "who_1")$Granger # does WHO Granger-cause the agent?
causality(model, cause = "age_1")$Granger # does the agent Granger-cause WHO?
```
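For reporting, the two Granger tests can be collected into a small table. This is a convenience sketch using the objects created above; the table itself is not part of the original output.

```{r}
# Sketch: gather the Granger causality test statistics and p-values into one data frame
gc_who <- causality(model, cause = "who_1")$Granger
gc_age <- causality(model, cause = "age_1")$Granger
data.frame(
  cause     = c("who_1", "age_1"),
  statistic = c(gc_who$statistic, gc_age$statistic),
  p_value   = c(gc_who$p.value, gc_age$p.value)
)
```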