#{r} #require(devtools) #install_version("quanteda", version = "1.2.0", repos = "http://cran.us.r-project.org") #

library(readtext)
library(quanteda)
library(tidyverse)
library(stm)
library(tidytext)
library(haven)
library(data.table)

# UNGD data are available on the Harvard Dataverse at https://doi.org/10.7910/DVN/0TJX8Y


# Folder that holds the UNGD data; the speeches are read from its "TXT/"
# subfolder. Point this at the local copy of the data.
DATA_DIR <- "~/Dropbox/Research/UNGDC projects/UN Data/"

# Read every file matched by TXT/* and split each filename on "_" into the
# Country, Session and Year document variables.
ungd_files <- readtext(
  paste0(DATA_DIR, "TXT/*"),
  docvarsfrom = "filenames",
  dvsep = "_",
  docvarnames = c("Country", "Session", "Year")
)

# Shorten the document ids: drop the ".txt" extension, then the first
# "_NN" two-digit group (presumably the session number -- confirm against
# the filenames). The dot is escaped and the pattern anchored to the end of
# the id, because an unescaped "." is a regex wildcard and ".txt" would
# also match any other character followed by "txt".
ungd_files$doc_id <- ungd_files$doc_id %>%
  str_replace("\\.txt$", "") %>%
  str_replace("_\\d{2}", "")
# Fix the random-number seed so later stochastic steps are reproducible.
set.seed(123)

# Three-letter country codes (character vectors, matched against the
# Country docvar). `EU` holds six members; `wave1` ... `wave7` hold the
# countries of each later accession wave.
EU <- c("BEL", "FRA", "DEU", "ITA", "LUX", "NLD")
wave1 <- c("DNK", "IRL", "GBR")
wave2 <- c("GRC")
wave3 <- c("ESP", "PRT")
wave4 <- c("AUT", "FIN", "SWE")
wave5 <- c(
  "CZE", "HUN", "POL", "EST", "LVA",
  "LTU", "CYP", "MLT", "SVK", "SVN"
)
wave6 <- c("BGR", "ROU")
wave7 <- c("HRV")

# Period indicator stored as 0/1: 1 for speeches with Year after 2003,
# 0 otherwise.
ungd_files[["accession"]] <- as.numeric(ungd_files[["Year"]] > 2003)

# is.eu: 0/1 flag per speech.
#   * countries listed in `EU` are flagged in every year;
#   * countries in `wave1` ... `wave7` are flagged only for years strictly
#     after their wave's cutoff (1972, 1980, 1985, 1994, 2003, 2006, 2012).
# One vectorised expression replaces the former per-country assignments and
# reuses the wave vectors defined earlier in the script, so a country code
# only has to be maintained in one place.
# `%in% TRUE` maps an NA comparison (missing Year or Country) to FALSE,
# matching the old subscript assignments, which skipped NA positions.
ungd_files <- ungd_files %>%
  mutate(
    is.eu = as.numeric(
      Country %in% EU |
        (
          (Country %in% wave1 & Year > 1972) | # first wave
            (Country %in% wave2 & Year > 1980) | # second wave
            (Country %in% wave3 & Year > 1985) | # third wave
            (Country %in% wave4 & Year > 1994) | # fourth wave
            (Country %in% wave5 & Year > 2003) | # fifth wave
            (Country %in% wave6 & Year > 2006) | # sixth wave
            (Country %in% wave7 & Year > 2012) # seventh wave
        ) %in% TRUE
    )
  )
# Group flags (0/1), built cumulatively in a single pass:
#   eu6  - Country is in `EU` (any year)
#   eu10 - Country is in `wave5` (any year; no Year condition is applied)
#   eu9  - eu6, plus `wave1` countries for Year > 1972
#   eu12 - eu9, plus `wave2` for Year > 1980 and `wave3` for Year > 1985
#   eu15 - eu12, plus `wave4` countries for Year > 1994
# Each later flag reads the one defined just above it inside the same
# mutate(). `%in% TRUE` turns an NA comparison (missing Year) into FALSE,
# matching the former subscript assignments, which skipped NA positions.
ungd_files <- ungd_files %>%
  mutate(
    eu6 = as.numeric(Country %in% EU),
    eu10 = as.numeric(Country %in% wave5),
    eu9 = as.numeric(
      eu6 == 1 | (Country %in% wave1 & Year > 1972) %in% TRUE
    ),
    eu12 = as.numeric(
      eu9 == 1 |
        (
          (Country %in% wave2 & Year > 1980) |
            (Country %in% wave3 & Year > 1985)
        ) %in% TRUE
    ),
    eu15 = as.numeric(
      eu12 == 1 | (Country %in% wave4 & Year > 1994) %in% TRUE
    )
  )

# Per-wave indicators on `simil`, left disabled as in the original script:
#simil <-  mutate(simil, wave1 = Country %in% wave1)
#simil <-  mutate(simil, wave2 = Country %in% wave2)
#simil <-  mutate(simil, wave3 = Country %in% wave3)
#simil <-  mutate(simil, wave4 = Country %in% wave4)
#simil <-  mutate(simil, wave6 = Country %in% wave6)
#simil <-  mutate(simil, wave7 = Country %in% wave7)
# Build the corpus from the data frame, taking the document text from its
# "text" column.
ungd_corpus <- corpus(
  ungd_files,
  text_field = "text"
)

# Keep only speeches with Year strictly between 1993 and 2014.
ungd_corpus.10y2004 <- corpus_subset(
  ungd_corpus,
  Year < 2014 & Year > 1993
)

# Figure 5 (and Figures 14-15 in the supplementary materials)

# --- Bigram document-feature matrix for the eu10 == 1 speeches ---
# Step 1: restrict the subset corpus to documents flagged eu10.
speeches_eu10_10 <- corpus_subset(ungd_corpus.10y2004, eu10 == 1)

# Step 2: word tokens without punctuation, numbers or symbols.
toks_eu10_10 <- tokens(
  speeches_eu10_10,
  what = "word",
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE
)

# Step 3: remove English stopwords. padding = TRUE keeps an empty
# placeholder where a token was removed (presumably so that bigrams are not
# formed across a removed stopword -- confirm for the installed quanteda).
toks_eu10_10 <- tokens_select(
  toks_eu10_10,
  stopwords("english"),
  selection = "remove",
  padding = TRUE
)

# Step 4: bigrams, then one dfm row per value of the `accession` docvar.
toks_eu10_10 <- tokens(toks_eu10_10, ngrams = c(2))
dfm_eu10_10 <- dfm(toks_eu10_10, groups = "accession")

# --- Keyness ---
# target = 2 selects the second group as the target; assuming groups are
# ordered 0 then 1, that is accession == 1 (Year > 2003) -- confirm.
keyness_eu10_10 <- textstat_keyness(dfm_eu10_10, target = 2)

# Keyness plot of the top 30 features, legend hidden, saved as a 7x7 PDF.
keyplot <- textplot_keyness(
  keyness_eu10_10,
  margin = 0.2,
  labelsize = 2,
  color = c("red", "darkblue"),
  n = 30
) +
  theme(legend.position = "none")

ggsave(
  filename = "keyness_eu10_10y2004.pdf",
  plot = keyplot,
  width = 7,
  height = 7
)

# --- Comparison wordcloud ---
# Relabel the two grouped rows with their period names so the cloud shows
# readable labels. This is done after the keyness step.
dfm_eu10_10@Dimnames[["docs"]] <- c("1994-2003", "2004-2013")

pdf("wordcloud_eu10_10y2004.pdf", width = 7, height = 7)
textplot_wordcloud(
  dfm_eu10_10,
  comparison = TRUE,
  max_words = 300,
  color = c("blue", "red")
)
dev.off()

# Figures 16-17 in the supplementary materials

# --- Bigram document-feature matrix for the eu6 == 1 speeches ---
# Step 1: restrict the subset corpus to documents flagged eu6.
speeches_eu6_10 <- corpus_subset(ungd_corpus.10y2004, eu6 == 1)

# Step 2: word tokens without punctuation, numbers or symbols.
toks_eu6_10 <- tokens(
  speeches_eu6_10,
  what = "word",
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE
)

# Step 3: remove English stopwords. padding = TRUE keeps an empty
# placeholder where a token was removed (presumably so that bigrams are not
# formed across a removed stopword -- confirm for the installed quanteda).
toks_eu6_10 <- tokens_select(
  toks_eu6_10,
  stopwords("english"),
  selection = "remove",
  padding = TRUE
)

# Step 4: bigrams, then one dfm row per value of the `accession` docvar.
toks_eu6_10 <- tokens(toks_eu6_10, ngrams = c(2))
dfm_eu6_10 <- dfm(toks_eu6_10, groups = "accession")

# --- Keyness ---
# target = 2 selects the second group as the target; assuming groups are
# ordered 0 then 1, that is accession == 1 (Year > 2003) -- confirm.
keyness_eu6_10 <- textstat_keyness(dfm_eu6_10, target = 2)

# Keyness plot of the top 30 features, legend hidden, saved as a 7x7 PDF.
keyplot <- textplot_keyness(
  keyness_eu6_10,
  margin = 0.2,
  labelsize = 2,
  color = c("red", "darkblue"),
  n = 30
) +
  theme(legend.position = "none")

ggsave(
  filename = "keyness_eu6_10y2004.pdf",
  plot = keyplot,
  width = 7,
  height = 7
)

# --- Comparison wordcloud ---
# Relabel the two grouped rows with their period names so the cloud shows
# readable labels. This is done after the keyness step.
dfm_eu6_10@Dimnames[["docs"]] <- c("1994-2003", "2004-2013")

pdf("wordcloud_eu6_10y2004.pdf", width = 7, height = 7)
textplot_wordcloud(
  dfm_eu6_10,
  comparison = TRUE,
  max_words = 300,
  color = c("blue", "red")
)
dev.off()
