For STM the number of observations in content covariate (7897), prevalence covariate (5768), and documents (7897) are not all equal. So removing missings from prevalence covariates and corresponding documents with missing covariates.
library(readtext)
library(quanteda)
library(dplyr)
library(ggplot2)
library(stm)
library(tidytext)
library(haven)
library(data.table)
UNGD data are available on the Harvard Dataverse at https://doi.org/10.7910/DVN/0TJX8Y
DATA_DIR <- "~/Dropbox/Research/UNGDC projects/UN Data/"
ungd_files <- readtext(paste0(DATA_DIR, "TXT/*"),
docvarsfrom = "filenames",
dvsep="_",
docvarnames = c("Country", "Session", "Year"))
covariates <- read_dta("../MasterDS_EU_Feb2018.dta")
covariates$Year <- as.integer(covariates$year)
full_files <- left_join(ungd_files, covariates, by = c("Country"="iso", "Year"))
#Keeping only complete cases for STM model
nn <- as.data.table(full_files)
complete_eu <- na.omit(nn, cols = "eu_total")
corpus <- corpus(complete_eu, text_field = "text")
#Tokenization and basic pre-processing
tok.sm <- tokens(corpus, what = "word",
remove_numbers = TRUE,
remove_punct = TRUE,
remove_symbols = TRUE,
remove_twitter = TRUE,
remove_url = TRUE,
verbose = TRUE)
#DFM creation from tokens, removing stopwords.
dfm <- dfm(tok.sm,
tolower = TRUE,
remove=stopwords("english"),
verbose = TRUE)
dfm.m <- dfm_select(dfm, c("[\\d-]", "[[:punct:]]", "^.{1}$"),
selection = "remove",
valuetype="regex", verbose = TRUE)
head(featnames(dfm.m),50)
tail(featnames(dfm.m),50)
stm.dfm <- convert(dfm.m, to = "stm", docvars = docvars(corpus))
search <- searchK(stm.dfm$documents, stm.dfm$vocab,
K = c(3:50),
prevalence = ~ factor(Country) + s(Year) + factor(eu_total),
data = stm.dfm$meta)
search.results <- as.data.frame(search$results)
readr::write_csv(search.results, "search.results.csv")
ggplot(search_results, aes(x=semcoh, y=exclus)) +
geom_point(size=5, shape =1, color = "green") +
geom_text(aes(label=K), size=2) +
geom_smooth(method="lm", se = FALSE, color = "red", size = .3) +
geom_vline(xintercept = mean(search_results$semcoh), size = .2, linetype="dashed") +
geom_hline(yintercept = mean(search_results$exclus), size = .2, linetype="dashed") +
theme_bw() +
# ggtitle("Selecting optimal number of topics") +
xlab("Semantic coherence") + ylab("Exclusivity")
ggsave("optimal_topics.pdf")
topics14 <- stm(stm.dfm$documents, stm.dfm$vocab,
prevalence = ~ factor(Country) + s(Year) + factor(eu_total),
data = stm.dfm$meta,
K = 14, init.type = "Spectral")
words <- labelTopics(topics14, n = 15)
prob <- as.data.frame(words[1])
frex <- as.data.frame(words[2])
labelTopics(topics14,n = 5)
pdf("topic14_prob_words.pdf", width = 10, height = 7)
plot(topics14,type="summary", labeltype = "prob",
xlim = c(0, 1.5),
n = 25,
text.cex = .4,
main = "Top 25 highest prob words")
dev.off()
pdf("topic14_frex_words.pdf", width = 10, height = 7)
plot(topics14,type="summary", labeltype = "frex",
xlim = c(0, 1.5),
n = 25,
text.cex = .35,
main = "Top 25 FREX words")
dev.off()
Topic 1: Disarmament; Topic 2: African peace and security; Topic 3: The United Nations, Topic 4: Colonialism and independence; Topic 5: International security; Topic 6: War and peace; Topic 7: Middle East peace; Topic 8: Small Island Developing States (SIDS); Topic 9: Economic development and the United Nations; Topic 10: Africa region; Topic 11: Latin America region; Topic 12: International development and the Global South; Topic 13: Europe region; Topic 14: Sustainable development and climate change
topiclabels <- c("Disarmament", "African peace and security", "Pan-Asian cooperation", "Colonialism and independence", "International security", "Conflict and terrorism", "Middle East peace", "Small Island Developing States (SIDS)", "Economic development and the United Nations", "Africa region", "Latin America region", "International development and the Global South", "European region", "Sustainable development and climate change")
doc.names <- tidy(topics14, matrix = "gamma", document_names = names(stm.dfm$documents))
colnames(doc.names)[1] <- "country"
doc.names
documents <- tidy(topics14, matrix = "gamma", document_names = stm.dfm$meta$eu_total)
topics <- cbind(documents, doc.names)
topics[5:6] <- NULL
topics
gamma <- stringr::str_replace(topics$country, ".txt", "") %>%
stringr::str_split(., "_", simplify = TRUE) %>%
cbind(., topics)
gamma$`1` <- as.character(gamma$`1`)
gamma$`2` <- as.numeric(as.character(gamma$`2`))
gamma$`3` <- as.numeric(as.character(gamma$`3`))
colnames(gamma)[1] <- "Country"
colnames(gamma)[2] <- "Session"
colnames(gamma)[3] <- "Year"
colnames(gamma)[4] <- "membership"
gamma$membership[gamma$Country=="USA"] <- 4
gamma$membership[gamma$Country=="RUS"] <- 5
sums <- gamma %>% group_by(membership,topic) %>% summarise(Topic_Proportion = mean(gamma)) %>% arrange(topic, Topic_Proportion) %>% filter(membership !=0)
topic_distributions <- as.data.frame(sums)
readr::write_csv(topic_distributions, "topic_distributions.csv")
knitr::kable(sums)
topic_names <- as_labeller(c(
`1` = "Disarmament",
`2` = "African peace and security",
`3` = "Pan-Asian cooperation",
`4` = "Colonialism and independence",
`5` = "International security",
`6` = "Conflict and terrorism",
`7` = "Middle East peace",
`8` = "Small Island Developing States (SIDS)",
`9` = "Economic development and the United Nations",
`10` = "Africa region",
`11` = "Latin America region",
`12` = "International development and the Global South",
`13` = "European region",
`14` = "Sustainable development and climate change"
))
ggplot(sums, aes(x = reorder(membership, Topic_Proportion),
y = Topic_Proportion, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free", ncol=2, labeller = topic_names) +
coord_flip() + labs(x="", y="") +
scale_x_discrete(labels=c( "1" = "Applicant", "2" = "Candidate",
"3" = "Member", "4" = "USA", "5" = "RUS"),
limits = c(1,2,3,4,5))
ggsave("topic_usage.pdf", width = 8, height = 14)
EU6 <- c("BEL", "FRA", "DEU", "ITA", "LUX", "NLD")
EU9 <- c("BEL", "FRA", "DEU", "ITA", "LUX", "NLD", "DNK", "IRL", "GBR")
EU12 <- c("BEL", "FRA", "DEU", "ITA", "LUX", "NLD", "DNK", "IRL", "GBR", "GRC","ESP", "PRT")
EU15 <- c("BEL", "FRA", "DEU", "ITA", "LUX", "NLD", "DNK", "IRL", "GBR", "GRC","ESP", "PRT","AUT", "FIN", "SWE")
wave1 <- c("DNK", "IRL", "GBR")
wave2 <- "GRC"
wave3 <- c("ESP", "PRT")
wave4 <- c("AUT", "FIN", "SWE")
wave5 <- c("CZE", "HUN", "POL", "EST", "LVA", "LTU", "CYP", "MLT", "SVK", "SVN")
wave6 <- c("BGR", "ROU")
wave7 <- "HRV"
For a group of EU related topics
topics_eu <- c("2","9", "10", "11", "14")
gamma$topics_eu <- gamma$topic %in% topics_eu
topic_sums <- gamma %>% group_by(topics_eu, Country, Year) %>% summarise(sum_eu_topics = sum(gamma)) %>% filter(topics_eu==TRUE, Year>1990) %>% arrange(Country, Year, sum_eu_topics)
topic_sums$eu10 <- topic_sums$Country %in% wave5
topic_sums$eu6 <- topic_sums$Country %in% EU6
topic_sums$eu9 <- topic_sums$Country %in% EU9
topic_sums$eu12 <- topic_sums$Country %in% EU12
topic_sums$eu15 <- topic_sums$Country %in% EU15
topic_sums$usa <- topic_sums$Country=="USA"
topic_sums$rus <- topic_sums$Country=="RUS"
eu10_mean_topic <- topic_sums %>% group_by(eu10, Year) %>% summarise(eu10_tp = mean(sum_eu_topics)) %>% filter(eu10==TRUE)
eu6_mean_topic <- topic_sums %>% group_by(eu6, Year) %>% summarise(eu6_tp = mean(sum_eu_topics)) %>% filter(eu6==TRUE)
eu9_mean_topic <- topic_sums %>% group_by(eu9, Year) %>% summarise(eu9_tp = mean(sum_eu_topics)) %>% filter(eu9==TRUE)
eu12_mean_topic <- topic_sums %>% group_by(eu12, Year) %>% summarise(eu12_tp = mean(sum_eu_topics)) %>% filter(eu12==TRUE)
eu15_mean_topic <- topic_sums %>% group_by(eu15, Year) %>% summarise(eu15_tp = mean(sum_eu_topics)) %>% filter(eu15==TRUE)
usa_mean_topic <- topic_sums %>% group_by(usa, Year) %>% summarise(usa_tp = mean(sum_eu_topics)) %>% filter(usa==TRUE)
rus_mean_topic <- topic_sums %>% group_by(rus, Year) %>% summarise(rus_tp = mean(sum_eu_topics)) %>% filter(rus==TRUE)
mean_topics <- bind_cols(eu10_mean_topic, eu6_mean_topic, eu9_mean_topic, eu12_mean_topic, eu15_mean_topic, usa_mean_topic, rus_mean_topic)
ggplot(mean_topics, aes(x=Year)) +
geom_area(aes(y=eu10_tp), fill = "red", alpha=0.3) +
ylab("EU Topics Proportion") +
scale_x_continuous(breaks=c(1990,2000, 2010)) +
# ggtitle("Total EU topic proportions average per year per country group: EU10") +
theme_bw()
ggsave("eu10_topic_proportions.pdf")
ggplot(mean_topics, aes(x=Year)) +
geom_area(aes(y=eu10_tp), fill = "red", alpha=0.3) +
geom_area(aes(y=eu6_tp), fill = "blue", alpha=0.3) +
geom_line(aes(y=usa_tp), colour = "black") +
geom_line(aes(y=rus_tp), colour = "black", linetype = "dashed") +
ylab("EU Topics Proportion") +
scale_x_continuous(breaks=c(1990,2000, 2010)) +
# ggtitle("Total EU topic proportions average per year per country group: EU10 (red) vs EU6") +
theme_bw()
ggsave("eu10_eu6_topic_proportions_rus_usa.pdf")
ggplot(mean_topics, aes(x=Year)) +
geom_area(aes(y=eu10_tp), fill = "red", alpha=0.3) +
geom_area(aes(y=eu6_tp), fill = "blue", alpha=0.3) +
ylab("EU Topics Proportion") +
scale_x_continuous(breaks=c(1990,2000, 2010)) +
# ggtitle("Total EU topic proportions average per year per country group: EU10 (red) vs EU6") +
theme_bw()
ggsave("eu10_eu6_topic_proportions.pdf")