###############
## Jan Vanhove & Raphael Berthele
## "Item-related determinants of cognate guessing in multilinguals"
## Analysis Study 1
## Contact: jan.vanhove@unifr.ch
###############

###################
### PREPARATION ###
###################

# Load Seminar data and treat "Correct" as a factor
semdat = read.csv("Study1/SeminarData.csv", header=T) # SeminarData.csv
semdat$Correct = factor(semdat$Correct)

# Load vpn (= participants) data
semvpn = read.csv("Study1/SeminarVpn.csv", header=T) # SeminarVpn.csv

# Exclude participants with no self-reported English reading skills
semvpn = droplevels(subset(semvpn, EngReading > 0))

# Add participant info to semdat
semdat = merge(semdat, semvpn, by="Subject", all.y=TRUE)
summary(semdat)

# Compute percentage of correct translations per item
library(plyr) # install plyr package first: install.packages("plyr")
sem_proItem = ddply(semdat, .(Stimulus), summarise,
                    PercCorrect = length(which(Correct=="1")) /
                      length(which(Correct=="1" | Correct=="0")))

# Load item data
semitem = read.csv("Study1/SeminarStimuli.csv", header=T) # SeminarStimuli.csv

# Merge with sem_proItem and drop profile words as well as "eftermiddag"
sem_proItem = merge(sem_proItem, semitem, by="Stimulus", all.y=TRUE)
sem_proItem = droplevels(subset(sem_proItem, Status=="target" & Stimulus!="eftermiddag"))

# Summary of % correct translations per item
summary(sem_proItem$PercCorrect); sd(sem_proItem$PercCorrect)

#####################
### RANDOM FOREST ###
#####################

# Random forest: full model
library(party) # install party package first: install.packages("party")
sem.rf = cforest(PercCorrect ~ LevGer + LevGerCons + LevGerIni +
                   LevEng + LevEngCons + LevEngIni +
                   LevFre + LevFreCons + LevFreIni +
                   log.FreqGerman + log.FreqEnglish + log.FreqFrench,
                 data = sem_proItem,
                 controls = cforest_unbiased(mtry=4, ntree=1000))

# Compute conditional variable importances
# This may take some time!
sem.vi = varimp(sem.rf, conditional=TRUE)

# Combine variable importances by type
summary.sem.vi = data.frame(cbind(VarImp = sem.vi,
                                  VarName = c("German", "German", "German",    # German Levenshtein measures
                                              "English", "English", "English", # English Levenshtein measures
                                              "French", "French", "French",    # French Levenshtein measures
                                              "German", "English", "French"),  # Frequency measures
                                  VarType = c("Overall Levenshtein distance", "Consonantal Levenshtein distance", "Word-initial Levenshtein distance", # German
                                              "Overall Levenshtein distance", "Consonantal Levenshtein distance", "Word-initial Levenshtein distance", # English
                                              "Overall Levenshtein distance", "Consonantal Levenshtein distance", "Word-initial Levenshtein distance", # French
                                              "Cognate frequency (log)", "Cognate frequency (log)", "Cognate frequency (log)")), # Frequency
                            stringsAsFactors = F)
summary.sem.vi$VarImp = as.numeric(summary.sem.vi$VarImp)
summary.sem.vi$VarName = factor(summary.sem.vi$VarName)
summary.sem.vi$VarType = factor(summary.sem.vi$VarType)

# Plot dotchart using rms package
library(rms) # install rms package first: install.packages("rms")
#tiff("varimp_semdat.tiff", width=8, height=7.5, units="in", res=300)
par(oma=c(0,2,0,0), mar=c(4,0,1,1))
dotchart2(summary.sem.vi$VarImp, labels=summary.sem.vi$VarName,
          groups=summary.sem.vi$VarType,
          cex = 0.9, cex.group.labels = 0.9, pch = 1, dotsize = 0.9, sort. = FALSE,
          xlab="Conditional permutation importance")
abline(v=abs(min(summary.sem.vi$VarImp)), lty=2)
#dev.off()
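# (Optional sketch, not part of the original analysis:) a rough check of the forest's
# predictive accuracy using out-of-bag predictions (predict() with OOB=TRUE is the
# standard way to obtain these for cforest objects). The squared correlation between
# the out-of-bag predictions and the observed per-item accuracies serves as an
# R-squared-like index.
sem.rf.oob = as.numeric(predict(sem.rf, OOB=TRUE))
cor(sem.rf.oob, sem_proItem$PercCorrect)^2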
# Grow random forest with Germanic variables
# (i.e. measures combining German and English)
sem2.rf = cforest(PercCorrect ~ LevGer + LevGerCons + LevGerIni +
                    LevEng + LevEngCons + LevEngIni +
                    LevFre + LevFreCons + LevFreIni +
                    MinLevGermanic + MinLevGermanicCons + MinLevGermanicIni +
                    log.FreqGerman + log.FreqEnglish + log.FreqFrench + log.FreqGermanic,
                  data = sem_proItem,
                  controls = cforest_unbiased(mtry=4, ntree=1000))

# Compute conditional variable importances
sem2.vi = varimp(sem2.rf, conditional=TRUE)

# Combine variable importances into a data frame
summary.sem2.vi = data.frame(cbind(VarImp = sem2.vi,
                                   VarName = c("German", "German", "German",       # German Levenshtein measures
                                               "English", "English", "English",    # English Levenshtein measures
                                               "French", "French", "French",       # French Levenshtein measures
                                               "Germanic", "Germanic", "Germanic", # Germanic Levenshtein measures
                                               "German", "English", "French", "Germanic"), # Frequency measures
                                   VarType = c("Overall Levenshtein distance", "Consonantal Levenshtein distance", "Word-initial Levenshtein distance", # German
                                               "Overall Levenshtein distance", "Consonantal Levenshtein distance", "Word-initial Levenshtein distance", # English
                                               "Overall Levenshtein distance", "Consonantal Levenshtein distance", "Word-initial Levenshtein distance", # French
                                               "Overall Levenshtein distance", "Consonantal Levenshtein distance", "Word-initial Levenshtein distance", # Germanic
                                               "Cognate frequency (log)", "Cognate frequency (log)", "Cognate frequency (log)", "Cognate frequency (log)")), # Frequency
                             stringsAsFactors = F)
summary.sem2.vi$VarImp = as.numeric(summary.sem2.vi$VarImp)
summary.sem2.vi$VarName = factor(summary.sem2.vi$VarName)
summary.sem2.vi$VarType = factor(summary.sem2.vi$VarType)

# Plot dotchart using rms package
library(rms)
#tiff("varimp_semdat2.tiff", width=8, height=7.5, units="in", res=300)
par(oma=c(0,2,0,0), mar=c(4,0,1,1))
dotchart2(summary.sem2.vi$VarImp, labels=summary.sem2.vi$VarName,
          groups=summary.sem2.vi$VarType,
          cex = 0.9, cex.group.labels = 0.9, pch = 1, dotsize = 0.9, sort. = FALSE,
          xlab="Conditional permutation importance")
abline(v=abs(min(summary.sem2.vi$VarImp)), lty=2)
#dev.off()

############################
### REGRESSION MODELLING ###
############################

# Are the effects linear?
# Model MinLevGermanic and log.FreqGermanic using (nonlinear) thin plate regression splines
library(mgcv) # install mgcv first: install.packages("mgcv")
sem.gam = gam(PercCorrect ~ s(MinLevGermanic, bs="tp") + s(log.FreqGermanic, bs="tp"),
              data = sem_proItem)
summary(sem.gam)
# With nonlinear effects: 35% of variance explained
summary(lm(PercCorrect ~ MinLevGermanic + log.FreqGermanic, data=sem_proItem))
# With linear effects: 33% --> not much is lost by modelling the effects linearly
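# (Optional sketch, not part of the original script:) plot the two thin plate smooths
# fitted above to eyeball how far they depart from straight lines; plot.gam() is a
# standard mgcv function, and pages=1 puts both panels on one page.
plot(sem.gam, pages=1)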
# Merge semitem and semdat files for mixed-effects modelling
semdat = merge(semdat, semitem, by="Stimulus", all.y=TRUE)

# Load lme4 package
library(lme4) # install lme4 first: install.packages("lme4")

# Centre predictors at their means
c. = function(x) x - mean(x) # convenience function for centring variables
semdat$c.MinLevGermanic = semdat$MinLevGermanic - mean(semdat$MinLevGermanic)
semdat$clog.FreqGermanic = semdat$log.FreqGermanic - mean(semdat$log.FreqGermanic)

# The item "verhoging" is invariant on the outcome: it elicited only incorrect responses.
# This may cause convergence issues, so we fit the model both with and without this item.
# Computing these models can take a while.
# (glmer() rather than lmer() because the outcome is binary; recent lme4 versions
# require glmer() for models with a family argument.)
study1a.mer = glmer(Correct ~ c.MinLevGermanic + clog.FreqGermanic + # fixed effects
                      (1 + c.MinLevGermanic + clog.FreqGermanic | Subject) + # correlated by-subject random slopes for MinLevGermanic and log.FreqGermanic
                      (1 | Stimulus), # by-item and by-subject random intercepts
                    data = semdat[semdat$Stimulus != "verhoging",], # without "verhoging"
                    family = "binomial")
study1b.mer = glmer(Correct ~ c.MinLevGermanic + clog.FreqGermanic +
                      (1 + c.MinLevGermanic + clog.FreqGermanic | Subject) +
                      (1 | Stimulus),
                    data = semdat, # same model as before, but with "verhoging"
                    family = "binomial") # --> convergence issues
print(study1a.mer, corr=F)
print(study1b.mer, corr=F)
# The results are nevertheless highly similar; in the article, we report study1b.mer
# (which uses the whole dataset).

# Residual analysis: can any of the variables account for variance in the residuals?
# Extract the model's residuals and fit nonlinear regressions on them.
semdat$Resid = resid(study1b.mer)
summary(gam(Resid ~ s(MinLevGermanic, bs="tp"), data=semdat))
summary(gam(Resid ~ s(LevGer, bs="tp"), data=semdat))
summary(gam(Resid ~ s(LevFre, bs="tp"), data=semdat))
summary(gam(Resid ~ s(LevEng, bs="tp"), data=semdat))
summary(gam(Resid ~ s(log.FreqGerman, bs="tp"), data=semdat))
summary(gam(Resid ~ s(log.FreqEnglish, bs="tp"), data=semdat))
summary(gam(Resid ~ s(log.FreqFrench, bs="tp"), data=semdat))
summary(gam(Resid ~ s(LevGerCons, bs="tp"), data=semdat))
summary(gam(Resid ~ s(LevGerIni, bs="tp"), data=semdat))
summary(gam(Resid ~ s(LevEngCons, bs="tp"), data=semdat))
summary(gam(Resid ~ s(LevEngIni, bs="tp"), data=semdat))
summary(gam(Resid ~ s(LevFreCons, bs="tp"), data=semdat))
summary(gam(Resid ~ s(LevFreIni, bs="tp"), data=semdat))
summary(gam(Resid ~ s(MinLevGermanicCons, bs="tp"), data=semdat))
summary(gam(Resid ~ s(MinLevGermanicIni, bs="tp"), data=semdat))
# No significant residual patterns (not even nonlinear ones).
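# (Optional sketch, not part of the original script:) the fifteen residual GAMs above
# could also be run in a loop; this prints the approximate p-value of each smooth term,
# taken from the s.table component of mgcv's summary.gam output.
resid.predictors = c("MinLevGermanic", "LevGer", "LevFre", "LevEng",
                     "log.FreqGerman", "log.FreqEnglish", "log.FreqFrench",
                     "LevGerCons", "LevGerIni", "LevEngCons", "LevEngIni",
                     "LevFreCons", "LevFreIni", "MinLevGermanicCons", "MinLevGermanicIni")
for (pred in resid.predictors) {
  resid.gam = gam(as.formula(paste0("Resid ~ s(", pred, ", bs='tp')")), data=semdat)
  cat(pred, ": p =", round(summary(resid.gam)$s.table[1, "p-value"], 3), "\n")
}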
# Analysis of by-item random intercepts: can any of the variables account for
# variance in the random intercepts?
# Extract the by-item random intercepts and merge them with the item data.
sem_ranef = data.frame(ranef(study1b.mer)[["Stimulus"]])
sem_ranef$Stimulus = rownames(sem_ranef)
colnames(sem_ranef) <- c("RanInt", "Stimulus")
semitem = merge(semitem, sem_ranef, by="Stimulus", all.y=TRUE)

# Fit nonlinear regressions on the by-item random intercepts.
summary(gam(RanInt ~ s(MinLevGermanic, bs="tp"), data=semitem))
summary(gam(RanInt ~ s(LevGer, bs="tp"), data=semitem))
summary(gam(RanInt ~ s(LevFre, bs="tp"), data=semitem))
plot(gam(RanInt ~ s(LevFre, bs="tp"), data=semitem))
# slight residual effect, but it does not occur in the other dataset
summary(gam(RanInt ~ s(LevEng, bs="tp"), data=semitem))
summary(gam(RanInt ~ s(log.FreqGerman, bs="tp"), data=semitem))
summary(gam(RanInt ~ s(log.FreqEnglish, bs="tp"), data=semitem))
summary(gam(RanInt ~ s(log.FreqFrench, bs="tp"), data=semitem))
plot(gam(RanInt ~ s(log.FreqFrench, bs="tp"), data=semitem))
# not in the expected direction; the effect does not occur in the other dataset
summary(gam(RanInt ~ s(LevGerCons, bs="tp"), data=semitem))
summary(gam(RanInt ~ s(LevGerIni, bs="tp"), data=semitem))
summary(gam(RanInt ~ s(LevEngCons, bs="tp"), data=semitem))
summary(gam(RanInt ~ s(LevEngIni, bs="tp"), data=semitem))
summary(gam(RanInt ~ s(LevFreCons, bs="tp"), data=semitem))
plot(gam(RanInt ~ s(LevFreCons, bs="tp"), data=semitem))
# slight residual effect, but it does not occur in the other dataset
summary(gam(RanInt ~ s(LevFreIni, bs="tp"), data=semitem))
plot(gam(RanInt ~ s(LevFreIni, bs="tp"), data=semitem))
# slight residual effect, but it does not occur in the other dataset
summary(gam(RanInt ~ s(MinLevGermanicCons, bs="tp"), data=semitem))
plot(gam(RanInt ~ s(MinLevGermanicCons, bs="tp"), data=semitem))
# not in the expected direction; slight hint in the other dataset (see below)
summary(gam(RanInt ~ s(MinLevGermanicIni, bs="tp"), data=semitem))

# There is a slight hint that MinLevGermanicCons explains some of the variance
# (about 8.7%) in the by-item random intercepts. The effect is not in the expected
# direction, however: larger consonantal distances are associated with somewhat
# higher by-item random intercepts.
# We now fit a model to verify whether adding this variable to the model above
# actually improves it:
study1c.mer = glmer(Correct ~ c.MinLevGermanic + clog.FreqGermanic + c.(MinLevGermanicCons) +
                      (1 + c.MinLevGermanic + clog.FreqGermanic + c.(MinLevGermanicCons) | Subject) +
                      (1 | Stimulus),
                    data = semdat, family = "binomial")
summary(study1c.mer)             # no significant effect of MinLevGermanicCons
anova(study1b.mer, study1c.mer)  # and no improvement in model fit
# Conclusion: the effect is not needed in the model.
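# (Optional sketch, not part of the original script:) the variance-explained figure
# quoted above for MinLevGermanicCons can be read off the corresponding GAM summary;
# dev.expl and r.sq are standard components of mgcv's summary.gam object.
ranint.cons.gam = gam(RanInt ~ s(MinLevGermanicCons, bs="tp"), data=semitem)
summary(ranint.cons.gam)$dev.expl # proportion of deviance explained
summary(ranint.cons.gam)$r.sq     # adjusted R-squared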