30-01-2023 Exercise

Context

Field trial in rural northern Ghana on the effect of Vitamin A given in large doses to children under 5 . The data in this exercise were taken from the baseline survey. They include interview data provided by the mother or her substitute, clinical and laboratory data. You will find the da-ta in VASTCHS.csv . Please note that in this csv file semi colons (;) have been used as field separators and comma’s as decimal-point characters. Take this into account when you import the dataset in R!

1. Preparation

Keyboard shortcuts:
- `ctrl + shift + m` -\> `%>%` (operador pipe)
- `ctrl + alt + i` -\> Crea un chunk de código
Set working directory
- setwd("C:/Users/xxx/yyy") 
- getwd()
Load (basic) packages

To import Excel files library(readxl) To calculate OR or RR library(EpiStats)

library(readxl)                      # To import Excel files
library(tidyverse)                   # Varias funciones de tidyverse (dplyr, ggplot2, etc.)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.3.6      ✔ purrr   0.3.5 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.2.1      ✔ stringr 1.4.1 
✔ readr   2.1.3      ✔ forcats 0.5.2 
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(dslabs)                      # Paquete que contiene la base de datos de Gapminder
library(EpiStats)                    # To calculate OR or RR
Warning: package 'EpiStats' was built under R version 4.2.2
Loading required package: epiR
Warning: package 'epiR' was built under R version 4.2.2
Loading required package: survival
Package epiR 2.0.53 is loaded
Type help(epi.about) for summary information
Type browseVignettes(package = 'epiR') to learn how to use epiR for applied epidemiological analyses
Import data
  • From Text (.csv)-example
    • data <- read.csv(“H:/Teaching/My_Courses/IIH-epistat/R/Sharm_Exercise/mysharm.csv”)
  • From Excel file (.xls, .xlsx)-example
    • library(readxl)
    • data <- read_excel(“H:/Teaching/My_Courses/IIH-epistat/R/Sharm_Exercise/mysharm.xlsx”)
  • from STATA file (.dta)-example
    • library(haven)
    • data <- read_dta(“H:/Teaching/My_Courses/IIH-epistat/R/Sharm_Exercise/mysharm.dta”)

Importing a CSV under the name of vastchs, with “;” as separator, and with “,” as decimal:

vastchs <- read.csv("C:/Users/pined/OneDrive - Universidad Nacional Mayor de San Marcos/Javier 2022/Belgica/AC2/Linear Regresion Exercise Database/Datasets/VASTCHS.csv", sep=";", dec= ",")

2. Descriptive Statistics

Summary

Numeric Variable
summary(vastchs$AGE)                      #summary
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00   10.00   21.00   22.49   34.00   55.00 
var(vastchs$AGE)                          #variance
[1] 200.287
sd(vastchs$AGE)                           #standart desviation
[1] 14.15228
Categoric Variable

Frecuency and Percentage

table(vastchs$SEX)                        #Counting

  1   2 
560 577 
prop.table(table(vastchs$SEX))            #Percentages

        1         2 
0.4925242 0.5074758 

Recode to a factor - From number to string

vastchs$SEX2<-factor(vastchs$SEX, levels=c(1,2))
summary(vastchs$SEX2)
  1   2 
560 577 

Creating variables- Recoding: Yes/No = 1/0

vastchs$wasted<- ifelse(vastchs$WHZ< -2, 1,0)     #wasted si es menos a -2 poner 1 sino 0
vastchs$stunded<- ifelse(vastchs$HAZ< -2, 1,0)     #stunded si es menos a -2 poner 1 sino 0
vastchs$underweight<- ifelse(vastchs$WAZ< -2, 1,0)     #undweig si es menos a -2 poner 1 sino 0
vastchs$anemia<- ifelse(vastchs$HB< 8, 1,0)     #anemia si es menos que 8 poner 1 sino 0
vastchs$vitadef<- ifelse(vastchs$RETINOL< 0.7, 1,0)     #Def of VitA si es menos de 0.7 poner 1 sino 0

vastchs$HANDPUMP2<- ifelse(vastchs$HANDPUMP== 1, 1,0) 
table(vastchs$vitadef)                        #Counting

  0   1 
302 835 
prop.table(table(vastchs$vitadef))            #Percentages

        0         1 
0.2656113 0.7343887 
table(vastchs$HANDPUMP, vastchs$HANDPUMP2)
   
      0   1
  1   0 903
  2  65   0
  3 169   0

Breast feeding as a potential protective factor agains vit A deficiency

table (vastchs$CURRBF)            #To see what values have CURRBF

  1   2 
793 344 
vastchs$CURRBF2<- ifelse(vastchs$CURRBF==1, 1,0)    #To change values to 0 to 1
table (vastchs$CURRBF, vastchs$CURRBF2)     #To see if has already changed
   
      0   1
  1   0 793
  2 344   0

Calculate de OR

table (vastchs$vitadef, vastchs$CURRBF2)
   
      0   1
  0  74 228
  1 270 565
prop.table(table(vastchs$vitadef, vastchs$CURRBF2))       #Proportion above total sum
   
             0          1
  0 0.06508355 0.20052770
  1 0.23746702 0.49692172
prop.table(table(vastchs$vitadef, vastchs$CURRBF2), margin = 1) #Proportion above ROW
   
            0         1
  0 0.2450331 0.7549669
  1 0.3233533 0.6766467
cc(vastchs,vitadef,CURRBF2)
$df1
                   Cases Controls Total
Exposed              565      228   793
Unexposed            270       74   344
Total                835      302  1137
Proportion exposed  0.68     0.75  0.70

$df2
                Point estimate 95%CI-ll 95%CI-ul
Odds ratio                0.68     0.50     0.92
Prev. frac. ex.           0.32     0.08     0.50
Prev. frac. pop           0.24       NA       NA
chi2(1)                   6.45       NA       NA
Pr>chi2                  0.011       NA       NA

Agruping Variable

# Regrouping (e.g. age in months to agegroup)
          vastchs$agegrp <- NA
          vastchs$agegrp[vastchs$AGE<12] <- "1"
          vastchs$agegrp[vastchs$AGE>11 & vastchs$AGE<24] <- "2"
          vastchs$agegrp[vastchs$AGE>23 & vastchs$AGE<36] <- "3"
          vastchs$agegrp[vastchs$AGE>35 & vastchs$AGE<48] <- "4"
          vastchs$agegrp[vastchs$AGE>47 & vastchs$AGE<60] <- "5"
          # don't forget to convert agegrp to a factor variable
          vastchs$agegrp <- factor(vastchs$agegrp)

Being under2 is a coufunder?

vastchs$under2 <- "0"
vastchs$under2[vastchs$AGE<2] <- "1"
vastchs$agegrp <- factor(vastchs$agegrp)
hist(vastchs$AGE)

summary(vastchs$AGE)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00   10.00   21.00   22.49   34.00   55.00 

With more options

hist(vastchs$AGE,main="Age distribution of study participants", 
     xlab="Age (years)", ylab="nb", col="green", border="dark green",
     breaks=5)

With Tidyverse

ggplot(vastchs, aes(x=AGE)) +
  geom_histogram( color="darkblue", fill="lightblue",  alpha=0.5, origin = 0, binwidth=10)
Warning: `origin` is deprecated. Please use `boundary` instead.

Summary of a categorical variable