Process data

Survival analysis data

# Create a data frame of SN:yes patients
data_sn.yes <- data %>%
    # Select columns
    select(ID, 
           visit_number, 
           visit_day, 
           visit_year,
           approximate_day,
           approximate_year,
           hivsn_present,
           pain) %>%
    # Add counting columns
    ## SN:yes and SN:no
    mutate(sn_count = ifelse(hivsn_present == 'yes',
                             yes = 1,
                             no = 0)) %>%
    # Create counting index to identify when SN first develop
    ## Sum sn_count
    group_by(ID) %>%
    mutate(sn_count = cumsum(sn_count)) %>%
    # Filter by sn_count == 1 to get when SN first developed
    filter(sn_count == 1) %>%
    # Remove sn_count
    select(-sn_count)

# Create data frame of SN:no patients
data_sn.no <- data %>%
    # Select columns
    select(ID, 
           visit_number, 
           visit_day, 
           visit_year,
           approximate_day,
           approximate_year,
           hivsn_present,
           pain) %>%
    # Filter out SN:yes patient with the filter created earlier
    filter(!ID %in% sn_filter) %>%
    # Filter out 'repeats' (only the data at the last visit)
    group_by(ID) %>%
    mutate(max_visits = max(visit_number)) %>%
    filter(visit_number == max_visits) %>%
    select(-max_visits) 

# Merge the SN:yes and SN:no data frames
data_surv <- data_sn.yes %>%
    full_join(data_sn.no) %>%
    # New column with recoded hivsn_present data for survival analysis
    mutate(hivsn_coded = ifelse(hivsn_present == 'yes',
                                yes = 1, # SN:yes
                                no = 0)) # SN:no

# Add a new column with painful SN (but only if they have SN)
data_surv <- data_surv %>%
    mutate(hivsn_painful = ifelse(hivsn_present == 'yes' & pain == 'yes',
                                  yes = 'yes',
                                  no = ifelse(hivsn_present == 'yes' &
                                                  pain == 'no',
                                              yes = 'no',
                                              no = NA))) 

Analysis

Cumulative incidence

Cumulative incidence measures the number of new cases per person in the population over a defined period of time (i.e., a fixed follow-up period). Therefore, to calculate cumulative incidence we defined a fixed 6-month follow-up period, and also subdivided this period into a 1st and 2nd 3-month period of this follow-up. To standardize the periods of follow-up, data were cleaned such that only visits falling into the indicated periods ((0-3 months], (3-6 months], (0-6 months]) were used to define SN status. As such, participant’s whose last clinic visit occurred after 91 days (end of first 3-month interval) or 182 days (end of 6-month interval), and who were found to have new-onset SN at this visit, were recorded as SN:no over the 3 or 6-month period of follow-up. This conservative strategy may have lead to a slight under-estimation of the cumulative incidence of SN.

Boostrap function

Incidence rate (person years)

Incidence rate is a measure of the number of new cases per unit of time. We did not have extact dates of SN onset to define the per patient unit of time, and nor was there uniform spacing between clinic visits. We therefore chose to calculate an approximate SN onset time, arbitrarily defined as the number of days between the first neuropathy screening and the mid-point between the visit when neuropathy was detected and the preceeding visit. For participants who did not develop SN, the date of censoring was defined by the number of days between the first neuropathy screening and the last screening (study exit).

Survival curves

Plot a basic Kaplan-Meyer survival curve

## Call: survfit(formula = Surv(time = approximate_day, event = hivsn_coded) ~ 
##     1, data = data_surv)
## 
##       n  events  median 0.95LCL 0.95UCL 
##     120      20      NA      NA      NA
## Call: survfit(formula = Surv(time = approximate_day, event = hivsn_coded) ~ 
##     1, data = data_surv)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    14    120       1    0.992  0.0083        0.976        1.000
##    22    119       1    0.983  0.0117        0.961        1.000
##    23    118       1    0.975  0.0143        0.947        1.000
##    28    117       2    0.958  0.0182        0.923        0.995
##    31    115       1    0.950  0.0199        0.912        0.990
##    46    113       1    0.942  0.0214        0.901        0.985
##    47    112       1    0.933  0.0228        0.890        0.979
##    51    111       1    0.925  0.0241        0.879        0.973
##    52    110       1    0.916  0.0253        0.868        0.967
##    53    109       1    0.908  0.0264        0.858        0.961
##    55    108       1    0.900  0.0275        0.847        0.955
##    56    107       1    0.891  0.0285        0.837        0.949
##    64    105       1    0.883  0.0295        0.827        0.942
##    73    104       1    0.874  0.0304        0.817        0.936
##    84     98       1    0.865  0.0314        0.806        0.929
##   104     89       1    0.856  0.0325        0.794        0.922
##   132     80       1    0.845  0.0338        0.781        0.914
##   143     70       1    0.833  0.0354        0.766        0.905
##   144     69       1    0.821  0.0369        0.752        0.896

Plot a basic KM curve conditioned on painful SN

In addition we performed a log-rank test to asssess for differences between painful and non-painful SN strata.

## Call: survfit(formula = Surv(time = approximate_day, event = hivsn_coded) ~ 
##     pain, data = data_surv)
## 
##           n events median 0.95LCL 0.95UCL
## pain=no  96     11     NA      NA      NA
## pain=yes 24      9     NA      84      NA
## Call: survfit(formula = Surv(time = approximate_day, event = hivsn_coded) ~ 
##     pain, data = data_surv)
## 
##                 pain=no 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    23     96       1    0.990  0.0104        0.969        1.000
##    28     95       1    0.979  0.0146        0.951        1.000
##    31     94       1    0.969  0.0178        0.935        1.000
##    47     92       1    0.958  0.0205        0.919        0.999
##    51     91       1    0.948  0.0228        0.904        0.993
##    53     90       1    0.937  0.0248        0.890        0.987
##    56     89       1    0.927  0.0267        0.876        0.980
##    64     87       1    0.916  0.0284        0.862        0.973
##   132     70       1    0.903  0.0309        0.844        0.966
##   143     61       1    0.888  0.0337        0.824        0.957
##   144     60       1    0.873  0.0363        0.805        0.947
## 
##                 pain=yes 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    14     24       1    0.958  0.0408        0.882        1.000
##    22     23       1    0.917  0.0564        0.813        1.000
##    28     22       1    0.875  0.0675        0.752        1.000
##    46     21       1    0.833  0.0761        0.697        0.997
##    52     20       1    0.792  0.0829        0.645        0.972
##    55     19       1    0.750  0.0884        0.595        0.945
##    73     18       1    0.708  0.0928        0.548        0.916
##    84     15       1    0.661  0.0979        0.495        0.884
##   104     12       1    0.606  0.1041        0.433        0.849
## Call:
## survdiff(formula = Surv(time = visit_day, event = hivsn_coded) ~ 
##     pain, data = data_surv)
## 
##           N Observed Expected (O-E)^2/E (O-E)^2/V
## pain=no  96       11    16.89      2.05      13.4
## pain=yes 24        9     3.11     11.13      13.4
## 
##  Chisq= 13.4  on 1 degrees of freedom, p= 3e-04

Multivariable modeling

In an exploratory analysis using Cox proportional hazard models (not shown here) various predictors violated assumptions of the model (e.g., proportional hazard, linearity, no influence points). Therefore we used logistic regression modelling, with visit 1 charateristics as predictors of SN onset.

Model data

The model does not include:

  1. diabetes_hba1c because nobody had HbA1c > 7%.

  2. vitaminB12_deficiency because only one individual had a deficiency.

  3. mass_kg because it is likely to be co-linear with height_m.

  4. consumes_alcohol because this information was deemed to be encoded in alcohol_units.week.

Model selection

Check height vs mass assumption

## 
##  Pearson's product-moment correlation
## 
## data:  height_m and mass_kg
## t = 0.77427, df = 52, p-value = 0.4423
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1657403  0.3641103
## sample estimates:
##       cor 
## 0.1067581
## 
##  Pearson's product-moment correlation
## 
## data:  height_m and mass_kg
## t = 4.42, df = 64, p-value = 3.899e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2735853 0.6495961
## sample estimates:
##      cor 
## 0.483596

The two variables are related in females, but not in males. We decided to omit weight_kg in favour of height_m because of this relationship, and the stronger historical association between height and SN.

Elastic net regression

We chose to use elastic net for variable selection. Elastic net is a regression analysis method that performs both variable selection and regularization in order to enhance the prediction accuracy and interpretability of the statistical model it produces.

The process involves performing a 10-fold cross validation to find the optimal lambda (penalization parameter). And then running the analysis and extracting the model based on the best lambda.

Fit the models using CV alphas and best lambdas

## 12 x 1 sparse Matrix of class "dgCMatrix"
##                                1
## (Intercept)           -2.0528761
## age_years              .        
## sexF                   .        
## sexM                   .        
## height_m               0.3674762
## CD4_cell.ul            .        
## viral_load_copies.ml   .        
## alcohol_units.week     .        
## TB_currentno          -0.1183944
## TB_currentyes          0.1184323
## rifafour_treatmentno   .        
## rifafour_treatmentyes  .

## 12 x 1 sparse Matrix of class "dgCMatrix"
##                                1
## (Intercept)           -2.3406269
## age_years              .        
## sexF                   .        
## sexM                   .        
## height_m               0.5639000
## CD4_cell.ul            .        
## viral_load_copies.ml   .        
## alcohol_units.week     .        
## TB_currentno          -0.1703713
## TB_currentyes          0.1705513
## rifafour_treatmentno   .        
## rifafour_treatmentyes  .

## 12 x 1 sparse Matrix of class "dgCMatrix"
##                                1
## (Intercept)           -2.2224057
## age_years              .        
## sexF                   .        
## sexM                   .        
## height_m               0.4914842
## CD4_cell.ul            .        
## viral_load_copies.ml   .        
## alcohol_units.week     .        
## TB_currentno          -0.1698450
## TB_currentyes          0.1698534
## rifafour_treatmentno   .        
## rifafour_treatmentyes  .

## 12 x 1 sparse Matrix of class "dgCMatrix"
##                                   1
## (Intercept)           -1.695158e+00
## age_years              .           
## sexF                   .           
## sexM                   .           
## height_m               2.954020e-01
## CD4_cell.ul            .           
## viral_load_copies.ml   .           
## alcohol_units.week     .           
## TB_currentno          -3.843904e-01
## TB_currentyes          3.766113e-14
## rifafour_treatmentno   .           
## rifafour_treatmentyes  .

Across all alphas, and best lambdas, the output shows the best model includes height_m and TB_current.


Session information

## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.5
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] survminer_0.4.4   ggpubr_0.2.1      survival_2.44-1.1
##  [4] boot_1.3-22       glmnetUtils_1.1.2 lubridate_1.7.4  
##  [7] forcats_0.4.0     stringr_1.4.0     dplyr_0.8.2      
## [10] purrr_0.3.2       readr_1.3.1       tidyr_0.8.3      
## [13] tibble_2.1.3      ggplot2_3.2.0     tidyverse_1.2.1  
## [16] magrittr_1.5     
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.1        lattice_0.20-38   zoo_1.8-6        
##  [4] zeallot_0.1.0     utf8_1.1.4        assertthat_0.2.1 
##  [7] glmnet_2.0-18     digest_0.6.19     foreach_1.4.4    
## [10] R6_2.4.0          cellranger_1.1.0  backports_1.1.4  
## [13] evaluate_0.14     httr_1.4.0        pillar_1.4.2     
## [16] rlang_0.4.0       lazyeval_0.2.2    readxl_1.3.1     
## [19] rstudioapi_0.10   data.table_1.12.2 Matrix_1.2-17    
## [22] rmarkdown_1.13    labeling_0.3      splines_3.6.0    
## [25] munsell_0.5.0     broom_0.5.2       compiler_3.6.0   
## [28] modelr_0.1.4      xfun_0.8          pkgconfig_2.0.2  
## [31] htmltools_0.3.6   tidyselect_0.2.5  gridExtra_2.3    
## [34] km.ci_0.5-2       codetools_0.2-16  fansi_0.4.0      
## [37] crayon_1.3.4      withr_2.1.2.9000  grid_3.6.0       
## [40] xtable_1.8-4      nlme_3.1-140      jsonlite_1.6     
## [43] gtable_0.3.0      KMsurv_0.1-5      scales_1.0.0     
## [46] cli_1.1.0         stringi_1.4.3     ggsignif_0.5.0   
## [49] xml2_1.2.0        vctrs_0.1.0       survMisc_0.5.5   
## [52] generics_0.0.2    iterators_1.0.10  tools_3.6.0      
## [55] cmprsk_2.2-8      glue_1.3.1        hms_0.4.2        
## [58] parallel_3.6.0    yaml_2.2.0        colorspace_1.4-1 
## [61] rvest_0.3.4       knitr_1.23        haven_2.1.0