vignettes/relate.Rmd
relate.Rmd
set.seed(123)
#load simulated dataset from the package
data("cohort_na_df")
cohort_na_df[199:202,]
#> X1 X2 X3 X4 X5 X6 X7
#> 199 9.800547 11.192049 13.22608 11.741157 3.2985284 17.568287 16.80876
#> 200 9.931415 10.138727 11.55728 13.071213 0.1115508 9.439059 12.11496
#> 201 4.040424 10.757824 NA 5.497721 -2.1834396 -7.249764 16.84709
#> 202 4.658563 8.917538 NA 5.997931 1.5099440 8.548666 11.56883
#> X8 X9 X10 X11 X12 X13 X14
#> 199 19.57499 15.711291 8.704099 14.994329 19.08767 17.18427 9.340641
#> 200 19.50130 15.995014 12.911449 8.116613 20.50924 17.98611 2.671327
#> 201 11.47614 10.095340 NA 7.683143 24.87351 12.84377 7.324220
#> 202 16.18313 8.848892 NA 10.561502 18.80442 19.89488 8.512655
#> X15 cohortid distribution
#> 199 12.6633576 2 1
#> 200 9.7992329 2 1
#> 201 1.0246224 3 1
#> 202 0.8836475 3 1
#save the true distribution
true_distribution <- cohort_na_df$distribution
cohort_na_df <- cohort_na_df %>% dplyr::select(-distribution)
# examine missing pattern (e.g., some cohorts did not collect specific information)
cohort_na_df %>% find_na_level(clusterid = "cohortid")
#> # A tibble: 10 × 16
#> ClusterID X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0 0 0 0 0 0 0 0 0 0 0
#> 2 2 0 0 0 0 0 0 0 0 0 0 0
#> 3 3 0 0 1 0 0 0 0 0 0 1 0
#> 4 4 0 0 0 0 0 0 0 0 0 0 0
#> 5 5 0 0 0 1 0 0 0 0 0 0 0
#> 6 6 0 0 0 0 0 0 0 0 0 0 0
#> 7 7 0 0 0 0 0 0 0 0 1 0 1
#> 8 8 0 0 0 0 0 0 0 0 0 0 0
#> 9 9 0 0 1 0 1 0 0 0 0 0 0
#> 10 10 0 0 0 0 0 0 0 0 0 0 0
#> # … with 4 more variables: X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>
#> # ℹ Use `colnames()` to see all variable names
#add random missingness in addition to the systematic missingness for a more realistic scenario
random_missing_row <- sample(1:nrow(cohort_na_df), floor(nrow(cohort_na_df)/2))
missing_col <- c(1,8,13,14)
cohort_na_df[random_missing_row, missing_col] <- NA
# examine missing pattern after adding random missingness
cohort_na_df %>% find_na_level(clusterid = "cohortid")
#> # A tibble: 10 × 16
#> ClusterID X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0.49 0 0 0 0 0 0 0.49 0 0 0
#> 2 2 0.55 0 0 0 0 0 0 0.55 0 0 0
#> 3 3 0.48 0 1 0 0 0 0 0.48 0 1 0
#> 4 4 0.54 0 0 0 0 0 0 0.54 0 0 0
#> 5 5 0.49 0 0 1 0 0 0 0.49 0 0 0
#> 6 6 0.47 0 0 0 0 0 0 0.47 0 0 0
#> 7 7 0.49 0 0 0 0 0 0 0.49 1 0 1
#> 8 8 0.48 0 0 0 0 0 0 0.48 0 0 0
#> 9 9 0.53 0 1 0 1 0 0 0.53 0 0 0
#> 10 10 0.48 0 0 0 0 0 0 0.48 0 0 0
#> # … with 4 more variables: X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>
#> # ℹ Use `colnames()` to see all variable names
analytic.rfsrc <- rfsrc.fast(cohortid ~ ., #regress cohort id on all other variables
data = cohort_na_df,
distance=T, ntree=1500, nsplit=3,
na.action="na.impute",nimpute=1) # impute within nodes
distance.matrix <- analytic.rfsrc$distance # extract distance matrix
#calculate average distance matrix
dist_avg_mat <- avg_dist_func(distance.matrix, cohort_na_df$cohortid)
#cluster average distance matrix and
#identify which cohorts are clustering together on dendogram
dmean <- as.dist(dist_avg_mat)
#hierarchical clustering
hc2 <- hclust(dmean, method = "ward.D2")
dend <- as.dendrogram(hc2)
plot(dend)
We generally do not impute the missing variables that are completely missing or missing more than 50%.
The relate::mice_group_impute
in this package provides imputation by group regardless of missingness level (imputeLevel = 1). Please find miceadds::mice.impute.bygroup
for similar functionality.
mice_group_impute(df = cohort_na_df, clusterid = "cohortid", imputeLevel = 1, miceArg = list(method = 'mean', maxit = 1))
# we use mean imputation and asymptotic BG test for illustrative purposes.
ll <- recursive.test(dend = dend, df = cohort_na_df, cohortid.var = "cohortid",
verbose = T, saveIntermediate = F,
BG.method = 'permutation', n_perm = 100, impute = T,
miceArgs = list(method = c("mean"), m =1, maxit = 5, seed = 123, printFlag = F))
#> Start testing:
#> # A tibble: 1 × 2
#> X Y
#> <chr> <chr>
#> 1 9 10
#> Results:
#> # A tibble: 1 × 2
#> Pairwise.p Significant
#> <dbl> <lgl>
#> 1 0.49 FALSE
#> --------------------------
#> Start testing:
#> # A tibble: 1 × 2
#> X Y
#> <chr> <chr>
#> 1 8 9,10
#> Results:
#> # A tibble: 1 × 2
#> Pairwise.p Significant
#> <dbl> <lgl>
#> 1 0.29 FALSE
#> --------------------------
#> Start testing:
#> # A tibble: 1 × 2
#> X Y
#> <chr> <chr>
#> 1 7 8,9,10
#> Results:
#> # A tibble: 1 × 2
#> Pairwise.p Significant
#> <dbl> <lgl>
#> 1 0.03 TRUE
#> --------------------------
#> Start testing:
#> # A tibble: 1 × 2
#> X Y
#> <chr> <chr>
#> 1 1 2
#> Results:
#> # A tibble: 1 × 2
#> Pairwise.p Significant
#> <dbl> <lgl>
#> 1 0.32 FALSE
#> --------------------------
#> Start testing:
#> # A tibble: 1 × 2
#> X Y
#> <chr> <chr>
#> 1 3 1,2
#> Results:
#> # A tibble: 1 × 2
#> Pairwise.p Significant
#> <dbl> <lgl>
#> 1 0.71 FALSE
#> --------------------------
#> Start testing:
#> # A tibble: 1 × 2
#> X Y
#> <chr> <chr>
#> 1 4 5
#> Results:
#> # A tibble: 1 × 2
#> Pairwise.p Significant
#> <dbl> <lgl>
#> 1 0.79 FALSE
#> --------------------------
#> Start testing:
#> # A tibble: 1 × 2
#> X Y
#> <chr> <chr>
#> 1 6 4,5
#> Results:
#> # A tibble: 1 × 2
#> Pairwise.p Significant
#> <dbl> <lgl>
#> 1 0.04 TRUE
#> --------------------------
#> Start testing:
#> # A tibble: 2 × 2
#> X Y
#> <chr> <chr>
#> 1 3,1,2 6
#> 2 3,1,2 4,5
#> Results:
#> # A tibble: 2 × 2
#> Pairwise.p Significant
#> <dbl> <lgl>
#> 1 0 TRUE
#> 2 0 TRUE
#> --------------------------
#> Start testing:
#> # A tibble: 6 × 2
#> X Y
#> <chr> <chr>
#> 1 7 3,1,2
#> 2 8,9,10 3,1,2
#> 3 7 6
#> 4 8,9,10 6
#> 5 7 4,5
#> 6 8,9,10 4,5
#> Results:
#> # A tibble: 6 × 2
#> Pairwise.p Significant
#> <dbl> <lgl>
#> 1 0 TRUE
#> 2 0 TRUE
#> 3 0 TRUE
#> 4 0 TRUE
#> 5 0 TRUE
#> 6 0 TRUE
#> --------------------------
ll$clusters
#> [1] "7" "8,9,10" "3,1,2" "6" "4,5"
table(true_distribution, cohort_na_df$cohortid, dnn = c("True Cluster","Cohort ID"))
#> Cohort ID
#> True Cluster 1 2 3 4 5 6 7 8 9 10
#> 1 100 100 100 0 0 0 0 0 0 0
#> 2 0 0 0 100 100 100 0 0 0 0
#> 3 0 0 0 0 0 0 100 100 100 100