National Health and Nutrition Examination Survey (NHANES)
Doctors and dentists accompany survey interviewers in a mobile medical center that travels the country. While survey researchers read the questionnaires, medical professionals administer laboratory tests and conduct a full medical examination. The blood work and in-person check-up allow epidemiologists to answer questions like, “how many people have diabetes but don’t know they have diabetes?”
Many tables containing information from the various examinations, generally one row per respondent.
A complex sample survey designed to generalize to the civilian non-institutionalized U.S. population.
Released biennially since 1999-2000.
Administered by the Centers for Disease Control and Prevention.
Please skim before you begin:
A haiku regarding this microdata:
Download, Import, Preparation
Download and import the demographics (demo) and total cholesterol laboratory (tchol) data:
library(haven)
nhanes_2015_2016_demo_url <- "https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/DEMO_I.XPT"
nhanes_2017_2018_demo_url <- "https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.XPT"
nhanes_2015_2016_tchol_url <- "https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/TCHOL_I.XPT"
nhanes_2017_2018_tchol_url <- "https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/TCHOL_J.XPT"
nhanes_2015_2016_demo_tbl <- read_xpt( nhanes_2015_2016_demo_url )
nhanes_2017_2018_demo_tbl <- read_xpt( nhanes_2017_2018_demo_url )
nhanes_2015_2016_tchol_tbl <- read_xpt( nhanes_2015_2016_tchol_url )
nhanes_2017_2018_tchol_tbl <- read_xpt( nhanes_2017_2018_tchol_url )
nhanes_2015_2016_demo_df <- data.frame( nhanes_2015_2016_demo_tbl )
nhanes_2017_2018_demo_df <- data.frame( nhanes_2017_2018_demo_tbl )
nhanes_2015_2016_tchol_df <- data.frame( nhanes_2015_2016_tchol_tbl )
nhanes_2017_2018_tchol_df <- data.frame( nhanes_2017_2018_tchol_tbl )
Specify which variables to keep from both the demo and tchol data files, then stack the four years:
demo_vars <-
c(
# unique person identifier (merge variable)
"SEQN" ,
# the two-year interviewed + MEC examined weight
"WTMEC2YR" ,
# note that this is a special weight for only
# individuals who took the mobile examination center (MEC) exam
# there is one other weight available - WTINT2YR -
# that should be used when MEC variables are not part of the analysis
# interviewed only or interviewed + MEC
"RIDSTATR" ,
# primary sampling unit varaible, used in complex design
"SDMVPSU" ,
# strata variable, used in complex design
"SDMVSTRA" ,
# race / ethnicity
"RIDRETH3" ,
# age
"RIDAGEYR" ,
# gender
"RIAGENDR" ,
# pregnant at interview
"RIDEXPRG"
)
nhanes_2015_2018_demo_df <-
rbind(
nhanes_2015_2016_demo_df[ , demo_vars ] ,
nhanes_2017_2018_demo_df[ , demo_vars ]
)
tchol_vars <-
c(
# unique person identifier (merge variable)
"SEQN" ,
# laboratory total cholesterol variable
# https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/TCHOL_J.htm
"LBXTC"
)
nhanes_2015_2018_tchol_df <-
rbind(
nhanes_2015_2016_tchol_df[ , tchol_vars ] ,
nhanes_2017_2018_tchol_df[ , tchol_vars ]
)
Merge the two pooled datasets, limit the data.frame to mobile examination component respondents:
nhanes_full_df <-
merge(
nhanes_2015_2018_demo_df ,
nhanes_2015_2018_tchol_df ,
all = TRUE
)
names( nhanes_full_df ) <- tolower( names( nhanes_full_df ) )
nhanes_df <- subset( nhanes_full_df , ridstatr %in% 2 )
Scale the mobile examination component two-year weight to generalize to the pooled, four year period:
Save Locally
Save the object at any point:
# nhanes_fn <- file.path( path.expand( "~" ) , "NHANES" , "this_file.rds" )
# saveRDS( nhanes_df , file = nhanes_fn , compress = FALSE )
Load the same object:
Variable Recoding
Add new columns to the data set:
nhanes_design <-
update(
nhanes_design ,
one = 1 ,
# define high total cholesterol as 1 if mg/dL is at or above 240 and zero otherwise.
hi_tchol = ifelse( lbxtc >= 240 , 1 , 0 ) ,
gender = factor( riagendr , levels = 1:2 , labels = c( 'male' , 'female' ) ) ,
age_categories =
factor(
1 + findInterval( ridageyr , c( 20 , 40 , 60 ) ) ,
levels = 1:4 ,
labels = c( "0-19" , "20-39" , "40-59" , "60+" )
) ,
# recode the ridreth3 variable as:
# mexican american and other hispanic -> 4
# non-hispanic white -> 1
# non-hispanic black -> 2
# non-hispanic asian -> 3
# other race including multi-racial -> 5
race_ethnicity =
factor(
c( 4 , 4 , 1 , 2 , NA , 3 , 5 )[ ridreth3 ] ,
levels = 1:5 ,
labels = c( 'nh white' , 'nh black' , 'nh asian' , 'hispanic' , 'other' )
) ,
pregnant_at_interview =
ifelse( ridexprg %in% 1:2 , as.numeric( ridexprg == 1 ) , NA )
)
Analysis Examples with the survey
library
Unweighted Counts
Count the unweighted number of records in the survey sample, overall and by groups:
Descriptive Statistics
Calculate the mean (average) of a linear variable, overall and by groups:
svymean( ~ lbxtc , nhanes_design , na.rm = TRUE )
svyby( ~ lbxtc , ~ race_ethnicity , nhanes_design , svymean , na.rm = TRUE )
Calculate the distribution of a categorical variable, overall and by groups:
svymean( ~ riagendr , nhanes_design )
svyby( ~ riagendr , ~ race_ethnicity , nhanes_design , svymean )
Calculate the sum of a linear variable, overall and by groups:
svytotal( ~ lbxtc , nhanes_design , na.rm = TRUE )
svyby( ~ lbxtc , ~ race_ethnicity , nhanes_design , svytotal , na.rm = TRUE )
Calculate the weighted sum of a categorical variable, overall and by groups:
svytotal( ~ riagendr , nhanes_design )
svyby( ~ riagendr , ~ race_ethnicity , nhanes_design , svytotal )
Calculate the median (50th percentile) of a linear variable, overall and by groups:
svyquantile( ~ lbxtc , nhanes_design , 0.5 , na.rm = TRUE )
svyby(
~ lbxtc ,
~ race_ethnicity ,
nhanes_design ,
svyquantile ,
0.5 ,
ci = TRUE , na.rm = TRUE
)
Estimate a ratio:
Subsetting
Restrict the survey design to respondents aged 60 or older:
Calculate the mean (average) of this subset:
Measures of Uncertainty
Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups:
this_result <- svymean( ~ lbxtc , nhanes_design , na.rm = TRUE )
coef( this_result )
SE( this_result )
confint( this_result )
cv( this_result )
grouped_result <-
svyby(
~ lbxtc ,
~ race_ethnicity ,
nhanes_design ,
svymean ,
na.rm = TRUE
)
coef( grouped_result )
SE( grouped_result )
confint( grouped_result )
cv( grouped_result )
Calculate the degrees of freedom of any survey design object:
Calculate the complex sample survey-adjusted variance of any statistic:
Include the complex sample design effect in the result for a specific statistic:
# SRS without replacement
svymean( ~ lbxtc , nhanes_design , na.rm = TRUE , deff = TRUE )
# SRS with replacement
svymean( ~ lbxtc , nhanes_design , na.rm = TRUE , deff = "replace" )
Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop
for alternatives:
Direct Method of Age-Adjustment Replication Example
This example matches the total cholesterol statistics and standard errors in Table 1 from Data Brief 363:
Match the crude estimates in the footnote and also in the unadjusted age categories:
crude_overall <-
svymean( ~ hi_tchol , subset( nhanes_design , ridageyr >= 20 ) , na.rm = TRUE )
stopifnot( round( coef( crude_overall ) , 3 ) == 0.115 )
crude_by_gender <-
svyby(
~ hi_tchol ,
~ gender ,
subset( nhanes_design , ridageyr >= 20 ) ,
svymean ,
na.rm = TRUE
)
stopifnot( round( coef( crude_by_gender )[ 1 ] , 3 ) == 0.103 )
stopifnot( round( coef( crude_by_gender )[ 2 ] , 3 ) == 0.126 )
crude_by_age <-
svyby(
~ hi_tchol ,
~ age_categories ,
subset( nhanes_design , ridageyr >= 20 ) ,
svymean ,
na.rm = TRUE
)
stopifnot( round( coef( crude_by_age )[ 1 ] , 3 ) == 0.075 )
stopifnot( round( coef( crude_by_age )[ 2 ] , 3 ) == 0.157 )
stopifnot( round( coef( crude_by_age )[ 3 ] , 3 ) == 0.114 )
stopifnot( round( SE( crude_by_age )[ 1 ] , 3 ) == 0.005 )
stopifnot( round( SE( crude_by_age )[ 2 ] , 3 ) == 0.011 )
stopifnot( round( SE( crude_by_age )[ 3 ] , 3 ) == 0.008 )
Sum up 2000 Census totals based on the age groupings specified in footnote:
pop_by_age <-
data.frame(
age_categories = c( "0-19" , "20-39" , "40-59" , "60+" ) ,
Freq = c( 78782657 , 77670618 , 72816615 , 45363752 )
)
Create a design with the nationwide population stratified to the above census counts:
nhanes_age_adjusted <-
postStratify(
subset( nhanes_design , !is.na( hi_tchol ) ) ,
~ age_categories ,
pop_by_age
)
Match the overall adjusted estimates:
results_overall <-
svymean( ~ hi_tchol , subset( nhanes_age_adjusted , ridageyr >= 20 ) , na.rm = TRUE )
stopifnot( round( coef( results_overall ) , 3 ) == 0.114 )
stopifnot( round( SE( results_overall ) , 3 ) == 0.006 )
Create a design stratified to census counts broken out by gender, then match those estimates:
nhanes_by_gender <-
svystandardize(
nhanes_design ,
by = ~ age_categories , # stratification variable
over = ~ gender , # break out variable
population = pop_by_age , # data.frame containing census populations
excluding.missing = ~ hi_tchol # analysis variable of interest
)
results_by_gender <-
svyby(
~ hi_tchol ,
~ gender ,
subset( nhanes_by_gender , ridageyr >= 20 ) ,
svymean ,
na.rm=TRUE
)
stopifnot( round( coef( results_by_gender )[ 1 ] , 3 ) == 0.105 )
stopifnot( round( coef( results_by_gender )[ 2 ] , 3 ) == 0.121 )
stopifnot( round( SE( results_by_gender )[ 1 ] , 3 ) == 0.007 )
stopifnot( round( SE( results_by_gender )[ 2 ] , 3 ) == 0.008 )
Create a design stratified to census counts broken out by race/ethnicity, then match those estimates:
nhanes_by_race <-
svystandardize(
nhanes_design ,
by = ~ age_categories , # stratification variable
over = ~ race_ethnicity , # break out variable
population = pop_by_age , # data.frame containing census populations
excluding.missing = ~ hi_tchol # analysis variable of interest
)
results_by_race_ethnicity <-
svyby(
~ hi_tchol ,
~ race_ethnicity ,
design = subset( nhanes_by_race , ridageyr >= 20 ) ,
svymean ,
na.rm=TRUE
)
stopifnot( round( coef( results_by_race_ethnicity )[ 1 ] , 3 ) == 0.117 )
stopifnot( round( coef( results_by_race_ethnicity )[ 2 ] , 3 ) == 0.100 )
stopifnot( round( coef( results_by_race_ethnicity )[ 3 ] , 3 ) == 0.116 )
stopifnot( round( coef( results_by_race_ethnicity )[ 4 ] , 3 ) == 0.109 )
stopifnot( round( SE( results_by_race_ethnicity )[ 1 ] , 3 ) == 0.007 )
stopifnot( round( SE( results_by_race_ethnicity )[ 2 ] , 3 ) == 0.009 )
stopifnot( round( SE( results_by_race_ethnicity )[ 3 ] , 3 ) == 0.011 )
stopifnot( round( SE( results_by_race_ethnicity )[ 4 ] , 3 ) == 0.009 )
Analysis Examples with srvyr
The R srvyr
library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize
, group_by
, and mutate
, the convenience of pipe-able functions, the tidyverse
style of non-standard evaluation and more consistent return types than the survey
package. This vignette details the available features. As a starting point for NHANES users, this code replicates previously-presented examples:
Calculate the mean (average) of a linear variable, overall and by groups: