National Health Interview Survey (NHIS)
America’s most detailed household survey of health status and medical experience.
One table with one row per sampled adult (18+) within each sampled household, one table with one row per sample child (when available, same family not required), multiply-imputed income tables.
A complex sample survey designed to generalize to the U.S. civilian non-institutionalized population.
Released annually since 1963, the most recent major re-design in 2019.
Conducted by the National Center for Health Statistics at the Centers for Disease Control.
Please skim before you begin:
This human-composed haiku or a bouquet of artificial intelligence-generated limericks
# excellent health poor
# wealth. "sup, doc?" bugs, daft bills, free
# laughs best medicine
Download, Import, Preparation
Define a function to download, unzip, and import each comma-separated value file:
<-
nhis_csv_import function( this_url ){
<- tempfile()
this_tf
download.file( this_url , this_tf , mode = 'wb' )
<- unzip( this_tf , exdir = tempdir() )
unzipped_files
<- grep( '\\.csv$' , unzipped_files , value = TRUE )
this_csv
<- read.csv( this_csv )
this_df
file.remove( c( this_tf , unzipped_files ) )
names( this_df ) <- tolower( names( this_df ) )
this_df }
Download and import the sample adult interview and imputed income files:
<-
nhis_df nhis_csv_import(
"https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NHIS/2021/adult21csv.zip"
)
<-
imputed_income_df nhis_csv_import(
"https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NHIS/2021/adultinc21csv.zip"
)
Save locally
Save the object at any point:
# nhis_fn <- file.path( path.expand( "~" ) , "NHIS" , "this_file.rds" )
# saveRDS( nhis_df , file = nhis_fn , compress = FALSE )
Load the same object:
# nhis_df <- readRDS( nhis_fn )
Survey Design Definition
Construct a multiply-imputed, complex sample survey design:
Reshape the imputed income data.frame into a list based on the implicate number:
<- split( imputed_income_df , imputed_income_df[ , 'impnum_a' ] ) imputed_income_list
Remove overlapping columns except the merge variable:
<-
variables_to_remove setdiff( intersect( names( nhis_df ) , names( imputed_income_df ) ) , 'hhx' )
<- nhis_df[ , !( names( nhis_df ) %in% variables_to_remove ) ] nhis_df
Merge each implicate onto the sample adult table:
<-
nhis_list lapply( imputed_income_list ,
function( w ){
<- merge( nhis_df , w )
this_df stopifnot( nrow( this_df ) == nrow( nhis_df ) )
this_df } )
Define the design:
library(survey)
library(mitools)
<-
nhis_design svydesign(
id = ~ ppsu ,
strata = ~ pstrat ,
nest = TRUE ,
weights = ~ wtfa_a ,
data = imputationList( nhis_list )
)
Variable Recoding
Add new columns to the data set:
<-
nhis_design update(
nhis_design ,
one = 1 ,
poverty_category =
factor(
findInterval( povrattc_a , c( 1 , 2 , 4 ) ) ,
labels =
c( "below poverty" , "100-199%" , "200-399%" , "400%+" )
) ,
fair_or_poor_reported_health =
ifelse( phstat_a %in% 1:5 , as.numeric( phstat_a >= 4 ) , NA ) ,
sex_a = factor( sex_a , levels = 1:2 , labels = c( "male" , "female" ) ) ,
annual_premium_first_plan = ifelse( hicostr1_a > 40000 , NA , hicostr1_a )
)
Analysis Examples with the survey
library
Unweighted Counts
Count the unweighted number of records in the survey sample, overall and by groups:
MIcombine( with( nhis_design , svyby( ~ one , ~ one , unwtd.count ) ) )
MIcombine( with( nhis_design , svyby( ~ one , ~ poverty_category , unwtd.count ) ) )
Weighted Counts
Count the weighted size of the generalizable population, overall and by groups:
MIcombine( with( nhis_design , svytotal( ~ one ) ) )
MIcombine( with( nhis_design ,
svyby( ~ one , ~ poverty_category , svytotal )
) )
Descriptive Statistics
Calculate the mean (average) of a linear variable, overall and by groups:
MIcombine( with( nhis_design , svymean( ~ agep_a ) ) )
MIcombine( with( nhis_design ,
svyby( ~ agep_a , ~ poverty_category , svymean )
) )
Calculate the distribution of a categorical variable, overall and by groups:
MIcombine( with( nhis_design , svymean( ~ sex_a ) ) )
MIcombine( with( nhis_design ,
svyby( ~ sex_a , ~ poverty_category , svymean )
) )
Calculate the sum of a linear variable, overall and by groups:
MIcombine( with( nhis_design , svytotal( ~ agep_a ) ) )
MIcombine( with( nhis_design ,
svyby( ~ agep_a , ~ poverty_category , svytotal )
) )
Calculate the weighted sum of a categorical variable, overall and by groups:
MIcombine( with( nhis_design , svytotal( ~ sex_a ) ) )
MIcombine( with( nhis_design ,
svyby( ~ sex_a , ~ poverty_category , svytotal )
) )
Calculate the median (50th percentile) of a linear variable, overall and by groups:
MIcombine( with( nhis_design ,
svyquantile(
~ agep_a ,
0.5 , se = TRUE
) ) )
MIcombine( with( nhis_design ,
svyby(
~ agep_a , ~ poverty_category , svyquantile ,
0.5 , se = TRUE ,
ci = TRUE
) ) )
Estimate a ratio:
MIcombine( with( nhis_design ,
svyratio( numerator = ~ annual_premium_first_plan , denominator = ~ agep_a , na.rm = TRUE )
) )
Subsetting
Restrict the survey design to uninsured:
<- subset( nhis_design , notcov_a == 1 ) sub_nhis_design
Calculate the mean (average) of this subset:
MIcombine( with( sub_nhis_design , svymean( ~ agep_a ) ) )
Measures of Uncertainty
Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups:
<-
this_result MIcombine( with( nhis_design ,
svymean( ~ agep_a )
) )
coef( this_result )
SE( this_result )
confint( this_result )
cv( this_result )
<-
grouped_result MIcombine( with( nhis_design ,
svyby( ~ agep_a , ~ poverty_category , svymean )
) )
coef( grouped_result )
SE( grouped_result )
confint( grouped_result )
cv( grouped_result )
Calculate the degrees of freedom of any survey design object:
degf( nhis_design$designs[[1]] )
Calculate the complex sample survey-adjusted variance of any statistic:
MIcombine( with( nhis_design , svyvar( ~ agep_a ) ) )
Include the complex sample design effect in the result for a specific statistic:
# SRS without replacement
MIcombine( with( nhis_design ,
svymean( ~ agep_a , deff = TRUE )
) )
# SRS with replacement
MIcombine( with( nhis_design ,
svymean( ~ agep_a , deff = "replace" )
) )
Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop
for alternatives:
# MIsvyciprop( ~ fair_or_poor_reported_health , nhis_design ,
# method = "likelihood" , na.rm = TRUE )
Regression Models and Tests of Association
Perform a design-based t-test:
# MIsvyttest( agep_a ~ fair_or_poor_reported_health , nhis_design )
Perform a chi-squared test of association for survey data:
# MIsvychisq( ~ fair_or_poor_reported_health + sex_a , nhis_design )
Perform a survey-weighted generalized linear model:
<-
glm_result MIcombine( with( nhis_design ,
svyglm( agep_a ~ fair_or_poor_reported_health + sex_a )
) )
summary( glm_result )
Replication Example
This example matches statistics and standard errors within 0.01% from Figure 3 of this Characteristics of Adults Aged 18–64 Who Did Not Take Medication as Prescribed to Reduce Costs Data Brief:
<-
results MIcombine(
with(
subset( nhis_design , agep_a < 65 ) ,
svyby(
~ as.numeric( rxsk12m_a == 1 | rxls12m_a == 1 | rxdl12m_a == 1 ) ,
~ poverty_category ,
svymean , na.rm = TRUE
)
)
)
stopifnot(
all(
as.numeric( round( coef( results ) , 3 ) ) == c( 0.145 , 0.138 , 0.099 , 0.039 )
)
)
stopifnot(
all(
as.numeric( round( SE( results ) , 5 ) ) - c( 0.0126 , 0.0098 , 0.0062 , 0.0031 ) < 0.0001
) )