National Health Interview Survey (NHIS)

Github Actions Badge

America’s most detailed household survey of health status and medical experience.

  • One table with one row per sampled adult (18+) within each sampled household, one table with one row per sample child (when available, same family not required), multiply-imputed income tables.

  • A complex sample survey designed to generalize to the U.S. civilian non-institutionalized population.

  • Released annually since 1963, the most recent major re-design in 2019.

  • Conducted by the National Center for Health Statistics at the Centers for Disease Control.


Please skim before you begin:

  1. 2021 Survey Description

  2. Wikipedia Entry

  3. This human-composed haiku or a bouquet of artificial intelligence-generated limericks

# excellent health poor
# wealth. "sup, doc?" bugs, daft bills, free
# laughs best medicine

Download, Import, Preparation

Define a function to download, unzip, and import each comma-separated value file:

nhis_csv_import <-
    function( this_url ){
        
        this_tf <- tempfile()
        
        download.file( this_url , this_tf , mode = 'wb' )
        
        unzipped_files <- unzip( this_tf , exdir = tempdir() )
        
        this_csv <- grep( '\\.csv$' , unzipped_files , value = TRUE )
        
        this_df <- read.csv( this_csv )
        
        file.remove( c( this_tf , unzipped_files ) )
        
        names( this_df ) <- tolower( names( this_df ) )
        
        this_df
    }

Download and import the sample adult interview and imputed income files:

nhis_df <-
    nhis_csv_import( 
        "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NHIS/2021/adult21csv.zip" 
    )

imputed_income_df <- 
    nhis_csv_import( 
        "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NHIS/2021/adultinc21csv.zip" 
    )

Save locally  

Save the object at any point:

# nhis_fn <- file.path( path.expand( "~" ) , "NHIS" , "this_file.rds" )
# saveRDS( nhis_df , file = nhis_fn , compress = FALSE )

Load the same object:

# nhis_df <- readRDS( nhis_fn )

Survey Design Definition

Construct a multiply-imputed, complex sample survey design:

Reshape the imputed income data.frame into a list based on the implicate number:

imputed_income_list <- split( imputed_income_df , imputed_income_df[ , 'impnum_a' ] )

Remove overlapping columns except the merge variable:

variables_to_remove <-
    setdiff( intersect( names( nhis_df ) , names( imputed_income_df ) ) , 'hhx' )

nhis_df <- nhis_df[ , !( names( nhis_df ) %in% variables_to_remove ) ]

Merge each implicate onto the sample adult table:

nhis_list <-
    lapply( imputed_income_list ,
        function( w ){
            this_df <- merge( nhis_df , w )
            stopifnot( nrow( this_df ) == nrow( nhis_df ) )
            this_df
        } )

Define the design:

library(survey)
library(mitools)

nhis_design <- 
    svydesign( 
        id = ~ ppsu , 
        strata = ~ pstrat ,
        nest = TRUE ,
        weights = ~ wtfa_a ,
        data = imputationList( nhis_list )
    )

Variable Recoding

Add new columns to the data set:

nhis_design <- 
    update( 
        nhis_design , 
        
        one = 1 ,
        
        poverty_category =
            factor( 
                findInterval( povrattc_a , c( 1 , 2 , 4 ) ) ,
                labels = 
                    c( "below poverty" , "100-199%" , "200-399%" , "400%+" )
            ) ,
            
        fair_or_poor_reported_health = 
            ifelse( phstat_a %in% 1:5 , as.numeric( phstat_a >= 4 ) , NA ) ,
            
        sex_a = factor( sex_a , levels = 1:2 , labels = c( "male" , "female" ) ) ,
        
        annual_premium_first_plan = ifelse( hicostr1_a > 40000 , NA , hicostr1_a )

    )

Analysis Examples with the survey library  

Unweighted Counts

Count the unweighted number of records in the survey sample, overall and by groups:

MIcombine( with( nhis_design , svyby( ~ one , ~ one , unwtd.count ) ) )

MIcombine( with( nhis_design , svyby( ~ one , ~ poverty_category , unwtd.count ) ) )

Weighted Counts

Count the weighted size of the generalizable population, overall and by groups:

MIcombine( with( nhis_design , svytotal( ~ one ) ) )

MIcombine( with( nhis_design ,
    svyby( ~ one , ~ poverty_category , svytotal )
) )

Descriptive Statistics

Calculate the mean (average) of a linear variable, overall and by groups:

MIcombine( with( nhis_design , svymean( ~ agep_a ) ) )

MIcombine( with( nhis_design ,
    svyby( ~ agep_a , ~ poverty_category , svymean )
) )

Calculate the distribution of a categorical variable, overall and by groups:

MIcombine( with( nhis_design , svymean( ~ sex_a ) ) )

MIcombine( with( nhis_design ,
    svyby( ~ sex_a , ~ poverty_category , svymean )
) )

Calculate the sum of a linear variable, overall and by groups:

MIcombine( with( nhis_design , svytotal( ~ agep_a ) ) )

MIcombine( with( nhis_design ,
    svyby( ~ agep_a , ~ poverty_category , svytotal )
) )

Calculate the weighted sum of a categorical variable, overall and by groups:

MIcombine( with( nhis_design , svytotal( ~ sex_a ) ) )

MIcombine( with( nhis_design ,
    svyby( ~ sex_a , ~ poverty_category , svytotal )
) )

Calculate the median (50th percentile) of a linear variable, overall and by groups:

MIcombine( with( nhis_design ,
    svyquantile(
        ~ agep_a ,
        0.5 , se = TRUE 
) ) )

MIcombine( with( nhis_design ,
    svyby(
        ~ agep_a , ~ poverty_category , svyquantile ,
        0.5 , se = TRUE ,
        ci = TRUE 
) ) )

Estimate a ratio:

MIcombine( with( nhis_design ,
    svyratio( numerator = ~ annual_premium_first_plan , denominator = ~ agep_a , na.rm = TRUE )
) )

Subsetting

Restrict the survey design to uninsured:

sub_nhis_design <- subset( nhis_design , notcov_a == 1 )

Calculate the mean (average) of this subset:

MIcombine( with( sub_nhis_design , svymean( ~ agep_a ) ) )

Measures of Uncertainty

Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups:

this_result <-
    MIcombine( with( nhis_design ,
        svymean( ~ agep_a )
    ) )

coef( this_result )
SE( this_result )
confint( this_result )
cv( this_result )

grouped_result <-
    MIcombine( with( nhis_design ,
        svyby( ~ agep_a , ~ poverty_category , svymean )
    ) )

coef( grouped_result )
SE( grouped_result )
confint( grouped_result )
cv( grouped_result )

Calculate the degrees of freedom of any survey design object:

degf( nhis_design$designs[[1]] )

Calculate the complex sample survey-adjusted variance of any statistic:

MIcombine( with( nhis_design , svyvar( ~ agep_a ) ) )

Include the complex sample design effect in the result for a specific statistic:

# SRS without replacement
MIcombine( with( nhis_design ,
    svymean( ~ agep_a , deff = TRUE )
) )

# SRS with replacement
MIcombine( with( nhis_design ,
    svymean( ~ agep_a , deff = "replace" )
) )

Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives:

# MIsvyciprop( ~ fair_or_poor_reported_health , nhis_design ,
#   method = "likelihood" , na.rm = TRUE )

Regression Models and Tests of Association

Perform a design-based t-test:

# MIsvyttest( agep_a ~ fair_or_poor_reported_health , nhis_design )

Perform a chi-squared test of association for survey data:

# MIsvychisq( ~ fair_or_poor_reported_health + sex_a , nhis_design )

Perform a survey-weighted generalized linear model:

glm_result <- 
    MIcombine( with( nhis_design ,
        svyglm( agep_a ~ fair_or_poor_reported_health + sex_a )
    ) )
    
summary( glm_result )

Replication Example

This example matches statistics and standard errors within 0.01% from Figure 3 of this Characteristics of Adults Aged 18–64 Who Did Not Take Medication as Prescribed to Reduce Costs Data Brief:

results <-
    MIcombine( 
        with(
            subset( nhis_design , agep_a < 65 ) , 
            svyby(
                ~ as.numeric( rxsk12m_a == 1 | rxls12m_a == 1 | rxdl12m_a == 1 ) , 
                ~ poverty_category , 
                svymean , 
                na.rm = TRUE 
            ) 
        )
    )

stopifnot(
    all(
        as.numeric( round( coef( results ) , 3 ) ) == c( 0.145 , 0.138 , 0.099 , 0.039 )
    ) 
)

stopifnot(
    all( 
        as.numeric( round( SE( results ) , 5 ) ) - c( 0.0126 , 0.0098 , 0.0062 , 0.0031 ) < 0.0001
    ) 
)