Basic Stand Alone Public Use File (BSAPUF)

Build Status Build status

The CMS Basic Stand Alone Public Use File (BSAPUF) contains a five percent sample of Medicare beneficiary spending and utilization in the enrolled population.

Simplified Download and Importation

The R lodown package easily downloads and imports all available BSAPUF microdata by simply specifying "bsapuf" with an output_dir = parameter in the lodown() function. Depending on your internet connection and computer processing speed, you might prefer to run this step overnight.

library(lodown)
lodown( "bsapuf" , output_dir = file.path( path.expand( "~" ) , "BSAPUF" ) )

Analysis Examples with SQL and RSQLite  

Connect to a database:

library(DBI)
dbdir <- file.path( path.expand( "~" ) , "BSAPUF" , "SQLite.db" )
db <- dbConnect( RSQLite::SQLite() , dbdir )

Variable Recoding

Add new columns to the data set:

dbSendQuery( 
    db , 
    "ALTER TABLE bsa_partd_events_2008 ADD COLUMN brand_name_drug INTEGER" 
)

dbSendQuery( db , 
    "UPDATE bsa_partd_events_2008 
    SET brand_name_drug = 
        CASE 
            WHEN pde_drug_type_cd = 1 THEN 1 
            WHEN pde_drug_type_cd = 2 THEN 0 
            ELSE NULL 
        END" 
)

Unweighted Counts

Count the unweighted number of records in the SQL table, overall and by groups:

dbGetQuery( db , "SELECT COUNT(*) FROM bsa_partd_events_2008" )

dbGetQuery( db ,
    "SELECT
        bene_sex_ident_cd ,
        COUNT(*) 
    FROM bsa_partd_events_2008
    GROUP BY bene_sex_ident_cd"
)

Descriptive Statistics

Calculate the mean (average) of a linear variable, overall and by groups:

dbGetQuery( db , "SELECT AVG( pde_drug_cost ) FROM bsa_partd_events_2008" )

dbGetQuery( db , 
    "SELECT 
        bene_sex_ident_cd , 
        AVG( pde_drug_cost ) AS mean_pde_drug_cost
    FROM bsa_partd_events_2008 
    GROUP BY bene_sex_ident_cd" 
)

Calculate the distribution of a categorical variable:

dbGetQuery( db , 
    "SELECT 
        bene_age_cat_cd , 
        COUNT(*) / ( SELECT COUNT(*) FROM bsa_partd_events_2008 ) 
            AS share_bene_age_cat_cd
    FROM bsa_partd_events_2008 
    GROUP BY bene_age_cat_cd" 
)

Calculate the sum of a linear variable, overall and by groups:

dbGetQuery( db , "SELECT SUM( pde_drug_cost ) FROM bsa_partd_events_2008" )

dbGetQuery( db , 
    "SELECT 
        bene_sex_ident_cd , 
        SUM( pde_drug_cost ) AS sum_pde_drug_cost 
    FROM bsa_partd_events_2008 
    GROUP BY bene_sex_ident_cd" 
)

Calculate the 25th, median, and 75th percentiles of a linear variable, overall and by groups:

RSQLite::initExtension( db )

dbGetQuery( db , 
    "SELECT 
        LOWER_QUARTILE( pde_drug_cost ) , 
        MEDIAN( pde_drug_cost ) , 
        UPPER_QUARTILE( pde_drug_cost ) 
    FROM bsa_partd_events_2008" 
)

dbGetQuery( db , 
    "SELECT 
        bene_sex_ident_cd , 
        LOWER_QUARTILE( pde_drug_cost ) AS lower_quartile_pde_drug_cost , 
        MEDIAN( pde_drug_cost ) AS median_pde_drug_cost , 
        UPPER_QUARTILE( pde_drug_cost ) AS upper_quartile_pde_drug_cost
    FROM bsa_partd_events_2008 
    GROUP BY bene_sex_ident_cd" 
)

Subsetting

Limit your SQL analysis to events where patient paid 100% of drug’s cost with WHERE:

dbGetQuery( db ,
    "SELECT
        AVG( pde_drug_cost )
    FROM bsa_partd_events_2008
    WHERE pde_drug_pat_pay_cd = 3"
)

Measures of Uncertainty

Calculate the variance and standard deviation, overall and by groups:

RSQLite::initExtension( db )

dbGetQuery( db , 
    "SELECT 
        VARIANCE( pde_drug_cost ) , 
        STDEV( pde_drug_cost ) 
    FROM bsa_partd_events_2008" 
)

dbGetQuery( db , 
    "SELECT 
        bene_sex_ident_cd , 
        VARIANCE( pde_drug_cost ) AS var_pde_drug_cost ,
        STDEV( pde_drug_cost ) AS stddev_pde_drug_cost
    FROM bsa_partd_events_2008 
    GROUP BY bene_sex_ident_cd" 
)

Regression Models and Tests of Association

Perform a t-test:

bsapuf_slim_df <- 
    dbGetQuery( db , 
        "SELECT 
            pde_drug_cost , 
            brand_name_drug ,
            bene_age_cat_cd
        FROM bsa_partd_events_2008" 
    )

t.test( pde_drug_cost ~ brand_name_drug , bsapuf_slim_df )

Perform a chi-squared test of association:

this_table <-
    table( bsapuf_slim_df[ , c( "brand_name_drug" , "bene_age_cat_cd" ) ] )

chisq.test( this_table )

Perform a generalized linear model:

glm_result <- 
    glm( 
        pde_drug_cost ~ brand_name_drug + bene_age_cat_cd , 
        data = bsapuf_slim_df
    )

summary( glm_result )

Analysis Examples with dplyr  

The R dplyr library offers an alternative grammar of data manipulation to base R and SQL syntax. dplyr offers many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, and the tidyverse style of non-standard evaluation. This vignette details the available features. As a starting point for BSAPUF users, this code replicates previously-presented examples:

library(dplyr)
library(dbplyr)
dplyr_db <- dplyr::src_sqlite( dbdir )
bsapuf_tbl <- tbl( dplyr_db , 'bsa_partd_events_2008' )

Calculate the mean (average) of a linear variable, overall and by groups:

bsapuf_tbl %>%
    summarize( mean = mean( pde_drug_cost ) )

bsapuf_tbl %>%
    group_by( bene_sex_ident_cd ) %>%
    summarize( mean = mean( pde_drug_cost ) )