Survey of Business Owners (SBO)
The Survey of Business Owners tracks nearly every tax-filing sole proprietorship, partnership, and corporation in the nation.
One table with one row per firm per state per industry.
A complex sample survey designed to generalize to all firms in the United States; however, the public use microdata includes only classifiable (non-identifiable) firms, which comprise nearly all businesses but only about half of workers.
Released as part of the U.S. Census Bureau’s Economic Census, every year ending in 2 or 7.
Administered by the U.S. Census Bureau.
Simplified Download and Importation
The R lodown package easily downloads and imports all available SBO microdata by simply specifying "sbo" with an output_dir = parameter in the lodown() function. Depending on your internet connection and computer processing speed, you might prefer to run this step overnight.
# attach lodown, then download and import the SBO microdata into ~/SBO
library(lodown)
lodown(
  "sbo",
  output_dir = file.path(path.expand("~"), "SBO")
)
Analysis Examples with the survey library
Construct a multiply-imputed, complex sample survey design:
# run garbage collection before loading the design object
gc()

# set the survey.lonely.psu option to "adjust"
options(survey.lonely.psu = "adjust")

library(survey)
library(mitools)

# read the saved 2007 design object from ~/SBO
sbo_design <- readRDS(file.path(path.expand("~"), "SBO", "2007 main.rds"))

# the only columns the examples in this document rely on
variables_to_keep <- c(
  "one",
  "newwgt",
  "tabwgt",
  "receipts_noisy",
  "employment_noisy",
  "n07_employer",
  "established",
  "healthins",
  "husbwife"
)

# drop every other column from the `coef` component ...
sbo_design$coef$variables <- sbo_design$coef$variables[variables_to_keep]

# ... and from each element of the `var` component
sbo_design$var <- lapply(sbo_design$var, function(this_element) {
  this_element$variables <- this_element$variables[variables_to_keep]
  this_element
})

# trimming the columns above conserves RAM; collect the freed memory now
gc()
Variable Recoding
Add new columns to the data set:
# add one new column and relabel an existing one via sbo_update()
sbo_design <- sbo_update(
  sbo_design,

  # NA when `established` is "0" or "A"; otherwise 1 if `established < 4`, else 0
  established_before_2000 = ifelse(
    established %in% c("0", "A"),
    NA,
    as.numeric(established < 4)
  ),

  # turn codes 1 and 2 into a labelled two-level factor
  healthins = factor(
    healthins,
    levels = 1:2,
    labels = c(
      "offered health insurance",
      "did not offer health insurance"
    )
  )
)

gc()
Unweighted Counts
Count the unweighted number of records in the survey sample, overall and by groups:
# unweighted record count, overall (grouped on the `one` column)
sbo_MIcombine(
  sbo_with(sbo_design, svyby(~one, ~one, unwtd.count))
)

# unweighted record count within each `healthins` category
sbo_MIcombine(
  sbo_with(sbo_design, svyby(~one, ~healthins, unwtd.count))
)
Weighted Counts
Count the weighted size of the generalizable population, overall and by groups:
# weighted total of `one`, overall
sbo_MIcombine(
  sbo_with(sbo_design, svytotal(~one))
)

# weighted total of `one` within each `healthins` category
sbo_MIcombine(
  sbo_with(sbo_design, svyby(~one, ~healthins, svytotal))
)
Descriptive Statistics
Calculate the mean (average) of a linear variable, overall and by groups:
# mean of `receipts_noisy`, overall
sbo_MIcombine(
  sbo_with(sbo_design, svymean(~receipts_noisy))
)

# mean of `receipts_noisy` within each `healthins` category
sbo_MIcombine(
  sbo_with(sbo_design, svyby(~receipts_noisy, ~healthins, svymean))
)
Calculate the distribution of a categorical variable, overall and by groups:
# distribution of `n07_employer`, overall, with missing values removed
sbo_MIcombine(
  sbo_with(sbo_design, svymean(~n07_employer, na.rm = TRUE))
)

# distribution of `n07_employer` within each `healthins` category
sbo_MIcombine(
  sbo_with(
    sbo_design,
    svyby(~n07_employer, ~healthins, svymean, na.rm = TRUE)
  )
)
Calculate the sum of a linear variable, overall and by groups:
# weighted sum of `receipts_noisy`, overall
sbo_MIcombine(
  sbo_with(sbo_design, svytotal(~receipts_noisy))
)

# weighted sum of `receipts_noisy` within each `healthins` category
sbo_MIcombine(
  sbo_with(sbo_design, svyby(~receipts_noisy, ~healthins, svytotal))
)
Calculate the weighted sum of a categorical variable, overall and by groups:
# weighted sum of `n07_employer`, overall, with missing values removed
sbo_MIcombine(
  sbo_with(sbo_design, svytotal(~n07_employer, na.rm = TRUE))
)

# weighted sum of `n07_employer` within each `healthins` category
sbo_MIcombine(
  sbo_with(
    sbo_design,
    svyby(~n07_employer, ~healthins, svytotal, na.rm = TRUE)
  )
)
Calculate the median (50th percentile) of a linear variable, overall and by groups:
# median (0.5 quantile) of `receipts_noisy`, overall
sbo_MIcombine(
  sbo_with(
    sbo_design,
    svyquantile(~receipts_noisy, 0.5, se = TRUE)
  )
)

# median (0.5 quantile) of `receipts_noisy` within each `healthins` category
sbo_MIcombine(
  sbo_with(
    sbo_design,
    svyby(
      ~receipts_noisy,
      ~healthins,
      svyquantile,
      0.5,
      se = TRUE,
      keep.var = TRUE,
      ci = TRUE
    )
  )
)
Estimate a ratio:
# ratio of `receipts_noisy` to `employment_noisy`
sbo_MIcombine(
  sbo_with(
    sbo_design,
    svyratio(
      numerator = ~receipts_noisy,
      denominator = ~employment_noisy
    )
  )
)
Subsetting
Restrict the survey design to firms jointly owned by husband and wife:
# keep only records whose `husbwife` code is 1, 2, or 3
sub_sbo_design <- sbo_subset(
  sbo_design,
  husbwife %in% 1:3
)
Calculate the mean (average) of this subset:
# mean of `receipts_noisy` within the subset design
sbo_MIcombine(
  sbo_with(sub_sbo_design, svymean(~receipts_noisy))
)

# discard the subset design and run garbage collection
rm(sub_sbo_design)
gc()
Measures of Uncertainty
Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups:
# overall mean of `receipts_noisy`, stored so its pieces can be extracted
this_result <- sbo_MIcombine(
  sbo_with(sbo_design, svymean(~receipts_noisy))
)

coef(this_result)
SE(this_result)
confint(this_result)
cv(this_result)

# the same mean within each `healthins` category
grouped_result <- sbo_MIcombine(
  sbo_with(sbo_design, svyby(~receipts_noisy, ~healthins, svymean))
)

coef(grouped_result)
SE(grouped_result)
confint(grouped_result)
cv(grouped_result)
Calculate the degrees of freedom of any survey design object:
# degrees of freedom of the design object
sbo_degf(sbo_design)
Calculate the complex sample survey-adjusted variance of any statistic:
# variance of `receipts_noisy` computed with svyvar()
sbo_MIcombine(
  sbo_with(sbo_design, svyvar(~receipts_noisy))
)
Include the complex sample design effect in the result for a specific statistic:
# design effect relative to simple random sampling without replacement
sbo_MIcombine(
  sbo_with(sbo_design, svymean(~receipts_noisy, deff = TRUE))
)

# design effect relative to simple random sampling with replacement
sbo_MIcombine(
  sbo_with(sbo_design, svymean(~receipts_noisy, deff = "replace"))
)
Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives:
# confidence interval for the proportion, using the "likelihood" method
sbo_MIsvyciprop(
  ~established_before_2000,
  sbo_design,
  method = "likelihood",
  na.rm = TRUE
)
gc()
Regression Models and Tests of Association
Perform a design-based t-test:
# not implemented: sbo_MIsvyttest( receipts_noisy ~ established_before_2000 , sbo_design )
Perform a chi-squared test of association for survey data:
# not implemented: sbo_MIsvychisq( ~ established_before_2000 + n07_employer , sbo_design )
Perform a survey-weighted generalized linear model:
# regress `receipts_noisy` on `established_before_2000` and `n07_employer`
glm_result <- sbo_MIcombine(
  sbo_with(
    sbo_design,
    svyglm(receipts_noisy ~ established_before_2000 + n07_employer)
  )
)

# print the combined model
glm_result