Pesquisa de Orçamentos Familiares (POF)
Brazil’s household budget survey designed to guide major economic indicators like the Contas nacionais.
Various tables with one record per sampled household, resident, job, expenditure.
A complex sample survey designed to generalize to the civilian population of Brazil.
Released at irregular intervals, with 2002-2003, 2008-2009, and 2017-2018 microdata available.
Administered by the Instituto Brasileiro de Geografia e Estatística.
Please skim before you begin:
Pesquisa de Orçamentos Familiares 2017-2018 Perfil das despesas no Brasil
A haiku regarding this microdata:
Download, Import, Preparation
Download the dictionary files:
# Fetch the POF 2017-2018 documentation archive from the IBGE FTP server and
# unpack its contents into this session's temporary directory.
library(archive)

# local destination for the downloaded archive
dictionary_tf <- tempfile()

dictionary_url <- paste0(
  "https://ftp.ibge.gov.br/Orcamentos_Familiares/",
  "Pesquisa_de_Orcamentos_Familiares_2017_2018/Microdados/Documentacao_20230713.zip"
)

# binary write mode so the zip is not altered in transit on any platform
download.file(url = dictionary_url, destfile = dictionary_tf, mode = "wb")

# keep the value returned by the extraction call for later inspection
dictionary_files <- archive_extract(dictionary_tf, dir = tempdir())
Import the household variable dictionary:
# Build the household ("Domicílio") variable dictionary: one row per
# fixed-width field, with its start position, width, decimals and name.
library(readxl)

dictionary_fn <- file.path(tempdir(), "Dicionários de váriaveis.xls")

# the first three rows of the sheet are skipped before the header is read
domicilio_dictionary_tbl <-
  read_excel(dictionary_fn, sheet = "Domicílio", skip = 3)

domicilio_dictionary_df <- data.frame(domicilio_dictionary_tbl)

names(domicilio_dictionary_df) <-
  c(
    "position", "length", "decimals",
    "column_name", "description", "variable_labels"
  )

# lapply() always returns one integer vector per column, whereas sapply()
# changes its return shape (matrix, vector or list) with the input size
domicilio_dictionary_df[c("position", "length", "decimals")] <-
  lapply(
    domicilio_dictionary_df[c("position", "length", "decimals")],
    as.integer
  )

# keep only rows whose position parsed as an integer; the others are
# presumably value-label continuation rows -- TODO confirm against the sheet
domicilio_dictionary_df <-
  subset(domicilio_dictionary_df, !is.na(position))
Import the resident variable dictionary:
# Build the resident ("Morador") variable dictionary: one row per
# fixed-width field, with its start position, width, decimals and name.
# the first three rows of the sheet are skipped before the header is read
morador_dictionary_tbl <-
  read_excel(dictionary_fn, sheet = "Morador", skip = 3)

morador_dictionary_df <- data.frame(morador_dictionary_tbl)

names(morador_dictionary_df) <-
  c(
    "position", "length", "decimals",
    "column_name", "description", "variable_labels"
  )

# lapply() always returns one integer vector per column, whereas sapply()
# changes its return shape (matrix, vector or list) with the input size
morador_dictionary_df[c("position", "length", "decimals")] <-
  lapply(
    morador_dictionary_df[c("position", "length", "decimals")],
    as.integer
  )

# keep only rows whose position parsed as an integer; the others are
# presumably value-label continuation rows -- TODO confirm against the sheet
morador_dictionary_df <-
  subset(morador_dictionary_df, !is.na(position))
Import the post-stratification totals:
# Read the post-stratification totals workbook extracted alongside the
# dictionaries and give its five columns short lowercase names.
post_stratification_fn <- file.path(tempdir(), "Pos_estratos_totais.xlsx")

# the first five rows of the sheet are skipped before the header is read
post_stratification_tbl <- read_excel(post_stratification_fn, skip = 5)

post_stratification_df <-
  setNames(
    data.frame(post_stratification_tbl),
    c("estrato_pof", "pos_estrato", "total_pessoas", "uf", "cod_upa")
  )
Download the full dataset:
# Fetch the POF 2017-2018 microdata archive from the IBGE FTP server and
# unzip it into this session's temporary directory.
this_tf <- tempfile()

this_url <- paste0(
  "https://ftp.ibge.gov.br/Orcamentos_Familiares/",
  "Pesquisa_de_Orcamentos_Familiares_2017_2018/Microdados/Dados_20230713.zip"
)

# binary write mode so the zip is not altered in transit on any platform
download.file(url = this_url, destfile = this_tf, mode = "wb")

# unzip() returns the paths of the extracted files; they are searched below
unzipped_files <- unzip(zipfile = this_tf, exdir = tempdir())
Import the household table:
# Import the household fixed-width file using the field widths and names
# taken from the household dictionary, then lowercase the column names.
library(readr)

# locate the household file among the extracted paths
domicilio_fn <- grep("DOMICILIO\\.txt$", unzipped_files, value = TRUE)

domicilio_tbl <- read_fwf(
  domicilio_fn,
  fwf_widths(
    widths = domicilio_dictionary_df[["length"]],
    col_names = domicilio_dictionary_df[["column_name"]]
  )
)

domicilio_df <- data.frame(domicilio_tbl)

colnames(domicilio_df) <- tolower(colnames(domicilio_df))
Import the resident table:
# Import the resident fixed-width file using the field widths and names
# taken from the resident dictionary, then lowercase the column names.
# locate the resident file among the extracted paths
morador_fn <- grep("MORADOR\\.txt$", unzipped_files, value = TRUE)

morador_tbl <- read_fwf(
  morador_fn,
  fwf_widths(
    widths = morador_dictionary_df[["length"]],
    col_names = morador_dictionary_df[["column_name"]]
  )
)

morador_df <- data.frame(morador_tbl)

colnames(morador_df) <- tolower(colnames(morador_df))
Merge one household-level variable and also the post-stratification info onto the person-level table:
# Attach the household-level v6199 column to every resident record.
# No `by` is given, so merge() joins on the columns the two tables share.
dom_mor_df <-
  merge(
    x = domicilio_df[c("cod_upa", "num_dom", "v6199")],
    y = morador_df
  )

# Attach the post-stratification columns, again joining on shared columns.
pof_df <- merge(x = dom_mor_df, y = post_stratification_df)

# neither merge may drop or duplicate resident records
stopifnot(nrow(pof_df) == nrow(morador_df))
Save Locally
Save the object at any point:
# pof_fn <- file.path( path.expand( "~" ) , "POF" , "this_file.rds" )
# saveRDS( pof_df , file = pof_fn , compress = FALSE )
Load the same object:
Survey Design Definition
Construct a complex sample survey design:
# Define the complex sample design in two steps: a stratified, clustered
# design first, then post-stratification to per-post-stratum totals.
library(survey)

# controls how the survey package treats strata that contain a single PSU
options(survey.lonely.psu = "adjust")

pre_stratified_design <- svydesign(
  id = ~cod_upa,
  strata = ~estrato_pof,
  weights = ~peso,
  data = pof_df,
  nest = TRUE
)

# post-stratum totals are the sums of peso_final within each pos_estrato;
# postStratify() expects the count column to be named "Freq"
population_totals <-
  setNames(
    aggregate(peso_final ~ pos_estrato, data = pof_df, FUN = sum),
    c("pos_estrato", "Freq")
  )

pof_design <- postStratify(
  design = pre_stratified_design,
  strata = ~pos_estrato,
  population = population_totals
)
Variable Recoding
Add new columns to the data set:
# Add analysis columns to the design object: a constant, labelled factors
# for food security, age group and sex, and a 0/1 urban indicator.
pof_design <- update(
  pof_design,

  # constant column, handy for weighted counts
  one = 1,

  # v6199 codes 1 through 4 mapped to food security labels
  food_security = factor(
    v6199,
    levels = 1:4,
    labels = c("food secure", "mild", "moderate", "severe")
  ),

  # findInterval() returns 0 below the first cut point, so adding 1 maps
  # the nine left-closed age bands onto factor levels 1 through 9
  age_categories = factor(
    1 + findInterval(v0403, c(20, 25, 30, 35, 45, 55, 65, 75)),
    levels = 1:9,
    labels = c(
      "under 20", "20-24", "25-29", "30-34", "35-44",
      "45-54", "55-64", "65-74", "75+"
    )
  ),

  # v0404 codes 1 and 2 mapped to sex labels
  sex = factor(v0404, levels = 1:2, labels = c("male", "female")),

  # 1 when tipo_situacao_reg equals 1, otherwise 0
  urban = as.numeric(tipo_situacao_reg == 1)
)
Analysis Examples with the survey library
Unweighted Counts
Count the unweighted number of records in the survey sample, overall and by groups:
Descriptive Statistics
Calculate the mean (average) of a linear variable, overall and by groups:
Calculate the distribution of a categorical variable, overall and by groups:
Calculate the sum of a linear variable, overall and by groups:
Calculate the weighted sum of a categorical variable, overall and by groups:
Calculate the median (50th percentile) of a linear variable, overall and by groups:
# Median of renda_total for the full design
svyquantile(~renda_total, pof_design, quantiles = 0.5)

# Median of renda_total within each sex group; the quantile and ci settings
# are forwarded by svyby() to svyquantile()
svyby(
  formula = ~renda_total,
  by = ~sex,
  design = pof_design,
  FUN = svyquantile,
  quantiles = 0.5,
  ci = TRUE
)
Estimate a ratio:
Subsetting
Restrict the survey design to credit card holders:
Calculate the mean (average) of this subset:
Measures of Uncertainty
Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups:
# Overall mean of renda_total, followed by its point estimate, standard
# error, confidence interval and coefficient of variation
this_result <- svymean(~renda_total, pof_design)

coef(this_result)
SE(this_result)
confint(this_result)
cv(this_result)

# The same four extractors applied to the by-sex means
grouped_result <- svyby(
  formula = ~renda_total,
  by = ~sex,
  design = pof_design,
  FUN = svymean
)

coef(grouped_result)
SE(grouped_result)
confint(grouped_result)
cv(grouped_result)
Calculate the degrees of freedom of any survey design object:
Calculate the complex sample survey-adjusted variance of any statistic:
Include the complex sample design effect in the result for a specific statistic:
# Design effect measured against simple random sampling without replacement
svymean(x = ~renda_total, design = pof_design, deff = TRUE)

# Design effect measured against simple random sampling with replacement
svymean(x = ~renda_total, design = pof_design, deff = "replace")
Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives:
Replication Example
This example matches the 2017-2018 person-level food security estimates from Tabela 3:
# Person-level distribution of the four food security categories,
# ignoring records where food_security is missing
person_level_food_security <-
  svymean(~food_security, pof_design, na.rm = TRUE)

# The shares, rounded to two decimals, must equal the expected values;
# attributes such as names are excluded from the comparison
stopifnot(
  all.equal(
    target = round(coef(person_level_food_security), digits = 2),
    current = c(0.59, 0.27, 0.09, 0.05),
    check.attributes = FALSE
  )
)
Poverty and Inequality Estimation with convey
The R convey library estimates measures of income concentration, poverty, inequality, and wellbeing. This textbook details the available features. As a starting point for POF users, this code calculates the Gini coefficient on complex sample survey data:
# Prepare the design for convey estimators, then estimate the Gini
# coefficient of renda_total, ignoring missing values
library(convey)

pof_design <- convey_prep(pof_design)

svygini(formula = ~renda_total, design = pof_design, na.rm = TRUE)
Analysis Examples with srvyr
The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile, using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation, and more consistent return types than the survey package. This vignette details the available features. As a starting point for POF users, this code replicates previously-presented examples:
Calculate the mean (average) of a linear variable, overall and by groups: