Progress in International Reading Literacy Study (PIRLS)
A comparative study of student achievement in reading and literacy across more than 50 nations.
Grade-specific tables with one record per school, student, teacher, plus files containing student achievement, home background, student-teacher linkage, and within-country scoring reliability.
A complex survey generalizing to fourth-grade populations of participating countries.
Released quinquennially since 2001.
Funded by the International Association for the Evaluation of Educational Achievement, run at BC.
Please skim before you begin:
A haiku regarding this microdata:
# lascaux canary
# glyph jump reveal caged bard notes
# cryogenesis
Function Definitions
This survey uses a multiply-imputed variance estimation technique described in Methods Chapter 13. Most users do not need to study this function carefully. Define a function specific to only this dataset:
<-
pirls_MIcombine function (results, variances, call = sys.call(), df.complete = Inf, ...) {
<- length(results)
m <- attr(results, "call")
oldcall if (missing(variances)) {
<- suppressWarnings(lapply(results, vcov))
variances <- lapply(results, coef)
results
}<- variances[[1]]
vbar <- results[[1]]
cbar for (i in 2:m) {
<- cbar + results[[i]]
cbar <- vbar + variances[[i]]
vbar
}<- cbar/m
cbar <- vbar/m
vbar
# MODIFICATION
# evar <- var(do.call("rbind", results))
<- sum( ( unlist( results ) - cbar )^2 / 4 )
evar
<- (1 + 1/m) * evar/vbar
r <- (m - 1) * (1 + 1/r)^2
df if (is.matrix(df)) df <- diag(df)
if (is.finite(df.complete)) {
<- ((df.complete + 1)/(df.complete + 3)) * df.complete *
dfobs /(vbar + evar)
vbarif (is.matrix(dfobs)) dfobs <- diag(dfobs)
<- 1/(1/dfobs + 1/df)
df
}if (is.matrix(r)) r <- diag(r)
<- list(coefficients = cbar, variance = vbar + evar *
rval + 1)/m, call = c(oldcall, call), nimp = m, df = df,
(m missinfo = (r + 2/(df + 3))/(r + 1))
class(rval) <- "MIresult"
rval }
Download, Import, Preparation
Download and unzip the 2021 fourth grade international database:
library(httr)
<- tempfile()
tf
<- "https://pirls2021.org/data/downloads/P21_Data_R.zip"
this_url
GET( this_url , write_disk( tf ) , progress() )
<- unzip( tf , exdir = tempdir() ) unzipped_files
Import and stack each of the student context data files for Abu Dhabi through Bulgaria:
library(haven)
# limit unzipped files to those starting with `asg` followed by three letters followed by `r5`
<-
asg_fns
unzipped_files[ grepl(
'^asg[a-z][a-z][a-z]r5' ,
basename( unzipped_files ) ,
ignore.case = TRUE
)
]
# further limit asg files to the first ten countries
<-
countries_thru_bulgaria c("aad", "adu", "alb", "are", "aus", "aut", "aze", "bfl", "bfr", "bgr")
<-
fns_thru_bulgaria paste0( paste0( '^asg' , countries_thru_bulgaria , 'r5' ) , collapse = "|" )
<-
asg_aad_bgr_fns grepl( fns_thru_bulgaria , basename( asg_fns ) , ignore.case = TRUE ) ]
asg_fns[
<- NULL
pirls_df
for( rdata_fn in asg_aad_bgr_fns ){
<- load( rdata_fn )
this_tbl_name
<- get( this_tbl_name ) ; rm( this_tbl_name )
this_tbl
<- zap_labels( this_tbl )
this_tbl
<- data.frame( this_tbl )
this_df
names( this_df ) <- tolower( names( this_df ) )
<- rbind( pirls_df , this_df )
pirls_df
}
# order the data.frame by unique student id
<- pirls_df[ with( pirls_df , order( idcntry , idstud ) ) , ] pirls_df
Save Locally
Save the object at any point:
# pirls_fn <- file.path( path.expand( "~" ) , "PIRLS" , "this_file.rds" )
# saveRDS( pirls_df , file = pirls_fn , compress = FALSE )
Load the same object:
# pirls_df <- readRDS( pirls_fn )
Survey Design Definition
Construct a multiply-imputed, complex sample survey design:
From among possibly plausible values, determine all columns that are multiply-imputed plausible values:
# identify all columns ending with `01` thru `05`
<- grep( "(.*)0[1-5]$" , names( pirls_df ) , value = TRUE )
ppv
# remove those ending digits
<- gsub( "0[1-5]$" , "" , ppv )
ppv_prefix
# identify each of the possibilities with exactly five matches (five implicates)
<- names( table( ppv_prefix )[ table( ppv_prefix ) == 5 ] )
pv
# identify each of the `01` thru `05` plausible value columns
<-
pv_columns grep(
paste0( "^" , pv , "0[1-5]$" , collapse = "|" ) ,
names( pirls_df ) ,
value = TRUE
)
Extract those multiply-imputed columns into a separate data.frame, then remove them from the source:
<- pirls_df[ c( 'idcntry' , 'idstud' , pv_columns ) ]
pv_wide_df
<- NULL pirls_df[ pv_columns ]
Reshape these columns from one record per student to one record per student per implicate:
<-
pv_long_df reshape(
pv_wide_df , varying = lapply( paste0( pv , '0' ) , paste0 , 1:5 ) ,
direction = 'long' ,
timevar = 'implicate' ,
idvar = c( 'idcntry' , 'idstud' )
)
names( pv_long_df ) <- gsub( "01$" , "" , names( pv_long_df ) )
Merge the columns from the source data.frame onto the one record per student per implicate data.frame:
<- merge( pirls_df , pv_long_df )
pirls_long_df
<- pirls_long_df[ with( pirls_long_df , order( idcntry , idstud ) ) , ]
pirls_long_df
stopifnot( nrow( pirls_long_df ) == nrow( pv_long_df ) )
stopifnot( nrow( pirls_long_df ) / 5 == nrow( pirls_df ) )
Divide the five plausible value implicates into a list with five data.frames based on the implicate number:
<- split( pirls_long_df , pirls_long_df[ , 'implicate' ] ) pirls_list
Construct a replicate weights table following the estimation technique described in Methods Chapter 13:
<- pirls_df[ c( 'jkrep' , 'jkzone' ) ]
weights_df
for( j in 1:75 ){
for( i in 0:1 ){
'jkzone' ] != j , paste0( 'rw' , i , j ) ] <- 1
weights_df[ weights_df[ ,
'jkzone' ] == j , paste0( 'rw' , i , j ) ] <-
weights_df[ weights_df[ , 2 * ( weights_df[ weights_df[ , 'jkzone' ] == j , 'jkrep' ] == i )
}
}
c( 'jkrep' , 'jkzone' ) ] <- NULL weights_df[
Define the design:
library(survey)
library(mitools)
<-
pirls_design svrepdesign(
weights = ~totwgt ,
repweights = weights_df ,
data = imputationList( pirls_list ) ,
type = "other" ,
scale = 0.5 ,
rscales = rep( 1 , 150 ) ,
combined.weights = FALSE ,
mse = TRUE
)
Variable Recoding
Add new columns to the data set:
<-
pirls_design update(
pirls_design ,
one = 1 ,
countries_thru_bulgaria =
factor(
as.numeric( idcntry ) ,
levels = c(7842L, 7841L, 8L, 784L, 36L, 40L, 31L, 956L, 957L, 100L) ,
labels =
c("Abu Dhabi, UAE", "Dubai, UAE", "Albania", "UAE", "Australia", "Austria",
"Azerbaijan", "Belgium (Flemish)", "Belgium (French)","Bulgaria")
) ,
sex = factor( itsex , levels = 1:2 , labels = c( "female" , "male" ) ) ,
always_speak_language_of_test_at_home =
ifelse( asbg03 %in% 1:4 , as.numeric( asbg03 == 1 ) , NA )
)
Analysis Examples with the survey
library
Unweighted Counts
Count the unweighted number of records in the survey sample, overall and by groups:
pirls_MIcombine( with( pirls_design , svyby( ~ one , ~ one , unwtd.count ) ) )
pirls_MIcombine( with( pirls_design , svyby( ~ one , ~ sex , unwtd.count ) ) )
Weighted Counts
Count the weighted size of the generalizable population, overall and by groups:
pirls_MIcombine( with( pirls_design , svytotal( ~ one ) ) )
pirls_MIcombine( with( pirls_design ,
svyby( ~ one , ~ sex , svytotal )
) )
Descriptive Statistics
Calculate the mean (average) of a linear variable, overall and by groups:
pirls_MIcombine( with( pirls_design , svymean( ~ asrrea , na.rm = TRUE ) ) )
pirls_MIcombine( with( pirls_design ,
svyby( ~ asrrea , ~ sex , svymean , na.rm = TRUE )
) )
Calculate the distribution of a categorical variable, overall and by groups:
pirls_MIcombine( with( pirls_design , svymean( ~ countries_thru_bulgaria ) ) )
pirls_MIcombine( with( pirls_design ,
svyby( ~ countries_thru_bulgaria , ~ sex , svymean )
) )
Calculate the sum of a linear variable, overall and by groups:
pirls_MIcombine( with( pirls_design , svytotal( ~ asrrea , na.rm = TRUE ) ) )
pirls_MIcombine( with( pirls_design ,
svyby( ~ asrrea , ~ sex , svytotal , na.rm = TRUE )
) )
Calculate the weighted sum of a categorical variable, overall and by groups:
pirls_MIcombine( with( pirls_design , svytotal( ~ countries_thru_bulgaria ) ) )
pirls_MIcombine( with( pirls_design ,
svyby( ~ countries_thru_bulgaria , ~ sex , svytotal )
) )
Calculate the median (50th percentile) of a linear variable, overall and by groups:
pirls_MIcombine( with( pirls_design ,
svyquantile(
~ asrrea ,
0.5 , se = TRUE , na.rm = TRUE
) ) )
pirls_MIcombine( with( pirls_design ,
svyby(
~ asrrea , ~ sex , svyquantile ,
0.5 , se = TRUE ,
ci = TRUE , na.rm = TRUE
) ) )
Estimate a ratio:
pirls_MIcombine( with( pirls_design ,
svyratio( numerator = ~ asrlit , denominator = ~ asrrea )
) )
Subsetting
Restrict the survey design to Australia, Austria, Azerbaijan, Belgium (French):
<- subset( pirls_design , idcntry %in% c( 36 , 40 , 31 , 956 ) ) sub_pirls_design
Calculate the mean (average) of this subset:
pirls_MIcombine( with( sub_pirls_design , svymean( ~ asrrea , na.rm = TRUE ) ) )
Measures of Uncertainty
Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups:
<-
this_result pirls_MIcombine( with( pirls_design ,
svymean( ~ asrrea , na.rm = TRUE )
) )
coef( this_result )
SE( this_result )
confint( this_result )
cv( this_result )
<-
grouped_result pirls_MIcombine( with( pirls_design ,
svyby( ~ asrrea , ~ sex , svymean , na.rm = TRUE )
) )
coef( grouped_result )
SE( grouped_result )
confint( grouped_result )
cv( grouped_result )
Calculate the degrees of freedom of any survey design object:
degf( pirls_design$designs[[1]] )
Calculate the complex sample survey-adjusted variance of any statistic:
pirls_MIcombine( with( pirls_design , svyvar( ~ asrrea , na.rm = TRUE ) ) )
Include the complex sample design effect in the result for a specific statistic:
# SRS without replacement
pirls_MIcombine( with( pirls_design ,
svymean( ~ asrrea , na.rm = TRUE , deff = TRUE )
) )
# SRS with replacement
pirls_MIcombine( with( pirls_design ,
svymean( ~ asrrea , na.rm = TRUE , deff = "replace" )
) )
Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop
for alternatives:
# MIsvyciprop( ~ always_speak_language_of_test_at_home , pirls_design ,
# method = "likelihood" , na.rm = TRUE )
Regression Models and Tests of Association
Perform a design-based t-test:
# MIsvyttest( asrrea ~ always_speak_language_of_test_at_home , pirls_design )
Perform a chi-squared test of association for survey data:
# MIsvychisq( ~ always_speak_language_of_test_at_home + countries_thru_bulgaria , pirls_design )
Perform a survey-weighted generalized linear model:
<-
glm_result pirls_MIcombine( with( pirls_design ,
svyglm( asrrea ~ always_speak_language_of_test_at_home + countries_thru_bulgaria )
) )
summary( glm_result )
Replication Example
This example matches the mean proficiency and standard error of the Australia
row of the Summary Statistics and Standard Errors for Proficiency in Overall Reading
table from the Appendix 13A: Summary Statistics and Standard Errors for Proficiency in Reading:
<- subset( pirls_design , countries_thru_bulgaria %in% "Australia" )
australia_design
stopifnot( nrow( australia_design ) == 5487 )
<- pirls_MIcombine( with( australia_design , svymean( ~ asrrea ) ) )
result
stopifnot( round( coef( result ) , 3 ) == 540.134 )
stopifnot( round( SE( result ) , 3 ) == 1.728 )
This example matches the jackknife sampling, imputation, and total variances of the same row:
<- unzipped_files[ grepl( 'ASGAUS' , basename( unzipped_files ) ) ]
australia_fn <- load( australia_fn )
australia_tbl_name <- get( australia_tbl_name ) ; rm( australia_tbl_name )
australia_tbl <- zap_labels( australia_tbl )
australia_tbl <- data.frame( australia_tbl )
australia_df names( australia_df ) <- tolower( names( australia_df ) )
<-
estimate mean( c(
with( australia_df , weighted.mean( asrrea01 , totwgt ) ) ,
with( australia_df , weighted.mean( asrrea02 , totwgt ) ) ,
with( australia_df , weighted.mean( asrrea03 , totwgt ) ) ,
with( australia_df , weighted.mean( asrrea04 , totwgt ) ) ,
with( australia_df , weighted.mean( asrrea05 , totwgt ) )
) )
stopifnot( round( estimate , 3 ) == 540.134 )
for( k in 1:5 ){
<- 0
this_variance
for( j in 1:75 ){
for( i in 0:1 ){
<-
this_variance +
this_variance
( weighted.mean(
paste0( 'asrrea0' , k ) ] ,
australia_df[ , ifelse(
== australia_df[ , 'jkzone' ] ,
j 'totwgt' ] * 2 * ( australia_df[ , 'jkrep' ] == i ) ,
australia_df[ , 'totwgt' ]
australia_df[ ,
)-
) weighted.mean(
paste0( 'asrrea0' , k ) ] ,
australia_df[ , 'totwgt' ]
australia_df[ ,
)^2
)
}
}
assign( paste0( 'v' , k ) , this_variance * 0.5 )
}
<- mean( c( v1 , v2 , v3 , v4 , v5 ) )
sampling_variance stopifnot( round( sampling_variance , 3 ) == 2.653 )
<-
imputation_variance 6 / 5 ) *
(
( with( australia_df , weighted.mean( asrrea01 , totwgt ) ) - estimate )^2 / 4 ) +
( ( with( australia_df , weighted.mean( asrrea02 , totwgt ) ) - estimate )^2 / 4 ) +
( ( with( australia_df , weighted.mean( asrrea03 , totwgt ) ) - estimate )^2 / 4 ) +
( ( with( australia_df , weighted.mean( asrrea04 , totwgt ) ) - estimate )^2 / 4 ) +
( ( with( australia_df , weighted.mean( asrrea05 , totwgt ) ) - estimate )^2 / 4 )
( (
)
stopifnot( round( imputation_variance , 3 ) == 0.333 )
stopifnot( round( sampling_variance + imputation_variance , 3 ) == 2.987 )