Trends in International Mathematics and Science Study (TIMSS)
A comparative study of student achievement in math and science across more than 50 nations.
Grade-specific tables with one record per school, student, teacher, plus files containing student achievement, home background, student-teacher linkage, and within-country scoring reliability.
A complex survey generalizing to fourth- and eighth-grade populations of participating countries.
Released quadrennially since 1995.
Funded by the International Association for the Evaluation of Educational Achievement, run at BC.
Please skim before you begin:
TIMSS 2019 User Guide for the International Database, 2nd Edition
This human-composed haiku or a bouquet of artificial intelligence-generated limericks
# brando for stella,
# gump's jenny, rock's adrian,
# students toward math test
Function Definitions
This survey uses a multiply-imputed variance estimation technique described in Methods Chapter 14. Most users do not need to study this function carefully. Define a function specific to only this dataset:
<-
timss_MIcombine function (results, variances, call = sys.call(), df.complete = Inf, ...) {
<- length(results)
m <- attr(results, "call")
oldcall if (missing(variances)) {
<- suppressWarnings(lapply(results, vcov))
variances <- lapply(results, coef)
results
}<- variances[[1]]
vbar <- results[[1]]
cbar for (i in 2:m) {
<- cbar + results[[i]]
cbar <- vbar + variances[[i]]
vbar
}<- cbar/m
cbar <- vbar/m
vbar
# MODIFICATION
# evar <- var(do.call("rbind", results))
<- sum( ( unlist( results ) - cbar )^2 / 4 )
evar
<- (1 + 1/m) * evar/vbar
r <- (m - 1) * (1 + 1/r)^2
df if (is.matrix(df)) df <- diag(df)
if (is.finite(df.complete)) {
<- ((df.complete + 1)/(df.complete + 3)) * df.complete *
dfobs /(vbar + evar)
vbarif (is.matrix(dfobs)) dfobs <- diag(dfobs)
<- 1/(1/dfobs + 1/df)
df
}if (is.matrix(r)) r <- diag(r)
<- list(coefficients = cbar, variance = vbar + evar *
rval + 1)/m, call = c(oldcall, call), nimp = m, df = df,
(m missinfo = (r + 2/(df + 3))/(r + 1))
class(rval) <- "MIresult"
rval }
Download, Import, Preparation
Download and unzip the 2019 fourth grade international database:
library(httr)
<- tempfile()
tf
<- "https://timss2019.org/international-database/downloads/T19_G4_SPSS%20Data.zip"
this_url
GET( this_url , write_disk( tf ) , progress() )
<- unzip( tf , exdir = tempdir() ) unzipped_files
Import and stack each of the student context data files for Albania through Canada:
library(haven)
# limit unzipped files to those starting with `asg` followed by three letters followed by `m7`
<- unzipped_files[ grepl( '^asg[a-z][a-z][a-z]m7' , basename( unzipped_files ) ) ]
asg_fns
# further limit asg files to the first ten countries
<- c("alb", "arm", "aus", "aut", "aze", "bhr", "bfl", "bih", "bgr", "can")
countries_thru_canada
<- paste0( paste0( '^asg' , countries_thru_canada , 'm7' ) , collapse = "|" )
fns_thru_canada
<- asg_fns[ grepl( fns_thru_canada , basename( asg_fns ) ) ]
asg_alb_can_fns
<- NULL
timss_df
for( spss_fn in asg_alb_can_fns ){
<- read_spss( spss_fn )
this_tbl
<- zap_labels( this_tbl )
this_tbl
<- data.frame( this_tbl )
this_df
names( this_df ) <- tolower( names( this_df ) )
<- rbind( timss_df , this_df )
timss_df
}
# order the data.frame by unique student id
<- timss_df[ with( timss_df , order( idcntry , idstud ) ) , ] timss_df
Save locally
Save the object at any point:
# timss_fn <- file.path( path.expand( "~" ) , "TIMSS" , "this_file.rds" )
# saveRDS( timss_df , file = timss_fn , compress = FALSE )
Load the same object:
# timss_df <- readRDS( timss_fn )
Survey Design Definition
Construct a multiply-imputed, complex sample survey design:
From among possibly plausible values, determine all columns that are multiply-imputed plausible values:
# identify all columns ending with `01` thru `05`
<- grep( "(.*)0[1-5]$" , names( timss_df ) , value = TRUE )
ppv
# remove those ending digits
<- gsub( "0[1-5]$" , "" , ppv )
ppv_prefix
# identify each of the possibilities with exactly five matches (five implicates)
<- names( table( ppv_prefix )[ table( ppv_prefix ) == 5 ] )
pv
# identify each of the `01` thru `05` plausible value columns
<-
pv_columns grep(
paste0( "^" , pv , "0[1-5]$" , collapse = "|" ) ,
names( timss_df ) ,
value = TRUE
)
Extract those multiply-imputed columns into a separate data.frame, then remove them from the source:
<- timss_df[ c( 'idcntry' , 'idstud' , pv_columns ) ]
pv_wide_df
<- NULL timss_df[ pv_columns ]
Reshape these columns from one record per student to one record per student per implicate:
<-
pv_long_df reshape(
pv_wide_df , varying = lapply( paste0( pv , '0' ) , paste0 , 1:5 ) ,
direction = 'long' ,
timevar = 'implicate' ,
idvar = c( 'idcntry' , 'idstud' )
)
names( pv_long_df ) <- gsub( "01$" , "" , names( pv_long_df ) )
Merge the columns from the source data.frame onto the one record per student per implicate data.frame:
<- merge( timss_df , pv_long_df )
timss_long_df
<- timss_long_df[ with( timss_long_df , order( idcntry , idstud ) ) , ]
timss_long_df
stopifnot( nrow( timss_long_df ) == nrow( pv_long_df ) )
stopifnot( nrow( timss_long_df ) / 5 == nrow( timss_df ) )
Divide the five plausible value implicates into a list with five data.frames based on the implicate number:
<- split( timss_long_df , timss_long_df[ , 'implicate' ] ) timss_list
Construct a replicate weights table following the estimation technique described in Methods Chapter 14:
<- timss_df[ c( 'jkrep' , 'jkzone' ) ]
weights_df
for( j in 1:75 ){
for( i in 0:1 ){
'jkzone' ] != j , paste0( 'rw' , i , j ) ] <- 1
weights_df[ weights_df[ ,
'jkzone' ] == j , paste0( 'rw' , i , j ) ] <-
weights_df[ weights_df[ , 2 * ( weights_df[ weights_df[ , 'jkzone' ] == j , 'jkrep' ] == i )
}
}
c( 'jkrep' , 'jkzone' ) ] <- NULL weights_df[
Define the design:
library(survey)
library(mitools)
<-
timss_design svrepdesign(
weights = ~totwgt ,
repweights = weights_df ,
data = imputationList( timss_list ) ,
type = "other" ,
scale = 0.5 ,
rscales = rep( 1 , 150 ) ,
combined.weights = FALSE ,
mse = TRUE
)
Variable Recoding
Add new columns to the data set:
<-
timss_design update(
timss_design ,
one = 1 ,
countries_thru_canada =
factor(
as.numeric( idcntry ) ,
levels = c(8L, 51L, 36L, 40L, 31L, 48L, 956L, 70L, 100L, 124L) ,
labels =
c("Albania", "Armenia", "Australia", "Austria", "Azerbaijan", "Bahrain",
"Belgium (Flemish)", "Bosnia and Herzegovina", "Bulgaria", "Canada")
) ,
sex = factor( asbg01 , levels = 1:2 , labels = c( "female" , "male" ) ) ,
born_in_country = ifelse( asbg07 %in% 1:2 , as.numeric( asbg07 == 1 ) , NA )
)
Analysis Examples with the survey
library
Unweighted Counts
Count the unweighted number of records in the survey sample, overall and by groups:
timss_MIcombine( with( timss_design , svyby( ~ one , ~ one , unwtd.count ) ) )
timss_MIcombine( with( timss_design , svyby( ~ one , ~ sex , unwtd.count ) ) )
Weighted Counts
Count the weighted size of the generalizable population, overall and by groups:
timss_MIcombine( with( timss_design , svytotal( ~ one ) ) )
timss_MIcombine( with( timss_design ,
svyby( ~ one , ~ sex , svytotal )
) )
Descriptive Statistics
Calculate the mean (average) of a linear variable, overall and by groups:
timss_MIcombine( with( timss_design , svymean( ~ asmmat , na.rm = TRUE ) ) )
timss_MIcombine( with( timss_design ,
svyby( ~ asmmat , ~ sex , svymean , na.rm = TRUE )
) )
Calculate the distribution of a categorical variable, overall and by groups:
timss_MIcombine( with( timss_design , svymean( ~ countries_thru_canada ) ) )
timss_MIcombine( with( timss_design ,
svyby( ~ countries_thru_canada , ~ sex , svymean )
) )
Calculate the sum of a linear variable, overall and by groups:
timss_MIcombine( with( timss_design , svytotal( ~ asmmat , na.rm = TRUE ) ) )
timss_MIcombine( with( timss_design ,
svyby( ~ asmmat , ~ sex , svytotal , na.rm = TRUE )
) )
Calculate the weighted sum of a categorical variable, overall and by groups:
timss_MIcombine( with( timss_design , svytotal( ~ countries_thru_canada ) ) )
timss_MIcombine( with( timss_design ,
svyby( ~ countries_thru_canada , ~ sex , svytotal )
) )
Calculate the median (50th percentile) of a linear variable, overall and by groups:
timss_MIcombine( with( timss_design ,
svyquantile(
~ asmmat ,
0.5 , se = TRUE , na.rm = TRUE
) ) )
timss_MIcombine( with( timss_design ,
svyby(
~ asmmat , ~ sex , svyquantile ,
0.5 , se = TRUE ,
ci = TRUE , na.rm = TRUE
) ) )
Estimate a ratio:
timss_MIcombine( with( timss_design ,
svyratio( numerator = ~ asssci , denominator = ~ asmmat )
) )
Subsetting
Restrict the survey design to Australia, Austria, Azerbaijan, Belgium (French):
<- subset( timss_design , idcntry %in% c( 36 , 40 , 31 , 956 ) ) sub_timss_design
Calculate the mean (average) of this subset:
timss_MIcombine( with( sub_timss_design , svymean( ~ asmmat , na.rm = TRUE ) ) )
Measures of Uncertainty
Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups:
<-
this_result timss_MIcombine( with( timss_design ,
svymean( ~ asmmat , na.rm = TRUE )
) )
coef( this_result )
SE( this_result )
confint( this_result )
cv( this_result )
<-
grouped_result timss_MIcombine( with( timss_design ,
svyby( ~ asmmat , ~ sex , svymean , na.rm = TRUE )
) )
coef( grouped_result )
SE( grouped_result )
confint( grouped_result )
cv( grouped_result )
Calculate the degrees of freedom of any survey design object:
degf( timss_design$designs[[1]] )
Calculate the complex sample survey-adjusted variance of any statistic:
timss_MIcombine( with( timss_design , svyvar( ~ asmmat , na.rm = TRUE ) ) )
Include the complex sample design effect in the result for a specific statistic:
# SRS without replacement
timss_MIcombine( with( timss_design ,
svymean( ~ asmmat , na.rm = TRUE , deff = TRUE )
) )
# SRS with replacement
timss_MIcombine( with( timss_design ,
svymean( ~ asmmat , na.rm = TRUE , deff = "replace" )
) )
Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop
for alternatives:
# MIsvyciprop( ~ born_in_country , timss_design ,
# method = "likelihood" , na.rm = TRUE )
Regression Models and Tests of Association
Perform a design-based t-test:
# MIsvyttest( asmmat ~ born_in_country , timss_design )
Perform a chi-squared test of association for survey data:
# MIsvychisq( ~ born_in_country + countries_thru_canada , timss_design )
Perform a survey-weighted generalized linear model:
<-
glm_result timss_MIcombine( with( timss_design ,
svyglm( asmmat ~ born_in_country + countries_thru_canada )
) )
summary( glm_result )
Replication Example
This example matches the mean proficiency and standard error of the Australia
row of the Summary Statistics and Standard Errors for Proficiency in Overall Mathematics-Grade 4
table from the Appendix 14A: Summary Statistics and Standard Errors for Proficiency in Grade 4 Mathematics:
<- subset( timss_design , countries_thru_canada %in% "Australia" )
australia_design
stopifnot( nrow( australia_design ) == 5890 )
<- timss_MIcombine( with( australia_design , svymean( ~ asmmat ) ) )
result
stopifnot( round( coef( result ) , 3 ) == 515.880 )
stopifnot( round( SE( result ) , 3 ) == 2.776 )
This example matches the jackknife sampling, imputation, and total variances of the same row:
<- unzipped_files[ grepl( 'asgaus' , basename( unzipped_files ) ) ]
australia_fn <- read_spss( australia_fn )
australia_tbl <- zap_labels( australia_tbl )
australia_tbl <- data.frame( australia_tbl )
australia_df names( australia_df ) <- tolower( names( australia_df ) )
<-
estimate mean( c(
with( australia_df , weighted.mean( asmmat01 , totwgt ) ) ,
with( australia_df , weighted.mean( asmmat02 , totwgt ) ) ,
with( australia_df , weighted.mean( asmmat03 , totwgt ) ) ,
with( australia_df , weighted.mean( asmmat04 , totwgt ) ) ,
with( australia_df , weighted.mean( asmmat05 , totwgt ) )
) )
stopifnot( round( estimate , 3 ) == 515.880 )
for( k in 1:5 ){
<- 0
this_variance
for( j in 1:75 ){
for( i in 0:1 ){
<-
this_variance +
this_variance
( weighted.mean(
paste0( 'asmmat0' , k ) ] ,
australia_df[ , ifelse(
== australia_df[ , 'jkzone' ] ,
j 'totwgt' ] * 2 * ( australia_df[ , 'jkrep' ] == i ) ,
australia_df[ , 'totwgt' ]
australia_df[ ,
)-
) weighted.mean(
paste0( 'asmmat0' , k ) ] ,
australia_df[ , 'totwgt' ]
australia_df[ ,
)^2
)
}
}
assign( paste0( 'v' , k ) , this_variance * 0.5 )
}
<- mean( c( v1 , v2 , v3 , v4 , v5 ) )
sampling_variance stopifnot( round( sampling_variance , 3 ) == 7.397 )
<-
imputation_variance 6 / 5 ) *
(
( with( australia_df , weighted.mean( asmmat01 , totwgt ) ) - estimate )^2 / 4 ) +
( ( with( australia_df , weighted.mean( asmmat02 , totwgt ) ) - estimate )^2 / 4 ) +
( ( with( australia_df , weighted.mean( asmmat03 , totwgt ) ) - estimate )^2 / 4 ) +
( ( with( australia_df , weighted.mean( asmmat04 , totwgt ) ) - estimate )^2 / 4 ) +
( ( with( australia_df , weighted.mean( asmmat05 , totwgt ) ) - estimate )^2 / 4 )
( (
)
stopifnot( round( imputation_variance , 3 ) == 0.309 )
stopifnot( round( sampling_variance + imputation_variance , 3 ) == 7.706 )