National Crime Victimization Survey (NCVS)
The primary information source on victims of nonfatal personal crimes and household property crimes (especially those not reported to the police), and also victim experience within the justice system.
Three tables, the first one row per household per interview, the second one per person-interview, the third one per incident reported across each sampled household’s seven-interview, three-year period.
A complex survey designed to generalize to civilian, non-institutionalized Americans aged 12 and older.
Released annually since its 1992 rename and redesign, related surveys dating to the early 1970s.
Sponsored by the Bureau of Justice Statistics and administered by the US Census Bureau.
Please skim before you begin:
National Crime Victimization Survey, 2016: Technical Documentation
A New Measure of Prevalence for the National Crime Victimization Survey
A haiku regarding this microdata:
# saint peter's sports bar
# evil deed instant replay
# sinful thought jukebox
Function Definitions
Define a function to extract values stored in parentheses:
# Extract the numeric code from NCVS categorical values stored as
# "(code) label" strings, e.g. "(1) Yes" -> 1.  (Despite the name, this
# converts factor-style values to numerics; inputs without the
# "(code) " prefix that are not themselves numeric become NA.)
ncvs_numeric_to_factor <-
	function( this_column ){
		code_strings <- gsub( "^\\(([0-9]+)\\) (.*)" , "\\1" , this_column )
		as.numeric( code_strings )
	}
Define a function to merge aggregated information onto main data.frame objects:
<-
# Left-join `right_df` onto `left_df` and recode missing values in the
# newly-added columns to zero.  Halts if the merge changes the row count
# of `left_df` (i.e. the join keys fan out).
left_join_zero_missings <-
	function( left_df , right_df ){

		final_df <- merge( left_df , right_df , all.x = TRUE )

		# the join must match each left row at most once
		stopifnot( nrow( final_df ) == nrow( left_df ) )

		# columns brought in from the right side: NA means "no match", recode to zero
		for( this_column in setdiff( names( right_df ) , names( left_df ) ) ){

			final_df[ is.na( final_df[ , this_column ] ) , this_column ] <- 0

		}

		gc()

		final_df
	}
Download, Import, Preparation
Register for the National Archive of Criminal Justice Data at https://www.icpsr.umich.edu/web/NACJD/series/95
Choose
National Crime Victimization Survey, Concatenated File, [United States], 1992-2022 (ICPSR 38604)
Download the
R
version of the September 18, 2023 file.
Import the three main files:
# load() returns the name of the object each .rda file creates
ncvs_household_df_name <-
	load( file.path( path.expand( "~" ) , "ICPSR_38604/DS0001/38604-0001-Data.rda" ) )

ncvs_person_df_name <-
	load( file.path( path.expand( "~" ) , "ICPSR_38604/DS0002/38604-0002-Data.rda" ) )

ncvs_incident_df_name <-
	load( file.path( path.expand( "~" ) , "ICPSR_38604/DS0003/38604-0003-Data.rda" ) )

# copy each loaded object to a predictable name
ncvs_household_df <- get( ncvs_household_df_name )
ncvs_person_df <- get( ncvs_person_df_name )
ncvs_incident_df <- get( ncvs_incident_df_name )

# drop the originally-named copies to free memory
rm( list = ncvs_household_df_name ) ; gc()
rm( list = ncvs_person_df_name ) ; gc()
rm( list = ncvs_incident_df_name ) ; gc()

# standardize on lowercase column names across all three tables
names( ncvs_household_df ) <- tolower( names( ncvs_household_df ) )
names( ncvs_person_df ) <- tolower( names( ncvs_person_df ) )
names( ncvs_incident_df ) <- tolower( names( ncvs_incident_df ) )
Determine which variables from each table to retain:
# columns retained from the household-interview table
household_variables_to_keep <-
	c( 'year' , 'yearq' , 'idhh' , 'wgthhcy' , 'v2002' , 'sc214a' ,
		'v2026' , 'v2126a' , 'v2126b' , 'v2015' , 'v2017' , 'v2117' ,
		'v2118' , 'v2125' , 'v2071' , 'v2072' , 'v2127b' , 'v2129' )

# columns retained from the person-interview table
person_variables_to_keep <-
	c( 'year' , 'yearq' , 'v3018' , 'v3014' , 'sc214a' , 'v3023' ,
		'v3023a' , 'v3024' , 'v3024a' , 'v2117' , 'v2118' , 'v3002' ,
		'idhh' , 'idper' , 'wgtpercy' , 'v3015' , 'v3033' , 'v2026' )

# columns retained from the incident table
incident_variables_to_keep <-
	c( 'year' , 'yearq' , 'v2117' , 'v2118' , 'v4022' ,
		paste0( 'v401' , 6:9 ) , 'v4399' , 'v4529' , 'v4049' , paste0( 'v405' , 0:8 ) ,
		'v4060' , 'v4062' , paste0( 'v41' , 11:22 ) , 'v4064' , paste0( 'v41' , 27:37 ) ,
		'v4467' , 'v4234' , 'v4245' , 'v4243' , 'v4241' , 'v4256' , 'v4258' , 'v4278' ,
		'v4262' , paste0( 'v42' , 59:61 ) , 'v4269' , 'v4270' , 'v4268' , 'v4267' ,
		'v4271' , 'v4266' , 'v4265' , 'wgtviccy' , 'idhh' , 'idper' , 'v4002' , 'v4288' ,
		'v4290' , 'v4400' , 'v4437' , 'v4422' , 'v4024' )
Limit columns in each data.frame to those specified above:
# keep only the analysis columns in each table
ncvs_household_df <- ncvs_household_df[ , household_variables_to_keep ]

ncvs_person_df <- ncvs_person_df[ , person_variables_to_keep ]

ncvs_incident_df <- ncvs_incident_df[ , incident_variables_to_keep ]

gc()
In this example, limit the 1993-2022 data.frame
to only the first & last years for quicker processing:
# restrict each table to the first & last survey years for quicker processing
ncvs_household_df <- ncvs_household_df[ ncvs_household_df[ , 'year' ] %in% c( 1994 , 2022 ) , ]

ncvs_person_df <- ncvs_person_df[ ncvs_person_df[ , 'year' ] %in% c( 1994 , 2022 ) , ]

ncvs_incident_df <- ncvs_incident_df[ ncvs_incident_df[ , 'year' ] %in% c( 1994 , 2022 ) , ]

gc()
Recode identifiers to character class:
# household table has only the household identifier
ncvs_household_df[ , 'idhh' ] <- as.character( ncvs_household_df[ , 'idhh' ] )

# person and incident tables carry household + person identifiers.
# lapply keeps each column as a character vector inside the data.frame;
# the prior sapply() collapsed the pair into a character matrix first,
# which data.frame subassignment then had to split back apart.
ncvs_person_df[ c( 'idhh' , 'idper' ) ] <-
	lapply( ncvs_person_df[ c( 'idhh' , 'idper' ) ] , as.character )

ncvs_incident_df[ c( 'idhh' , 'idper' ) ] <-
	lapply( ncvs_incident_df[ c( 'idhh' , 'idper' ) ] , as.character )
Recode factor variables to numeric values:
# recode every factor column to the numeric code inside its "(code) label" values.
# vapply( is.factor , logical( 1 ) ) is used instead of sapply( class ) == 'factor'
# because class() returns a length-two vector for ordered factors, which breaks
# the comparison; lapply replaces sapply( simplify = FALSE ) equivalently.
ncvs_household_df[ vapply( ncvs_household_df , is.factor , logical( 1 ) ) ] <-
	lapply(
		ncvs_household_df[ vapply( ncvs_household_df , is.factor , logical( 1 ) ) ] ,
		ncvs_numeric_to_factor
	)

ncvs_person_df[ vapply( ncvs_person_df , is.factor , logical( 1 ) ) ] <-
	lapply(
		ncvs_person_df[ vapply( ncvs_person_df , is.factor , logical( 1 ) ) ] ,
		ncvs_numeric_to_factor
	)

ncvs_incident_df[ vapply( ncvs_incident_df , is.factor , logical( 1 ) ) ] <-
	lapply(
		ncvs_incident_df[ vapply( ncvs_incident_df , is.factor , logical( 1 ) ) ] ,
		ncvs_numeric_to_factor
	)
Add a column of ones to each data.frame:
# constant column used for unweighted/weighted counts later on
ncvs_household_df[ , 'one' ] <- 1

ncvs_person_df[ , 'one' ] <- 1

ncvs_incident_df[ , 'one' ] <- 1
Add a year group variable to each data.frame:
# year group 1: 1992-1996, 2: 1997-2005, 3: 2006-2015, 4: 2016+
ncvs_household_df[ , 'yr_grp' ] <-
	findInterval( ncvs_household_df[ , 'year' ] , c( 1992 , 1997 , 2006 , 2016 ) )

ncvs_person_df[ , 'yr_grp' ] <-
	findInterval( ncvs_person_df[ , 'year' ] , c( 1992 , 1997 , 2006 , 2016 ) )

ncvs_incident_df[ , 'yr_grp' ] <-
	findInterval( ncvs_incident_df[ , 'year' ] , c( 1992 , 1997 , 2006 , 2016 ) )
Add a flag indicating whether each incident occurred inside the country:
# flag incidents that occurred outside the united states (v4022 == 1)
ncvs_incident_df[ , 'exclude_outus' ] <- ncvs_incident_df[ , 'v4022' ] %in% 1
Add a half-year indicator to the incident data.frame:
# first half (quarters 1-2) vs second half (quarters 3-4);
# the sixth character of yearq is presumably the quarter digit — confirm format
ncvs_incident_df <-
	transform(
		ncvs_incident_df ,
		half_year =
			ifelse( substr( yearq , 6 , 6 ) %in% c( '1' , '2' ) , 1 ,
			ifelse( substr( yearq , 6 , 6 ) %in% c( '3' , '4' ) , 2 ,
				NA ) )
	)

stopifnot( all( ncvs_incident_df[ , 'half_year' ] %in% 1:2 ) )
Define violent crimes on the incident data.frame:
# rape and sexual assault
# rape and sexual assault
ncvs_incident_df[ , 'rsa' ] <-
	ncvs_incident_df[ , 'v4529' ] %in% c( 1:4 , 15 , 16 , 18 , 19 )

# robbery
ncvs_incident_df[ , 'rob' ] <-
	ncvs_incident_df[ , 'v4529' ] %in% 5:10

# assault
ncvs_incident_df[ , 'ast' ] <-
	ncvs_incident_df[ , 'v4529' ] %in% c( 11:14 , 17 , 20 )

# simple assault
ncvs_incident_df[ , 'sast' ] <-
	ncvs_incident_df[ , 'v4529' ] %in% c( 14 , 17 , 20 )

# aggravated assault
ncvs_incident_df[ , 'aast' ] <-
	ncvs_incident_df[ , 'v4529' ] %in% 11:13

# violent crime: any of rape/sexual assault, robbery, assault
ncvs_incident_df[ , 'violent' ] <-
	apply( ncvs_incident_df[ c( 'rsa' , 'rob' , 'ast' ) ] , 1 , any )

# violent crime excluding simple assault
ncvs_incident_df[ , 'sviolent' ] <-
	apply( ncvs_incident_df[ , c( 'rsa' , 'rob' , 'aast' ) ] , 1 , any )
Define personal theft and then person-crime on the incident data.frame:
# personal theft
ncvs_incident_df[ , 'ptft' ] <-
	ncvs_incident_df[ , 'v4529' ] %in% 21:23

# person-crime: any violent crime or personal theft
ncvs_incident_df[ , 'personcrime' ] <-
	apply( ncvs_incident_df[ , c( 'violent' , 'ptft' ) ] , 1 , any )
Define property crimes on the incident data.frame:
# household burglary
ncvs_incident_df[ , 'hhburg' ] <-
	ncvs_incident_df[ , 'v4529' ] %in% 31:33

# completed theft with something taken
ncvs_incident_df[ , 'burg_ct' ] <-
	( ncvs_incident_df[ , 'v4529' ] %in% 31:33 ) &
	( ncvs_incident_df[ , 'v4288' ] %in% 1 )

# attempted theft
ncvs_incident_df[ , 'burg_at' ] <-
	( ncvs_incident_df[ , 'v4529' ] %in% 31:33 ) &
	( ncvs_incident_df[ , 'v4290' ] %in% 1 )

# neither a completed nor an attempted theft
ncvs_incident_df[ , 'burg_ncat' ] <-
	( ncvs_incident_df[ , 'v4529' ] %in% 31:33 ) &
	( ncvs_incident_df[ , 'v4288' ] %in% 2 ) &
	( ncvs_incident_df[ , 'v4290' ] %in% 2 )

# 0 = not a burglary, 1 = burglary, 2 = trespassing
ncvs_incident_df[ , 'burgcats2' ] <- 0

ncvs_incident_df[ ncvs_incident_df[ , 'burg_ncat' ] , 'burgcats2' ] <- 2

ncvs_incident_df[ ncvs_incident_df[ , 'burg_ct' ] | ncvs_incident_df[ , 'burg_at' ] , 'burgcats2' ] <- 1

# burglary
ncvs_incident_df[ , 'burg' ] <-
	ncvs_incident_df[ , 'burgcats2' ] %in% 1

# trespassing
ncvs_incident_df[ , 'tres' ] <-
	ncvs_incident_df[ , 'burgcats2' ] %in% 2

# motor vehicle theft
ncvs_incident_df[ , 'mvtft' ] <-
	ncvs_incident_df[ , 'v4529' ] %in% 40:41

# household theft
ncvs_incident_df[ , 'hhtft' ] <-
	ncvs_incident_df[ , 'v4529' ] %in% 54:59

# property crime: burglary, motor vehicle theft, or household theft
ncvs_incident_df[ , 'property' ] <-
	apply( ncvs_incident_df[ c( 'hhburg' , 'mvtft' , 'hhtft' ) ] , 1 , any )
Define a series weight on the incident data.frame:
# start every incident as a series crime (2)..
ncvs_incident_df[ , 'series' ] <- 2

# ..then recode to non-series (1) based on v4017-v4019
# (see NCVS technical documentation for these screener items)
ncvs_incident_df[
	ncvs_incident_df[ , 'v4017' ] %in% c( 1 , 8 ) |
	ncvs_incident_df[ , 'v4018' ] %in% c( 2 , 8 ) |
	ncvs_incident_df[ , 'v4019' ] %in% c( 1 , 8 ) ,
	'series' ] <- 1

# non-series incidents count once
ncvs_incident_df[ , 'serieswgt' ] <- 1

# incident count top-coded at ten; left missing when v4016 is 997/998
ncvs_incident_df[ !( ncvs_incident_df[ , 'v4016' ] %in% 997:998 ) , 'n10v4016' ] <-
	pmin( ncvs_incident_df[ !( ncvs_incident_df[ , 'v4016' ] %in% 997:998 ) , 'v4016' ] , 10 )

# series incidents count their (top-coded) number of occurrences..
ncvs_incident_df[ ncvs_incident_df[ , 'series' ] == 2 , 'serieswgt' ] <-
	ncvs_incident_df[ ncvs_incident_df[ , 'series' ] == 2 , 'n10v4016' ]

# ..or six when that number is unknown
ncvs_incident_df[ ncvs_incident_df[ , 'series' ] == 2 & is.na( ncvs_incident_df[ , 'n10v4016' ] ) , 'serieswgt' ] <- 6
Aggregate property-crimes to the household-interview level:
# sum series-weighted property crimes within each household-interview
summed_hh_crimes <-
	aggregate(
		cbind(
			property * serieswgt ,
			hhburg * serieswgt ,
			mvtft * serieswgt ,
			burg * serieswgt ,
			tres * serieswgt
		) ~ yearq + idhh + v4002 + wgtviccy ,
		data = subset( ncvs_incident_df , !exclude_outus & property ) ,
		sum
	)

# rename v4002 to the household table's v2002 ahead of the merge
names( summed_hh_crimes ) <-
	c( 'yearq' , 'idhh' , 'v2002' , 'wgtviccy' , 'property' , 'hhburg' ,
		'mvtft' , 'burg' , 'tres' )
Merge aggregated property-crimes on to the household-interview data.frame:
ncvs_household_df <- left_join_zero_missings( ncvs_household_df , summed_hh_crimes )

rm( summed_hh_crimes ) ; gc()
Aggregate person-crimes to the person-interview level:
# sum series-weighted person-crimes within each person-interview
summed_person_crimes <-
	aggregate(
		cbind(
			violent * serieswgt ,
			sviolent * serieswgt ,
			rsa * serieswgt ,
			rob * serieswgt ,
			aast * serieswgt ,
			sast * serieswgt ,
			ptft * serieswgt
		) ~ yearq + idhh + v4002 + idper + wgtviccy ,
		data = subset( ncvs_incident_df , !exclude_outus & personcrime ) ,
		sum
	)

# rename v4002 to the person table's v3002 ahead of the merge
names( summed_person_crimes ) <-
	c( 'yearq' , 'idhh' , 'v3002' , 'idper' , 'wgtviccy' , 'violent' ,
		'sviolent' , 'rsa' , 'rob' , 'aast' , 'sast' , 'ptft' )
Merge aggregated person-crimes on to the person-interview data.frame:
ncvs_person_df <- left_join_zero_missings( ncvs_person_df , summed_person_crimes )

rm( summed_person_crimes ) ; gc()
Starting here, the weight calculation prepares an adjustment for all violence combined with the variables violent
and violent_year
. To calculate the prevalence rate of a subset of person-crimes, starting at this point, replace these two values with variables like rob
and rob_year
.
Aggregate violent crimes to the person-year level:
# series-weighted count of violent victimizations per person per year
summed_person_year_violent_crimes <-
	aggregate(
		violent * serieswgt ~ idhh + idper + year ,
		data = subset( ncvs_incident_df , !exclude_outus & violent ) ,
		sum
	)

names( summed_person_year_violent_crimes )[ ncol( summed_person_year_violent_crimes ) ] <-
	'violent_year'
Merge aggregated person-year violent crime series weights on to the person-interview data.frame:
ncvs_person_df <- left_join_zero_missings( ncvs_person_df , summed_person_year_violent_crimes )

rm( summed_person_year_violent_crimes ) ; gc()
Aggregate violent crimes to the person-half-year level, then reshape into a wide data.frame:
# mean victim weight per person per half-year
summed_person_half_year_violent_crimes <-
	aggregate(
		wgtviccy ~ idhh + idper + year + half_year ,
		data = subset( ncvs_incident_df , !exclude_outus & violent ) ,
		mean
	)

first_half_violent_crimes <-
	subset( summed_person_half_year_violent_crimes , half_year == 1 )

second_half_violent_crimes <-
	subset( summed_person_half_year_violent_crimes , half_year == 2 )

# drop the half_year column from both subsets in one chained assignment
first_half_violent_crimes[ , 'half_year' ] <-
	second_half_violent_crimes[ , 'half_year' ] <- NULL

names( first_half_violent_crimes )[ ncol( first_half_violent_crimes ) ] <- 'vwgt1'
names( second_half_violent_crimes )[ ncol( second_half_violent_crimes ) ] <- 'vwgt2'

# full outer join: one row per person-year with vwgt1 and/or vwgt2
wide_person_half_year_violent_crimes <-
	merge( first_half_violent_crimes , second_half_violent_crimes , all = TRUE )
Merge both violent crime weights on to the person-interview data.frame:
ncvs_person_df <- left_join_zero_missings( ncvs_person_df , wide_person_half_year_violent_crimes )

rm( wide_person_half_year_violent_crimes ) ; gc()
Find the maximum incident victim weight among three half-year periods:
# highest victim weight within each person x year x half-year x quarter
max_half_v_crimes <-
	aggregate(
		wgtviccy ~ idhh + idper + year + half_year + v4002 ,
		data = subset( ncvs_incident_df , !exclude_outus & violent ) ,
		max
	)

# sort so repeated person-half-year combinations sit on adjacent rows
max_half_v_crimes <-
	max_half_v_crimes[
		do.call( order , max_half_v_crimes[ c( 'idhh' , 'idper' , 'year' , 'half_year' ) ] ) ,
	]

# single string key identifying each person-half-year combination
max_half_v_crimes[ , 'byvar' ] <-
	apply(
		max_half_v_crimes[ , c( 'idhh' , 'idper' , 'year' , 'half_year' ) ] ,
		1 ,
		paste , collapse = ' '
	)

# running occurrence counter within each key
max_half_v_crimes[ 1 , 'id' ] <- 1

# seq_len( nrow )[ -1 ] is empty when there are fewer than two rows;
# the original seq( 2 , nrow( ... ) ) would count backwards (2, 1) and
# index out of bounds in that edge case
for( i in seq_len( nrow( max_half_v_crimes ) )[ -1 ] ){

	if( max_half_v_crimes[ i , 'byvar' ] == max_half_v_crimes[ i - 1 , 'byvar' ] ){

		max_half_v_crimes[ i , 'id' ] <- max_half_v_crimes[ i - 1 , 'id' ] + 1

	} else {

		max_half_v_crimes[ i , 'id' ] <- 1

	}

}

# label each record _<half_year>_<occurrence>
max_half_v_crimes[ , 'label' ] <-
	paste0(
		'_' ,
		max_half_v_crimes[ , 'half_year' ] ,
		'_' ,
		max_half_v_crimes[ , 'id' ]
	)

max_half_v_crimes[ , 'byvar' ] <- NULL

# only these three half-year periods should occur within any person-year
stopifnot( all( max_half_v_crimes[ , 'label' ] %in% c( '_1_1' , '_2_1' , '_1_2' ) ) )

h_1_1_df <-
	max_half_v_crimes[
		max_half_v_crimes[ , 'label' ] == '_1_1' ,
		c( 'idhh' , 'idper' , 'year' , 'wgtviccy' )
	]

names( h_1_1_df )[ ncol( h_1_1_df ) ] <- 'wgtviccy_1_1'

h_2_1_df <-
	max_half_v_crimes[
		max_half_v_crimes[ , 'label' ] == '_2_1' ,
		c( 'idhh' , 'idper' , 'year' , 'wgtviccy' )
	]

names( h_2_1_df )[ ncol( h_2_1_df ) ] <- 'wgtviccy_2_1'

h_1_2_df <-
	max_half_v_crimes[
		max_half_v_crimes[ , 'label' ] == '_1_2' ,
		c( 'idhh' , 'idper' , 'year' , 'wgtviccy' )
	]

names( h_1_2_df )[ ncol( h_1_2_df ) ] <- 'wgtviccy_1_2'

# full outer join of the three period-specific weights, one row per person-year
three_half_df <-
	Reduce( function( ... ) merge( ... , all = TRUE ) , list( h_1_1_df , h_2_1_df , h_1_2_df ) )

rm( h_1_1_df , h_2_1_df , h_1_2_df ) ; gc()
Merge these three half-year period weights on to the person-interview data.frame:
ncvs_person_df <- left_join_zero_missings( ncvs_person_df , three_half_df )

rm( three_half_df ) ; gc()
Aggregate interview counts to the person-year level:
# number of interviews with a positive person weight per person per year
summed_person_year_interviews <-
	aggregate(
		one ~ idhh + idper + year ,
		data = subset( ncvs_person_df , wgtpercy > 0 ) ,
		sum
	)

names( summed_person_year_interviews )[ ncol( summed_person_year_interviews ) ] <-
	'interview_count'
Merge interview_count on to the person-interview data.frame:
ncvs_person_df <- left_join_zero_missings( ncvs_person_df , summed_person_year_interviews )

rm( summed_person_year_interviews ) ; gc()
Apply Interview/Incident Groups:
# classify each person-interview into one of nine groups based on the
# number of interviews in the year and which half-year weights are positive
# (see the BJS prevalence-measure report for group definitions)
ncvs_person_df <-
	transform(
		ncvs_person_df ,
		interview_incident_groups =
			ifelse( violent_year == 0 ,
				1 ,
			ifelse(
				interview_count == 1 &
				( ( as.numeric( vwgt1 > 0 ) + as.numeric( vwgt2 > 0 ) ) == 1 ) &
				wgtviccy > 0 ,
				2 ,
			ifelse(
				interview_count == 2 &
				( ( as.numeric( vwgt1 > 0 ) + as.numeric( vwgt2 > 0 ) ) == 1 ) ,
				3 ,
			ifelse(
				interview_count == 2 &
				( vwgt1 > 0 ) & ( vwgt2 > 0 ) & ( wgtviccy > 0 ) ,
				4 ,
			ifelse(
				interview_count == 3 &
				( ( as.numeric( wgtviccy_1_1 > 0 ) +
					as.numeric( wgtviccy_2_1 > 0 ) +
					as.numeric( wgtviccy_1_2 > 0 )
				) == 1 ) ,
				5 ,
			ifelse(
				interview_count == 3 &
				( wgtviccy_1_1 > 0 ) & ( wgtviccy_2_1 > 0 ) & ( wgtviccy_1_2 > 0 ) ,
				6 ,
			ifelse(
				interview_count == 3 &
				( wgtviccy_1_1 > 0 ) & ( wgtviccy_2_1 > 0 ) &
				substr( yearq , 6 , 6 ) %in% 1:2 ,
				7 ,
			ifelse(
				interview_count == 3 &
				( wgtviccy_1_1 > 0 ) & ( wgtviccy_2_1 > 0 ) &
				substr( yearq , 6 , 6 ) %in% 3:4 ,
				8 ,
				9
			) ) ) ) ) ) ) )
	)

# confirm all records in group 9 have both a wgtviccy == 0 & wgtpercy == 0
stopifnot( nrow( subset( ncvs_person_df , interview_incident_groups == 9 & wgtviccy > 0 ) ) == 0 )
stopifnot( nrow( subset( ncvs_person_df , interview_incident_groups == 9 & wgtpercy > 0 ) ) == 0 )
# initial prevalence weight, one rule per interview/incident group
ncvs_person_df <-
	transform(
		ncvs_person_df ,
		prev_wgt0 =
			ifelse( interview_incident_groups == 1 , wgtpercy ,
			ifelse( interview_incident_groups == 2 , wgtviccy / 2 ,
			ifelse( interview_incident_groups == 3 , pmax( vwgt1 , vwgt2 , na.rm = TRUE ) / 2 ,
			ifelse( interview_incident_groups == 4 , wgtviccy / 2 ,
			ifelse( interview_incident_groups == 5 ,
				pmax( wgtviccy_1_1 , wgtviccy_1_2 , wgtviccy_2_1 , na.rm = TRUE ) / 2 ,
			ifelse( interview_incident_groups == 6 , wgtviccy / 2 ,
			ifelse( interview_incident_groups == 7 , wgtviccy_1_1 / 2 ,
			ifelse( interview_incident_groups == 8 , wgtviccy_2_1 / 2 ,
			ifelse( interview_incident_groups == 9 , 0 ,
				NA ) ) ) ) ) ) ) ) )
	)

# matches table 8
# https://www.ojp.gov/pdffiles1/bjs/grants/308745.pdf#page=44
Aggregate wgtviccy
and prev_wgt0
sums to the year level, then merge:
# annual weight totals among persons with exactly one violent victimization
summed_year_weights <-
	aggregate(
		cbind( wgtviccy , prev_wgt0 ) ~ year ,
		data = subset( ncvs_person_df , violent_year == 1 ) ,
		sum
	)

names( summed_year_weights ) <- c( 'year' , 'vwgt_1v' , 'prev_1v' )

ncvs_person_df <- left_join_zero_missings( ncvs_person_df , summed_year_weights )

rm( summed_year_weights ) ; gc()
Calibrate so that the weight sums to wgtviccy
for persons with exactly one victimization:
# scale prev_wgt0 so it sums to wgtviccy among single-victimization persons
ncvs_person_df <-
	transform(
		ncvs_person_df ,
		prev_wgt1 =
			ifelse( violent_year == 0 , prev_wgt0 ,
			ifelse( violent_year > 0 & wgtpercy > 0 ,
				prev_wgt0 * ( vwgt_1v / prev_1v ) , 0 ) )
	)
Aggregate wgtpercy
and prev_wgt1
sums to the year level, then merge:
# annual totals: all persons, calibrated weight among victims, and among non-victims
summed_year_crimes <-
	aggregate(
		cbind(
			wgtpercy ,
			ifelse( violent_year > 0 , prev_wgt1 , 0 ) ,
			ifelse( violent_year == 0 , prev_wgt1 , 0 )
		) ~ year ,
		data = ncvs_person_df ,
		sum
	)

names( summed_year_crimes ) <- c( 'year' , 'total_persons' , 'prev_with_crime' , 'prev_no_crime' )

ncvs_person_df <- left_join_zero_missings( ncvs_person_df , summed_year_crimes )

rm( summed_year_crimes ) ; gc()
Calibrate so that the weight sums to wgtpercy
for all persons:
# final prevalence weight: rescale non-victims so the annual total matches wgtpercy
ncvs_person_df <-
	transform(
		ncvs_person_df ,
		prev_wgt =
			ifelse(
				violent_year == 0 ,
				prev_wgt1 * ( ( total_persons - prev_with_crime ) / prev_no_crime ) ,
				prev_wgt1
			)
	)
Save Locally
Save the object at any point:
# ncvs_fn <- file.path( path.expand( "~" ) , "NCVS" , "this_file.rds" )
# saveRDS( ncvs_df , file = ncvs_fn , compress = FALSE )
Load the same object:
# ncvs_df <- readRDS( ncvs_fn )
Survey Design Definition
Construct a complex sample survey design:
library(survey)

options( 'survey.lonely.psu' = 'adjust' )

# replace missing clusters
ncvs_person_df[ is.na( ncvs_person_df[ , 'v2118' ] ) , 'v2118' ] <- -1
ncvs_person_df[ is.na( ncvs_person_df[ , 'v2117' ] ) , 'v2117' ] <- -1

# subset this dataset to only 2022
ncvs_df <- subset( ncvs_person_df , year == max( year ) )

# complex sample design: clusters v2118, strata crossing year group with v2117
ncvs_design <-
	svydesign(
		~ v2118 ,
		strata = ~ interaction( yr_grp , v2117 ) ,
		data = ncvs_df ,
		weights = ~ prev_wgt ,
		nest = TRUE
	)
Variable Recoding
Add new columns to the data set:
# analysis recodes layered onto the design object
ncvs_design <-
	update(
		ncvs_design ,

		one = 1 ,

		victim = as.numeric( violent_year > 0 ) ,

		sex = factor( v3018 , levels = 1:2 , labels = c( 'male' , 'female' ) ) ,

		# 99 indicates missing age
		linear_age = ifelse( v3014 == 99 , NA , v3014 ) ,

		# 99 indicates missing
		times_moved_in_prior_five_years =
			ifelse( v3033 == 99 , NA , v3033 ) ,

		current_marital_status =
			factor(
				v3015 , levels = c( 1:5 , 8 ) ,
				labels =
					c( 'married' , 'widowed' , 'divorced' , 'separated' , 'single' , 'residue' )
			) ,

		household_income_starting_2015q1 =
			factor(
				findInterval( sc214a , c( 1 , 9 , 13 , 16 , 18 ) ) ,
				levels = 1:5 ,
				labels =
					c( 'less than $25,000' , '$25,000 - $49,999' , '$50,000 - $99,999' ,
						'$100,000 - $199,999' , '$200,000 or more' )
			) ,

		# 98 indicates missing
		household_income_75k =
			ifelse( v2026 == 98 , NA , as.numeric( v2026 %in% 14:18 ) )
	)
Analysis Examples with the survey
library
Unweighted Counts
Count the unweighted number of records in the survey sample, overall and by groups:
sum( weights( ncvs_design , "sampling" ) != 0 )
svyby( ~ one , ~ sex , ncvs_design , unwtd.count )
Weighted Counts
Count the weighted size of the generalizable population, overall and by groups:
svytotal( ~ one , ncvs_design )
svyby( ~ one , ~ sex , ncvs_design , svytotal )
Descriptive Statistics
Calculate the mean (average) of a linear variable, overall and by groups:
svymean( ~ victim , ncvs_design )
svyby( ~ victim , ~ sex , ncvs_design , svymean )
Calculate the distribution of a categorical variable, overall and by groups:
svymean( ~ current_marital_status , ncvs_design )
svyby( ~ current_marital_status , ~ sex , ncvs_design , svymean )
Calculate the sum of a linear variable, overall and by groups:
svytotal( ~ victim , ncvs_design )
svyby( ~ victim , ~ sex , ncvs_design , svytotal )
Calculate the weighted sum of a categorical variable, overall and by groups:
svytotal( ~ current_marital_status , ncvs_design )
svyby( ~ current_marital_status , ~ sex , ncvs_design , svytotal )
Calculate the median (50th percentile) of a linear variable, overall and by groups:
svyquantile( ~ victim , ncvs_design , 0.5 )
svyby(
~ victim ,
~ sex ,
ncvs_design ,
svyquantile , 0.5 ,
ci = TRUE
)
Estimate a ratio:
svyratio(
numerator = ~ times_moved_in_prior_five_years ,
denominator = ~ linear_age ,
ncvs_design ,na.rm = TRUE
)
Subsetting
Restrict the survey design to elderly Americans:
sub_ncvs_design <- subset( ncvs_design , linear_age >= 65 )
Calculate the mean (average) of this subset:
svymean( ~ victim , sub_ncvs_design )
Measures of Uncertainty
Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups:
this_result <- svymean( ~ victim , ncvs_design )
coef( this_result )
SE( this_result )
confint( this_result )
cv( this_result )
grouped_result <-
	svyby(
		~ victim ,
		~ sex ,
		ncvs_design ,
		svymean
	)
coef( grouped_result )
SE( grouped_result )
confint( grouped_result )
cv( grouped_result )
Calculate the degrees of freedom of any survey design object:
degf( ncvs_design )
Calculate the complex sample survey-adjusted variance of any statistic:
svyvar( ~ victim , ncvs_design )
Include the complex sample design effect in the result for a specific statistic:
# SRS without replacement
svymean( ~ victim , ncvs_design , deff = TRUE )
# SRS with replacement
svymean( ~ victim , ncvs_design , deff = "replace" )
Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop
for alternatives:
svyciprop( ~ household_income_75k , ncvs_design ,
method = "likelihood" )
Regression Models and Tests of Association
Perform a design-based t-test:
svyttest( victim ~ household_income_75k , ncvs_design )
Perform a chi-squared test of association for survey data:
svychisq(
~ household_income_75k + current_marital_status ,
ncvs_design )
Perform a survey-weighted generalized linear model:
glm_result <-
	svyglm(
		victim ~ household_income_75k + current_marital_status ,
		ncvs_design
	)
summary( glm_result )
Replication Example
This example matches the 1994 and 2022 victimization rates and SEs in Appendix Table 1:
# design covering both retained years, weighted by the new prevalence weight
new_prevalence_design <-
	svydesign(
		~ v2118 ,
		strata = ~ interaction( yr_grp , v2117 ) ,
		data = ncvs_person_df ,
		weights = ~ prev_wgt ,
		nest = TRUE
	)

# victimization rate by year
new_prevalence_results <-
	svyby(
		~ as.numeric( violent_year > 0 ) ,
		~ year ,
		new_prevalence_design ,
		svymean
	)
# match new method (wgt_ovam) 1994 and 2022 estimates
stopifnot(
round( coef( new_prevalence_results )[ c( 1 , nrow( new_prevalence_results ) ) ] , 4 ) ==
c( 0.0442 , 0.0151 )
)
# match new method (wgt_ovam) 1994 and 2022 standard errors
stopifnot(
round( SE( new_prevalence_results )[ c( 1 , nrow( new_prevalence_results ) ) ] , 5 ) ==
c( 0.0010 , 0.00054 )
)
# same design but weighted by the original person weight
old_prevalence_design <-
	svydesign(
		~ v2118 ,
		strata = ~ interaction( yr_grp , v2117 ) ,
		data = ncvs_person_df ,
		weights = ~ wgtpercy ,
		nest = TRUE
	)

# victimization rate by year
old_prevalence_results <-
	svyby(
		~ as.numeric( violent_year > 0 ) ,
		~ year ,
		old_prevalence_design ,
		svymean
	)
# match old method (wgtpercy) 1994 and 2022 estimates
stopifnot(
round( coef( old_prevalence_results )[ c( 1 , nrow( old_prevalence_results ) ) ] , 4 ) ==
c( 0.0328 , 0.0124 )
)
# match old method (wgtpercy) 1994 and 2022 standard errors
stopifnot(
round( SE( old_prevalence_results )[ c( 1 , nrow( old_prevalence_results ) ) ] , 5 ) ==
c( 0.00075 , 0.00042 )
)
Analysis Examples with srvyr
The R srvyr
library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize
, group_by
, and mutate
, the convenience of pipe-able functions, the tidyverse
style of non-standard evaluation and more consistent return types than the survey
package. This vignette details the available features. As a starting point for NCVS users, this code replicates previously-presented examples:
library(srvyr)

ncvs_srvyr_design <- as_survey( ncvs_design )
Calculate the mean (average) of a linear variable, overall and by groups:
ncvs_srvyr_design %>%
	summarize( mean = survey_mean( victim ) )

ncvs_srvyr_design %>%
	group_by( sex ) %>%
	summarize( mean = survey_mean( victim ) )