/** Import data **/ /* Determine treatment and control groups, assuming uniform dispersion */ data home_sample_uni(keep=loc dist inf_level inf_level1 sqft yrblt garage down home_val); set home_sample; if dist > 0.5 then inf_level1 = 0; else inf_level1 = inf_level; run; /* Determine treatment and control groups, assuming non-uniform dispersion; 0.75 mile max infection if downstream; 0.2 mile max if upstream */ data home_sample_non(keep=loc dist inf_level inf_level1 sqft yrblt garage down home_val); set home_sample; inf_level1 = 0; if (down=1 and dist <= 0.75) then inf_level1 = inf_level; else if (down=0 and dist <= 0.2) then inf_level1 = inf_level; run; /* IML Analysis */ proc iml; /** (1) Uniform dispersion assumption **/ /* Read data into matrix language */ use home_sample_uni; read all var{home_val} into y_u; read all var{sqft yrblt dock inf_level1} into x_u; /* Add column of ones */ x_u = j(nrow(x_u),1,1)||x_u; /* Perform OLS */ b_hat_u = inv(x_u`*x_u)*x_u`*y_u; e_u = y_u - x_u*b_hat_u; /* Variance-covariance matrix and vector of standard errors */ var_b_u = ((y_u-x_u*b_hat_u)`*(y_u-x_u*b_hat_u))/(nrow(y_u)-ncol(x_u))*inv(x_u`*x_u); se_b_u = sqrt(vecdiag(var_b_u)); /* Calculate vector of t-statistics */ t_stat_b_u = b_hat_u / se_b_u; p_value_u = 2*(1 - cdf("t",abs(t_stat_b_u),nrow(y_u)-ncol(x_u))); /*Fit inferences*/ R_num = (e_u`*e_u)/(nrow(y_u)-ncol(x_u)); R_den = ((y_u - y_u[:,])`*(y_u - y_u[:,]))/(nrow(y_u)-1); R_sq_u = 1 - R_num/R_den; AIC_u = log((e_u`*e_u)/nrow(y_u)) + (2*ncol(x_u))/nrow(y_u); BIC_u = log((e_u`*e_u)/nrow(y_u)) + (ncol(x_u)/nrow(y_u))*log(nrow(y_u)); print b_hat_u se_b_u t_stat_b_u p_value_u, R_sq_u AIC_u BIC_u; /** (2) Non-uniform assumption **/ use home_sample_non; read all var{home_val} into y_n; read all var{sqft yrblt dock inf_level1} into x_n; x_n = j(nrow(x_n),1,1)||x_n; b_hat_n = inv(x_n`*x_n)*x_n`*y_n; e_n = y_n - x_n*b_hat_n; var_b_n = ((y_n-x_n*b_hat_n)`*(y_n-x_n*b_hat_n))/(nrow(y_n)-ncol(x_n))*inv(x_n`*x_n); se_b_n = sqrt(vecdiag(var_b_n)); t_stat_b_n = b_hat_n / se_b_n; p_value_n = 2*(1 - cdf("t",abs(t_stat_b_n),nrow(y_n)-ncol(x_n))); /*Fit inferences*/ R_num = (e_n`*e_n)/(nrow(y_n)-ncol(x_n)); R_den = ((y_n - y_n[:,])`*(y_n - y_n[:,]))/(nrow(y_n)-1); R_sq_n = 1 - R_num/R_den; AIC_n = log((e_n`*e_n)/nrow(y_n)) + (2*ncol(x_n))/nrow(y_n); BIC_n = log((e_n`*e_n)/nrow(y_n)) + (ncol(x_n)/nrow(y_n))*log(nrow(y_n)); print b_hat_n se_b_n t_stat_b_n p_value_n, R_sq_n AIC_n BIC_n; quit;