##Data and Coding Final Assignment: Spencer Katzman
##POLS4150
##Prof. Alexandria M. Putman

##NOTE:
##To access data files, set the working directory to the folder where the files
##are located. This program also creates data files and writes them to a
##subdirectory called "Output". For example, after cleaning the data and
##combining it from multiple sources, the program writes "CompleteDataSet.csv"
##to the "Output" directory.

##Clear workspace
rm(list = ls(all = TRUE))

##This command creates the subdirectory "Output" in the specified working directory.
dir.create(file.path(getwd(), "Output"), recursive = TRUE, showWarnings = FALSE)

#Load libraries (the tidyverse already includes tidyr and readr)
library(tidyverse)
library(readxl)

##Load data on population and percent of population over 18 from a file with
##data downloaded from the Census Bureau.
##Note: my working directory is set to the folder containing the data files.
age_data <- read_excel("age data.xlsx")
age_data$population <- age_data$...2

##Reduce population scale by a factor of one million
age_data$popinmil <- as.numeric(as.character(age_data$...2)) / 1000000

##Convert age data from decimal to percentage form
age_data$percentover18 <- as.numeric(as.character(age_data$...3)) /
  as.numeric(as.character(age_data$...2)) * 100

##Remove unwanted rows from the census data file
age_data <- age_data[-c(1, 2, 3, 12, 31, 55, 56, 57, 58, 59, 60, 61), ]

##Remove unnecessary columns from the census data file
age_data <- age_data[-c(2, 3, 4, 5)]

##Standardize the first column name as "State".
##This is done with all downloaded and converted data tables so they can be
##properly merged when creating the "complete" data file.
colnames(age_data)[1] <- "State"

##Change the other column names to match the references in the paper
colnames(age_data)[2] <- "Pop"
colnames(age_data)[3] <- "Age"

##Load data on TPS percentage with data downloaded from EdChoice.
EdChoice_Data <- read_excel("EdChoice Data.xlsx")
EdChoice_Data <- EdChoice_Data[order(EdChoice_Data$...1), ]

##Delete unnecessary columns
EdChoice_Data <- EdChoice_Data[-c(2, 3, 4, 5, 7, 8, 9)]

##Remove unwanted rows
EdChoice_Data <- EdChoice_Data[-c(9, 28), ]

##Standardize the first column name to "State"
colnames(EdChoice_Data)[1] <- "State"

##Change the second column name to "TPS" to match the data reference in the paper
colnames(EdChoice_Data)[2] <- "TPS"

##Load data on per capita income with data downloaded from the Federal Reserve.
Fed_Reserve_Income_Data <- read_excel("Fed Reserve Income Data.xlsx")

##Remove unwanted rows
Fed_Reserve_Income_Data <- Fed_Reserve_Income_Data[-c(9, 28), ]

##Remove unnecessary columns
Fed_Reserve_Income_Data <- Fed_Reserve_Income_Data[-c(3, 4)]

##Change the second column name to "Inc" to match the data reference in the paper
colnames(Fed_Reserve_Income_Data)[2] <- "Inc"

##Convert income values to thousands of dollars
Fed_Reserve_Income_Data$Inc <- as.numeric(as.character(Fed_Reserve_Income_Data$Inc)) / 1000
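##OPTIONAL: a minimal sanity check on the cleaned tables before merging (not
##part of the original assignment). It assumes each cleaned table should end up
##with one row per state and a numeric measure column, and that read_excel()
##parsed the measure columns as numeric. The helper name check_clean() is
##hypothetical and used only for illustration.
check_clean <- function(df, value_col) {
  stopifnot(!any(duplicated(df$State)))   ##no state should appear twice
  stopifnot(is.numeric(df[[value_col]]))  ##measure column should be numeric
  invisible(df)
}
check_clean(age_data, "Pop")
check_clean(EdChoice_Data, "TPS")
check_clean(Fed_Reserve_Income_Data, "Inc")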
##Load data on legislative control with data downloaded from the National
##Conference of State Legislatures website.
Legis_Control_Data_2023 <- read_excel("Legis_Control Data 2023.xlsx")

##Delete unwanted row
Legis_Control_Data_2023 <- Legis_Control_Data_2023[-c(27), ]

##Remove unnecessary columns
Legis_Control_Data_2023 <- Legis_Control_Data_2023[-c(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)]

##Convert string entries in the "State Control" column to a "rep" indicator:
##'1' if State Control = 'Rep', else '0'
Legis_Control_Data_2023$rep <- factor(Legis_Control_Data_2023$`State Control`,
                                      levels = c("Rep", "Dem", "Divided"),
                                      labels = c(1, 0, 0))

##Convert string entries in the "State Control" column to a "dem" indicator:
##'1' if State Control = 'Dem', else '0'
Legis_Control_Data_2023$dem <- factor(Legis_Control_Data_2023$`State Control`,
                                      levels = c("Rep", "Dem", "Divided"),
                                      labels = c(0, 1, 0))

##Remove the State Control column
Legis_Control_Data_2023 <- Legis_Control_Data_2023[-c(2)]

##Standardize the first column name to "State"
colnames(Legis_Control_Data_2023)[1] <- "State"

##Load data on Right to Work status with data downloaded from the National
##Conference of State Legislatures website.
Right_to_Work_State_Data <- read_excel("Right to Work State Data.xlsx")

##Remove Nebraska
Right_to_Work_State_Data <- Right_to_Work_State_Data[-c(27), ]

##Standardize the first column name to "State"
colnames(Right_to_Work_State_Data)[1] <- "State"

##Change the second column name to "RTW" to match the data reference in the paper
colnames(Right_to_Work_State_Data)[2] <- "RTW"

##Load data on racial percentages with data downloaded from Governing.com.
Race_Data <- read_excel("Race Data.xlsx")

##Remove Nebraska
Race_Data <- Race_Data[-c(27), ]

##Remove unnecessary columns
Race_Data <- Race_Data[-c(3, 5, 6)]

##Change the second and third column names to "Hsp" and "Blk" to match the data
##references in the paper
colnames(Race_Data)[2] <- "Hsp"
colnames(Race_Data)[3] <- "Blk"

##Convert racial data from decimal to percentage form
Race_Data$Hsp <- as.numeric(as.character(Race_Data$Hsp)) * 100
Race_Data$Blk <- as.numeric(as.character(Race_Data$Blk)) * 100

##Combine the data extracted and cleaned above into one data file.
CompleteDataSet <- dplyr::left_join(x = EdChoice_Data, y = Legis_Control_Data_2023, by = "State")
CompleteDataSet <- dplyr::left_join(x = CompleteDataSet, y = Race_Data, by = "State")
CompleteDataSet <- dplyr::left_join(x = CompleteDataSet, y = age_data, by = "State")
CompleteDataSet <- dplyr::left_join(x = CompleteDataSet, y = Fed_Reserve_Income_Data, by = "State")
CompleteDataSet <- dplyr::left_join(x = CompleteDataSet, y = Right_to_Work_State_Data, by = "State")

##Write the complete data set to the file CompleteDataSet.csv
##Note: all output files are written to the "Output" subdirectory of the working directory.
write.csv(CompleteDataSet, "Output/CompleteDataSet.csv")
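##OPTIONAL: a quick merge check (not part of the original assignment). After a
##chain of left joins keyed on "State", the row count should still match
##EdChoice_Data, and a nonzero NA count in any column would flag a state-name
##mismatch between the source tables.
stopifnot(nrow(CompleteDataSet) == nrow(EdChoice_Data))
colSums(is.na(CompleteDataSet))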
##The file is read back in to make sure the data being used is correct.
##Note: read_csv() comes from readr, which is loaded with the tidyverse.
DataSet <- read_csv("Output/CompleteDataSet.csv")

##Run regression with all 8 X's
reg.model1 <- lm(TPS ~ rep + dem + Hsp + Blk + Age + Pop + Inc + RTW, data = DataSet)

##Glance at regression results (glance() comes from broom, loaded with tidymodels)
library(tidymodels)
glance(reg.model1)
summary(reg.model1)

##Create a histogram to show variability in TPS
hist(DataSet$TPS, main = "", xlab = "TPS Enrollments")

##Create a bar plot to show variability in rep
counts <- table(factor(DataSet$rep, levels = c("0", "1")))
barplot(counts, xlab = "Republican Control (1 = Yes, 0 = No)", ylab = "Frequency")

##Create a bar plot to show variability in dem
counts <- table(factor(DataSet$dem, levels = c("0", "1")))
barplot(counts, xlab = "Democrat Control (1 = Yes, 0 = No)", ylab = "Frequency")

##Plot the relation between rep and TPS
boxplot(TPS ~ rep, data = DataSet, xlab = "Republican Control", ylab = "TPS Enrollment")

##Plot the relation between dem and TPS
boxplot(TPS ~ dem, data = DataSet, xlab = "Democrat Control", ylab = "TPS Enrollment")

##Check for outliers
cooks.distance(reg.model1)
cooks.distance(reg.model1)[which.max(cooks.distance(reg.model1))]
plot(reg.model1, which = 4)

##Alaska (2), Florida (9), and New Mexico (30) have Cook's distance values well
##more than 3 times the average distance, so they are treated as outliers.

##Run a smaller regression with 5 X's (dem, Pop, and RTW removed for causing
##multicollinearity) and the outliers left in.
reg.model2 <- lm(TPS ~ rep + Hsp + Blk + Age + Inc, data = DataSet)

##Glance at regression results
glance(reg.model2)
summary(reg.model2)

##DataSetNO ("NO" stands for No Outliers) removes AK, FL, and NM as outliers;
##dem, Pop, and RTW are again excluded from the model because of multicollinearity.
DataSetNO <- DataSet[-c(2, 9, 30), ]

##Run the smaller regression with 5 X's and the outliers removed
reg.model3 <- lm(TPS ~ rep + Hsp + Blk + Age + Inc, data = DataSetNO)

##Glance at regression results
glance(reg.model3)
summary(reg.model3)

######################################### Assumptions Testing #########################################

##Testing for Multicollinearity

##Remove the State and TPS columns, leaving only the regressors
CompleteXData <- CompleteDataSet[-c(1, 2)]

##Write the X data to a file to be used in creating the correlation table
write.csv(CompleteXData, "Output/CompleteX.csv")

##Compute the Pearson correlation matrix and write it to file.
##rep and dem are stored as factors, so convert them back to numeric first.
CompleteXData$rep <- as.numeric(as.character(CompleteXData$rep))
CompleteXData$dem <- as.numeric(as.character(CompleteXData$dem))
write.csv(cor(CompleteXData), "Output/CorFileX.csv")
CorFile <- read_csv("Output/CorFileX.csv")

##Remove the row-name column
CorFile <- CorFile[-c(1)]

##Calculate VIF
library(car)

##Check multicollinearity with all 8 X's
vif1 <- vif(reg.model1)
view(vif1)

##Check multicollinearity with only the 5 X's used in Models 2 and 3 (the three
##X's that appeared to cause multicollinearity, per the VIF values above and the
##Pearson correlation matrix, were removed)
vif2 <- vif(reg.model2)
view(vif2)
vif3 <- vif(reg.model3)
view(vif3)

##Testing for Linearity of Parameters

##Plot residuals vs. fitted values
plot(reg.model1, 1, main = "Model 1")
plot(reg.model2, 1, main = "Model 2")
plot(reg.model3, 1, main = "Model 3")

##All three lines are flat, except that the line for Model 3 curves at the
##lowest fitted values. The average residuals are near 0 over the entire range,
##again except at the low fitted values in Model 3.
##Conclusion: there is linearity in parameters.
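##OPTIONAL: the visual linearity check above can be supplemented with Ramsey's
##RESET test from the lmtest package (not part of the original assignment; the
##package is assumed to be installed). A small p-value would suggest a
##misspecified functional form.
library(lmtest)
resettest(reg.model1)
resettest(reg.model2)
resettest(reg.model3)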
##Testing for Homoscedasticity

##Plot residuals versus fitted values for the three models
plot(reg.model1$fitted.values, reg.model1$residuals, main = "Model 1", xlab = "Fitted Values", ylab = "Residuals")
plot(reg.model2$fitted.values, reg.model2$residuals, main = "Model 2", xlab = "Fitted Values", ylab = "Residuals")
plot(reg.model3$fitted.values, reg.model3$residuals, main = "Model 3", xlab = "Fitted Values", ylab = "Residuals")

##The residuals are evenly spread with no sign of a trend, so this test is passed.

##Plot sqrt(standardized residuals) versus fitted values to check for
##heteroskedasticity in all three models
plot(reg.model1, 3, main = "Model 1")
plot(reg.model2, 3, main = "Model 2")
plot(reg.model3, 3, main = "Model 3")

##These lines are fairly flat, so this test is also passed.
##Conclusion: the homoskedasticity assumption is satisfied.

##Testing for Normally Distributed Errors

##Plot standardized residuals against the theoretical quantiles for all three models
plot(reg.model1, 2, main = "Model 1")
plot(reg.model2, 2, main = "Model 2")
plot(reg.model3, 2, main = "Model 3")

##All points are near the dotted line in Model 1 except for Florida, which falls
##below the line by a fair amount. Generally, points are near the 45-degree line
##for all three models.
##Conclusion: errors are normally distributed.

##Thanks for an incredibly interesting class, Professor Putman!
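##OPTIONAL appendix (not part of the original assignment): the Q-Q plots above
##can be supplemented with a formal Shapiro-Wilk test on each model's residuals;
##this is a sketch using base R's stats package. A small p-value would suggest
##non-normal errors.
shapiro.test(residuals(reg.model1))
shapiro.test(residuals(reg.model2))
shapiro.test(residuals(reg.model3))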