rm(list=ls()) # allows us to work with spatial data library(sf) # allows us to work with rasters library(raster) # lets us view spatial data in R library(mapview) #mapviewOptions(default = TRUE) may be necessary if points arent showing up. # lets us add piping operator library(dplyr) setwd("/Users/sophiacarryl/Documents/Sparrows/Geospatial/UrbanScore/SophiaSparrows/") # load in study sites sites <- read.csv("./Data/studysites.csv") sites <- sites %>% filter(!site %in% c("LPZ", "Dave", "Harper", "Sana", "Santymire")) # make them points # note: crs NAD83 (EPSG:4269) WGS84 (EPSG: 4326) # https://www.nceas.ucsb.edu/sites/default/files/2020-04/OverviewCoordinateReferenceSystems.pdf # we will use WGS since there are sites that expand across multiple UTM Zones points <- sf::st_as_sf(sites, coords = c("lat","lon"), crs = 4326) #may be necessary if points arent showing up mapviewOptions(default = TRUE) # always check data when you load it in mapview::mapview(points) # looks good # create a buffer around the site (2000 m) # data is in WGS84 so the distance needs to be in arc degrees # 1.11km = 0.01 degrees: https://www.usna.edu/Users/oceano/pguth/md_help/html/approx_equivalents.htm scl <- 0.0182 # create a buffer around each site buff <- st_buffer(points, dist = scl, nQuadSegs = 1000) # map it to make sure it looks right # here I just look at LPZ and it looks correct mapview(buff[c(1),]) # can see that there is a buffer around all sites mapview(buff, label = TRUE) # Note: Because the two Evanstons overlap, you decided to take the average urban score of the two in order to represent the fact that they are not independent populations of house sparrows ### Working with Rasters ## Extract Tree Cover # get NLCD 2016 USFS Tree Canopy Cover (CONUS) data from here: https://www.mrlc.gov/data?f%5B0%5D=category%3Atree%20canopy # bring in canopy cover raster for US -- NLCD 2016 USFS Tree Canopy Cover (CONUS) # downloaded from NLCD website canopy <- raster("./Data/nlcd_2016_treecanopy_2019_08_31/nlcd_2016_treecanopy_2019_08_31.img") # reproject points to match tree cover points_reproject <- st_transform(points, st_crs(canopy)) # calculate the mean canopy cover of each buffer # our reprojection points into UTM so we dont have to convert to dec degrees # use meters as the distance argument canopy_cover <- raster::extract(canopy, points_reproject, buffer = 2000, fun=mean) # get canopy raster (it's huge!) out of our environment rm(canopy) ## Extract Impervious Cover # get NLCD 2016 Percent Developed Imperviousness (CONUS) data: https://www.mrlc.gov/data?f%5B0%5D=category%3AUrban%20Imperviousness&f%5B1%5D=category%3Atree%20canopy #NLCD 2016 Percent Developed Imperviousness (CONUS) # load in impervious cover raster # data from NLCD website imp <- raster("./Data/NLCD_2016_Impervious_L48_20190405/NLCD_2016_Impervious_L48_20190405.img") points_reproject <- st_transform(points, st_crs(imp)) # extract the mean impervious cover from 2000m around each site imp_cover <- raster::extract(imp, points_reproject, buffer = 2000, fun=mean) # remove large raster now that we are done with it rm(imp) ### Working with shapefiles ## Extract Housing Density # get data for each state (IN, IL, NY) from here: http://silvis.forest.wisc.edu/data/housing-block-change/ # load shapefiles from computer. We can row bind them to make one big file, since # they are all in the same projection # dsn is the file stored # layer is the name layer name with out the extension housing_data <- rbind(read_sf(dsn = "./Data/il_blk10_Census_change_1990_2010_PLA2", layer = "il_blk10_Census_change_1990_2010_PLA2"), read_sf(dsn = "./Data/in_blk10_Census_change_1990_2010_PLA2", layer = "in_blk10_Census_change_1990_2010_PLA2"), read_sf(dsn = "./Data/ny_blk10_Census_change_1990_2010_PLA2", layer = "ny_blk10_Census_change_1990_2010_PLA2")) #head(housing_data) #mapview::mapview(housing_data) # reproject buffer to match housing_data buff_reproject <- st_transform(buff, st_crs(housing_data)) mapview(buff_reproject) # create a list to hold all of the census blocks for each site # each element is a site (or buffer around site) housing_buff <- vector("list", nrow(buff_reproject)) # extract all census around each site and stored in the list for(i in 1:nrow(buff_reproject)){ housing_buff[[i]] <- housing_data[buff_reproject[i,],] } #convert a list of dataframes in to a single dataframe for fun-sie #data <- plyr::ldply(housing_buff, data.frame) # loop through each element of housing_buff list and take the sum of POP10 pop_list <- lapply(housing_buff, function(x){ sum(x$POP10) }) # condense list down to a vector pop <- do.call("c", pop_list) # calculate population density in our buffer # buffer area buff_area <- pi*2^2 # calculate density pop_dens <- pop/buff_area #Sum and collate housing Units # loop through each element of housing_buff list and take the sum of Housing Units 2010 hu_list <- lapply(housing_buff, function(x){ sum(x$HU10) }) hu <- do.call("c", hu_list) # Housing Units defined - https://www.census.gov/quickfacts/fact/note/US/HSG010219 # A housing unit is a house, an apartment, a mobile home, a group of rooms, or a single room that is occupied as separate living quarters #create data frame with sites, imp_cover, and hectare per housing units imp_hu_data <- data.frame(site = sites$site, impervious = imp_cover, human_density = pop_dens, housing_units = hu, greencover = canopy_cover) imp_hu_data$site <- as.factor(imp_hu_data$site) str(imp_hu_data$site) #Calculate hectare (1km = 100 hectares ) per housing units imp_hu_data <- imp_hu_data %>% group_by(site) %>% mutate(hectare_per_housing_units = (200/housing_units)) # write.csv(imp_hu_data, "imp_hu_data.csv") library(ggplot2) imp_hu_data %>% ggplot(aes(x = hectare_per_housing_units, y = impervious)) + geom_point(aes(color = site), size = 9) + theme_bw() + scale_x_continuous(trans = 'log10') scale_y_continuous(trans = 'log10') ## Extract Pollution Data # Read in pollution data Pollution <- read.csv("./Data/Sophia Dataset USGS Soil Pollution 2007-2010.csv") Pollution <- Pollution[-1,] #remove the first row. #colnames(Pollution) # Select the columns of interest # Lead (Pb) Pollution <- Pollution %>% dplyr::select(SiteID, StateID, lat, lon, Top5_Pb) Pollution_count <- Pollution %>% group_by(StateID) %>% mutate(frequency = n()) %>% select(StateID, frequency) %>% distinct() # Convert pollution data into a simple feature file Pollution_sf <- sf::st_as_sf(Pollution, coords = c("lat","lon"), crs = 4269) mapview::mapview(Pollution_sf, coords = c("lat", "long")) #convert to a shapefile sf::st_write(Pollution_sf,"./Pb_Pollution", driver = "ESRI Shapefile", delete_layer = TRUE) #read in shapefile Pb_Pollution_sf <- sf::st_read("./Pb_Pollution/") mapview::mapview(Pb_Pollution_sf) # create a buffer around the site (2km). Note scl is 1.11km = 0.01 degrees convertion from above buff_Pb <- sf::st_buffer(points, dist = scl, nQuadSegs = 1000) # reproject buffer (our 2000m area) to match Lead pollution buff_reproject_Pb <- st_transform(buff_Pb, st_crs(Pb_Pollution_sf)) plot(buff_reproject_Pb) # create a list to hold all of the pollution data for each site # each element is a site (or buffer around site) pollution_buffer <- vector("list", nrow(buff_reproject_Pb)) # extract all pollution data within 2000m buffer and stored in the list called pollution buffer for(i in 1:nrow(buff_reproject_Pb)){ pollution_buffer[[i]] <- Pb_Pollution_sf[buff_reproject_Pb[i,],] } ## Combine our parameters # combine our three parameters urb_data <- cbind(canopy_cover, imp_cover, pop_dens) # Transform data by centering and scaling as recommended by Adam Haber library(caret) sdat <- subset(urb_data, select = c("canopy_cover", "imp_cover", "pop_dens")) urb_data_trans <- caret::preProcess(sdat, method = c("center", "scale")) urb_data_transformed <- predict(urb_data_trans, sdat) summary(urb_data_transformed) # check correlation between variables cor(urb_data) cor(urb_data_transformed) # Conduct principle component analysis urb_pc <- prcomp(urb_data) summary(urb_pc) urb_pc_trans <- prcomp(urb_data_transformed) summary(urb_pc_trans) library("FactoMineR") library("factoextra") fviz_eig(urb_pc, addlabels = TRUE) # creates a scree plot fviz_contrib(urb_pc, choice = "var", axes = 1, ggtheme = theme_bw(), addlabels = TRUE) # The contribution (%) by each variable to PC1 cont_var <- factoextra::get_pca(urb_pc, "var") # prints the contribution (%) for each variable to the correspodning PC cont_var$contrib fviz_eig(urb_pc_trans, addlabels = TRUE) # creates a scree plot fviz_contrib(urb_pc_trans, choice = "var", axes = 1, ggtheme = theme_bw(), addlabels = TRUE) # The contribution (%) by each variable to PC1 cont_var_trans <- factoextra::get_pca(urb_pc_trans, "var") # prints the contribution (%) for each variable to the correspodning PC cont_var_trans$contrib Var_contribution_trans <- cont_var_trans$contrib[,1] #write.csv(Var_contribution_trans, "./Var_contribution_trans.csv") # library(ggpubr) # ggpubr::ggpar(fig , yticks.by = 1) #if you have an interest in changing the tick marks #fviz_contrib(urb_pc_trans, choice = "var", axes = 2) #fviz_contrib(urb_pc_trans, choice = "var", axes = (1:2)) # principle components pc <- urb_pc$x pc_trans <- urb_pc_trans$x # calculate the variance of each PC var <- urb_pc$sdev^2 var_trans <- urb_pc_trans$sdev^2 # calculate how much variance each PC explains # This is actually the Proportion of Variance row in Summary var_expl <- var/sum(var) var_expl_trans <- var_trans/sum(var_trans) # use PC1 as our urban index urb_index <- urb_pc$x[,1] urb_index_trans <- urb_pc_trans$x[,1] # PC1 explains 99% of the variation in the data, and more positive values of # PC1 indicate higher human population density and impervious around each site # and lower canopy cover # create a new data frame with all of your data urban_data <- data.frame(site = sites$site, canopy = canopy_cover, housing_units= hu, impervious = imp_cover, pop_dens = pop_dens, urban_index = urb_index) urban_data_trans <- data.frame(site = sites$site, canopy = canopy_cover, housing_units= hu, impervious = imp_cover, pop_dens = pop_dens, urban_index = urb_index_trans) urban_data <- urban_data %>% group_by(site) %>% mutate(urban_index = urban_index * -1) %>% # change the direction of the signs to make sure it is positive more urban and negative less urban mutate(hectare_per_housing_units = (200/housing_units)) %>% dplyr::mutate_if(is.numeric, round, digits = 3) urban_data_trans <- urban_data_trans %>% group_by(site) %>% mutate(urban_index = urban_index * -1) %>% # change the direction of the signs to make sure it is positive more urban and negative less urban mutate(hectare_per_housing_units = (200/housing_units)) %>% dplyr::mutate_if(is.numeric, round, digits = 3) ## Save output #write.csv(urban_data, "./urban_index_results.csv") ## Save output #write.csv(urban_data, "./urban_index_results.csv") #write.csv(urban_data_trans, "./urban_index_results_trans.csv") # Note: Because the two Evanstons overlap, you decided to take the average urban score of the two in order to represent the fact that they are not independent pollutions of house sparrows # Save workspace save.image(file = "urban_score_pca.RData")