rm(list=ls())
# allows us to work with spatial data
library(sf)
# allows us to work with rasters
library(raster)
# lets us view spatial data in R
library(mapview) #mapviewOptions(default = TRUE) may be necessary if points arent showing up.
# lets us add piping operator 
library(dplyr)


setwd("/Users/sophiacarryl/Documents/Sparrows/Geospatial/UrbanScore/SophiaSparrows/")

# load in study sites
sites <- read.csv("./Data/studysites.csv")

sites <- sites %>%
  filter(!site %in% c("LPZ", "Dave", "Harper", "Sana", "Santymire"))

# make them points 
# note: crs NAD83 (EPSG:4269) WGS84 (EPSG: 4326)
# https://www.nceas.ucsb.edu/sites/default/files/2020-04/OverviewCoordinateReferenceSystems.pdf
# we will use WGS since there are sites that expand across multiple UTM Zones
points <- sf::st_as_sf(sites, coords = c("lat","lon"), crs = 4326)


#may be necessary if points arent showing up
mapviewOptions(default = TRUE) 

# always check data when you load it in
mapview::mapview(points) # looks good

# create a buffer around the site (2000 m)
# data is in WGS84 so the distance needs to be in arc degrees
# 1.11km = 0.01 degrees: https://www.usna.edu/Users/oceano/pguth/md_help/html/approx_equivalents.htm
scl <- 0.0182

# create a buffer around each site
buff <- st_buffer(points, dist = scl, nQuadSegs = 1000)

# map it to make sure it looks right
# here I just look at LPZ and it looks correct
mapview(buff[c(1),])
# can see that there is a buffer around all sites
mapview(buff, label = TRUE)


# Note: Because the two Evanstons overlap, you decided to take the average urban score of the two in order to represent the fact that they are not independent populations of house sparrows

### Working with Rasters

## Extract Tree Cover

# get NLCD 2016 USFS Tree Canopy Cover (CONUS) data from here: https://www.mrlc.gov/data?f%5B0%5D=category%3Atree%20canopy


# bring in canopy cover raster for US -- NLCD 2016 USFS Tree Canopy Cover (CONUS)
# downloaded from NLCD website
canopy <- raster("./Data/nlcd_2016_treecanopy_2019_08_31/nlcd_2016_treecanopy_2019_08_31.img")

# reproject points to match tree cover
points_reproject <- st_transform(points, st_crs(canopy))

# calculate the mean canopy cover of each buffer
# our reprojection points into UTM so we dont have to convert to dec degrees
# use meters as the distance argument
canopy_cover <- raster::extract(canopy, points_reproject, buffer = 2000, fun=mean)

# get canopy raster (it's huge!) out of our environment
rm(canopy)

## Extract Impervious Cover

# get NLCD 2016 Percent Developed Imperviousness (CONUS) data: https://www.mrlc.gov/data?f%5B0%5D=category%3AUrban%20Imperviousness&f%5B1%5D=category%3Atree%20canopy

#NLCD 2016 Percent Developed Imperviousness (CONUS)
# load in impervious cover raster
# data from NLCD website
imp <- raster("./Data/NLCD_2016_Impervious_L48_20190405/NLCD_2016_Impervious_L48_20190405.img")
points_reproject <- st_transform(points, st_crs(imp))
# extract the mean impervious cover from 2000m around each site
imp_cover <- raster::extract(imp, points_reproject, buffer = 2000, fun=mean)

# remove large raster now that we are done with it
rm(imp)


### Working with shapefiles

## Extract Housing Density

# get data for each state (IN, IL, NY) from here: http://silvis.forest.wisc.edu/data/housing-block-change/

# load shapefiles from computer. We can row bind them to make one big file, since
# they are all in the same projection

# dsn is the file stored
# layer is the name layer name with out the extension


housing_data <- rbind(read_sf(dsn = "./Data/il_blk10_Census_change_1990_2010_PLA2", 
                              layer = "il_blk10_Census_change_1990_2010_PLA2"),
                      read_sf(dsn = "./Data/in_blk10_Census_change_1990_2010_PLA2", 
                              layer = "in_blk10_Census_change_1990_2010_PLA2"),
                      read_sf(dsn = "./Data/ny_blk10_Census_change_1990_2010_PLA2", 
                              layer = "ny_blk10_Census_change_1990_2010_PLA2"))
#head(housing_data)

#mapview::mapview(housing_data)

# reproject buffer to match housing_data
buff_reproject <- st_transform(buff, st_crs(housing_data))
mapview(buff_reproject)

# create a list to hold all of the census blocks for each site
# each element is a site (or buffer around site)
housing_buff <- vector("list", nrow(buff_reproject))

# extract all census around each site and stored in the list
for(i in 1:nrow(buff_reproject)){
  housing_buff[[i]] <- housing_data[buff_reproject[i,],]
}

#convert a list of dataframes in to a single dataframe for fun-sie
#data <- plyr::ldply(housing_buff, data.frame)

# loop through each element of housing_buff list and take the sum of POP10
pop_list <- lapply(housing_buff, function(x){ sum(x$POP10) })

# condense list down to a vector
pop <- do.call("c", pop_list)

# calculate population density in our buffer
# buffer area
buff_area <- pi*2^2
# calculate density
pop_dens <- pop/buff_area

#Sum and collate housing Units
# loop through each element of housing_buff list and take the sum of Housing Units 2010

hu_list <- lapply(housing_buff, function(x){ sum(x$HU10) })
hu <- do.call("c", hu_list)

# Housing Units defined - https://www.census.gov/quickfacts/fact/note/US/HSG010219
# A housing unit is a house, an apartment, a mobile home, a group of rooms, or a single room that is occupied as separate living quarters

#create data frame with sites, imp_cover, and hectare per housing units
imp_hu_data <- data.frame(site = sites$site, 
                          impervious = imp_cover, 
                          human_density = pop_dens,
                          housing_units = hu,
                          greencover = canopy_cover)

imp_hu_data$site <- as.factor(imp_hu_data$site)

str(imp_hu_data$site)

#Calculate hectare (1km = 100 hectares ) per housing units
imp_hu_data <- imp_hu_data %>% 
  group_by(site) %>% 
  mutate(hectare_per_housing_units = (200/housing_units))

#   write.csv(imp_hu_data, "imp_hu_data.csv")

library(ggplot2)

imp_hu_data %>% 
  ggplot(aes(x = hectare_per_housing_units, y = impervious)) +
  geom_point(aes(color = site), size = 9) +
  theme_bw() +
  scale_x_continuous(trans = 'log10') 
  scale_y_continuous(trans = 'log10')
  

## Extract Pollution Data 

# Read in pollution data
Pollution <- read.csv("./Data/Sophia Dataset USGS Soil Pollution 2007-2010.csv")
Pollution <- Pollution[-1,] #remove the first row. 
#colnames(Pollution)


# Select the columns of interest
# Lead (Pb)
Pollution <- Pollution %>% 
  dplyr::select(SiteID, StateID, lat, lon, Top5_Pb)

Pollution_count <- Pollution %>% 
  group_by(StateID) %>% 
  mutate(frequency = n()) %>% 
  select(StateID, frequency) %>% 
  distinct()

# Convert pollution data into a simple feature file
Pollution_sf <- sf::st_as_sf(Pollution, coords = c("lat","lon"), crs = 4269)

mapview::mapview(Pollution_sf, coords = c("lat", "long"))


#convert to a shapefile
sf::st_write(Pollution_sf,"./Pb_Pollution", driver = "ESRI Shapefile", delete_layer = TRUE)

#read in shapefile
Pb_Pollution_sf <- sf::st_read("./Pb_Pollution/")
mapview::mapview(Pb_Pollution_sf)

# create a buffer around the site (2km). Note scl is 1.11km = 0.01 degrees convertion from above
buff_Pb <- sf::st_buffer(points, dist = scl, nQuadSegs = 1000)


# reproject buffer (our 2000m area) to match Lead pollution
buff_reproject_Pb <- st_transform(buff_Pb, st_crs(Pb_Pollution_sf))
plot(buff_reproject_Pb)

# create a list to hold all of the pollution data for each site
# each element is a site (or buffer around site)
pollution_buffer <- vector("list", nrow(buff_reproject_Pb))

# extract all pollution data within 2000m buffer and stored in the list called pollution buffer
for(i in 1:nrow(buff_reproject_Pb)){
  pollution_buffer[[i]] <- Pb_Pollution_sf[buff_reproject_Pb[i,],]
}

## Combine our parameters

# combine our three parameters
urb_data <- cbind(canopy_cover, imp_cover, pop_dens)


# Transform data by centering and scaling as recommended by Adam Haber

library(caret)
sdat <- subset(urb_data, select = c("canopy_cover", "imp_cover", "pop_dens"))
urb_data_trans <- caret::preProcess(sdat, method = c("center", "scale"))
urb_data_transformed <- predict(urb_data_trans, sdat)
summary(urb_data_transformed)


# check correlation between variables
cor(urb_data)

cor(urb_data_transformed)

# Conduct principle component analysis
urb_pc <- prcomp(urb_data)
summary(urb_pc)


urb_pc_trans <- prcomp(urb_data_transformed)
summary(urb_pc_trans)

library("FactoMineR")
library("factoextra")

fviz_eig(urb_pc, addlabels = TRUE) # creates a scree plot
fviz_contrib(urb_pc, choice = "var", axes = 1, ggtheme =  theme_bw(), addlabels = TRUE) # The contribution (%) by each variable to PC1
cont_var <- factoextra::get_pca(urb_pc, "var") # prints the contribution (%) for each variable to the correspodning PC
cont_var$contrib


fviz_eig(urb_pc_trans, addlabels = TRUE) # creates a scree plot
fviz_contrib(urb_pc_trans, choice = "var", axes = 1, ggtheme =  theme_bw(), addlabels = TRUE) # The contribution (%) by each variable to PC1
cont_var_trans <- factoextra::get_pca(urb_pc_trans, "var") # prints the contribution (%) for each variable to the correspodning PC
cont_var_trans$contrib

Var_contribution_trans <- cont_var_trans$contrib[,1]

#write.csv(Var_contribution_trans, "./Var_contribution_trans.csv") 


# library(ggpubr)
# ggpubr::ggpar(fig , yticks.by = 1) #if you have an interest in changing the tick marks

#fviz_contrib(urb_pc_trans, choice = "var", axes = 2)

#fviz_contrib(urb_pc_trans, choice = "var", axes = (1:2))

# principle components
pc <- urb_pc$x

pc_trans <- urb_pc_trans$x


# calculate the variance of each PC
var <- urb_pc$sdev^2

var_trans <- urb_pc_trans$sdev^2

# calculate how much variance each PC explains
# This is actually the Proportion of Variance row in Summary
var_expl <- var/sum(var)

var_expl_trans <- var_trans/sum(var_trans)

# use PC1 as our urban index
urb_index <- urb_pc$x[,1]

urb_index_trans <- urb_pc_trans$x[,1]

# PC1 explains 99% of the variation in the data, and more positive values of
# PC1 indicate higher human population density and impervious around each site 
# and lower canopy cover

# create a new data frame with all of your data
urban_data <- data.frame(site = sites$site, 
                         canopy = canopy_cover, 
                         housing_units= hu,
                         impervious = imp_cover, 
                         pop_dens = pop_dens,
                         urban_index = urb_index)


urban_data_trans <- data.frame(site = sites$site, 
                         canopy = canopy_cover, 
                         housing_units= hu,
                         impervious = imp_cover, 
                         pop_dens = pop_dens,
                         urban_index = urb_index_trans)

urban_data  <- urban_data  %>% 
  group_by(site) %>% 
  mutate(urban_index = urban_index * -1) %>% # change the direction of the signs to make sure it is positive more urban and negative less urban
  mutate(hectare_per_housing_units = (200/housing_units)) %>% 
  dplyr::mutate_if(is.numeric, round, digits = 3)

urban_data_trans  <- urban_data_trans  %>% 
  group_by(site) %>% 
  mutate(urban_index = urban_index * -1) %>% # change the direction of the signs to make sure it is positive more urban and negative less urban
  mutate(hectare_per_housing_units = (200/housing_units)) %>% 
  dplyr::mutate_if(is.numeric, round, digits = 3)

## Save output
#write.csv(urban_data, "./urban_index_results.csv")

## Save output
#write.csv(urban_data, "./urban_index_results.csv") 

#write.csv(urban_data_trans, "./urban_index_results_trans.csv") 

# Note: Because the two Evanstons overlap, you decided to take the average urban score of the two in order to represent the fact that they are not independent pollutions of house sparrows
                         
# Save workspace

save.image(file = "urban_score_pca.RData")