Code
library(dplyr)
library(sf)
library(tmap)
library(ggplot2)
library(tidyr)
library(leaflet)
library(glue)
Spatial visualization | Assess variable influence
Heli Xu
February 12, 2024
After using finite mixture modeling to cluster the neighborhoods (ZATs) across all the ZAT-level indicators on road infrastructures, we’ll visualize the resulting neighborhood profiles on maps.
As mentioned in the previous post, all ZAT-level indicators were categorized into two domains, street design and transportation, and the neighborhood profiles are generated for each ZAT unit from the Gaussian mixture model-based clustering across the variables in each domain, as well as in both domains together. From the modeling results, we could extract the domain-specific and overall cluster information of all ZATs with clusters()
, and join the data with the ZAT-level shapefiles in ZAT folder
to get the geometry.
To spatially visualize the neighborhood profiles, we’ll create 3 maps to plot the cluster assignment of each ZAT unit for street design domain, transportation domain and both domains together.
street <- zat_cluster %>%
select(ZAT, geometry, street) %>%
drop_na() %>%
#using NAD83 in tmap
tm_shape(projection = sf::st_crs(26915))+
tm_polygons("street",
palette = "YlGn",
style = "cat",
title = "cluster")+
tm_layout(panel.show = TRUE,
panel.labels = "Street Design Profiles",
panel.label.fontface = "bold",
title.position = c("left", "TOP"),
legend.position = c("RIGHT", "bottom"),
legend.title.size = 0.9,
legend.width = 2)
transp <- zat_cluster %>%
select(ZAT, geometry, transp) %>%
drop_na() %>%
#using NAD83 in tmap
tm_shape(projection = sf::st_crs(26915))+
tm_polygons("transp",
palette = "YlGn",
style = "cat",
title = "cluster")+
tm_layout(panel.show = TRUE,
panel.labels = "Transportation Profiles",
panel.label.fontface = "bold",
title.position = c("left", "TOP"),
legend.position = c("RIGHT", "bottom"),
legend.title.size = 0.9,
legend.width = 2)
all_ind <- zat_cluster %>%
select(ZAT, geometry, all) %>%
drop_na() %>%
#using NAD83 in tmap
tm_shape(projection = sf::st_crs(26915))+
tm_polygons("all",
palette = "YlGn",
style = "cat",
title = "cluster")+
tm_layout(panel.show = TRUE,
panel.labels = "All-Indicator Profiles",
panel.label.fontface = "bold",
title.position = c("left", "TOP"),
legend.position = c("RIGHT", "bottom"),
legend.title.size = 0.9,
legend.width = 2)
plots <- list(street, transp, all_ind)
current.mode <- tmap_mode("plot")
tmap mode set to plotting
We can see that the clustering by street design and/or transportation domain(s) yielded a relatively consistent pattern of neighborhood profiles in Bogotá.
Since these profiles are modeled across the variables (in each domain), we’ll now explore further which variables may be more influential in the clustering result, which will also be important in our interpretation of the different profiles/clusters.
One way to do that is to visualize each variable by the ZAT unit and see which variables have a better cluster separation (as in the previous post).
In addition, we can examine the mean and variance of each variable see which ones are significantly different across clusters.
zat_std2 <- readRDS("../../../../clean_data/ZAT/zat_std2.rds")
zat_cluster_var <- zat_cluster %>%
as.data.frame() %>%
select(-geometry) %>%
left_join(zat_std2, by = "ZAT")
street_dm <- c("st_4ln_length_log", "bikelane_m_log", "trlight_per_km2",
"sttree_per_km2", "bridg_per_km2")
transportation_dm <- c("BUSTOPDENS", "bus_length_log", "brt_length_log", "numrbp_per_km2")
st_var <- zat_cluster_var %>%
select(-ZAT, -transp, -all) %>%
pivot_longer(-street, names_to = "indicator") %>%
filter(indicator %in% street_dm) %>%
group_by(street, indicator) %>%
mutate(mean = mean(value),
sd = sd(value))
MinMeanSDMax <- function(x) {
v <- c(mean(x)-3*sd(x), mean(x) - sd(x), mean(x), mean(x) + sd(x), mean(x)+3*sd(x))
names(v) <- c("ymin", "lower", "middle", "upper", "ymax")
v
}
ggplot(st_var, aes(x= factor(street), y =value, fill = factor(street)))+
stat_summary(fun.data=MinMeanSDMax, geom="boxplot") +
scale_fill_brewer(palette = "YlGn") + # Change fill colors
theme_minimal() + # Use a minimal theme
labs(title = "Street Design Indicators by Neighborhood Profiles",
x = "Cluster",
y = "Value",
fill = "cluster") +
facet_wrap(~indicator, scales = "free", nrow = 1)
It looks like the street length (st_4ln_length_log), traffic lights (trlight_per_km2) and pedestrian bridge have the most separation between clusters and therefore are the more influential variables in this model-based clustering result. For street length:
zat_imp_geo <- readRDS("../../../../clean_data/zat_imp_geo.rds")
pal <- colorNumeric(
palette = "Blues",
domain = zat_imp_geo$st_4ln_length_log
)
zat_label <- glue("ZAT{zat_imp_geo$ZAT} Street length (log): {zat_imp_geo$st_4ln_length_log}")
zat_imp_geo %>%
select(ZAT, st_4ln_length_log) %>%
st_zm() %>%
st_transform(crs = st_crs("+proj=longlat +datum=WGS84")) %>%
leaflet() %>%
addProviderTiles(providers$CartoDB.Voyager) %>%
addPolygons(color = "white",
weight = 0.5,
smoothFactor = 0.5,
opacity = 1,
fillColor = ~pal(st_4ln_length_log),
fillOpacity = 0.8,
highlightOptions = highlightOptions(
weight = 5,
color = "#666",
fillOpacity = 0.8,
bringToFront = TRUE),
label = zat_label,
labelOptions = labelOptions(
style = list(
"font-family" = "Fira Sans, sans-serif",
"font-size" = "1.2em"
))
)%>%
addLegend("bottomleft",
pal = pal,
values = ~st_4ln_length_log,
title = "Street length (log) in Bogotá</br>(ZAT-level)",
opacity = 1)
pal2 <- colorNumeric(
palette = "viridis",
domain = zat_imp_geo$trlight_per_km2
)
zat_label2 <- glue("ZAT{zat_imp_geo$ZAT} Traffic light/km2: {zat_imp_geo$trlight_per_km2}")
zat_imp_geo %>%
select(ZAT, trlight_per_km2) %>%
st_zm() %>%
st_transform(crs = st_crs("+proj=longlat +datum=WGS84")) %>%
leaflet() %>%
addProviderTiles(providers$CartoDB.Voyager) %>%
addPolygons(color = "white",
weight = 0.5,
smoothFactor = 0.5,
opacity = 1,
fillColor = ~pal2(trlight_per_km2),
fillOpacity = 0.8,
highlightOptions = highlightOptions(
weight = 5,
color = "#666",
fillOpacity = 0.8,
bringToFront = TRUE),
label = zat_label2,
labelOptions = labelOptions(
style = list(
"font-family" = "Fira Sans, sans-serif",
"font-size" = "1.2em"
))
)%>%
addLegend("bottomleft",
pal = pal2,
values = ~trlight_per_km2,
title = "Traffic light per km2 in Bogotá</br>(ZAT-level)",
opacity = 1)
Indeed it looks like the cluster 1 (light yellow in the profile map) is corresponding to the neighborhoods that have longer streets (with 4 or more lanes). Although it’s hard to tell on the map due to the zeros, based on the boxplot, these neighborhoods also have a slightly lower pedestrian bridge count and traffic light per km2.
transp_var <- zat_cluster_var %>%
select(-ZAT, -street, -all) %>%
pivot_longer(-transp, names_to ="indicator") %>%
filter(indicator %in% transportation_dm) %>%
group_by(transp, indicator) %>%
mutate(mean = mean(value),
sd = sd(value))
ggplot(transp_var, aes(x= factor(transp), y = value, fill = factor(transp)))+
stat_summary(fun.data=MinMeanSDMax, geom="boxplot") +
scale_fill_brewer(palette = "YlGn") + # Change fill colors
theme_minimal() + # Use a minimal theme
labs(title = "Transportation Indicators by Neighborhood Profiles",
x = "Cluster",
y = "Value",
fill = "cluster") +
facet_wrap(~indicator, scales = "free", nrow = 1)
While the general pattern of these variables are relatively consistent (despite the large variance), it looks like bus-related variables are more influential than the BRT one (hmm) in the modeling, especially the bus route length (bus_length_log). Looking at the bus route length on a map:
pal3 <- colorNumeric(
palette = "magma",
domain = zat_imp_geo$bus_length_log
)
zat_label3 <- glue("ZAT{zat_imp_geo$ZAT} Bus routes length (log): {zat_imp_geo$bus_length_log}")
zat_imp_geo %>%
select(ZAT, bus_length_log) %>%
st_zm() %>%
st_transform(crs = st_crs("+proj=longlat +datum=WGS84")) %>%
leaflet() %>%
addProviderTiles(providers$CartoDB.Voyager) %>%
addPolygons(color = "white",
weight = 0.5,
smoothFactor = 0.5,
opacity = 1,
fillColor = ~pal3(bus_length_log),
fillOpacity = 0.8,
highlightOptions = highlightOptions(
weight = 5,
color = "#666",
fillOpacity = 0.8,
bringToFront = TRUE),
label = zat_label3,
labelOptions = labelOptions(
style = list(
"font-family" = "Fira Sans, sans-serif",
"font-size" = "1.2em"
))
)%>%
addLegend("bottomleft",
pal = pal3,
values = ~bus_length_log,
title = "Bus routes length (log) in Bogotá</br>(ZAT-level)",
opacity = 1)
From the map and boxplots, cluster 3 (darker green) seem to be corresponding to neighborhoods with few bus routes, whereas cluster 1 and 2 seem to have longer bus routes. It is a little surprising to me that some of the neighborhoods with long bus routes do not have many traffic lights.
After we have more variables perhaps aggregated from street level, we can also use logistic regression model to test the influence of each variable, and try including/excluding different variables to compare the entropy from the model-based clustering.