Albuquerque Urban Greenspace¶


This project explores possible correlative relationships between various socioeconomic factors and greenspace in Albuquerque, New Mexico. Albuquerque is the largest city in New Mexico, with a population of 500,000 not including the surrounding communities. It has a semi-arid climate with a monsoon seasonal pattern, bringing brutal summers but relatively mild winters. It also sits at a fairly high elevation, which further increases its aridity.

Economic indicators such as median income and property value will be queried through the US Census Bureau API for the American Community Survey (ACS). The geometries from the resulting GeoDataFrames will then be used to query the Microsoft Planetary Computer API, which returns a list of URLs of aerial-imagery GeoTIFFs covering each census tract. These can be loaded, clipped, and used to calculate NDVI from the band data contained in the rasters.
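
As a preview of the core computation, below is a minimal sketch of the NDVI calculation used later in this notebook. The file name naip_tile.tif is a placeholder, and the band order (red in band 1, near-infrared in band 4) follows the NAIP convention used in the processing code further down.

    import rioxarray as rxr
    import xarray as xr

    # Open a NAIP tile with the nodata mask applied (file name is a placeholder)
    naip = rxr.open_rasterio("naip_tile.tif", masked=True).squeeze()

    red = naip.sel(band=1)   # red band
    nir = naip.sel(band=4)   # near-infrared band

    # NDVI = (NIR - Red) / (NIR + Red), guarding against division by zero
    ndvi = xr.where((nir + red) == 0, 0, (nir - red) / (nir + red))
    print(float(ndvi.mean()))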

Setting up analysis¶

In [ ]:
import os
import sys
print(sys.path)

# Import modules
import census
import earthpy as et
import geopandas as gpd
import hashlib
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pystac_client
import rioxarray as rxr
import rioxarray.merge as rxrmerge
import shapely.geometry
import us
import xarray as xr

from shapely.geometry import shape

from hvplot.plotting import scatter_matrix
from ipywidgets import interact, Dropdown
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

data_dir = os.path.join(et.io.HOME, et.io.DATA_NAME)
abq_dir = os.path.join(data_dir, 'albuquerque')


if not os.path.exists(abq_dir):
    os.makedirs(abq_dir)
In [ ]:
# Load the city boundary shapefile and cache it locally
abq_path = os.path.join(abq_dir, 'cityboundary.shp')
if not os.path.exists(abq_path):
    abq_url = ('jurisdiction.zip')
    gpd.read_file(abq_url).to_file(abq_path)
In [ ]:
NM_fips = us.states.NM.fips

year = "2021"  # Example year, change as needed
census_url = (f"https://www2.census.gov/geo/tiger/TIGER{year}/TRACT/tl_{year}_{NM_fips}_tract.zip")

NM_census_path = os.path.join(data_dir, 'NM2020census.shp')
if not os.path.exists(NM_census_path):
    gpd.read_file(census_url).to_file(NM_census_path)
In [ ]:
tract_gdf = gpd.read_file(NM_census_path)  # Census tracts
abq_gdf = gpd.read_file(abq_path).to_crs('epsg:4269')  # ABQ boundary GDF
abq_gdf = abq_gdf[abq_gdf['JURISDICTI'] == 'ALBUQUERQUE']
# Reproject tract_gdf to match the CRS
tract_gdf = tract_gdf.to_crs(abq_gdf.crs)

# Perform the spatial join
abq_tracts = gpd.sjoin(tract_gdf, abq_gdf, how="inner", predicate='within')

print(abq_gdf.crs)
print(abq_tracts.crs)
epsg:4269
EPSG:4269
In [ ]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 10))
abq_gdf.plot(ax=ax, color='none', edgecolor='blue', linewidth=2)  # Plot city boundaries in blue
abq_tracts.plot(ax=ax, color='green', alpha=0.5, edgecolor = 'white')  # Plot tracts in green
plt.show()
[Figure: Albuquerque city boundary (blue) with the census tracts that fall within it (green)]

API KEY¶

This step will require a valid API key from the US Census Bureau. One can be obtained for free at https://api.census.gov/data/key_signup.html
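
As a quick sanity check (separate from the keyring-based workflow below), the key can be tested with a raw request to the ACS 5-year endpoint. The year, variable, and placeholder key in this sketch are for illustration only.

    import requests

    # Hypothetical check: request total population (B01003_001E) for New Mexico (state FIPS 35)
    test_key = "YOUR_CENSUS_API_KEY"  # placeholder; use the key obtained from the signup page
    url = (
        "https://api.census.gov/data/2021/acs/acs5"
        f"?get=NAME,B01003_001E&for=state:35&key={test_key}"
    )
    response = requests.get(url)
    print(response.status_code)
    print(response.json()[:2])  # header row plus the first data row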

In [ ]:
#Code from Ed

import getpass
import keyring



def get_api_key(service_name, username, override=False):
    # Get the API key from the keyring
    api_key = keyring.get_password(service_name, username)

    # If the API key is None or override is True, prompt for a new key
    if api_key is None or override:
        api_key = getpass.getpass(prompt='Enter U.S. Census Bureau API Key: ')
        keyring.set_password(service_name, username, api_key)

    return api_key


service_name = "US_Census_Bureau_ACS"
username = "api_key"

# API key expires in 48 hours. Set override to True to force a new key.
api_key = get_api_key(service_name, username, override=True)
In [ ]:
from census import Census
from us import states

c = Census(api_key)
In [ ]:
def get_all_tracts_for_state(api_key, state_fips, year):
    c = Census(api_key)
    try:
        # Fetching data for a basic variable, but we're interested in the tract IDs
        data = c.acs5.get(('B01003_001E',), {'for': 'tract:*', 'in': f'state:{state_fips}'}, year=year)
        tracts = set([d['tract'] for d in data])  # Extracting unique tract IDs
        return list(tracts)
    except Exception as e:
        print(f"Error occurred during fetching tracts: {e}")
        return []

# Example usage
state_fips = states.NM.fips
year = 2021
all_tracts = get_all_tracts_for_state(api_key, state_fips, year)
print(f"Number of tracts in New Mexico: {len(all_tracts)}")
Number of tracts in New Mexico: 466
In [ ]:
# Step 1: Define Basic Parameters
variables = ["B06011_001E",'B19001_001E','B25077_001E','B25064_001E','B25034_001E' ]  
# Median income, income in the last 12 months, median property value, median gross rent, and year structure built
year = 2021
cache_dir = os.path.join(data_dir, "census_dir")
if not os.path.exists(cache_dir):
    os.mkdir(cache_dir)
In [ ]:
from census import Census
from us import states

def get_all_tracts_for_state(api_key, state_fips, year):
    c = Census(api_key)
    filters = {"for": "tract:*", "in": f"state:{state_fips}"}
    all_tracts = c.acs5.get(('NAME',), filters, year=year)
    return [(tract["state"], tract["county"], tract["tract"]) for tract in all_tracts]

# Get all tracts for New Mexico
all_tracts = get_all_tracts_for_state(api_key, state_fips, year)

print(all_tracts)
In [ ]:
# Step 3: Batch Processing
def create_batches(data_list, batch_size):
    for i in range(0, len(data_list), batch_size):
        yield data_list[i:i + batch_size]

batch_size = 50  # Adjust as needed
tract_batches = list(create_batches(all_tracts, batch_size))
In [ ]:
def download_state_tract_data(api_key, state_fips, tract_batches, variables, year, cache_dir):
    c = Census(api_key)
    all_data = []  # List to store data from all batches

    for batch_index, batch in enumerate(tract_batches):
        batch_id = hashlib.md5('_'.join([f"{state}-{county}-{tract}" for state, county, tract in batch]).encode()).hexdigest()
        cache_file = os.path.join(cache_dir, f"{state_fips}_{batch_id}.csv")

        print(f"Processing batch {batch_index + 1}/{len(tract_batches)}: {batch_id}")

        if os.path.exists(cache_file) and os.path.getsize(cache_file) > 0:
            print(f"Cache file found for batch {batch_id}, loading data from cache.")
            batch_data = pd.read_csv(cache_file)
            print(f"Loaded {len(batch_data)} rows from cache.")
        else:
            print(f"No cache file found for batch {batch_id}, downloading data.")
            batch_data_list = []
            for state, county, tract in batch:
                print(f"Querying API for state: {state}, county: {county}, tract: {tract}")
                data = c.acs5.get(variables, {'for': f'tract:{tract}', 'in': f'state:{state} county:{county}'}, year=year)
                if data:
                    batch_data_list.append(pd.DataFrame(data))
                    print(f"Saving to {cache_file} in {cache_dir}")

            if batch_data_list:
                batch_data = pd.concat(batch_data_list, ignore_index=True)
                print(f"Batch {batch_id} data size: {len(batch_data)} rows. Saving to cache.")
                batch_data.to_csv(cache_file, index=False)

        all_data.append(batch_data)  # Append the batch data to the all_data list

    # Concatenate all batch data into a single DataFrame
    state_tract_data = pd.concat(all_data, ignore_index=True) if all_data else pd.DataFrame()
    print(f"Total data size: {len(state_tract_data)} rows")

    return state_tract_data
In [ ]:
state_fips = us.states.NM.fips  # New Mexico FIPS code
year = 2021
variables = ["B06011_001E",'B19001_001E','B25077_001E','B25064_001E','B25034_001E' ]  

# tract_batches is a list of batches, each batch being a list of (state, county, tract) tuples
# cache_dir was created above

state_tract_data = download_state_tract_data(api_key, state_fips, tract_batches, variables, year, cache_dir)
 
In [ ]:
import pandas as pd
# Convert both values to int
abq_tracts['TRACTCE'] = abq_tracts['TRACTCE'].astype(int)
state_tract_data['tract'] = state_tract_data['tract'].astype(int)


# Merging the DataFrames on columns with different names
abq_gdf_merged = pd.merge(abq_tracts, 
                      state_tract_data, 
                      how='inner', 
                      left_on='TRACTCE', 
                      right_on='tract')

# Check for missing data and remove tracts without income data 
abq_gdf_merged = abq_gdf_merged.dropna(subset=['B06011_001E',
                                               'B19001_001E', 
                                               'B25077_001E',
                                                'B25064_001E',
                                                'B25034_001E' ] ) 

# Remove tracts with nonsensical values below 0
abq_gdf_merged = abq_gdf_merged[(abq_gdf_merged['B06011_001E'] >= 0) 
                                & (abq_gdf_merged['B25077_001E']>=0)
                                & (abq_gdf_merged['B19001_001E']>=0)
                                & (abq_gdf_merged['B25064_001E']>=0)
                                & (abq_gdf_merged['B25034_001E']>=0)]

abq_gdf_merged = abq_gdf_merged.to_crs('EPSG:4326')


print(abq_gdf_merged.head())
In [ ]:
min_income = abq_gdf_merged['B06011_001E'].min()
max_income = abq_gdf_merged['B06011_001E'].max()
print(f"Minimum Income: {min_income}, Maximum Income: {max_income}")

abq_gdf_merged.plot(column='B06011_001E',
                    legend=True,
                    figsize=(10, 10),
                    cmap='viridis',
                    vmin=min_income,  # Set the minimum value
                    vmax=max_income)  # Set the maximum value
plt.title('Median Income by Census Tract in Albuquerque')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.show()
Minimum Income: 7437.0, Maximum Income: 64207.0
[Figure: Median Income by Census Tract in Albuquerque]
In [ ]:
import pystac_client

# Initialize STAC client
e84_catalog = pystac_client.Client.open("https://planetarycomputer.microsoft.com/api/stac/v1")

# Define a simple GeoJSON geometry for testing
test_geometry = {
    "type": "Point",
    "coordinates": [-90.199402, 38.627003]  # Coordinates for St. Louis, MO
}

# Define a time range (you can adjust this to your needs)
test_datetime = "2020-01-01/2021-01-01"

# Perform the test search
try:
    search = e84_catalog.search(
        collections=["naip"],
        intersects=test_geometry,
        datetime=test_datetime
    )

    # Fetch items
    items = list(search.items())
    print(f"Found {len(items)} items.")

    # Optionally, print the IDs of the first few items
    for item in items[:5]:
        print(item.id)

except Exception as e:
    print(f"An error occurred: {e}")
Found 2 items.
mo_m_3809031_nw_15_060_20200725
mo_m_3809023_sw_15_060_20200725
In [ ]:
# Initialize the STAC client
client = pystac_client.Client.open("https://planetarycomputer.microsoft.com/api/stac/v1")

# Path to cache file
cache_file_path = 'stac_results.json'

# Function to save stac_results to cache
def save_to_cache(stac_results, cache_file_path):
    with open(cache_file_path, 'w') as file:
        json.dump(stac_results, file)

# Try to load cached results
if os.path.exists(cache_file_path):
    with open(cache_file_path, 'r') as file:
        stac_results = json.load(file)
else:
    stac_results = []

# Initialize cached_df with a 'tract_id' column if stac_results is empty or the file doesn't exist
if not stac_results:
    cached_df = pd.DataFrame(columns=['tract_id'])
else:
    # Convert list of dicts to DataFrame for easier checking
    cached_df = pd.DataFrame(stac_results)

# Iterate over each tract in the GeoDataFrame
for index, row in abq_gdf_merged.iterrows():
    tract_id = row['tract']
    if tract_id in cached_df['tract_id'].values:
        print(f"Skipping tract ID: {tract_id}, already processed.")
        continue  # Skip this tract if already processed
    
    geometry = row['geometry']  # The geometry column from the GeoDataFrame

    # Ensure the geometry is in a format that can be serialized to GeoJSON
    if not isinstance(geometry, dict):
        geometry = shapely.geometry.mapping(geometry)  # Convert Shapely object to GeoJSON
    
    # Perform the search
    search = client.search(
        collections=["naip"],
        intersects=geometry,
        datetime="2020"
    )

    # Process the search results
    items = list(search.items())
    print(f"Found {len(items)} items for tract ID: {tract_id}")

    if items:
        item_urls = [i.assets['image'].href for i in items]
        stac_results.append({'tract_id': tract_id, 'urls': item_urls, 'geometry': geometry})
        # Save updated results to cache after each tract is processed
        save_to_cache(stac_results, cache_file_path)

print("Updated stac_results cached.")
In [ ]:
stac_results_df = pd.DataFrame(stac_results)  # Convert list of dicts to DataFrame for easier processing

# Load existing stats from CSV if it exists
stats_path = 'abq_ndvi_stats.csv'


    
def compute_ndvi_by_tract(tract_data, stats_path):
    '''Iterates through all tracts and loads in each url, merging rasters
    if there are more than one for a particular track, then computes NDVI
    and appends tract info to stats csv file.
    '''

    # Load existing stats from CSV if it exists, otherwise create an empty DataFrame with the correct columns
    if os.path.exists(stats_path):
        existing_stats_df = pd.read_csv(stats_path)
    else:
        columns = ['tract_id', 
                'ndvi_min', 
                'ndvi_max', 
                'ndvi_mean', 
                'ndvi_25pctl', 
                'ndvi_50pctl', 
                'ndvi_75pctl', 
                'fraction_ndvi_gt_0_12']
        existing_stats_df = pd.DataFrame(columns=columns)
        existing_stats_df.to_csv(stats_path, index=False) 


    tract_id = tract_data['tract_id']
    urls = list(tract_data['urls'])
    
    if not urls:
        print("No URLs to process for tract", tract_id)
        return

    geometry_dict = tract_data['geometry']

    # Convert the dictionary to a Shapely geometry object
    geometry_shapely = shape(geometry_dict)
    
    # Wrap the Shapely geometry object into a GeoSeries
    geometry_gseries = gpd.GeoSeries([geometry_shapely], crs="EPSG:4326")    
    
    # Skip processing if NDVI stats for this tract already exist
    if tract_id in existing_stats_df['tract_id'].values:
        print(f"NDVI stats for tract {tract_id} already exist. Skipping...")
        return
    
    # Load and merge rasters from url
    tract_rasters = None
    for url in urls:
        print(f'Loading raster from {url}')
        raster_data = rxr.open_rasterio(url, masked=True).squeeze()
        print(raster_data.shape)
        tract_geometry = geometry_gseries.to_crs(raster_data.rio.crs)
        print(tract_geometry)
        
        minx, miny, maxx, maxy = tract_geometry.total_bounds
        print(f'Cropping raster...')
        cropped_raster_data = raster_data.rio.clip_box(minx=minx,
                                                        miny=miny, 
                                                        maxx=maxx, 
                                                        maxy=maxy,
                                                         auto_expand=True)
   
        
        print(cropped_raster_data.shape)
        
        print(f'Clipping raster...')
        clipped_raster_data =  cropped_raster_data.rio.clip(tract_geometry, all_touched=True).fillna(0)
        if tract_rasters is None:
            tract_rasters = clipped_raster_data
        else: 
            tract_rasters = rxrmerge.merge_arrays([tract_rasters, clipped_raster_data])
        
    if tract_rasters.shape[0] < 4:
        print(f"Raster for tract {tract_id} does not contain the necessary bands for NDVI calculation.")
        return
    
        
    # Calculate NDVI stats
    nir = tract_rasters.sel(band=4)
    print(nir)
    red =  tract_rasters.sel(band=1)
    print(red)
    ndvi_da = xr.where((nir + red) == 0, 0, (nir - red) / (nir + red))

    # Eliminate infinite values
    ndvi_da = ndvi_da.where(np.isfinite(ndvi_da), np.nan)

    print(ndvi_da)

    new_ndvi_stats = pd.DataFrame([{
        'tract_id': tract_id,
        'ndvi_min': float(ndvi_da.min(skipna=True).values),
        'ndvi_max': float(ndvi_da.max(skipna=True).values),
        'ndvi_mean': float(ndvi_da.mean(skipna=True).values),
        'ndvi_25pctl': float(np.nanpercentile(ndvi_da, 25)),
        'ndvi_50pctl': float(np.nanpercentile(ndvi_da, 50)),
        'ndvi_75pctl': float(np.nanpercentile(ndvi_da, 75)),
        'fraction_ndvi_gt_0_12': float(((ndvi_da > 0.12).sum(skipna=True) / ndvi_da.count()).values)}])
    
    print(new_ndvi_stats)
    
    # Append new stats to the existing DataFrame and save to CSV
    updated_stats_df = pd.concat([existing_stats_df, new_ndvi_stats], ignore_index=True)
    updated_stats_df.to_csv(stats_path, index=False)
    print(f'Stats for tract {tract_id} updated in {stats_path}.')

    

    

    
In [ ]:
for tract in stac_results:
    compute_ndvi_by_tract(tract, stats_path)

DROPDOWN PLOT COMPARISON¶

In [ ]:
# Load in full stats file as pandas DataFrame
ndvi_stats_df = pd.read_csv(stats_path)

# Prep GeoDataFrame for merging and rename columns
abq_gdf_merged.rename(columns={'B06011_001E': 'Median Income',
                                'B19001_001E': 'Income sources in last 12 Months', 
                                'B25077_001E': 'Estimated Property Value',
                                'B25064_001E': 'Monthly Gross Rent',
                                'B25034_001E': 'Infrastructure Years Built (Total)',
                                "TRACTCE": 'tract_id'}, inplace=True)

# Merge census data with NDVI stats
abq_ndvi_gdf = abq_gdf_merged.merge(ndvi_stats_df, on='tract_id')

# Function to plot selected data
def plot_data(selected_column):
    fig, ax = plt.subplots(figsize=(10, 6))
    abq_ndvi_gdf.plot(column=selected_column, 
                      legend=True, 
                      ax=ax, 
                      cmap='viridis',
                      legend_kwds={'label': f"{selected_column} by Census Tract"},
                      vmin=abq_ndvi_gdf[selected_column].min(), 
                      vmax=abq_ndvi_gdf[selected_column].max())
    ax.set_title(f'{selected_column} by Census Tract in Albuquerque')
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    plt.tight_layout()
    plt.show()

# Create dropdown and interact function call
column_dropdown = Dropdown(options=['Median Income', 
                                    'Income sources in last 12 Months', 
                                    'Estimated Property Value', 
                                    'Monthly Gross Rent',
                                    'Infrastructure Years Built (Total)',
                                      'ndvi_mean'])
interact(plot_data, selected_column=column_dropdown)
interactive(children=(Dropdown(description='selected_column', options=('Median Income', 'Income sources in las…
Out[ ]:
<function __main__.plot_data(selected_column)>

Model Description¶

In order to evaluate whether the data can reasonably be fit with an ordinary least squares regression, the following assumptions are necessary:

  • The chosen economic variables are representative of the actual socioeconomic demographics of each census tract, with a sufficient sample size.
  • That the reflectance values used to calculate NDVI come from actual greenspace and not from other reflective surfaces (e.g., that there are not a bunch of red buildings everywhere).
  • That NDVI and economic values per census tract are not being influenced by additional factors that lie outside the scope of the analysis.
  • That the datasets for the economic indicators and the imagery used to compute NDVI are aligned spatially and temporally.

The objective of this model is to explore the relationship between urban greenspace and the socioeconomic level of a neighborhood, to determine whether the NDVI of the greenspace can be predicted from the economic indicators derived from census data. To evaluate fit, we need metrics that are useful for predicting socioeconomic level, such as median income, median property values, and tax brackets, assuming that these follow a roughly Gaussian distribution. We also seek to identify collinearity between these indicators and metrics describing the amount and health of greenspace in the urban area, to measure the likelihood that either the economic level or the vegetation could reasonably be predicted from the other for any particular census tract.
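
As a quick first look at that collinearity question before fitting anything, a pairwise correlation matrix of the merged data can be computed directly. This is a minimal sketch assuming the renamed columns produced in the merge step above.

    # Pairwise Pearson correlations between the economic indicators and mean NDVI
    cols = ['Median Income', 'Estimated Property Value', 'Monthly Gross Rent', 'ndvi_mean']
    print(abq_ndvi_gdf[cols].corr().round(2))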

Advantages:

  • If there is a clear relationship between the dependent and independent variables, a least squares linear regression will make it evident.
  • It allows the dependent variable to be plotted as a function of the independent variable to show linearity.
  • If there are corroborating factors, a least squares linear regression can serve as a basis for exploring their relationship to the variables.

Disadvantages:

  • If the data points are not strongly correlated, a least squares linear regression will not show the existing relationship well.
  • A least squares regression may overshadow variable relationships that would be better explored with a different model.
  • Even if the variables can be mapped in a linear relationship, this does not necessarily imply causality.
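
For reference, the model fit to each indicator below is a one-variable ordinary least squares regression: for tract i with indicator value x_i and mean NDVI y_i,

    \hat{y}_i = \beta_0 + \beta_1 x_i,
    \qquad
    (\hat{\beta}_0, \hat{\beta}_1) = \arg\min_{\beta_0,\, \beta_1} \sum_i \left( y_i - \beta_0 - \beta_1 x_i \right)^2

The coefficient of determination (R^2) reported later measures how much of the variance in y is explained by this fitted line.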

Data Transformations¶

  • I applied a logarithmic transformation to the economic indicators and the NDVI mean to reduce skew that could affect the model.
  • I applied both normalization and standardization to the log-transformed economic indicators and NDVI mean, storing the results in separate columns to compare through a scatterplot matrix. This ensures that the variables are compared on similar scales relative to each other.
  • Tracts with no data were already dropped when creating the dataframe.
In [ ]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Initialize scalers
scaler = MinMaxScaler()
std_scaler = StandardScaler()

# Existing code
abq_ndvi_gdf['log_median_income'] = np.log(abq_ndvi_gdf['Median Income'] + 1)  # Adding 1 to avoid log(0)
abq_ndvi_gdf['log_ndvi_mean'] = np.log(abq_ndvi_gdf['ndvi_mean'] + 1) 
abq_ndvi_gdf['log_ndvi_75pctl'] = np.log(abq_ndvi_gdf['ndvi_75pctl'] + 1) 

# New variables: log transformation
abq_ndvi_gdf['log_income_sources_last_12_months'] = np.log(abq_ndvi_gdf['Income sources in last 12 Months'] + 1)
abq_ndvi_gdf['log_estimated_property_value'] = np.log(abq_ndvi_gdf['Estimated Property Value'] + 1)
abq_ndvi_gdf['log_ndvi_min'] = np.log(abq_ndvi_gdf['ndvi_min'] + 1)
abq_ndvi_gdf['log_monthly_rent'] = np.log(abq_ndvi_gdf['Monthly Gross Rent'] + 1)


# Normalizing new and existing variables
abq_ndvi_gdf['Normalized Median Income'] = scaler.fit_transform(abq_ndvi_gdf[['log_median_income']].values.reshape(-1, 1))
abq_ndvi_gdf['Normalized NDVI Mean'] = scaler.fit_transform(abq_ndvi_gdf[['log_ndvi_mean']].values.reshape(-1, 1))
abq_ndvi_gdf['Normalized NDVI 75ptcl'] = scaler.fit_transform(abq_ndvi_gdf[['log_ndvi_75pctl']].values.reshape(-1, 1))
abq_ndvi_gdf['Normalized Income Sources'] = scaler.fit_transform(abq_ndvi_gdf[['log_income_sources_last_12_months']].values.reshape(-1, 1))
abq_ndvi_gdf['Normalized Property Value'] = scaler.fit_transform(abq_ndvi_gdf[['log_estimated_property_value']].values.reshape(-1, 1))
abq_ndvi_gdf['Normalized Monthly Rent'] = scaler.fit_transform(abq_ndvi_gdf[['log_monthly_rent']].values.reshape(-1, 1))
abq_ndvi_gdf['Normalized NDVI Min'] = scaler.fit_transform(abq_ndvi_gdf[['log_ndvi_min']].values.reshape(-1, 1))

# Standardizing new and existing variables
abq_ndvi_gdf['Standardized Median Income'] = std_scaler.fit_transform(abq_ndvi_gdf[['log_median_income']].values.reshape(-1, 1))
abq_ndvi_gdf['Standardized NDVI Mean'] = std_scaler.fit_transform(abq_ndvi_gdf[['log_ndvi_mean']].values.reshape(-1, 1))
abq_ndvi_gdf['Standardized NDVI 75ptcl'] = std_scaler.fit_transform(abq_ndvi_gdf[['log_ndvi_75pctl']].values.reshape(-1, 1))
abq_ndvi_gdf['Standardized Income Sources'] = std_scaler.fit_transform(abq_ndvi_gdf[['log_income_sources_last_12_months']].values.reshape(-1, 1))
abq_ndvi_gdf['Standardized Property Value'] = std_scaler.fit_transform(abq_ndvi_gdf[['log_estimated_property_value']].values.reshape(-1, 1))
abq_ndvi_gdf['Standardized NDVI Min'] = std_scaler.fit_transform(abq_ndvi_gdf[['log_ndvi_min']].values.reshape(-1, 1))

# Select columns of interest for the scatter matrix
columns_of_interest = ['Normalized Median Income', 'Normalized Property Value', 'Normalized Monthly Rent', 'Normalized NDVI Mean', 'Standardized NDVI 75ptcl']

# Create the scatter matrix
scatter_matrix(abq_ndvi_gdf[columns_of_interest])
Out[ ]:

Fitting to a Least Squares Regression model¶

Here is where we can finally train the model to predict what the NDVI might be for the given economic indicators. We create a training/test split using 20% of the data for testing and feed the training data into the regression model. The fitted model will output a predicted NDVI for any test values (x values) of the income statistics fed to it. We then plot the actual measured results against the line predicted by the least squares regression, and finally store the predicted and measured results in a dataframe for comparison.

In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

# Feature list: these should correspond to the normalized and logged columns created earlier
features = ['Normalized Median Income', 'Normalized Income Sources', 'Normalized Property Value']
target = 'Normalized NDVI Mean'

# Initialize scalers for each feature for normalization reversal
scalers = {feature: MinMaxScaler() for feature in features}
scalers[target] = MinMaxScaler()  # Adding a scaler for the target as well

# Fit scalers on the full dataset to be able to inverse transform later
for feature in features + [target]:
    scalers[feature].fit(abq_ndvi_gdf[[feature]])

# Preparing DataFrames and fitting models
results = []
fig, axes = plt.subplots(nrows=1, ncols=len(features), figsize=(15, 5), sharey=True)

for i, feature in enumerate(features):
    X = abq_ndvi_gdf[[feature]].values
    y = abq_ndvi_gdf[target].values  # Target vector is the normalized NDVI

    # Splitting the data into training/testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Creating and training the linear regression model
    regr = LinearRegression()
    regr.fit(X_train, y_train)

    # Predictions
    y_pred = regr.predict(X_test)

    # Reversing normalization for actual and predicted values
    y_test_actual = scalers[target].inverse_transform(y_test.reshape(-1, 1)).flatten()
    y_pred_actual = scalers[target].inverse_transform(y_pred.reshape(-1, 1)).flatten()

    # Reversing log transformation
    y_test_exp = np.exp(y_test_actual) - 1  # Assuming log(x + 1) was used
    y_pred_exp = np.exp(y_pred_actual) - 1

    # Collecting results in a DataFrame
    df_results = pd.DataFrame({'Actual': y_test_exp, 'Predicted': y_pred_exp})
    results.append(df_results)

    # Plotting results
    ax = axes[i]
    ax.scatter(X_test, y_test_exp, color="black", label='Actual data')
    ax.plot(X_test, y_pred_exp, color="blue", linewidth=2, label='Prediction')
    ax.set_title(f'Model for {feature}')
    ax.set_xlabel(f'{feature}')
    ax.set_ylabel('NDVI Mean' if i == 0 else "")
    ax.legend()

plt.tight_layout()
plt.show()

# Combining and saving results
combined_results = pd.concat(results, keys=features)
combined_results.to_csv('predicted_vs_measured.csv', index=True)
predict_df = pd.read_csv('predicted_vs_measured.csv')

print(predict_df.head())

# Example and performance metrics
print("Example results from the first model:")
print(results[0].head())
print("\nCoefficients: \n", regr.coef_)
print("Mean squared error: %.2f" % mean_squared_error(y_test_exp, y_pred_exp))
print("Coefficient of determination (R^2): %.2f" % r2_score(y_test_exp, y_pred_exp))
[Figure: actual vs. predicted NDVI mean for each normalized economic indicator]
                 Unnamed: 0  Unnamed: 1    Actual  Predicted
0  Normalized Median Income           0  0.986380   0.728381
1  Normalized Median Income           1  0.749880   0.712661
2  Normalized Median Income           2  0.786089   0.699502
3  Normalized Median Income           3  0.741052   0.715726
4  Normalized Median Income           4  0.293868   0.704962
Example results from the first model:
     Actual  Predicted
0  0.986380   0.728381
1  0.749880   0.712661
2  0.786089   0.699502
3  0.741052   0.715726
4  0.293868   0.704962

Coefficients: 
 [-0.14234052]
Mean squared error: 0.14
Coefficient of determination (R^2): 0.02

Predictive Model Shows a Weak Relationship Between Parameters¶

Greenspace data for a city like Albuquerque is discontiguous because the aridity of the climate confines most greenspace to the area closest to the river. Elsewhere it is spotty, although there are several parks and golf courses throughout the city. As the plots show, many of the higher-income neighborhoods are toward the edge of the city limits in the open desert, and these houses typically have xeriscaped properties.

In [ ]:
%%capture
%%bash
jupyter nbconvert abq-ndvi.ipynb --to html --no-input