# LSTM-forecaster/utils/fetch_comstock_data.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 13 14:04:47 2024
@author: Nikhil Kapila
"""
import pandas as pd
import os
import requests
from requests.exceptions import RequestException
import time
# ComStock floor-area category labels (building square footage bins),
# ordered smallest to largest; find_closest_floor_area_category ranks
# these by proximity to a requested floor area.
size_bins = ['1001_5000', '5001_10000', '10001_25000', '25001_50000',
             '50001_100000', '100001_200000', '200001_500000']
def fetch_data() -> pd.DataFrame:
    """Load the Georgia ComStock building metadata table from disk.

    Returns:
        pd.DataFrame: one row per building, with metadata columns such as
        ``in.floor_area_category`` and ``bldg_id``.
    """
    # Alternate metadata files tried during development:
    #   datasets/comstock/GA_filtered_building_list.csv
    #   datasets/comstock/GA_baseline_basic_metadata_and_annual_results.csv
    metadata_path = 'datasets/comstock/GA_metadata.csv'
    return pd.read_csv(metadata_path)
def parse_floor_area_category(floor_area_category):
    """Parse a floor-area bin label such as '5001_10000' into (lower, upper).

    A label with no underscore is treated as a degenerate bin where
    lower == upper. Returns (None, None) when the label is not numeric.
    """
    try:
        if "_" in floor_area_category:
            low_str, high_str = floor_area_category.split("_")[:2]
            return int(low_str), int(high_str)
        # Single number: both bounds collapse to the same value.
        value = int(floor_area_category)
        return value, value
    except ValueError:
        # Non-numeric label (e.g. free text) -- signal "unparseable".
        return None, None
def calculate_midpoint(floor_area_category):
    """Return the arithmetic midpoint of a floor-area bin label.

    Returns None when the label cannot be parsed into numeric bounds.
    """
    bounds = parse_floor_area_category(floor_area_category)
    if None in bounds:
        return None
    return sum(bounds) / 2
def find_closest_floor_area_category(input_floor_area, predefined_bins):
    """Rank predefined floor-area bins by distance to the input's midpoint.

    Args:
        input_floor_area: bin label such as '5001_10000'.
        predefined_bins: iterable of candidate bin labels.

    Returns:
        list: the bins sorted from closest to farthest midpoint.

    Raises:
        ValueError: if ``input_floor_area`` cannot be parsed into a midpoint.
            (Previously this surfaced as an opaque TypeError from the
            ``abs(None - ...)`` inside the sort key.)
    """
    input_midpoint = calculate_midpoint(input_floor_area)
    if input_midpoint is None:
        raise ValueError(f"Unparseable floor area category: {input_floor_area!r}")

    def distance(bin_label):
        # Unparseable bins are pushed to the end instead of crashing the key.
        midpoint = calculate_midpoint(bin_label)
        if midpoint is None:
            return (1, 0.0)
        return (0, abs(midpoint - input_midpoint))

    return sorted(predefined_bins, key=distance)
def find_id(floor_area: str = '5001_10000', in_vintage: str = '1980 to 1989', building_type: str = 'Office') -> int:
    """Pick a representative ComStock building id for the requested profile.

    Searches the GA metadata for a building of the given type in the
    floor-area bin closest to ``floor_area``, preferring lower total site
    energy consumption. If no type match exists, falls back to the building
    whose floor-area midpoint is nearest the input.

    Args:
        floor_area: floor-area bin label, e.g. '5001_10000'.
        in_vintage: construction vintage. Currently unused -- the vintage
            filter is commented out below; kept for interface stability.
        building_type: ComStock building type group, e.g. 'Office'.

    Returns:
        int: the selected ``bldg_id``.

    Raises:
        ValueError: when no building can be selected at all.
    """
    # Normalize the UI label to the metadata's value.
    if building_type in ('Lodging / Residential', 'Lodging'):
        building_type = 'Lodging'
    print(building_type)
    filter_on = fetch_data()
    closest_bins = find_closest_floor_area_category(floor_area, size_bins)
    print(f"Input floor area: {floor_area}, Closest bins: {closest_bins}")
    for closest_category in closest_bins:
        conditions = {
            'in.floor_area_category': [closest_category],
            'in.comstock_building_type_group': [building_type],
            # 'in.vintage': [in_vintage],
            # 'in.cluster_name': [cluster_name]
        }
        query_string = ' & '.join([f"`{col}` in {values}" for col, values in conditions.items()])
        filtered_bldgs = filter_on.query(query_string)
        if not filtered_bldgs.empty:
            # Prefer the most energy-efficient matching building.
            filtered_bldgs1 = filtered_bldgs.sort_values(
                by='calc.weighted.site_energy.total.energy_consumption..tbtu', ascending=True)
            return filtered_bldgs1.iloc[0]['bldg_id']
    # Fallback: no type match at all -- pick the building whose floor-area
    # midpoint is nearest the requested one. (The previous min/max heuristic
    # returned the LARGEST building for any mid-range input.)
    filter_on['midpoint'] = filter_on['in.floor_area_category'].apply(calculate_midpoint)
    input_midpoint = calculate_midpoint(floor_area)
    if input_midpoint is not None:
        candidates = filter_on.dropna(subset=['midpoint'])
        if not candidates.empty:
            closest_idx = (candidates['midpoint'] - input_midpoint).abs().idxmin()
            return candidates.loc[closest_idx, 'bldg_id']
    raise ValueError("No buildings match the given criteria.")
# =============================================================================
# # Most affordable upgrades that can be suggested:
# 0 is Baseline
# "28": "Package 1, Wall & Roof Insulation + New Windows"
# "29": "Package 2, LED Lighting + Variable Speed HP RTU or HP Boilers"
# "31": "Package 4, Package 1 + Package 2"
# https://oedi-data-lake.s3.amazonaws.com/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2024/comstock_amy2018_release_1/timeseries_individual_buildings/by_state/upgrade=1/state=GA/100004-1.parquet
# =============================================================================
# fetch the building url from NREL
def fetch_building_urls(b_id: str, state: str = 'GA') -> dict:
    """Build the OEDI S3 parquet URL for each tracked upgrade of a building.

    Args:
        b_id: ComStock building id.
        state: two-letter state code used in the S3 key.

    Returns:
        dict: upgrade id -> download URL for that upgrade's timeseries file.
    """
    # Baseline (0) plus the affordable upgrade packages 28, 29 and 31.
    # Earlier experiment used: [21, 11, 26, 10, 19]
    upgrades = [0, 28, 29, 31]
    prefix = (
        "https://oedi-data-lake.s3.amazonaws.com/nrel-pds-building-stock/"
        "end-use-load-profiles-for-us-building-stock/2024/"
        "comstock_amy2018_release_1/timeseries_individual_buildings/by_state"
    )
    return {
        uid: f"{prefix}/upgrade={uid}/state={state}/{b_id}-{uid}.parquet"
        for uid in upgrades
    }
def get_datasets_from_comstock(b_id: str, url_dict: dict, max_attempts: int = 3, output_col: str = 'out.site_energy.total.energy_consumption') -> dict:
    """Download (or reuse cached) ComStock parquet files and load them.

    Each file is cached under ``datasets/comstock/downloaded/<b_id>/``,
    indexed by timestamp, resampled to hourly sums, and reduced to the
    requested output column.

    Args:
        b_id: building id.
        url_dict: upgrade id -> parquet URL (see fetch_building_urls).
        max_attempts: download retries, forwarded to file_downloader
            (previously accepted but never used).
        output_col: column kept from each timeseries (previously accepted
            but hard-coded inside the body).

    Returns:
        dict: upgrade id -> single-column DataFrame of hourly totals.
            Upgrades whose download failed are omitted.
    """

    def _load(path: str) -> pd.DataFrame:
        # Parquet -> hourly sums of the requested output column.
        frame = pd.read_parquet(path)
        frame.set_index('timestamp', inplace=True)
        frame = frame.resample('h').sum()
        return pd.DataFrame(frame[output_col])

    pd_dict = {}
    folder_path = f'datasets/comstock/downloaded/{b_id}/'
    os.makedirs(folder_path, exist_ok=True)
    for upgrade_id, url in url_dict.items():
        file_path = folder_path + f"{b_id}_{upgrade_id}.parquet"
        if os.path.exists(file_path):
            print(f'File already exists at {file_path}. Skipping download.\n')
        else:
            print(f'Downloading {url}\n')
            if not file_downloader(url, upgrade_id, file_path, max_attempts):
                # Bug fix: a failed download previously fell through to
                # pd_dict[k][...] and raised KeyError. Skip this upgrade.
                print(f'Failed to download after multiple attempts')
                continue
        pd_dict[upgrade_id] = _load(file_path)
        print(f'File loaded into dict.')
    return pd_dict
def file_downloader(url: str, upgrade_id: str, file_path: str, max_attempts: int = 3) -> bool:
    """Download ``url`` to ``file_path`` with a fixed number of retries.

    Args:
        url: source URL.
        upgrade_id: upgrade identifier, used only in log messages.
        file_path: destination path for the downloaded bytes.
        max_attempts: number of tries before giving up.

    Returns:
        bool: True on success, False after all attempts fail.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url)
            response.raise_for_status()
            with open(file_path, 'wb') as file:
                file.write(response.content)
            print(f'File saved at {file_path}.')
            return True
        except RequestException as e:
            # Include the exception so failures are diagnosable
            # (previously `e` was captured but never reported).
            print(f'Attempt {attempt} to download upgrade-ID {upgrade_id} failed: {e}\n')
            if attempt < max_attempts:
                # Brief back-off before retrying; no pointless sleep
                # after the final attempt (previous version always slept).
                time.sleep(2)
    print(f'Failed to download after multiple attempts')
    return False
# data = fetch_data()
# b_id = find_id()
# dict = fetch_building_urls(b_id)
# print(dict)
# pd_dict = get_datasets_from_comstock(b_id, dict)
# lighting_upgrade = pd_dict[21]
# print(lighting_upgrade.columns)
# print(lighting_upgrade["out.site_energy.total.energy_consumption"].sum())
# out.site_energy.total.energy_consumption in kWh
# out.site_energy.total.energy_consumption_intensity in kwh_per_ft^2 (kwh/square foot)