# LSTM-forecaster/utils/fetch_comstock_data.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 13 14:04:47 2024
@author: Nikhil Kapila
"""
import pandas as pd
import os
import requests
from requests.exceptions import RequestException
import time
# ComStock floor-area category labels (building square footage bins),
# ordered smallest to largest; find_closest_floor_area_category ranks
# these by proximity to a requested floor area.
size_bins = ['1001_5000', '5001_10000', '10001_25000', '25001_50000',
             '50001_100000', '100001_200000', '200001_500000']
def fetch_data() -> pd.DataFrame:
    """Load the Georgia ComStock building metadata table from disk.

    Returns:
        pd.DataFrame: one row per building, with metadata columns such as
        ``in.floor_area_category`` and ``bldg_id``.
    """
    # Alternate metadata files tried during development:
    #   datasets/comstock/GA_filtered_building_list.csv
    #   datasets/comstock/GA_baseline_basic_metadata_and_annual_results.csv
    metadata_path = 'datasets/comstock/GA_metadata.csv'
    return pd.read_csv(metadata_path)
def parse_floor_area_category(floor_area_category):
    """Parse a floor-area bin label such as '5001_10000' into (lower, upper).

    A label with no underscore is treated as a degenerate bin where
    lower == upper. Returns (None, None) when the label is not numeric.
    """
    try:
        if "_" in floor_area_category:
            low_str, high_str = floor_area_category.split("_")[:2]
            return int(low_str), int(high_str)
        # Single number: both bounds collapse to the same value.
        value = int(floor_area_category)
        return value, value
    except ValueError:
        # Non-numeric label (e.g. free text) -- signal "unparseable".
        return None, None
def calculate_midpoint(floor_area_category):
    """Return the arithmetic midpoint of a floor-area bin label.

    Returns None when the label cannot be parsed into numeric bounds.
    """
    bounds = parse_floor_area_category(floor_area_category)
    if None in bounds:
        return None
    return sum(bounds) / 2
def find_closest_floor_area_category(input_floor_area, predefined_bins):
    """Rank predefined floor-area bins by distance to the input's midpoint.

    Args:
        input_floor_area: bin label such as '5001_10000'.
        predefined_bins: iterable of candidate bin labels.

    Returns:
        list: the bins sorted from closest to farthest midpoint.

    Raises:
        ValueError: if ``input_floor_area`` cannot be parsed into a midpoint.
            (Previously this surfaced as an opaque TypeError from the
            ``abs(None - ...)`` inside the sort key.)
    """
    input_midpoint = calculate_midpoint(input_floor_area)
    if input_midpoint is None:
        raise ValueError(f"Unparseable floor area category: {input_floor_area!r}")

    def distance(bin_label):
        # Unparseable bins are pushed to the end instead of crashing the key.
        midpoint = calculate_midpoint(bin_label)
        if midpoint is None:
            return (1, 0.0)
        return (0, abs(midpoint - input_midpoint))

    return sorted(predefined_bins, key=distance)
def find_id(floor_area: str = '5001_10000', in_vintage: str = '1980 to 1989', building_type: str = 'Office') -> int:
    """Pick a representative ComStock building id for the requested profile.

    Searches the GA metadata for a building of the given type in the
    floor-area bin closest to ``floor_area``, preferring lower total site
    energy consumption. If no type match exists, falls back to the building
    whose floor-area midpoint is nearest the input.

    Args:
        floor_area: floor-area bin label, e.g. '5001_10000'.
        in_vintage: construction vintage. Currently unused -- the vintage
            filter is commented out below; kept for interface stability.
        building_type: ComStock building type group, e.g. 'Office'.

    Returns:
        int: the selected ``bldg_id``.

    Raises:
        ValueError: when no building can be selected at all.
    """
    # Normalize the UI label to the metadata's value.
    if building_type in ('Lodging / Residential', 'Lodging'):
        building_type = 'Lodging'
    print(building_type)
    filter_on = fetch_data()
    closest_bins = find_closest_floor_area_category(floor_area, size_bins)
    print(f"Input floor area: {floor_area}, Closest bins: {closest_bins}")
    for closest_category in closest_bins:
        conditions = {
            'in.floor_area_category': [closest_category],
            'in.comstock_building_type_group': [building_type],
            # 'in.vintage': [in_vintage],
            # 'in.cluster_name': [cluster_name]
        }
        query_string = ' & '.join([f"`{col}` in {values}" for col, values in conditions.items()])
        filtered_bldgs = filter_on.query(query_string)
        if not filtered_bldgs.empty:
            # Prefer the most energy-efficient matching building.
            filtered_bldgs1 = filtered_bldgs.sort_values(
                by='calc.weighted.site_energy.total.energy_consumption..tbtu', ascending=True)
            return filtered_bldgs1.iloc[0]['bldg_id']
    # Fallback: no type match at all -- pick the building whose floor-area
    # midpoint is nearest the requested one. (The previous min/max heuristic
    # returned the LARGEST building for any mid-range input.)
    filter_on['midpoint'] = filter_on['in.floor_area_category'].apply(calculate_midpoint)
    input_midpoint = calculate_midpoint(floor_area)
    if input_midpoint is not None:
        candidates = filter_on.dropna(subset=['midpoint'])
        if not candidates.empty:
            closest_idx = (candidates['midpoint'] - input_midpoint).abs().idxmin()
            return candidates.loc[closest_idx, 'bldg_id']
    raise ValueError("No buildings match the given criteria.")
# =============================================================================
# # Most affordable upgrades that can be suggested:
# 0 is Baseline
# "28": "Package 1, Wall & Roof Insulation + New Windows"
# "29": "Package 2, LED Lighting + Variable Speed HP RTU or HP Boilers"
# "31": "Package 4, Package 1 + Package 2"
# https://oedi-data-lake.s3.amazonaws.com/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2024/comstock_amy2018_release_1/timeseries_individual_buildings/by_state/upgrade=1/state=GA/100004-1.parquet
# =============================================================================
# fetch the building url from NREL
def fetch_building_urls(b_id: str, state: str = 'GA') -> dict:
    """Build the OEDI S3 parquet URL for each tracked upgrade of a building.

    Args:
        b_id: ComStock building id.
        state: two-letter state code used in the S3 key.

    Returns:
        dict: upgrade id -> download URL for that upgrade's timeseries file.
    """
    # Baseline (0) plus the affordable upgrade packages 28, 29 and 31.
    # Earlier experiment used: [21, 11, 26, 10, 19]
    upgrades = [0, 28, 29, 31]
    prefix = (
        "https://oedi-data-lake.s3.amazonaws.com/nrel-pds-building-stock/"
        "end-use-load-profiles-for-us-building-stock/2024/"
        "comstock_amy2018_release_1/timeseries_individual_buildings/by_state"
    )
    return {
        uid: f"{prefix}/upgrade={uid}/state={state}/{b_id}-{uid}.parquet"
        for uid in upgrades
    }
def get_datasets_from_comstock(b_id: str, url_dict: dict, max_attempts: int = 3, output_col: str = 'out.site_energy.total.energy_consumption') -> dict:
    """Download (or reuse cached) ComStock parquet files and load them.

    Each file is cached under ``datasets/comstock/downloaded/<b_id>/``,
    indexed by timestamp, resampled to hourly sums, and reduced to the
    requested output column.

    Args:
        b_id: building id.
        url_dict: upgrade id -> parquet URL (see fetch_building_urls).
        max_attempts: download retries, forwarded to file_downloader
            (previously accepted but never used).
        output_col: column kept from each timeseries (previously accepted
            but hard-coded inside the body).

    Returns:
        dict: upgrade id -> single-column DataFrame of hourly totals.
            Upgrades whose download failed are omitted.
    """

    def _load(path: str) -> pd.DataFrame:
        # Parquet -> hourly sums of the requested output column.
        frame = pd.read_parquet(path)
        frame.set_index('timestamp', inplace=True)
        frame = frame.resample('h').sum()
        return pd.DataFrame(frame[output_col])

    pd_dict = {}
    folder_path = f'datasets/comstock/downloaded/{b_id}/'
    os.makedirs(folder_path, exist_ok=True)
    for upgrade_id, url in url_dict.items():
        file_path = folder_path + f"{b_id}_{upgrade_id}.parquet"
        if os.path.exists(file_path):
            print(f'File already exists at {file_path}. Skipping download.\n')
        else:
            print(f'Downloading {url}\n')
            if not file_downloader(url, upgrade_id, file_path, max_attempts):
                # Bug fix: a failed download previously fell through to
                # pd_dict[k][...] and raised KeyError. Skip this upgrade.
                print(f'Failed to download after multiple attempts')
                continue
        pd_dict[upgrade_id] = _load(file_path)
        print(f'File loaded into dict.')
    return pd_dict
def file_downloader(url: str, upgrade_id: str, file_path: str, max_attempts: int = 3) -> bool:
    """Download ``url`` to ``file_path`` with a fixed number of retries.

    Args:
        url: source URL.
        upgrade_id: upgrade identifier, used only in log messages.
        file_path: destination path for the downloaded bytes.
        max_attempts: number of tries before giving up.

    Returns:
        bool: True on success, False after all attempts fail.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url)
            response.raise_for_status()
            with open(file_path, 'wb') as file:
                file.write(response.content)
            print(f'File saved at {file_path}.')
            return True
        except RequestException as e:
            # Include the exception so failures are diagnosable
            # (previously `e` was captured but never reported).
            print(f'Attempt {attempt} to download upgrade-ID {upgrade_id} failed: {e}\n')
            if attempt < max_attempts:
                # Brief back-off before retrying; no pointless sleep
                # after the final attempt (previous version always slept).
                time.sleep(2)
    print(f'Failed to download after multiple attempts')
    return False
# data = fetch_data()
# b_id = find_id()
# dict = fetch_building_urls(b_id)
# print(dict)
# pd_dict = get_datasets_from_comstock(b_id, dict)
# lighting_upgrade = pd_dict[21]
# print(lighting_upgrade.columns)
# print(lighting_upgrade["out.site_energy.total.energy_consumption"].sum())
# out.site_energy.total.energy_consumption in kWh
# out.site_energy.total.energy_consumption_intensity in kwh_per_ft^2 (kwh/square foot)