File size: 5,387 Bytes
5c22923
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
from datetime import datetime, date, timedelta

import pandas as pd
import pytz
import requests

# --- Configuration ---
# Coordinates handed to the API requests as "latitude" / "longitude".
LATITUDE, LONGITUDE = 24.86, 67.01
# Path of the CSV file the assembled hourly dataset is written to.
HISTORICAL_CSV = "data/last_7_days_hourly_data.csv"
# Timezone name used both for "now" and as the API "timezone" parameter.
TIMEZONE = "Asia/Karachi"

# Endpoints and hourly variable lists shared by the historical and recent fetches.
_ARCHIVE_WEATHER_URL = "https://archive-api.open-meteo.com/v1/archive"
_FORECAST_WEATHER_URL = "https://api.open-meteo.com/v1/forecast"
_AIR_QUALITY_URL = "https://air-quality-api.open-meteo.com/v1/air-quality"
_WEATHER_HOURLY_VARS = "temperature_2m,relative_humidity_2m,wind_speed_10m"
_AIR_QUALITY_HOURLY_VARS = "pm10,pm2_5,carbon_monoxide,nitrogen_dioxide,us_aqi"


def _fetch_hourly_frame(url, params, timeout):
    """Request *url* and return the 'hourly' section of its JSON as a DataFrame.

    Raises:
        KeyError: if the JSON payload has no 'hourly' key.
        Exception: request and JSON-decoding errors propagate unchanged.
    """
    response = requests.get(url, params=params, timeout=timeout)
    return pd.DataFrame(response.json()['hourly'])


def _fetch_merged_hourly(weather_url, latitude, longitude, start_date, end_date, timeout):
    """Fetch weather and air-quality hourly data for a date range, merged on 'time'."""
    shared_params = {
        "latitude": latitude,
        "longitude": longitude,
        "start_date": start_date.strftime("%Y-%m-%d"),
        "end_date": end_date.strftime("%Y-%m-%d"),
        "timezone": TIMEZONE,
    }
    df_weather = _fetch_hourly_frame(
        weather_url, {**shared_params, "hourly": _WEATHER_HOURLY_VARS}, timeout
    )
    df_air_quality = _fetch_hourly_frame(
        _AIR_QUALITY_URL, {**shared_params, "hourly": _AIR_QUALITY_HOURLY_VARS}, timeout
    )

    merged = pd.merge(df_weather, df_air_quality, on='time')
    merged['time'] = pd.to_datetime(merged['time'])
    return merged


def get_complete_past_week_hourly_data(latitude, longitude, filename, timeout=30):
    """
    Assemble the last 7 days of hourly weather and air-quality data into a CSV.

    Data from the archive endpoint (8 to 2 days ago) is combined with data
    from the forecast endpoint (2 days ago to today). Overlapping timestamps
    are de-duplicated in favour of the recent data, rows later than "now" or
    older than 7 days are dropped, columns are renamed, rows with missing
    values are removed, and the result is written to *filename*.

    Args:
        latitude: Latitude sent to the APIs.
        longitude: Longitude sent to the APIs.
        filename: Path of the CSV file to write. Its parent directory is
            created if it does not exist.
        timeout: Timeout in seconds applied to every HTTP request, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        None. If both fetches fail, a fatal message is printed and nothing
        is written.
    """
    print("--- Starting full historical data assembly ---")

    # Compute "now" in the target timezone so the date ranges are correct
    # even when the host clock is in another zone (e.g. a UTC server).
    now_local = datetime.now(pytz.timezone(TIMEZONE))
    today_local = now_local.date()

    # --- Step 1: Fetch HISTORICAL data (Archive API) ---
    hist_end_date = today_local - timedelta(days=2)
    hist_start_date = today_local - timedelta(days=8)

    print(f"Fetching historical archive from {hist_start_date} to {hist_end_date}...")
    try:
        df_historical = _fetch_merged_hourly(
            _ARCHIVE_WEATHER_URL, latitude, longitude, hist_start_date, hist_end_date, timeout
        )
        print(f"-> OK: Fetched {len(df_historical)} records from archive.")
    except KeyError:
        # A payload without 'hourly' is treated as "no archive data for this range".
        print("!!! WARNING: Historical data not available in the requested range (this is normal). Proceeding with recent data.")
        df_historical = pd.DataFrame()
    except Exception as e:
        print(f"!!! WARNING: Could not fetch historical data. Reason: {e}")
        df_historical = pd.DataFrame()

    # --- Step 2: Fetch RECENT data (Forecast API) ---
    recent_start_date = today_local - timedelta(days=2)
    recent_end_date = today_local

    print(f"Fetching recent measured data from {recent_start_date} to {recent_end_date}...")
    try:
        df_recent = _fetch_merged_hourly(
            _FORECAST_WEATHER_URL, latitude, longitude, recent_start_date, recent_end_date, timeout
        )
        print(f"-> OK: Fetched {len(df_recent)} recent records.")
    except Exception as e:
        print(f"!!! WARNING: Could not fetch recent data. Reason: {e}")
        df_recent = pd.DataFrame()

    # --- Step 3: Combine, De-duplicate, and Filter ---
    print("Combining and cleaning final dataset...")
    # Only concatenate frames that actually hold data: passing empty
    # placeholder frames to pd.concat is deprecated in recent pandas.
    frames = [frame for frame in (df_historical, df_recent) if not frame.empty]
    if not frames:
        print("!!! FATAL: Both historical and recent data fetches failed. Cannot proceed.")
        return

    df_combined = pd.concat(frames, ignore_index=True)
    # Historical comes first in the concat, so keep='last' prefers the
    # recent data wherever the two ranges overlap.
    df_combined = df_combined.drop_duplicates(subset='time', keep='last').sort_values(by='time')

    # The API timestamps are naive local times; make them timezone-aware so
    # they can be compared with the aware 'now'.
    df_combined['time'] = df_combined['time'].dt.tz_localize(TIMEZONE, ambiguous='infer')

    # Keep rows within [now - 7 days, now]; this drops the future hours of today.
    seven_days_ago = now_local - timedelta(days=7)
    in_window = (df_combined['time'] <= now_local) & (df_combined['time'] >= seven_days_ago)
    df_final_week = df_combined[in_window]

    # --- Step 4: Final Rename and Save ---
    df_final = df_final_week.rename({
        'time': 'timestamp',
        'temperature_2m': 'temperature',
        'relative_humidity_2m': 'humidity',
        'wind_speed_10m': 'wind_speed',
        'pm2_5': 'pm25',
        'us_aqi': 'aqi'
    }, axis='columns').dropna()

    # Create the output directory first so a fresh checkout without it
    # does not fail after all the network work has been done.
    output_dir = os.path.dirname(filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df_final.to_csv(filename, index=False)

    print(f"\n DONE ")
    print(f"Saved {len(df_final)} hourly records to '{filename}', covering a complete and up-to-date 7-day period.")

# Script entry point: build the dataset for the configured location and path.
if __name__ == "__main__":
    get_complete_past_week_hourly_data(
        latitude=LATITUDE,
        longitude=LONGITUDE,
        filename=HISTORICAL_CSV,
    )