Dev Goyal committed on
Commit ·
25d293a
1
Parent(s): 012bcc4
refactor: improve 8-K filing retrieval logic with Item 2.02 validation and update ingestion script error handling
Browse files- Dockerfile +1 -1
- core/earnings_tools.py +31 -18
- scripts/ingest_earnings_calls.py +12 -5
Dockerfile
CHANGED
|
@@ -28,7 +28,7 @@ ENV PYTHONPATH=/app
|
|
| 28 |
RUN python scripts/ingest.py --tickers AAPL MSFT TSLA GOOGL NVDA
|
| 29 |
|
| 30 |
# Ingest SEC 8-K / earnings call data for demo tickers
|
| 31 |
-
RUN python scripts/ingest_earnings_calls.py --tickers AAPL MSFT GOOGL NVDA TSLA --quarters
|
| 32 |
|
| 33 |
# ── Supervisord config (runs both services) ─────────────────────────────────
|
| 34 |
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
|
|
|
|
| 28 |
RUN python scripts/ingest.py --tickers AAPL MSFT TSLA GOOGL NVDA
|
| 29 |
|
| 30 |
# Ingest SEC 8-K / earnings call data for demo tickers
|
| 31 |
+
RUN python scripts/ingest_earnings_calls.py --tickers AAPL MSFT GOOGL NVDA TSLA --quarters Q3-2025 Q4-2025 Q1-2026
|
| 32 |
|
| 33 |
# ── Supervisord config (runs both services) ─────────────────────────────────
|
| 34 |
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
|
core/earnings_tools.py
CHANGED
|
@@ -73,9 +73,21 @@ def parse_quarter(quarter_str: str) -> tuple[int, int]:
|
|
| 73 |
return q, y
|
| 74 |
|
| 75 |
|
| 76 |
-
def
|
| 77 |
-
"""
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
|
| 81 |
# ---------------------------------------------------------------------------
|
|
@@ -151,28 +163,29 @@ def fetch_transcript_sec_8k(ticker: str, quarter: int, year: int) -> Optional[st
|
|
| 151 |
resp.raise_for_status()
|
| 152 |
filings = resp.json()["filings"]["recent"]
|
| 153 |
|
| 154 |
-
|
| 155 |
best_doc_url = None
|
| 156 |
|
| 157 |
for i, form in enumerate(filings["form"]):
|
| 158 |
if form != "8-K":
|
| 159 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
filed = filings["filingDate"][i] # "2025-01-30"
|
| 161 |
filed_year, filed_month = int(filed[:4]), int(filed[5:7])
|
| 162 |
|
| 163 |
-
#
|
| 164 |
-
#
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
m = target_month + offset
|
| 169 |
-
y = year
|
| 170 |
-
if m > 12:
|
| 171 |
-
m -= 12
|
| 172 |
-
y += 1
|
| 173 |
-
acceptable.add((y, m))
|
| 174 |
-
|
| 175 |
-
if (filed_year, filed_month) in acceptable:
|
| 176 |
accession = filings["accessionNumber"][i]
|
| 177 |
acc_clean = accession.replace("-", "")
|
| 178 |
primary_doc = filings["primaryDocument"][i]
|
|
@@ -181,7 +194,7 @@ def fetch_transcript_sec_8k(ticker: str, quarter: int, year: int) -> Optional[st
|
|
| 181 |
f"{cik.lstrip('0')}/{acc_clean}/{primary_doc}"
|
| 182 |
)
|
| 183 |
best_doc_url = doc_url
|
| 184 |
-
break # Take the first matching 8-K
|
| 185 |
|
| 186 |
if not best_doc_url:
|
| 187 |
print(f"[Earnings Ingest] No matching SEC 8-K found for {ticker} Q{quarter}-{year}.")
|
|
|
|
| 73 |
return q, y
|
| 74 |
|
| 75 |
|
| 76 |
+
def _get_quarter_month_range(q: int) -> list[int]:
|
| 77 |
+
"""
|
| 78 |
+
Get the month range (quarter end month + 3 months after) for a given quarter.
|
| 79 |
+
This is used as a heuristic to find the relevant 8-K filing.
|
| 80 |
+
"""
|
| 81 |
+
start_month = {1: 3, 2: 6, 3: 9, 4: 12}[q]
|
| 82 |
+
# We allow a very wide range: 2 months before the standard month to 4 months after.
|
| 83 |
+
# This covers most fiscal year offsets (e.g. AAPL Q1 ends in Dec, reported in Jan/Feb).
|
| 84 |
+
months = []
|
| 85 |
+
for i in range(-2, 5):
|
| 86 |
+
m = start_month + i
|
| 87 |
+
if m < 1: m += 12
|
| 88 |
+
if m > 12: m -= 12
|
| 89 |
+
months.append(m)
|
| 90 |
+
return months
|
| 91 |
|
| 92 |
|
| 93 |
# ---------------------------------------------------------------------------
|
|
|
|
| 163 |
resp.raise_for_status()
|
| 164 |
filings = resp.json()["filings"]["recent"]
|
| 165 |
|
| 166 |
+
acceptable_months = _get_quarter_month_range(quarter)
|
| 167 |
best_doc_url = None
|
| 168 |
|
| 169 |
for i, form in enumerate(filings["form"]):
|
| 170 |
if form != "8-K":
|
| 171 |
continue
|
| 172 |
+
|
| 173 |
+
# Check for Item 2.02 (Results of Operations and Financial Condition)
|
| 174 |
+
# Some filings have items like '1.01,2.02,9.01', some just '2.02'
|
| 175 |
+
items = str(filings.get("items", [""])[i])
|
| 176 |
+
if "2.02" not in items:
|
| 177 |
+
# No Item 2.02 listed: this 8-K is not an earnings release, so skip it
|
| 178 |
+
# and keep searching the remaining 8-Ks (there is no title-based fallback).
|
| 179 |
+
continue
|
| 180 |
+
|
| 181 |
filed = filings["filingDate"][i] # "2025-01-30"
|
| 182 |
filed_year, filed_month = int(filed[:4]), int(filed[5:7])
|
| 183 |
|
| 184 |
+
# Logic: If the filing is within the target year (or next year if Q4)
|
| 185 |
+
# and the month is in our heuristic range.
|
| 186 |
+
is_valid_year = (filed_year == year) or (quarter == 4 and filed_year == year + 1)
|
| 187 |
+
|
| 188 |
+
if is_valid_year and filed_month in acceptable_months:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
accession = filings["accessionNumber"][i]
|
| 190 |
acc_clean = accession.replace("-", "")
|
| 191 |
primary_doc = filings["primaryDocument"][i]
|
|
|
|
| 194 |
f"{cik.lstrip('0')}/{acc_clean}/{primary_doc}"
|
| 195 |
)
|
| 196 |
best_doc_url = doc_url
|
| 197 |
+
break # Take the first matching 8-K (most recent)
|
| 198 |
|
| 199 |
if not best_doc_url:
|
| 200 |
print(f"[Earnings Ingest] No matching SEC 8-K found for {ticker} Q{quarter}-{year}.")
|
scripts/ingest_earnings_calls.py
CHANGED
|
@@ -98,12 +98,19 @@ def main():
|
|
| 98 |
}.get(r["status"], "β")
|
| 99 |
print(f" {icon} {r['ticker']} {r['quarter']}: {r['status']}")
|
| 100 |
|
| 101 |
-
|
| 102 |
-
if failed
|
| 103 |
-
|
|
|
|
|
|
|
| 104 |
sys.exit(1)
|
| 105 |
-
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
|
| 109 |
if __name__ == "__main__":
|
|
|
|
| 98 |
}.get(r["status"], "β")
|
| 99 |
print(f" {icon} {r['ticker']} {r['quarter']}: {r['status']}")
|
| 100 |
|
| 101 |
+
errors = [r for r in results if r["status"] == "error"]
|
| 102 |
+
failed = [r for r in results if r["status"] == "failed"]
|
| 103 |
+
|
| 104 |
+
if errors:
|
| 105 |
+
print(f"\n[CRITICAL] {len(errors)} ingest(s) hit technical errors. Check logs.")
|
| 106 |
sys.exit(1)
|
| 107 |
+
|
| 108 |
+
if failed:
|
| 109 |
+
print(f"\n[INFO] {len(failed)} transcript(s) could not be found (likely not yet reported).")
|
| 110 |
+
print("This is not treated as a build failure.")
|
| 111 |
+
|
| 112 |
+
print("\nIngestion process completed successfully.")
|
| 113 |
+
sys.exit(0)
|
| 114 |
|
| 115 |
|
| 116 |
if __name__ == "__main__":
|