from __future__ import annotations

import argparse
import json
import logging
from pathlib import Path
from typing import Any, Dict, Iterable, Mapping

import pandas as pd

from src import data_prep
# Module-level logger; its level and format are set by main() via basicConfig.
LOGGER = logging.getLogger(__name__)
# Roll/turnout columns that every 2017+ entry below renames identically.
_TURNOUT_RENAMES: Dict[str, str] = {
    "Inscrits": "inscrits",
    "Abstentions": "abstentions",
    "Votants": "votants",
    "Blancs": "blancs",
    "Nuls": "nuls",
    "Exprimés": "exprimes",
}

# Turnout columns followed by the per-candidature vote count.
_COUNT_RENAMES: Dict[str, str] = {**_TURNOUT_RENAMES, "Voix": "voix"}

# Candidature columns, grouped by the header layout the entries share.
_LEGISLATIVES_RENAMES: Dict[str, str] = {
    **_COUNT_RENAMES,
    "Nuance": "code_candidature",
    "Nom": "nom_candidature",
}
_PRESIDENTIELLES_RENAMES: Dict[str, str] = {
    **_COUNT_RENAMES,
    "Nom": "nom_candidature",
    "Code nuance du candidat": "code_candidature",
}
_MUNICIPALES_2020_RENAMES: Dict[str, str] = {
    **_COUNT_RENAMES,
    "Nom": "nom_candidature",
    "Liste": "nom_candidature",
    "Code Nuance": "code_candidature",
}
_DEPARTEMENTALES_RENAMES: Dict[str, str] = {
    **_COUNT_RENAMES,
    "Nuance": "code_candidature",
    "Binôme": "nom_candidature",
}
_LISTE_RENAMES: Dict[str, str] = {
    **_COUNT_RENAMES,
    "Nuance Liste": "code_candidature",
    "Libellé Abrégé Liste": "nom_candidature",
}
_LISTE_BINOME_RENAMES: Dict[str, str] = {
    **_LISTE_RENAMES,
    "Binôme": "nom_candidature",
}

# Built-in description of the raw result files, keyed by file name.
#
# Each entry gives the election type and date, the round (either a fixed
# ``tour`` or the name of the column holding it in ``tour_column``), the
# columns listed under ``code_bv_cols``, an optional CSV separator, and the
# ``rename_map`` from raw headers to standardized names. Every entry receives
# its own ``rename_map`` copy so that editing one entry never affects another.
DEFAULT_META_CONFIG: Dict[str, Dict[str, Any]] = {
    "14_EU.csv": {
        "type_scrutin": "europeennes",
        "date_scrutin": "2014-05-25",
        "tour_column": "N° tour",
        "code_bv_cols": ["Code de la commune", "N° de bureau de vote"],
        "rename_map": {
            "Inscrits": "inscrits",
            "Votants": "votants",
            "Exprimés": "exprimes",
            "Nombre de voix du candidat": "voix",
            "Voix": "voix",
            "Nom du candidat": "nom_candidature",
            "Prénom du candidat": "nom_candidature",
            "Code nuance du candidat": "code_candidature",
        },
    },
    "14_MN14_T1T2.csv": {
        "type_scrutin": "municipales",
        "date_scrutin": "2014-03-23",
        "tour_column": "N° tour",
        "code_bv_cols": ["Code commune", "N° de bureau de vote"],
        "rename_map": {
            "Inscrits": "inscrits",
            "Votants": "votants",
            "Exprimés": "exprimes",
            "Nombre de voix": "voix",
            "Nom du candidat tête de liste": "nom_candidature",
            "Prénom du candidat tête de liste": "nom_candidature",
            "Code nuance de la liste": "code_candidature",
        },
    },
    "17_L_T1.csv": {
        "type_scrutin": "legislatives",
        "date_scrutin": "2017-06-11",
        "tour": 1,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": dict(_LEGISLATIVES_RENAMES),
    },
    "17_L_T2.csv": {
        "type_scrutin": "legislatives",
        "date_scrutin": "2017-06-18",
        "tour": 2,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": dict(_LEGISLATIVES_RENAMES),
    },
    "17_PR_T1.csv": {
        "type_scrutin": "presidentielles",
        "date_scrutin": "2017-04-23",
        "tour": 1,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": dict(_PRESIDENTIELLES_RENAMES),
    },
    "17_PR_T2.csv": {
        "type_scrutin": "presidentielles",
        "date_scrutin": "2017-05-07",
        "tour": 2,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": dict(_PRESIDENTIELLES_RENAMES),
    },
    "19_EU.csv": {
        "type_scrutin": "europeennes",
        "date_scrutin": "2019-05-26",
        "tour": 1,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": {
            **_COUNT_RENAMES,
            "Nom Tête de Liste": "nom_candidature",
            "Nuance Liste": "code_candidature",
        },
    },
    "20_MN_T1.csv": {
        "type_scrutin": "municipales",
        "date_scrutin": "2020-03-15",
        "tour": 1,
        "sep": ";",
        "code_bv_cols": ["Code de la commune", "Code B.Vote"],
        "rename_map": dict(_MUNICIPALES_2020_RENAMES),
    },
    "20_MN_T2.csv": {
        "type_scrutin": "municipales",
        "date_scrutin": "2020-06-28",
        "tour": 2,
        "code_bv_cols": ["Code de la commune", "Code B.Vote"],
        "rename_map": dict(_MUNICIPALES_2020_RENAMES),
    },
    "21_DEP_T1.csv": {
        "type_scrutin": "departementales",
        "date_scrutin": "2021-06-20",
        "tour": 1,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": dict(_DEPARTEMENTALES_RENAMES),
    },
    "21_DEP_T2.csv": {
        "type_scrutin": "departementales",
        "date_scrutin": "2021-06-27",
        "tour": 2,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": dict(_DEPARTEMENTALES_RENAMES),
    },
    "21_REG_T1.csv": {
        "type_scrutin": "regionales",
        "date_scrutin": "2021-06-20",
        "tour": 1,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": dict(_LISTE_RENAMES),
    },
    "21_REG_T2.csv": {
        "type_scrutin": "regionales",
        "date_scrutin": "2021-06-27",
        "tour": 2,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": dict(_LISTE_RENAMES),
    },
    "22_L_T1.csv": {
        "type_scrutin": "legislatives",
        "date_scrutin": "2022-06-12",
        "tour": 1,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": dict(_LEGISLATIVES_RENAMES),
    },
    "22_L_T2.csv": {
        "type_scrutin": "legislatives",
        "date_scrutin": "2022-06-19",
        "tour": 2,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": dict(_LEGISLATIVES_RENAMES),
    },
    "22_PR_T1.csv": {
        "type_scrutin": "presidentielles",
        "date_scrutin": "2022-04-10",
        "tour": 1,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": dict(_PRESIDENTIELLES_RENAMES),
    },
    "22_PR_T2.csv": {
        "type_scrutin": "presidentielles",
        "date_scrutin": "2022-04-24",
        "tour": 2,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": dict(_PRESIDENTIELLES_RENAMES),
    },
    "24_EU.csv": {
        "type_scrutin": "europeennes",
        "date_scrutin": "2024-06-09",
        "tour": 1,
        "code_bv_cols": ["Code commune", "Code BV"],
        "rename_map": {
            **_TURNOUT_RENAMES,
            # "Voix 1" is listed ahead of the plain "Voix" header.
            "Voix 1": "voix",
            "Voix": "voix",
            "Nuance liste 1": "code_candidature",
            "Libellé abrégé de liste 1": "nom_candidature",
        },
    },
    "24_L_T1.csv": {
        "type_scrutin": "legislatives",
        "date_scrutin": "2024-06-30",
        "tour": 1,
        "code_bv_cols": ["Code commune", "Code BV"],
        "rename_map": dict(_LISTE_BINOME_RENAMES),
    },
    "24_L_T2.csv": {
        "type_scrutin": "legislatives",
        "date_scrutin": "2024-07-07",
        "tour": 2,
        "code_bv_cols": ["Code commune", "Code BV"],
        "rename_map": dict(_LISTE_BINOME_RENAMES),
    },
}
# Optional external meta-config, resolved relative to this module: two levels
# above the directory that contains this file, under "config/".
DEFAULT_META_CONFIG_PATH = Path(__file__).resolve().parents[2] / "config" / "raw_sources.yaml"
| | def _resolve_meta_config(raw: Mapping[str, Mapping[str, Any]]) -> Dict[str, Dict[str, Any]]: |
| | resolved: Dict[str, Dict[str, Any]] = {} |
| |
|
| | def resolve_one(key: str, stack: list[str]) -> Dict[str, Any]: |
| | if key in resolved: |
| | return resolved[key] |
| | if key in stack: |
| | raise ValueError(f"Cycle detecte dans meta-config: {' -> '.join(stack + [key])}") |
| | meta = dict(raw[key]) |
| | base_key = meta.pop("copy_from", None) |
| | if base_key: |
| | if base_key not in raw: |
| | raise KeyError(f"copy_from cible introuvable: {base_key}") |
| | base = resolve_one(base_key, stack + [key]) |
| | merged = dict(base) |
| | rename_base = dict(base.get("rename_map", {})) |
| | rename_override = dict(meta.get("rename_map", {})) |
| | merged.update(meta) |
| | if rename_base or rename_override: |
| | merged["rename_map"] = {**rename_base, **rename_override} |
| | resolved[key] = merged |
| | else: |
| | resolved[key] = meta |
| | return resolved[key] |
| |
|
| | for name in raw: |
| | resolve_one(name, []) |
| | return resolved |
def load_meta_config(meta_path: Path | None) -> Dict[str, Dict[str, Any]]:
    """Load the per-file election meta-data used to standardize raw sources.

    When ``meta_path`` is ``None``, ``DEFAULT_META_CONFIG_PATH`` is used if it
    exists; otherwise the built-in ``DEFAULT_META_CONFIG`` is returned as-is
    (the module-level dict itself, not a copy). Files with a ``.yml``/``.yaml``
    suffix (case-insensitive) are parsed as YAML; any other suffix is parsed
    as JSON. ``copy_from`` references are expanded via ``_resolve_meta_config``.

    Args:
        meta_path: Path to a JSON/YAML meta-config file, or ``None``.

    Returns:
        Mapping of raw file name to its resolved meta-data.

    Raises:
        FileNotFoundError: If ``meta_path`` does not exist.
        RuntimeError: If a YAML file is requested but PyYAML is not installed.
        ValueError: If the file's top-level value is not a mapping.
    """
    if meta_path is None:
        if not DEFAULT_META_CONFIG_PATH.exists():
            return DEFAULT_META_CONFIG
        meta_path = DEFAULT_META_CONFIG_PATH
    if not meta_path.exists():
        raise FileNotFoundError(f"Meta-config file not found: {meta_path}")

    # Config keys contain accented column names, so decode explicitly instead
    # of relying on the platform locale; "utf-8-sig" also drops a leading BOM.
    encoding = "utf-8-sig"
    if meta_path.suffix.lower() in {".yml", ".yaml"}:
        try:
            import yaml
        except ImportError as exc:
            raise RuntimeError("PyYAML is required to read YAML meta-config files.") from exc
        # An empty YAML document loads as None; treat it as an empty config.
        raw = yaml.safe_load(meta_path.read_text(encoding=encoding)) or {}
    else:
        raw = json.loads(meta_path.read_text(encoding=encoding))
    if not isinstance(raw, dict):
        raise ValueError("Meta-config invalide: attendu un mapping de fichiers vers meta-donnees.")
    return _resolve_meta_config(raw)
def preprocess_all(raw_dir: Path, output_dir: Path, meta_config: Mapping[str, Mapping[str, Any]]) -> pd.DataFrame:
    """Standardize every configured raw file and persist the combined table.

    Each file named in ``meta_config`` that exists under ``raw_dir`` is passed
    to ``data_prep.standardize_election``; files that are absent are skipped
    and reported in a single warning. The resulting frames are concatenated,
    enriched with derived columns, checked with
    ``data_prep.validate_consistency`` (non-empty findings are only logged),
    and written to ``output_dir`` as ``elections_long.parquet`` and as a
    semicolon-separated ``elections_long.csv``.

    Args:
        raw_dir: Directory holding the raw CSV files.
        output_dir: Destination directory; created if it does not exist.
        meta_config: Mapping of raw file name to its meta-data.

    Returns:
        The combined long-format DataFrame.

    Raises:
        RuntimeError: If none of the configured files was found.
    """
    frames = []
    missing: list[str] = []
    for file_name, meta in meta_config.items():
        path = raw_dir / file_name
        if not path.exists():
            missing.append(file_name)
            continue
        LOGGER.info("Standardisation de %s", file_name)
        df_std = data_prep.standardize_election(
            path,
            meta,
            rename_map=meta.get("rename_map", {}),
            sep=meta.get("sep", ";"),
            # Several candidate encodings are handed over when none is configured.
            encoding=meta.get("encoding", ("cp1252", "utf-8-sig", "latin-1")),
            decimal=meta.get("decimal", ","),
        )
        frames.append(df_std)

    if missing:
        LOGGER.warning("Fichiers manquants ignorés: %s", ", ".join(sorted(missing)))
    if not frames:
        raise RuntimeError("Aucune donnée chargée : vérifier le dossier raw et la configuration meta.")

    elections_long = pd.concat(frames, ignore_index=True)
    elections_long["date_scrutin"] = pd.to_datetime(elections_long["date_scrutin"])
    elections_long["annee"] = elections_long["date_scrutin"].dt.year
    elections_long["type_scrutin"] = elections_long["type_scrutin"].str.lower()
    # The commune code is whatever precedes the first "-" in code_bv.
    elections_long["code_commune"] = elections_long["code_bv"].astype(str).str.split("-").str[0]

    issues = data_prep.validate_consistency(elections_long)
    for name, df_issue in issues.items():
        if len(df_issue) > 0:
            LOGGER.warning("%s : %s lignes a inspecter", name, len(df_issue))

    output_dir.mkdir(parents=True, exist_ok=True)
    parquet_path = output_dir / "elections_long.parquet"
    csv_path = output_dir / "elections_long.csv"
    elections_long.to_parquet(parquet_path, index=False)
    elections_long.to_csv(csv_path, sep=";", index=False)
    LOGGER.info("Long format sauvegarde (%s lignes) -> %s / %s", len(elections_long), parquet_path, csv_path)
    return elections_long
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the preprocessing script.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what a plain ``parse_args()`` call
            has always done.

    Returns:
        Namespace with ``raw_dir`` and ``output_dir`` (``Path``) and
        ``meta_config`` (``Path`` or ``None``).
    """
    parser = argparse.ArgumentParser(description="Prétraitement des fichiers bruts en format long standardisé.")
    parser.add_argument("--raw-dir", type=Path, default=Path("data/raw"), help="Répertoire des fichiers bruts CSV.")
    parser.add_argument("--output-dir", type=Path, default=Path("data/interim"), help="Destination du format long harmonisé.")
    parser.add_argument(
        "--meta-config",
        type=Path,
        default=None,
        # The help text mirrors load_meta_config: the external YAML file takes
        # precedence over the embedded configuration when it exists.
        help=(
            "Chemin vers un fichier JSON/YAML décrivant les meta-données des scrutins. "
            "Par défaut, utilise config/raw_sources.yaml s'il existe, sinon la configuration embarquée."
        ),
    )
    return parser.parse_args(argv)
def main() -> None:
    """Run the preprocessing pipeline from the command line.

    Configures INFO-level logging, parses the CLI options, loads the
    meta-config and hands everything to ``preprocess_all``.
    """
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    args = parse_args()
    meta_config = load_meta_config(args.meta_config)
    preprocess_all(args.raw_dir, args.output_dir, meta_config)


if __name__ == "__main__":
    main()