"""
GitHub repository connector.

Clones a public GitHub repo to a temporary local directory and returns
the path for downstream parsing.
"""

from __future__ import annotations

import asyncio
import logging
import os
import re
import shutil
import subprocess  # noqa: S404
import tempfile
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)

# Regex for validating GitHub URLs. Captures the ``owner`` and ``repo`` path
# segments; an optional ``.git`` suffix and any trailing sub-path
# (``/tree/main`` ...) are accepted but not captured.
GITHUB_URL_RE = re.compile(
    r"^https?://github\.com/(?P<owner>[A-Za-z0-9_.\-]+)/(?P<repo>[A-Za-z0-9_.\-]+?)(?:\.git)?(?:/.*)?$"
)

# Path segments the character class above lets through but which would make
# ``os.path.join(target_dir, repo)`` point at ``target_dir`` or its parent.
_RESERVED_SEGMENTS = frozenset({".", ".."})

# Upper bound, in seconds, for the ``git clone`` subprocess fallback.
_CLONE_TIMEOUT_SECONDS = 120


def _validate_github_url(url: str) -> re.Match[str]:
    """Return the regex match for *url*, or raise if it is not a GitHub repo URL.

    Surrounding whitespace is ignored. The returned match exposes the
    ``owner`` and ``repo`` named groups.

    Raises:
        ValueError: If the URL does not match ``GITHUB_URL_RE`` or if the
            owner/repo segment is ``.`` or ``..``.
    """
    match = GITHUB_URL_RE.match(url.strip())
    if (
        match is None
        or match.group("owner") in _RESERVED_SEGMENTS
        or match.group("repo") in _RESERVED_SEGMENTS
    ):
        raise ValueError(
            f"Invalid GitHub URL: {url!r}. "
            "Expected format: https://github.com/<owner>/<repo>"
        )
    return match


def _build_clone_url(owner: str, repo: str) -> str:
    """Return the canonical HTTPS clone URL for *owner*/*repo*."""
    return f"https://github.com/{owner}/{repo}.git"


def _run_clone(clone_url: str, dest: str) -> None:
    """Shallow-clone *clone_url* into *dest*; raise RuntimeError on any failure.

    GitPython is used when it can be imported, otherwise the ``git``
    executable is invoked through ``subprocess``.
    """
    # Only the import is guarded, so an ImportError raised while cloning is
    # not mistaken for "GitPython is unavailable".
    try:
        import git  # type: ignore
    except ImportError:
        git = None

    if git is not None:
        try:
            git.Repo.clone_from(
                clone_url,
                dest,
                depth=1,  # shallow clone — we only need the code, not history
                no_single_branch=True,
            )
        except git.GitCommandError as exc:
            # Surface the documented exception type; the original stays
            # reachable through ``__cause__``.
            raise RuntimeError(f"git clone failed: {exc}") from exc
        return

    try:
        result = subprocess.run(  # noqa: S603 S607
            ["git", "clone", "--depth", "1", clone_url, dest],
            capture_output=True,
            text=True,
            timeout=_CLONE_TIMEOUT_SECONDS,
            check=False,
        )
    except FileNotFoundError as exc:
        raise RuntimeError(
            "git clone failed: 'git' executable not found on PATH"
        ) from exc
    except subprocess.TimeoutExpired as exc:
        raise RuntimeError(
            f"git clone failed: timed out after {_CLONE_TIMEOUT_SECONDS}s"
        ) from exc

    if result.returncode != 0:
        raise RuntimeError(
            f"git clone failed (exit {result.returncode}): {result.stderr.strip()}"
        )


def clone_repo(url: str, target_dir: Optional[str] = None) -> str:
    """
    Clone a GitHub repository into *target_dir* (or a temp dir).

    The URL is validated first and reduced to its canonical
    ``https://github.com/<owner>/<repo>.git`` form, so any sub-path after
    the repo name is ignored. The clone is shallow (depth 1).

    Args:
        url: GitHub repository URL.
        target_dir: Existing directory to clone into. When omitted, a fresh
            temporary directory is created; it is removed again if the
            clone fails.

    Returns:
        The path to the cloned repository root (``<target_dir>/<repo>``).

    Raises:
        ValueError: If the URL is invalid.
        RuntimeError: If git clone fails.
    """
    match = _validate_github_url(url)
    repo = match.group("repo")
    # Build a clean clone URL (strip any path suffix after repo name)
    clone_url = _build_clone_url(match.group("owner"), repo)

    owns_target_dir = target_dir is None
    if target_dir is None:
        target_dir = tempfile.mkdtemp(prefix="codesentry_")

    dest = os.path.join(target_dir, repo)
    logger.info("Cloning %s → %s", clone_url, dest)

    try:
        _run_clone(clone_url, dest)
    except BaseException:
        # Never leave behind a temp dir the caller has no handle on.
        if owns_target_dir:
            shutil.rmtree(target_dir, ignore_errors=True)
        raise
    return dest


def cleanup_repo(path: str) -> None:
    """Remove a cloned repository directory from disk (best effort).

    Never raises: a directory that is already gone counts as success, and
    a directory that could not be removed is reported with a warning.
    """
    try:
        shutil.rmtree(path, ignore_errors=True)
    except Exception as exc:  # deliberate: cleanup must not mask the real error
        logger.warning("Failed to clean up %s: %s", path, exc)
        return

    # ``ignore_errors=True`` hides failures, so check the outcome explicitly.
    if os.path.lexists(path):
        logger.warning("Failed to clean up %s: %s", path, "path still exists")
    else:
        logger.debug("Cleaned up repo dir: %s", path)


def get_repo_info(url: str) -> dict[str, str]:
    """Extract owner and repo name from a GitHub URL without cloning.

    Returns:
        A dict with the keys ``owner``, ``repo`` and ``clone_url``.

    Raises:
        ValueError: If the URL is invalid.
    """
    match = _validate_github_url(url)
    owner = match.group("owner")
    repo = match.group("repo")
    return {
        "owner": owner,
        "repo": repo,
        "clone_url": _build_clone_url(owner, repo),
    }


class GitHubConnector:
    """
    Context-manager wrapper around clone/cleanup.

    Entering clones the repository into a fresh temporary directory and
    yields the path of the clone; exiting removes that temporary directory.

    Usage::

        with GitHubConnector("https://github.com/foo/bar") as repo_dir:
            files = parse_directory(repo_dir)

        async with GitHubConnector("https://github.com/foo/bar") as repo_dir:
            files = parse_directory(repo_dir)
    """

    def __init__(self, url: str) -> None:
        self.url = url
        self._repo_dir: Optional[str] = None
        self._tmp_dir: Optional[str] = None

    def __enter__(self) -> str:
        tmp_dir = tempfile.mkdtemp(prefix="codesentry_")
        try:
            repo_dir = clone_repo(self.url, target_dir=tmp_dir)
        except BaseException:
            # ``__exit__`` is not called when ``__enter__`` raises, so the
            # temp dir has to be removed here.
            cleanup_repo(tmp_dir)
            raise
        self._tmp_dir = tmp_dir
        self._repo_dir = repo_dir
        return repo_dir

    def __exit__(self, *_: object) -> None:
        if self._tmp_dir:
            cleanup_repo(self._tmp_dir)
        # Reset so a second exit is a no-op and the instance can be reused.
        self._tmp_dir = None
        self._repo_dir = None

    # Async support: cloning and deleting are blocking operations, so they
    # run in the default executor instead of stalling the event loop.

    async def __aenter__(self) -> str:
        loop = asyncio.get_running_loop()
        clone_future = loop.run_in_executor(None, self.__enter__)
        try:
            # Shielded: the worker thread cannot be interrupted, so keep a
            # handle on its result even if the waiting task is cancelled.
            return await asyncio.shield(clone_future)
        except asyncio.CancelledError:
            clone_future.add_done_callback(self._discard_after_cancel)
            raise

    async def __aexit__(self, *args: object) -> None:
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, self.__exit__, *args)

    def _discard_after_cancel(self, future: asyncio.Future) -> None:
        """Remove a clone that finished after its ``async with`` was cancelled."""
        if not future.cancelled() and future.exception() is None:
            self.__exit__(None, None, None)