Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import json | |
| from collections import defaultdict | |
| from collections.abc import Mapping, Sequence | |
| from dataclasses import asdict | |
| from datetime import UTC, datetime | |
| from pathlib import Path | |
| from typing import Any | |
| from slop_farmer.config import PrSearchRefreshOptions | |
| from slop_farmer.data.parquet_io import read_json, read_parquet_rows | |
| from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir | |
| from slop_farmer.reports.pr_heuristics import ( | |
| compile_cluster_suppression_rules, | |
| suppressed_pull_request_reasons, | |
| ) | |
| from slop_farmer.reports.pr_scope import ( | |
| PrScopeCluster, | |
| PrScopeClusterOptions, | |
| _build_scope_profile, | |
| _feature_idf, | |
| _include_pull_request, | |
| _normalize_vector, | |
| _pairwise_comparisons, | |
| build_pr_scope_clusters, | |
| ) | |
# Version tag written into every feature row, the run artifact and settings_json.
FEATURE_VERSION = "pr_scope_v1"
# Version tag of the cluster-candidate scoring formula, recorded in settings_json.
CANDIDATE_FORMULA_VERSION = "scope_cluster_candidate_v1"
# Maximum number of ranked cluster candidates kept per pull request by default.
DEFAULT_CANDIDATE_LIMIT = 5
def resolve_pr_search_snapshot_dir(options: PrSearchRefreshOptions) -> Path:
    """Resolve the snapshot directory that a PR-search refresh should read.

    The snapshot and Hugging Face settings carried by ``options`` are
    forwarded to ``resolve_snapshot_source_dir``. Local snapshots are looked
    up under ``<resolved output_dir>/snapshots``.

    Args:
        options: Refresh options providing ``snapshot_dir``, ``output_dir``
            and the ``hf_*`` settings.

    Returns:
        Whatever path ``resolve_snapshot_source_dir`` selects.
    """
    output_dir = options.output_dir
    local_root = output_dir.resolve() / "snapshots"
    return resolve_snapshot_source_dir(
        snapshot_dir=options.snapshot_dir,
        local_snapshots_root=local_root,
        hf_repo_id=options.hf_repo_id,
        hf_revision=options.hf_revision,
        hf_materialize_dir=options.hf_materialize_dir,
        hf_output_dir=output_dir,
    )
def load_pr_search_snapshot(snapshot_dir: Path) -> dict[str, Any]:
    """Load the tables and manifest of one snapshot directory.

    Reads ``manifest.json`` when it exists (otherwise an empty manifest is
    used) plus the ``pull_requests``, ``pr_files`` and ``new_contributors``
    parquet files.

    Args:
        snapshot_dir: Directory holding the snapshot files.

    Returns:
        A dict with the keys ``repo``, ``snapshot_id``, ``manifest``,
        ``pull_requests``, ``pr_files`` and ``contributors``.
    """
    manifest_file = snapshot_dir / "manifest.json"
    if manifest_file.exists():
        manifest = read_json(manifest_file)
    else:
        manifest = {}

    pull_requests = read_parquet_rows(snapshot_dir / "pull_requests.parquet")
    pr_files = read_parquet_rows(snapshot_dir / "pr_files.parquet")
    contributors = read_parquet_rows(snapshot_dir / "new_contributors.parquet")

    # Prefer the manifest's repo, then the first pull request's, then "".
    repo = manifest.get("repo")
    if not repo and pull_requests:
        repo = pull_requests[0].get("repo")
    repo = repo or ""

    # Fall back to the directory name when the manifest carries no id.
    snapshot_id = manifest.get("snapshot_id") or snapshot_dir.name

    snapshot: dict[str, Any] = {"repo": repo, "snapshot_id": snapshot_id}
    snapshot["manifest"] = manifest
    snapshot["pull_requests"] = pull_requests
    snapshot["pr_files"] = pr_files
    snapshot["contributors"] = contributors
    return snapshot
def build_pr_scope_search_artifacts(
    pull_requests: Sequence[Mapping[str, Any]],
    pr_files: Sequence[Mapping[str, Any]],
    *,
    options: PrScopeClusterOptions | None = None,
    suppression_rules: Sequence[Mapping[str, Any]] = (),
    limit_prs: int | None = None,
) -> dict[str, Any]:
    """Build every PR-scope search artifact for a set of pull requests.

    Pull requests rejected by ``_include_pull_request`` or reported by
    ``suppressed_pull_request_reasons`` are dropped. The remainder is
    processed in ascending PR-number order and optionally truncated to the
    first ``limit_prs`` entries.

    Args:
        pull_requests: Pull request rows; each must carry ``number``.
        pr_files: Changed-file rows keyed by ``pull_request_number``.
        options: Clustering options; defaults to ``PrScopeClusterOptions()``.
        suppression_rules: Raw suppression rules, compiled before use.
        limit_prs: Optional cap on the number of active pull requests.

    Returns:
        A dict with the keys ``documents``, ``features``, ``run_artifact``,
        ``neighbors``, ``clusters``, ``cluster_members``,
        ``cluster_candidates`` and ``settings_json``.

    Raises:
        ValueError: If ``limit_prs`` is given and is smaller than 1.
    """
    # Fail fast: reject an invalid limit before any suppression/profile work.
    if limit_prs is not None and limit_prs < 1:
        raise ValueError("--limit-prs must be at least 1")

    settings = options or PrScopeClusterOptions()
    suppressed_prs = suppressed_pull_request_reasons(
        pull_requests,
        pr_files,
        compile_cluster_suppression_rules(suppression_rules),
    )
    active_prs = sorted(
        (
            row
            for row in pull_requests
            if _include_pull_request(row, settings)
            and int(row["number"]) not in suppressed_prs
        ),
        key=lambda row: int(row["number"]),
    )
    if limit_prs is not None:
        active_prs = active_prs[:limit_prs]

    # Every active row already passed int(row["number"]) above, so no
    # None-guard is needed here.
    active_numbers = {int(row["number"]) for row in active_prs}

    # Single pass: keep only files of active PRs and group them per PR.
    filtered_pr_files: list[Mapping[str, Any]] = []
    files_by_pr: defaultdict[int, list[Mapping[str, Any]]] = defaultdict(list)
    for row in pr_files:
        raw_number = row.get("pull_request_number")
        if raw_number is None:
            continue
        number = int(raw_number)
        if number in active_numbers:
            filtered_pr_files.append(row)
            files_by_pr[number].append(row)

    profiles = [
        _build_scope_profile(row, files_by_pr.get(int(row["number"]), []), settings)
        for row in active_prs
    ]
    feature_idf = _feature_idf(profiles, settings) if profiles else {}
    for profile in profiles:
        # Weight each raw feature by its IDF; features without an IDF are dropped.
        profile.vector = _normalize_vector(
            {
                feature: weight * feature_idf[feature]
                for feature, weight in profile.raw_vector.items()
                if feature in feature_idf
            }
        )

    comparisons = _pairwise_comparisons(profiles, settings) if len(profiles) > 1 else []
    comparison_rows = {_pair_key(entry.left, entry.right): entry for entry in comparisons}
    neighbor_rankings = _neighbor_rankings(comparisons, settings)
    clusters = build_pr_scope_clusters(
        active_prs,
        filtered_pr_files,
        options=settings,
        suppression_rules=suppression_rules,
    )

    settings_json = {
        **asdict(settings),
        "feature_version": FEATURE_VERSION,
        "candidate_formula_version": CANDIDATE_FORMULA_VERSION,
    }
    return {
        "documents": [_document_row(row) for row in active_prs],
        "features": [_feature_row(profile) for profile in profiles],
        "run_artifact": {
            "feature_version": FEATURE_VERSION,
            "idf_json": feature_idf,
        },
        "neighbors": _neighbor_rows(neighbor_rankings),
        "clusters": [_cluster_row(cluster) for cluster in clusters],
        "cluster_members": _cluster_member_rows(clusters),
        "cluster_candidates": _cluster_candidate_rows(
            profiles=profiles,
            comparison_rows=comparison_rows,
            clusters=clusters,
        ),
        "settings_json": settings_json,
    }
def build_scope_feature_idf_for_indexed_documents(
    indexed_documents: Sequence[Mapping[str, Any]],
    pr_files: Sequence[Mapping[str, Any]],
    *,
    options: PrScopeClusterOptions | None = None,
) -> dict[str, float]:
    """Recompute the feature IDF table for already-indexed documents.

    Documents without a ``pr_number`` and file rows without a
    ``pull_request_number`` are ignored. File rows belonging to PRs that are
    not indexed are ignored as well.

    Args:
        indexed_documents: Document rows carrying ``pr_number`` and the
            ``additions``/``deletions``/``changed_files`` counters.
        pr_files: Changed-file rows keyed by ``pull_request_number``.
        options: Clustering options; defaults to ``PrScopeClusterOptions()``.

    Returns:
        The table produced by ``_feature_idf``, or an empty dict when no
        document yields a profile.
    """
    settings = options or PrScopeClusterOptions()
    indexed_numbers = {
        int(row["pr_number"]) for row in indexed_documents if row.get("pr_number") is not None
    }
    files_by_pr: defaultdict[int, list[Mapping[str, Any]]] = defaultdict(list)
    for row in pr_files:
        pr_number = row.get("pull_request_number")
        if pr_number is None:
            continue
        number = int(pr_number)
        if number in indexed_numbers:
            files_by_pr[number].append(row)
    profiles = [
        _build_scope_profile(
            _document_to_profile_row(row),
            files_by_pr.get(int(row["pr_number"]), []),
            settings,
        )
        for row in indexed_documents
        if row.get("pr_number") is not None
    ]
    # Skip the IDF computation for an empty corpus, matching the guard that
    # build_pr_scope_search_artifacts applies before calling _feature_idf.
    if not profiles:
        return {}
    return _feature_idf(profiles, settings)
def build_scope_feature_for_pull_request(
    pr_row: Mapping[str, Any],
    pr_files: Sequence[Mapping[str, Any]],
    *,
    feature_idf: Mapping[str, float],
    options: PrScopeClusterOptions | None = None,
) -> dict[str, Any]:
    """Build the feature row of a single pull request against a given IDF table.

    Args:
        pr_row: The pull request row to profile.
        pr_files: The changed-file rows of that pull request.
        feature_idf: IDF weight per feature; raw features missing from this
            table are dropped from the vector.
        options: Clustering options; defaults to ``PrScopeClusterOptions()``.

    Returns:
        The ``_feature_row`` representation of the weighted profile.
    """
    settings = options or PrScopeClusterOptions()
    profile = _build_scope_profile(pr_row, pr_files, settings)

    weighted = {}
    for feature, raw_weight in profile.raw_vector.items():
        if feature not in feature_idf:
            continue
        weighted[feature] = raw_weight * feature_idf[feature]

    profile.vector = _normalize_vector(weighted)
    return _feature_row(profile)
def rank_scope_feature_matches(
    query_feature: Mapping[str, Any],
    indexed_features: Sequence[Mapping[str, Any]],
    *,
    options: PrScopeClusterOptions | None = None,
    limit: int = 10,
) -> list[dict[str, Any]]:
    """Rank indexed features by scope similarity to a query feature.

    The query PR itself is skipped, as is every pair whose similarity falls
    below ``options.min_similarity``.

    Args:
        query_feature: Feature row of the PR being searched for.
        indexed_features: Feature rows to compare against.
        options: Clustering options; defaults to ``PrScopeClusterOptions()``.
        limit: Slice bound applied to the ranked result.

    Returns:
        Pair explanations ordered by descending similarity, then descending
        content similarity, then ascending ``right_pr_number``.
    """
    settings = options or PrScopeClusterOptions()
    query_pr_number = int(query_feature["pr_number"])

    def ranking_key(match: dict[str, Any]) -> tuple[float, float, int]:
        return (
            -float(match["similarity"]),
            -float(match["content_similarity"]),
            int(match["right_pr_number"]),
        )

    matches: list[dict[str, Any]] = []
    for candidate in indexed_features:
        if int(candidate["pr_number"]) == query_pr_number:
            continue
        explanation = scope_feature_pair_explanation(query_feature, candidate, options=settings)
        if explanation["similarity"] < settings.min_similarity:
            continue
        matches.append(explanation)

    return sorted(matches, key=ranking_key)[:limit]
def rank_scope_cluster_candidates(
    *,
    similarity_rows: Sequence[Mapping[str, Any]],
    clusters: Sequence[Mapping[str, Any]],
    cluster_members: Mapping[str, Sequence[int]],
    assigned_cluster_ids: set[str] | None = None,
    limit: int = DEFAULT_CANDIDATE_LIMIT,
) -> list[dict[str, Any]]:
    """Score clusters as candidates for one PR from its similarity rows.

    A cluster is kept when at least one of its members has a positive
    similarity row, or when its id is in ``assigned_cluster_ids``. The score
    is 60% best member similarity, 30% mean of the top three member
    similarities and 10% matched-member count (capped at three).

    Args:
        similarity_rows: Pair rows keyed by ``right_pr_number``; rows without
            that key are ignored and later duplicates win.
        clusters: Cluster rows carrying ``cluster_id``.
        cluster_members: Member PR numbers per cluster id.
        assigned_cluster_ids: Cluster ids the PR already belongs to.
        limit: Slice bound applied to the ranked result.

    Returns:
        Candidate rows ordered by descending score, then descending matched
        member count, then cluster id, each tagged with ``candidate_rank``.
    """
    assigned = assigned_cluster_ids or set()

    row_by_member = {}
    for similarity_row in similarity_rows:
        if similarity_row.get("right_pr_number") is not None:
            row_by_member[int(similarity_row["right_pr_number"])] = similarity_row

    scored: list[dict[str, Any]] = []
    for cluster in clusters:
        cluster_id = str(cluster["cluster_id"])

        matches = []
        for member_pr_number in cluster_members.get(cluster_id, ()):
            match_row = row_by_member.get(member_pr_number)
            if match_row is not None and float(match_row["similarity"]) > 0.0:
                matches.append((member_pr_number, match_row))

        is_assigned = cluster_id in assigned
        if not matches and not is_assigned:
            continue

        # Strongest member first; ties broken by the lower PR number.
        matches.sort(key=lambda match: (-float(match[1]["similarity"]), match[0]))
        top_similarities = [float(match_row["similarity"]) for _, match_row in matches[:3]]
        matched_member_count = len(matches)

        if matches:
            best_member_pr_number, best_match = matches[0]
            max_member_similarity = top_similarities[0]
            avg_top_member_similarity = sum(top_similarities) / len(top_similarities)
        else:
            # Only reachable for clusters the PR is already assigned to.
            best_member_pr_number, best_match = None, None
            max_member_similarity = 0.0
            avg_top_member_similarity = 0.0

        candidate_score = (
            max_member_similarity * 0.60
            + avg_top_member_similarity * 0.30
            + min(matched_member_count, 3) / 3.0 * 0.10
        )

        if best_match is None:
            best_shared_filenames = []
            best_shared_directories = []
        else:
            best_shared_filenames = list(best_match["shared_filenames"][:5])
            best_shared_directories = list(best_match["shared_directories"][:5])

        evidence = {
            "matched_member_pr_numbers": [member for member, _ in matches[:5]],
            "best_member_pr_number": best_member_pr_number,
            "best_shared_filenames": best_shared_filenames,
            "best_shared_directories": best_shared_directories,
            "reason": _candidate_reason(
                matched_member_count=matched_member_count,
                best_comparison=best_match,
            ),
        }
        scored.append(
            {
                "cluster_id": cluster_id,
                "candidate_score": candidate_score,
                "matched_member_count": matched_member_count,
                "best_member_pr_number": best_member_pr_number,
                "max_member_similarity": max_member_similarity,
                "avg_top_member_similarity": avg_top_member_similarity,
                "evidence": evidence,
                "assigned": is_assigned,
            }
        )

    scored.sort(
        key=lambda candidate: (
            -float(candidate["candidate_score"]),
            -int(candidate["matched_member_count"]),
            str(candidate["cluster_id"]),
        )
    )
    top_candidates = scored[:limit]
    for rank, candidate in enumerate(top_candidates, start=1):
        candidate["candidate_rank"] = rank
    return top_candidates
def scope_feature_pair_explanation(
    left_feature: Mapping[str, Any],
    right_feature: Mapping[str, Any],
    *,
    options: PrScopeClusterOptions | None = None,
) -> dict[str, Any]:
    """Compare two feature rows and explain their scope similarity.

    When the content similarity is not positive and the pair shares neither
    filenames nor directories, the overall similarity and the size, breadth
    and concentration components are reported as ``0.0``. Otherwise the
    overall similarity is the weighted mean of the four components.

    Args:
        left_feature: Feature row of the first PR.
        right_feature: Feature row of the second PR.
        options: Clustering options supplying the component weights;
            defaults to ``PrScopeClusterOptions()``.

    Returns:
        A dict with both PR numbers, the overall and per-component
        similarities, and up to ten shared filenames and shared directories
        (deepest directories first).

    Raises:
        ValueError: If the four weights do not sum to a positive value.
    """
    settings = options or PrScopeClusterOptions()
    weight_total = (
        settings.content_weight
        + settings.size_weight
        + settings.breadth_weight
        + settings.concentration_weight
    )
    if weight_total <= 0.0:
        raise ValueError("PR scope similarity weights must sum to a positive value.")

    def ratio_of(field: str) -> float:
        # Missing or falsy counters are treated as zero.
        return _ratio_similarity(
            int(left_feature.get(field) or 0),
            int(right_feature.get(field) or 0),
        )

    left_vector = _json_dict(left_feature.get("vector_json"))
    right_vector = _json_dict(right_feature.get("vector_json"))
    left_filenames = set(_json_list(left_feature.get("filenames_json")))
    right_filenames = set(_json_list(right_feature.get("filenames_json")))
    left_directories = set(_json_list(left_feature.get("directories_json")))
    right_directories = set(_json_list(right_feature.get("directories_json")))
    content_similarity = _cosine_similarity(left_vector, right_vector)

    common_filenames = left_filenames & right_filenames
    common_directories = left_directories & right_directories

    unrelated = content_similarity <= 0.0 and not common_filenames and not common_directories
    if unrelated:
        similarity = 0.0
        size_similarity = 0.0
        breadth_similarity = 0.0
        concentration_similarity = 0.0
    else:
        size_similarity = ratio_of("total_changed_lines")
        breadth_similarity = (ratio_of("file_count") + ratio_of("directory_count")) / 2.0
        share_gap = abs(
            float(left_feature.get("dominant_dir_share") or 0.0)
            - float(right_feature.get("dominant_dir_share") or 0.0)
        )
        concentration_similarity = max(0.0, 1.0 - share_gap)
        similarity = (
            content_similarity * settings.content_weight
            + size_similarity * settings.size_weight
            + breadth_similarity * settings.breadth_weight
            + concentration_similarity * settings.concentration_weight
        ) / weight_total

    # Both overlap sets are empty for an unrelated pair, so the lists below
    # come out empty in that case.
    return {
        "left_pr_number": int(left_feature["pr_number"]),
        "right_pr_number": int(right_feature["pr_number"]),
        "similarity": similarity,
        "content_similarity": content_similarity,
        "size_similarity": size_similarity,
        "breadth_similarity": breadth_similarity,
        "concentration_similarity": concentration_similarity,
        "shared_filenames": sorted(common_filenames)[:10],
        "shared_directories": sorted(
            common_directories,
            key=lambda value: (-value.count("/"), value),
        )[:10],
    }
def scope_options_from_settings(settings_json: Mapping[str, Any] | None) -> PrScopeClusterOptions:
    """Rebuild clustering options from a stored settings mapping.

    Only keys that are fields of ``PrScopeClusterOptions`` are used; every
    other key in ``settings_json`` is ignored. A missing or empty mapping
    yields the default options.
    """
    if not settings_json:
        return PrScopeClusterOptions()

    overrides = {}
    for field_name in asdict(PrScopeClusterOptions()):
        if field_name in settings_json:
            overrides[field_name] = settings_json[field_name]
    return PrScopeClusterOptions(**overrides)
| def iso_timestamp() -> str: | |
| return datetime.now(tz=UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z") | |
| def _document_row(row: Mapping[str, Any]) -> dict[str, Any]: | |
| return { | |
| "pr_number": int(row["number"]), | |
| "github_id": row.get("github_id"), | |
| "author_login": row.get("author_login"), | |
| "state": row.get("state"), | |
| "draft": bool(row.get("draft")), | |
| "merged": bool(row.get("merged")), | |
| "title": row.get("title") or "", | |
| "base_ref": row.get("base_ref"), | |
| "created_at": row.get("created_at"), | |
| "updated_at": row.get("updated_at"), | |
| "merged_at": row.get("merged_at"), | |
| "additions": int(row.get("additions") or 0), | |
| "deletions": int(row.get("deletions") or 0), | |
| "changed_files": int(row.get("changed_files") or 0), | |
| "comments_count": int(row.get("comments_count") or 0), | |
| "review_comments_count": int(row.get("review_comments_count") or 0), | |
| "html_url": row.get("html_url"), | |
| } | |
| def _document_to_profile_row(row: Mapping[str, Any]) -> dict[str, Any]: | |
| return { | |
| "number": int(row["pr_number"]), | |
| "additions": int(row.get("additions") or 0), | |
| "deletions": int(row.get("deletions") or 0), | |
| "changed_files": int(row.get("changed_files") or 0), | |
| } | |
def _feature_row(profile: Any) -> dict[str, Any]:
    """Serialise a scope profile into a feature row.

    Filenames and directories are emitted as sorted lists; the remaining
    attributes are copied from the profile unchanged and the row is stamped
    with ``FEATURE_VERSION``.
    """
    feature: dict[str, Any] = {
        "pr_number": profile.number,
        "feature_version": FEATURE_VERSION,
    }
    for attribute in (
        "total_changed_lines",
        "file_count",
        "directory_count",
        "dominant_dir_share",
    ):
        feature[attribute] = getattr(profile, attribute)
    feature["filenames_json"] = sorted(profile.filenames)
    feature["directories_json"] = sorted(profile.directories)
    feature["vector_json"] = profile.vector
    return feature
| def _neighbor_rankings( | |
| comparisons: Sequence[Any], options: PrScopeClusterOptions | |
| ) -> dict[int, list[dict[str, Any]]]: | |
| ranked: defaultdict[int, list[tuple[float, int, Any]]] = defaultdict(list) | |
| for entry in comparisons: | |
| if entry.similarity < options.min_similarity: | |
| continue | |
| ranked[entry.left].append((entry.similarity, entry.right, entry)) | |
| ranked[entry.right].append((entry.similarity, entry.left, entry)) | |
| results: dict[int, list[dict[str, Any]]] = {} | |
| for pr_number, items in ranked.items(): | |
| ordered = sorted(items, key=lambda item: (-item[0], item[1]))[: options.max_neighbors] | |
| results[pr_number] = [ | |
| { | |
| "other_pr_number": other_pr_number, | |
| "rank": rank, | |
| "comparison": comparison, | |
| } | |
| for rank, (_, other_pr_number, comparison) in enumerate(ordered, start=1) | |
| ] | |
| return results | |
| def _neighbor_rows( | |
| neighbor_rankings: Mapping[int, Sequence[Mapping[str, Any]]], | |
| ) -> list[dict[str, Any]]: | |
| rows: dict[tuple[int, int], dict[str, Any]] = {} | |
| for pr_number, ranked_neighbors in neighbor_rankings.items(): | |
| for ranked_neighbor in ranked_neighbors: | |
| comparison = ranked_neighbor["comparison"] | |
| left_pr = min(pr_number, int(ranked_neighbor["other_pr_number"])) | |
| right_pr = max(pr_number, int(ranked_neighbor["other_pr_number"])) | |
| pair_key = (left_pr, right_pr) | |
| row = rows.get(pair_key) | |
| if row is None: | |
| row = { | |
| "left_pr_number": left_pr, | |
| "right_pr_number": right_pr, | |
| "rank_from_left": None, | |
| "rank_from_right": None, | |
| "similarity": comparison.similarity, | |
| "content_similarity": comparison.content_similarity, | |
| "size_similarity": comparison.size_similarity, | |
| "breadth_similarity": comparison.breadth_similarity, | |
| "concentration_similarity": comparison.concentration_similarity, | |
| "shared_filenames_json": comparison.shared_filenames, | |
| "shared_directories_json": comparison.shared_directories, | |
| } | |
| rows[pair_key] = row | |
| if pr_number == left_pr: | |
| row["rank_from_left"] = int(ranked_neighbor["rank"]) | |
| else: | |
| row["rank_from_right"] = int(ranked_neighbor["rank"]) | |
| return [rows[key] for key in sorted(rows)] | |
| def _cluster_row(cluster: PrScopeCluster) -> dict[str, Any]: | |
| return { | |
| "cluster_id": cluster.cluster_id, | |
| "representative_pr_number": cluster.representative_pr_number, | |
| "cluster_size": len(cluster.pr_numbers), | |
| "average_similarity": cluster.average_similarity, | |
| "summary": cluster.summary, | |
| "shared_filenames_json": cluster.shared_filenames, | |
| "shared_directories_json": cluster.shared_directories, | |
| } | |
| def _cluster_member_rows(clusters: Sequence[PrScopeCluster]) -> list[dict[str, Any]]: | |
| rows: list[dict[str, Any]] = [] | |
| for cluster in clusters: | |
| for pr_number in cluster.pr_numbers: | |
| rows.append( | |
| { | |
| "cluster_id": cluster.cluster_id, | |
| "pr_number": pr_number, | |
| "member_role": ( | |
| "representative" | |
| if pr_number == cluster.representative_pr_number | |
| else "member" | |
| ), | |
| } | |
| ) | |
| rows.sort( | |
| key=lambda row: ( | |
| row["cluster_id"], | |
| row["member_role"] != "representative", | |
| row["pr_number"], | |
| ) | |
| ) | |
| return rows | |
def _cluster_candidate_rows(
    *,
    profiles: Sequence[Any],
    comparison_rows: Mapping[tuple[int, int], Any],
    clusters: Sequence[PrScopeCluster],
) -> list[dict[str, Any]]:
    """Collect the ranked cluster candidates of every profiled PR.

    Profiles are visited in ascending PR-number order and each PR's
    candidates (from ``_cluster_candidates_for_pr``) are appended in rank
    order.
    """
    members_by_cluster: dict[str, list[int]] = {}
    clusters_by_pr: dict[int, set[str]] = {}
    for cluster in clusters:
        members_by_cluster[cluster.cluster_id] = list(cluster.pr_numbers)
        for member_pr_number in cluster.pr_numbers:
            clusters_by_pr.setdefault(member_pr_number, set()).add(cluster.cluster_id)

    return [
        candidate
        for profile in sorted(profiles, key=lambda item: item.number)
        for candidate in _cluster_candidates_for_pr(
            pr_number=profile.number,
            comparison_rows=comparison_rows,
            clusters=clusters,
            assigned_cluster_ids=clusters_by_pr.get(profile.number, set()),
            cluster_members=members_by_cluster,
        )
    ]
def _cluster_candidates_for_pr(
    *,
    pr_number: int,
    comparison_rows: Mapping[tuple[int, int], Any],
    clusters: Sequence[PrScopeCluster],
    assigned_cluster_ids: set[str],
    cluster_members: Mapping[str, Sequence[int]],
) -> list[dict[str, Any]]:
    """Score and rank clusters as candidates for a single PR.

    A cluster is kept when the PR has a positive-similarity comparison with
    at least one other member, or when the PR is already assigned to it. The
    score is 60% best member similarity, 30% mean of the top three member
    similarities and 10% matched-member count (capped at three).

    Returns:
        At most ``DEFAULT_CANDIDATE_LIMIT`` rows ordered by descending score,
        then descending matched-member count, then cluster id, each tagged
        with a 1-based ``candidate_rank``.
    """
    scored: list[dict[str, Any]] = []
    for cluster in clusters:
        cluster_id = cluster.cluster_id

        matches: list[tuple[int, Any]] = []
        for member_pr_number in cluster_members[cluster_id]:
            if member_pr_number == pr_number:
                # A PR is never compared against itself.
                continue
            comparison = comparison_rows.get(_pair_key(pr_number, member_pr_number))
            if comparison is None or comparison.similarity <= 0.0:
                continue
            matches.append((member_pr_number, comparison))

        is_assigned = cluster_id in assigned_cluster_ids
        if not matches and not is_assigned:
            continue

        # Strongest member first; ties broken by the lower PR number.
        matches.sort(key=lambda match: (-match[1].similarity, match[0]))
        top_similarities = [comparison.similarity for _, comparison in matches[:3]]
        matched_member_count = len(matches)

        if matches:
            best_member_pr_number, best_comparison = matches[0]
            max_member_similarity = top_similarities[0]
            avg_top_member_similarity = sum(top_similarities) / len(top_similarities)
        else:
            # Only reachable for clusters the PR is already assigned to.
            best_member_pr_number, best_comparison = None, None
            max_member_similarity = 0.0
            avg_top_member_similarity = 0.0

        candidate_score = (
            max_member_similarity * 0.60
            + avg_top_member_similarity * 0.30
            + min(matched_member_count, 3) / 3.0 * 0.10
        )

        if best_comparison is None:
            best_shared_filenames = []
            best_shared_directories = []
        else:
            best_shared_filenames = list(best_comparison.shared_filenames[:5])
            best_shared_directories = list(best_comparison.shared_directories[:5])

        evidence = {
            "matched_member_pr_numbers": [member for member, _ in matches[:5]],
            "best_member_pr_number": best_member_pr_number,
            "best_shared_filenames": best_shared_filenames,
            "best_shared_directories": best_shared_directories,
            "reason": _candidate_reason(
                matched_member_count=matched_member_count,
                best_comparison=best_comparison,
            ),
        }
        scored.append(
            {
                "pr_number": pr_number,
                "cluster_id": cluster_id,
                "candidate_score": candidate_score,
                "matched_member_count": matched_member_count,
                "best_member_pr_number": best_member_pr_number,
                "max_member_similarity": max_member_similarity,
                "avg_top_member_similarity": avg_top_member_similarity,
                "evidence_json": evidence,
                "assigned": is_assigned,
            }
        )

    scored.sort(
        key=lambda candidate: (
            -candidate["candidate_score"],
            -candidate["matched_member_count"],
            candidate["cluster_id"],
        )
    )
    top_candidates = scored[:DEFAULT_CANDIDATE_LIMIT]
    for rank, candidate in enumerate(top_candidates, start=1):
        candidate["candidate_rank"] = rank
    return top_candidates
| def _candidate_reason(*, matched_member_count: int, best_comparison: Any | None) -> str: | |
| if best_comparison is None: | |
| return "cluster membership matches existing scope assignment" | |
| shared_filenames = ( | |
| list(best_comparison.shared_filenames) | |
| if hasattr(best_comparison, "shared_filenames") | |
| else list(best_comparison.get("shared_filenames") or []) | |
| ) | |
| shared_directories = ( | |
| list(best_comparison.shared_directories) | |
| if hasattr(best_comparison, "shared_directories") | |
| else list(best_comparison.get("shared_directories") or []) | |
| ) | |
| if matched_member_count >= 2: | |
| return "overlapping files and directories with multiple cluster members" | |
| if shared_filenames: | |
| return "overlapping changed files with a cluster member" | |
| if shared_directories: | |
| return "overlapping directories with a cluster member" | |
| return "similar change shape to a cluster member" | |
| def _pair_key(left: int, right: int) -> tuple[int, int]: | |
| return (left, right) if left <= right else (right, left) | |
| def _json_dict(raw: Any) -> dict[str, float]: | |
| if isinstance(raw, dict): | |
| return {str(key): float(value) for key, value in raw.items()} | |
| if isinstance(raw, str) and raw: | |
| payload = json.loads(raw) | |
| if isinstance(payload, dict): | |
| return {str(key): float(value) for key, value in payload.items()} | |
| return {} | |
| def _json_list(raw: Any) -> list[str]: | |
| if isinstance(raw, list): | |
| return [str(item) for item in raw] | |
| if isinstance(raw, str) and raw: | |
| payload = json.loads(raw) | |
| if isinstance(payload, list): | |
| return [str(item) for item in payload] | |
| return [] | |
| def _cosine_similarity(left: Mapping[str, float], right: Mapping[str, float]) -> float: | |
| if not left or not right: | |
| return 0.0 | |
| if len(left) > len(right): | |
| left, right = right, left | |
| return sum(weight * right.get(feature, 0.0) for feature, weight in left.items()) | |
| def _ratio_similarity(left: int, right: int) -> float: | |
| largest = max(left, right) | |
| if largest <= 0: | |
| return 1.0 | |
| return min(left, right) / largest | |