| """Holistic pull-request scope clustering. | |
| This module answers a different question than duplicate-PR detection: | |
| "Which open PRs look similar in code scope and touched areas?" | |
| The algorithm intentionally uses the whole open-PR queue at once instead of | |
| only raw pairwise overlap: | |
| 1. Build a sparse "scope vector" for each open PR from: | |
| - exact filenames | |
| - directory prefixes | |
| - coarse file chunks derived from unified-diff hunk ranges | |
| 2. Weight every feature by inverse PR frequency so very common files and | |
| directories contribute less than rare, discriminative ones. | |
| 3. Blend content similarity with simple "shape" features: | |
| - total changed-line size | |
| - changed-file count | |
| - touched-directory count | |
| - concentration in the dominant leaf directory | |
| 4. Build a mutual k-nearest-neighbor graph from those scores. | |
| 5. Keep only strong or locally-supported edges so broad PRs do not bridge | |
| unrelated groups. | |
| 6. Split graph components greedily around local medoids to avoid weak chains. | |
| The result is a conservative "similar scope" clustering intended for queue | |
| analysis and review routing, not a strict duplicate classifier. | |
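
Illustrative call (the row variables here are hypothetical; real rows come
from the snapshot parquet files, and clustering is only meaningful on a
realistically sized open-PR queue):

    clusters = build_pr_scope_clusters(
        pull_requests=open_pr_rows,  # e.g. [{"number": 101, "state": "open", ...}, ...]
        pr_files=pr_file_rows,       # e.g. [{"pull_request_number": 101, ...}, ...]
    )
    for cluster in clusters:
        print(cluster.cluster_id, cluster.pr_numbers, cluster.summary)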
| """ | |
| from __future__ import annotations | |
| import json | |
| import re | |
| from collections import Counter, defaultdict | |
| from collections.abc import Mapping, Sequence | |
| from dataclasses import dataclass | |
| from datetime import UTC, datetime | |
| from math import log, sqrt | |
| from pathlib import Path, PurePosixPath | |
| from typing import Any | |
| from pydantic import BaseModel, Field | |
| from slop_farmer.data.parquet_io import read_json, read_parquet_rows | |
| from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir | |
| from slop_farmer.reports.pr_heuristics import ( | |
| compile_cluster_suppression_rules, | |
| suppressed_pull_request_reasons, | |
| ) | |
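
# Matches unified-diff hunk headers such as "@@ -10,6 +12,8 @@", capturing the
# new-file start line and optional line count.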
HUNK_HEADER_PATTERN = re.compile(r"^@@ -\d+(?:,\d+)? \+(?P<start>\d+)(?:,(?P<count>\d+))? @@")

__all__ = [
    "PrScopeCluster",
    "PrScopeClusterOptions",
    "PrScopePair",
    "build_pr_scope_clusters",
    "run_pr_scope_report",
]


@dataclass
class PrScopeClusterOptions:
| """Tuning knobs for holistic PR scope clustering. | |
| The defaults aim to be conservative: | |
| - only open, non-draft PRs are clustered | |
| - exact files matter most | |
| - directory and chunk features let nearby-but-not-identical changes group | |
| - cluster edges need either high similarity or mutual-neighbor support | |
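
    Illustrative override (values are hypothetical):

        options = PrScopeClusterOptions(chunk_size=120, min_similarity=0.35)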
| """ | |
| include_closed: bool = False | |
| include_drafts: bool = False | |
| chunk_size: int = 80 | |
| max_neighbors: int = 5 | |
| min_similarity: float = 0.30 | |
| pair_similarity: float = 0.40 | |
| strong_similarity: float = 0.55 | |
| min_shared_neighbors: int = 1 | |
| expansion_similarity: float = 0.34 | |
| min_cluster_average_similarity: float = 0.32 | |
| min_cluster_size: int = 2 | |
| max_feature_df_ratio: float = 0.90 | |
| file_weight: float = 1.0 | |
| directory_weight: float = 0.60 | |
| chunk_weight: float = 0.75 | |
| content_weight: float = 0.70 | |
| size_weight: float = 0.15 | |
| breadth_weight: float = 0.10 | |
| concentration_weight: float = 0.05 | |
| max_shared_features: int = 8 | |
| class PrScopePair(BaseModel): | |
| left_pr_number: int | |
| right_pr_number: int | |
| similarity: float | |
| content_similarity: float | |
| size_similarity: float | |
| breadth_similarity: float | |
| concentration_similarity: float | |
| shared_filenames: list[str] = Field(default_factory=list) | |
| shared_directories: list[str] = Field(default_factory=list) | |
| class PrScopeCluster(BaseModel): | |
| cluster_id: str | |
| pr_numbers: list[int] | |
| representative_pr_number: int | |
| average_similarity: float | |
| summary: str | |
| shared_filenames: list[str] = Field(default_factory=list) | |
| shared_directories: list[str] = Field(default_factory=list) | |
| pairwise: list[PrScopePair] = Field(default_factory=list) | |


@dataclass
class _ScopeProfile:
    number: int
    total_changed_lines: int
    file_count: int
    directory_count: int
    dominant_dir_share: float
    filenames: set[str]
    directories: set[str]
    raw_vector: dict[str, float]
    vector: dict[str, float]


@dataclass
class _ScopeComparison:
    left: int
    right: int
    similarity: float
    content_similarity: float
    size_similarity: float
    breadth_similarity: float
    concentration_similarity: float
    shared_filenames: list[str]
    shared_directories: list[str]


def build_pr_scope_clusters(
    pull_requests: Sequence[Mapping[str, Any]],
    pr_files: Sequence[Mapping[str, Any]],
    *,
    options: PrScopeClusterOptions | None = None,
    suppression_rules: Sequence[Mapping[str, Any]] = (),
) -> list[PrScopeCluster]:
    """Cluster open PRs by weighted file-scope similarity.

    This is intentionally holistic:

    - feature weights depend on the full open-PR set
    - similarity blends exact-file, directory, and chunk overlap
    - graph edges depend on each PR's neighborhood, not only raw pair scores
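
    Input rows are loose mappings; the keys actually read are ``number``,
    ``state``, and ``draft`` on pull-request rows (plus ``additions``,
    ``deletions``, and ``changed_files`` as fallbacks), and
    ``pull_request_number``, ``filename``, ``additions``, ``deletions``,
    ``changes``, and ``patch`` on file rows.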
| """ | |
| settings = options or PrScopeClusterOptions() | |
| suppressed_prs = suppressed_pull_request_reasons( | |
| pull_requests, | |
| pr_files, | |
| compile_cluster_suppression_rules(suppression_rules), | |
| ) | |
| active_prs = [ | |
| row | |
| for row in pull_requests | |
| if _include_pull_request(row, settings) and int(row["number"]) not in suppressed_prs | |
| ] | |
| if len(active_prs) < 2: | |
| return [] | |
| files_by_pr: defaultdict[int, list[Mapping[str, Any]]] = defaultdict(list) | |
| active_numbers = {int(row["number"]) for row in active_prs if row.get("number") is not None} | |
| for row in pr_files: | |
| pr_number = row.get("pull_request_number") | |
| if pr_number is None: | |
| continue | |
| number = int(pr_number) | |
| if number in active_numbers: | |
| files_by_pr[number].append(row) | |
| profiles = [ | |
| _build_scope_profile(row, files_by_pr.get(int(row["number"]), []), settings) | |
| for row in active_prs | |
| ] | |
| profiles_by_number = {profile.number: profile for profile in profiles} | |
| feature_idf = _feature_idf(profiles, settings) | |
| for profile in profiles: | |
| profile.vector = _normalize_vector( | |
| { | |
| feature: weight * feature_idf[feature] | |
| for feature, weight in profile.raw_vector.items() | |
| if feature in feature_idf | |
| } | |
| ) | |
| comparisons = _pairwise_comparisons(profiles, settings) | |
| if not comparisons: | |
| return [] | |
| comparison_map = {(entry.left, entry.right): entry for entry in comparisons} | |
| top_neighbors = _top_neighbors(comparisons, settings) | |
| edges = _cluster_edges(comparisons, top_neighbors, settings) | |
| if not edges: | |
| return [] | |
| clusters: list[PrScopeCluster] = [] | |
| for component in _connected_components(sorted(active_numbers), edges): | |
| if len(component) < settings.min_cluster_size: | |
| continue | |
| member_sets = _refine_component(component, comparison_map, settings) | |
| for members in member_sets: | |
| clusters.append( | |
| _cluster_entry( | |
| members=members, | |
| profiles_by_number=profiles_by_number, | |
| comparison_map=comparison_map, | |
| feature_idf=feature_idf, | |
| settings=settings, | |
| ) | |
| ) | |
| return sorted( | |
| clusters, | |
| key=lambda cluster: ( | |
| -len(cluster.pr_numbers), | |
| -cluster.average_similarity, | |
| cluster.cluster_id, | |
| ), | |
| ) | |


def run_pr_scope_report(options: Any) -> Path:
    """Resolve a snapshot, cluster open PR scopes, and write a JSON report."""
    snapshot_dir = _resolve_snapshot_dir(options)
    snapshot = _load_snapshot_context(snapshot_dir)
    clusters = build_pr_scope_clusters(
        snapshot["pull_requests"],
        snapshot["pr_files"],
        suppression_rules=options.cluster_suppression_rules,
    )
    output_path = (options.output or (snapshot_dir / "pr-scope-clusters.json")).resolve()
    output_path.parent.mkdir(parents=True, exist_ok=True)
    payload = {
        "repo": snapshot["repo"],
        "snapshot_id": snapshot["snapshot_id"],
        "generated_at": datetime.now(tz=UTC)
        .replace(microsecond=0)
        .isoformat()
        .replace("+00:00", "Z"),
        "cluster_count": len(clusters),
        "pr_scope_clusters": [cluster.model_dump(mode="json") for cluster in clusters],
    }
    output_path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
    return output_path


def _resolve_snapshot_dir(options: Any) -> Path:
    return resolve_snapshot_source_dir(
        snapshot_dir=options.snapshot_dir,
        local_snapshots_root=options.output_dir.resolve() / "snapshots",
        hf_repo_id=options.hf_repo_id,
        hf_revision=options.hf_revision,
        hf_materialize_dir=options.hf_materialize_dir,
        hf_output_dir=options.output_dir,
    )


def _load_snapshot_context(snapshot_dir: Path) -> dict[str, Any]:
    manifest_path = snapshot_dir / "manifest.json"
    manifest = read_json(manifest_path) if manifest_path.exists() else {}
    pull_requests = read_parquet_rows(snapshot_dir / "pull_requests.parquet")
    pr_files = read_parquet_rows(snapshot_dir / "pr_files.parquet")
    repo = manifest.get("repo") or (pull_requests[0]["repo"] if pull_requests else None) or ""
    snapshot_id = manifest.get("snapshot_id") or snapshot_dir.name
    return {
        "repo": repo,
        "snapshot_id": snapshot_id,
        "pull_requests": pull_requests,
        "pr_files": pr_files,
    }


def _include_pull_request(row: Mapping[str, Any], options: PrScopeClusterOptions) -> bool:
    if row.get("number") is None:
        return False
    if not options.include_closed and str(row.get("state") or "").lower() != "open":
        return False
    return options.include_drafts or not bool(row.get("draft"))


def _build_scope_profile(
    pr_row: Mapping[str, Any],
    file_rows: Sequence[Mapping[str, Any]],
    options: PrScopeClusterOptions,
) -> _ScopeProfile:
    number = int(pr_row["number"])
    file_lines: defaultdict[str, int] = defaultdict(int)
    prefix_directory_lines: defaultdict[str, int] = defaultdict(int)
    leaf_directory_lines: defaultdict[str, int] = defaultdict(int)
    chunk_lines: defaultdict[str, int] = defaultdict(int)
    for row in file_rows:
        filename = str(row.get("filename") or "").strip()
        if not filename:
            continue
        changed_lines = _file_changed_lines(row)
        file_lines[filename] += changed_lines
        leaf_directory = _leaf_directory(filename)
        leaf_directory_lines[leaf_directory] += changed_lines
        for directory in _directory_prefixes(filename):
            prefix_directory_lines[directory] += changed_lines
        patch = row.get("patch")
        if patch:
            for chunk_key, chunk_size in _chunk_line_weights(
                filename, str(patch), options.chunk_size
            ).items():
                chunk_lines[chunk_key] += chunk_size
    if not file_lines:
        fallback_total = max(
            1, int(pr_row.get("additions") or 0) + int(pr_row.get("deletions") or 0)
        )
        file_count = max(1, int(pr_row.get("changed_files") or 0))
        return _ScopeProfile(
            number=number,
            total_changed_lines=fallback_total,
            file_count=file_count,
            directory_count=0,
            dominant_dir_share=0.0,
            filenames=set(),
            directories=set(),
            raw_vector={},
            vector={},
        )
    raw_vector: dict[str, float] = {}
    for filename, changed_lines in file_lines.items():
        raw_vector[f"file:{filename}"] = sqrt(changed_lines) * options.file_weight
    for directory, changed_lines in prefix_directory_lines.items():
        raw_vector[f"dir:{directory}"] = (
            raw_vector.get(f"dir:{directory}", 0.0) + sqrt(changed_lines) * options.directory_weight
        )
    for chunk_key, changed_lines in chunk_lines.items():
        raw_vector[chunk_key] = (
            raw_vector.get(chunk_key, 0.0) + sqrt(changed_lines) * options.chunk_weight
        )
    total_changed_lines = sum(file_lines.values())
    dominant_dir_share = 0.0
    if leaf_directory_lines and total_changed_lines > 0:
        dominant_dir_share = max(leaf_directory_lines.values()) / total_changed_lines
    return _ScopeProfile(
        number=number,
        total_changed_lines=total_changed_lines,
        file_count=len(file_lines),
        directory_count=len(leaf_directory_lines),
        dominant_dir_share=dominant_dir_share,
        filenames=set(file_lines),
        directories=set(prefix_directory_lines),
        raw_vector=raw_vector,
        vector={},
    )


def _file_changed_lines(row: Mapping[str, Any]) -> int:
    additions = int(row.get("additions") or 0)
    deletions = int(row.get("deletions") or 0)
    changes = int(row.get("changes") or 0)
    total = additions + deletions
    if total > 0:
        return total
    if changes > 0:
        return changes
    return 1


def _directory_prefixes(filename: str) -> list[str]:
    parts = PurePosixPath(filename).parts[:-1]
    prefixes: list[str] = []
    current: list[str] = []
    for part in parts:
        current.append(part)
        prefixes.append("/".join(current))
    return prefixes


def _leaf_directory(filename: str) -> str:
    parts = PurePosixPath(filename).parts[:-1]
    return "/".join(parts) if parts else "."


def _chunk_line_weights(filename: str, patch: str, chunk_size: int) -> dict[str, int]:
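    """Bucket a file's changed-line ranges into fixed-size chunks.

    Worked example: with ``chunk_size=80``, a hunk spanning new-file lines
    70-90 contributes 11 lines to ``chunk:<filename>:1-80`` and 10 lines to
    ``chunk:<filename>:81-160``.
    """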
    weights: defaultdict[str, int] = defaultdict(int)
    for start, end in _patch_ranges(patch):
        current = start
        while current <= end:
            chunk_index = (current - 1) // chunk_size
            chunk_start = chunk_index * chunk_size + 1
            chunk_end = chunk_start + chunk_size - 1
            overlap_end = min(end, chunk_end)
            weights[f"chunk:{filename}:{chunk_start}-{chunk_end}"] += overlap_end - current + 1
            current = overlap_end + 1
    return weights


def _patch_ranges(patch: str) -> list[tuple[int, int]]:
    ranges: list[tuple[int, int]] = []
    for line in patch.splitlines():
        match = HUNK_HEADER_PATTERN.match(line)
        if match is None:
            continue
        start = int(match.group("start"))
        count = int(match.group("count") or "1")
        if count <= 0:
            continue
        ranges.append((start, start + count - 1))
    return ranges


def _feature_idf(
    profiles: Sequence[_ScopeProfile], options: PrScopeClusterOptions
) -> dict[str, float]:
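    """Smoothed inverse-PR-frequency weights over scope features.

    Features present in more than ``max_feature_df_ratio`` of all profiles
    are dropped outright; each remaining feature gets
    ``log((N + 1) / (df + 1)) + 1.0``, where ``N`` is the profile count and
    ``df`` the number of profiles containing the feature.
    """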
    document_frequency: Counter[str] = Counter()
    profile_count = len(profiles)
    for profile in profiles:
        document_frequency.update(profile.raw_vector.keys())
    idf: dict[str, float] = {}
    for feature, frequency in document_frequency.items():
        if frequency / profile_count > options.max_feature_df_ratio:
            continue
        idf[feature] = log((profile_count + 1) / (frequency + 1)) + 1.0
    return idf


def _normalize_vector(vector: Mapping[str, float]) -> dict[str, float]:
    norm = sqrt(sum(weight * weight for weight in vector.values()))
    if norm <= 0.0:
        return {}
    return {feature: weight / norm for feature, weight in vector.items() if weight > 0.0}


def _pairwise_comparisons(
    profiles: Sequence[_ScopeProfile],
    options: PrScopeClusterOptions,
) -> list[_ScopeComparison]:
    comparisons: list[_ScopeComparison] = []
    weight_total = (
        options.content_weight
        + options.size_weight
        + options.breadth_weight
        + options.concentration_weight
    )
    if weight_total <= 0:
        raise ValueError("PR scope similarity weights must sum to a positive value.")
    ordered = sorted(profiles, key=lambda profile: profile.number)
    for index, left in enumerate(ordered):
        for right in ordered[index + 1 :]:
            content_similarity = _cosine_similarity(left.vector, right.vector)
            if (
                content_similarity <= 0.0
                and not left.filenames.intersection(right.filenames)
                and not left.directories.intersection(right.directories)
            ):
                continue
            size_similarity = _ratio_similarity(left.total_changed_lines, right.total_changed_lines)
            breadth_similarity = (
                _ratio_similarity(left.file_count, right.file_count)
                + _ratio_similarity(left.directory_count, right.directory_count)
            ) / 2.0
            concentration_similarity = max(
                0.0, 1.0 - abs(left.dominant_dir_share - right.dominant_dir_share)
            )
            similarity = (
                content_similarity * options.content_weight
                + size_similarity * options.size_weight
                + breadth_similarity * options.breadth_weight
                + concentration_similarity * options.concentration_weight
            ) / weight_total
            comparisons.append(
                _ScopeComparison(
                    left=left.number,
                    right=right.number,
                    similarity=similarity,
                    content_similarity=content_similarity,
                    size_similarity=size_similarity,
                    breadth_similarity=breadth_similarity,
                    concentration_similarity=concentration_similarity,
                    shared_filenames=sorted(left.filenames & right.filenames)[:10],
                    shared_directories=sorted(
                        left.directories & right.directories,
                        key=lambda value: (-value.count("/"), value),
                    )[:10],
                )
            )
    return comparisons


def _cosine_similarity(left: Mapping[str, float], right: Mapping[str, float]) -> float:
    if not left or not right:
        return 0.0
    # Vectors are already L2-normalized, so a sparse dot product iterated over
    # the smaller vector yields the cosine similarity.
    if len(left) > len(right):
        left, right = right, left
    return sum(weight * right.get(feature, 0.0) for feature, weight in left.items())


def _ratio_similarity(left: int, right: int) -> float:
    largest = max(left, right)
    if largest <= 0:
        return 1.0
    return min(left, right) / largest


def _top_neighbors(
    comparisons: Sequence[_ScopeComparison],
    options: PrScopeClusterOptions,
) -> dict[int, set[int]]:
    ranked: defaultdict[int, list[tuple[float, int]]] = defaultdict(list)
    for entry in comparisons:
        if entry.similarity < options.min_similarity:
            continue
        ranked[entry.left].append((entry.similarity, entry.right))
        ranked[entry.right].append((entry.similarity, entry.left))
    neighbors: dict[int, set[int]] = {}
    for number, items in ranked.items():
        sorted_items = sorted(items, key=lambda item: (-item[0], item[1]))
        neighbors[number] = {other for _, other in sorted_items[: options.max_neighbors]}
    return neighbors


def _cluster_edges(
    comparisons: Sequence[_ScopeComparison],
    top_neighbors: Mapping[int, set[int]],
    options: PrScopeClusterOptions,
) -> set[tuple[int, int]]:
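    """Admit an edge only when it is strong or mutually supported.

    A pair becomes an edge if its similarity reaches ``strong_similarity``
    outright; otherwise both PRs must appear in each other's top-k neighbor
    sets and either share at least ``min_shared_neighbors`` common neighbors
    or clear ``pair_similarity``.
    """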
    edges: set[tuple[int, int]] = set()
    for entry in comparisons:
        if entry.similarity < options.min_similarity:
            continue
        left_neighbors = top_neighbors.get(entry.left, set())
        right_neighbors = top_neighbors.get(entry.right, set())
        if entry.similarity >= options.strong_similarity:
            edges.add((entry.left, entry.right))
            continue
        if entry.right not in left_neighbors or entry.left not in right_neighbors:
            continue
        shared_neighbor_count = len(left_neighbors & right_neighbors)
        if (
            shared_neighbor_count >= options.min_shared_neighbors
            or entry.similarity >= options.pair_similarity
        ):
            edges.add((entry.left, entry.right))
    return edges


def _connected_components(numbers: Sequence[int], edges: set[tuple[int, int]]) -> list[list[int]]:
    adjacency: defaultdict[int, set[int]] = defaultdict(set)
    for left, right in edges:
        adjacency[left].add(right)
        adjacency[right].add(left)
    components: list[list[int]] = []
    seen: set[int] = set()
    for number in sorted(numbers):
        if number in seen or number not in adjacency:
            continue
        stack = [number]
        component: list[int] = []
        while stack:
            current = stack.pop()
            if current in seen:
                continue
            seen.add(current)
            component.append(current)
            stack.extend(sorted(adjacency[current] - seen, reverse=True))
        if component:
            components.append(sorted(component))
    return components


def _refine_component(
    members: Sequence[int],
    comparison_map: Mapping[tuple[int, int], _ScopeComparison],
    options: PrScopeClusterOptions,
) -> list[list[int]]:
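    """Greedily split a connected component around local medoids.

    Pick the remaining member with the highest mean similarity to the rest as
    a seed, grow it with candidates whose mean similarity to the current
    cluster stays at or above ``expansion_similarity``, and keep the result
    only if it meets the size and average-similarity floors; otherwise drop
    the seed and retry with what remains.
    """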
    if len(members) <= options.min_cluster_size:
        average_similarity = _cluster_average_similarity(members, comparison_map)
        if average_similarity >= options.min_cluster_average_similarity:
            return [sorted(members)]
        return []
    remaining = set(members)
    refined: list[list[int]] = []
    while len(remaining) >= options.min_cluster_size:
        seed = max(
            sorted(remaining),
            key=lambda number: (
                _mean_similarity(number, remaining - {number}, comparison_map),
                -number,
            ),
        )
        cluster = [seed]
        candidates = sorted(
            remaining - {seed},
            key=lambda number: (_similarity(seed, number, comparison_map), -number),
            reverse=True,
        )
        for candidate in candidates:
            mean_to_cluster = _mean_similarity(candidate, set(cluster), comparison_map)
            if mean_to_cluster >= options.expansion_similarity:
                cluster.append(candidate)
        cluster = sorted(cluster)
        average_similarity = _cluster_average_similarity(cluster, comparison_map)
        if (
            len(cluster) >= options.min_cluster_size
            and average_similarity >= options.min_cluster_average_similarity
        ):
            refined.append(cluster)
            remaining.difference_update(cluster)
        else:
            remaining.remove(seed)
    return refined


def _mean_similarity(
    number: int,
    others: set[int],
    comparison_map: Mapping[tuple[int, int], _ScopeComparison],
) -> float:
    if not others:
        return 0.0
    return sum(_similarity(number, other, comparison_map) for other in others) / len(others)


def _cluster_average_similarity(
    members: Sequence[int],
    comparison_map: Mapping[tuple[int, int], _ScopeComparison],
) -> float:
    if len(members) < 2:
        return 0.0
    total = 0.0
    comparisons = 0
    ordered = sorted(members)
    for index, left in enumerate(ordered):
        for right in ordered[index + 1 :]:
            total += _similarity(left, right, comparison_map)
            comparisons += 1
    if comparisons == 0:
        return 0.0
    return total / comparisons


def _similarity(
    left: int,
    right: int,
    comparison_map: Mapping[tuple[int, int], _ScopeComparison],
) -> float:
    entry = comparison_map.get((left, right)) or comparison_map.get((right, left))
    return entry.similarity if entry is not None else 0.0


def _cluster_entry(
    *,
    members: Sequence[int],
    profiles_by_number: Mapping[int, _ScopeProfile],
    comparison_map: Mapping[tuple[int, int], _ScopeComparison],
    feature_idf: Mapping[str, float],
    settings: PrScopeClusterOptions,
) -> PrScopeCluster:
    ordered = sorted(members)
    pairwise: list[PrScopePair] = []
    for index, left in enumerate(ordered):
        for right in ordered[index + 1 :]:
            entry = comparison_map.get((left, right)) or comparison_map.get((right, left))
            if entry is None:
                continue
            pairwise.append(
                PrScopePair(
                    left_pr_number=entry.left,
                    right_pr_number=entry.right,
                    similarity=round(entry.similarity, 3),
                    content_similarity=round(entry.content_similarity, 3),
                    size_similarity=round(entry.size_similarity, 3),
                    breadth_similarity=round(entry.breadth_similarity, 3),
                    concentration_similarity=round(entry.concentration_similarity, 3),
                    shared_filenames=entry.shared_filenames,
                    shared_directories=entry.shared_directories,
                )
            )
    pairwise.sort(
        key=lambda pair: (-pair.similarity, pair.left_pr_number, pair.right_pr_number)
    )
    average_similarity = _cluster_average_similarity(ordered, comparison_map)
    representative_pr_number = _representative_pr_number(ordered, comparison_map)
    shared_filenames = _shared_exact_features(
        ordered,
        profiles_by_number,
        feature_idf,
        prefix="file:",
        limit=settings.max_shared_features,
    )
    shared_directories = _shared_exact_features(
        ordered,
        profiles_by_number,
        feature_idf,
        prefix="dir:",
        limit=settings.max_shared_features,
    )
    return PrScopeCluster(
        cluster_id=f"pr-scope-{ordered[0]}-{len(ordered)}",
        pr_numbers=ordered,
        representative_pr_number=representative_pr_number,
        average_similarity=round(average_similarity, 3),
        summary=_cluster_summary(
            ordered, representative_pr_number, shared_filenames, shared_directories
        ),
        shared_filenames=shared_filenames,
        shared_directories=shared_directories,
        pairwise=pairwise,
    )


def _representative_pr_number(
    members: Sequence[int],
    comparison_map: Mapping[tuple[int, int], _ScopeComparison],
) -> int:
    return max(
        sorted(members),
        key=lambda number: (
            _mean_similarity(number, set(members) - {number}, comparison_map),
            -number,
        ),
    )


def _shared_exact_features(
    members: Sequence[int],
    profiles_by_number: Mapping[int, _ScopeProfile],
    feature_idf: Mapping[str, float],
    *,
    prefix: str,
    limit: int,
) -> list[str]:
    counts: Counter[str] = Counter()
    for number in members:
        profile = profiles_by_number[number]
        values = profile.filenames if prefix == "file:" else profile.directories
        counts.update(values)
    minimum_count = 2 if len(members) > 1 else 1
    shared = [
        value
        for value, count in counts.items()
        if count >= minimum_count and f"{prefix}{value}" in feature_idf
    ]
    if prefix == "dir:":
        shared.sort(
            key=lambda value: (
                -counts[value],
                -value.count("/"),
                -feature_idf.get(f"{prefix}{value}", 0.0),
                value,
            )
        )
    else:
        shared.sort(
            key=lambda value: (-counts[value], -feature_idf.get(f"{prefix}{value}", 0.0), value)
        )
    return shared[:limit]


def _cluster_summary(
    members: Sequence[int],
    representative_pr_number: int,
    shared_filenames: Sequence[str],
    shared_directories: Sequence[str],
) -> str:
    count = len(members)
    if shared_filenames:
        preview = ", ".join(f"`{value}`" for value in shared_filenames[:3])
        return (
            f"{count} open PRs share weighted file overlap around {preview}; "
            f"representative PR #{representative_pr_number}."
        )
    if shared_directories:
        preview = ", ".join(f"`{value}`" for value in shared_directories[:3])
        return (
            f"{count} open PRs cluster in {preview} with similar change breadth; "
            f"representative PR #{representative_pr_number}."
        )
    return (
        f"{count} open PRs have similar weighted scope; "
        f"representative PR #{representative_pr_number}."
    )