| | """ |
| | 关联分析:将repo-level指标与repos_searched元信息join |
| | 生成关联分析图和分组对比图 |
| | """ |
| | import pandas as pd |
| | import numpy as np |
| | from pathlib import Path |
| | import json |
| | import matplotlib |
| | matplotlib.use('Agg') |
| | import matplotlib.pyplot as plt |
| | import matplotlib.font_manager as fm |
| | import seaborn as sns |
| | import time |
| |
|
| | |
# Candidate font families in priority order; the first one that is installed wins.
font_families_to_try = ['Arial', 'DejaVu Sans', 'Liberation Sans', 'sans-serif']
available_fonts = [font.name for font in fm.fontManager.ttflist]
# Lower-cased names so the match below is case-insensitive.
_installed_lower = {name.lower() for name in available_fonts}

for font_family in font_families_to_try:
    if font_family.lower() in _installed_lower:
        font_found = font_family
        break
else:
    # None of the candidates is installed: fall back to 'sans-serif'.
    font_found = 'sans-serif'

# Global matplotlib defaults shared by every figure created in this module.
plt.rcParams.update({
    'font.family': font_found,
    'font.size': 20,
    'axes.labelsize': 28,
    'axes.titlesize': 28,
    'xtick.labelsize': 24,
    'ytick.labelsize': 24,
    'legend.fontsize': 20,
    'figure.titlesize': 32,
    'axes.linewidth': 1.5,
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.grid': True,
    'grid.alpha': 0.3,
    'grid.linewidth': 0.5,
})
| |
|
| | |
# Shared colour palette (hex RGB) looked up by role name in the plotting code.
NATURE_COLORS = dict(
    primary='#2E5090',
    secondary='#1A5490',
    accent='#4A90E2',
    success='#2E7D32',
    warning='#F57C00',
    error='#C62828',
)
| |
|
def apply_nature_style(ax):
    """Apply the shared axes styling to ``ax`` in place.

    Hides the top and right spines, sets the left and bottom spines to a
    1.5 line width, enables a faint dashed grid and sets the tick size.
    """
    for hidden_side in ('top', 'right'):
        ax.spines[hidden_side].set_visible(False)
    for framed_side in ('left', 'bottom'):
        ax.spines[framed_side].set_linewidth(1.5)

    grid_style = {'alpha': 0.3, 'linestyle': '--', 'linewidth': 0.5}
    ax.grid(True, **grid_style)

    tick_style = {'width': 1.5, 'length': 5}
    ax.tick_params(**tick_style)
| |
|
| |
|
| | class JoinInsights: |
| | def __init__(self, repos_searched_csv, repo_level_csv, check_history_csv, output_dir): |
| | self.repos_searched_csv = repos_searched_csv |
| | self.repo_level_csv = repo_level_csv |
| | self.check_history_csv = check_history_csv |
| | self.output_dir = Path(output_dir) |
| | self.output_dir.mkdir(parents=True, exist_ok=True) |
| | |
| | self.df_joined = None |
| | |
| | def load_and_join(self): |
| | """加载数据并join""" |
| | print("Loading data...") |
| | |
| | |
| | df_repo = pd.read_csv(self.repo_level_csv) |
| | df_repo['full_name'] = df_repo['full_name'].fillna( |
| | df_repo['repo_name'].str.replace('___', '/') |
| | ) |
| | |
| | |
| | print("Loading repos_searched.csv...") |
| | df_searched = pd.read_csv( |
| | self.repos_searched_csv, |
| | usecols=['full_name', 'keyword', 'stars', 'forks', 'open_issues', |
| | 'created_at', 'pushed_at', 'language', 'license', 'archived'], |
| | dtype={'stars': 'float64', 'forks': 'float64', 'open_issues': 'float64'} |
| | ) |
| | |
| | |
| | print("Loading repos_check_history.csv...") |
| | df_history = pd.read_csv( |
| | self.check_history_csv, |
| | usecols=['full_name', 'keyword', 'is_relevant'] |
| | ) |
| | |
| | |
| | print("Joining data...") |
| | df_joined = df_repo.merge(df_history, on='full_name', how='left') |
| | df_joined = df_joined.merge(df_searched, on='full_name', how='left', suffixes=('', '_searched')) |
| | |
| | |
| | if 'keyword_searched' in df_joined.columns: |
| | df_joined['keyword'] = df_joined['keyword'].fillna(df_joined['keyword_searched']) |
| | if 'language_searched' in df_joined.columns: |
| | df_joined['language_searched'] = df_joined['language_searched'].fillna(df_joined.get('primary_language', '')) |
| | |
| | |
| | df_joined = df_joined.dropna(subset=['full_name']) |
| | |
| | self.df_joined = df_joined |
| | print(f"Joined data: {len(df_joined)} rows") |
| | |
| | |
| | df_joined.to_csv(self.output_dir / 'joined_data.csv', index=False) |
| | print(f"Saved joined data to {self.output_dir / 'joined_data.csv'}") |
| | |
| | def analyze_correlations(self): |
| | """分析关联性""" |
| | if self.df_joined is None: |
| | self.load_and_join() |
| | |
| | df = self.df_joined.copy() |
| | |
| | |
| | numeric_cols = ['stars', 'forks', 'open_issues', 'total_code_lines', |
| | 'total_tokens', 'total_functions', 'total_files', |
| | 'comment_ratio', 'language_entropy'] |
| | numeric_cols = [c for c in numeric_cols if c in df.columns] |
| | |
| | df_numeric = df[numeric_cols].dropna() |
| | |
| | if len(df_numeric) > 0: |
| | corr_matrix = df_numeric.corr() |
| | |
| | |
| | corr_matrix.to_csv(self.output_dir / 'correlation_matrix.csv') |
| | |
| | |
| | insights = {} |
| | |
| | if 'stars' in df_numeric.columns and 'total_code_lines' in df_numeric.columns: |
| | corr = df_numeric['stars'].corr(df_numeric['total_code_lines']) |
| | insights['stars_vs_loc'] = float(corr) |
| | |
| | if 'stars' in df_numeric.columns and 'total_functions' in df_numeric.columns: |
| | corr = df_numeric['stars'].corr(df_numeric['total_functions']) |
| | insights['stars_vs_functions'] = float(corr) |
| | |
| | if 'stars' in df_numeric.columns and 'comment_ratio' in df_numeric.columns: |
| | corr = df_numeric['stars'].corr(df_numeric['comment_ratio']) |
| | insights['stars_vs_comment_ratio'] = float(corr) |
| | |
| | with open(self.output_dir / 'correlation_insights.json', 'w', encoding='utf-8') as f: |
| | json.dump(insights, f, indent=2) |
| | |
| | print(f"Correlation insights saved") |
| | |
| | def plot_stars_vs_metrics(self): |
| | """绘制stars与多个指标的关系""" |
| | if self.df_joined is None: |
| | self.load_and_join() |
| | |
| | df = self.df_joined.copy() |
| | df = df[df['stars'].notna() & (df['stars'] > 0)] |
| | |
| | if len(df) == 0: |
| | print("No data for stars vs metrics plot") |
| | return |
| | |
| | fig, axes = plt.subplots(2, 2, figsize=(19.2, 10.8)) |
| | |
| | colors_list = [NATURE_COLORS['primary'], NATURE_COLORS['accent'], |
| | NATURE_COLORS['success'], NATURE_COLORS['secondary']] |
| | |
| | |
| | ax = axes[0, 0] |
| | apply_nature_style(ax) |
| | df_plot = df[df['total_code_lines'] > 0] |
| | if len(df_plot) > 0: |
| | ax.scatter(df_plot['total_code_lines'], df_plot['stars'], |
| | alpha=0.4, s=30, color=colors_list[0], edgecolors='white', linewidth=0.5) |
| | ax.set_xscale('log') |
| | ax.set_yscale('log') |
| | ax.set_xlabel('Lines of Code (LOC, log scale)', fontsize=28, fontweight='bold') |
| | ax.set_ylabel('Stars (log scale)', fontsize=28, fontweight='bold') |
| | ax.set_title('Stars vs Lines of Code', fontsize=28, fontweight='bold') |
| | |
| | corr = np.corrcoef(np.log10(df_plot['total_code_lines']), |
| | np.log10(df_plot['stars']))[0, 1] |
| | ax.text(0.05, 0.95, f'r = {corr:.3f}', transform=ax.transAxes, |
| | fontsize=24, fontweight='bold', verticalalignment='top', |
| | bbox=dict(boxstyle='round', facecolor='white', alpha=0.8, |
| | edgecolor=NATURE_COLORS['primary'], linewidth=2)) |
| | |
| | |
| | ax = axes[0, 1] |
| | apply_nature_style(ax) |
| | df_plot = df[df['total_functions'] > 0] |
| | if len(df_plot) > 0: |
| | ax.scatter(df_plot['total_functions'], df_plot['stars'], |
| | alpha=0.4, s=30, color=colors_list[1], edgecolors='white', linewidth=0.5) |
| | ax.set_xscale('log') |
| | ax.set_yscale('log') |
| | ax.set_xlabel('Number of Functions (log scale)', fontsize=28, fontweight='bold') |
| | ax.set_ylabel('Stars (log scale)', fontsize=28, fontweight='bold') |
| | ax.set_title('Stars vs Number of Functions', fontsize=28, fontweight='bold') |
| | |
| | corr = np.corrcoef(np.log10(df_plot['total_functions']), |
| | np.log10(df_plot['stars']))[0, 1] |
| | ax.text(0.05, 0.95, f'r = {corr:.3f}', transform=ax.transAxes, |
| | fontsize=18, fontweight='bold', verticalalignment='top', |
| | bbox=dict(boxstyle='round', facecolor='white', alpha=0.8, |
| | edgecolor=NATURE_COLORS['accent'], linewidth=2)) |
| | |
| | |
| | ax = axes[1, 0] |
| | apply_nature_style(ax) |
| | df_plot = df[df['comment_ratio'].notna() & (df['comment_ratio'] >= 0)] |
| | if len(df_plot) > 0: |
| | ax.scatter(df_plot['comment_ratio'], df_plot['stars'], |
| | alpha=0.4, s=30, color=colors_list[2], edgecolors='white', linewidth=0.5) |
| | ax.set_yscale('log') |
| | ax.set_xlabel('Comment Ratio', fontsize=28, fontweight='bold') |
| | ax.set_ylabel('Stars (log scale)', fontsize=28, fontweight='bold') |
| | ax.set_title('Stars vs Comment Ratio', fontsize=28, fontweight='bold') |
| | |
| | corr = df_plot['comment_ratio'].corr(np.log10(df_plot['stars'])) |
| | ax.text(0.05, 0.95, f'r = {corr:.3f}', transform=ax.transAxes, |
| | fontsize=18, fontweight='bold', verticalalignment='top', |
| | bbox=dict(boxstyle='round', facecolor='white', alpha=0.8, |
| | edgecolor=NATURE_COLORS['success'], linewidth=2)) |
| | |
| | |
| | ax = axes[1, 1] |
| | apply_nature_style(ax) |
| | df_plot = df[df['language_entropy'].notna() & (df['language_entropy'] >= 0)] |
| | if len(df_plot) > 0: |
| | ax.scatter(df_plot['language_entropy'], df_plot['stars'], |
| | alpha=0.4, s=30, color=colors_list[3], edgecolors='white', linewidth=0.5) |
| | ax.set_yscale('log') |
| | ax.set_xlabel('Language Diversity (Entropy)', fontsize=28, fontweight='bold') |
| | ax.set_ylabel('Stars (log scale)', fontsize=28, fontweight='bold') |
| | ax.set_title('Stars vs Language Diversity', fontsize=28, fontweight='bold') |
| | |
| | corr = df_plot['language_entropy'].corr(np.log10(df_plot['stars'])) |
| | ax.text(0.05, 0.95, f'r = {corr:.3f}', transform=ax.transAxes, |
| | fontsize=18, fontweight='bold', verticalalignment='top', |
| | bbox=dict(boxstyle='round', facecolor='white', alpha=0.8, |
| | edgecolor=NATURE_COLORS['secondary'], linewidth=2)) |
| | |
| | plt.suptitle('Correlation Analysis: Stars vs Code Metrics (Top 15K Repositories)', |
| | fontsize=32, fontweight='bold', y=0.995) |
| | plt.tight_layout(rect=[0, 0, 1, 0.96]) |
| | |
| | fig_path = self.output_dir / 'fig_insights_stars_vs_metrics.png' |
| | plt.savefig(fig_path, dpi=150, bbox_inches='tight', facecolor='white') |
| | plt.close() |
| | print(f"Saved: {fig_path}") |
| | |
| | def plot_by_keyword_comparison(self): |
| | """按keyword分组对比代码指标""" |
| | if self.df_joined is None: |
| | self.load_and_join() |
| | |
| | df = self.df_joined.copy() |
| | df = df[df['keyword'].notna()] |
| | |
| | |
| | top_keywords = df['keyword'].value_counts().head(15).index |
| | df = df[df['keyword'].isin(top_keywords)] |
| | |
| | if len(df) == 0: |
| | print("No data for keyword comparison") |
| | return |
| | |
| | fig, axes = plt.subplots(2, 2, figsize=(19.2, 10.8)) |
| | |
| | colors_list = [NATURE_COLORS['primary'], NATURE_COLORS['success'], |
| | NATURE_COLORS['warning'], NATURE_COLORS['secondary']] |
| | |
| | |
| | ax = axes[0, 0] |
| | apply_nature_style(ax) |
| | stats = df.groupby('keyword')['total_code_lines'].mean().sort_values(ascending=False) |
| | stats.plot(kind='bar', ax=ax, color=colors_list[0], alpha=0.85, edgecolor='white', linewidth=1.5) |
| | ax.set_title('Average Lines of Code', fontsize=28, fontweight='bold') |
| | ax.set_xlabel('') |
| | ax.set_ylabel('Average LOC', fontsize=28) |
| | ax.tick_params(axis='x', rotation=45, labelsize=24) |
| | ax.tick_params(axis='y', labelsize=24) |
| | |
| | |
| | ax = axes[0, 1] |
| | apply_nature_style(ax) |
| | stats = df.groupby('keyword')['comment_ratio'].mean().sort_values(ascending=False) |
| | stats.plot(kind='bar', ax=ax, color=colors_list[1], alpha=0.85, edgecolor='white', linewidth=1.5) |
| | ax.set_title('Average Comment Ratio', fontsize=28, fontweight='bold') |
| | ax.set_xlabel('') |
| | ax.set_ylabel('Comment Ratio', fontsize=28) |
| | ax.tick_params(axis='x', rotation=45, labelsize=24) |
| | ax.tick_params(axis='y', labelsize=24) |
| | |
| | |
| | ax = axes[1, 0] |
| | apply_nature_style(ax) |
| | if 'stars' in df.columns: |
| | stats = df.groupby('keyword')['stars'].mean().sort_values(ascending=False) |
| | stats.plot(kind='bar', ax=ax, color=colors_list[2], alpha=0.85, edgecolor='white', linewidth=1.5) |
| | ax.set_title('Average Stars', fontsize=28, fontweight='bold') |
| | ax.set_xlabel('') |
| | ax.set_ylabel('Average Stars', fontsize=28) |
| | ax.tick_params(axis='x', rotation=45, labelsize=24) |
| | ax.tick_params(axis='y', labelsize=24) |
| | |
| | |
| | ax = axes[1, 1] |
| | apply_nature_style(ax) |
| | stats = df.groupby('keyword')['language_entropy'].mean().sort_values(ascending=False) |
| | stats.plot(kind='bar', ax=ax, color=colors_list[3], alpha=0.85, edgecolor='white', linewidth=1.5) |
| | ax.set_title('Average Language Diversity', fontsize=28, fontweight='bold') |
| | ax.set_xlabel('') |
| | ax.set_ylabel('Language Entropy', fontsize=28) |
| | ax.tick_params(axis='x', rotation=45, labelsize=24) |
| | ax.tick_params(axis='y', labelsize=24) |
| | |
| | plt.suptitle('Code Metrics Comparison by Keyword (Top 15K Repositories)', |
| | fontsize=32, fontweight='bold', y=0.995) |
| | plt.tight_layout(rect=[0, 0, 1, 0.96]) |
| | |
| | fig_path = self.output_dir / 'fig_insights_by_keyword.png' |
| | plt.savefig(fig_path, dpi=150, bbox_inches='tight', facecolor='white') |
| | plt.close() |
| | print(f"Saved: {fig_path}") |
| | |
| | def plot_archived_vs_active(self): |
| | """对比archived与active仓库的代码特征""" |
| | if self.df_joined is None: |
| | self.load_and_join() |
| | |
| | df = self.df_joined.copy() |
| | |
| | if 'archived' not in df.columns: |
| | print("No archived column in data") |
| | return |
| | |
| | df['is_archived'] = df['archived'].fillna(False) |
| | |
| | fig, axes = plt.subplots(2, 2, figsize=(19.2, 10.8)) |
| | |
| | |
| | ax = axes[0, 0] |
| | apply_nature_style(ax) |
| | df_plot = df[df['total_code_lines'] > 0] |
| | if len(df_plot) > 0: |
| | bp = df_plot.boxplot(column='total_code_lines', by='is_archived', ax=ax, |
| | widths=0.6, patch_artist=True, |
| | boxprops=dict(facecolor=NATURE_COLORS['primary'], alpha=0.7, linewidth=2), |
| | medianprops=dict(color='white', linewidth=3), |
| | whiskerprops=dict(linewidth=2), |
| | capprops=dict(linewidth=2)) |
| | ax.set_title('Lines of Code: Archived vs Active', fontsize=28, fontweight='bold') |
| | ax.set_xlabel('') |
| | ax.set_ylabel('Lines of Code', fontsize=28) |
| | ax.set_yscale('log') |
| | ax.set_xticklabels(['Active', 'Archived'], fontsize=24) |
| | plt.setp(ax.xaxis.get_majorticklabels(), rotation=0) |
| | |
| | |
| | ax = axes[0, 1] |
| | apply_nature_style(ax) |
| | df_plot = df[df['comment_ratio'].notna()] |
| | if len(df_plot) > 0: |
| | bp = df_plot.boxplot(column='comment_ratio', by='is_archived', ax=ax, |
| | widths=0.6, patch_artist=True, |
| | boxprops=dict(facecolor=NATURE_COLORS['success'], alpha=0.7, linewidth=2), |
| | medianprops=dict(color='white', linewidth=3), |
| | whiskerprops=dict(linewidth=2), |
| | capprops=dict(linewidth=2)) |
| | ax.set_title('Comment Ratio: Archived vs Active', fontsize=28, fontweight='bold') |
| | ax.set_xlabel('') |
| | ax.set_ylabel('Comment Ratio', fontsize=28) |
| | ax.set_xticklabels(['Active', 'Archived'], fontsize=24) |
| | plt.setp(ax.xaxis.get_majorticklabels(), rotation=0) |
| | |
| | |
| | ax = axes[1, 0] |
| | apply_nature_style(ax) |
| | df_plot = df[df['total_functions'] > 0] |
| | if len(df_plot) > 0: |
| | bp = df_plot.boxplot(column='total_functions', by='is_archived', ax=ax, |
| | widths=0.6, patch_artist=True, |
| | boxprops=dict(facecolor=NATURE_COLORS['accent'], alpha=0.7, linewidth=2), |
| | medianprops=dict(color='white', linewidth=3), |
| | whiskerprops=dict(linewidth=2), |
| | capprops=dict(linewidth=2)) |
| | ax.set_title('Number of Functions: Archived vs Active', fontsize=28, fontweight='bold') |
| | ax.set_xlabel('') |
| | ax.set_ylabel('Number of Functions', fontsize=28) |
| | ax.set_yscale('log') |
| | ax.set_xticklabels(['Active', 'Archived'], fontsize=24) |
| | plt.setp(ax.xaxis.get_majorticklabels(), rotation=0) |
| | |
| | |
| | ax = axes[1, 1] |
| | apply_nature_style(ax) |
| | df_plot = df[df['total_files'] > 0] |
| | if len(df_plot) > 0: |
| | bp = df_plot.boxplot(column='total_files', by='is_archived', ax=ax, |
| | widths=0.6, patch_artist=True, |
| | boxprops=dict(facecolor=NATURE_COLORS['secondary'], alpha=0.7, linewidth=2), |
| | medianprops=dict(color='white', linewidth=3), |
| | whiskerprops=dict(linewidth=2), |
| | capprops=dict(linewidth=2)) |
| | ax.set_title('Number of Files: Archived vs Active', fontsize=28, fontweight='bold') |
| | ax.set_xlabel('') |
| | ax.set_ylabel('Number of Files', fontsize=28) |
| | ax.set_yscale('log') |
| | ax.set_xticklabels(['Active', 'Archived'], fontsize=24) |
| | plt.setp(ax.xaxis.get_majorticklabels(), rotation=0) |
| | |
| | plt.suptitle('Code Characteristics Comparison: Archived vs Active (Top 15K Repositories)', |
| | fontsize=32, fontweight='bold', y=0.995) |
| | plt.tight_layout(rect=[0, 0, 1, 0.96]) |
| | |
| | fig_path = self.output_dir / 'fig_insights_archived_vs_active.png' |
| | plt.savefig(fig_path, dpi=150, bbox_inches='tight', facecolor='white') |
| | plt.close() |
| | print(f"Saved: {fig_path}") |
| | |
| | def run(self): |
| | """执行完整分析""" |
| | print("=" * 80) |
| | print("关联分析与洞察") |
| | print("=" * 80) |
| | |
| | self.load_and_join() |
| | self.analyze_correlations() |
| | self.plot_stars_vs_metrics() |
| | self.plot_by_keyword_comparison() |
| | self.plot_archived_vs_active() |
| | |
| | print(f"\n关联分析完成!结果保存在: {self.output_dir}") |
| |
|
| |
|
if __name__ == "__main__":
    # Input and output locations for this run, passed to JoinInsights by keyword.
    job_kwargs = {
        'repos_searched_csv': "/home/weifengsun/tangou1/domain_code/src/workdir/repos_searched.csv",
        'repo_level_csv': "/home/weifengsun/tangou1/domain_code/src/workdir/reporting/code_stats/repo_level_metrics_top15000.csv",
        'check_history_csv': "/home/weifengsun/tangou1/domain_code/src/workdir/repos_check_history.csv",
        'output_dir': "/home/weifengsun/tangou1/domain_code/src/workdir/reporting/insights",
    }

    insights = JoinInsights(**job_kwargs)
    insights.run()
| |
|
| |
|