DouDou committed on
Commit
1f571a3
·
verified ·
1 Parent(s): 6ae52c9

Upload data2/step22/ppt.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data2/step22/ppt.py +103 -0
data2/step22/ppt.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
from collections import Counter
from typing import List, Tuple

import jsonlines
import matplotlib.pyplot as plt
# Report the distribution of file-name suffixes found in an alignment
# JSON Lines file: a console table, a bar chart image and a table image.

DEFAULT_ALIGNMENT_PATH = "./output/alignment.jsonl"
NO_SUFFIX_LABEL = "no_suffix"
BAR_CHART_PATH = "suffix.png"
TABLE_IMAGE_PATH = "suffix_table.png"


def extract_suffix(filename: str) -> str:
    """Return the text after the last dot of *filename*'s final component.

    Only the last path component is inspected, so a dot inside a directory
    name (``"pkg/v1.2/Makefile"``) is not mistaken for an extension.

    Args:
        filename: A file name or path.

    Returns:
        The suffix without its leading dot, or ``"no_suffix"`` when the final
        component contains no dot at all.
    """
    basename = os.path.basename(filename)
    _, dot, suffix = basename.rpartition(".")
    return suffix if dot else NO_SUFFIX_LABEL


def count_suffixes(path: str) -> Counter:
    """Tally the suffix of the ``file`` field of every record in *path*.

    Args:
        path: Location of a JSON Lines file; every record is expected to
            carry a ``"file"`` key.

    Returns:
        A ``Counter`` mapping each suffix to its number of occurrences.

    Raises:
        KeyError: If a record has no ``"file"`` key.
    """
    with jsonlines.open(path) as reader:
        return Counter(extract_suffix(record["file"]) for record in reader)


def build_rows(sorted_suffixes: List[Tuple[str, int]],
               total_files: int) -> List[Tuple[str, str, str]]:
    """Format ``(suffix, count)`` pairs as ``(suffix, count, percentage)`` text.

    Args:
        sorted_suffixes: Pairs in the order they should be displayed.
        total_files: Sum of all counts; only used when there is at least one
            pair, so an empty input never divides by zero.

    Returns:
        One tuple of display strings per pair, the percentage rendered with
        one decimal place and a trailing ``%``.
    """
    return [
        (suffix, str(count), f"{(count / total_files) * 100:.1f}%")
        for suffix, count in sorted_suffixes
    ]


def print_table(rows: List[Tuple[str, str, str]]) -> None:
    """Print *rows* to the console, using ``rich`` when it is installed.

    ``rich`` is an optional dependency: only its import is guarded, so a
    missing package selects the plain-text fallback while any other failure
    still surfaces normally.
    """
    try:
        from rich import box
        from rich.console import Console
        from rich.table import Table
    except ImportError:
        # Fallback to standard print.
        print(f"\n{'Language':<15} | {'Count':<10} | {'Percentage':<10}")
        print("-" * 45)
        for suffix, count, percentage in rows:
            print(f"{suffix:<15} | {count:<10} | {percentage}")
        return

    table = Table(title="Language Distribution", box=box.ROUNDED)
    table.add_column("Language (Suffix)", style="cyan", justify="left")
    table.add_column("Count", style="magenta", justify="right")
    table.add_column("Percentage", style="green", justify="right")
    for row in rows:
        table.add_row(*row)
    Console().print(table)


def save_bar_chart(sorted_suffixes: List[Tuple[str, int]],
                   out_path: str = BAR_CHART_PATH) -> None:
    """Save a bar chart of the suffix counts to *out_path*.

    Args:
        sorted_suffixes: Non-empty list of ``(suffix, count)`` pairs.
        out_path: Image file to write.
    """
    labels, values = zip(*sorted_suffixes)

    fig = plt.figure(figsize=(12, 6))
    bars = plt.bar(labels, values, color="skyblue")
    plt.title("Language Distribution", fontsize=16)
    plt.xlabel("Language", fontsize=12)
    plt.ylabel("Count", fontsize=12)
    plt.xticks(rotation=45)

    # Write each bar's value just above it.
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width() / 2.0, height,
                 f"{int(height)}",
                 ha="center", va="bottom")

    plt.tight_layout()
    plt.savefig(out_path)
    # Figures are saved and closed rather than shown, so the script never
    # blocks on a headless machine and does not keep figures alive.
    plt.close(fig)
    print(f"\nBar chart saved to {out_path}")


def save_table_image(rows: List[Tuple[str, str, str]],
                     out_path: str = TABLE_IMAGE_PATH) -> None:
    """Render *rows* as a table image and save it to *out_path*.

    Args:
        rows: Non-empty list of ``(suffix, count, percentage)`` strings.
        out_path: Image file to write.
    """
    # Figure height grows with the number of rows.
    fig = plt.figure(figsize=(10, len(rows) * 0.5 + 2))
    plt.axis("off")

    table_plot = plt.table(
        cellText=[list(row) for row in rows],
        colLabels=["Language", "Count", "Percentage"],
        colColours=["#CCCCFF", "#CCCCFF", "#CCCCFF"],
        loc="center",
        cellLoc="center",
    )
    table_plot.auto_set_font_size(False)
    table_plot.set_fontsize(12)
    table_plot.scale(1.2, 1.5)

    plt.title("Language Distribution Table", fontsize=16, y=1.0)
    # tight_layout() is avoided here because it can cut off parts of the
    # table; bbox_inches='tight' at save time is used instead.
    plt.savefig(out_path, bbox_inches="tight", dpi=300)
    plt.close(fig)
    print(f"Table saved to {out_path}")


def main(file_path: str = DEFAULT_ALIGNMENT_PATH) -> None:
    """Count file-name suffixes in *file_path* and emit all reports.

    When *file_path* does not exist an error line is printed and the report
    proceeds with empty data: the (empty) console table is still printed and
    both images are skipped.
    """
    # Count the distribution of file-name suffixes.
    if os.path.exists(file_path):
        suffix_counter = count_suffixes(file_path)
    else:
        print(f"Error: {file_path} not found.")
        suffix_counter = Counter()

    # Most frequent suffix first, for presentation.
    sorted_suffixes = suffix_counter.most_common()
    total_files = sum(suffix_counter.values())
    rows = build_rows(sorted_suffixes, total_files)

    print_table(rows)

    # Nothing to plot without data.
    if sorted_suffixes:
        save_bar_chart(sorted_suffixes)
        save_table_image(rows)


if __name__ == "__main__":
    main()