Bachstelze committed on
Commit
b2a4d6f
·
1 Parent(s): 8ed5965

add deduplication script

Browse files
Datasets_all/AimoScore_dedublication.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import pandas as pd
3
+ from openpyxl.utils import get_column_letter
4
+ from openpyxl import load_workbook
5
+ from openpyxl.workbook import Workbook
6
+
# Remove every row whose AimoScore value occurs more than once, write the
# surviving rows to an Excel workbook, and write CSV reports describing the
# rows that were removed.

# Legacy .xls workbook; only the old pd.read_excel loader referenced it, so it
# is currently unused. Presumably SCORE_PICKLE was exported from it -- confirm.
INPUT_FILE = "AimoScore_WeakLink_big_scores.xls"
SCORE_PICKLE = "AimoScore_WeakLink_big_scores.pkl"
OUTPUT_FILE = "AimoScore_deduped.xlsx"
REMOVED_REPORT = "AimoScore_removed_report.csv"
REMOVED_AGG_REPORT = "AimoScore_removed_agg.csv"

REQUIRED_COLUMNS = {"AimoScore", "EstimatedScore"}
# A removed row is flagged when |EstimatedScore - AimoScore| exceeds this.
DIFF_THRESHOLD = 0.1
# Bounds (in openpyxl width units) applied when auto-sizing columns.
MIN_COLUMN_WIDTH = 15
MAX_COLUMN_WIDTH = 60


def load_scores(path=SCORE_PICKLE, sheet_name="Sheet1"):
    """Return the ``sheet_name`` entry of the mapping pickled at ``path``.

    The pickle is expected to hold a mapping from sheet name to DataFrame
    (presumably the result of ``pd.read_excel(..., sheet_name=None)``).

    Raises:
        KeyError: if the mapping has no ``sheet_name`` entry.
    """
    # SECURITY: pickle.load executes arbitrary code embedded in the file.
    # Only use this with a pickle you created yourself.
    with open(path, "rb") as handle:
        sheets = pickle.load(handle)
    return sheets[sheet_name]


def split_duplicates(df):
    """Split ``df`` into rows with a repeated AimoScore and all other rows.

    Every occurrence of a repeated AimoScore is removed, not just the extra
    copies. ``value_counts`` skips NaN by default, so rows with a missing
    AimoScore are never treated as duplicates and stay in the kept frame.

    Returns:
        A tuple ``(removed_rows, kept_rows, duplicated_values)``:
        ``removed_rows`` has a fresh 0..n-1 index, ``kept_rows`` keeps the
        original index, and ``duplicated_values`` is the set of AimoScore
        values that occurred more than once.

    Raises:
        ValueError: if ``df`` lacks any of ``REQUIRED_COLUMNS``.
    """
    required_cols = REQUIRED_COLUMNS
    if not required_cols.issubset(df.columns):
        raise ValueError(f"Input file must contain columns: {required_cols}. Found: {df.columns.tolist()}")

    occurrences = df["AimoScore"].value_counts()
    duplicated_values = set(occurrences[occurrences > 1].index)
    is_duplicate = df["AimoScore"].isin(duplicated_values)

    removed_rows = df[is_duplicate].copy().reset_index(drop=True)
    kept_rows = df[~is_duplicate].copy()
    return removed_rows, kept_rows, duplicated_values


def annotate_removed(removed_rows, threshold=DIFF_THRESHOLD):
    """Return a copy of ``removed_rows`` with ``Diff`` and ``Threshold`` columns.

    ``Diff`` is ``EstimatedScore - AimoScore`` (an absolute, not relative,
    difference). ``Threshold`` is True where ``abs(Diff) > threshold`` and
    False otherwise, including where ``Diff`` is missing.
    """
    diff = removed_rows["EstimatedScore"] - removed_rows["AimoScore"]
    # Comparisons against NaN are False; fillna covers nullable dtypes too.
    exceeds = diff.abs().gt(threshold).fillna(False).astype(bool)
    return removed_rows.assign(Diff=diff, Threshold=exceeds)


def summarize_removed(removed_rows):
    """Aggregate annotated removed rows per duplicated AimoScore value.

    Expects the ``Diff`` column added by ``annotate_removed``. Returns one row
    per AimoScore with the number of removed rows, the mean and median
    EstimatedScore, and the mean Diff.
    """
    return removed_rows.groupby("AimoScore").agg(
        count_removed=("AimoScore", "size"),
        mean_estimated=("EstimatedScore", "mean"),
        median_estimated=("EstimatedScore", "median"),
        mean_diff=("Diff", "mean"),
    ).reset_index()


def autosize_columns(path, columns, min_width=MIN_COLUMN_WIDTH, max_width=MAX_COLUMN_WIDTH):
    """Resize the active sheet's columns in the workbook at ``path`` in place.

    Each column gets the length of its longest cell text (or of its header
    name, whichever is longer) plus 2 characters of padding, clamped to
    ``[min_width, max_width]``.
    """
    workbook = load_workbook(path)
    sheet = workbook.active
    for position, header in enumerate(columns, start=1):
        letter = get_column_letter(position)
        longest_cell = max(
            (len(str(cell.value)) for cell in sheet[letter] if cell.value is not None),
            default=0,
        )
        longest = max(len(str(header)), longest_cell)
        sheet.column_dimensions[letter].width = min(max(longest + 2, min_width), max_width)
    workbook.save(path)


def main():
    """Run the de-duplication and write the workbook plus both CSV reports."""
    df = load_scores()
    removed_rows, kept_rows, duplicated_values = split_duplicates(df)
    removed_rows = annotate_removed(removed_rows)
    agg = summarize_removed(removed_rows)

    # pandas creates the workbook; openpyxl then reopens it to set the widths.
    kept_rows.to_excel(OUTPUT_FILE, index=False, engine="openpyxl")
    autosize_columns(OUTPUT_FILE, kept_rows.columns)

    # NOTE(review): an earlier revision first wrote only the EstimatedScore,
    # Diff and Threshold columns to REMOVED_REPORT and then overwrote that
    # file with the full frame. Only the full frame ever survived, so it is
    # the only one written here. Confirm which report is actually wanted.
    removed_rows.to_csv(REMOVED_REPORT, index=False)
    agg.to_csv(REMOVED_AGG_REPORT, index=False)

    print(f"Original rows: {len(df)}")
    print(f"Removed rows: {len(removed_rows)} (duplicated AimoScore values: {len(duplicated_values)})")
    print(f"Kept rows: {len(kept_rows)}")


if __name__ == "__main__":
    main()