"""
Not neat, but this is what I did to make exclusive splits. saving here for now.
"""
## Full pipeline
import pandas as pd
# mmseqs2 clustering output: one (cluster_representative, cluster_member) pair per row.
protein_clusters = pd.read_csv("/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/mmseqs/outputs/fimo_hits_only/protein/mmseqs_cluster.tsv", sep="\t", header=None)
protein_clusters.columns=["tr_cluster_rep","tr_cluster_member"]
protein_clusters.head()  # notebook-style inspection; no effect when run as a script
dna_clusters = pd.read_csv("/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/mmseqs/outputs/fimo_hits_only/dna_full/mmseqs_cluster.tsv", sep="\t", header=None)
dna_clusters.columns=["dna_cluster_rep","dna_cluster_member"]
dna_clusters.head()  # notebook-style inspection
# Full processed FIMO dataset. Columns used below include ID, tr_seqid, dna_seqid,
# tr_sequence, dna_sequence, scores -- presumably one TR/DNA pair per row; confirm schema.
all_data = pd.read_parquet("/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/remap2022_crm_fimo_output_q_processed_seed0.parquet")
all_data
# member -> representative lookup for each modality
protein_cluster_map = dict(zip(protein_clusters["tr_cluster_member"],protein_clusters["tr_cluster_rep"]))
dna_cluster_map = dict(zip(dna_clusters["dna_cluster_member"],dna_clusters["dna_cluster_rep"]))
print(len(protein_cluster_map))
print(len(dna_cluster_map))
# Attach each row's cluster representative; seqids missing from the maps become NaN.
all_data["tr_cluster_rep"] = all_data["tr_seqid"].map(protein_cluster_map)
all_data["dna_cluster_rep"] = all_data["dna_seqid"].map(dna_cluster_map)
# Count rows whose seqid had no cluster assignment (should ideally be 0).
print(len(all_data[all_data["tr_cluster_rep"].isna()]))
print(len(all_data[all_data["dna_cluster_rep"].isna()]))
all_data.head()
### handpick test
# TR cluster representatives hand-picked to form the held-out test split.
handpicked_test_trs = ["trseq23","trseq26","trseq17"]
handpicked_test = all_data.loc[
all_data["tr_cluster_rep"].isin(handpicked_test_trs)
].reset_index(drop=True)
# Any DNA cluster that appears in test is off limits for train/val,
# making the test split exclusive on both the TR side and the DNA side.
off_limits_dna_clusters = handpicked_test["dna_cluster_rep"].unique().tolist()
remaining = all_data.loc[
(~all_data["tr_cluster_rep"].isin(handpicked_test_trs)) &
(~all_data["dna_cluster_rep"].isin(off_limits_dna_clusters))
].reset_index(drop=True)
test_ids = handpicked_test["ID"].unique().tolist()
remaining_ids = remaining["ID"].unique().tolist()
# Rows in neither test nor remaining: sacrificed to enforce exclusivity.
lost_rows = all_data.loc[
(~all_data["ID"].isin(test_ids)) &
(~all_data["ID"].isin(remaining_ids))
]
print(f"Rows in test: {len(handpicked_test)}")
print(f"Rows to be split between train and val: {len(remaining)}")
total_rows = len(handpicked_test) + len(remaining)
print(f"Total rows: {total_rows}. Test percentage: {100*len(handpicked_test)/total_rows:.2f}%")
print(f"Lost rows: {len(lost_rows)}")
### handpick val
# TR cluster representatives hand-picked for validation; same exclusivity
# logic as the test split above, applied to what remained after it.
handpicked_val_trs = ["trseq9", "trseq5", "trseq28"]
handpicked_val = remaining.loc[
remaining["tr_cluster_rep"].isin(handpicked_val_trs)
].reset_index(drop=True)
# DNA clusters seen in val may not appear in train.
off_limits_dna_clusters = handpicked_val["dna_cluster_rep"].unique().tolist()
train_remain = remaining.loc[
(~remaining["tr_cluster_rep"].isin(handpicked_val_trs)) &
(~remaining["dna_cluster_rep"].isin(off_limits_dna_clusters))
].reset_index(drop=True)
val_ids = handpicked_val["ID"].unique().tolist()
train_remain_ids = train_remain["ID"].unique().tolist()
# Rows assigned to no split so far: sacrificed to enforce exclusivity.
lost_rows = all_data.loc[
(~all_data["ID"].isin(test_ids)) &
(~all_data["ID"].isin(val_ids)) &
(~all_data["ID"].isin(train_remain_ids))
]
print(f"Rows in val: {len(handpicked_val)}")
print(f"Rows left for train: {len(train_remain)}")
total_rows = len(handpicked_val) + len(train_remain)
# Fixed label: this reports the VAL split, not the test split (was a copy-paste of the test section).
print(f"Total rows: {total_rows}. Val percentage: {100*len(handpicked_val)/total_rows:.2f}%")
print(f"Lost rows: {len(lost_rows)}")
# Materialize the final, mutually exclusive splits by datapoint ID.
train_exclusive = all_data.loc[
all_data["ID"].isin(train_remain_ids)
].reset_index(drop=True)
val_exclusive = all_data.loc[
all_data["ID"].isin(val_ids)
].reset_index(drop=True)
test_exclusive = all_data.loc[
all_data["ID"].isin(test_ids)
].reset_index(drop=True)
# Everything not captured by any exclusive split (rows sharing a TR or DNA
# cluster across splits) -- kept as a separate "leaky" set rather than discarded.
leaky_test = all_data.loc[
~(all_data["ID"].isin(train_exclusive["ID"].tolist())) &
~(all_data["ID"].isin(val_exclusive["ID"].tolist())) &
~(all_data["ID"].isin(test_exclusive["ID"].tolist()))
].reset_index(drop=True)
print(f"Original total: {len(all_data)}")
retained_total = len(train_exclusive)+len(val_exclusive)+len(test_exclusive)
print(f"New, exclusive total: {retained_total}")
print(f"Lost rows: {len(all_data)-retained_total}")
print(f"Length train: {len(train_exclusive)}/{retained_total} ({100*len(train_exclusive)/retained_total:.2f}%)")
print(f"Length val: {len(val_exclusive)}/{retained_total} ({100*len(val_exclusive)/retained_total:.2f}%)")
print(f"Length test: {len(test_exclusive)}/{retained_total} ({100*len(test_exclusive)/retained_total:.2f}%)")
def check_validity(train_exclusive, val_exclusive, test_exclusive):
    """
    Sanity-check a train/val/test split of the FIMO dataset.

    Asserts that no datapoint ID appears in more than one split (raises
    AssertionError on leakage), then reports -- without asserting -- how many
    TR/DNA sequences and cluster representatives are shared between splits.
    The printed output is identical to the previous hand-unrolled version.

    Args:
        train_exclusive, val_exclusive, test_exclusive: DataFrames with at
            least the columns ID, tr_sequence, tr_cluster_rep, dna_sequence,
            dna_cluster_rep.
    """
    def _unique_sets(col):
        # One set of unique values per split, in (train, val, test) order.
        return tuple(set(df[col].unique().tolist())
                     for df in (train_exclusive, val_exclusive, test_exclusive))

    train_ids, val_ids, test_ids = _unique_sets("ID")
    assert len(train_ids.intersection(val_ids)) == 0
    assert len(train_ids.intersection(test_ids)) == 0
    assert len(val_ids.intersection(test_ids)) == 0
    print("Pass! No overlap in IDs")

    def _report_overlap(label, col):
        # Investigate intersection. No assertions unless we are explicitly
        # splitting on this column.
        train_vals, val_vals, test_vals = _unique_sets(col)
        print(f"Train-Val {label} intersection: {len(train_vals.intersection(val_vals))}")
        print(f"Train-Test {label} intersection: {len(train_vals.intersection(test_vals))}")
        print(f"Val-Test {label} intersection: {len(val_vals.intersection(test_vals))}")

    _report_overlap("TR", "tr_sequence")
    _report_overlap("TR Cluster Rep", "tr_cluster_rep")
    _report_overlap("DNA", "dna_sequence")
    _report_overlap("DNA Cluster Rep", "dna_cluster_rep")
def get_reverse_complement(s):
    """
    Return the reverse complement of *s*, read 5' to 3'.

    Handles upper/lower case A/C/G/T and the ambiguity code N (case preserved).
    Any other character raises KeyError, matching the original lookup behavior.
    """
    complement = {
        "a": "t", "c": "g", "t": "a", "g": "c",
        "A": "T", "C": "G", "T": "A", "G": "C",
        "n": "n", "N": "N",
    }
    # Walk the sequence back-to-front, complementing each base as we go.
    return "".join(complement[base] for base in reversed(s))
# now make reverse complements
def augment_rc(df):
    """
    Append a reverse-complement copy of every row, effectively doubling the dataset.

    The copied rows get an "_rc"-suffixed ID, a reverse-complemented
    dna_sequence, and a scores string whose comma-separated values are
    reversed to match the flipped orientation.
    columns = ["ID","dna_sequence","tr_sequence","tr_cluster_rep","dna_cluster_rep", "scores","split"]
    """
    rc = df.copy(deep=True)
    rc["dna_sequence"] = rc["dna_sequence"].apply(get_reverse_complement)
    rc["ID"] = rc["ID"] + "_rc"
    rc["scores"] = rc["scores"].apply(lambda raw: ",".join(reversed(raw.split(","))))
    return pd.concat([df, rc]).reset_index(drop=True)
def convert_scores(scores, mode=1):
    """
    Collapse a comma-separated score string so only peak positions keep a value.

    Two modes: mode=1 means FIMO peaks (positions at the max score) become 1
    and everything else 0; any other mode keeps the max score at the peaks
    and zeroes everything else. Returns the result as a comma-separated string.
    """
    values = [int(tok) for tok in scores.split(",")]
    peak = max(values)
    fill = 1 if mode == 1 else peak
    collapsed = [fill if v == peak else 0 for v in values]
    # Sanity check: every peak position (and only those) received the fill value.
    assert values.count(peak) == collapsed.count(fill)
    return ",".join(str(v) for v in collapsed)
# Verify exclusivity before augmentation, then double every split with
# reverse-complement copies and re-check.
check_validity(train_exclusive, val_exclusive, test_exclusive)
train_exclusive = augment_rc(train_exclusive)
val_exclusive = augment_rc(val_exclusive)
test_exclusive = augment_rc(test_exclusive)
leaky_test = augment_rc(leaky_test)
print(f"Added reverse complement sequences to train_exclusive, val_exclusive, and test_exclusive (and leaky test_exclusive)")
check_validity(train_exclusive, val_exclusive, test_exclusive)
total = sum([len(train_exclusive), len(val_exclusive), len(test_exclusive), len(leaky_test)])
print(
f"Length of train_exclusive dataset: {len(train_exclusive)} ({100*len(train_exclusive)/total:.2f}%)"
)
print(f"Length of val_exclusive dataset: {len(val_exclusive)} ({100*len(val_exclusive)/total:.2f}%)")
print(f"Length of test_exclusive dataset: {len(test_exclusive)} ({100*len(test_exclusive)/total:.2f}%)")
print(f"Length of leaky_test dataset: {len(leaky_test)} ({100*len(leaky_test)/total:.2f}%)")
print(
f"Total sequences = {total}. Same as edges size*2? {total==len(all_data)*2}"
)
# since we've added all these new DNA sequences, we do need a new mapping of seq id to dna sequence
all_data = pd.concat([train_exclusive, val_exclusive, test_exclusive, leaky_test])
# Assumes ID is "<trseqid>_<dnaseqid...>", so the part after the first "_" is the
# dna seqid; "_rc" IDs then yield a distinct "<dnaseqid>_rc" key -- TODO confirm ID format.
all_data["dna_seqid"] = all_data["ID"].str.split("_", n=1, expand=True)[1]
dna_dict = dict(zip(all_data["dna_seqid"], all_data["dna_sequence"]))
# Exactly one seqid per unique DNA sequence, otherwise the mapping is ambiguous.
assert len(dna_dict) == len(all_data.drop_duplicates(["dna_sequence"]))
# create the output dir
import os
from pathlib import Path
split_out_dir = Path("/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/splits/handpicked_val_test")
os.makedirs(split_out_dir, exist_ok=True)
# add binary_scores to allow other training modes
# NOTE(review): "binary_sores" is a typo for "binary_scores", but it is used
# consistently below and becomes a header in the saved CSVs -- renaming it
# requires updating downstream consumers of these files as well.
train_exclusive["binary_sores"] = train_exclusive["scores"].apply(lambda x: convert_scores(x, mode=1))
val_exclusive["binary_sores"] = val_exclusive["scores"].apply(lambda x: convert_scores(x, mode=1))
test_exclusive["binary_sores"] = test_exclusive["scores"].apply(lambda x: convert_scores(x, mode=1))
leaky_test["binary_sores"] = leaky_test["scores"].apply(lambda x: convert_scores(x, mode=1))
train_exclusive["split"] = ["train"]*len(train_exclusive)
val_exclusive["split"] = ["val"]*len(val_exclusive)
test_exclusive["split"] = ["test"]*len(test_exclusive)
leaky_test["split"] = ["leakytest"]*len(leaky_test)
# select final cols and save
split_final_cols = ["ID", "dna_sequence", "tr_sequence", "scores", "binary_sores", "split"]
train_exclusive[split_final_cols].to_csv(split_out_dir / "train.csv", index=False)
val_exclusive[split_final_cols].to_csv(split_out_dir / "val.csv", index=False)
test_exclusive[split_final_cols].to_csv(split_out_dir / "test.csv", index=False)
leaky_test[split_final_cols].to_csv(split_out_dir / "leakytest.csv", index=False)
print(f"Saved all splits to {split_out_dir}")
# make fixed-size "baby" subsets for quick debugging runs
train_exclusive[split_final_cols].sample(400, random_state=42).to_csv(split_out_dir / "babytrain.csv", index=False)
val_exclusive[split_final_cols].sample(50, random_state=42).to_csv(split_out_dir / "babyval.csv", index=False)
test_exclusive[split_final_cols].sample(50, random_state=42).to_csv(split_out_dir / "babytest.csv", index=False)
leaky_test[split_final_cols].sample(50, random_state=42).to_csv(split_out_dir / "babyleakytest.csv", index=False)