# DPACMAN / dpacman / data_tasks / split / remap_handpick.py
# Author: svincoff — commit 9da03b7 ("added dropout and overfit prevention")
"""
Not neat, but this is what I did to make exclusive splits. saving here for now.
"""
## Full pipeline
import pandas as pd

# Load mmseqs cluster assignments: one (representative, member) pair per row.
protein_clusters = pd.read_csv(
    "/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/mmseqs/outputs/fimo_hits_only/protein/mmseqs_cluster.tsv",
    sep="\t",
    header=None,
    names=["tr_cluster_rep", "tr_cluster_member"],
)
protein_clusters.head()
dna_clusters = pd.read_csv(
    "/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/mmseqs/outputs/fimo_hits_only/dna_full/mmseqs_cluster.tsv",
    sep="\t",
    header=None,
    names=["dna_cluster_rep", "dna_cluster_member"],
)
dna_clusters.head()
# Full edge table: one row per (TR, DNA) FIMO hit.
all_data = pd.read_parquet("/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/remap2022_crm_fimo_output_q_processed_seed0.parquet")
all_data
# member -> cluster-representative lookup tables
protein_cluster_map = {
    member: rep
    for member, rep in zip(protein_clusters["tr_cluster_member"], protein_clusters["tr_cluster_rep"])
}
dna_cluster_map = {
    member: rep
    for member, rep in zip(dna_clusters["dna_cluster_member"], dna_clusters["dna_cluster_rep"])
}
print(len(protein_cluster_map))
print(len(dna_cluster_map))
# Annotate every edge with the cluster representative of each endpoint.
all_data["tr_cluster_rep"] = all_data["tr_seqid"].map(protein_cluster_map)
all_data["dna_cluster_rep"] = all_data["dna_seqid"].map(dna_cluster_map)
# Rows whose seqid had no cluster assignment (should be 0 for both).
print(int(all_data["tr_cluster_rep"].isna().sum()))
print(int(all_data["dna_cluster_rep"].isna().sum()))
all_data.head()
### handpick test
# TR clusters reserved exclusively for the test split.
handpicked_test_trs = ["trseq23","trseq26","trseq17"]
test_tr_mask = all_data["tr_cluster_rep"].isin(handpicked_test_trs)
handpicked_test = all_data.loc[test_tr_mask].reset_index(drop=True)
# Any DNA cluster touched by test is off limits for the remaining rows.
off_limits_dna_clusters = handpicked_test["dna_cluster_rep"].unique().tolist()
keep_mask = ~test_tr_mask & ~all_data["dna_cluster_rep"].isin(off_limits_dna_clusters)
remaining = all_data.loc[keep_mask].reset_index(drop=True)
test_ids = handpicked_test["ID"].unique().tolist()
remaining_ids = remaining["ID"].unique().tolist()
# Rows dropped entirely: non-test TR but test-contaminated DNA cluster.
lost_rows = all_data.loc[
    ~all_data["ID"].isin(test_ids) & ~all_data["ID"].isin(remaining_ids)
]
print(f"Rows in test: {len(handpicked_test)}")
print(f"Rows to be split between train and val: {len(remaining)}")
total_rows = len(handpicked_test) + len(remaining)
print(f"Total rows: {total_rows}. Test percentage: {100*len(handpicked_test)/total_rows:.2f}%")
print(f"Lost rows: {len(lost_rows)}")
### handpick val
# TR clusters reserved exclusively for the validation split.
handpicked_val_trs = ["trseq9", "trseq5", "trseq28"]
handpicked_val = remaining.loc[
    remaining["tr_cluster_rep"].isin(handpicked_val_trs)
].reset_index(drop=True)
# Any DNA cluster that appears in val may not appear in train.
off_limits_dna_clusters = handpicked_val["dna_cluster_rep"].unique().tolist()
train_remain = remaining.loc[
    (~remaining["tr_cluster_rep"].isin(handpicked_val_trs)) &
    (~remaining["dna_cluster_rep"].isin(off_limits_dna_clusters))
].reset_index(drop=True)
val_ids = handpicked_val["ID"].unique().tolist()
train_remain_ids = train_remain["ID"].unique().tolist()
# Rows assigned to no split at all (dropped to keep splits exclusive).
lost_rows = all_data.loc[
    (~all_data["ID"].isin(test_ids)) &
    (~all_data["ID"].isin(val_ids)) &
    (~all_data["ID"].isin(train_remain_ids))
]
print(f"Rows in val: {len(handpicked_val)}")
print(f"Rows left for train: {len(train_remain)}")
total_rows = len(handpicked_val) + len(train_remain)
# BUGFIX: this line previously printed "Test percentage" for the val split.
print(f"Total rows: {total_rows}. Val percentage: {100*len(handpicked_val)/total_rows:.2f}%")
print(f"Lost rows: {len(lost_rows)}")
# Rebuild each split from all_data by ID so every split carries the full
# original set of columns.
train_exclusive = all_data.loc[all_data["ID"].isin(train_remain_ids)].reset_index(drop=True)
val_exclusive = all_data.loc[all_data["ID"].isin(val_ids)].reset_index(drop=True)
test_exclusive = all_data.loc[all_data["ID"].isin(test_ids)].reset_index(drop=True)
# Everything not captured by an exclusive split becomes the "leaky" test set.
assigned_ids = (
    set(train_exclusive["ID"].tolist())
    | set(val_exclusive["ID"].tolist())
    | set(test_exclusive["ID"].tolist())
)
leaky_test = all_data.loc[~all_data["ID"].isin(assigned_ids)].reset_index(drop=True)
print(f"Original total: {len(all_data)}")
retained_total = len(train_exclusive) + len(val_exclusive) + len(test_exclusive)
print(f"New, exclusive total: {retained_total}")
print(f"Lost rows: {len(all_data)-retained_total}")
print(f"Length train: {len(train_exclusive)}/{retained_total} ({100*len(train_exclusive)/retained_total:.2f}%)")
print(f"Length val: {len(val_exclusive)}/{retained_total} ({100*len(val_exclusive)/retained_total:.2f}%)")
print(f"Length test: {len(test_exclusive)}/{retained_total} ({100*len(test_exclusive)/retained_total:.2f}%)")
def check_validity(train_exclusive, val_exclusive, test_exclusive):
    """
    Sanity-check the three splits.

    Asserts that no datapoint ID appears in more than one split, then reports
    (without asserting) how many TR/DNA sequences and cluster representatives
    are shared between each pair of splits.
    """
    named = [
        ("Train", train_exclusive),
        ("Val", val_exclusive),
        ("Test", test_exclusive),
    ]
    pairs = [(0, 1), (0, 2), (1, 2)]

    def uniques(col):
        # One set of unique values per split, in (Train, Val, Test) order.
        return [set(df[col].unique().tolist()) for _, df in named]

    # Hard requirement: splits share no IDs at all.
    id_sets = uniques("ID")
    for i, j in pairs:
        assert len(id_sets[i] & id_sets[j]) == 0
    print("Pass! No overlap in IDs")

    # Investigate TR/DNA intersections. No assertions unless we are explicitly
    # splitting on these columns.
    for col, label in [
        ("tr_sequence", "TR"),
        ("tr_cluster_rep", "TR Cluster Rep"),
        ("dna_sequence", "DNA"),
        ("dna_cluster_rep", "DNA Cluster Rep"),
    ]:
        value_sets = uniques(col)
        for i, j in pairs:
            print(f"{named[i][0]}-{named[j][0]} {label} intersection: {len(value_sets[i] & value_sets[j])}")
def get_reverse_complement(s):
    """
    Return the 5'->3' reverse complement of DNA sequence *s*.

    Case is preserved (a<->t, c<->g in both cases) and 'n'/'N' map to
    themselves. Any other character raises KeyError (deliberately strict, so
    malformed sequences fail loudly).

    >>> get_reverse_complement("aacg")
    'cgtt'
    """
    rev_map = {
        "a": "t",
        "c": "g",
        "t": "a",
        "g": "c",
        "A": "T",
        "C": "G",
        "T": "A",
        "G": "C",
        "n": "n",
        "N": "N",
    }
    # Complement while walking the string backwards: one pass, no temp lists.
    return "".join(rev_map[c] for c in reversed(s))
# now make reverse complements
def augment_rc(df):
    """
    Double the dataset by appending a reverse-complement copy of every row.

    Augmented rows get "_rc" appended to their ID, their DNA sequence is
    reverse-complemented, and the comma-separated score vector is flipped so
    scores still line up with the new orientation.
    columns = ["ID","dna_sequence","tr_sequence","tr_cluster_rep","dna_cluster_rep", "scores","split"]
    """
    rc_rows = df.copy(deep=True)
    rc_rows["dna_sequence"] = rc_rows["dna_sequence"].map(get_reverse_complement)
    rc_rows["ID"] = rc_rows["ID"].map(lambda row_id: row_id + "_rc")
    rc_rows["scores"] = rc_rows["scores"].map(
        lambda s: ",".join(reversed(s.split(",")))
    )
    return pd.concat([df, rc_rows], ignore_index=True)
def convert_scores(scores, mode=1):
    """
    Sparsify a comma-separated score string so only the maximum-score
    positions (FIMO peaks) stay nonzero.

    Args:
        scores: comma-separated integer scores, e.g. "0,3,3,1".
        mode: 1 -> peak positions become 1 (binary mask);
              0 -> peak positions keep their max score.

    Returns:
        Comma-separated string of the same length with all non-peak
        positions zeroed out.

    Raises:
        ValueError: if mode is not 0 or 1 (previously any mode != 1 was
            silently treated as mode 0).
    """
    if mode not in (0, 1):
        raise ValueError(f"mode must be 0 or 1, got {mode!r}")
    svec = [int(x) for x in scores.split(",")]
    max_score = max(svec)
    peak_value = 1 if mode == 1 else max_score
    binary_svec = [peak_value if x == max_score else 0 for x in svec]
    # Sanity check: every max position (and only those) was marked.
    assert svec.count(max_score) == binary_svec.count(peak_value)
    return ",".join(str(x) for x in binary_svec)
check_validity(train_exclusive, val_exclusive, test_exclusive)
# Augment every split with reverse-complement copies (doubles each split).
train_exclusive = augment_rc(train_exclusive)
val_exclusive = augment_rc(val_exclusive)
test_exclusive = augment_rc(test_exclusive)
leaky_test = augment_rc(leaky_test)
print(f"Added reverse complement sequences to train_exclusive, val_exclusive, and test_exclusive (and leaky test_exclusive)")
# ID exclusivity must survive augmentation.
check_validity(train_exclusive, val_exclusive, test_exclusive)
total = len(train_exclusive) + len(val_exclusive) + len(test_exclusive) + len(leaky_test)
print(f"Length of train_exclusive dataset: {len(train_exclusive)} ({100*len(train_exclusive)/total:.2f}%)")
print(f"Length of val_exclusive dataset: {len(val_exclusive)} ({100*len(val_exclusive)/total:.2f}%)")
print(f"Length of test_exclusive dataset: {len(test_exclusive)} ({100*len(test_exclusive)/total:.2f}%)")
print(f"Length of leaky_test dataset: {len(leaky_test)} ({100*len(leaky_test)/total:.2f}%)")
print(f"Total sequences = {total}. Same as edges size*2? {total==len(all_data)*2}")
# since we've added all these new DNA sequences, we do need a new mapping of seq id to dna sequence
all_data = pd.concat([train_exclusive, val_exclusive, test_exclusive, leaky_test])
# Everything after the first "_" in the ID is the DNA seqid (keeps the "_rc"
# suffix on augmented rows, giving them distinct seqids).
all_data["dna_seqid"] = all_data["ID"].str.split("_", n=1).str[1]
dna_dict = {seqid: seq for seqid, seq in zip(all_data["dna_seqid"], all_data["dna_sequence"])}
# Every distinct dna_sequence should map to exactly one seqid.
assert len(dna_dict) == len(all_data.drop_duplicates(["dna_sequence"]))
# create the output dir
import os
from pathlib import Path

split_out_dir = Path("/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/splits/handpicked_val_test")
split_out_dir.mkdir(parents=True, exist_ok=True)
# add binary scores to allow other training modes
# NOTE(review): "binary_sores" is a typo for "binary_scores", but it is kept
# as-is because the CSV headers written below (and any downstream readers)
# use this exact column name.
for _split_df in (train_exclusive, val_exclusive, test_exclusive, leaky_test):
    _split_df["binary_sores"] = _split_df["scores"].apply(lambda s: convert_scores(s, mode=1))
# Tag each frame with its split name.
train_exclusive["split"] = "train"
val_exclusive["split"] = "val"
test_exclusive["split"] = "test"
leaky_test["split"] = "leakytest"
# select final columns and save each split
split_final_cols = ["ID", "dna_sequence", "tr_sequence", "scores", "binary_sores", "split"]
for fname, split_df in [
    ("train.csv", train_exclusive),
    ("val.csv", val_exclusive),
    ("test.csv", test_exclusive),
    ("leakytest.csv", leaky_test),
]:
    split_df[split_final_cols].to_csv(split_out_dir / fname, index=False)
print(f"Saved all splits to {split_out_dir}")
# make baby versions too
# (fixed random_state so the baby subsets are reproducible across runs)
for fname, split_df, n_rows in [
    ("babytrain.csv", train_exclusive, 400),
    ("babyval.csv", val_exclusive, 50),
    ("babytest.csv", test_exclusive, 50),
    ("babyleakytest.csv", leaky_test, 50),
]:
    split_df[split_final_cols].sample(n_rows, random_state=42).to_csv(split_out_dir / fname, index=False)