| """ |
| Not neat, but this is what I did to make exclusive splits. saving here for now. |
| """ |
|
|
| |
import pandas as pd

# mmseqs2 cluster assignments for the protein (TR) sequences:
# two-column TSV of (cluster representative, cluster member), no header row.
protein_clusters = pd.read_csv("/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/mmseqs/outputs/fimo_hits_only/protein/mmseqs_cluster.tsv", sep="\t", header=None)
protein_clusters.columns=["tr_cluster_rep","tr_cluster_member"]
protein_clusters.head()  # notebook-style peek; no effect when run as a script

# mmseqs2 cluster assignments for the full DNA sequences, same layout.
dna_clusters = pd.read_csv("/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/mmseqs/outputs/fimo_hits_only/dna_full/mmseqs_cluster.tsv", sep="\t", header=None)
dna_clusters.columns=["dna_cluster_rep","dna_cluster_member"]
dna_clusters.head()

# Full processed FIMO dataset: one row per TR-DNA interaction, keyed by "ID".
all_data = pd.read_parquet("/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/remap2022_crm_fimo_output_q_processed_seed0.parquet")
all_data

# Build member -> representative lookup tables for both modalities.
protein_cluster_map = dict(zip(protein_clusters["tr_cluster_member"],protein_clusters["tr_cluster_rep"]))
dna_cluster_map = dict(zip(dna_clusters["dna_cluster_member"],dna_clusters["dna_cluster_rep"]))
print(len(protein_cluster_map))
print(len(dna_cluster_map))
# Attach each row's cluster representative. Series.map leaves NaN for any
# seqid missing from the lookup, so the two counts below are a sanity check
# that every row was successfully assigned to a cluster (expected: 0 and 0).
all_data["tr_cluster_rep"] = all_data["tr_seqid"].map(protein_cluster_map)
all_data["dna_cluster_rep"] = all_data["dna_seqid"].map(dna_cluster_map)
print(len(all_data[all_data["tr_cluster_rep"].isna()]))
print(len(all_data[all_data["dna_cluster_rep"].isna()]))
all_data.head()
|
|
|
|
| |
# --- Test split: hand-picked TR clusters --------------------------------------
# Every row whose TR cluster representative is one of these goes to test.
handpicked_test_trs = ["trseq23","trseq26","trseq17"]
handpicked_test = all_data.loc[
    all_data["tr_cluster_rep"].isin(handpicked_test_trs)
].reset_index(drop=True)

# To keep the split exclusive on BOTH modalities, any DNA cluster that appears
# in the test set is also barred from train/val. Rows pairing a non-test TR
# with a test-set DNA cluster therefore end up in neither frame; they are
# counted below as "lost rows".
off_limits_dna_clusters = handpicked_test["dna_cluster_rep"].unique().tolist()
remaining = all_data.loc[
    (~all_data["tr_cluster_rep"].isin(handpicked_test_trs)) &
    (~all_data["dna_cluster_rep"].isin(off_limits_dna_clusters))
].reset_index(drop=True)

# Accounting: rows in neither `handpicked_test` nor `remaining` were
# sacrificed to achieve exclusivity.
test_ids = handpicked_test["ID"].unique().tolist()
remaining_ids = remaining["ID"].unique().tolist()
lost_rows = all_data.loc[
    (~all_data["ID"].isin(test_ids)) &
    (~all_data["ID"].isin(remaining_ids))
]
print(f"Rows in test: {len(handpicked_test)}")
print(f"Rows to be split between train and val: {len(remaining)}")
total_rows = len(handpicked_test) + len(remaining)
print(f"Total rows: {total_rows}. Test percentage: {100*len(handpicked_test)/total_rows:.2f}%")
print(f"Lost rows: {len(lost_rows)}")
|
|
| |
# --- Val split: hand-picked TR clusters, drawn from `remaining` ---------------
handpicked_val_trs = ["trseq9", "trseq5", "trseq28"]

handpicked_val = remaining.loc[
    remaining["tr_cluster_rep"].isin(handpicked_val_trs)
].reset_index(drop=True)

# Same exclusivity rule as for the test split: DNA clusters used by val are
# barred from train, dropping rows that mix a train TR with a val DNA cluster.
off_limits_dna_clusters = handpicked_val["dna_cluster_rep"].unique().tolist()
train_remain = remaining.loc[
    (~remaining["tr_cluster_rep"].isin(handpicked_val_trs)) &
    (~remaining["dna_cluster_rep"].isin(off_limits_dna_clusters))
].reset_index(drop=True)

# Accounting relative to the ORIGINAL dataset: anything in neither test, val,
# nor the train remainder was lost to the exclusivity filtering.
val_ids = handpicked_val["ID"].unique().tolist()
train_remain_ids = train_remain["ID"].unique().tolist()
lost_rows = all_data.loc[
    (~all_data["ID"].isin(test_ids)) &
    (~all_data["ID"].isin(val_ids)) &
    (~all_data["ID"].isin(train_remain_ids))
]
print(f"Rows in val: {len(handpicked_val)}")
print(f"Rows left for train: {len(train_remain)}")
total_rows = len(handpicked_val) + len(train_remain)
# Fixed label: this ratio is the VAL fraction (len(handpicked_val)/total_rows),
# but the original print called it "Test percentage".
print(f"Total rows: {total_rows}. Val percentage: {100*len(handpicked_val)/total_rows:.2f}%")
print(f"Lost rows: {len(lost_rows)}")
|
|
# --- Materialize the three mutually exclusive splits by interaction ID --------
train_exclusive = all_data.loc[
    all_data["ID"].isin(train_remain_ids)
].reset_index(drop=True)

val_exclusive = all_data.loc[
    all_data["ID"].isin(val_ids)
].reset_index(drop=True)

test_exclusive = all_data.loc[
    all_data["ID"].isin(test_exclusive_ids := test_ids) if False else all_data["ID"].isin(test_ids)
].reset_index(drop=True)

# Everything dropped by the exclusivity filters is retained as a separate
# "leaky" test set — NOTE(review): by construction these rows share TR and/or
# DNA clusters with the other splits, hence the name; confirm intended use.
leaky_test = all_data.loc[
    ~(all_data["ID"].isin(train_exclusive["ID"].tolist())) &
    ~(all_data["ID"].isin(val_exclusive["ID"].tolist())) &
    ~(all_data["ID"].isin(test_exclusive["ID"].tolist()))
].reset_index(drop=True)

# Summary statistics over the exclusive (non-leaky) splits only.
print(f"Original total: {len(all_data)}")
retained_total = len(train_exclusive)+len(val_exclusive)+len(test_exclusive)
print(f"New, exclusive total: {retained_total}")
print(f"Lost rows: {len(all_data)-retained_total}")
print(f"Length train: {len(train_exclusive)}/{retained_total} ({100*len(train_exclusive)/retained_total:.2f}%)")
print(f"Length val: {len(val_exclusive)}/{retained_total} ({100*len(val_exclusive)/retained_total:.2f}%)")
print(f"Length test: {len(test_exclusive)}/{retained_total} ({100*len(test_exclusive)/retained_total:.2f}%)")
|
|
def check_validity(train_exclusive, val_exclusive, test_exclusive):
    """
    Sanity-check that the three splits are mutually exclusive.

    Hard requirement (asserted): no interaction ID appears in more than one
    split. Diagnostics (printed, not enforced): pairwise overlap counts of
    raw sequences and of mmseqs cluster representatives, for both the TR
    (protein) and DNA sides.

    Raises:
        AssertionError: if any two splits share an interaction ID.
    """
    splits = [
        ("Train", train_exclusive),
        ("Val", val_exclusive),
        ("Test", test_exclusive),
    ]
    pairs = [(0, 1), (0, 2), (1, 2)]

    def _unique_sets(col):
        # {split name -> set of unique values of `col` in that split}
        return {name: set(frame[col].unique().tolist()) for name, frame in splits}

    # Hard requirement: interaction IDs must be disjoint across splits.
    id_sets = _unique_sets("ID")
    for a, b in pairs:
        assert len(id_sets[splits[a][0]] & id_sets[splits[b][0]]) == 0
    print(f"Pass! No overlap in IDs")

    # Report (but do not enforce) leakage at the raw-sequence and
    # cluster-representative level on both modalities.
    for col, label in [
        ("tr_sequence", "TR"),
        ("tr_cluster_rep", "TR Cluster Rep"),
        ("dna_sequence", "DNA"),
        ("dna_cluster_rep", "DNA Cluster Rep"),
    ]:
        value_sets = _unique_sets(col)
        for a, b in pairs:
            left, right = splits[a][0], splits[b][0]
            overlap = len(value_sets[left] & value_sets[right])
            print(f"{left}-{right} {label} intersection: {overlap}")
|
|
def get_reverse_complement(s):
    """
    Return the 5' to 3' reverse complement of a DNA sequence.

    Case is preserved; the ambiguity code 'n'/'N' maps to itself.

    Args:
        s: DNA string containing only A/C/G/T/N characters in either case.

    Returns:
        The reverse complement of `s`, read 5' to 3'. Empty input yields "".

    Raises:
        KeyError: if `s` contains any character outside ACGTNacgtn (kept
            from the original dict-lookup behavior so malformed sequences
            fail loudly instead of passing through silently).
    """
    rev_map = {
        "a": "t",
        "c": "g",
        "t": "a",
        "g": "c",
        "A": "T",
        "C": "G",
        "T": "A",
        "G": "C",
        "n": "n",
        "N": "N",
    }
    # Single pass: complement each base while walking the string backwards,
    # instead of building an intermediate list and reversing afterwards.
    return "".join(rev_map[c] for c in reversed(s))
|
|
| |
def augment_rc(df):
    """
    Append a reverse-complement copy of every row, doubling the dataset.

    The mirrored rows get "_rc"-suffixed IDs, a reverse-complemented
    `dna_sequence`, and a `scores` track flipped to stay aligned with the
    new strand orientation. All other columns are copied unchanged.

    columns = ["ID","dna_sequence","tr_sequence","tr_cluster_rep","dna_cluster_rep", "scores","split"]
    """
    mirrored = df.copy(deep=True)

    # Reverse-complement the DNA and tag IDs so originals stay distinguishable.
    mirrored["dna_sequence"] = mirrored["dna_sequence"].map(get_reverse_complement)
    mirrored["ID"] = mirrored["ID"] + "_rc"

    # The comma-separated per-position score track must be reversed so each
    # score still lines up with its (now flipped) base.
    mirrored["scores"] = mirrored["scores"].map(
        lambda track: ",".join(reversed(track.split(",")))
    )

    return pd.concat([df, mirrored]).reset_index(drop=True)
|
|
def convert_scores(scores, mode=1):
    """
    Collapse a comma-separated score track to peaks-only.

    Two modes: 1 means FIMO peaks get 1. 0 means FIMO peaks get their max
    score. Every position strictly below the track maximum becomes 0;
    positions at the maximum keep the mode-dependent fill value.
    """
    values = [int(v) for v in scores.split(",")]
    peak = max(values)
    fill = 1 if mode == 1 else peak
    flattened = [fill if v == peak else 0 for v in values]
    # Sanity check: the number of peak positions must survive the collapse.
    assert values.count(peak) == flattened.count(fill)
    return ",".join(str(v) for v in flattened)
|
|
# Verify exclusivity BEFORE augmentation.
check_validity(train_exclusive, val_exclusive, test_exclusive)

# Double every split with reverse-complement datapoints.
train_exclusive = augment_rc(train_exclusive)
val_exclusive = augment_rc(val_exclusive)
test_exclusive = augment_rc(test_exclusive)
leaky_test = augment_rc(leaky_test)

print(f"Added reverse complement sequences to train_exclusive, val_exclusive, and test_exclusive (and leaky test_exclusive)")

# Re-verify AFTER augmentation: the "_rc" ID suffix keeps augmented IDs
# distinct, so no cross-split ID overlap should have been introduced.
check_validity(train_exclusive, val_exclusive, test_exclusive)

# The four splits partition the original rows, so after reverse-complement
# doubling their combined size should equal exactly 2x len(all_data).
total = sum([len(train_exclusive), len(val_exclusive), len(test_exclusive), len(leaky_test)])
print(
    f"Length of train_exclusive dataset: {len(train_exclusive)} ({100*len(train_exclusive)/total:.2f}%)"
)
print(f"Length of val_exclusive dataset: {len(val_exclusive)} ({100*len(val_exclusive)/total:.2f}%)")
print(f"Length of test_exclusive dataset: {len(test_exclusive)} ({100*len(test_exclusive)/total:.2f}%)")
print(f"Length of leaky_test dataset: {len(leaky_test)} ({100*len(leaky_test)/total:.2f}%)")
print(
    f"Total sequences = {total}. Same as edges size*2? {total==len(all_data)*2}"
)
|
|
| |
# Recombine all four (now RC-doubled) splits and sanity-check that DNA seqids
# map 1:1 to DNA sequences. The seqid is taken as everything after the FIRST
# underscore in the ID — NOTE(review): for "_rc" rows this yields a distinct
# seqid ending in "_rc", matching the reverse-complemented sequence; this is
# what lets the 1:1 assertion hold, but confirm the ID format assumption
# (ID = "<tr_seqid>_<dna_seqid>") against the upstream pipeline.
all_data = pd.concat([train_exclusive, val_exclusive, test_exclusive, leaky_test])
all_data["dna_seqid"] = all_data["ID"].str.split("_", n=1, expand=True)[1]
dna_dict = dict(zip(all_data["dna_seqid"], all_data["dna_sequence"]))
assert len(dna_dict) == len(all_data.drop_duplicates(["dna_sequence"]))

# Output directory for the split CSVs (created if absent).
import os
from pathlib import Path
split_out_dir = Path("/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/splits/handpicked_val_test")
os.makedirs(split_out_dir, exist_ok=True)
| |
| |
# Binarized score tracks: FIMO peak positions -> 1, everything else -> 0.
# NOTE(review): "binary_sores" looks like a typo for "binary_scores"; it is
# used consistently below and written into the CSV headers, so renaming it
# would change the output schema — confirm with downstream consumers first.
train_exclusive["binary_sores"] = train_exclusive["scores"].apply(lambda x: convert_scores(x, mode=1))
val_exclusive["binary_sores"] = val_exclusive["scores"].apply(lambda x: convert_scores(x, mode=1))
test_exclusive["binary_sores"] = test_exclusive["scores"].apply(lambda x: convert_scores(x, mode=1))
leaky_test["binary_sores"] = leaky_test["scores"].apply(lambda x: convert_scores(x, mode=1))

# Label each frame with its split name so the column survives into the CSVs.
train_exclusive["split"] = ["train"]*len(train_exclusive)
val_exclusive["split"] = ["val"]*len(val_exclusive)
test_exclusive["split"] = ["test"]*len(test_exclusive)
leaky_test["split"] = ["leakytest"]*len(leaky_test)

# Persist the full splits with a fixed column subset/order.
split_final_cols = ["ID", "dna_sequence", "tr_sequence", "scores", "binary_sores", "split"]
train_exclusive[split_final_cols].to_csv(split_out_dir / "train.csv", index=False)
val_exclusive[split_final_cols].to_csv(split_out_dir / "val.csv", index=False)
test_exclusive[split_final_cols].to_csv(split_out_dir / "test.csv", index=False)
leaky_test[split_final_cols].to_csv(split_out_dir / "leakytest.csv", index=False)
print(f"Saved all splits to {split_out_dir}")

# Small fixed-seed subsamples ("baby" splits) for quick debugging runs.
train_exclusive[split_final_cols].sample(400, random_state=42).to_csv(split_out_dir / "babytrain.csv", index=False)
val_exclusive[split_final_cols].sample(50, random_state=42).to_csv(split_out_dir / "babyval.csv", index=False)
test_exclusive[split_final_cols].sample(50, random_state=42).to_csv(split_out_dir / "babytest.csv", index=False)
leaky_test[split_final_cols].sample(50, random_state=42).to_csv(split_out_dir / "babyleakytest.csv", index=False)
|
|
|
|