"""Not neat, but this is what I did to make exclusive splits. Saving here for now.

Builds train/val/test splits of the FIMO-hit data that are exclusive at the
MMseqs2 cluster level: handpicked TR clusters define test and val, and every
DNA cluster that appears in a held-out split is excluded from the splits
below it, so no TR cluster or DNA cluster leaks across splits.
"""
## Full pipeline
import pandas as pd

# MMseqs2 cluster assignments (TSV, no header): col 0 = cluster representative,
# col 1 = cluster member.
protein_clusters = pd.read_csv(
    "/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/mmseqs/outputs/fimo_hits_only/protein/mmseqs_cluster.tsv",
    sep="\t",
    header=None,
)
protein_clusters.columns = ["tr_cluster_rep", "tr_cluster_member"]
protein_clusters.head()  # notebook-style peek; no effect when run as a script

dna_clusters = pd.read_csv(
    "/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/mmseqs/outputs/fimo_hits_only/dna_full/mmseqs_cluster.tsv",
    sep="\t",
    header=None,
)
dna_clusters.columns = ["dna_cluster_rep", "dna_cluster_member"]
dna_clusters.head()  # notebook-style peek; no effect when run as a script

# Full processed FIMO output; rows are keyed by the "ID" column and carry
# "tr_seqid" / "dna_seqid" identifiers used for cluster lookup below.
all_data = pd.read_parquet(
    "/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/remap2022_crm_fimo_output_q_processed_seed0.parquet"
)
all_data  # notebook-style peek; no effect when run as a script

# Map every cluster member to its representative so rows can be grouped by cluster.
protein_cluster_map = dict(zip(protein_clusters["tr_cluster_member"], protein_clusters["tr_cluster_rep"]))
dna_cluster_map = dict(zip(dna_clusters["dna_cluster_member"], dna_clusters["dna_cluster_rep"]))
print(len(protein_cluster_map))
print(len(dna_cluster_map))

all_data["tr_cluster_rep"] = all_data["tr_seqid"].map(protein_cluster_map)
all_data["dna_cluster_rep"] = all_data["dna_seqid"].map(dna_cluster_map)
# Sanity check: seqids missing from the cluster maps end up NaN.
print(len(all_data[all_data["tr_cluster_rep"].isna()]))
print(len(all_data[all_data["dna_cluster_rep"].isna()]))
all_data.head()

### handpick test
handpicked_test_trs = ["trseq23", "trseq26", "trseq17"]
handpicked_test = all_data.loc[
    all_data["tr_cluster_rep"].isin(handpicked_test_trs)
].reset_index(drop=True)
# Any DNA cluster seen in test is off-limits for train/val to avoid leakage.
off_limits_dna_clusters = handpicked_test["dna_cluster_rep"].unique().tolist()
remaining = all_data.loc[
    (~all_data["tr_cluster_rep"].isin(handpicked_test_trs))
    & (~all_data["dna_cluster_rep"].isin(off_limits_dna_clusters))
].reset_index(drop=True)
test_ids = handpicked_test["ID"].unique().tolist()
remaining_ids = remaining["ID"].unique().tolist()
# Rows dropped entirely: TR cluster not in test, but DNA cluster overlaps test.
lost_rows = all_data.loc[
    (~all_data["ID"].isin(test_ids)) & (~all_data["ID"].isin(remaining_ids))
]
print(f"Rows in test: {len(handpicked_test)}")
print(f"Rows to be split between train and val: {len(remaining)}")
total_rows = len(handpicked_test) + len(remaining)
print(f"Total rows: {total_rows}. Test percentage: {100*len(handpicked_test)/total_rows:.2f}%")
print(f"Lost rows: {len(lost_rows)}")

### handpick val
handpicked_val_trs = ["trseq9", "trseq5", "trseq28"]
handpicked_val = remaining.loc[
    remaining["tr_cluster_rep"].isin(handpicked_val_trs)
].reset_index(drop=True)
# Same exclusion as above, now between val and what is left for train.
off_limits_dna_clusters = handpicked_val["dna_cluster_rep"].unique().tolist()
train_remain = remaining.loc[
    (~remaining["tr_cluster_rep"].isin(handpicked_val_trs))
    & (~remaining["dna_cluster_rep"].isin(off_limits_dna_clusters))
].reset_index(drop=True)
val_ids = handpicked_val["ID"].unique().tolist()
train_remain_ids = train_remain["ID"].unique().tolist()
lost_rows = all_data.loc[
    (~all_data["ID"].isin(test_ids))
    & (~all_data["ID"].isin(val_ids))
    & (~all_data["ID"].isin(train_remain_ids))
]
print(f"Rows in val: {len(handpicked_val)}")
print(f"Rows left for train: {len(train_remain)}")
total_rows = len(handpicked_val) + len(train_remain)
# BUGFIX: this line reports the val fraction, but was labeled "Test percentage".
print(f"Total rows: {total_rows}. Val percentage: {100*len(handpicked_val)/total_rows:.2f}%")
print(f"Lost rows: {len(lost_rows)}")

# Materialize the final, ID-exclusive splits from the full table.
train_exclusive = all_data.loc[all_data["ID"].isin(train_remain_ids)].reset_index(drop=True)
val_exclusive = all_data.loc[all_data["ID"].isin(val_ids)].reset_index(drop=True)
test_exclusive = all_data.loc[all_data["ID"].isin(test_ids)].reset_index(drop=True)
# Rows excluded from every split (sacrificed to keep the splits leakage-free).
leaky_test = all_data.loc[
    ~(all_data["ID"].isin(train_exclusive["ID"].tolist()))
    & ~(all_data["ID"].isin(val_exclusive["ID"].tolist()))
    & ~(all_data["ID"].isin(test_exclusive["ID"].tolist()))
].reset_index(drop=True)
print(f"Original total: {len(all_data)}")
retained_total = len(train_exclusive) + len(val_exclusive) + len(test_exclusive)
print(f"New, exclusive total: {retained_total}")
print(f"Lost rows: {len(all_data)-retained_total}")
print(f"Length train: {len(train_exclusive)}/{retained_total} ({100*len(train_exclusive)/retained_total:.2f}%)")
print(f"Length val: {len(val_exclusive)}/{retained_total} ({100*len(val_exclusive)/retained_total:.2f}%)")
print(f"Length test: {len(test_exclusive)}/{retained_total} ({100*len(test_exclusive)/retained_total:.2f}%)")


def check_validity(train_exclusive, val_exclusive, test_exclusive):
    """Assert that no datapoint "ID" appears in more than one split.

    Each argument is a DataFrame with an "ID" column.  Raises AssertionError
    on any pairwise overlap.  NOTE: `assert` is stripped under `python -O`;
    acceptable for this exploratory script.
    """
    train_exclusive_ids = set(train_exclusive["ID"].unique().tolist())
    val_exclusive_ids = set(val_exclusive["ID"].unique().tolist())
    test_exclusive_ids = set(test_exclusive["ID"].unique().tolist())
    assert len(train_exclusive_ids.intersection(val_exclusive_ids)) == 0
    assert len(train_exclusive_ids.intersection(test_exclusive_ids)) == 0
    assert len(val_exclusive_ids.intersection(test_exclusive_ids)) == 0
    print(f"Pass! No overlap in IDs")


# Investigate TR intersection. No assertions unless we are explicitly splitting on this.
# How much TR content is shared across the exclusive splits?  Report-only:
# the splits are exclusive on ID (and built from TR clusters), so this just
# quantifies residual overlap at the raw-sequence and cluster-rep level.
train_exclusive_tr_seqs = set(train_exclusive["tr_sequence"].unique().tolist())
val_exclusive_tr_seqs = set(val_exclusive["tr_sequence"].unique().tolist())
test_exclusive_tr_seqs = set(test_exclusive["tr_sequence"].unique().tolist())
train_exclusive_tr_reps = set(train_exclusive["tr_cluster_rep"].unique().tolist())
val_exclusive_tr_reps = set(val_exclusive["tr_cluster_rep"].unique().tolist())
test_exclusive_tr_reps = set(test_exclusive["tr_cluster_rep"].unique().tolist())
for pair_label, lhs, rhs in [
    ("Train-Val", train_exclusive_tr_seqs, val_exclusive_tr_seqs),
    ("Train-Test", train_exclusive_tr_seqs, test_exclusive_tr_seqs),
    ("Val-Test", val_exclusive_tr_seqs, test_exclusive_tr_seqs),
]:
    print(f"{pair_label} TR intersection: {len(lhs & rhs)}")
for pair_label, lhs, rhs in [
    ("Train-Val", train_exclusive_tr_reps, val_exclusive_tr_reps),
    ("Train-Test", train_exclusive_tr_reps, test_exclusive_tr_reps),
    ("Val-Test", val_exclusive_tr_reps, test_exclusive_tr_reps),
]:
    print(f"{pair_label} TR Cluster Rep intersection: {len(lhs & rhs)}")
# Investigate DNA intersection. No assertions unless we are explicitly splitting on this.
train_exclusive_dna_seqs = set(train_exclusive["dna_sequence"].unique().tolist()) val_exclusive_dna_seqs = set(val_exclusive["dna_sequence"].unique().tolist()) test_exclusive_dna_seqs = set(test_exclusive["dna_sequence"].unique().tolist()) train_exclusive_dna_reps = set(train_exclusive["dna_cluster_rep"].unique().tolist()) val_exclusive_dna_reps = set(val_exclusive["dna_cluster_rep"].unique().tolist()) test_exclusive_dna_reps = set(test_exclusive["dna_cluster_rep"].unique().tolist()) print(f"Train-Val DNA intersection: {len(train_exclusive_dna_seqs.intersection(val_exclusive_dna_seqs))}") print(f"Train-Test DNA intersection: {len(train_exclusive_dna_seqs.intersection(test_exclusive_dna_seqs))}") print(f"Val-Test DNA intersection: {len(val_exclusive_dna_seqs.intersection(test_exclusive_dna_seqs))}") print(f"Train-Val DNA Cluster Rep intersection: {len(train_exclusive_dna_reps.intersection(val_exclusive_dna_reps))}") print(f"Train-Test DNA Cluster Rep intersection: {len(train_exclusive_dna_reps.intersection(test_exclusive_dna_reps))}") print(f"Val-Test DNA Cluster Rep intersection: {len(val_exclusive_dna_reps.intersection(test_exclusive_dna_reps))}") def get_reverse_complement(s): """ Returns 5' to 3' sequence of the reverse complement """ chars = list(s) recon = [] rev_map = { "a": "t", "c": "g", "t": "a", "g": "c", "A": "T", "C": "G", "T": "A", "G": "C", "n": "n", "N": "N", } for c in chars: recon += [rev_map[c]] recon = "".join(recon) return recon[::-1] # now make reverse complements def augment_rc(df): """ Get the reverse complement and add it as a datapoint, effectively doubling the dataset. 
Also flip the orientation of the scores columns = ["ID","dna_sequence","tr_sequence","tr_cluster_rep","dna_cluster_rep", "scores","split"] """ df_rc = df.copy(deep=True) df_rc["dna_sequence"] = df_rc["dna_sequence"].apply( lambda x: get_reverse_complement(x) ) df_rc["ID"] = df_rc["ID"] + "_rc" df_rc["scores"] = df_rc["scores"].apply(lambda s: ",".join(s.split(",")[::-1])) final_df = pd.concat([df, df_rc]).reset_index(drop=True) return final_df def convert_scores(scores, mode=1): """ Two modes: 1 means FIMO peaks get 1. 0 means FIMO peaks get their max score """ svec = [int(x) for x in scores.split(",")] max_score = max(svec) if mode ==1: binary_svec = [0 if x