File size: 12,454 Bytes
9da03b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
"""
Not neat, but this is what I did to make exclusive splits. saving here for now. 
"""

## Full pipeline
import pandas as pd
# MMseqs2 clustering output for TR (protein) sequences: tab-separated
# "representative<TAB>member" pairs, no header row.
protein_clusters = pd.read_csv("/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/mmseqs/outputs/fimo_hits_only/protein/mmseqs_cluster.tsv", sep="\t", header=None)
protein_clusters.columns=["tr_cluster_rep","tr_cluster_member"]
protein_clusters.head()

# Same clustering output, but for the full-length DNA sequences.
dna_clusters = pd.read_csv("/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/mmseqs/outputs/fimo_hits_only/dna_full/mmseqs_cluster.tsv", sep="\t", header=None)
dna_clusters.columns=["dna_cluster_rep","dna_cluster_member"]
dna_clusters.head()

# Full TR-DNA dataset produced by the FIMO post-processing step; assumed one
# row per TR-DNA pair with a unique "ID" column -- TODO confirm upstream.
all_data = pd.read_parquet("/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/remap2022_crm_fimo_output_q_processed_seed0.parquet")
all_data

# Map each cluster member to its representative, then annotate every row of
# all_data with the cluster rep of its TR and DNA sequence.
protein_cluster_map = dict(zip(protein_clusters["tr_cluster_member"],protein_clusters["tr_cluster_rep"]))
dna_cluster_map = dict(zip(dna_clusters["dna_cluster_member"],dna_clusters["dna_cluster_rep"]))
print(len(protein_cluster_map))
print(len(dna_cluster_map))
all_data["tr_cluster_rep"] = all_data["tr_seqid"].map(protein_cluster_map)
all_data["dna_cluster_rep"] = all_data["dna_seqid"].map(dna_cluster_map)
# A NaN rep means a seqid was missing from the clustering -- both counts
# should be 0.
print(len(all_data[all_data["tr_cluster_rep"].isna()]))
print(len(all_data[all_data["dna_cluster_rep"].isna()]))
all_data.head()


### handpick test
# Hold out three handpicked TR cluster representatives as the test set.
handpicked_test_trs = ["trseq23","trseq26","trseq17"]
handpicked_test = all_data.loc[
    all_data["tr_cluster_rep"].isin(handpicked_test_trs)
].reset_index(drop=True)

# For DNA-side exclusivity too: any DNA cluster that appears in a test row is
# off limits, so `remaining` keeps only rows touching neither a test TR
# cluster nor a test DNA cluster.
off_limits_dna_clusters = handpicked_test["dna_cluster_rep"].unique().tolist()
remaining = all_data.loc[
    (~all_data["tr_cluster_rep"].isin(handpicked_test_trs)) &
    (~all_data["dna_cluster_rep"].isin(off_limits_dna_clusters))
].reset_index(drop=True)

# "Lost" rows land in neither set: they avoid the test TR clusters but share
# a DNA cluster with test, so they are dropped to keep the splits exclusive.
test_ids = handpicked_test["ID"].unique().tolist()
remaining_ids = remaining["ID"].unique().tolist()
lost_rows = all_data.loc[
    (~all_data["ID"].isin(test_ids)) &
    (~all_data["ID"].isin(remaining_ids))
]
print(f"Rows in test: {len(handpicked_test)}")
print(f"Rows to be split between train and val: {len(remaining)}")
total_rows = len(handpicked_test) + len(remaining)
print(f"Total rows: {total_rows}. Test percentage: {100*len(handpicked_test)/total_rows:.2f}%")
print(f"Lost rows: {len(lost_rows)}")

### handpick val
# Same exclusion logic as the test split, applied to `remaining`: hold out
# three handpicked TR clusters for validation, and exclude any row whose DNA
# cluster co-occurs with them so train shares no TR or DNA clusters with val.
handpicked_val_trs = ["trseq9", "trseq5", "trseq28"]

handpicked_val = remaining.loc[
    remaining["tr_cluster_rep"].isin(handpicked_val_trs)
].reset_index(drop=True)

off_limits_dna_clusters = handpicked_val["dna_cluster_rep"].unique().tolist()
train_remain = remaining.loc[
    (~remaining["tr_cluster_rep"].isin(handpicked_val_trs)) &
    (~remaining["dna_cluster_rep"].isin(off_limits_dna_clusters))
].reset_index(drop=True)

val_ids = handpicked_val["ID"].unique().tolist()
train_remain_ids = train_remain["ID"].unique().tolist()
# Cumulative loss: rows of the full dataset assigned to no split so far.
lost_rows = all_data.loc[
    (~all_data["ID"].isin(test_ids)) &
    (~all_data["ID"].isin(val_ids)) &
    (~all_data["ID"].isin(train_remain_ids))
]
print(f"Rows in val: {len(handpicked_val)}")
print(f"Rows left for train: {len(train_remain)}")
total_rows = len(handpicked_val) + len(train_remain)
# Bug fix: this line previously said "Test percentage" (copy-paste from the
# test-split section) while actually reporting the val fraction.
print(f"Total rows: {total_rows}. Val percentage: {100*len(handpicked_val)/total_rows:.2f}%")
print(f"Lost rows: {len(lost_rows)}")

# Materialize the final exclusive splits from the full dataset by ID.
train_exclusive = all_data.loc[
    all_data["ID"].isin(train_remain_ids)
    ].reset_index(drop=True)

val_exclusive = all_data.loc[
    all_data["ID"].isin(val_ids)
    ].reset_index(drop=True)

test_exclusive = all_data.loc[
    all_data["ID"].isin(test_ids)
    ].reset_index(drop=True)

# Everything not in an exclusive split becomes a "leaky" test set: these rows
# share TR and/or DNA clusters with the other splits, so they were dropped
# from the exclusive sets but are kept here rather than discarded outright.
leaky_test = all_data.loc[
    ~(all_data["ID"].isin(train_exclusive["ID"].tolist())) &
    ~(all_data["ID"].isin(val_exclusive["ID"].tolist())) &
    ~(all_data["ID"].isin(test_exclusive["ID"].tolist()))
].reset_index(drop=True)

# Report how much data the exclusivity constraint costs, plus split sizes.
print(f"Original total: {len(all_data)}")
retained_total = len(train_exclusive)+len(val_exclusive)+len(test_exclusive)
print(f"New, exclusive total: {retained_total}")
print(f"Lost rows: {len(all_data)-retained_total}")
print(f"Length train: {len(train_exclusive)}/{retained_total} ({100*len(train_exclusive)/retained_total:.2f}%)")
print(f"Length val: {len(val_exclusive)}/{retained_total} ({100*len(val_exclusive)/retained_total:.2f}%)")
print(f"Length test: {len(test_exclusive)}/{retained_total} ({100*len(test_exclusive)/retained_total:.2f}%)")

def check_validity(train_exclusive, val_exclusive, test_exclusive):
    """
    Verify the splits are exclusive on ID, then report remaining overlap.

    Asserts that no ID appears in more than one split. TR/DNA sequence and
    cluster-rep overlaps are printed for inspection only -- no assertions,
    since we are not necessarily splitting on those columns.
    """
    splits = {
        "Train": train_exclusive,
        "Val": val_exclusive,
        "Test": test_exclusive,
    }
    pairs = [("Train", "Val"), ("Train", "Test"), ("Val", "Test")]

    # Unique values of one column in one split, as a set.
    def uniques(split_name, column):
        return set(splits[split_name][column].unique().tolist())

    # Hard requirement: the splits share no datapoint IDs.
    for a, b in pairs:
        assert len(uniques(a, "ID") & uniques(b, "ID")) == 0
    print(f"Pass! No overlap in IDs")

    # Investigate TR intersection. No assertions unless we are explicitly splitting on this.
    for a, b in pairs:
        print(f"{a}-{b} TR intersection: {len(uniques(a, 'tr_sequence') & uniques(b, 'tr_sequence'))}")

    for a, b in pairs:
        print(f"{a}-{b} TR Cluster Rep intersection: {len(uniques(a, 'tr_cluster_rep') & uniques(b, 'tr_cluster_rep'))}")

    # Investigate DNA intersection. No assertions unless we are explicitly splitting on this.
    for a, b in pairs:
        print(f"{a}-{b} DNA intersection: {len(uniques(a, 'dna_sequence') & uniques(b, 'dna_sequence'))}")

    for a, b in pairs:
        print(f"{a}-{b} DNA Cluster Rep intersection: {len(uniques(a, 'dna_cluster_rep') & uniques(b, 'dna_cluster_rep'))}")

def get_reverse_complement(s):
    """
    Returns 5' to 3' sequence of the reverse complement
    """
    complement = {
        "a": "t", "c": "g", "t": "a", "g": "c",
        "A": "T", "C": "G", "T": "A", "G": "C",
        "n": "n", "N": "N",
    }
    # Walk the sequence backwards, complementing each base. Any character
    # outside the map raises KeyError, just like the original lookup loop.
    return "".join(complement[base] for base in reversed(s))

# now make reverse complements
def augment_rc(df):
    """
    Get the reverse complement and add it as a datapoint, effectively doubling the dataset.
    Also flip the orientation of the scores

    columns = ["ID","dna_sequence","tr_sequence","tr_cluster_rep","dna_cluster_rep", "scores","split"]
    """
    # Deep copy so the augmented rows never alias the caller's frame.
    rc_rows = df.copy(deep=True)

    # Reverse-complement the DNA, tag the IDs, and mirror the per-position
    # score string so it still lines up with the flipped sequence.
    rc_rows["dna_sequence"] = rc_rows["dna_sequence"].apply(get_reverse_complement)
    rc_rows["ID"] = rc_rows["ID"] + "_rc"
    rc_rows["scores"] = rc_rows["scores"].apply(
        lambda s: ",".join(reversed(s.split(",")))
    )

    # Original rows first, then their reverse complements.
    return pd.concat([df, rc_rows]).reset_index(drop=True)

def convert_scores(scores, mode=1):
    """
    Binarize a comma-separated score string at its maximum value.

    Two modes: 1 means FIMO peaks (positions holding the max score) get 1;
    any other mode means FIMO peaks get the max score itself. All other
    positions get 0.

    Args:
        scores: comma-separated integer scores, e.g. "0,2,5,2".
        mode: 1 for 0/1 output; anything else for 0/max output.

    Returns:
        Comma-separated string with the same number of positions as the input.
    """
    svec = [int(x) for x in scores.split(",")]
    max_score = max(svec)
    # The two original branches differed only in the value written at the
    # peaks, so collapse them into one fill value.
    peak_value = 1 if mode == 1 else max_score
    binary_svec = [peak_value if x == max_score else 0 for x in svec]
    # Sanity check: the conversion preserves the number of peak positions.
    assert svec.count(max_score) == binary_svec.count(peak_value)
    return ",".join(str(x) for x in binary_svec)

check_validity(train_exclusive, val_exclusive, test_exclusive)

# Double each split with reverse-complement datapoints. RC rows get an "_rc"
# ID suffix, so ID-level exclusivity between splits is preserved.
train_exclusive = augment_rc(train_exclusive)
val_exclusive = augment_rc(val_exclusive)
test_exclusive = augment_rc(test_exclusive)
leaky_test = augment_rc(leaky_test)

print(f"Added reverse complement sequences to train_exclusive, val_exclusive, and test_exclusive (and leaky test_exclusive)")

check_validity(train_exclusive, val_exclusive, test_exclusive)

# Percentages are over all four sets (exclusive splits + leaky test), which
# together should be exactly twice the original dataset after RC augmentation.
total = sum([len(train_exclusive), len(val_exclusive), len(test_exclusive), len(leaky_test)])
print(
    f"Length of train_exclusive dataset: {len(train_exclusive)} ({100*len(train_exclusive)/total:.2f}%)"
)
print(f"Length of val_exclusive dataset: {len(val_exclusive)} ({100*len(val_exclusive)/total:.2f}%)")
print(f"Length of test_exclusive dataset: {len(test_exclusive)} ({100*len(test_exclusive)/total:.2f}%)")
print(f"Length of leaky_test dataset: {len(leaky_test)} ({100*len(leaky_test)/total:.2f}%)")
print(
    f"Total sequences = {total}. Same as edges size*2? {total==len(all_data)*2}"
)

# since we've added all these new DNA sequences, we do need a new mapping of seq id to dna sequence
# NOTE(review): assumes IDs look like "<prefix>_<dna_seqid>" so that the text
# after the first underscore is the DNA seq id (with "_rc" kept as part of the
# new id) -- confirm against the upstream ID format.
all_data = pd.concat([train_exclusive, val_exclusive, test_exclusive, leaky_test])
all_data["dna_seqid"] = all_data["ID"].str.split("_", n=1, expand=True)[1]
dna_dict = dict(zip(all_data["dna_seqid"], all_data["dna_sequence"]))
# The mapping must be 1:1 -- one seqid per unique DNA sequence.
assert len(dna_dict) == len(all_data.drop_duplicates(["dna_sequence"]))

# create the output dir
import os
from pathlib import Path
split_out_dir = Path("/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/splits/handpicked_val_test")
os.makedirs(split_out_dir, exist_ok=True)

# add binary_scores to allow other training modes
# Bug fix: the column was previously misspelled "binary_sores"; corrected here
# and in the saved CSV schema below. Anything that consumed the old column
# name must be updated to match.
train_exclusive["binary_scores"] = train_exclusive["scores"].apply(lambda x: convert_scores(x, mode=1))
val_exclusive["binary_scores"] = val_exclusive["scores"].apply(lambda x: convert_scores(x, mode=1))
test_exclusive["binary_scores"] = test_exclusive["scores"].apply(lambda x: convert_scores(x, mode=1))
leaky_test["binary_scores"] = leaky_test["scores"].apply(lambda x: convert_scores(x, mode=1))

# Label each frame so the split survives concatenation downstream.
train_exclusive["split"] = ["train"]*len(train_exclusive)
val_exclusive["split"] = ["val"]*len(val_exclusive)
test_exclusive["split"] = ["test"]*len(test_exclusive)
leaky_test["split"] = ["leakytest"]*len(leaky_test)

# select final cols and save
split_final_cols = ["ID", "dna_sequence", "tr_sequence", "scores", "binary_scores", "split"]
train_exclusive[split_final_cols].to_csv(split_out_dir / "train.csv", index=False)
val_exclusive[split_final_cols].to_csv(split_out_dir / "val.csv", index=False)
test_exclusive[split_final_cols].to_csv(split_out_dir / "test.csv", index=False)
leaky_test[split_final_cols].to_csv(split_out_dir / "leakytest.csv", index=False)
print(f"Saved all splits to {split_out_dir}")

# make baby versions too (small fixed-seed subsamples for quick debugging runs)
train_exclusive[split_final_cols].sample(400, random_state=42).to_csv(split_out_dir / "babytrain.csv", index=False)
val_exclusive[split_final_cols].sample(50, random_state=42).to_csv(split_out_dir / "babyval.csv", index=False)
test_exclusive[split_final_cols].sample(50, random_state=42).to_csv(split_out_dir / "babytest.csv", index=False)
leaky_test[split_final_cols].sample(50, random_state=42).to_csv(split_out_dir / "babyleakytest.csv", index=False)