added dropout and overfit prevention

9da03b7 9 months ago

17.9 kB

	from collections import Counter, defaultdict
	from ortools.linear_solver import pywraplp
	import random
	from omegaconf import DictConfig
	import pandas as pd
	from pathlib import Path
	import os
	import numpy as np
	from sklearn.model_selection import train_test_split
	from dpacman.data_tasks.fimo.post_fimo import get_reverse_complement
	import json
	import rootutils
	from dpacman.utils import pylogger

	# Resolve the project root using the ".project-root" indicator file; main()
	# joins `root` with the config-relative paths it reads from and writes to.
	# NOTE(review): pythonpath=True presumably adds the root to sys.path, but the
	# `dpacman` imports above already ran before this call -- confirm it is needed.
	root = rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
	# Module-level logger shared by every function in this file.
	# NOTE(review): rank_zero_only=True presumably restricts output to the rank-zero
	# process in multi-process runs -- confirm against dpacman.utils.pylogger.
	logger = pylogger.RankedLogger(__name__, rank_zero_only=True)

	def split_with_predefined_test(
	    full_df=None,
	    split_names=("train", "val", "test"),
	    test_trs=None,
	    test_dnas=None,
	    ratios=(0.8, 0.1, 0.1),
	):
	    """
	    Split into train and val around a predefined test set.

	    The test set is every row of `full_df` whose `tr_seqid` is in `test_trs`
	    (when given) and whose `dna_seqid` is in `test_dnas` (when given). Every
	    row sharing a TR cluster or a DNA cluster with a test row is excluded from
	    train and val. The rows that are left are divided by DNA cluster so that
	    train gets, as closely as possible, `ratios[0]` of all kept rows.

	    Args:
	        full_df: DataFrame with columns "ID", "tr_seqid", "dna_seqid",
	            "tr_cluster_rep" and "dna_cluster_rep".
	        split_names: Unused; kept for signature compatibility.
	        test_trs: Optional collection of TR sequence IDs reserved for test.
	        test_dnas: Optional collection of DNA sequence IDs reserved for test.
	        ratios: (train, val, test) target ratios. Only the train share is
	            used, because the size of the test set is fixed by the filters.

	    Returns:
	        (splits, kept_by_split) where `splits` maps "train", "val", "test" and
	        "leaky_test" to DataFrames, and `kept_by_split` maps "train", "val"
	        and "test" to the number of DNA clusters in each. "leaky_test" holds
	        the rows that are neither test rows nor eligible for train/val.

	    Raises:
	        ValueError: If no rows are left for train/val, or if the requested
	            train share cannot be reached from the rows that are left.
	    """
	    if full_df is None:
	        full_df = pd.DataFrame()

	    # Rows matching every supplied filter make up the test set.
	    test = full_df.copy(deep=True)
	    if test_trs is not None:
	        test = test.loc[test["tr_seqid"].isin(test_trs)].reset_index(drop=True)
	    if test_dnas is not None:
	        test = test.loc[test["dna_seqid"].isin(test_dnas)].reset_index(drop=True)

	    tr_clusters_to_exclude = test["tr_cluster_rep"].unique().tolist()
	    dna_clusters_to_exclude = test["dna_cluster_rep"].unique().tolist()

	    # Train/val candidates must not share a TR or DNA cluster with any test row.
	    remaining = full_df.loc[
	        (~full_df["tr_cluster_rep"].isin(tr_clusters_to_exclude))
	        & (~full_df["dna_cluster_rep"].isin(dna_clusters_to_exclude))
	    ].reset_index(drop=True)
	    remaining_clusters = remaining["dna_cluster_rep"].unique().tolist()

	    # Rows that are neither in test nor eligible for train/val. The copy makes
	    # this an independent frame the caller can add columns to.
	    lost_rows = full_df.loc[
	        (~full_df["ID"].isin(test["ID"])) & (~full_df["ID"].isin(remaining["ID"]))
	    ].copy()

	    logger.info(f"Rows in test: {len(test)}")
	    logger.info(f"Rows to be split between train and val: {len(remaining)}")
	    if len(remaining) == 0:
	        raise ValueError(
	            "No rows are left for train/val after removing the TR and DNA clusters of the test set."
	        )
	    total_rows = len(test) + len(remaining)
	    logger.info(f"Total rows: {total_rows}. Test percentage: {100*len(test)/total_rows:.2f}%")
	    logger.info(f"Lost rows: {len(lost_rows)}")

	    # Train should hold its target share of ALL kept rows (test included), so
	    # express that share as a fraction of the rows still available.
	    train_share = ratios[0] / sum(ratios)
	    train_ratio_from_remaining = round((train_share * total_rows) / len(remaining), 2)
	    test_size_1 = 1 - train_ratio_from_remaining
	    if not 0 < test_size_1 < 1:
	        raise ValueError(
	            f"Cannot give train {train_share:.2f} of all rows: that would need "
	            f"{train_ratio_from_remaining} of the non-test rows. The predefined test set is too large."
	        )
	    logger.info(
	        f"\tPerforming first split: non-test clusters -> train clusters ({round(1-test_size_1,3)}) and val ({test_size_1})"
	    )
	    # The split is made over DNA clusters, not rows, so a cluster never straddles train and val.
	    X_train, X_val = train_test_split(
	        remaining_clusters, test_size=test_size_1, random_state=0
	    )

	    train = remaining.loc[remaining["dna_cluster_rep"].isin(X_train)].copy()
	    val = remaining.loc[remaining["dna_cluster_rep"].isin(X_val)].copy()

	    kept_by_split = {
	        "train": len(X_train),
	        "val": len(X_val),
	        "test": len(dna_clusters_to_exclude),
	    }
	    splits = {
	        "train": train,
	        "val": val,
	        "test": test,
	        "leaky_test": lost_rows,
	    }
	    return splits, kept_by_split

	def split_bipartite_fast(
	    dna_clusters,
	    split_names=("train", "val", "test"),
	    ratios=(0.8, 0.1, 0.1),
	):
	    """
	    Randomly assign DNA clusters to train/val/test with two sklearn splits.

	    Args:
	        dna_clusters: List of DNA cluster representatives to distribute.
	        split_names: Names of the (train, val, test) splits, in that order.
	        ratios: Relative (train, val, test) sizes; they do not need to sum
	            to 1. If any entry is None the default 0.8/0.1/0.1 is used.

	    Returns:
	        (dna_assign, kept_by_split): `dna_assign` maps each cluster to its
	        split name and `kept_by_split` maps each split name to the number of
	        clusters it received.

	    Raises:
	        ValueError: If `ratios` or `split_names` does not have three entries,
	            or if a ratio is not positive.
	    """
	    if ratios is None or any(r is None for r in ratios):
	        logger.info("\tNo complete ratios supplied; falling back to 0.8/0.1/0.1")
	        ratios = (0.8, 0.1, 0.1)
	    if len(ratios) != 3 or len(split_names) != 3:
	        raise ValueError("split_names and ratios must each have exactly three entries")
	    train_ratio, val_ratio, test_ratio = ratios
	    if min(train_ratio, val_ratio, test_ratio) <= 0:
	        raise ValueError(f"ratios must all be positive, got {tuple(ratios)}")

	    # First carve off val+test together, then divide that remainder between them.
	    held_out_ratio = val_ratio + test_ratio
	    test_size_1 = held_out_ratio / (train_ratio + val_ratio + test_ratio)
	    test_size_2 = test_ratio / held_out_ratio

	    logger.info(
	        f"\tPerforming first split: all clusters -> train clusters ({round(1-test_size_1,3)}) and other ({test_size_1})"
	    )
	    X_train, X_other = train_test_split(
	        dna_clusters, test_size=test_size_1, random_state=0
	    )
	    logger.info(
	        f"\tPerforming second split: other -> val clusters ({round(1-test_size_2,3)}) and test clusters ({test_size_2})"
	    )
	    X_val, X_test = train_test_split(
	        X_other, test_size=test_size_2, random_state=0
	    )

	    dna_assign = {}
	    kept_by_split = {}
	    for split_name, members in zip(split_names, (X_train, X_val, X_test)):
	        kept_by_split[split_name] = len(members)
	        for cluster in members:
	            dna_assign[cluster] = split_name
	    return dna_assign, kept_by_split

	# construct new labels
	def convert_scores(scores, mode=1):
	    """
	    Turn a comma-separated score string into a peak-only score string.

	    Every position holding the maximum score is treated as a peak. With
	    mode == 1 peaks become 1; with any other mode peaks keep the maximum
	    score. All other positions become 0. Because peaks are defined relative
	    to the maximum, a constant vector is marked as a peak at every position.
	    """
	    values = list(map(int, scores.split(",")))
	    peak = max(values)
	    # Value written at peak positions: a binary flag or the peak score itself.
	    marker = 1 if mode == 1 else peak
	    flagged = [marker if value >= peak else 0 for value in values]
	    assert values.count(peak) == flagged.count(marker)
	    return ",".join(map(str, flagged))

	def print_split_ratios(kept_by_split):
	    """Log the percentage of kept clusters that landed in train, val and test."""
	    n_clusters = sum(kept_by_split.values())
	    share = {
	        name: 100 * kept_by_split[name] / n_clusters
	        for name in ("train", "val", "test")
	    }
	    logger.info(
	        f"Cluster distribution - Train: {share['train']:.2f}%, Val: {share['val']:.2f}%, Test: {share['test']:.2f}%"
	    )


	def make_edges(
	    processed_fimo_path: str, protein_cluster_path: str, dna_cluster_path: str
	):
	    """
	    Build the edge table used as input to the splitting algorithm.

	    Each datapoint is annotated with the cluster representative of its TR and
	    of its DNA, plus an "edge" column holding the tuple
	    (tr_cluster_rep, dna_cluster_rep), where a cluster rep is a sequence ID.

	    Args:
	        processed_fimo_path: Parquet file of datapoints; it must contain the
	            "dna_seqid" and "tr_seqid" columns used for the joins.
	        protein_cluster_path: Headerless two-column TSV (cluster rep, member).
	        dna_cluster_path: Headerless two-column TSV (cluster rep, member).

	    Returns:
	        The datapoint DataFrame with "dna_cluster_rep", "tr_cluster_rep" and
	        "edge" columns added.
	    """
	    # Read cluster data
	    protein_clusters = pd.read_csv(protein_cluster_path, header=None, sep="\t")
	    protein_clusters.columns = ["tr_cluster_rep", "tr_seqid"]

	    dna_clusters = pd.read_csv(dna_cluster_path, header=None, sep="\t")
	    dna_clusters.columns = ["dna_cluster_rep", "dna_seqid"]

	    # Read datapoints and attach the cluster rep of each sequence
	    edges = pd.read_parquet(processed_fimo_path)
	    edges = pd.merge(edges, dna_clusters, on="dna_seqid", how="left")
	    edges = pd.merge(edges, protein_clusters, on="tr_seqid", how="left")

	    # Left joins keep datapoints whose sequence is missing from the cluster
	    # files; surface them because a missing cluster rep cannot be split on.
	    missing_dna = int(edges["dna_cluster_rep"].isna().sum())
	    missing_tr = int(edges["tr_cluster_rep"].isna().sum())
	    if missing_dna or missing_tr:
	        logger.info(
	            f"Datapoints without a cluster assignment - DNA: {missing_dna}, TR: {missing_tr}"
	        )

	    # zip builds the tuples column-wise, avoiding a Python call per row and
	    # working on an empty frame as well.
	    edges["edge"] = list(zip(edges["tr_cluster_rep"], edges["dna_cluster_rep"]))

	    logger.info(f"Total unique edges: {len(edges['edge'].unique())}")
	    repeated = edges.duplicated("edge", keep=False)
	    logger.info(f"Total edges with >1 datapoint: {len(edges.loc[repeated, 'edge'].unique())}")
	    logger.info(
	        f"Total datapoints belonging to a duplicate edge: {int(repeated.sum())}"
	    )
	    return edges


	def _count_split_overlaps(train, val, test, column):
	    """Count the unique values of `column` shared by each pair of splits.

	    Returns a dict keyed by ("Train", "Val"), ("Train", "Test") and
	    ("Val", "Test"), in that order.
	    """
	    unique_values = {
	        "Train": set(train[column].unique().tolist()),
	        "Val": set(val[column].unique().tolist()),
	        "Test": set(test[column].unique().tolist()),
	    }
	    pairs = (("Train", "Val"), ("Train", "Test"), ("Val", "Test"))
	    return {
	        (left, right): len(unique_values[left] & unique_values[right])
	        for left, right in pairs
	    }


	def _require_disjoint(overlaps, what):
	    """Raise AssertionError naming every pair of splits that shares `what`."""
	    leaks = [
	        f"{left}-{right}: {count}"
	        for (left, right), count in overlaps.items()
	        if count
	    ]
	    if leaks:
	        raise AssertionError(f"Overlap in {what} between splits ({', '.join(leaks)})")


	def check_validity(train, val, test, split_by="both"):
	    """
	    Rigorous check that train, val and test do not overlap.

	    Columns = ["ID","dna_sequence","tr_sequence","tr_cluster_rep","dna_cluster_rep", "scores","split"]

	    IDs must always be disjoint. Overlaps in TR and DNA sequences and cluster
	    reps are always logged, but only enforced for the axis being split on: TR
	    overlap is an error unless split_by == "dna", and DNA overlap is an error
	    unless split_by == "protein".

	    Raises:
	        AssertionError: If an enforced overlap is found. It is raised
	            explicitly so the check still runs under `python -O`.
	    """
	    _require_disjoint(_count_split_overlaps(train, val, test, "ID"), "IDs")
	    logger.info("Pass! No overlap in IDs")

	    # Report every intersection first. No errors unless we are explicitly
	    # splitting on that axis (enforced below).
	    reported = (
	        ("tr_sequence", "TR"),
	        ("tr_cluster_rep", "TR Cluster Rep"),
	        ("dna_sequence", "DNA"),
	        ("dna_cluster_rep", "DNA Cluster Rep"),
	    )
	    overlaps = {}
	    for column, label in reported:
	        overlaps[column] = _count_split_overlaps(train, val, test, column)
	        for (left, right), count in overlaps[column].items():
	            logger.info(f"{left}-{right} {label} intersection: {count}")

	    if split_by != "dna":
	        _require_disjoint(overlaps["tr_sequence"], "TR sequences")
	        logger.info("Pass! No overlap in TR sequences")

	        _require_disjoint(overlaps["tr_cluster_rep"], "TR cluster reps")
	        logger.info("Pass! No overlap in TR cluster reps")

	    if split_by != "protein":
	        _require_disjoint(overlaps["dna_sequence"], "DNA sequences")
	        logger.info("Pass! No overlap in DNA sequences")

	        _require_disjoint(overlaps["dna_cluster_rep"], "DNA cluster reps")
	        logger.info("Pass! No overlap in DNA cluster reps")


	def augment_rc(df):
	    """
	    Append a reverse-complement copy of every datapoint, doubling the dataset.

	    In each appended row the DNA sequence is replaced by its reverse
	    complement, "_rc" is added to the ID, and the comma-separated scores are
	    reversed so they stay aligned with the flipped sequence.

	    columns = ["ID","dna_sequence","tr_sequence","tr_cluster_rep","dna_cluster_rep", "scores","split"]
	    """
	    flipped = df.copy(deep=True)
	    flipped["dna_sequence"] = flipped["dna_sequence"].apply(get_reverse_complement)
	    flipped["ID"] = flipped["ID"] + "_rc"
	    flipped["scores"] = flipped["scores"].apply(
	        lambda score_str: ",".join(reversed(score_str.split(",")))
	    )

	    # Originals first, reverse complements after, under a fresh 0..n-1 index.
	    return pd.concat([df, flipped], ignore_index=True)


	def _log_split_sizes(train, val, test, leaky_test, expected_total):
	    """Log the size and share of each split and whether they add up to `expected_total`."""
	    total = sum([len(train), len(val), len(test), len(leaky_test)])
	    logger.info(f"Length of train dataset: {len(train)} ({100*len(train)/total:.2f}%)")
	    logger.info(f"Length of val dataset: {len(val)} ({100*len(val)/total:.2f}%)")
	    logger.info(f"Length of test dataset: {len(test)} ({100*len(test)/total:.2f}%)")
	    logger.info(f"Length of leaky_test dataset: {len(leaky_test)} ({100*len(leaky_test)/total:.2f}%)")
	    logger.info(f"Total sequences = {total}. Same as edges size? {total==expected_total}")


	def main(cfg: DictConfig):
	    """
	    Take a set of DNA clusters + protein clusters, and create the best possible splits into train/val/test.

	    Reads the datapoints and cluster files named in `cfg.data_task`, splits by
	    DNA cluster (optionally around a predefined test set), checks the splits
	    for leakage, optionally adds reverse-complement datapoints, and writes
	    train/val/test/leaky_test CSVs to `cfg.data_task.split_out_dir`.

	    Raises:
	        NotImplementedError: If `cfg.data_task.split_by` is not "dna".
	        AssertionError: If datapoint IDs are not unique, if the splits overlap,
	            or if the reverse-complement DNA map is inconsistent.
	    """
	    # construct edges from training data
	    edge_df = make_edges(
	        processed_fimo_path=Path(root) / cfg.data_task.input_data_path,
	        protein_cluster_path=Path(root) / cfg.data_task.cluster_output_paths.protein,
	        dna_cluster_path=Path(root) / cfg.data_task.cluster_output_paths.dna,
	    )

	    # figure out if we actually even have a conflict
	    total_proteins = len(edge_df["tr_seqid"].unique())
	    total_protein_clusters = len(edge_df["tr_cluster_rep"].unique())
	    no_protein_overlap = total_proteins == total_protein_clusters
	    logger.info(f"All proteins are in their own clusters: {no_protein_overlap}")

	    # Only DNA-based splitting is implemented here; fail clearly instead of
	    # hitting an undefined variable further down.
	    if cfg.data_task.split_by != "dna":
	        raise NotImplementedError(
	            f"split_by={cfg.data_task.split_by!r} is not supported; only 'dna' is implemented"
	        )

	    if cfg.data_task.test_trs or cfg.data_task.test_dnas:
	        logger.info("Splitting with predefined trs/dnas reserved for test set")
	        splits, kept_by_split = split_with_predefined_test(
	            full_df=edge_df,
	            split_names=("train", "val", "test"),
	            test_trs=cfg.data_task.test_trs if cfg.data_task.test_trs else None,
	            test_dnas=cfg.data_task.test_dnas if cfg.data_task.test_dnas else None,
	            ratios=(0.8, 0.1, 0.1),
	        )
	        # assign() returns new frames, so the label is never written into a
	        # slice of another DataFrame.
	        train = splits["train"].assign(split="train")
	        val = splits["val"].assign(split="val")
	        test = splits["test"].assign(split="test")
	        leaky_test = splits["leaky_test"].assign(split="leaky_test")
	    else:
	        logger.info("Easy split: all proteins are in their own clusters.")
	        dna_clusters = edge_df["dna_cluster_rep"].unique().tolist()
	        dna_assign, kept_by_split = split_bipartite_fast(
	            dna_clusters,
	            split_names=("train", "val", "test"),
	            ratios=(
	                cfg.data_task.train_ratio,
	                cfg.data_task.val_ratio,
	                cfg.data_task.test_ratio,
	            ),
	        )

	        # assign datapoints to a split by their DNA cluster rep
	        edge_df["split"] = edge_df["dna_cluster_rep"].map(dna_assign)
	        train = edge_df.loc[edge_df["split"] == "train"].reset_index(drop=True)
	        val = edge_df.loc[edge_df["split"] == "val"].reset_index(drop=True)
	        test = edge_df.loc[edge_df["split"] == "test"].reset_index(drop=True)
	        leaky_test = pd.DataFrame(columns=edge_df.columns)

	    # Print ratios: hopefully close to desired (e.g. 80/10/10)
	    print_split_ratios(kept_by_split)

	    # Make train, val, test sets
	    # make sure no ID is duplicate
	    if len(edge_df["ID"].unique()) != len(edge_df):
	        raise AssertionError("Datapoint IDs are not unique")
	    split_cols = [
	        "ID",
	        "dna_sequence",
	        "tr_sequence",
	        "tr_cluster_rep",
	        "dna_cluster_rep",
	        "scores",
	        "split",
	    ]
	    train = train[split_cols]
	    val = val[split_cols]
	    test = test[split_cols]
	    leaky_test = leaky_test[split_cols]

	    # ensure there is no overlap
	    check_validity(train, val, test, split_by=cfg.data_task.split_by)
	    _log_split_sizes(train, val, test, leaky_test, expected_total=len(edge_df))

	    unique_dna_before_rc = len(
	        pd.concat([train, val, test, leaky_test])["dna_sequence"].unique()
	    )

	    ## Now do RC data augmentation if asked
	    if cfg.data_task.augment_rc:
	        train = augment_rc(train)
	        val = augment_rc(val)
	        test = augment_rc(test)
	        leaky_test = augment_rc(leaky_test)

	        logger.info("Added reverse complement sequences to train, val, and test (and leaky test)")

	        check_validity(train, val, test, split_by=cfg.data_task.split_by)
	        # Every datapoint now has a reverse-complement twin, so the expected
	        # total is twice the number of original rows.
	        _log_split_sizes(train, val, test, leaky_test, expected_total=2 * len(edge_df))

	        # since we've added all these new DNA sequences, we do need a new mapping of seq id to dna sequence
	        all_data = pd.concat([train, val, test, leaky_test])
	        # The DNA sequence ID is taken to be everything after the first "_" of the datapoint ID.
	        all_data["dna_seqid"] = all_data["ID"].str.split("_", n=1, expand=True)[1]
	        dna_dict = dict(zip(all_data["dna_seqid"], all_data["dna_sequence"]))
	        if len(dna_dict) != len(all_data.drop_duplicates(["dna_sequence"])):
	            raise AssertionError(
	                "DNA sequence IDs and DNA sequences are not in one-to-one correspondence"
	            )
	        new_map_path = str(Path(root) / cfg.data_task.dna_map_path).replace(
	            ".json", "_with_rc.json"
	        )

	        with open(new_map_path, "w") as f:
	            json.dump(dna_dict, f, indent=2)
	        logger.info(
	            f"Saved DNA maps with reverse complements (len {len(dna_dict)}=2*original map of len {unique_dna_before_rc}=={len(dna_dict)==2*unique_dna_before_rc}) to {new_map_path}"
	        )

	    # create the output dir
	    split_out_dir = Path(root) / cfg.data_task.split_out_dir
	    os.makedirs(split_out_dir, exist_ok=True)

	    # add binary_scores to allow other training modes
	    # NOTE(review): the column name "fimo_binary_sores" is kept exactly as is,
	    # since files already written and their readers may depend on it -- confirm before renaming.
	    outputs = {
	        name: frame.assign(
	            fimo_binary_sores=frame["scores"].apply(lambda x: convert_scores(x, mode=1))
	        )
	        for name, frame in (
	            ("train", train),
	            ("val", val),
	            ("test", test),
	            ("leaky_test", leaky_test),
	        )
	    }

	    # select final cols and save
	    split_final_cols = ["ID", "dna_sequence", "tr_sequence", "scores", "fimo_binary_sores", "split"]
	    for name, frame in outputs.items():
	        frame[split_final_cols].to_csv(split_out_dir / f"{name}.csv", index=False)
	    logger.info(f"Saved all splits to {split_out_dir}")