name: remap
type: split
max_protein_length: 1998
cluster_output_paths:
  dna: dpacman/data_files/processed/mmseqs/outputs/fimo_hits_only/dna_full/mmseqs_cluster.tsv
  protein: dpacman/data_files/processed/mmseqs/outputs/fimo_hits_only/protein/mmseqs_cluster.tsv
input_data_path: dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/remap2022_crm_fimo_output_q_processed_seed0.parquet
split_out_dir: dpacman/data_files/processed/splits
dna_map_path: dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/maps/dna_seqid_to_dna_sequence.json
split_by: dna # protein, dna, or both
test_trs: ["trseq23","trseq26","trseq17"]
test_dnas: null
augment_rc: true
test_ratio: 0.10
val_ratio: 0.10
train_ratio: 0.80
require_nonempty: true
ratio_tolerance: null
bigM: null
seed: 0