{ "cells": [ { "cell_type": "markdown", "id": "82a7f2d0", "metadata": {}, "source": [ "# Scrap notebook for figuring out how to make better splits" ] }, { "cell_type": "code", "execution_count": 38, "id": "9004776a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDtr_seqiddna_seqidpeak_seqidchrpeak_idtr_namechipscoretotal_jaspar_hitsdna_sequencetr_sequencescores
0trseq26_dnaseq49877trseq26dnaseq49877peakseq24686chr12_peak1150NFYB6.01GCTCTTAAAGATGGTGTGTCCAGAGTTTGTTCCTTCAGATGTTCAG...MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
1trseq26_dnaseq75052trseq26dnaseq75052peakseq12129chr9_peak512NFYB5.01TGTTGGTCTCGCTGACCTCAAGAACGGAGCCGTGGACCCTCGCGGT...MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
2trseq26_dnaseq14843trseq26dnaseq14843peakseq12863chr1_peak1335NFYB6.03AGTTTGGGTACTCAAATATGGTACCAGCAACCAGATGGTGAGTTGC...MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
3trseq26_dnaseq39522trseq26dnaseq39522peakseq5250chr5_peak280NFYB4.01CTTGGAGAACCTTTATGTCTAGCTAAGGGATTGTAAATACACCAAT...MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
4trseq26_dnaseq49215trseq26dnaseq49215peakseq4451chr4_peak201NFYB138.02GCGGTGACTGTTACAGTTCTTAAAGGCGGCGTGTCTGGAGTTTGTT...MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
....................................
71152trseq9_dnaseq4066trseq9dnaseq4066peakseq12188chr18_peak213CTCF1000.01AATAATATATCTATTTCTTTATCTTTGTCTTCCCTACTGGACTAGC...MEGDAVEAIVEESETFIKGKERKTYQRRREGGQEEDACHLPQNQTD...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
71153trseq9_dnaseq62118trseq9dnaseq62118peakseq46829chr18_peak812CTCF1000.01TAAATATGTATTTTAGTAAAGTGTTATGATACACTGTGATGGGGGT...MEGDAVEAIVEESETFIKGKERKTYQRRREGGQEEDACHLPQNQTD...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
71154trseq9_dnaseq41538trseq9dnaseq41538peakseq3957chrY_peak4CTCF267.01GACAGGAGTTGTGTACGAATGTGTGTGAATGTGGGAGCCTAACTAG...MEGDAVEAIVEESETFIKGKERKTYQRRREGGQEEDACHLPQNQTD...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
71155trseq9_dnaseq38134trseq9dnaseq38134peakseq40502chr5_peak1955CTCF1000.02CTGGGCGGGTAGGTGAGAGGACAGGAGGGCGAAGTGGAGAGGAGGG...MEGDAVEAIVEESETFIKGKERKTYQRRREGGQEEDACHLPQNQTD...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
71156trseq9_dnaseq5106trseq9dnaseq5106peakseq37888chr1_peak4019CTCF14.02ACAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA...MEGDAVEAIVEESETFIKGKERKTYQRRREGGQEEDACHLPQNQTD...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
\n", "

71157 rows × 11 columns

\n", "
" ], "text/plain": [ " ID tr_seqid dna_seqid peak_seqid \\\n", "0 trseq26_dnaseq49877 trseq26 dnaseq49877 peakseq24686 \n", "1 trseq26_dnaseq75052 trseq26 dnaseq75052 peakseq12129 \n", "2 trseq26_dnaseq14843 trseq26 dnaseq14843 peakseq12863 \n", "3 trseq26_dnaseq39522 trseq26 dnaseq39522 peakseq5250 \n", "4 trseq26_dnaseq49215 trseq26 dnaseq49215 peakseq4451 \n", "... ... ... ... ... \n", "71152 trseq9_dnaseq4066 trseq9 dnaseq4066 peakseq12188 \n", "71153 trseq9_dnaseq62118 trseq9 dnaseq62118 peakseq46829 \n", "71154 trseq9_dnaseq41538 trseq9 dnaseq41538 peakseq3957 \n", "71155 trseq9_dnaseq38134 trseq9 dnaseq38134 peakseq40502 \n", "71156 trseq9_dnaseq5106 trseq9 dnaseq5106 peakseq37888 \n", "\n", " chrpeak_id tr_name chipscore total_jaspar_hits \\\n", "0 chr12_peak1150 NFYB 6.0 1 \n", "1 chr9_peak512 NFYB 5.0 1 \n", "2 chr1_peak1335 NFYB 6.0 3 \n", "3 chr5_peak280 NFYB 4.0 1 \n", "4 chr4_peak201 NFYB 138.0 2 \n", "... ... ... ... ... \n", "71152 chr18_peak213 CTCF 1000.0 1 \n", "71153 chr18_peak812 CTCF 1000.0 1 \n", "71154 chrY_peak4 CTCF 267.0 1 \n", "71155 chr5_peak1955 CTCF 1000.0 2 \n", "71156 chr1_peak4019 CTCF 14.0 2 \n", "\n", " dna_sequence \\\n", "0 GCTCTTAAAGATGGTGTGTCCAGAGTTTGTTCCTTCAGATGTTCAG... \n", "1 TGTTGGTCTCGCTGACCTCAAGAACGGAGCCGTGGACCCTCGCGGT... \n", "2 AGTTTGGGTACTCAAATATGGTACCAGCAACCAGATGGTGAGTTGC... \n", "3 CTTGGAGAACCTTTATGTCTAGCTAAGGGATTGTAAATACACCAAT... \n", "4 GCGGTGACTGTTACAGTTCTTAAAGGCGGCGTGTCTGGAGTTTGTT... \n", "... ... \n", "71152 AATAATATATCTATTTCTTTATCTTTGTCTTCCCTACTGGACTAGC... \n", "71153 TAAATATGTATTTTAGTAAAGTGTTATGATACACTGTGATGGGGGT... \n", "71154 GACAGGAGTTGTGTACGAATGTGTGTGAATGTGGGAGCCTAACTAG... \n", "71155 CTGGGCGGGTAGGTGAGAGGACAGGAGGGCGAAGTGGAGAGGAGGG... \n", "71156 ACAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA... \n", "\n", " tr_sequence \\\n", "0 MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN... \n", "1 MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN... \n", "2 MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN... 
\n", "3 MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN... \n", "4 MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN... \n", "... ... \n", "71152 MEGDAVEAIVEESETFIKGKERKTYQRRREGGQEEDACHLPQNQTD... \n", "71153 MEGDAVEAIVEESETFIKGKERKTYQRRREGGQEEDACHLPQNQTD... \n", "71154 MEGDAVEAIVEESETFIKGKERKTYQRRREGGQEEDACHLPQNQTD... \n", "71155 MEGDAVEAIVEESETFIKGKERKTYQRRREGGQEEDACHLPQNQTD... \n", "71156 MEGDAVEAIVEESETFIKGKERKTYQRRREGGQEEDACHLPQNQTD... \n", "\n", " scores \n", "0 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... \n", "1 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... \n", "2 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... \n", "3 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... \n", "4 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... \n", "... ... \n", "71152 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... \n", "71153 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... \n", "71154 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... \n", "71155 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... \n", "71156 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... 
\n", "\n", "[71157 rows x 11 columns]" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from pathlib import Path\n", "\n", "import pandas as pd\n", "\n", "# Every input sits under one processed-data root; edit it here to re-point the notebook.\n", "DATA_ROOT = Path(\"/home/a03-svincoff/DPACMAN/dpacman/data_files/processed\")\n", "MMSEQS_DIR = DATA_ROOT / \"mmseqs\" / \"outputs\" / \"fimo_hits_only\"\n", "\n", "# The mmseqs cluster TSVs are read as headerless, tab-separated tables; the two columns\n", "# are labelled (cluster representative, cluster member) at read time.\n", "protein_clusters = pd.read_csv(\n", "    MMSEQS_DIR / \"protein\" / \"mmseqs_cluster.tsv\",\n", "    sep=\"\\t\",\n", "    header=None,\n", "    names=[\"tr_cluster_rep\", \"tr_cluster_member\"],\n", ")\n", "dna_clusters = pd.read_csv(\n", "    MMSEQS_DIR / \"dna_full\" / \"mmseqs_cluster.tsv\",\n", "    sep=\"\\t\",\n", "    header=None,\n", "    names=[\"dna_cluster_rep\", \"dna_cluster_member\"],\n", ")\n", "\n", "# Main table of processed FIMO hits; it is the cell's only displayed output.\n", "all_data = pd.read_parquet(\n", "    DATA_ROOT / \"fimo\" / \"post_fimo\" / \"fimo_hits_only\" / \"remap2022_crm_fimo_output_q_processed_seed0.parquet\"\n", ")\n", "all_data" ] }, { "cell_type": "code", "execution_count": null, "id": "ce5cf600", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "29\n", "71134\n", "0\n", "0\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDtr_seqiddna_seqidpeak_seqidchrpeak_idtr_namechipscoretotal_jaspar_hitsdna_sequencetr_sequencescorestr_cluster_repdna_cluster_rep
0trseq26_dnaseq49877trseq26dnaseq49877peakseq24686chr12_peak1150NFYB6.01GCTCTTAAAGATGGTGTGTCCAGAGTTTGTTCCTTCAGATGTTCAG...MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...trseq26dnaseq58018
1trseq26_dnaseq75052trseq26dnaseq75052peakseq12129chr9_peak512NFYB5.01TGTTGGTCTCGCTGACCTCAAGAACGGAGCCGTGGACCCTCGCGGT...MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...trseq26dnaseq75052
2trseq26_dnaseq14843trseq26dnaseq14843peakseq12863chr1_peak1335NFYB6.03AGTTTGGGTACTCAAATATGGTACCAGCAACCAGATGGTGAGTTGC...MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...trseq26dnaseq14843
3trseq26_dnaseq39522trseq26dnaseq39522peakseq5250chr5_peak280NFYB4.01CTTGGAGAACCTTTATGTCTAGCTAAGGGATTGTAAATACACCAAT...MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...trseq26dnaseq3280
4trseq26_dnaseq49215trseq26dnaseq49215peakseq4451chr4_peak201NFYB138.02GCGGTGACTGTTACAGTTCTTAAAGGCGGCGTGTCTGGAGTTTGTT...MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...trseq26dnaseq57257
\n", "
" ], "text/plain": [ " ID tr_seqid dna_seqid peak_seqid chrpeak_id \\\n", "0 trseq26_dnaseq49877 trseq26 dnaseq49877 peakseq24686 chr12_peak1150 \n", "1 trseq26_dnaseq75052 trseq26 dnaseq75052 peakseq12129 chr9_peak512 \n", "2 trseq26_dnaseq14843 trseq26 dnaseq14843 peakseq12863 chr1_peak1335 \n", "3 trseq26_dnaseq39522 trseq26 dnaseq39522 peakseq5250 chr5_peak280 \n", "4 trseq26_dnaseq49215 trseq26 dnaseq49215 peakseq4451 chr4_peak201 \n", "\n", " tr_name chipscore total_jaspar_hits \\\n", "0 NFYB 6.0 1 \n", "1 NFYB 5.0 1 \n", "2 NFYB 6.0 3 \n", "3 NFYB 4.0 1 \n", "4 NFYB 138.0 2 \n", "\n", " dna_sequence \\\n", "0 GCTCTTAAAGATGGTGTGTCCAGAGTTTGTTCCTTCAGATGTTCAG... \n", "1 TGTTGGTCTCGCTGACCTCAAGAACGGAGCCGTGGACCCTCGCGGT... \n", "2 AGTTTGGGTACTCAAATATGGTACCAGCAACCAGATGGTGAGTTGC... \n", "3 CTTGGAGAACCTTTATGTCTAGCTAAGGGATTGTAAATACACCAAT... \n", "4 GCGGTGACTGTTACAGTTCTTAAAGGCGGCGTGTCTGGAGTTTGTT... \n", "\n", " tr_sequence \\\n", "0 MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN... \n", "1 MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN... \n", "2 MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN... \n", "3 MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN... \n", "4 MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN... \n", "\n", " scores tr_cluster_rep \\\n", "0 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... trseq26 \n", "1 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... trseq26 \n", "2 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... trseq26 \n", "3 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... trseq26 \n", "4 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... 
trseq26 \n", "\n", " dna_cluster_rep \n", "0 dnaseq58018 \n", "1 dnaseq75052 \n", "2 dnaseq14843 \n", "3 dnaseq3280 \n", "4 dnaseq57257 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Lookup tables: cluster member id -> id of that member's cluster representative.\n", "protein_cluster_map = protein_clusters.set_index(\"tr_cluster_member\")[\"tr_cluster_rep\"].to_dict()\n", "dna_cluster_map = dna_clusters.set_index(\"dna_cluster_member\")[\"dna_cluster_rep\"].to_dict()\n", "print(len(protein_cluster_map))\n", "print(len(dna_cluster_map))\n", "\n", "# Tag every row with the representative of its protein cluster and of its DNA cluster.\n", "all_data[\"tr_cluster_rep\"] = all_data[\"tr_seqid\"].map(protein_cluster_map)\n", "all_data[\"dna_cluster_rep\"] = all_data[\"dna_seqid\"].map(dna_cluster_map)\n", "\n", "# An id missing from its lookup maps to NaN; report how many rows were left unmapped.\n", "print(all_data[\"tr_cluster_rep\"].isna().sum())\n", "print(all_data[\"dna_cluster_rep\"].isna().sum())\n", "all_data.head()" ] }, { "cell_type": "code", "execution_count": 8, "id": "2f09e399", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0FromEntryReviewedEntry NameProtein namesGene NamesOrganismLengthSequenceInterProPfamMotifZinc fingerProtein familiesBinding siteSite
00E2F8A0AVK6reviewedE2F8_HUMANTranscription factor E2F8 (E2F-8)E2F8Homo sapiens (Human)867MENEKENLFCEPHKRGLMKTPLKESTTANIVLAEIQPDFGPLTTPT...IPR015633;IPR003316;IPR036388;IPR036390;PF02319;NaNNaNE2F/DP familyNaNNaN
11FEZF1A0PJY2reviewedFEZF1_HUMANFez family zinc finger protein 1 (Zinc finger ...FEZF1 FEZ ZNF312BHomo sapiens (Human)475MDSSCHNATTKMLATAPARGNMMSTSKPLAFSIERIMARTPEPKAL...IPR036236;IPR013087;PF00096;PF13912;MOTIF 28..43; /note=\"Engrailed homology 1 repr...ZN_FING 260..282; /note=\"C2H2-type 1\"; /eviden...Krueppel C2H2-type zinc-finger protein familyNaNNaN
22ZNF320A2RRD8reviewedZN320_HUMANZinc finger protein 320ZNF320Homo sapiens (Human)509MALSQGLLTFRDVAIEFSQEEWKCLDPAQRTLYRDVMLENYRNLVS...IPR050589;IPR001909;IPR036051;IPR036236;IPR013...PF01352;PF00096;NaNZN_FING 161..183; /note=\"C2H2-type 1\"; /eviden...Krueppel C2H2-type zinc-finger protein familyNaNNaN
33BDP1A6H8Y1reviewedBDP1_HUMANTranscription factor TFIIIB component B'' homo...BDP1 KIAA1241 KIAA1689 TFNRHomo sapiens (Human)2624MFRRARLSVKPNVRPGVGARGSTASNPQRGRESPRPPDPATDSASK...IPR009057;IPR001005;IPR039467;PF15963;NaNNaNNaNNaNNaN
44ZNF316A6NFI3reviewedZN316_HUMANZinc finger protein 316ZNF316Homo sapiens (Human)1004MAALHTTPDSPAAQLERAEDGSECDPDQEEEEEEEEKGEEVQEVEE...IPR001909;IPR036051;IPR036236;IPR013087;PF01352;PF00096;NaNZN_FING 345..367; /note=\"C2H2-type 1\"; /eviden...Krueppel C2H2-type zinc-finger protein familyNaNNaN
......................................................
11771177GMEB1Q9Y692reviewedGMEB1_HUMANGlucocorticoid modulatory element-binding prot...GMEB1Homo sapiens (Human)573MANAEVSVPVGDVVVVPTEGNEGENPEDTKTQVILQLQPVQQGLFI...IPR010919;IPR000770;PF01342;NaNNaNNaNBINDING 113; /ligand=\"Zn(2+)\"; /ligand_id=\"ChE...NaN
11781178ZFP37Q9Y6Q3reviewedZFP37_HUMANZinc finger protein 37 homolog (Zfp-37)ZFP37Homo sapiens (Human)630MSVSSGVQILTKPETVDRRRSAETTKEAGRPLEMAVSEPEASAAEW...IPR001909;IPR036051;IPR050826;IPR056436;IPR036...PF01352;PF00096;PF23561;NaNZN_FING 293..315; /note=\"C2H2-type 1\"; /eviden...Krueppel C2H2-type zinc-finger protein familyNaNNaN
11791179NCOA3Q9Y6Q9reviewedNCOA3_HUMANNuclear receptor coactivator 3 (NCoA-3) (EC 2....NCOA3 AIB1 BHLHE42 RAC3 TRAM1Homo sapiens (Human)1424MSGLGENLDPLASDSRKRKLPCDTPGQGLTCSGEKRRREQESKYIE...IPR011598;IPR056193;IPR036638;IPR010011;IPR032...PF23172;PF07469;PF16279;PF16665;PF08815;PF0098...MOTIF 685..689; /note=\"LXXLL motif 1\"; MOTIF 7...NaNSRC/p160 nuclear receptor coactivator familyNaNNaN
11801180ZHX2Q9Y6X8reviewedZHX2_HUMANZinc fingers and homeoboxes protein 2 (Alpha-f...ZHX2 AFR1 KIAA0854 RAFHomo sapiens (Human)837MASKRKSTTPCMVRTSQVVEQDVPEEVDRAKEKGIGTPQPDVAKDS...IPR001356;IPR009057;IPR041057;IPR036236;IPR013...PF00046;PF18387;NaNZN_FING 78..101; /note=\"C2H2-type 1\"; /evidenc...ZHX familyNaNNaN
11811181MORC2Q9Y6X9reviewedMORC2_HUMANATPase MORC2 (EC 3.6.1.-) (MORC family CW-type...MORC2 KIAA0852 ZCWCC1Homo sapiens (Human)1032MAFTNYSSLNRAQLTFEYLHTNSTTHEFLFGALAELVDNARDADAT...IPR056360;IPR036890;IPR041006;IPR011124;PF23327;PF13589;PF17942;PF07496;NaNZN_FING 490..544; /note=\"CW-type\"; /evidence=\"...NaNBINDING 39; /ligand=\"ATP\"; /ligand_id=\"ChEBI:C...NaN
\n", "

1182 rows × 17 columns

\n", "
" ], "text/plain": [ " Unnamed: 0 From Entry Reviewed Entry Name \\\n", "0 0 E2F8 A0AVK6 reviewed E2F8_HUMAN \n", "1 1 FEZF1 A0PJY2 reviewed FEZF1_HUMAN \n", "2 2 ZNF320 A2RRD8 reviewed ZN320_HUMAN \n", "3 3 BDP1 A6H8Y1 reviewed BDP1_HUMAN \n", "4 4 ZNF316 A6NFI3 reviewed ZN316_HUMAN \n", "... ... ... ... ... ... \n", "1177 1177 GMEB1 Q9Y692 reviewed GMEB1_HUMAN \n", "1178 1178 ZFP37 Q9Y6Q3 reviewed ZFP37_HUMAN \n", "1179 1179 NCOA3 Q9Y6Q9 reviewed NCOA3_HUMAN \n", "1180 1180 ZHX2 Q9Y6X8 reviewed ZHX2_HUMAN \n", "1181 1181 MORC2 Q9Y6X9 reviewed MORC2_HUMAN \n", "\n", " Protein names \\\n", "0 Transcription factor E2F8 (E2F-8) \n", "1 Fez family zinc finger protein 1 (Zinc finger ... \n", "2 Zinc finger protein 320 \n", "3 Transcription factor TFIIIB component B'' homo... \n", "4 Zinc finger protein 316 \n", "... ... \n", "1177 Glucocorticoid modulatory element-binding prot... \n", "1178 Zinc finger protein 37 homolog (Zfp-37) \n", "1179 Nuclear receptor coactivator 3 (NCoA-3) (EC 2.... \n", "1180 Zinc fingers and homeoboxes protein 2 (Alpha-f... \n", "1181 ATPase MORC2 (EC 3.6.1.-) (MORC family CW-type... \n", "\n", " Gene Names Organism Length \\\n", "0 E2F8 Homo sapiens (Human) 867 \n", "1 FEZF1 FEZ ZNF312B Homo sapiens (Human) 475 \n", "2 ZNF320 Homo sapiens (Human) 509 \n", "3 BDP1 KIAA1241 KIAA1689 TFNR Homo sapiens (Human) 2624 \n", "4 ZNF316 Homo sapiens (Human) 1004 \n", "... ... ... ... \n", "1177 GMEB1 Homo sapiens (Human) 573 \n", "1178 ZFP37 Homo sapiens (Human) 630 \n", "1179 NCOA3 AIB1 BHLHE42 RAC3 TRAM1 Homo sapiens (Human) 1424 \n", "1180 ZHX2 AFR1 KIAA0854 RAF Homo sapiens (Human) 837 \n", "1181 MORC2 KIAA0852 ZCWCC1 Homo sapiens (Human) 1032 \n", "\n", " Sequence \\\n", "0 MENEKENLFCEPHKRGLMKTPLKESTTANIVLAEIQPDFGPLTTPT... \n", "1 MDSSCHNATTKMLATAPARGNMMSTSKPLAFSIERIMARTPEPKAL... \n", "2 MALSQGLLTFRDVAIEFSQEEWKCLDPAQRTLYRDVMLENYRNLVS... \n", "3 MFRRARLSVKPNVRPGVGARGSTASNPQRGRESPRPPDPATDSASK... 
\n", "4 MAALHTTPDSPAAQLERAEDGSECDPDQEEEEEEEEKGEEVQEVEE... \n", "... ... \n", "1177 MANAEVSVPVGDVVVVPTEGNEGENPEDTKTQVILQLQPVQQGLFI... \n", "1178 MSVSSGVQILTKPETVDRRRSAETTKEAGRPLEMAVSEPEASAAEW... \n", "1179 MSGLGENLDPLASDSRKRKLPCDTPGQGLTCSGEKRRREQESKYIE... \n", "1180 MASKRKSTTPCMVRTSQVVEQDVPEEVDRAKEKGIGTPQPDVAKDS... \n", "1181 MAFTNYSSLNRAQLTFEYLHTNSTTHEFLFGALAELVDNARDADAT... \n", "\n", " InterPro \\\n", "0 IPR015633;IPR003316;IPR036388;IPR036390; \n", "1 IPR036236;IPR013087; \n", "2 IPR050589;IPR001909;IPR036051;IPR036236;IPR013... \n", "3 IPR009057;IPR001005;IPR039467; \n", "4 IPR001909;IPR036051;IPR036236;IPR013087; \n", "... ... \n", "1177 IPR010919;IPR000770; \n", "1178 IPR001909;IPR036051;IPR050826;IPR056436;IPR036... \n", "1179 IPR011598;IPR056193;IPR036638;IPR010011;IPR032... \n", "1180 IPR001356;IPR009057;IPR041057;IPR036236;IPR013... \n", "1181 IPR056360;IPR036890;IPR041006;IPR011124; \n", "\n", " Pfam \\\n", "0 PF02319; \n", "1 PF00096;PF13912; \n", "2 PF01352;PF00096; \n", "3 PF15963; \n", "4 PF01352;PF00096; \n", "... ... \n", "1177 PF01342; \n", "1178 PF01352;PF00096;PF23561; \n", "1179 PF23172;PF07469;PF16279;PF16665;PF08815;PF0098... \n", "1180 PF00046;PF18387; \n", "1181 PF23327;PF13589;PF17942;PF07496; \n", "\n", " Motif \\\n", "0 NaN \n", "1 MOTIF 28..43; /note=\"Engrailed homology 1 repr... \n", "2 NaN \n", "3 NaN \n", "4 NaN \n", "... ... \n", "1177 NaN \n", "1178 NaN \n", "1179 MOTIF 685..689; /note=\"LXXLL motif 1\"; MOTIF 7... \n", "1180 NaN \n", "1181 NaN \n", "\n", " Zinc finger \\\n", "0 NaN \n", "1 ZN_FING 260..282; /note=\"C2H2-type 1\"; /eviden... \n", "2 ZN_FING 161..183; /note=\"C2H2-type 1\"; /eviden... \n", "3 NaN \n", "4 ZN_FING 345..367; /note=\"C2H2-type 1\"; /eviden... \n", "... ... \n", "1177 NaN \n", "1178 ZN_FING 293..315; /note=\"C2H2-type 1\"; /eviden... \n", "1179 NaN \n", "1180 ZN_FING 78..101; /note=\"C2H2-type 1\"; /evidenc... \n", "1181 ZN_FING 490..544; /note=\"CW-type\"; /evidence=\"... 
\n", "\n", " Protein families \\\n", "0 E2F/DP family \n", "1 Krueppel C2H2-type zinc-finger protein family \n", "2 Krueppel C2H2-type zinc-finger protein family \n", "3 NaN \n", "4 Krueppel C2H2-type zinc-finger protein family \n", "... ... \n", "1177 NaN \n", "1178 Krueppel C2H2-type zinc-finger protein family \n", "1179 SRC/p160 nuclear receptor coactivator family \n", "1180 ZHX family \n", "1181 NaN \n", "\n", " Binding site Site \n", "0 NaN NaN \n", "1 NaN NaN \n", "2 NaN NaN \n", "3 NaN NaN \n", "4 NaN NaN \n", "... ... ... \n", "1177 BINDING 113; /ligand=\"Zn(2+)\"; /ligand_id=\"ChE... NaN \n", "1178 NaN NaN \n", "1179 NaN NaN \n", "1180 NaN NaN \n", "1181 BINDING 39; /ligand=\"ATP\"; /ligand_id=\"ChEBI:C... NaN \n", "\n", "[1182 rows x 17 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load the processed ID-mapping TSV; read_table uses a tab separator by default.\n", "idmap_path = \"/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/remap/idmapping_reviewed_true_processed_2025_08_11.tsv\"\n", "idmap = pd.read_table(idmap_path)\n", "idmap" ] }, { "cell_type": "code", "execution_count": 9, "id": "94f26da4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'trseq13': 'MGRKKIQITRIMDERNRQVTFTKRKFGLMKKAYELSVLCDCEIALIIFNSSNKLFQYASTDMDKVLLKYTEYNEPHESRTNSDIVEALNKKEHRGCDSPDPDTSYVLTPHTEEKYKKINEEFDNMMRNHKIAPGLPPQNFSMSVTVPVTSPNALSYTNPGSSLVSPSLAASSTLTDSSMLSPPQTTLHRNVSPGAPQRPPSTGNAGGMLSTTDLTVPNGAGSSPVGNGFVNSRASPNLIGATGANSLGKVMPTKSPPPPGGGNLGMNSRKPDLRVVIPPSSKGMMPPLSEEEELELNTQRISSSQATQPLATPVVSVTTPSLPPQGLVYSAMPTAYNTDYSLTSADLSALQGFNSPGMLSLGQVSAWQQHHLGQAALSSLVAGGQLSQGSNLSINTNQNISIKSEPISPPRDRMTPSGFQQQQQQQQQQQPPPPPQPQPQPPQPQPRQEMGRSPVDSLSSSSSSYDGSDREDPRGDFHSPIVLGRPPNTEDRESPSVKRMRMDAWVT',\n", " 'trseq8': 
'MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD',\n", " 'trseq29': 'MTSSSPAGLEGSDLSSINTMMSAVMSVGKVTENGGSPQGIKSPSKPPGPNRIGRRNQETKEEKSSYNCPLCEKICTTQHQLTMHIRQHNTDTGGADHSCSICGKSLSSASSLDRHMLVHSGERPYKCTVCGQSFTTNGNMHRHMKIHEKDPNSATATAPPSPLKRRRLSSKRKLSHDAESEREDPAPAKKMVEDGQSGDLEKKADEVFHCPVCFKEFVCKYGLETHMETHSDNPLRCDICCVTFRTHRGLLRHNALVHKQLPRDAMGRPFIQNNPSIPAGFHDLGFTDFSCRKFPRISQAWCETNLRRCISEQHRFVCDTCDKAFPMLCSLALHKQTHVAADQGQEKPQATPLPGDALDQKGFLALLGLQHTKDVRPAPAEEPLPDDNQAIQLQTLKCQLPQDPGCTNLLSLSPFEAASLGGSLTVLPATKDSIKHLSLQPFQKGFIIQPDSSIVVKPISGESAIELADIQQILKMAASAPPQISLPPFSKAPAAPLQAIFKHMPPLKPKPLVTPRTVVATSTPPPLINAQQASPGCISPSLPPPPLKLLKGSVEAASNAHLLQSKSGTQPHAATRLSLQQPRAELPGQPEMKTQLEQDSIIEALLPLSMEAKIKQEITEGELKAFMTAPGGKKTPAMRKVLYPCRFCNQVFAFSGVLRAHVRSHLGISPYQCNICDYIAADKAALIRHLRTHSGERPYICKICHYPFTVKANCERHLRKKHLKATRKDIEKNIEYVSSSAAELVDAFCAPDTVCRLCGEDLKHYRALRIHMRTHCGRGLGGGHKGRKPFECKECSAAFAAKRNCIHHILKQHLHVPEQDIESYVLAADGLGPAEAPAAEASGRGEDSGCAALGDCKPLTAFLEPQNGFLHRGPTQPPPPHVSIKLEPASSFAVDFNEPLDFSQKGLALVQVKQENISFLSPSSLVPYDCSMEPIDLSIPKNFRKGDKDLATPSEAKKPEEEAGSSEQPSPCPAPGPSLPVTLGPSGILESPMAPAPAATPEPPAQPLQGPVQLAVPIYSSALVSSPPLVGSSALLSGTALLRPLRPKPPLLLPKPPVTEELPPLASIAQIISSVSSAPTLLKTKVADPGPASTGSNTTASDSLGGSVPKAATTATPAATTSPKESSEPPAPASSPEAASPTEQGPAGTSKKRGRKRGMRSRPRANSGGVDLDSSGEFASIEKMLATTDTNKFSPFLQTAEDNTQDEVAGAPADHHGPSDEEQGSPPEDKLLRAKRNSYTNCLQKITCPHCPRVFPWASSLQRHMLTHTDSQSDAETAAAAGEVLDLTSRDREQPSEGATELRQVAGDAPVEQATAETASPVHREEHGRGESHEPEEEHGTEESTGDADGAEEDASSNQSLDLDFATKLMDFKLAEGDGEAGAGGAASQEQKLACDTCGKSFKFLGTLSRHRKAHGRQEPKDEKGDGASTAEEGPQPAPEQEEKPPETPAEVVESAPGAGEAPAEKLAEETEGPSDGESAAEKRSSEKSDDDKKPKTDSPKSVASKADKRKKVCSVCNKRFWSLQDLTRHMRSHTGERPYKCQTCERTFTLKHSLVRHQRIHQKARHAKHHGKDSDKEERGEEDS
ENESTHSGNNAVSENEAELAPNASNHMAVTRSRKEGLASATKDCSHREEKVTAGWPSEPGQGDLNPESPAALGQDLLEPRSKRPAHPILATADGASQLVGME',\n", " 'trseq20': 'MPVERMRMRPWLEEQINSNTIPGLKWLNKEKKIFQIPWMHAARHGWDVEKDAPLFRNWAIHTGKHQPGVDKPDPKTWKANFRCAMNSLPDIEEVKDKSIKKGNNAFRVYRMLPLSERPSKKGKKPKTEKEDKVKHIKQEPVESSLGLSNGVSDLSPEYAVLTSTIKNEVDSTVNIIVVGQSHLDSNIENQEIVTNPPDICQVVEVTTESDEQPVSMSELYPLQISPVSSYAESETTDSVPSDEESAEGRPHWRKRNIEGKQYLSNMGTRGSYLLPGMASFVTSNKPDLQVTIKEESNPVPYNSSWPPFQDLPLSSSMTPASSSSRPDRETRASVIKKTSDITQARVKSC',\n", " 'trseq17': 'MNFETSRCATLQYCPDPYIQRFVETPAHFSWKESYYRSTMSQSTQTNEFLSPEVFQHIWDFLEQPICSVQPIDLNFVDEPSEDGATNKIEISMDCIRMQDSDLSDPMWPQYTNLGLLNSMDQQIQNGSSSTSPYNTDHAQNSVTAPSPYAQPSSTFDALSPSPAIPSNTDYPGPHSFDVSFQQSSTAKSATWTYSTELKKLYCQIAKTCPIQIKVMTPPPQGAVIRAMPVYKKAEHVTEVVKRCPNHELSREFNEGQIAPPSHLIRVEGNSHAQYVEDPITGRQSVLVPYEPPQVGTEFTTVLYNFMCNSSCVGGMNRRPILIIVTLETRDGQVLGRRCFEARICACPGRDRKADEDSIRKQQVSDSTKNGDGTKRPFRQNTHGIQMTSIKKRRSPDDELLYLPVRGRETYEMLLKIKESLELMQYLPQHTIETYRQQQQQQHQHLLQKQTSIQSPSSYGNSSPPLNKMNSMNKLPSVSQLINPQQRNALTPTTIPDGMGANIPMMGTHMPMAGDMNGLSPTQALPPPLSMPSTSHCTPPPPYPTDCSIVSFLARLGCSSCLDYFTTQGLTTIYQIEHYSMDDLASLKIPEQFRHAIWKGILDHRQLHEFSSPSHLLRTPSSASTVSVGSSETRGERVIDAVRFTLRQTISFPPRDEWNDFNFDMDARRNKQQRIKEEGE',\n", " 'trseq28': 'MTSPSPRIQIISTDSAVASPQRIQIVTDQQTGQKIQIVTAVDASGSPKQQFILTSPDGAGTGKVILASPETSSAKQLIFTTSDNLVPGRIQIVTDSASVERLLGKTDVQRPQVVEYCVVCGDKASGRHYGAVSCEGCKGFFKRSVRKNLTYSCRSNQDCIINKHHRNRCQFCRLKKCLEMGMKMESVQSERKPFDVQREKPSNCAASTEKIYIRKDLRSPLIATPTFVADKDGARQTGLLDPGMLVNIQQPLIREDGTVLLATDSKAETSQGALGTLANVVTSLANLSESLNNGDTSEIQPEDQSASEITRAFDTLAKALNTTDSSSSPSLADGIDTSGGGSIHVISRDQSTPIIEVEGPLLSDTHVTFKLTMPSPMPEYLNVHYICESASRLLFLSMHWARSIPAFQALGQDCNTSLVRACWNELFTLGLAQCAQVMSLSTILAAIVNHLQNSIQEDKLSGDRIKQVMEHIWKLQEFCNSMAKLDIDGYEYAYLKAIVLFSPDHPGLTSTSQIEKFQEKAQMELQDYVQKTYSEDTYRLARILVRLPALRLMSSNITEELFFTGLIGNVSIDSIIPYILKMETAEYNGQITGASL',\n", " 'trseq27': 
'MTMTLHTKASGMALLHQIQGNELEPLNRPQLKIPLERPLGEVYLDSSKPAVYNYPEGAAYEFNAAAAANAQVYGQTGLPYGPGSEAAAFGSNGLGGFPPLNSVSPSPLMLLHPPPQLSPFLQPHGQQVPYYLENEPSGYTVREAGPPAFYRPNSDNRRQGGRERLASTNDKGSMAMESAKETRYCAVCNDYASGYHYGVWSCEGCKAFFKRSIQGHNDYMCPATNQCTIDKNRRKSCQACRLRKCYEVGMMKGGIRKDRRGGRMLKHKRQRDDGEGRGEVGSAGDMRAANLWPSPLMIKRSKKNSLALSLTADQMVSALLDAEPPILYSEYDPTRPFSEASMMGLLTNLADRELVHMINWAKRVPGFVDLTLHDQVHLLECAWLEILMIGLVWRSMEHPGKLLFAPNLLLDRNQGKCVEGMVEIFDMLLATSSRFRMMNLQGEEFVCLKSIILLNSGVYTFLSSTLKSLEEKDHIHRVLDKITDTLIHLMAKAGLTLQQQHQRLAQLLLILSHIRHMSNKGMEHLYSMKCKNVVPLYDLLLEMLDAHRLHAPTSRGGASVEETDQSHLATAGSTSSHSLQKYYITGEAEGFPATV',\n", " 'trseq15': 'MLWKITDNVKYEEDCEDRHDGSSNGNPRVPHLSSAGQHLYSPAPPLSHTGVAEYQPPPYFPPPYQQLAYSQSADPYSHLGEAYAAAINPLHQPAPTGSQQQAWPGRQSQEGAGLPSHHGRPAGLLPHLSGLEAGAVSARRDAYRRSDLLLPHAHALDAAGLAENLGLHDMPHQMDEVQNVDDQHLLLHDQTVIRKGPISMTKNPLNLPCQKELVGAVMNPTEVFCSVPGRLSLLSSTSKYKVTVAEVQRRLSPPECLNASLLGGVLRRAKSKNGGRSLREKLDKIGLNLPAGRRKAAHVTLLTSLVEGEAVHLARDFAYVCEAEFPSKPVAEYLTRPHLGGRNEMAARKNMLLAAQQLCKEFTELLSQDRTPHGTSRLAPVLETNIQNCLSHFSLITHGFGSQAICAAVSALQNYIKEALIVIDKSYMNPGDQSPADSNKTLEKMEKHRK',\n", " 'trseq4': 'MASGPGSQEREGLLIVKLEEDCAWSQELPPPDPGPSPEASHLRFRRFRFQEAAGPREALSRLQELCHGWLRPEMRTKEQILELLVLEQFLTILPQEIQSRVQELHPESGEEAVTLVEDMQRELGRLRQQVTNHGRGTEVLLEEPLPLETARESPSFKLEPMETERSPGPRLQELLGPSPQRDPQAVKERALSAPWLSLFPPEGNMEDKEMTGPQLPESLEDVAMYISQEEWGHQDPSKRALSRDTVQESYENVDSLESHIPSQEVPGTQVGQGGKLWDPSVQSCKEGLSPRGPAPGEEKFENLEGVPSVCSENIHPQVLLPDQARGEVPWSPELGRPHDRSQGDWAPPPEGGMEQALAGASSGRELGRPKELQPKKLHLCPLCGKNFSNNSNLIRHQRIHAAERLCMGVDCTEIFGGNPRFLSLHRAHLGEEAHKCLECGKCFSQNTHLTRHQRTHTGEKPYQCNICGKCFSCNSNLHRHQRTHTGEKPYKCPECGEIFAHSSNLLRHQRIHTGERPYKCPECGKSFSRSSHLVIHERTHERERLYPFSECGEAVSDSTPFLTNHGAHKAEKKLFECLTCGKSFRQGMHLTRHQRTHTGEKPYKCTLCGENFSHRSNLIRHQRIHTGEKPYTCHECGDSFSHSSNRIRHLRTHTGERPYKCSECGESFSRSSRLMSHQRTHTG',\n", " 'trseq18': 
'MNTTDNGVNCLCAICGDRATGKHYGASSCDGCKGFFRRSIRKSHVYSCRFSRQCVVDKDKRNQCRYCRLRKCFRAGMKKEAVQNERDRISTRRSTFDGSNIPSINTLAQAEVRSRQISVSSPGSSTDINVKKIASIGDVCESMKQQLLVLVEWAKYIPAFCELPLDDQVALLRAHAGEHLLLGATKRSMMYKDILLLGNNYVIHRNSCEVEISRVANRVLDELVRPFQEIQIDDNEYACLKAIVFFDPDAKGLSDPVKIKNMRFQVQIGLEDYINDRQYDSRGRFGELLLLLPTLQSITWQMIEQIQFVKLFGMVKIDNLLQEMLLGGASNDGSHLHHPMHPHLSQDPLTGQTILLGPMSTLVHADQISTPETPLPSPPQGSGQEQYKIAANQASVISHQHLSKQKQL',\n", " 'trseq26': 'MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTNGSKESFREQDIYLPIANVARIMKNAIPQTGKIAKDAKECVQECVSEFISFITSEASERCHQEKRKTINGEDILFAMSTLGFDSYVEPLKLYLQKFREAMKGEKGIGGAVTATDGLSEELTEEAFTNQLPAGLITTDGQQQNVMVYTTSYQQISGVQQIQFS',\n", " 'trseq25': 'MSVDPLSSKALKIKRELSENTPHLSDEALMGLSVRELNRHLRGLSAEEVTRLKQRRRTLKNRGYAASCRVKRVCQKEELQKQKSELEREVDKLARENAAMRLELDALRGKCEALQGFARSVAAARGPATLVAPASVITIVKSTPGSGSGPAHGPDPAHGPASCS',\n", " 'trseq16': 'MMQESGTETKSNGSAIQNGSGGSNHLLECGGLREGRSNGETPAVDIGAADLAHAQQQQQQALQVARQLLLQQQQQQQVSGLKSPKRNDKQPALQVPVSVAMMTPQVITPQQMQQILQQQVLSPQQLQVLLQQQQALMLQQQQLQEFYKKQQEQLQLQLLQQQHAGKQPKEQQQVATQQLAFQQQLLQMQQLQQQHLLSLQRQGLLTIQPGQPALPLQPLAQGMIPTELQQLWKEVTSAHTAEETTGNNHSSLDLTTTCVSSSAPSKTSLIMNPHASTNGQLSVHTPKRESLSHEEHPHSHPLYGHGVCKWPGCEAVCEDFQSFLKHLNSEHALDDRSTAQCRVQMQVVQQLELQLAKDKERLQAMMTHLHVKSTEPKAAPQPLNLVSSVTLSKSASEASPQSLPHTPTTPTAPLTPVTQGPSVITTTSMHTVGPIRRRYSDKYNVPISSADIAQNQEFYKNAEVRPPFTYASLIRQAILESPEKQLTLNEIYNWFTRMFAYFRRNAATWKNAVRHNLSLHKCFVRVENVKGAVWTVDEVEFQKRRPQKISGNPSLIKNMQSSHAYCTPLNAALQASMAENSIPLYTTASMGNPTLGNLASAIREELNGAMEHTNSNESDSSPGRSPMQAVHPVHVKEEPLDPEEAEGPLSLVTTANHSPDFDHDRDYEDEPVNEDME',\n", " 'trseq10': 'MEQYTANSNSSTEQIVVQAGQIQQQQQGGVTAVQLQTEAQVASASGQQVQTLQVVQGQPLMVQVSGGQLITSTGQPIMVQAVPGGQGQTIMQVPVSGTQGLQQIQLVPPGQIQIQGGQAVQVQGQQGQTQQIIIQQPQTAVTAGQTQTQQQIAVQGQQVAQTAEGQTIVYQPVNADGTILQQVTVPVSGMITIPAASLAGAQIVQTGANTNTTSSGQGTVTVTLPVAGNVVNSGGMVMMVPGAGSVPAIQRIPLPGAEMLEEEPLYVNAKQYHRILKRRQARAKLEAEGKIPKERRKYLHESRHRHAMARKRGEGGRFFSPKEKDSPHMQDPNQADEEAMTQIIRVS',\n", " 'trseq23': 
'MSDPQTSMAATAAVSPSDYLQPAASTTQDSQPSPLALLAATCSKIGPPAVEAAVTPPAPPQPTPRKLVPIKPAPLPLSPGKNSFGILSSKGNILQIQGSQLSASYPGGQLVFAIQNPTMINKGTRSNANIQYQAVPQIQASNSQTIQVQPNLTNQIQIIPGTNQAIITPSPSSHKPVPIKPAPIQKSSTTTTPVQSGANVVKLTGGGGNVTLTLPVNNLVNASDTGAPTQLLTESPPTPLSKTNKKARKKSLPASQPPVAVAEQVETVLIETTADNIIQAGNNLLIVQSPGGGQPAVVQQVQVVPPKAEQQQVVQIPQQALRVVQAASATLPTVPQKPSQNFQIQAAEPTPTQVYIRTPSGEVQTVLVQDSPPATAAATSNTTCSSPASRAPHLSGTSKKHSAAILRKERPLPKIAPAGSIISLNAAQLAAAAQAMQTININGVQVQGVPVTITNTGGQQQLTVQNVSGNNLTISGLSPTQIQLQMEQALAGETQPGEKRRRMACTCPNCKDGEKRSGEQGKKKHVCHIPDCGKTFRKTSLLRAHVRLHTGERPFVCNWFFCGKRFTRSDELQRHARTHTGDKRFECAQCQKRFMRSDHLTKHYKTHLVTKNL',\n", " 'trseq11': 'MESRKLISATDIQYSGSLLNSLNEQRGHGLFCDVTVIVEDRKFRAHKNILSASSTYFHQLFSVAGQVVELSFIRAEIFAEILNYIYSSKIVRVRSDLLDELIKSGQLLGVKFIAELGVPLSQVKSISGTAQDGNTEPLPPDSGDKNLVIQKSKDEAQDNGATIMPIITESFSLSAEDYEMKKIIVTDSDDDDDDVIFCSEILPTKETLPSNNTVAQVQSNPGPVAISDVAPSASNNSPPLTNITPTQKLPTPVNQATLSQTQGSEKLLVSSAPTHLTPNIILLNQTPLSTPPNVSSSLPNHMPSSINLLVQNQQTPNSAILTGNKANEEEEEEIIDDDDDTISSSPDSAVSNTSLVPQADTSQNTSFDGSLIQKMQIPTLLQEPLSNSLKISDIITRNTNDPGVGSKHLMEGQKIITLDTATEIEGLSTGCKVYANIGEDTYDIVIPVKDDPDEGEARLENEIPKTSGSEMANKRMKVKHDDHYELIVDGRVYYICIVCKRSYVCLTSLRRHFNIHSWEKKYPCRYCEKVFPLAEYRTKHEIHHTGERRYQCLACGKSFINYQFMSSHIKSVHSQDPSGDSKLYRLHPCRSLQIRQYAYLSDRSSTIPAMKDDGIGYKVDTGKEPPVGTTTSTQNKPMTWEDIFIQQENDSIFKQNVTDGSTEFEFIIPESY',\n", " 'trseq3': 'MASGDTLYIATDGSEMPAEIVELHEIEVETIPVETIETTVVGEEEEEDDDDEDGGGGDHGGGGGHGHAGHHHHHHHHHHHPPMIALQPLVTDDPTQVHHHQEVILVQTREEVVGGDDSDGLRAEDGFEDQILIPVPAPAGGDDDYIEQTLVTVAAAGKSGGGGSSSSGGGRVKKGGGKKSGKKSYLSGGAGAAGGGGADPGNKKWEQKQVQIKTLEGEFSVTMWSSDEKKDIDHETVVEEQIIGENSPPDYSEYMTGKKLPPGGIPGIDLSDPKQLAEFARMKPRKIKEDDAPRTIACPHKGCTKMFRDNSAMRKHLHTHGPRVHVCAECGKAFVESSKLKRHQLVHTGEKPFQCTFEGCGKRFSLDFNLRTHVRIHTGDRPYVCPFDGCNKKFAQSTNLKSHILTHAKAKNNQ',\n", " 'trseq22': 
'MRKGIQPALEQYLVTAGGGEGAAVVAAAAAASMDKRALLASPGFAAAAAAAAAPGAYIQILTTNTSTTSCSSSLQSGAVAAGPLLPSAPGAEQTAGSLLYTTPHGPSSRAGLLQQPPALGRGGSGGGGGPPAKRRLELGESGHQYLSDGLKTPKGKGRAALRSPDSPKTPKSPSEKTRYDTSLGLLTKKFIQLLSQSPDGVLDLNKAAEVLKVQKRRIYDITNVLEGIHLIKKKSKNNVQWMGCSLSEDGGMLAQCQGLSKEVTELSQEEKKLDELIQSCTLDLKLLTEDSENQRLAYVTYQDIRKISGLKDQTVIVVKAPPETRLEVPDSIESLQIHLASTQGPIEVYLCPEETETHSPMKTNNQDHNGNIPKPASKDLASTNSGHSDCSVSMGNLSPLASPANLLQQTEDQIPSNLEGPFVNLLPPLLQEDYLLSLGEEEGISDLFDAYDLEKLPLVEDFMCS',\n", " 'trseq7': 'MDSKESLTPGREENPSSVLAQERGDVMDFYKTLRGGATVKVSASSPSLAVASQSDSKQRRLLVDFPKGSVSNAQQPDLSKAVSLSMGLYMGETETKVMGNDLGFPQQGQISLSSGETDLKLLEESIANLNRSTSVPENPKSSASTAVSAAPTEKEFPKTHSDVSSEQQHLKGQTGTNGGNVKLYTTDQSTFDILQDLEFSSGSPGKETNESPWRSDLLIDENCLLSPLAGEDDSFLLEGNSNEDCKPLILPDTKPKIKDNGDLVLSSPSNVTLPQVKTEKEDFIELCTPGVIKQEKLGTVYCQASFPGANIIGNKMSAISVHGVSTSGGQMYHYDMNTASLSQQQDQKPIFNVIPPIPVGSENWNRCQGSGDDNLTSLGTLNFPGRTVFSNGYSSPSMRPDVSSPPSSSSTATTGPPPKLCLVCSDEASGCHYGVLTCGSCKVFFKRAVEGQHNYLCAGRNDCIIDKIRRKNCPACRYRKCLQAGMNLEARKTKKKIKGIQQATTGVSQETSENPGNKTIVPATLPQLTPTLVSLLEVIEPEVLYAGYDSSVPDSTWRIMTTLNMLGGRQVIAAVKWAKAIPGFRNLHLDDQMTLLQYSWMFLMAFALGWRSYRQSSANLLCFAPDLIINEQRMTLPCMYDQCKHMLYVSSELHRLQVSYEEYLCMKTLLLLSSVPKDGLKSQELFDEIRMTYIKELGKAIVKREGNSSQNWQRFYQLTKLLDSMHEVVENLLNYCFQTFLDKTMSIEFPEMLAEIITNQIPKYSNGNIKKLLFHQK',\n", " 'trseq19': 'MPITRMRMRPWLEMQINSNQIPGLIWINKEEMIFQIPWKHAAKHGWDINKDACLFRSWAIHTGRYKAGEKEPDPKTWKANFRCAMNSLPDIEEVKDQSRNKGSSAVRVYRMLPPLTKNQRKERKSKSSRDAKSKAKRKSCGDSSPDTFSDGLSSSTLPDDHSSYTVPGYMQDLEVEQALTPALSPCAVSSTLPDWHIPVEVVPDSTSDLYNFQVSPMPSTSEATTDEDEEGKLPEDIMKLLEQSEWQPTNVDGKGYLLNEPGVQPTSVYGDFSCKEEPEIDSPGGDIGLSLQRVFTDLKNMDATWLDSLLTPVRLPSIQAIPCAP',\n", " 'trseq5': 
'MATQVMGQSSGGGGLFTSSGNIGMALPNDMYDLHDLSKAELAAPQLIMLANVALTGEVNGSCCDYLVGEERQMAELMPVGDNNFSDSEEGEGLEESADIKGEPHGLENMELRSLELSVVEPQPVFEASGAPDIYSSNKDLPPETPGAEDKGKSSKTKPFRCKPCQYEAESEEQFVHHIRVHSAKKFFVEESAEKQAKARESGSSTAEEGDFSKGPIRCDRCGYNTNRYDHYTAHLKHHTRAGDNERVYKCIICTYTTVSEYHWRKHLRNHFPRKVYTCGKCNYFSDRKNNYVQHVRTHTGERPYKCELCPYSSSQKTHLTRHMRTHSGEKPFKCDQCSYVASNQHEVTRHARQVHNGPKPLNCPHCDYKTADRSNFKKHVELHVNPRQFNCPVCDYAASKKCNLQYHFKSKHPTCPNKTMDVSKVKLKKTKKREADLPDNITNEKTEIEQTKIKGDVAGKKNEKSVKAEKRDVSKEKKPSNNVSVIQVTTRTRKSVTEVKEMDVHTGSNSEKFSKTKKSKRKLEVDSHSLHGPVNDEESSTKKKKKVESKSKNNSQEVPKGDSKVEENKKQNTCMKKSTKKKTLKNKSSKKSSKPPQKEPVEKGSAQMDPPQMGPAPTEAVQKGPVQVEPPPPMEHAQMEGAQIRPAPDEPVQMEVVQEGPAQKELLPPVEPAQMVGAQIVLAHMELPPPMETAQTEVAQMGPAPMEPAQMEVAQVESAPMQVVQKEPVQMELSPPMEVVQKEPVQIELSPPMEVVQKEPVKIELSPPIEVVQKEPVQMELSPPMGVVQKEPAQREPPPPREPPLHMEPISKKPPLRKDKKEKSNMQSERARKEQVLIEVGLVPVKDSWLLKESVSTEDLSPPSPPLPKENLREEASGDQKLLNTGEGNKEAPLQKVGAEEADESLPGLAANINESTHISSSGQNLNTPEGETLNGKHQTDSIVCEMKMDTDQNTRENLTGINSTVEEPVSPMLPPSAVEEREAVSKTALASPPATMAANESQEIDEDEGIHSHEGSDLSDNMSEGSDDSGLHGARPVPQESSRKNAKEALAVKAAKGDFVCIFCDRSFRKGKDYSKHLNRHLVNVYYLEEAAQGQE',\n", " 'trseq2': 'MAAVVQQNDLVFEFASNVMEDERQLGDPAIFPAVIVEHVPGADILNSYAGLACVEEPNDMITESSLDVAEEEIIDDDDDDITLTVEASCHDGDETIETIEAAEALLNMDSPGPMLDEKRINNNIFSSPEDDMVVAPVTHVSVTLDGIPEVMETQQVQEKYADSPGASSPEQPKRKKGRKTKPPRPDSPATTPNISVKKKNKDGKGNTIYLWEFLLALLQDKATCPKYIKWTQREKGIFKLVDSKAVSRLWGKHKNKPDMNYETMGRALRYYYQRGILAKVEGQRLVYQFKEMPKDLIYINDEDPSSSIESSDPSLSSSATSNRNQTSRSRVSSSPGVKGGATTVLKPGNSKAAKPKDPVEVAQPSEVLRTVQPTQSPYPTQLFRTVHVVQPVQAVPEGEAARTSTMQDETLNSSVQSIRTIQAPTQVPVVVSPRNQQLHTVTLQTVPLTTVIASTDPSAGTGSQKFILQAIPSSQPMTVLKENVMLQSQKAGSPPSIVLGPAQVQQVLTSNVQTICNGTVSVASSPSFSATAPVVTFSPRSSQLVAHPPGTVITSVIKTQETKTLTQEVEKKESEDHLKENTEKTEQQPQPYVMVVSSSNGFTSQVAMKQNELLEPNSF',\n", " 'trseq21': 
'MQNSEGGADSPASVALRPSAAAPPVPASPQRVLVQAASSNPKGAQMQPISLPRVQQVPQQVQPVQHVYPAQVQYVEGGDAVYTNGAIRTAYTYNPEPQMYAPSSTASYFEAPGGAQVTVAASSPPAVPSHSMVGITMDVGGSPIVSSAGAYLIHGGMDSTRHSLAHTSRSSPATLEMAIENLQKSEGITSHKSGLLNSHLQWLLDNYETAEGVSLPRSSLYNHYLRHCQEHKLDPVNAASFGKLIRSVFMGLRTRRLGTRGNSKYHYYGIRLKPDSPLNRLQEDTQYMAMRQQPMHQKPRYRPAQKTDSLGDSGSHSGLHSTPEQTMAVQSQHHQQYIDVSHVFPEFPAPDLGSFLLQDGVTLHDVKALQLVYRRHCEATVDVVMNLQFHYIEKLWLSFWNSKASSSDGPTSLPASDEDPEGAVLPKDKLISLCQCDPILRWMRSCDHILYQALVEILIPDVLRPVPSTLTQAIRNFAKSLEGWLTNAMSDFPQQVIQTKVGVVSAFAQTLRRYTSLNHLAQAARAVLQNTSQINQMLSDLNRVDFANVQEQASWVCQCEESVVQRLEQDFKLTLQQQSSLDQWASWLDSVVTQVLKQHAGSPSFPKAARQFLLKWSFYSSMVIRDLTLRSAASFGSFHLIRLLYDEYMFYLVEHRVAEATGETPIAVMGEFNDLASLSLTLLDKDDMGDEQRGSEAGPDARSLGEPLVKRERSDPNHSLQGI',\n", " 'trseq12': 'MEVQLGLGRVYPRPPSKTYRGAFQNLFQSVREVIQNPGPRHPEAASAAPPGASLLLLQQQQQQQQQQQQQQQQQQQQQQQETSPRQQQQQQGEDGSPQAHRRGPTGYLVLDEEQQPSQPQSALECHPERGCVPEPGAAVAASKGLPQQLPAPPDEDDSAAPSTLSLLGPTFPGLSSCSADLKDILSEASTMQLLQQQQQEAVSEGSSSGRAREASGAPTSSKDNYLGGTSTISDNAKELCKAVSVSMGLGVEALEHLSPGEQLRGDCMYAPLLGVPPAVRPTPCAPLAECKGSLLDDSAGKSTEDTAEYSPFKGGYTKGLEGESLGCSGSAAAGSSGTLELPSTLSLYKSGALDEAAAYQSRDYYNFPLALAGPPPPPPPPHPHARIKLENPLDYGSAWAAAAAQCRYGDLASLHGAGAAGPGSGSPSAAASSSWHTLFTAEEGQLYGPCGGGGGGGGGGGGGGGGGGGGGGGEAGAVAPYGYTRPPQGLAGQESDFTAPDVWYPGGMVSRVPYPSPTCVKSEMGPWMDSYSGPYGDMRLETARDHVLPIDYYFPPQKTCLICGDEASGCHYGALTCGSCKVFFKRAAEGKQKYLCASRNDCTIDKFRRKNCPSCRLRKCYEAGMTLGARKLKKLGNLKLQEEGEASSTTSPTEETTQKLTVSHIEGYECQPIFLNVLEAIEPGVVCAGHDNNQPDSFAALLSSLNELGERQLVHVVKWAKALPGFRNLHVDDQMAVIQYSWMGLMVFAMGWRSFTNVNSRMLYFAPDLVFNEYRMHKSRMYSQCVRMRHLSQEFGWLQITPQEFLCMKALLLFSIIPVDGLKNQKFFDELRMNYIKELDRIIACKRKNPTSCSRRFYQLTKLLDSVQPIARELHQFTFDLLIKSHMVSVDFPEMMAEIISVQVPKILSGKVKPIYFHTQ',\n", " 'trseq1': 
'MAAAKAEMQLMSPLQISDPFGSFPHSPTMDNYPKLEEMMLLSNGAPQFLGAAGAPEGSGSNSSSSSSGGGGGGGGGSNSSSSSSTFNPQADTGEQPYEHLTAESFPDISLNNEKVLVETSYPSQTTRLPPITYTGRFSLEPAPNSGNTLWPEPLFSLVSGLVSMTNPPASSSSAPSPAASSASASQSPPLSCAVPSNDSSPIYSAAPTFPTPNTDIFPEPQSQAFPGSAGTALQYPPPAYPAAKGGFQVPMIPDYLFPQQQGDLGLGTPDQKPFQGLESRTQQPSLTPLSTIKAFATQSGSQDLKALNTSYQSQLIKPSRMRKYPNRPSKTPPHERPYACPVESCDRRFSRSDELTRHIRIHTGQKPFQCRICMRNFSRSDHLTTHIRTHTGEKPFACDICGRKFARSDERKRHTKIHLRQKDKKADKSVVASSATSSLSSYPSPVATSYPSPVTTSYPSPATTSYPSPVPTSFSSPGSSTYPSPVHSGFPSPSVATTYSSVPPAFPAQVSSFPSSAVTNSFSASTGLSDMTATFSPRTIEIC',\n", " 'trseq6': 'MDLPVGPGAAGPSNVPAFLTKLWTLVSDPDTDALICWSPSGNSFHVFDQGQFAKEVLPKYFKHNNMASFVRQLNMYGFRKVVHIEQGGLVKPERDDTEFQHPCFLRGQEQLLENIKRKVTSVSTLKSEDIKIRQDSVTKLLTDVQLMKGKQECMDSKLLAMKHENEALWREVASLRQKHAQQQKVVNKLIQFLISLVQSNRILGVKRKIPLMLNDSGSAHSMPKYSRQFSLEHVHGSGPYSAPSPAYSSSSLYAPDAVASSGPIISDITELAPASPMASPGGSIDERPLSSSPLVRVKEEPPSPPQSPRVEEASPGRPSSVDTLLSPTALIDSILRESEPAPASVTALTDARGHTDTEGRPPSPPPTSTPEKCLSVACLDKNELSDHLDAMDSNLDNLQTMLSSHGFSVDTSALLDLFSPSVTVPDMSLPDLDSSLASIQELLSPQEPPRPPEAENSSPDSGKQLVHYTAQPLFLLDPGSVDTGSNDLPVLFELGEGSYFSEGDGFAEDPTISLLTGSEPPKAKDPTVS',\n", " 'trseq24': 'MSDQDHSMDEMTAVVKIEKGVGGNNGGNGNGGGAFSQARSSSTGSSSSTGGGGQESQPSPLALLAATCSRIESPNENSNNSQGPSQSGGTGELDLTATQLSQGANGWQIISSSSGATPTSKEQSGSSTNGSNGSESSKNRTVSGGQYVVAAAPNLQNQQVLTGLPGVMPNIQYQVIPQFQTVDGQQLQFAATGAQVQQDGSGQIQIIPGANQQIITNRGSGGNIIAAMPNLLQQAVPLQGLANNVLSGQTQYVTNVPVALNGNITLLPVNSVSAATLTPSSQAVTISSSGSQESGSQPVTSGTTISSASLVSSQASSSSFFTNANSYSTTTTTSNMGIMNFTTSGSSGTNSQGQTPQRVSGLQGSDALNIQQNQTSGGSLQAGQQKEGEQNQQTQQQQILIQPQLVQGGQALQALQAAPLSGQTFTTQAISQETLQNLQLQAVPNSGPIIIRTPTVGPNGQVSWQTLQLQNLQVQNPQAQTITLAPMQGVSLGQTSSSNTTLTPIASAASIPAGTVTVNAAQLSSMPGLQTINLSALGTSGIQVHPIQGLPLAIANAPGDHGAQLGLHGAGGDGIHDDTAGGEEGENSPDAQPQAGRRTRREACTCPYCKDSEGRGSGDPGKKKQHICHIQGCGKVYGKTSHLRAHLRWHTGERPFMCTWSYCGKRFTRSDELQRHKRTHTGEKKFACPECPKRFMRSDHLSKHIKTHQNKKGGPGVALSVGTLPLDSGAGSEGSGTATPSALITTNMVAMEAICPEGIARLANSGINVMQVADLQSINISGNGF',\n", " 'trseq9': 
'MEGDAVEAIVEESETFIKGKERKTYQRRREGGQEEDACHLPQNQTDGGEVVQDVNSSVQMVMMEQLDPTLLQMKTEVMEGTVAPEAEAAVDDTQIITLQVVNMEEQPINIGELQLVQVPVPVTVPVATTSVEELQGAYENEVSKEGLAESEPMICHTLPLPEGFQVVKVGANGEVETLEQGELPPQEDPSWQKDPDYQPPAKKTKKTKKSKLRYTEEGKDVDVSVYDFEEEQQEGLLSEVNAEKVVGNMKPPKPTKIKKKGVKKTFQCELCSYTCPRRSNLDRHMKSHTDERPHKCHLCGRAFRTVTLLRNHLNTHTGTRPHKCPDCDMAFVTSGELVRHRRYKHTHEKPFKCSMCDYASVEVSKLKRHIRSHTGERPFQCSLCSYASRDTYKLKRHMRTHSGEKPYECYICHARFTQSGTMKMHILQKHTENVAKFHCPHCDTVIARKSDLGVHLRKQHSYIEQGKKCRYCDAVFHERYALIQHQKSHKNEKRFKCDQCDYACRQERHMIMHKRTHTGEKPYACSHCDKTFRQKQLLDMHFKRYHDPNFVPAAFVCSKCGKTFTRRNTMARHADNCAGPDGVEGENGGETKKSKRGRKRKMRSKKEDSSDSENAEPDLDDNEDEEEPAVEIEPEPEPQPVTPAPPPAKKRRGRPPGRTNQPKQNQPTAIIQVEDQNTGAIENIIVEVKKEPDAEPAEGEEEEAQPAATDAPNGDLTPEMILSMMDR',\n", " 'trseq14': 'MLPTQAGAAAALGRGSALGGSLNRTPTGRPGGGGGTRGANGGRVPGNGAGLGPGRLEREAAAAAATTPAPTAGALYSGSEGDSESGEEEELGAERRGLKRSLSEMEIGMVVGGPEASAAATGGYGPVSGAVSGAKPGKKTRGRVKIKMEFIDNKLRRYTTFSKRKTGIMKKAYELSTLTGTQVLLLVASETGHVYTFATRKLQPMITSETGKALIQTCLNSPDSPPRSDPTTDQRMSATGFEETDLTYQVSESDSSGETKDTLKPAFTVTNLPGTTSTIQTAPSTSTTMQVSSGPSFPITNYLAPVSASVSPSAVSSANGTVLKSTGSGPVSSGGLMQLPTSFTLMPGGAVAQQVPVQAIQVHQAPQQASPSRDSSTDLTQTSSSGTVTLPATIMTSSVPTTVGGHMMYPSPHAVMYAPTSGLGDGSLTVLNAFSQAPSTMQVSHSQVQEPGGVPQVFLTASSGTVQIPVSAVQLHQMAVIGQQAGSSSNLTELQVVNLDTAHSTKSE'}" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [
    "# Load the tr_seqid -> TR sequence lookup (a JSON object keyed by ids such as \"trseq9\").\n",
    "import json\n",
    "\n",
    "TR_SEQUENCE_MAP_PATH = \"/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/maps/tr_seqid_to_tr_sequence.json\"\n",
    "\n",
    "# Decode explicitly as UTF-8 so the read does not depend on the machine's locale default.\n",
    "with open(TR_SEQUENCE_MAP_PATH, \"r\", encoding=\"utf-8\") as f:\n",
    "    tr_map = json.load(f)\n",
    "\n",
    "tr_map"
 ] }, { "cell_type": "code", "execution_count": 10, "id": "9ad16929", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDtr_seqiddna_seqidpeak_seqidchrpeak_idtr_namechipscoretotal_jaspar_hitsdna_sequencetr_sequencescorestr_cluster_repdna_cluster_rep
0trseq26_dnaseq49877trseq26dnaseq49877peakseq24686chr12_peak1150NFYB6.01GCTCTTAAAGATGGTGTGTCCAGAGTTTGTTCCTTCAGATGTTCAG...MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...trseq26dnaseq58018
1trseq26_dnaseq75052trseq26dnaseq75052peakseq12129chr9_peak512NFYB5.01TGTTGGTCTCGCTGACCTCAAGAACGGAGCCGTGGACCCTCGCGGT...MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...trseq26dnaseq75052
2trseq26_dnaseq14843trseq26dnaseq14843peakseq12863chr1_peak1335NFYB6.03AGTTTGGGTACTCAAATATGGTACCAGCAACCAGATGGTGAGTTGC...MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...trseq26dnaseq14843
3trseq26_dnaseq39522trseq26dnaseq39522peakseq5250chr5_peak280NFYB4.01CTTGGAGAACCTTTATGTCTAGCTAAGGGATTGTAAATACACCAAT...MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...trseq26dnaseq3280
4trseq26_dnaseq49215trseq26dnaseq49215peakseq4451chr4_peak201NFYB138.02GCGGTGACTGTTACAGTTCTTAAAGGCGGCGTGTCTGGAGTTTGTT...MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...trseq26dnaseq57257
..........................................
71152trseq9_dnaseq4066trseq9dnaseq4066peakseq12188chr18_peak213CTCF1000.01AATAATATATCTATTTCTTTATCTTTGTCTTCCCTACTGGACTAGC...MEGDAVEAIVEESETFIKGKERKTYQRRREGGQEEDACHLPQNQTD...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...trseq9dnaseq48336
71153trseq9_dnaseq62118trseq9dnaseq62118peakseq46829chr18_peak812CTCF1000.01TAAATATGTATTTTAGTAAAGTGTTATGATACACTGTGATGGGGGT...MEGDAVEAIVEESETFIKGKERKTYQRRREGGQEEDACHLPQNQTD...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...trseq9dnaseq62118
71154trseq9_dnaseq41538trseq9dnaseq41538peakseq3957chrY_peak4CTCF267.01GACAGGAGTTGTGTACGAATGTGTGTGAATGTGGGAGCCTAACTAG...MEGDAVEAIVEESETFIKGKERKTYQRRREGGQEEDACHLPQNQTD...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...trseq9dnaseq41538
71155trseq9_dnaseq38134trseq9dnaseq38134peakseq40502chr5_peak1955CTCF1000.02CTGGGCGGGTAGGTGAGAGGACAGGAGGGCGAAGTGGAGAGGAGGG...MEGDAVEAIVEESETFIKGKERKTYQRRREGGQEEDACHLPQNQTD...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...trseq9dnaseq73435
71156trseq9_dnaseq5106trseq9dnaseq5106peakseq37888chr1_peak4019CTCF14.02ACAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA...MEGDAVEAIVEESETFIKGKERKTYQRRREGGQEEDACHLPQNQTD...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...trseq9dnaseq5106
\n", "

71157 rows × 13 columns

\n", "
" ], "text/plain": [ " ID tr_seqid dna_seqid peak_seqid \\\n", "0 trseq26_dnaseq49877 trseq26 dnaseq49877 peakseq24686 \n", "1 trseq26_dnaseq75052 trseq26 dnaseq75052 peakseq12129 \n", "2 trseq26_dnaseq14843 trseq26 dnaseq14843 peakseq12863 \n", "3 trseq26_dnaseq39522 trseq26 dnaseq39522 peakseq5250 \n", "4 trseq26_dnaseq49215 trseq26 dnaseq49215 peakseq4451 \n", "... ... ... ... ... \n", "71152 trseq9_dnaseq4066 trseq9 dnaseq4066 peakseq12188 \n", "71153 trseq9_dnaseq62118 trseq9 dnaseq62118 peakseq46829 \n", "71154 trseq9_dnaseq41538 trseq9 dnaseq41538 peakseq3957 \n", "71155 trseq9_dnaseq38134 trseq9 dnaseq38134 peakseq40502 \n", "71156 trseq9_dnaseq5106 trseq9 dnaseq5106 peakseq37888 \n", "\n", " chrpeak_id tr_name chipscore total_jaspar_hits \\\n", "0 chr12_peak1150 NFYB 6.0 1 \n", "1 chr9_peak512 NFYB 5.0 1 \n", "2 chr1_peak1335 NFYB 6.0 3 \n", "3 chr5_peak280 NFYB 4.0 1 \n", "4 chr4_peak201 NFYB 138.0 2 \n", "... ... ... ... ... \n", "71152 chr18_peak213 CTCF 1000.0 1 \n", "71153 chr18_peak812 CTCF 1000.0 1 \n", "71154 chrY_peak4 CTCF 267.0 1 \n", "71155 chr5_peak1955 CTCF 1000.0 2 \n", "71156 chr1_peak4019 CTCF 14.0 2 \n", "\n", " dna_sequence \\\n", "0 GCTCTTAAAGATGGTGTGTCCAGAGTTTGTTCCTTCAGATGTTCAG... \n", "1 TGTTGGTCTCGCTGACCTCAAGAACGGAGCCGTGGACCCTCGCGGT... \n", "2 AGTTTGGGTACTCAAATATGGTACCAGCAACCAGATGGTGAGTTGC... \n", "3 CTTGGAGAACCTTTATGTCTAGCTAAGGGATTGTAAATACACCAAT... \n", "4 GCGGTGACTGTTACAGTTCTTAAAGGCGGCGTGTCTGGAGTTTGTT... \n", "... ... \n", "71152 AATAATATATCTATTTCTTTATCTTTGTCTTCCCTACTGGACTAGC... \n", "71153 TAAATATGTATTTTAGTAAAGTGTTATGATACACTGTGATGGGGGT... \n", "71154 GACAGGAGTTGTGTACGAATGTGTGTGAATGTGGGAGCCTAACTAG... \n", "71155 CTGGGCGGGTAGGTGAGAGGACAGGAGGGCGAAGTGGAGAGGAGGG... \n", "71156 ACAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA... \n", "\n", " tr_sequence \\\n", "0 MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN... \n", "1 MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN... \n", "2 MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN... 
\n", "3 MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN... \n", "4 MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN... \n", "... ... \n", "71152 MEGDAVEAIVEESETFIKGKERKTYQRRREGGQEEDACHLPQNQTD... \n", "71153 MEGDAVEAIVEESETFIKGKERKTYQRRREGGQEEDACHLPQNQTD... \n", "71154 MEGDAVEAIVEESETFIKGKERKTYQRRREGGQEEDACHLPQNQTD... \n", "71155 MEGDAVEAIVEESETFIKGKERKTYQRRREGGQEEDACHLPQNQTD... \n", "71156 MEGDAVEAIVEESETFIKGKERKTYQRRREGGQEEDACHLPQNQTD... \n", "\n", " scores tr_cluster_rep \\\n", "0 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... trseq26 \n", "1 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... trseq26 \n", "2 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... trseq26 \n", "3 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... trseq26 \n", "4 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... trseq26 \n", "... ... ... \n", "71152 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... trseq9 \n", "71153 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... trseq9 \n", "71154 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... trseq9 \n", "71155 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... trseq9 \n", "71156 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... trseq9 \n", "\n", " dna_cluster_rep \n", "0 dnaseq58018 \n", "1 dnaseq75052 \n", "2 dnaseq14843 \n", "3 dnaseq3280 \n", "4 dnaseq57257 \n", "... ... \n", "71152 dnaseq48336 \n", "71153 dnaseq62118 \n", "71154 dnaseq41538 \n", "71155 dnaseq73435 \n", "71156 dnaseq5106 \n", "\n", "[71157 rows x 13 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_data" ] }, { "cell_type": "code", "execution_count": null, "id": "aa60c388", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tr_cluster_repcountProtein namesProtein families
0trseq446445Zinc finger protein 263 (Zinc finger protein F...Krueppel C2H2-type zinc-finger protein family
1trseq18962Early growth response protein 1 (EGR-1) (AT225...EGR C2H2-type zinc-finger protein family
2trseq236407Transcription factor Sp2Sp1 C2H2-type zinc-finger protein family
3trseq92641Transcriptional repressor CTCF (11-zinc finger...CTCF zinc-finger protein family
4trseq51839RE1-silencing transcription factor (Neural-res...NaN
5trseq191820Interferon regulatory factor 1 (IRF-1)IRF family
6trseq26873Nuclear transcription factor Y subunit beta (C...NFYB/HAP3 subunit family
7trseq24873Transcription factor Sp1Sp1 C2H2-type zinc-finger protein family
8trseq10528Nuclear transcription factor Y subunit alpha (...NFYA/HAP2 subunit family
9trseq16487Forkhead box protein P1 (Mac-1-regulated forkh...NaN
10trseq2971Ras-responsive element-binding protein 1 (RREB...Krueppel C2H2-type zinc-finger protein family
11trseq1462Serum response factor (SRF)NaN
12trseq2134DNA-binding protein RFX2 (Regulatory factor X 2)RFX family
13trseq1730Tumor protein 63 (p63) (Chronic ulcerative sto...P53 family
14trseq2829Nuclear receptor subfamily 2 group C member 2 ...Nuclear hormone receptor family, NR2 subfamily
15trseq2216Transcription factor E2F3 (E2F-3)E2F/DP family
16trseq159Transcription factor AP-2 gamma (AP2-gamma) (A...AP-2 family
17trseq29ETS-related transcription factor Elf-1 (E74-li...ETS family
18trseq203Interferon regulatory factor 2 (IRF-2)IRF family
19trseq83Cellular tumor antigen p53 (Antigen NY-CO-13) ...P53 family
20trseq273Estrogen receptor (ER) (ER-alpha) (Estradiol r...Nuclear hormone receptor family, NR3 subfamily
21trseq112Transcriptional regulator Kaiso (Zinc finger a...NaN
22trseq252Transcription factor MafF (U-Maf) (V-maf muscu...BZIP family, Maf subfamily
23trseq62Heat shock factor protein 1 (HSF 1) (Heat shoc...HSF family
24trseq32Transcriptional repressor protein YY1 (Delta t...YY transcription factor family
25trseq182Hepatocyte nuclear factor 4-gamma (HNF-4-gamma...Nuclear hormone receptor family, NR2 subfamily
26trseq131Myocyte-specific enhancer factor 2A (Serum res...MEF2 family
27trseq71Glucocorticoid receptor (GR) (Nuclear receptor...Nuclear hormone receptor family, NR3 subfamily
28trseq121Androgen receptor (Dihydrotestosterone recepto...Nuclear hormone receptor family, NR3 subfamily
\n", "
" ], "text/plain": [ " tr_cluster_rep count Protein names \\\n", "0 trseq4 46445 Zinc finger protein 263 (Zinc finger protein F... \n", "1 trseq1 8962 Early growth response protein 1 (EGR-1) (AT225... \n", "2 trseq23 6407 Transcription factor Sp2 \n", "3 trseq9 2641 Transcriptional repressor CTCF (11-zinc finger... \n", "4 trseq5 1839 RE1-silencing transcription factor (Neural-res... \n", "5 trseq19 1820 Interferon regulatory factor 1 (IRF-1) \n", "6 trseq26 873 Nuclear transcription factor Y subunit beta (C... \n", "7 trseq24 873 Transcription factor Sp1 \n", "8 trseq10 528 Nuclear transcription factor Y subunit alpha (... \n", "9 trseq16 487 Forkhead box protein P1 (Mac-1-regulated forkh... \n", "10 trseq29 71 Ras-responsive element-binding protein 1 (RREB... \n", "11 trseq14 62 Serum response factor (SRF) \n", "12 trseq21 34 DNA-binding protein RFX2 (Regulatory factor X 2) \n", "13 trseq17 30 Tumor protein 63 (p63) (Chronic ulcerative sto... \n", "14 trseq28 29 Nuclear receptor subfamily 2 group C member 2 ... \n", "15 trseq22 16 Transcription factor E2F3 (E2F-3) \n", "16 trseq15 9 Transcription factor AP-2 gamma (AP2-gamma) (A... \n", "17 trseq2 9 ETS-related transcription factor Elf-1 (E74-li... \n", "18 trseq20 3 Interferon regulatory factor 2 (IRF-2) \n", "19 trseq8 3 Cellular tumor antigen p53 (Antigen NY-CO-13) ... \n", "20 trseq27 3 Estrogen receptor (ER) (ER-alpha) (Estradiol r... \n", "21 trseq11 2 Transcriptional regulator Kaiso (Zinc finger a... \n", "22 trseq25 2 Transcription factor MafF (U-Maf) (V-maf muscu... \n", "23 trseq6 2 Heat shock factor protein 1 (HSF 1) (Heat shoc... \n", "24 trseq3 2 Transcriptional repressor protein YY1 (Delta t... \n", "25 trseq18 2 Hepatocyte nuclear factor 4-gamma (HNF-4-gamma... \n", "26 trseq13 1 Myocyte-specific enhancer factor 2A (Serum res... \n", "27 trseq7 1 Glucocorticoid receptor (GR) (Nuclear receptor... \n", "28 trseq12 1 Androgen receptor (Dihydrotestosterone recepto... 
\n", "\n", " Protein families \n", "0 Krueppel C2H2-type zinc-finger protein family \n", "1 EGR C2H2-type zinc-finger protein family \n", "2 Sp1 C2H2-type zinc-finger protein family \n", "3 CTCF zinc-finger protein family \n", "4 NaN \n", "5 IRF family \n", "6 NFYB/HAP3 subunit family \n", "7 Sp1 C2H2-type zinc-finger protein family \n", "8 NFYA/HAP2 subunit family \n", "9 NaN \n", "10 Krueppel C2H2-type zinc-finger protein family \n", "11 NaN \n", "12 RFX family \n", "13 P53 family \n", "14 Nuclear hormone receptor family, NR2 subfamily \n", "15 E2F/DP family \n", "16 AP-2 family \n", "17 ETS family \n", "18 IRF family \n", "19 P53 family \n", "20 Nuclear hormone receptor family, NR3 subfamily \n", "21 NaN \n", "22 BZIP family, Maf subfamily \n", "23 HSF family \n", "24 YY transcription factor family \n", "25 Nuclear hormone receptor family, NR2 subfamily \n", "26 MEF2 family \n", "27 Nuclear hormone receptor family, NR3 subfamily \n", "28 Nuclear hormone receptor family, NR3 subfamily " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [
    "# Row count per TR cluster representative, annotated with the protein name and family\n",
    "# that idmap records for the representative's sequence.\n",
    "vcs = (\n",
    "    all_data[\"tr_cluster_rep\"]\n",
    "    .value_counts()\n",
    "    .reset_index()\n",
    "    # attach each representative's sequence so it can serve as the join key\n",
    "    .assign(sequence=lambda counts: counts[\"tr_cluster_rep\"].map(tr_map))\n",
    "    # idmap calls its sequence column \"Sequence\"; align the name before joining\n",
    "    .merge(idmap.rename(columns={\"Sequence\": \"sequence\"}), on=\"sequence\")\n",
    ")\n",
    "vcs[[\"tr_cluster_rep\", \"count\", \"Protein names\", \"Protein families\"]]"
 ] }, { "cell_type": "code", "execution_count": 39, "id": "3628496c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "29\n", "71134\n", "0\n", "0\n", "Rows in test: 7310\n", "Rows to be split between train and val: 59068\n", "Total rows: 66378. Test percentage: 11.01%\n", "Lost rows: 4779\n", "Rows in val: 4462\n", "Rows left for train: 54002\n", "Total rows: 58464. 
Test percentage: 7.63%\n", "Lost rows: 5383\n" ] } ], "source": [
    "## Full pipeline\n",
    "import pandas as pd\n",
    "\n",
    "# Every input sits under one processed-data root; naming it once keeps the reads short\n",
    "# and turns a data relocation into a one-line change.\n",
    "PROCESSED_DIR = \"/home/a03-svincoff/DPACMAN/dpacman/data_files/processed\"\n",
    "MMSEQS_DIR = f\"{PROCESSED_DIR}/mmseqs/outputs/fimo_hits_only\"\n",
    "\n",
    "# The cluster TSVs are read without a header row; the two columns are named\n",
    "# (representative, member) in file order.\n",
    "protein_clusters = pd.read_csv(f\"{MMSEQS_DIR}/protein/mmseqs_cluster.tsv\", sep=\"\\t\", header=None)\n",
    "protein_clusters.columns = [\"tr_cluster_rep\", \"tr_cluster_member\"]\n",
    "\n",
    "dna_clusters = pd.read_csv(f\"{MMSEQS_DIR}/dna_full/mmseqs_cluster.tsv\", sep=\"\\t\", header=None)\n",
    "dna_clusters.columns = [\"dna_cluster_rep\", \"dna_cluster_member\"]\n",
    "\n",
    "all_data = pd.read_parquet(f\"{PROCESSED_DIR}/fimo/post_fimo/fimo_hits_only/remap2022_crm_fimo_output_q_processed_seed0.parquet\")\n",
    "\n",
    "# member -> representative lookups for both cluster tables\n",
    "protein_cluster_map = dict(zip(protein_clusters[\"tr_cluster_member\"], protein_clusters[\"tr_cluster_rep\"]))\n",
    "dna_cluster_map = dict(zip(dna_clusters[\"dna_cluster_member\"], dna_clusters[\"dna_cluster_rep\"]))\n",
    "print(len(protein_cluster_map))\n",
    "print(len(dna_cluster_map))\n",
    "\n",
    "all_data[\"tr_cluster_rep\"] = all_data[\"tr_seqid\"].map(protein_cluster_map)\n",
    "all_data[\"dna_cluster_rep\"] = all_data[\"dna_seqid\"].map(dna_cluster_map)\n",
    "\n",
    "# A seqid missing from its cluster table maps to NaN. The cluster-based filters below are\n",
    "# only meaningful when every row has a representative, so report the counts and stop if\n",
    "# any are missing.\n",
    "n_unmapped_tr = int(all_data[\"tr_cluster_rep\"].isna().sum())\n",
    "n_unmapped_dna = int(all_data[\"dna_cluster_rep\"].isna().sum())\n",
    "print(n_unmapped_tr)\n",
    "print(n_unmapped_dna)\n",
    "assert n_unmapped_tr == 0 and n_unmapped_dna == 0, \"every row needs a TR and a DNA cluster representative before splitting\"\n",
    "\n",
    "\n",
    "### handpick test\n",
    "handpicked_test_trs = [\"trseq23\", \"trseq26\", \"trseq17\"]\n",
    "is_test_tr = all_data[\"tr_cluster_rep\"].isin(handpicked_test_trs)\n",
    "handpicked_test = all_data.loc[is_test_tr].reset_index(drop=True)\n",
    "\n",
    "# Any DNA cluster that appears in test is off limits for the rest of the data: rows of\n",
    "# other TRs that fall in those clusters are dropped (counted as lost) instead of kept.\n",
    "off_limits_dna_clusters = handpicked_test[\"dna_cluster_rep\"].unique().tolist()\n",
    "remaining = all_data.loc[\n",
    "    ~is_test_tr & ~all_data[\"dna_cluster_rep\"].isin(off_limits_dna_clusters)\n",
    "].reset_index(drop=True)\n",
    "\n",
    "test_ids = handpicked_test[\"ID\"].unique().tolist()\n",
    "remaining_ids = remaining[\"ID\"].unique().tolist()\n",
    "# lost = rows that ended up in neither test nor the remaining pool\n",
    "lost_rows = all_data.loc[~all_data[\"ID\"].isin(test_ids + remaining_ids)]\n",
    "print(f\"Rows in test: {len(handpicked_test)}\")\n",
    "print(f\"Rows to be split between train and val: {len(remaining)}\")\n",
    "total_rows = len(handpicked_test) + len(remaining)\n",
    "print(f\"Total rows: {total_rows}. Test percentage: {100*len(handpicked_test)/total_rows:.2f}%\")\n",
    "print(f\"Lost rows: {len(lost_rows)}\")\n",
    "\n",
    "### handpick val\n",
    "handpicked_val_trs = [\"trseq9\", \"trseq5\", \"trseq28\"]\n",
    "is_val_tr = remaining[\"tr_cluster_rep\"].isin(handpicked_val_trs)\n",
    "handpicked_val = remaining.loc[is_val_tr].reset_index(drop=True)\n",
    "\n",
    "# Same exclusion rule as for test, now applied inside the remaining pool.\n",
    "off_limits_dna_clusters = handpicked_val[\"dna_cluster_rep\"].unique().tolist()\n",
    "train_remain = remaining.loc[\n",
    "    ~is_val_tr & ~remaining[\"dna_cluster_rep\"].isin(off_limits_dna_clusters)\n",
    "].reset_index(drop=True)\n",
    "\n",
    "val_ids = handpicked_val[\"ID\"].unique().tolist()\n",
    "train_remain_ids = train_remain[\"ID\"].unique().tolist()\n",
    "# cumulative loss: rows of all_data that are in none of test / val / train\n",
    "lost_rows = all_data.loc[~all_data[\"ID\"].isin(test_ids + val_ids + train_remain_ids)]\n",
    "print(f\"Rows in val: {len(handpicked_val)}\")\n",
    "print(f\"Rows left for train: {len(train_remain)}\")\n",
    "total_rows = len(handpicked_val) + len(train_remain)\n",
    "# This ratio is the validation share of the train+val pool, so label it as such.\n",
    "print(f\"Total rows: {total_rows}. Val percentage: {100*len(handpicked_val)/total_rows:.2f}%\")\n",
    "print(f\"Lost rows: {len(lost_rows)}\")"
 ] }, { "cell_type": "code", "execution_count": 1, "id": "d92891a1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "29\n", "71134\n", "0\n", "0\n", "Rows in test: 7310\n", "Rows to be split between train and val: 59068\n", "Total rows: 66378. 
Test percentage: 11.01%\n", "Lost rows: 4779\n", "Rows in val: 4462\n", "Rows left for train: 54002\n", "Total rows: 58464. Test percentage: 7.63%\n", "Lost rows: 5383\n", "Original total: 71157\n", "New, exclusive total: 65774\n", "Lost rows: 5383\n", "Length train: 54002/65774 (82.10%)\n", "Length val: 4462/65774 (6.78%)\n", "Length test: 7310/65774 (11.11%)\n", "Pass! No overlap in IDs\n", "Train-Val TR intersection: 0\n", "Train-Test TR intersection: 0\n", "Val-Test TR intersection: 0\n", "Train-Val TR Cluster Rep intersection: 0\n", "Train-Test TR Cluster Rep intersection: 0\n", "Val-Test TR Cluster Rep intersection: 0\n", "Train-Val DNA intersection: 0\n", "Train-Test DNA intersection: 0\n", "Val-Test DNA intersection: 0\n", "Train-Val DNA Cluster Rep intersection: 0\n", "Train-Test DNA Cluster Rep intersection: 0\n", "Val-Test DNA Cluster Rep intersection: 0\n", "Added reverse complement sequences to train_exclusive, val_exclusive, and test_exclusive (and leaky test_exclusive)\n", "Pass! No overlap in IDs\n", "Train-Val TR intersection: 0\n", "Train-Test TR intersection: 0\n", "Val-Test TR intersection: 0\n", "Train-Val TR Cluster Rep intersection: 0\n", "Train-Test TR Cluster Rep intersection: 0\n", "Val-Test TR Cluster Rep intersection: 0\n", "Train-Val DNA intersection: 0\n", "Train-Test DNA intersection: 0\n", "Val-Test DNA intersection: 0\n", "Train-Val DNA Cluster Rep intersection: 0\n", "Train-Test DNA Cluster Rep intersection: 0\n", "Val-Test DNA Cluster Rep intersection: 0\n", "Length of train_exclusive dataset: 108004 (75.89%)\n", "Length of val_exclusive dataset: 8924 (6.27%)\n", "Length of test_exclusive dataset: 14620 (10.27%)\n", "Length of leaky_test dataset: 10766 (7.56%)\n", "Total sequences = 142314. Same as edges size*2? 
True\n", "Saved all splits to /home/a03-svincoff/DPACMAN/dpacman/data_files/processed/splits/handpicked_val_test\n" ] } ], "source": [
    "## Full pipeline\n",
    "import pandas as pd\n",
    "\n",
    "# Every input sits under one processed-data root; naming it once keeps the reads short\n",
    "# and turns a data relocation into a one-line change.\n",
    "PROCESSED_DIR = \"/home/a03-svincoff/DPACMAN/dpacman/data_files/processed\"\n",
    "MMSEQS_DIR = f\"{PROCESSED_DIR}/mmseqs/outputs/fimo_hits_only\"\n",
    "\n",
    "# The cluster TSVs are read without a header row; the two columns are named\n",
    "# (representative, member) in file order.\n",
    "protein_clusters = pd.read_csv(f\"{MMSEQS_DIR}/protein/mmseqs_cluster.tsv\", sep=\"\\t\", header=None)\n",
    "protein_clusters.columns = [\"tr_cluster_rep\", \"tr_cluster_member\"]\n",
    "\n",
    "dna_clusters = pd.read_csv(f\"{MMSEQS_DIR}/dna_full/mmseqs_cluster.tsv\", sep=\"\\t\", header=None)\n",
    "dna_clusters.columns = [\"dna_cluster_rep\", \"dna_cluster_member\"]\n",
    "\n",
    "all_data = pd.read_parquet(f\"{PROCESSED_DIR}/fimo/post_fimo/fimo_hits_only/remap2022_crm_fimo_output_q_processed_seed0.parquet\")\n",
    "\n",
    "# member -> representative lookups for both cluster tables\n",
    "protein_cluster_map = dict(zip(protein_clusters[\"tr_cluster_member\"], protein_clusters[\"tr_cluster_rep\"]))\n",
    "dna_cluster_map = dict(zip(dna_clusters[\"dna_cluster_member\"], dna_clusters[\"dna_cluster_rep\"]))\n",
    "print(len(protein_cluster_map))\n",
    "print(len(dna_cluster_map))\n",
    "\n",
    "all_data[\"tr_cluster_rep\"] = all_data[\"tr_seqid\"].map(protein_cluster_map)\n",
    "all_data[\"dna_cluster_rep\"] = all_data[\"dna_seqid\"].map(dna_cluster_map)\n",
    "\n",
    "# A seqid missing from its cluster table maps to NaN. The cluster-based filters below are\n",
    "# only meaningful when every row has a representative, so report the counts and stop if\n",
    "# any are missing.\n",
    "n_unmapped_tr = int(all_data[\"tr_cluster_rep\"].isna().sum())\n",
    "n_unmapped_dna = int(all_data[\"dna_cluster_rep\"].isna().sum())\n",
    "print(n_unmapped_tr)\n",
    "print(n_unmapped_dna)\n",
    "assert n_unmapped_tr == 0 and n_unmapped_dna == 0, \"every row needs a TR and a DNA cluster representative before splitting\"\n",
    "\n",
    "\n",
    "### handpick test\n",
    "handpicked_test_trs = [\"trseq23\", \"trseq26\", \"trseq17\"]\n",
    "is_test_tr = all_data[\"tr_cluster_rep\"].isin(handpicked_test_trs)\n",
    "handpicked_test = all_data.loc[is_test_tr].reset_index(drop=True)\n",
    "\n",
    "# Any DNA cluster that appears in test is off limits for the rest of the data: rows of\n",
    "# other TRs that fall in those clusters are dropped (counted as lost) instead of kept.\n",
    "off_limits_dna_clusters = handpicked_test[\"dna_cluster_rep\"].unique().tolist()\n",
    "remaining = all_data.loc[\n",
    "    ~is_test_tr & ~all_data[\"dna_cluster_rep\"].isin(off_limits_dna_clusters)\n",
    "].reset_index(drop=True)\n",
    "\n",
    "test_ids = handpicked_test[\"ID\"].unique().tolist()\n",
    "remaining_ids = remaining[\"ID\"].unique().tolist()\n",
    "# lost = rows that ended up in neither test nor the remaining pool\n",
    "lost_rows = all_data.loc[~all_data[\"ID\"].isin(test_ids + remaining_ids)]\n",
    "print(f\"Rows in test: {len(handpicked_test)}\")\n",
    "print(f\"Rows to be split between train and val: {len(remaining)}\")\n",
    "total_rows = len(handpicked_test) + len(remaining)\n",
    "print(f\"Total rows: {total_rows}. Test percentage: {100*len(handpicked_test)/total_rows:.2f}%\")\n",
    "print(f\"Lost rows: {len(lost_rows)}\")\n",
    "\n",
    "### handpick val\n",
    "handpicked_val_trs = [\"trseq9\", \"trseq5\", \"trseq28\"]\n",
    "is_val_tr = remaining[\"tr_cluster_rep\"].isin(handpicked_val_trs)\n",
    "handpicked_val = remaining.loc[is_val_tr].reset_index(drop=True)\n",
    "\n",
    "# Same exclusion rule as for test, now applied inside the remaining pool.\n",
    "off_limits_dna_clusters = handpicked_val[\"dna_cluster_rep\"].unique().tolist()\n",
    "train_remain = remaining.loc[\n",
    "    ~is_val_tr & ~remaining[\"dna_cluster_rep\"].isin(off_limits_dna_clusters)\n",
    "].reset_index(drop=True)\n",
    "\n",
    "val_ids = handpicked_val[\"ID\"].unique().tolist()\n",
    "train_remain_ids = train_remain[\"ID\"].unique().tolist()\n",
    "# cumulative loss: rows of all_data that are in none of test / val / train\n",
    "lost_rows = all_data.loc[~all_data[\"ID\"].isin(test_ids + val_ids + train_remain_ids)]\n",
    "print(f\"Rows in val: {len(handpicked_val)}\")\n",
    "print(f\"Rows left for train: {len(train_remain)}\")\n",
    "total_rows = len(handpicked_val) + len(train_remain)\n",
    "# This ratio is the validation share of the train+val pool, so label it as such.\n",
    "print(f\"Total rows: {total_rows}. Val percentage: {100*len(handpicked_val)/total_rows:.2f}%\")\n",
    "print(f\"Lost rows: {len(lost_rows)}\")\n",
    "\n",
    "# Re-select each split from all_data by ID so the three frames keep all_data's row order.\n",
    "train_exclusive = all_data.loc[all_data[\"ID\"].isin(train_remain_ids)].reset_index(drop=True)\n",
    "\n",
    "val_exclusive = all_data.loc[all_data[\"ID\"].isin(val_ids)].reset_index(drop=True)\n",
    "\n",
    "test_exclusive = all_data.loc[all_data[\"ID\"].isin(test_ids)].reset_index(drop=True)\n",
    "\n",
    "# Everything that belongs to none of the exclusive splits, i.e. the rows dropped above\n",
    "# because their DNA cluster is shared with the test or val hold-out.\n",
    "leaky_test = all_data.loc[\n",
    "    ~all_data[\"ID\"].isin(train_remain_ids + val_ids + test_ids)\n",
    "].reset_index(drop=True)\n",
    "\n",
    "print(f\"Original total: {len(all_data)}\")\n",
    "retained_total = len(train_exclusive)+len(val_exclusive)+len(test_exclusive)\n",
    "print(f\"New, exclusive total: {retained_total}\")\n",
    "print(f\"Lost rows: {len(all_data)-retained_total}\")\n",
    "for split_name, split_df in ((\"train\", train_exclusive), (\"val\", val_exclusive), (\"test\", test_exclusive)):\n",
    "    print(f\"Length {split_name}: {len(split_df)}/{retained_total} ({100*len(split_df)/retained_total:.2f}%)\")\n",
    "\n",
 "def check_validity(train_exclusive, val_exclusive, test_exclusive):\n", " train_exclusive_ids = set(train_exclusive[\"ID\"].unique().tolist())\n", " val_exclusive_ids = set(val_exclusive[\"ID\"].unique().tolist())\n", " test_exclusive_ids = set(test_exclusive[\"ID\"].unique().tolist())\n", "\n", " assert len(train_exclusive_ids.intersection(val_exclusive_ids)) == 0\n", " assert len(train_exclusive_ids.intersection(test_exclusive_ids)) == 0\n", " assert len(val_exclusive_ids.intersection(test_exclusive_ids)) == 0\n", " print(f\"Pass! No overlap in IDs\")\n", "\n", " # Investigate TR intersection. No assertions unless we are explicitly splitting on this. 
\n", " train_exclusive_tr_seqs = set(train_exclusive[\"tr_sequence\"].unique().tolist())\n", " val_exclusive_tr_seqs = set(val_exclusive[\"tr_sequence\"].unique().tolist())\n", " test_exclusive_tr_seqs = set(test_exclusive[\"tr_sequence\"].unique().tolist())\n", "\n", " train_exclusive_tr_reps = set(train_exclusive[\"tr_cluster_rep\"].unique().tolist())\n", " val_exclusive_tr_reps = set(val_exclusive[\"tr_cluster_rep\"].unique().tolist())\n", " test_exclusive_tr_reps = set(test_exclusive[\"tr_cluster_rep\"].unique().tolist())\n", "\n", " print(f\"Train-Val TR intersection: {len(train_exclusive_tr_seqs.intersection(val_exclusive_tr_seqs))}\")\n", " print(f\"Train-Test TR intersection: {len(train_exclusive_tr_seqs.intersection(test_exclusive_tr_seqs))}\")\n", " print(f\"Val-Test TR intersection: {len(val_exclusive_tr_seqs.intersection(test_exclusive_tr_seqs))}\")\n", "\n", " print(f\"Train-Val TR Cluster Rep intersection: {len(train_exclusive_tr_reps.intersection(val_exclusive_tr_reps))}\")\n", " print(f\"Train-Test TR Cluster Rep intersection: {len(train_exclusive_tr_reps.intersection(test_exclusive_tr_reps))}\")\n", " print(f\"Val-Test TR Cluster Rep intersection: {len(val_exclusive_tr_reps.intersection(test_exclusive_tr_reps))}\")\n", "\n", " # Investigate DNA intersection. No assertions unless we are explicitly splitting on this. 
\n", " train_exclusive_dna_seqs = set(train_exclusive[\"dna_sequence\"].unique().tolist())\n", " val_exclusive_dna_seqs = set(val_exclusive[\"dna_sequence\"].unique().tolist())\n", " test_exclusive_dna_seqs = set(test_exclusive[\"dna_sequence\"].unique().tolist())\n", "\n", " train_exclusive_dna_reps = set(train_exclusive[\"dna_cluster_rep\"].unique().tolist())\n", " val_exclusive_dna_reps = set(val_exclusive[\"dna_cluster_rep\"].unique().tolist())\n", " test_exclusive_dna_reps = set(test_exclusive[\"dna_cluster_rep\"].unique().tolist())\n", "\n", " print(f\"Train-Val DNA intersection: {len(train_exclusive_dna_seqs.intersection(val_exclusive_dna_seqs))}\")\n", " print(f\"Train-Test DNA intersection: {len(train_exclusive_dna_seqs.intersection(test_exclusive_dna_seqs))}\")\n", " print(f\"Val-Test DNA intersection: {len(val_exclusive_dna_seqs.intersection(test_exclusive_dna_seqs))}\")\n", "\n", " print(f\"Train-Val DNA Cluster Rep intersection: {len(train_exclusive_dna_reps.intersection(val_exclusive_dna_reps))}\")\n", " print(f\"Train-Test DNA Cluster Rep intersection: {len(train_exclusive_dna_reps.intersection(test_exclusive_dna_reps))}\")\n", " print(f\"Val-Test DNA Cluster Rep intersection: {len(val_exclusive_dna_reps.intersection(test_exclusive_dna_reps))}\")\n", "\n", "def get_reverse_complement(s):\n", " \"\"\"\n", " Returns 5' to 3' sequence of the reverse complement\n", " \"\"\"\n", " chars = list(s)\n", " recon = []\n", " rev_map = {\n", " \"a\": \"t\",\n", " \"c\": \"g\",\n", " \"t\": \"a\",\n", " \"g\": \"c\",\n", " \"A\": \"T\",\n", " \"C\": \"G\",\n", " \"T\": \"A\",\n", " \"G\": \"C\",\n", " \"n\": \"n\",\n", " \"N\": \"N\",\n", " }\n", " for c in chars:\n", " recon += [rev_map[c]]\n", "\n", " recon = \"\".join(recon)\n", " return recon[::-1]\n", "\n", "# now make reverse complements\n", "def augment_rc(df):\n", " \"\"\"\n", " Get the reverse complement and add it as a datapoint, effectively doubling the dataset.\n", " Also flip the orientation of the 
scores\n", "\n", " columns = [\"ID\",\"dna_sequence\",\"tr_sequence\",\"tr_cluster_rep\",\"dna_cluster_rep\", \"scores\",\"split\"]\n", " \"\"\"\n", " df_rc = df.copy(deep=True)\n", "\n", " df_rc[\"dna_sequence\"] = df_rc[\"dna_sequence\"].apply(\n", " lambda x: get_reverse_complement(x)\n", " )\n", " df_rc[\"ID\"] = df_rc[\"ID\"] + \"_rc\"\n", " df_rc[\"scores\"] = df_rc[\"scores\"].apply(lambda s: \",\".join(s.split(\",\")[::-1]))\n", "\n", " final_df = pd.concat([df, df_rc]).reset_index(drop=True)\n", "\n", " return final_df\n", "\n", "def convert_scores(scores, mode=1):\n", " \"\"\"\n", " Two modes: 1 means FIMO peaks get 1. 0 means FIMO peaks get their max score\n", " \"\"\"\n", " svec = [int(x) for x in scores.split(\",\")]\n", " max_score = max(svec)\n", " if mode ==1:\n", " binary_svec = [0 if x\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDtr_seqiddna_seqidpeak_seqidchrpeak_idtr_namechipscoretotal_jaspar_hitsdna_sequencetr_sequencescorestr_cluster_repdna_cluster_rep
28719trseq4_dnaseq38058trseq4dnaseq38058peakseq59667chr8_peak2619ZNF2635.02CTGGGAGTCTCCCAGTGAATCCTCTCCCTCCAGGAAGCATTCAGGG...MASGPGSQEREGLLIVKLEEDCAWSQELPPPDPGPSPEASHLRFRR...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...trseq4dnaseq38058
30710trseq4_dnaseq58107trseq4dnaseq58107peakseq38026chr22_peak949ZNF263909.01GTAACGATGCCTTCCTAGGCACTGGCGTTACCGCCTGACCAAGGAG...MASGPGSQEREGLLIVKLEEDCAWSQELPPPDPGPSPEASHLRFRR...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...trseq4dnaseq58107
8736trseq4_dnaseq18629trseq4dnaseq18629peakseq39291chr6_peak2197ZNF26361.01CAAAGAAAGAAAATCTACTTTATTACAAGGAACAAAAACATAATAG...MASGPGSQEREGLLIVKLEEDCAWSQELPPPDPGPSPEASHLRFRR...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...trseq4dnaseq18629
5747trseq4_dnaseq4928trseq4dnaseq4928peakseq39282chr1_peak4157ZNF263394.01AATTTCTATCAACTGAGGCAAAAGTCTTAAGTTCCCCCAAACCAAT...MASGPGSQEREGLLIVKLEEDCAWSQELPPPDPGPSPEASHLRFRR...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...trseq4dnaseq4928
28098trseq4_dnaseq17297trseq4dnaseq17297peakseq46179chr1_peak4932ZNF26310.01ATGTGGGAGTAGAGATAAAGAAATCAGTGCAGTTAAGGAGGGTAGA...MASGPGSQEREGLLIVKLEEDCAWSQELPPPDPGPSPEASHLRFRR...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...trseq4dnaseq17297
6990trseq4_dnaseq45969trseq4dnaseq45969peakseq3403chr12_peak164ZNF2631000.01GCAGGAGAGGTCACAGACCCACAGAATCGTCCAATCCCTGCCCCAG...MASGPGSQEREGLLIVKLEEDCAWSQELPPPDPGPSPEASHLRFRR...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...trseq4dnaseq45969
29914trseq4_dnaseq70215trseq4dnaseq70215peakseq29873chr10_peak1270ZNF26399.03TGAGAAAAATAAACCCTGGGATATACAAAGGGACATCTGTCCACAG...MASGPGSQEREGLLIVKLEEDCAWSQELPPPDPGPSPEASHLRFRR...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,99,99,99,99,99,9...trseq4dnaseq70215
52579trseq4_dnaseq3768trseq4dnaseq3768peakseq43489chr22_peak1089ZNF26365.09AAGTAGCTGGGATTACAGACGTACACCACCACGCCTGGCTAATTTT...MASGPGSQEREGLLIVKLEEDCAWSQELPPPDPGPSPEASHLRFRR...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...trseq4dnaseq3768
6423trseq4_dnaseq19395trseq4dnaseq19395peakseq60837chr20_peak1976ZNF26376.02CAATACTCAATTTCCCCCTATTTATTTCTAGTGCAGGTTTCACAGC...MASGPGSQEREGLLIVKLEEDCAWSQELPPPDPGPSPEASHLRFRR...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...trseq4dnaseq19395
49313trseq1_dnaseq5894trseq1dnaseq5894peakseq60338chr17_peak3283EGR11000.01ACAGGGCAACAGCGGGATTGAGAGATGGAGGGATCCCCGCATCTGA...MAAAKAEMQLMSPLQISDPFGSFPHSPTMDNYPKLEEMMLLSNGAP...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...trseq1dnaseq5894
\n", "" ], "text/plain": [ " ID tr_seqid dna_seqid peak_seqid chrpeak_id \\\n", "28719 trseq4_dnaseq38058 trseq4 dnaseq38058 peakseq59667 chr8_peak2619 \n", "30710 trseq4_dnaseq58107 trseq4 dnaseq58107 peakseq38026 chr22_peak949 \n", "8736 trseq4_dnaseq18629 trseq4 dnaseq18629 peakseq39291 chr6_peak2197 \n", "5747 trseq4_dnaseq4928 trseq4 dnaseq4928 peakseq39282 chr1_peak4157 \n", "28098 trseq4_dnaseq17297 trseq4 dnaseq17297 peakseq46179 chr1_peak4932 \n", "6990 trseq4_dnaseq45969 trseq4 dnaseq45969 peakseq3403 chr12_peak164 \n", "29914 trseq4_dnaseq70215 trseq4 dnaseq70215 peakseq29873 chr10_peak1270 \n", "52579 trseq4_dnaseq3768 trseq4 dnaseq3768 peakseq43489 chr22_peak1089 \n", "6423 trseq4_dnaseq19395 trseq4 dnaseq19395 peakseq60837 chr20_peak1976 \n", "49313 trseq1_dnaseq5894 trseq1 dnaseq5894 peakseq60338 chr17_peak3283 \n", "\n", " tr_name chipscore total_jaspar_hits \\\n", "28719 ZNF263 5.0 2 \n", "30710 ZNF263 909.0 1 \n", "8736 ZNF263 61.0 1 \n", "5747 ZNF263 394.0 1 \n", "28098 ZNF263 10.0 1 \n", "6990 ZNF263 1000.0 1 \n", "29914 ZNF263 99.0 3 \n", "52579 ZNF263 65.0 9 \n", "6423 ZNF263 76.0 2 \n", "49313 EGR1 1000.0 1 \n", "\n", " dna_sequence \\\n", "28719 CTGGGAGTCTCCCAGTGAATCCTCTCCCTCCAGGAAGCATTCAGGG... \n", "30710 GTAACGATGCCTTCCTAGGCACTGGCGTTACCGCCTGACCAAGGAG... \n", "8736 CAAAGAAAGAAAATCTACTTTATTACAAGGAACAAAAACATAATAG... \n", "5747 AATTTCTATCAACTGAGGCAAAAGTCTTAAGTTCCCCCAAACCAAT... \n", "28098 ATGTGGGAGTAGAGATAAAGAAATCAGTGCAGTTAAGGAGGGTAGA... \n", "6990 GCAGGAGAGGTCACAGACCCACAGAATCGTCCAATCCCTGCCCCAG... \n", "29914 TGAGAAAAATAAACCCTGGGATATACAAAGGGACATCTGTCCACAG... \n", "52579 AAGTAGCTGGGATTACAGACGTACACCACCACGCCTGGCTAATTTT... \n", "6423 CAATACTCAATTTCCCCCTATTTATTTCTAGTGCAGGTTTCACAGC... \n", "49313 ACAGGGCAACAGCGGGATTGAGAGATGGAGGGATCCCCGCATCTGA... \n", "\n", " tr_sequence \\\n", "28719 MASGPGSQEREGLLIVKLEEDCAWSQELPPPDPGPSPEASHLRFRR... \n", "30710 MASGPGSQEREGLLIVKLEEDCAWSQELPPPDPGPSPEASHLRFRR... 
\n", "8736 MASGPGSQEREGLLIVKLEEDCAWSQELPPPDPGPSPEASHLRFRR... \n", "5747 MASGPGSQEREGLLIVKLEEDCAWSQELPPPDPGPSPEASHLRFRR... \n", "28098 MASGPGSQEREGLLIVKLEEDCAWSQELPPPDPGPSPEASHLRFRR... \n", "6990 MASGPGSQEREGLLIVKLEEDCAWSQELPPPDPGPSPEASHLRFRR... \n", "29914 MASGPGSQEREGLLIVKLEEDCAWSQELPPPDPGPSPEASHLRFRR... \n", "52579 MASGPGSQEREGLLIVKLEEDCAWSQELPPPDPGPSPEASHLRFRR... \n", "6423 MASGPGSQEREGLLIVKLEEDCAWSQELPPPDPGPSPEASHLRFRR... \n", "49313 MAAAKAEMQLMSPLQISDPFGSFPHSPTMDNYPKLEEMMLLSNGAP... \n", "\n", " scores tr_cluster_rep \\\n", "28719 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... trseq4 \n", "30710 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... trseq4 \n", "8736 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... trseq4 \n", "5747 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... trseq4 \n", "28098 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... trseq4 \n", "6990 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... trseq4 \n", "29914 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,99,99,99,99,99,9... trseq4 \n", "52579 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... trseq4 \n", "6423 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... trseq4 \n", "49313 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... 
trseq1 \n", "\n", " dna_cluster_rep \n", "28719 dnaseq38058 \n", "30710 dnaseq58107 \n", "8736 dnaseq18629 \n", "5747 dnaseq4928 \n", "28098 dnaseq17297 \n", "6990 dnaseq45969 \n", "29914 dnaseq70215 \n", "52579 dnaseq3768 \n", "6423 dnaseq19395 \n", "49313 dnaseq5894 " ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_exclusive.sample(10, random_state=42)" ] }, { "cell_type": "code", "execution_count": 67, "id": "9e422e5b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "round((0.8*total_rows)/len(remaining), 2)" ] }, { "cell_type": "code", "execution_count": 63, "id": "e7163e49", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tr_cluster_repcountsequenceUnnamed: 0FromEntryReviewedEntry NameProtein namesGene NamesOrganismLengthInterProPfamMotifZinc fingerProtein familiesBinding siteSite
2trseq236407MSDPQTSMAATAAVSPSDYLQPAASTTQDSQPSPLALLAATCSKIG...443SP2Q02086reviewedSP2_HUMANTranscription factor Sp2SP2 KIAA0048Homo sapiens (Human)613IPR036236;IPR013087;PF00096;MOTIF 361..369; /note=\"9aaTAD; inactive\"; /evi...ZN_FING 525..549; /note=\"C2H2-type 1\"; /eviden...Sp1 C2H2-type zinc-finger protein familyNaNNaN
6trseq26873MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN...273NFYBP25208reviewedNFYB_HUMANNuclear transcription factor Y subunit beta (C...NFYB HAP3Homo sapiens (Human)207IPR003958;IPR009072;IPR027113;IPR003956;PF00808;NaNNaNNFYB/HAP3 subunit familyNaNNaN
13trseq1730MNFETSRCATLQYCPDPYIQRFVETPAHFSWKESYYRSTMSQSTQT...1004TP63Q9H3D4reviewedP63_HUMANTumor protein 63 (p63) (Chronic ulcerative sto...TP63 KET P63 P73H P73L TP73LHomo sapiens (Human)680IPR008967;IPR012346;IPR011615;IPR036674;IPR010...PF00870;PF07710;PF07647;NaNNaNP53 familyBINDING 244; /ligand=\"Zn(2+)\"; /ligand_id=\"ChE...NaN
\n", "
" ], "text/plain": [ " tr_cluster_rep count sequence \\\n", "2 trseq23 6407 MSDPQTSMAATAAVSPSDYLQPAASTTQDSQPSPLALLAATCSKIG... \n", "6 trseq26 873 MTMDGDSSTTDASQLGISADYIGGSHYVIQPHDDTEDSMNDHEDTN... \n", "13 trseq17 30 MNFETSRCATLQYCPDPYIQRFVETPAHFSWKESYYRSTMSQSTQT... \n", "\n", " Unnamed: 0 From Entry Reviewed Entry Name \\\n", "2 443 SP2 Q02086 reviewed SP2_HUMAN \n", "6 273 NFYB P25208 reviewed NFYB_HUMAN \n", "13 1004 TP63 Q9H3D4 reviewed P63_HUMAN \n", "\n", " Protein names \\\n", "2 Transcription factor Sp2 \n", "6 Nuclear transcription factor Y subunit beta (C... \n", "13 Tumor protein 63 (p63) (Chronic ulcerative sto... \n", "\n", " Gene Names Organism Length \\\n", "2 SP2 KIAA0048 Homo sapiens (Human) 613 \n", "6 NFYB HAP3 Homo sapiens (Human) 207 \n", "13 TP63 KET P63 P73H P73L TP73L Homo sapiens (Human) 680 \n", "\n", " InterPro \\\n", "2 IPR036236;IPR013087; \n", "6 IPR003958;IPR009072;IPR027113;IPR003956; \n", "13 IPR008967;IPR012346;IPR011615;IPR036674;IPR010... \n", "\n", " Pfam \\\n", "2 PF00096; \n", "6 PF00808; \n", "13 PF00870;PF07710;PF07647; \n", "\n", " Motif \\\n", "2 MOTIF 361..369; /note=\"9aaTAD; inactive\"; /evi... \n", "6 NaN \n", "13 NaN \n", "\n", " Zinc finger \\\n", "2 ZN_FING 525..549; /note=\"C2H2-type 1\"; /eviden... \n", "6 NaN \n", "13 NaN \n", "\n", " Protein families \\\n", "2 Sp1 C2H2-type zinc-finger protein family \n", "6 NFYB/HAP3 subunit family \n", "13 P53 family \n", "\n", " Binding site Site \n", "2 NaN NaN \n", "6 NaN NaN \n", "13 BINDING 244; /ligand=\"Zn(2+)\"; /ligand_id=\"ChE... 
NaN " ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vcs.loc[vcs[\"tr_cluster_rep\"].isin(handpicked_test_trs)]" ] }, { "cell_type": "markdown", "id": "cf6b3dbc", "metadata": {}, "source": [ "# Another strategy: same as above but randomly drop most of trseq4 datapoints before we start" ] }, { "cell_type": "code", "execution_count": null, "id": "f963c8d2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "29\n", "71134\n", "0\n", "0\n", "Dropped most TR4 datapoints to avoid overfitting. New starting size: 33674\n", "Rows in test: 7310\n", "Rows to be split between train and val: 23263\n", "Total rows: 30573. Test percentage: 23.91%\n", "Lost rows: 3101\n", "Rows in val: 4434\n", "Rows left for train: 18640\n", "Total rows: 23074. Test percentage: 19.22%\n", "Lost rows: 3290\n", "Original total: 33674\n", "New, exclusive total: 30384\n", "Lost rows: 3290\n", "Length train: 18640/30384 (61.35%)\n", "Length val: 4434/30384 (14.59%)\n", "Length test: 7310/30384 (24.06%)\n", "Pass! No overlap in IDs\n", "Train-Val TR intersection: 0\n", "Train-Test TR intersection: 0\n", "Val-Test TR intersection: 0\n", "Train-Val TR Cluster Rep intersection: 0\n", "Train-Test TR Cluster Rep intersection: 0\n", "Val-Test TR Cluster Rep intersection: 0\n", "Train-Val DNA intersection: 0\n", "Train-Test DNA intersection: 0\n", "Val-Test DNA intersection: 0\n", "Train-Val DNA Cluster Rep intersection: 0\n", "Train-Test DNA Cluster Rep intersection: 0\n", "Val-Test DNA Cluster Rep intersection: 0\n", "Added reverse complement sequences to train_exclusive, val_exclusive, and test_exclusive (and leaky test_exclusive)\n", "Pass! 
No overlap in IDs\n", "Train-Val TR intersection: 0\n", "Train-Test TR intersection: 0\n", "Val-Test TR intersection: 0\n", "Train-Val TR Cluster Rep intersection: 0\n", "Train-Test TR Cluster Rep intersection: 0\n", "Val-Test TR Cluster Rep intersection: 0\n", "Train-Val DNA intersection: 0\n", "Train-Test DNA intersection: 0\n", "Val-Test DNA intersection: 0\n", "Train-Val DNA Cluster Rep intersection: 0\n", "Train-Test DNA Cluster Rep intersection: 0\n", "Val-Test DNA Cluster Rep intersection: 0\n", "Length of train_exclusive dataset: 37280. (% of train+val+test=61.35%) (% of train_val+test+leaky_test=55.35%)\n", "Length of val_exclusive dataset: 8868.(% of train+val+test = 14.59%) (% of train_val+test+leaky_test=13.17%)\n", "Length of test_exclusive dataset: 14620. (% of train+val+test= 24.06%) (% of train_val+test+leaky_test=21.71%)\n", "Length of leaky_test dataset: 6580. (% of train_val+test+leaky_test=9.77%)\n", "Total sequences = 67348. Same as edges size*2? True\n", "Saved all splits to /home/a03-svincoff/DPACMAN/dpacman/data_files/processed/splits/handpicked_val_test_cropTR4\n" ] } ], "source": [ "## Full pipeline\n", "import pandas as pd\n", "protein_clusters = pd.read_csv(\"/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/mmseqs/outputs/fimo_hits_only/protein/mmseqs_cluster.tsv\", sep=\"\\t\", header=None)\n", "protein_clusters.columns=[\"tr_cluster_rep\",\"tr_cluster_member\"]\n", "protein_clusters.head()\n", "\n", "dna_clusters = pd.read_csv(\"/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/mmseqs/outputs/fimo_hits_only/dna_full/mmseqs_cluster.tsv\", sep=\"\\t\", header=None)\n", "dna_clusters.columns=[\"dna_cluster_rep\",\"dna_cluster_member\"]\n", "dna_clusters.head()\n", "\n", "all_data = pd.read_parquet(\"/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/remap2022_crm_fimo_output_q_processed_seed0.parquet\")\n", "all_data\n", "\n", "protein_cluster_map = 
dict(zip(protein_clusters[\"tr_cluster_member\"],protein_clusters[\"tr_cluster_rep\"]))\n", "dna_cluster_map = dict(zip(dna_clusters[\"dna_cluster_member\"],dna_clusters[\"dna_cluster_rep\"]))\n", "print(len(protein_cluster_map))\n", "print(len(dna_cluster_map))\n", "all_data[\"tr_cluster_rep\"] = all_data[\"tr_seqid\"].map(protein_cluster_map)\n", "all_data[\"dna_cluster_rep\"] = all_data[\"dna_seqid\"].map(dna_cluster_map)\n", "print(len(all_data[all_data[\"tr_cluster_rep\"].isna()]))\n", "print(len(all_data[all_data[\"dna_cluster_rep\"].isna()]))\n", "all_data.head()\n", "\n", "### DROP MOST TRSEQ4 DATAPOINTS\n", "all_data_notr4 = all_data.loc[all_data[\"tr_seqid\"]!=\"trseq4\"].reset_index(drop=True)\n", "all_data_tr4 = all_data.loc[all_data[\"tr_seqid\"]==\"trseq4\"].reset_index(drop=True)\n", "all_data_tr4 = all_data_tr4.sample(8962,random_state=42).reset_index(drop=True)\n", "all_data = pd.concat([all_data_notr4,all_data_tr4]).reset_index(drop=True)\n", "print(f\"Dropped most TR4 datapoints to avoid overfitting. 
New starting size: {len(all_data)}\")\n", "\n", "### handpick test\n", "# trseq29\n", "handpicked_test_trs = [\"trseq23\",\"trseq26\",\"trseq17\"]\n", "handpicked_test = all_data.loc[\n", " all_data[\"tr_cluster_rep\"].isin(handpicked_test_trs)\n", "].reset_index(drop=True)\n", "\n", "off_limits_dna_clusters = handpicked_test[\"dna_cluster_rep\"].unique().tolist()\n", "remaining = all_data.loc[\n", " (~all_data[\"tr_cluster_rep\"].isin(handpicked_test_trs)) & \n", " (~all_data[\"dna_cluster_rep\"].isin(off_limits_dna_clusters)) \n", "].reset_index(drop=True)\n", "\n", "test_ids = handpicked_test[\"ID\"].unique().tolist()\n", "remaining_ids = remaining[\"ID\"].unique().tolist()\n", "lost_rows = all_data.loc[\n", " (~all_data[\"ID\"].isin(test_ids)) & \n", " (~all_data[\"ID\"].isin(remaining_ids))\n", "]\n", "print(f\"Rows in test: {len(handpicked_test)}\")\n", "print(f\"Rows to be split between train and val: {len(remaining)}\")\n", "total_rows = len(handpicked_test) + len(remaining)\n", "print(f\"Total rows: {total_rows}. 
Test percentage: {100*len(handpicked_test)/total_rows:.2f}%\")\n", "print(f\"Lost rows: {len(lost_rows)}\")\n", "\n", "### handpick val\n", "handpicked_val_trs = [\"trseq9\", \"trseq5\", \"trseq28\"]\n", "\n", "handpicked_val = remaining.loc[\n", " remaining[\"tr_cluster_rep\"].isin(handpicked_val_trs)\n", "].reset_index(drop=True)\n", "\n", "off_limits_dna_clusters = handpicked_val[\"dna_cluster_rep\"].unique().tolist()\n", "train_remain = remaining.loc[\n", " (~remaining[\"tr_cluster_rep\"].isin(handpicked_val_trs)) & \n", " (~remaining[\"dna_cluster_rep\"].isin(off_limits_dna_clusters)) \n", "].reset_index(drop=True)\n", "\n", "val_ids = handpicked_val[\"ID\"].unique().tolist()\n", "train_remain_ids = train_remain[\"ID\"].unique().tolist()\n", "lost_rows = all_data.loc[\n", " (~all_data[\"ID\"].isin(test_ids)) & \n", " (~all_data[\"ID\"].isin(val_ids)) & \n", " (~all_data[\"ID\"].isin(train_remain_ids))\n", "]\n", "print(f\"Rows in val: {len(handpicked_val)}\")\n", "print(f\"Rows left for train: {len(train_remain)}\")\n", "total_rows = len(handpicked_val) + len(train_remain)\n", "print(f\"Total rows: {total_rows}. 
Test percentage: {100*len(handpicked_val)/total_rows:.2f}%\")\n", "print(f\"Lost rows: {len(lost_rows)}\")\n", "\n", "train_exclusive = all_data.loc[\n", " all_data[\"ID\"].isin(train_remain_ids)\n", " ].reset_index(drop=True)\n", "\n", "val_exclusive = all_data.loc[\n", " all_data[\"ID\"].isin(val_ids)\n", " ].reset_index(drop=True)\n", "\n", "test_exclusive = all_data.loc[\n", " all_data[\"ID\"].isin(test_ids)\n", " ].reset_index(drop=True)\n", "\n", "leaky_test = all_data.loc[\n", " ~(all_data[\"ID\"].isin(train_exclusive[\"ID\"].tolist())) & \n", " ~(all_data[\"ID\"].isin(val_exclusive[\"ID\"].tolist())) & \n", " ~(all_data[\"ID\"].isin(test_exclusive[\"ID\"].tolist()))\n", "].reset_index(drop=True)\n", "\n", "print(f\"Original total: {len(all_data)}\")\n", "retained_total = len(train_exclusive)+len(val_exclusive)+len(test_exclusive)\n", "print(f\"New, exclusive total: {retained_total}\")\n", "print(f\"Lost rows: {len(all_data)-retained_total}\")\n", "print(f\"Length train: {len(train_exclusive)}/{retained_total} ({100*len(train_exclusive)/retained_total:.2f}%)\")\n", "print(f\"Length val: {len(val_exclusive)}/{retained_total} ({100*len(val_exclusive)/retained_total:.2f}%)\")\n", "print(f\"Length test: {len(test_exclusive)}/{retained_total} ({100*len(test_exclusive)/retained_total:.2f}%)\")\n", "\n", "def check_validity(train_exclusive, val_exclusive, test_exclusive):\n", " train_exclusive_ids = set(train_exclusive[\"ID\"].unique().tolist())\n", " val_exclusive_ids = set(val_exclusive[\"ID\"].unique().tolist())\n", " test_exclusive_ids = set(test_exclusive[\"ID\"].unique().tolist())\n", "\n", " assert len(train_exclusive_ids.intersection(val_exclusive_ids)) == 0\n", " assert len(train_exclusive_ids.intersection(test_exclusive_ids)) == 0\n", " assert len(val_exclusive_ids.intersection(test_exclusive_ids)) == 0\n", " print(f\"Pass! No overlap in IDs\")\n", "\n", " # Investigate TR intersection. No assertions unless we are explicitly splitting on this. 
\n", " train_exclusive_tr_seqs = set(train_exclusive[\"tr_sequence\"].unique().tolist())\n", " val_exclusive_tr_seqs = set(val_exclusive[\"tr_sequence\"].unique().tolist())\n", " test_exclusive_tr_seqs = set(test_exclusive[\"tr_sequence\"].unique().tolist())\n", "\n", " train_exclusive_tr_reps = set(train_exclusive[\"tr_cluster_rep\"].unique().tolist())\n", " val_exclusive_tr_reps = set(val_exclusive[\"tr_cluster_rep\"].unique().tolist())\n", " test_exclusive_tr_reps = set(test_exclusive[\"tr_cluster_rep\"].unique().tolist())\n", "\n", " print(f\"Train-Val TR intersection: {len(train_exclusive_tr_seqs.intersection(val_exclusive_tr_seqs))}\")\n", " print(f\"Train-Test TR intersection: {len(train_exclusive_tr_seqs.intersection(test_exclusive_tr_seqs))}\")\n", " print(f\"Val-Test TR intersection: {len(val_exclusive_tr_seqs.intersection(test_exclusive_tr_seqs))}\")\n", "\n", " print(f\"Train-Val TR Cluster Rep intersection: {len(train_exclusive_tr_reps.intersection(val_exclusive_tr_reps))}\")\n", " print(f\"Train-Test TR Cluster Rep intersection: {len(train_exclusive_tr_reps.intersection(test_exclusive_tr_reps))}\")\n", " print(f\"Val-Test TR Cluster Rep intersection: {len(val_exclusive_tr_reps.intersection(test_exclusive_tr_reps))}\")\n", "\n", " # Investigate DNA intersection. No assertions unless we are explicitly splitting on this. 
\n", " train_exclusive_dna_seqs = set(train_exclusive[\"dna_sequence\"].unique().tolist())\n", " val_exclusive_dna_seqs = set(val_exclusive[\"dna_sequence\"].unique().tolist())\n", " test_exclusive_dna_seqs = set(test_exclusive[\"dna_sequence\"].unique().tolist())\n", "\n", " train_exclusive_dna_reps = set(train_exclusive[\"dna_cluster_rep\"].unique().tolist())\n", " val_exclusive_dna_reps = set(val_exclusive[\"dna_cluster_rep\"].unique().tolist())\n", " test_exclusive_dna_reps = set(test_exclusive[\"dna_cluster_rep\"].unique().tolist())\n", "\n", " print(f\"Train-Val DNA intersection: {len(train_exclusive_dna_seqs.intersection(val_exclusive_dna_seqs))}\")\n", " print(f\"Train-Test DNA intersection: {len(train_exclusive_dna_seqs.intersection(test_exclusive_dna_seqs))}\")\n", " print(f\"Val-Test DNA intersection: {len(val_exclusive_dna_seqs.intersection(test_exclusive_dna_seqs))}\")\n", "\n", " print(f\"Train-Val DNA Cluster Rep intersection: {len(train_exclusive_dna_reps.intersection(val_exclusive_dna_reps))}\")\n", " print(f\"Train-Test DNA Cluster Rep intersection: {len(train_exclusive_dna_reps.intersection(test_exclusive_dna_reps))}\")\n", " print(f\"Val-Test DNA Cluster Rep intersection: {len(val_exclusive_dna_reps.intersection(test_exclusive_dna_reps))}\")\n", "\n", "def get_reverse_complement(s):\n", " \"\"\"\n", " Returns 5' to 3' sequence of the reverse complement\n", " \"\"\"\n", " chars = list(s)\n", " recon = []\n", " rev_map = {\n", " \"a\": \"t\",\n", " \"c\": \"g\",\n", " \"t\": \"a\",\n", " \"g\": \"c\",\n", " \"A\": \"T\",\n", " \"C\": \"G\",\n", " \"T\": \"A\",\n", " \"G\": \"C\",\n", " \"n\": \"n\",\n", " \"N\": \"N\",\n", " }\n", " for c in chars:\n", " recon += [rev_map[c]]\n", "\n", " recon = \"\".join(recon)\n", " return recon[::-1]\n", "\n", "# now make reverse complements\n", "def augment_rc(df):\n", " \"\"\"\n", " Get the reverse complement and add it as a datapoint, effectively doubling the dataset.\n", " Also flip the orientation of the 
scores\n", "\n", " columns = [\"ID\",\"dna_sequence\",\"tr_sequence\",\"tr_cluster_rep\",\"dna_cluster_rep\", \"scores\",\"split\"]\n", " \"\"\"\n", " df_rc = df.copy(deep=True)\n", "\n", " df_rc[\"dna_sequence\"] = df_rc[\"dna_sequence\"].apply(\n", " lambda x: get_reverse_complement(x)\n", " )\n", " df_rc[\"ID\"] = df_rc[\"ID\"] + \"_rc\"\n", " df_rc[\"scores\"] = df_rc[\"scores\"].apply(lambda s: \",\".join(s.split(\",\")[::-1]))\n", "\n", " final_df = pd.concat([df, df_rc]).reset_index(drop=True)\n", "\n", " return final_df\n", "\n", "def convert_scores(scores, mode=1):\n", " \"\"\"\n", " Two modes: 1 means FIMO peaks get 1. 0 means FIMO peaks get their max score\n", " \"\"\"\n", " svec = [int(x) for x in scores.split(\",\")]\n", " max_score = max(svec)\n", " if mode ==1:\n", " binary_svec = [0 if x