| | import streamlit as st |
| | from stmol import showmol |
| | import py3Dmol |
| | import requests |
| | import biotite.structure.io as bsio |
| | import random |
| | import hashlib |
| | import urllib3 |
| | from Bio.Blast import NCBIWWW, NCBIXML |
| | from Bio.Seq import Seq |
| | from Bio.SeqRecord import SeqRecord |
| | import time |
| | import urllib.parse |
| |
|
# Suppress urllib3's InsecureRequestWarning for the whole process; the
# structure-prediction request in update() is sent with verify=False.
urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning)

# Page configuration is issued before any other Streamlit element.
st.set_page_config(layout='wide')

# Sidebar header: app name followed by a one-paragraph description.
sidebar_intro = 'GenPro2 is an end-to-end protein sequence generator, structure predictor, and analysis that uses [ESMFold](https://esmatlas.com/explore?at=1%2C1%2C21.999999344348925) and the ESM-2 language model | beta v2.12'
st.sidebar.title('🔮 GenPro2')
st.sidebar.write(sidebar_intro)
| |
|
def generate_sequence_from_words(words, length):
    """Return a deterministic pseudo-random protein sequence derived from *words*.

    The words are joined with single spaces, UTF-8 encoded and hashed with
    MD5; the hex digest seeds a private random generator from which
    ``length`` residues are drawn.

    Args:
        words: Iterable of strings that together act as the seed.
        length: Number of residues to generate.

    Returns:
        A string of ``length`` characters taken from ``ACDEFGHIKLMNPQRSTVWY``.
        The same words and length always yield the same string.
    """
    seed = ' '.join(words).encode('utf-8')
    # Use a private generator instead of random.seed(): it produces the same
    # stream for the same seed, but leaves the process-wide generator alone,
    # so other users of `random` (including concurrent callers of this
    # function) cannot disturb, or be disturbed by, this draw.
    rng = random.Random(hashlib.md5(seed).hexdigest())
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"
    return ''.join(rng.choice(amino_acids) for _ in range(length))
| |
|
def render_mol(pdb):
    """Display a PDB structure as a spinning cartoon model in the page.

    Args:
        pdb: PDB-format text of the structure to show.

    Returns:
        None. The viewer is embedded in the Streamlit page via ``showmol``.
    """
    # Sizing rules for the `.stmol-container` class, with a shorter height
    # on narrow screens.
    # NOTE(review): assumes the embedded viewer is wrapped in an element
    # carrying that class -- confirm against stmol's generated markup.
    container_css = """
<style>
.stmol-container {
width: 100% !important;
height: 400px !important;
max-width: 800px;
margin: 0 auto;
}
@media (max-width: 600px) {
.stmol-container {
height: 300px !important;
}
}
</style>
"""

    # Build the 3D view: rainbow-coloured cartoon on a white background,
    # framed on the model, then zoomed by a factor of 0.8 and set spinning.
    view = py3Dmol.view(width='100%', height='400px')
    view.addModel(pdb, 'pdb')
    view.setStyle({'cartoon': {'color': 'spectrum'}})
    view.setBackgroundColor('white')
    view.zoomTo()
    view.zoom(0.8)
    view.spin(True)
    view.render()

    st.markdown(container_css, unsafe_allow_html=True)

    showmol(view, height=400, width=None)
| |
|
def perform_blast_analysis(sequence):
    """Run a remote BLASTP search for *sequence* and show the best hit.

    The query is submitted with ``NCBIWWW.qblast`` against the ``swissprot``
    database. While the request is in flight a progress bar advances one
    step every 1.9 seconds (capped at 99%) and is completed as soon as the
    result arrives, so no time is spent waiting before the query is sent.

    Args:
        sequence: Protein sequence string to search for.

    Returns:
        None. The top match (or a "no matches" note) is written to the
        Streamlit page; any failure is reported with ``st.error``.
    """
    # Imported locally because the module header does not import it.
    from concurrent.futures import ThreadPoolExecutor, wait

    st.subheader('Protein Analysis')
    with st.spinner("Analyzing generated protein... This may take several minutes. Stay tuned!"):
        progress_bar = st.progress(0)

        try:
            record = SeqRecord(Seq(sequence), id='random_protein')

            # The blocking network call runs in a worker thread so this
            # thread can keep the progress bar moving. The worker never
            # touches Streamlit; it only returns the result handle.
            with ThreadPoolExecutor(max_workers=1) as pool:
                pending = pool.submit(NCBIWWW.qblast, "blastp", "swissprot", record.seq)
                percent = 0
                while not pending.done():
                    # Hold at 99% until the result is actually available.
                    percent = min(percent + 1, 99)
                    progress_bar.progress(percent)
                    # Returns early if the query finishes within the interval.
                    wait([pending], timeout=1.9)
                # Re-raises here anything the query raised in the worker.
                result_handle = pending.result()
            progress_bar.progress(100)

            try:
                blast_record = NCBIXML.read(result_handle)
            finally:
                result_handle.close()

            if blast_record.alignments:
                alignment = blast_record.alignments[0]
                hsp = alignment.hsps[0]

                # The hit title is split on '|': the last field is shown as
                # the match name and the one before it as the identifier.
                # NOTE(review): presumably titles look like
                # "db|ACCESSION|description" -- confirm against live output.
                title_parts = alignment.title.split('|')
                protein_name = title_parts[-1].strip()
                if len(title_parts) >= 2:
                    identifier = title_parts[-2].split('OS=')[-1].split('OX=')[0].strip()
                else:
                    # A title without '|' separators has no identifier field;
                    # show a placeholder rather than fail the whole analysis.
                    identifier = "N/A"

                identity_percentage = (hsp.identities / hsp.align_length) * 100

                st.write(f"**Top Match:** {protein_name}")
                st.write(f"**UniProt ID:** {identifier}")
                st.write(f"**Sequence Identity Match:** {identity_percentage:.2f}%")

                if hasattr(alignment, 'description') and alignment.description:
                    st.write(f"**Potential Function:** {alignment.description}")
            else:
                st.write("No significant matches found in the database. This might be a unique protein sequence!")

        except Exception as e:
            # UI boundary: report any failure to the user instead of
            # crashing the page.
            st.error(f"An error occurred during protein analysis: {str(e)}")
            st.write("Please try again later, BLAST servers could be experiencing a delay.")
| | |
def update(sequence, word1, word2, word3, sequence_length):
    """Predict the structure of *sequence* and store the result in session state.

    The sequence is POSTed to the ESM Atlas fold endpoint. The returned PDB
    text is parsed to obtain the mean b-factor, and both are saved in
    ``st.session_state.structure_info`` together with the seed words and
    length. ``st.session_state.show_analyze_button`` is set to ``True``.

    The PDB text is parsed from a file inside a private temporary directory
    that is removed afterwards. No ``predicted.pdb`` is left in the working
    directory, so simultaneous calls cannot overwrite each other's file.

    Args:
        sequence: Protein sequence string sent as the request body.
        word1: First seed word, stored for display.
        word2: Second seed word, stored for display.
        word3: Third seed word, stored for display.
        sequence_length: Requested sequence length, stored for display.

    Returns:
        None. Request failures are reported on the page via ``st.error``.
    """
    # Imported locally because the module header does not import them.
    import os
    import tempfile

    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    try:
        # NOTE(review): verify=False disables TLS certificate checking for
        # this request. Left unchanged here; confirm it is still required.
        response = requests.post('https://api.esmatlas.com/foldSequence/v1/pdb/',
                                 headers=headers,
                                 data=sequence,
                                 verify=False,
                                 timeout=300)
        response.raise_for_status()
        pdb_string = response.content.decode('utf-8')

        # load_structure reads from a path, so the text goes to disk first;
        # a per-call directory keeps that file private to this call.
        with tempfile.TemporaryDirectory() as scratch_dir:
            pdb_path = os.path.join(scratch_dir, 'predicted.pdb')
            with open(pdb_path, 'w', encoding='utf-8') as f:
                f.write(pdb_string)

            struct = bsio.load_structure(pdb_path, extra_fields=["b_factor"])
        b_value = round(struct.b_factor.mean(), 2)

        st.session_state.structure_info = {
            'pdb_string': pdb_string,
            'b_value': b_value,
            'word1': word1,
            'word2': word2,
            'word3': word3,
            'sequence_length': sequence_length
        }

        st.session_state.show_analyze_button = True

    except requests.exceptions.RequestException as e:
        st.error(f"An error occurred while calling the API: {str(e)}")
        st.write("Please try again later or contact support if the issue persists.")
| |
|
def share_on_twitter(word1, word2, word3, length, plddt):
    """Build a Twitter "intent" link carrying a pre-filled post.

    Args:
        word1: First seed word, rendered as a hashtag.
        word2: Second seed word, rendered as a hashtag.
        word3: Third seed word, rendered as a hashtag.
        length: Sequence length mentioned in the post.
        plddt: Confidence score mentioned in the post, shown with a ``%``.

    Returns:
        The intent URL with the post text percent-encoded in ``text=``.
    """
    hashtags = ", ".join(f"#{word}" for word in (word1, word2, word3))
    message = (
        "I just generated a unique protein using #GenPro2 by @WandsAI "
        f"using the seed words {hashtags} + sequence length of {length}. "
        f"My Protein has a {plddt}% plDDT score! #PostYourProtein"
    )
    return "https://twitter.com/intent/tweet?text=" + urllib.parse.quote(message)
| |
|
| | |
# Give each session-state key the app relies on an initial value on the
# first run; values that already exist are left untouched on reruns.
for _state_key, _initial_value in (
    ('sequence', None),
    ('show_analyze_button', False),
    ('structure_info', None),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value
| |
|
# ---- Main page heading ----
st.title("📖 User Guide:")

# ---- Sidebar inputs: three seed words plus the desired sequence length ----
sidebar = st.sidebar
sidebar.subheader("Generate Sequence from Words")
word1, word2, word3 = (sidebar.text_input(f"Word {position}") for position in (1, 2, 3))
sequence_length = sidebar.number_input(
    "Sequence Length",
    min_value=50,
    max_value=400,
    value=100,
    step=10,
)

# ---- Main page: length guide followed by the step-by-step instructions ----
st.info("""
Protein Length Guide:
- 50-100 amino acids: Small proteins/peptides
- 100-300 amino acids: Average protein domains
- 300-500 amino acids: Larger single-domain proteins

""")

st.markdown("""
1. Start by entering any three seed words of your choice and select a sequence length in the sidebar.
2. Click 'Generate and Predict' to generate a unique protein sequence based on your inputs.
3. GenPro2 then predicts the 3D structure of your protein and provides a confidence score.

More about GenPro2 and Proteins:
Your unique protein could be the key to unlocking new therapeutic possibilities or understanding disease mechanisms. Who knows? Your next generated sequence might just lead to a breakthrough. Start your journey into computational protein exploration! [Learn more](https://www.youtube.com/watch?v=KpedmJdrTpY)
""")

# ---- Generate button: needs all three words, then generates and predicts ----
if sidebar.button('Generate and Predict'):
    seed_words = [word1, word2, word3]
    if not all(seed_words):
        sidebar.warning("Please enter all three words to generate a sequence.")
    else:
        sequence = generate_sequence_from_words(seed_words, sequence_length)
        st.session_state.sequence = sequence
        sidebar.text_area("Generated Sequence", sequence, height=100)
        sidebar.info("Note: The same words and sequence length will always produce the same sequence.")

        with st.spinner("Predicting protein structure... This may take a few minutes."):
            update(sequence, word1, word2, word3, sequence_length)
| |
|
| | |
# Results view: rendered whenever a prediction is held in session state, so
# it stays visible across reruns triggered by the buttons below.
if st.session_state.structure_info:
    info = st.session_state.structure_info
    st.subheader(f'Predicted protein structure using seed: {info["word1"]}, {info["word2"]}, and {info["word3"]} + sequence length {info["sequence_length"]}')
    render_mol(info['pdb_string'])

    st.subheader('plDDT Confidence Score')
    st.write('plDDT is a bench mark for scoring the confidence level of protein folding predictions based on a scale from 0-100%. 70% or more is good!')
    # Round before converting: plain int() truncates binary float error,
    # e.g. 0.57 * 100 == 56.99999999999999 would be shown as 56%.
    # NOTE(review): the x100 presumes b_value is a 0-1 fraction -- confirm
    # against the fold API's b-factor scale.
    plddt_score = int(round(info["b_value"] * 100))
    st.info(f'Your plDDT score is: {plddt_score}%')

    st.subheader("Share your unique protein on X(Twitter)")

    st.markdown("""
<div style='background-color: #e6f2ff; padding: 10px; border-radius: 5px; font-size: 0.8em;'>
<ol>
<li>Take a screenshot of the protein structure above.</li>
<li>Click the 'Share Results' link below to open a pre-filled post with your proteins seed-words and plDDT score.</li>
<li>Be sure to attach a screenshot of your protein before you post!</li>
</ol>
</div>
""", unsafe_allow_html=True)

    tweet_url = share_on_twitter(info["word1"], info["word2"], info["word3"], info["sequence_length"], plddt_score)
    st.markdown(f"[Share Results]({tweet_url})")

    st.markdown("""
## What to do next:

""")

    # Two side-by-side actions: run the BLAST analysis (its output is
    # rendered inside the left column) or download the PDB text.
    col1, col2 = st.columns(2)
    with col1:
        if st.button('Analyze Protein'):
            perform_blast_analysis(st.session_state.sequence)

    with col2:
        st.download_button(
            label="Download PDB",
            data=info['pdb_string'],
            file_name='predicted.pdb',
            mime='text/plain',
        )

    # NOTE(review): assumed to belong to the results view because it refers
    # to the buttons above -- confirm the intended nesting.
    st.markdown("""
If you discover an interesting protein structure, you can explore it even further:

1. Click the 'analyze protein' button to search the [BLAST](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastp&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome) protein database and see if your protein matches any known sequences. The sequence identity will show how close your sequence matches. *Note this can take several minutes
2. Download your protein data and visit the [Protein Data Bank (PDB)](https://www.rcsb.org/) to match your protein structure against known protein structures.
3. If you think you've discovered a unique and useful protein share it with the world on social media!


**Remember, this folding is based on randomly generated sequences. Interpret the results with caution.
Enjoy exploring the world of protein sequences!
""")
| |
|
| |
|