| | import streamlit as st |
| | import pandas as pd |
| | from keybert import KeyBERT |
| |
|
| | import seaborn as sns |
| |
|
| | from src.Pipeline.TextSummarization import T5_Base |
| | from src.Pipeline.QuestGen import sense2vec_get_words,get_question |
| |
|
| |
|
# Page header and the collapsible "About" panel.
# NOTE(review): the leading "β" in the title and the "π" in the
# "Paste document" heading look like residue of mis-decoded emoji. The
# intended glyphs cannot be recovered from this file, so both literals are
# left untouched — confirm and restore.
st.title("β Intelligent Question Generator")
st.header("")  # empty header, presumably used as vertical spacing

with st.expander("ℹ️ - About this app", expanded=True):
    # The list items start at column 0 so Markdown renders them as bullets
    # rather than as an indented code block.
    st.write(
        """
- The *Intelligent Question Generator* app is an easy-to-use interface built in Streamlit which uses [KeyBERT](https://github.com/MaartenGr/KeyBERT), [Sense2vec](https://github.com/explosion/sense2vec), [T5](https://huggingface.co/ramsrigouthamg/t5_paraphraser)
- It uses a minimal keyword extraction technique that leverages multiple NLP embeddings and relies on [Transformers](https://huggingface.co/transformers/) 🤗 to create keywords/keyphrases that are most similar to a document.
- [sense2vec](https://github.com/explosion/sense2vec) (Trask et. al, 2015) is a nice twist on word2vec that lets you learn more interesting and detailed word vectors.
"""
    )

    st.markdown("")

st.markdown("")
st.markdown("## π Paste document ")
| |
|
# Input form: extraction settings in the left column, the document text area
# and the submit button in the wide middle column.
with st.form(key="my_form"):
    # The two narrow `ce` columns are only horizontal spacers.
    ce, c1, ce, c2, c3 = st.columns([0.07, 2, 0.07, 5, 1])
    with c1:
        ModelType = st.radio(
            "Choose your model",
            ["DistilBERT (Default)", "BERT", "RoBERTa", "ALBERT", "XLNet"],
            help="At present, you can choose 1 model ie DistilBERT to embed your text. More to come!",
        )

        @st.cache(allow_output_mutation=True)
        def load_model(model):
            """Build a KeyBERT extractor on top of the embedding model `model`.

            Decorated with `st.cache` so Streamlit can reuse the returned
            object across script reruns instead of rebuilding it each time.
            """
            return KeyBERT(model=model)

        # Every radio choice loads DistilBERT, matching the widget's help text.
        # The previous `ModelType == "Default (DistilBERT)"` test could never
        # match the option label "DistilBERT (Default)", so its branch (and the
        # second, identical `load_model` definition) was dead code.
        kw_model = load_model("distilbert-base-nli-mean-tokens")

        top_N = st.slider(
            "# of results",
            min_value=1,
            max_value=30,
            value=10,
            help="You can choose the number of keywords/keyphrases to display. Between 1 and 30, default number is 10.",
        )
        min_Ngrams = st.number_input(
            "Minimum Ngram",
            min_value=1,
            max_value=4,
            help="""The minimum value for the ngram range.
*Keyphrase_ngram_range* sets the length of the resulting keywords/keyphrases.To extract keyphrases, simply set *keyphrase_ngram_range* to (1, 2) or higher depending on the number of words you would like in the resulting keyphrases.""",
        )

        max_Ngrams = st.number_input(
            "Maximum Ngram",
            value=1,
            min_value=1,
            max_value=4,
            help="""The maximum value for the keyphrase_ngram_range.
*Keyphrase_ngram_range* sets the length of the resulting keywords/keyphrases.
To extract keyphrases, simply set *keyphrase_ngram_range* to (1, 2) or higher depending on the number of words you would like in the resulting keyphrases.""",
        )

        StopWordsCheckbox = st.checkbox(
            "Remove stop words",
            value=True,
            help="Tick this box to remove stop words from the document (currently English only)",
        )

        use_MMR = st.checkbox(
            "Use MMR",
            value=True,
            help="You can use Maximal Margin Relevance (MMR) to diversify the results. It creates keywords/keyphrases based on cosine similarity. Try high/low 'Diversity' settings below for interesting variations.",
        )

        Diversity = st.slider(
            "Keyword diversity (MMR only)",
            value=0.5,
            min_value=0.0,
            max_value=1.0,
            step=0.1,
            help="""The higher the setting, the more diverse the keywords.Note that the *Keyword diversity* slider only works if the *MMR* checkbox is ticked.""",
        )

    with c2:
        doc = st.text_area(
            "Paste your text below (max 500 words)",
            height=510,
        )

        MAX_WORDS = 500
        import re

        # Keep the match objects (not just the count) so the text can be cut
        # at a word boundary below.
        word_matches = list(re.finditer(r"\w+", doc))
        res = len(word_matches)
        if res > MAX_WORDS:
            # NOTE(review): the trailing "π" looks like residue of a
            # mis-decoded emoji; left untouched because the intended glyph
            # cannot be recovered from this file.
            st.warning(
                f"⚠️ Your text contains {res} words."
                f" Only the first {MAX_WORDS} words will be reviewed. Stay tuned as increased allowance is coming! π"
            )

            # Truncate to the first MAX_WORDS *words*. The previous
            # `doc[:MAX_WORDS]` slice kept only the first 500 characters.
            doc = doc[: word_matches[MAX_WORDS - 1].end()]

        submit_button = st.form_submit_button(label="✨ Get me the data!")
| |
|
# Translate the checkbox states into the arguments passed to KeyBERT.
mmr = bool(use_MMR)
StopWords = "english" if StopWordsCheckbox else None

# An n-gram range whose lower bound exceeds its upper bound is rejected:
# warn the user and halt this script run before extraction.
if min_Ngrams > max_Ngrams:
    st.warning("min_Ngrams can't be greater than max_Ngrams")
    st.stop()

# Run keyword extraction on the document with the options chosen in the form.
ngram_range = (min_Ngrams, max_Ngrams)
keywords = kw_model.extract_keywords(
    doc,
    keyphrase_ngram_range=ngram_range,
    use_mmr=mmr,
    stop_words=StopWords,
    top_n=top_N,
    diversity=Diversity,
)
| | |
| | |
# Results section: show the extracted keywords as a styled, ranked table.
st.markdown("## π Results ")
st.header("")

# Colour maps and number format used by the table styling below.
cmGreen = sns.light_palette("green", as_cmap=True)
cmRed = sns.light_palette("red", as_cmap=True)
format_dictionary = {"Relevancy": "{:.2%}"}

# Rank keywords by relevancy, highest first, and number the rows from 1.
keyword_table = pd.DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
keyword_table = keyword_table.sort_values(by="Relevancy", ascending=False)
keyword_table = keyword_table.reset_index(drop=True)
keyword_table.index = keyword_table.index + 1

# `df` ends up as a pandas Styler: green gradient on the score column,
# scores rendered as percentages with two decimals.
df = keyword_table.style.background_gradient(
    cmap=cmGreen, subset=["Relevancy"]
).format(format_dictionary)

# Render the table in the wide middle column of a 1:3:1 layout.
c1, c2, c3 = st.columns([1, 3, 1])
with c2:
    st.table(df)

with st.expander("Note about Quantitative Relevancy"):
    st.markdown(
        """
- The relevancy score is a quantitative measure of how relevant the keyword/keyphrase is to the document. It is calculated using cosine similarity. The higher the score, the more relevant the keyword/keyphrase is to the document.
- So if you see a keyword/keyphrase with a high relevancy score, it means that it is a good keyword/keyphrase to use in question answering, generation ,summarization, and other NLP tasks.
"""
    )
| |
|
# Question form: the user picks how many sense2vec neighbours to fetch and
# supplies one keyword; each returned term is then used as the answer for
# one generated question.
with st.form(key="ques_form"):
    # The two narrow `ice` columns are only horizontal spacers.
    ice, ic1, ice, ic2, ic3 = st.columns([0.07, 2, 0.07, 5, 0.07])
    with ic1:
        TopN = st.slider(
            "Top N sense2vec results",
            value=20,
            min_value=0,
            max_value=50,
            step=1,
            help="""Get the n most similar terms.""",
        )

    with ic2:
        input_keyword = st.text_input("Paste any keyword generated above")
        keywrd_button = st.form_submit_button(label="✨ Get me the questions!")

if keywrd_button:
    # NOTE(review): the "π" in this heading looks like residue of a
    # mis-decoded emoji; left untouched because the intended glyph cannot be
    # recovered from this file.
    st.markdown("## π Questions ")
    ext_keywrds = sense2vec_get_words(TopN, input_keyword)
    if len(ext_keywrds) < 1:
        st.warning("Sorry questions couldn't be generated")

    # Collapse all whitespace runs in the document once; the result is the
    # same for every keyword, so it does not belong inside the loop.
    sentence_for_T5 = " ".join(doc.split())

    for answer in ext_keywrds:
        ques = get_question(sentence_for_T5, answer)
        # Strip the special tokens that can appear in the generated text.
        ques = ques.replace("<pad>", "").replace("</s>", "").replace("<s>", "")
        st.markdown(f"> #### {ques} ")
| |
|
| |
|
| |
|