Spaces:

ArchitSharma
/

Text-Summarization-Tool

Sleeping

App Files Files Community

ArchitSharma committed on Apr 19

Commit

f25768b

verified ·

1 Parent(s): 78fb6d3

Update app.py

Browse files

Files changed (1) hide show

app.py +80 -69

app.py CHANGED Viewed

@@ -3,8 +3,6 @@ import validators
 import streamlit as st
 from transformers import AutoTokenizer, pipeline
-# local modules
-from summarizer import Summarizer
 from utils import (
     clean_text,
     fetch_article_text,
@@ -12,97 +10,110 @@ from utils import (
     read_text_from_file,
 )
 if __name__ == "__main__":
-    # ---------------------------------
-    # Main Application
-    # ---------------------------------
     st.title("Text Summarization Tool 📝")
     st.markdown("---")
-    summarize_type = st.sidebar.selectbox(
-        "Summarization Type", options=["Extractive", "Abstractive"]
-    )
     st.markdown(
-        """This app supports two type of summarization:
-1. **Extractive Summarization**: The extractive approach involves picking up the most important phrases and lines from the documents. It then combines all the important lines to create the summary. So, in this case, every line and word of the summary actually belongs to the original document which is summarized.
-2. **Abstractive Summarization**: The abstractive approach involves rephrasing the complete document while capturing the complete meaning of the document. This type of summarization provides more human-like summary"""
-    )
-    # ---------------------------
-    # SETUP & Constants
-    nltk.download("punkt")
-    abs_tokenizer_name = "facebook/bart-large-cnn"
-    abs_model_name = "facebook/bart-large-cnn"
-    abs_tokenizer = AutoTokenizer.from_pretrained(abs_tokenizer_name)
-    abs_max_length = 130
-    abs_min_length = 30
-    # ---------------------------
-    inp_text = st.text_input("Enter Text or a URL here")
-    st.markdown(
-        "<h3 style='text-align: center;'>OR</h3>",
-        unsafe_allow_html=True,
     )
     uploaded_file = st.file_uploader(
         "Upload a .txt, .pdf, .docx file for summarization"
     )
-    is_url = validators.url(inp_text)
-    if is_url:
-        # complete text, chunks to summarize (list of sentences for long docs)
-        text, clean_txt = fetch_article_text(url=inp_text)
-    elif uploaded_file:
-        clean_txt = read_text_from_file(uploaded_file)
-        clean_txt = clean_text(clean_txt)
-    else:
-        clean_txt = clean_text(inp_text)
-    # view summarized text (expander)
     with st.expander("View Input Text"):
-        if is_url:
-            st.write(clean_txt[0])
         else:
             st.write(clean_txt)
     summarize = st.button("Summarize")
-    # called on toggle button [summarize]
     if summarize:
-        if summarize_type == "Extractive":
             if is_url:
-                text_to_summarize = " ".join([txt for txt in clean_txt])
             else:
-                text_to_summarize = clean_txt
-            # extractive summarizer
-            with st.spinner(
-                text="Creating extractive summary. This might take a few seconds ..."
-            ):
-                ext_model = Summarizer()
-                summarized_text = ext_model(text_to_summarize)
-        elif summarize_type == "Abstractive":
-            with st.spinner(
-                text="Creating abstractive summary. This might take a few seconds ..."
-            ):
-                text_to_summarize = clean_txt
-                abs_summarizer = pipeline(
-                    "summarization", model=abs_model_name, tokenizer=abs_tokenizer_name
-                )
-                if is_url is False:
-                    # list of chunks
-                    text_to_summarize = preprocess_text_for_abstractive_summarization(
-                        tokenizer=abs_tokenizer, text=clean_txt
                     )
-                tmp_sum = abs_summarizer(
-                    text_to_summarize,
-                    max_length=len(text_to_summarize),
-                    min_length=abs_min_length,
                     do_sample=False,
                 )
-                summarized_text = " ".join([summ["summary_text"] for summ in tmp_sum])
-        # final summarized output
         st.subheader("Summarized text")
         st.info(summarized_text)

 import streamlit as st
 from transformers import AutoTokenizer, pipeline
 from utils import (
     clean_text,
     fetch_article_text,
     read_text_from_file,
 )
+ABS_TOKENIZER_NAME = "facebook/bart-large-cnn"  # checkpoint id given to AutoTokenizer.from_pretrained and as the pipeline's tokenizer
+ABS_MODEL_NAME = "facebook/bart-large-cnn"  # checkpoint id given to the summarization pipeline as its model
+ABS_MIN_LENGTH = 30  # forwarded as min_length on every summarizer call
+ABS_MAX_LENGTH = 130  # forwarded as max_length on every summarizer call
+@st.cache_resource  # NOTE(review): intended to let Streamlit reuse the returned object across reruns — see st.cache_resource docs
+def load_tokenizer():  # Return the tokenizer for ABS_TOKENIZER_NAME; takes no arguments.
+    return AutoTokenizer.from_pretrained(ABS_TOKENIZER_NAME)
+@st.cache_resource  # NOTE(review): intended to let Streamlit reuse the returned object across reruns — see st.cache_resource docs
+def load_summarizer():  # Return a transformers "summarization" pipeline; takes no arguments.
+    return pipeline(
+        "summarization",
+        model=ABS_MODEL_NAME,  # model and tokenizer are both given by name, not as loaded objects
+        tokenizer=ABS_TOKENIZER_NAME,
+    )
+def normalize_input_text(inp_text, uploaded_file):  # Choose the input source (URL, else uploaded file, else typed text); returns (is_url, clean_txt).
+    is_url = bool(inp_text and validators.url(inp_text))  # empty inp_text short-circuits; whatever validators.url returns is coerced to a plain bool
+    if is_url:
+        _, clean_txt = fetch_article_text(url=inp_text)  # the helper returns a pair; its first element is discarded here
+    elif uploaded_file:  # only reached when inp_text is not a URL, so a URL takes precedence over an upload
+        clean_txt = read_text_from_file(uploaded_file)
+        clean_txt = clean_text(clean_txt)  # file contents go through the same clean_text as typed input
+    else:
+        clean_txt = clean_text(inp_text)  # fallback: treat inp_text as raw text (it may be empty)
+    return is_url, clean_txt
 if __name__ == "__main__":
+    st.set_page_config(page_title="Text Summarization Tool", page_icon="📝")  # issued before any other st.* call in this block
     st.title("Text Summarization Tool 📝")
     st.markdown("---")
     st.markdown(
+        """
+This app creates **abstractive summaries** using a Hugging Face Transformers summarization pipeline.
+- Paste text
+- Enter a URL
+- Or upload a `.txt`, `.pdf`, or `.docx` file
+"""
     )
+    nltk.download("punkt", quiet=True)  # nothing in this block uses nltk directly; presumably needed by the utils helpers — confirm
+    abs_tokenizer = load_tokenizer()  # tokenizer and summarizer are obtained up front, before any input widget is drawn
+    abs_summarizer = load_summarizer()
+    inp_text = st.text_input("Enter text or a URL here")  # one field serves both raw text and URLs; normalize_input_text decides which
+    st.markdown("<h3 style='text-align: center;'>OR</h3>", unsafe_allow_html=True)  # static, hard-coded HTML only
     uploaded_file = st.file_uploader(
         "Upload a .txt, .pdf, .docx file for summarization"
     )
+    is_url, clean_txt = normalize_input_text(inp_text, uploaded_file)  # clean_txt is handled below as either a str or a list of str
     with st.expander("View Input Text"):
+        if isinstance(clean_txt, list):  # a list is joined with single spaces for display
+            st.write(" ".join(clean_txt))
         else:
             st.write(clean_txt)
     summarize = st.button("Summarize")
     if summarize:
+        if not clean_txt:  # nothing to summarize (empty string, empty list or None)
+            st.warning("Please enter text, a URL, or upload a file.")
+            st.stop()  # end this script run so the summarization code below is not reached
+        with st.spinner("Creating summary. This might take a few seconds..."):
             if is_url:
+                text_chunks = clean_txt if isinstance(clean_txt, list) else [clean_txt]  # URL input: use the list as-is, wrap anything else
             else:
+                if isinstance(clean_txt, list):  # already a list: used as chunks without further splitting
+                    text_chunks = clean_txt
+                else:
+                    text_chunks = preprocess_text_for_abstractive_summarization(  # presumably splits the text into model-sized chunks — confirm in utils
+                        tokenizer=abs_tokenizer,
+                        text=clean_txt,
                     )
+            if isinstance(text_chunks, str):  # defensive: wrap a bare string so the loop iterates chunks, not characters
+                text_chunks = [text_chunks]
+            summaries = []
+            for chunk in text_chunks:
+                if not chunk or not chunk.strip():  # skip None, empty and whitespace-only chunks
+                    continue
+                result = abs_summarizer(  # one pipeline call per chunk, same length bounds for every chunk
+                    chunk,
+                    max_length=ABS_MAX_LENGTH,
+                    min_length=ABS_MIN_LENGTH,
                     do_sample=False,
                 )
+                summaries.append(result[0]["summary_text"])  # the result is read as a list whose first item holds "summary_text"
+            summarized_text = " ".join(summaries)  # NOTE(review): if every chunk was skipped this is "", and an empty box is shown below
         st.subheader("Summarized text")
         st.info(summarized_text)