ArchitSharma committed on
Commit
f25768b
·
verified ·
1 Parent(s): 78fb6d3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -69
app.py CHANGED
@@ -3,8 +3,6 @@ import validators
3
  import streamlit as st
4
  from transformers import AutoTokenizer, pipeline
5
 
6
- # local modules
7
- from summarizer import Summarizer
8
  from utils import (
9
  clean_text,
10
  fetch_article_text,
@@ -12,97 +10,110 @@ from utils import (
12
  read_text_from_file,
13
  )
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  if __name__ == "__main__":
16
- # ---------------------------------
17
- # Main Application
18
- # ---------------------------------
19
  st.title("Text Summarization Tool 📝")
20
 
21
  st.markdown("---")
22
- summarize_type = st.sidebar.selectbox(
23
- "Summarization Type", options=["Extractive", "Abstractive"]
24
- )
25
  st.markdown(
26
- """This app supports two type of summarization:
 
27
 
28
- 1. **Extractive Summarization**: The extractive approach involves picking up the most important phrases and lines from the documents. It then combines all the important lines to create the summary. So, in this case, every line and word of the summary actually belongs to the original document which is summarized.
29
- 2. **Abstractive Summarization**: The abstractive approach involves rephrasing the complete document while capturing the complete meaning of the document. This type of summarization provides more human-like summary"""
30
- )
31
- # ---------------------------
32
- # SETUP & Constants
33
- nltk.download("punkt")
34
- abs_tokenizer_name = "facebook/bart-large-cnn"
35
- abs_model_name = "facebook/bart-large-cnn"
36
- abs_tokenizer = AutoTokenizer.from_pretrained(abs_tokenizer_name)
37
- abs_max_length = 130
38
- abs_min_length = 30
39
- # ---------------------------
40
-
41
- inp_text = st.text_input("Enter Text or a URL here")
42
- st.markdown(
43
- "<h3 style='text-align: center;'>OR</h3>",
44
- unsafe_allow_html=True,
45
  )
 
 
 
 
 
 
 
 
46
  uploaded_file = st.file_uploader(
47
  "Upload a .txt, .pdf, .docx file for summarization"
48
  )
49
 
50
- is_url = validators.url(inp_text)
51
- if is_url:
52
- # complete text, chunks to summarize (list of sentences for long docs)
53
- text, clean_txt = fetch_article_text(url=inp_text)
54
- elif uploaded_file:
55
- clean_txt = read_text_from_file(uploaded_file)
56
- clean_txt = clean_text(clean_txt)
57
- else:
58
- clean_txt = clean_text(inp_text)
59
 
60
- # view summarized text (expander)
61
  with st.expander("View Input Text"):
62
- if is_url:
63
- st.write(clean_txt[0])
64
  else:
65
  st.write(clean_txt)
 
66
  summarize = st.button("Summarize")
67
 
68
- # called on toggle button [summarize]
69
  if summarize:
70
- if summarize_type == "Extractive":
 
 
 
 
71
  if is_url:
72
- text_to_summarize = " ".join([txt for txt in clean_txt])
73
  else:
74
- text_to_summarize = clean_txt
75
- # extractive summarizer
76
-
77
- with st.spinner(
78
- text="Creating extractive summary. This might take a few seconds ..."
79
- ):
80
- ext_model = Summarizer()
81
- summarized_text = ext_model(text_to_summarize)
82
-
83
- elif summarize_type == "Abstractive":
84
- with st.spinner(
85
- text="Creating abstractive summary. This might take a few seconds ..."
86
- ):
87
- text_to_summarize = clean_txt
88
- abs_summarizer = pipeline(
89
- "summarization", model=abs_model_name, tokenizer=abs_tokenizer_name
90
- )
91
-
92
- if is_url is False:
93
- # list of chunks
94
- text_to_summarize = preprocess_text_for_abstractive_summarization(
95
- tokenizer=abs_tokenizer, text=clean_txt
96
  )
97
- tmp_sum = abs_summarizer(
98
- text_to_summarize,
99
- max_length=len(text_to_summarize),
100
- min_length=abs_min_length,
 
 
 
 
 
 
 
 
 
101
  do_sample=False,
102
  )
 
103
 
104
- summarized_text = " ".join([summ["summary_text"] for summ in tmp_sum])
105
 
106
- # final summarized output
107
  st.subheader("Summarized text")
108
  st.info(summarized_text)
 
3
  import streamlit as st
4
  from transformers import AutoTokenizer, pipeline
5
 
 
 
6
  from utils import (
7
  clean_text,
8
  fetch_article_text,
 
10
  read_text_from_file,
11
  )
12
 
13
+ ABS_TOKENIZER_NAME = "facebook/bart-large-cnn"
14
+ ABS_MODEL_NAME = "facebook/bart-large-cnn"
15
+ ABS_MIN_LENGTH = 30
16
+ ABS_MAX_LENGTH = 130
17
+
18
+
19
+ @st.cache_resource
20
+ def load_tokenizer():
21
+ return AutoTokenizer.from_pretrained(ABS_TOKENIZER_NAME)
22
+
23
+
24
+ @st.cache_resource
25
+ def load_summarizer():
26
+ return pipeline(
27
+ "summarization",
28
+ model=ABS_MODEL_NAME,
29
+ tokenizer=ABS_TOKENIZER_NAME,
30
+ )
31
+
32
+
33
+ def normalize_input_text(inp_text, uploaded_file):
34
+ is_url = bool(inp_text and validators.url(inp_text))
35
+
36
+ if is_url:
37
+ _, clean_txt = fetch_article_text(url=inp_text)
38
+ elif uploaded_file:
39
+ clean_txt = read_text_from_file(uploaded_file)
40
+ clean_txt = clean_text(clean_txt)
41
+ else:
42
+ clean_txt = clean_text(inp_text)
43
+
44
+ return is_url, clean_txt
45
+
46
+
47
  if __name__ == "__main__":
48
+ st.set_page_config(page_title="Text Summarization Tool", page_icon="📝")
 
 
49
  st.title("Text Summarization Tool 📝")
50
 
51
  st.markdown("---")
 
 
 
52
  st.markdown(
53
+ """
54
+ This app creates **abstractive summaries** using a Hugging Face Transformers summarization pipeline.
55
 
56
+ - Paste text
57
+ - Enter a URL
58
+ - Or upload a `.txt`, `.pdf`, or `.docx` file
59
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  )
61
+
62
+ nltk.download("punkt", quiet=True)
63
+
64
+ abs_tokenizer = load_tokenizer()
65
+ abs_summarizer = load_summarizer()
66
+
67
+ inp_text = st.text_input("Enter text or a URL here")
68
+ st.markdown("<h3 style='text-align: center;'>OR</h3>", unsafe_allow_html=True)
69
  uploaded_file = st.file_uploader(
70
  "Upload a .txt, .pdf, .docx file for summarization"
71
  )
72
 
73
+ is_url, clean_txt = normalize_input_text(inp_text, uploaded_file)
 
 
 
 
 
 
 
 
74
 
 
75
  with st.expander("View Input Text"):
76
+ if isinstance(clean_txt, list):
77
+ st.write(" ".join(clean_txt))
78
  else:
79
  st.write(clean_txt)
80
+
81
  summarize = st.button("Summarize")
82
 
 
83
  if summarize:
84
+ if not clean_txt:
85
+ st.warning("Please enter text, a URL, or upload a file.")
86
+ st.stop()
87
+
88
+ with st.spinner("Creating summary. This might take a few seconds..."):
89
  if is_url:
90
+ text_chunks = clean_txt if isinstance(clean_txt, list) else [clean_txt]
91
  else:
92
+ if isinstance(clean_txt, list):
93
+ text_chunks = clean_txt
94
+ else:
95
+ text_chunks = preprocess_text_for_abstractive_summarization(
96
+ tokenizer=abs_tokenizer,
97
+ text=clean_txt,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  )
99
+
100
+ if isinstance(text_chunks, str):
101
+ text_chunks = [text_chunks]
102
+
103
+ summaries = []
104
+ for chunk in text_chunks:
105
+ if not chunk or not chunk.strip():
106
+ continue
107
+
108
+ result = abs_summarizer(
109
+ chunk,
110
+ max_length=ABS_MAX_LENGTH,
111
+ min_length=ABS_MIN_LENGTH,
112
  do_sample=False,
113
  )
114
+ summaries.append(result[0]["summary_text"])
115
 
116
+ summarized_text = " ".join(summaries)
117
 
 
118
  st.subheader("Summarized text")
119
  st.info(summarized_text)