-
Notifications
You must be signed in to change notification settings - Fork 0
/
Extractive Text Summarization.py
125 lines (92 loc) · 4.62 KB
/
Extractive Text Summarization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import streamlit as st
from bs4 import BeautifulSoup
import requests
import re
from collections import Counter
from string import punctuation
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as stop_words
import pandas as pd
def tokenizer(s):
    """Split *s* into lowercase word tokens.

    The text is split on any run of whitespace (spaces, tabs, newlines), so
    repeated, leading or trailing whitespace never produces empty tokens.
    Punctuation attached to a word is kept as part of the token.

    Args:
        s: The text to tokenize.

    Returns:
        A list of lowercase tokens in their original order; an empty list
        when *s* contains no non-whitespace characters.
    """
    # str.split() without a separator already discards surrounding
    # whitespace, so no per-token strip() is needed.
    return [word.lower() for word in s.split()]
def sent_tokenizer(s):
    """Split *s* into sentences on full stops.

    Each piece between '.' characters is stripped of surrounding whitespace.
    Pieces that are empty after stripping (for example the text after the
    final period, or between consecutive periods) are dropped.

    Args:
        s: The text to split.

    Returns:
        A list of non-empty sentence strings, without their terminating
        period, in their original order.
    """
    stripped = (piece.strip() for piece in s.split('.'))
    return [sent for sent in stripped if sent]
def count_words(tokens, stopwords=None):
    """Count how often each meaningful token occurs.

    A token is skipped when it is a stop word, when it is empty, or when it
    consists only of punctuation characters (e.g. "-", "--", "...").

    Args:
        tokens: Iterable of tokens to count.
        stopwords: Optional container of tokens to ignore. When omitted, the
            module-level ``stop_words`` set imported at the top of the file
            is used.

    Returns:
        A dict mapping each counted token to its number of occurrences,
        ordered by first appearance.
    """
    if stopwords is None:
        stopwords = stop_words
    punct_chars = set(punctuation)

    def is_countable(token):
        # all() over an empty token is True, so empty strings are rejected
        # here together with punctuation-only tokens.
        if all(ch in punct_chars for ch in token):
            return False
        return token not in stopwords

    return dict(Counter(token for token in tokens if is_countable(token)))
def word_freq_distribution(word_counts):
    """Normalise word counts by the count of the most frequent word.

    Args:
        word_counts: Mapping of word to its occurrence count.

    Returns:
        A dict mapping each word to ``count / max_count``, so the most
        frequent word maps to 1.0. An empty mapping yields an empty dict.
    """
    # max() raises ValueError on an empty sequence; there is nothing to
    # normalise in that case.
    if not word_counts:
        return {}
    max_freq = max(word_counts.values())
    return {word: count / max_freq for word, count in word_counts.items()}
def score_sentences(sents, freq_dist, max_len=40):
    """Score each sentence by summing the frequencies of its words.

    A sentence is split on single spaces and each word is looked up in
    *freq_dist* in lowercase. Sentences with ``max_len`` or more words, and
    sentences in which no word is found in *freq_dist*, get no entry.

    Args:
        sents: Iterable of sentence strings.
        freq_dist: Mapping of lowercase word to its relative frequency.
        max_len: Sentences must have fewer than this many space-separated
            words to be scored.

    Returns:
        A dict mapping each scored sentence to the sum of the frequencies of
        its words. A sentence that occurs more than once in *sents* is
        scored once.
    """
    sent_scores = {}
    for sent in sents:
        # Results are keyed by sentence text; re-processing a repeated
        # sentence would add its score a second time.
        if sent in sent_scores:
            continue
        words = sent.split(' ')
        if len(words) >= max_len:
            continue
        matched = [
            freq_dist[word.lower()]
            for word in words
            if word.lower() in freq_dist
        ]
        if matched:
            sent_scores[sent] = sum(matched)
    return sent_scores
def summarize(sent_scores, k):
    """Build a summary from the *k* highest-scoring sentences.

    Args:
        sent_scores: Mapping of sentence text to its score.
        k: Number of top sentences to include.

    Returns:
        A ``(summary, scores)`` tuple. ``summary`` is the selected sentences,
        each stripped and terminated with a period, joined by single spaces
        in descending score order. ``scores`` is a list of
        ``(score, sentence)`` tuples in the same order.
    """
    ranked = Counter(sent_scores).most_common(k)
    pieces = []
    scores = []
    for sentence, score in ranked:
        pieces.append(sentence.strip() + '.')
        scores.append((score, sentence))
    return ' '.join(pieces), scores
# Page heading.
st.title('Extractive Text Summarization')
st.subheader('A simple news article text summarizer made from scratch')

# Sidebar: one markdown bullet per step of the summarization pipeline.
_SIDEBAR_STEPS = (
    "* Given an article's link and the number of sentences to be present in the summary as input, using BeautifulSoup library, scrape the text of the article which is called document. ",
    "* Tokenize the entire document into sentences and sentences into words. We need individual words in order to determine their relative frequency in the document, and assign a corresponding score; we need individual sentences to subsequently sum the scores of each word within in order to determine 'sentence importance'. ",
    "* Count the occurence of each word in the document. After finding the counts, we build a frequency distribution of words. To get the distribution, divide the occurrence of each word by the frequency of the most occurring word. ",
    "* Next, assign a score to the sentences by using the frequency distribution generated. This is simply summing up the scores of each word in a sentence. This function takes a max_len argument which sets a maximum length to sentences which are to be considered for use in the summarization. ",
    "* In the final step, based on the scores, select the top 'k' sentences that represent the summary of the article. ",
    "* Display the summary along with the top 'k' sentences and their sentence scores.",
)
st.sidebar.subheader('Working of the application')
for _step in _SIDEBAR_STEPS:
    st.sidebar.markdown(_step)

# User inputs: the article address and the desired summary length.
url = st.text_input('\nEnter URL of news article from thehindu.com: ')
no_of_sentences = st.number_input(
    'Choose the no. of sentences in the summary',
    min_value=1,
)
def _fetch_article_text(article_url):
    """Download an article page and return its cleaned body text.

    Args:
        article_url: Address of the article page to scrape.

    Returns:
        The text of every <p> element inside the article body container,
        with bracketed numeric markers such as "[3]" replaced by a space and
        all whitespace runs collapsed to single spaces. None when the page
        has no matching container.

    Raises:
        requests.RequestException: The page could not be downloaded or the
            server answered with an error status.
    """
    # Without a timeout a stalled server would hang the app indefinitely.
    response = requests.get(article_url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    content = soup.find('div', attrs={'id': re.compile('content-body-14269002-*')})
    if content is None:
        return None
    # The previous call passed recursive='False', a truthy string, so nested
    # <p> elements were always included; find_all's default keeps that.
    text = ''.join(p.text + ' ' for p in content.find_all('p'))
    text = re.sub(r'\[[0-9]*\]', ' ', text)
    return re.sub(r'\s+', ' ', text)


def _run_summarizer(article_url, sentence_count):
    """Scrape the article, summarise it and render the results on the page.

    Shows an error or warning message instead of raising when the article
    cannot be downloaded, its body cannot be located, or it has no words
    left to score.
    """
    try:
        text = _fetch_article_text(article_url)
    except requests.RequestException as exc:
        st.error('Could not download the article: ' + str(exc))
        return
    if text is None:
        st.error('Could not find the article body on this page.')
        return

    st.subheader('Original text: ')
    st.write(text)

    tokens = tokenizer(text)
    sents = sent_tokenizer(text)
    word_counts = count_words(tokens)
    if not word_counts:
        # The frequency distribution needs at least one counted word.
        st.warning('The article does not contain any text to summarise.')
        return
    freq_dist = word_freq_distribution(word_counts)
    sent_scores = score_sentences(sents, freq_dist)
    summary, summary_sent_scores = summarize(sent_scores, sentence_count)

    st.subheader('Summarised text: ')
    st.write(summary)

    subh = 'Summary sentence score for the top ' + str(sentence_count) + ' sentences: '
    st.subheader(subh)
    # summarize() yields (score, sentence) pairs; the table shows the
    # sentence first.
    data = [[sentence, score] for score, sentence in summary_sent_scores]
    df = pd.DataFrame(data, columns=['Sentence', 'Score'])
    st.table(df)


# Run the pipeline only once a URL is supplied and the button is pressed.
if url and no_of_sentences and st.button('Summarize'):
    _run_summarizer(url, no_of_sentences)

# Footer credit. NOTE(review): assumed to be shown unconditionally; the
# original indentation of this line is not visible - confirm.
st.info('An application made by Srinath K R.')