web.py
import streamlit as st
import pandas as pd
import pickle
import re
import string
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
stopwd = stopwords.words('english')


def clean_text(text):
    text = text.lower()  # Lowercase the text
    text = re.sub('-', ' ', text)  # Replace `x-x` with `x x`
    text = re.sub(r'http\S+', '', text)  # Remove links
    text = re.sub(f'[{string.punctuation}]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Collapse extra whitespace
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)  # Remove stray single characters
    words = nltk.tokenize.word_tokenize(
        text, language="english", preserve_line=True)
    # Remove stop words and tokens shorter than three characters
    text = " ".join([i for i in words if i not in stopwd and len(i) > 2])
    return text.strip()
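
# Example of the cleaning pipeline above:
#   clean_text("Visit http://spam.com - WIN a FREE prize!!!")  ->  "visit win free prize"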


def load_vectorizer():
    return pickle.load(open('vectorizer.pkl', 'rb'))
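
# Note: vectorizer.pkl is assumed to be the text vectorizer (e.g. TF-IDF or
# CountVectorizer) fitted during training, so its transform() output matches
# the feature space the pickled models expect.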


# Streamlit UI
st.title("Email Spam Detection App")
st.caption("This app detects whether an email is spam or not.")
st.divider()

# Accuracies of models
showbtn = st.checkbox("Show model accuracies", value=True)
if showbtn:
    st.markdown("## Model Accuracies")
    st.write(pickle.load(open('scores.pkl', 'rb')))
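
# Note: scores.pkl is assumed to hold the accuracy of each model as recorded
# during training (e.g. a dict or DataFrame keyed by model name).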

# Load models
model_files = ["LogisticRegression.pkl", "RandomForestClassifier.pkl",
               "SVC.pkl", "ComplementNB.pkl"]
model_options = {model_file.split('.')[0]: model_file
                 for model_file in model_files}
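# Resulting mapping of display name -> pickle file, e.g.
#   {"LogisticRegression": "LogisticRegression.pkl", ..., "ComplementNB": "ComplementNB.pkl"}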
selected_model = st.selectbox("Select Model", list(model_options.keys()))

with open(model_options[selected_model], 'rb') as model_file:
    model = pickle.load(model_file)


# Function to predict spam or not
def predict_spam(input_text):
    input_text = clean_text(input_text)
    vectorizer = load_vectorizer()
    input_text = vectorizer.transform([input_text])
    prediction = model.predict(input_text)
    return "Spam" if prediction[0] == 1 else "Not Spam"


# Input text or file upload
input_type = st.radio("Select Input Type", ["Text", "CSV File"])

if input_type == "Text":
    user_input = st.text_area("Enter the email text:")
    if st.button("Predict"):
        result = predict_spam(user_input)
        st.markdown(f"## Prediction: {result}")
elif input_type == "CSV File":
    st.markdown("Note: :blue[The CSV file should have a column named 'Email']")
    uploaded_file = st.file_uploader("Upload the CSV file here", type=["csv"])
    if uploaded_file is not None:
        df = pd.read_csv(uploaded_file)
        try:
            df["Email"].head()
        except KeyError:
            st.error("The CSV file does not have a column named 'Email'")
            st.stop()
        with st.spinner("Predicting..."):
            predictions = df["Email"].apply(predict_spam)
            df["Prediction"] = predictions
        st.write("Predictions:")
        st.write(df)
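
# Expected CSV layout: an 'Email' column holding one message per row (any other
# columns are carried through to the output), e.g.
#   Email
#   "Congratulations, you won a $1000 gift card!"
#   "Lunch meeting moved to 1pm tomorrow"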

# About this project
st.divider()
st.markdown("## About This Project")
st.write(
    "This Streamlit app is designed for email spam detection using different machine learning models."
)
st.write(
    "It allows users to select a model, input email text or upload a CSV file with emails, and "
    "provides predictions on whether each email is spam or not."
)
st.caption("This project is made by: Pankil Soni")
st.write(
    "For more details, check the GitHub repository: "
    "[https://github.com/pankil-soni/email-spam-detector](https://github.com/pankil-soni/email-spam-detector)"
)