run_this_program.py
import requests
from bs4 import BeautifulSoup
import gpt_functions as gpt
from text_cleaner import truncate_text
import web_scraper as scrape
import time
import pandas as pd
import sys
import math
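
# Local helper modules, judging by how they are used below:
#   gpt_functions - long_describe(), short_describe(), link_describe(), and a paid_account flag
#   text_cleaner  - truncate_text()
#   web_scraper   - extract_website_info()
# (requests and BeautifulSoup are not called directly in this script; presumably they are used inside those helpers.)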

# Initialise the websites list.
websites = []
loop = True
"""remove_nan_from_end removes nan values from the end of a list"""
def remove_nan_from_end(lst):
reversed_lst = list(reversed(lst))
result = []
found_non_nan = False
for item in reversed_lst:
try:
if math.isnan(item) and not found_non_nan:
continue
except TypeError:
pass
found_non_nan = True
result.append(item)
return list(reversed(result))
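
# Example (illustrative): remove_nan_from_end(["a", "b", float("nan")]) returns ["a", "b"].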
"""recover_descriptions is used to recover the descriptions and bad urls generated this session"""
def recover_descriptions():
padding_length = len(my_csv) - len(long_desc_col)
padded_values = long_desc_col + [''] * padding_length
my_csv['Long Description'] = padded_values
padding_length = len(my_csv) - len(short_desc_col)
padded_values = short_desc_col + [''] * padding_length
my_csv['Short Description'] = padded_values
padding_length = len(my_csv) - len(shortest_desc_col)
padded_values = shortest_desc_col + [''] * padding_length
my_csv['Rollover Description'] = padded_values
my_csv.to_csv(filename, index=False)
all_denied_urls_csv = pd.DataFrame({'Denied URLs': all_denied_urls})
all_denied_urls_csv.to_csv("denied_urls.csv", index=False)
print("Descriptions recovered successfully.")
"""save_descriptions writes the generated descriptions to the csv file"""
def save_descriptions():
padding_length = len(my_csv) - len(long_desc_col)
padded_values = long_desc_col + [''] * padding_length
my_csv['Long Description'] = padded_values
padding_length = len(my_csv) - len(short_desc_col)
padded_values = short_desc_col + [''] * padding_length
my_csv['Short Description'] = padded_values
padding_length = len(my_csv) - len(shortest_desc_col)
padded_values = shortest_desc_col + [''] * padding_length
my_csv['Rollover Description'] = padded_values
my_csv.to_csv(filename, index=False)
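
# Note: each description column is padded with blanks to the full length of the CSV,
# so a partially processed file can be saved and later resumed from the last
# non-empty 'Rollover Description' row (see the resume logic further down).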

# Collect the CSV file name to read from/write to via user input.
while loop:
    print("Please input the name of your CSV file:")
    filename = input("")
    try:
        my_csv = pd.read_csv(filename)
        loop = False
    except Exception:
        print("Error: Failed to read file. Please note:")
        print("- The file should be a .csv file in the same directory as this program.")
        print("- Your input should not be enclosed in quotation marks.")
        print("- You should include the .csv extension at the end of your file name.")
        print()
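
# Based on the column lookups below, the input CSV is expected to contain at least:
# 'Tool URL', 'Long Description', 'Short Description', 'Rollover Description'.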

# Check for existing descriptions.
last_valid_index = my_csv['Rollover Description'].last_valid_index()
if last_valid_index is None:
    # If there are none, start fresh with empty description lists.
    start_index = 0
    long_desc_col = []
    short_desc_col = []
    shortest_desc_col = []
else:
    # If there are existing descriptions, resume after the last one.
    start_index = last_valid_index + 1
    # Remove NaN values from the ends of the description columns
    # and turn the results into lists.
    long_desc_col = remove_nan_from_end(my_csv['Long Description'].tolist())
    short_desc_col = remove_nan_from_end(my_csv['Short Description'].tolist())
    shortest_desc_col = remove_nan_from_end(my_csv['Rollover Description'].tolist())

# The list of URLs begins where we left off last time;
# for a new file it starts at the beginning.
websites = my_csv['Tool URL'][start_index:].tolist()
"""The following code handles the description generation and the writing
of the csv files"""
#Initialise variables for time elapsed
length = len(websites)
iterator = 0
overall_start = time.perf_counter()
#initialise list of denied URLS
denied_urls_csv = pd.read_csv("denied_urls.csv")
all_denied_urls = denied_urls_csv["Denied URLs"].tolist()
#If GPT response contains these, it means it can't generate a description.
bad_phrases = ["I'm sorry", "I apologize", "JavaScript"]

# Iterate through the list of websites and generate three descriptions per URL.
for website in websites:
    print("Accessing", website, "...")
    print()
    start = time.perf_counter()  # start the timer for this iteration
    try:
        # Attempt to scrape the website for text.
        page_text = scrape.extract_website_info(website)
    except Exception:
        # Scraping failed: record the URL as denied and leave its descriptions blank.
        iterator += 1
        all_denied_urls.append(website)
        all_denied_urls_csv = pd.DataFrame({'Denied URLs': all_denied_urls})
        all_denied_urls_csv.to_csv("denied_urls.csv", index=False)
        long_desc_col.append("")  # add blanks to the description lists
        short_desc_col.append("")
        shortest_desc_col.append("")
        save_descriptions()
        print("Unable to generate description. Website added to denied_urls.csv.")
        print(">>>>>>>>>", iterator, "out of", length, "websites complete.")
        print()
        continue

    # Shorten to 600 words to avoid the token limit.
    page_text = truncate_text(page_text)

    # Ask GPT for descriptions of the resource.
    try:
        long_desc = gpt.long_describe(page_text)
        short_desc = gpt.short_describe(page_text)
        shortest_desc = gpt.link_describe(page_text)
    except Exception as e:
        # If the rate limit has been exceeded (or another API error occurs), stop executing.
        print(f"Error: {str(e)}")
        print()
        recover_descriptions()
        print("Process exited. You may now close the program.")
        sys.exit()

    iterator += 1
    end = time.perf_counter()  # end the timer for this iteration
    elapsed = round(end - start)
    if (not gpt.paid_account) and elapsed < 62:
        naptime = 62 - elapsed  # calculate how long to back off for
    else:
        naptime = False

    # Detect websites that are down or cannot be scraped/described.
    found_phrases = False
    for phrase in bad_phrases:
        if phrase in long_desc or phrase in short_desc or phrase in shortest_desc:
            found_phrases = True
            break
    if "github.com" in website:  # GPT struggles to describe scraped GitHub pages accurately
        found_phrases = True

    if found_phrases:
        # Add the URL to the list of denied URLs.
        all_denied_urls.append(website)
        all_denied_urls_csv = pd.DataFrame({'Denied URLs': all_denied_urls})
        all_denied_urls_csv.to_csv("denied_urls.csv", index=False)
        long_desc_col.append("")  # add blanks to the description lists
        short_desc_col.append("")
        shortest_desc_col.append("")
        save_descriptions()
        print("Unable to generate description. URL added to denied_urls.csv.")
    else:
        # Otherwise add the descriptions to the lists and report them.
        long_desc_col.append(long_desc)
        short_desc_col.append(short_desc)
        shortest_desc_col.append(shortest_desc)
        print("********LONG DESCRIPTION********")
        print(long_desc)
        print("********SHORT DESCRIPTION********")
        print(short_desc)
        print("********ROLLOVER DESCRIPTION********")
        print(shortest_desc)
        print()
        save_descriptions()
        print("Descriptions saved to", filename)
print(">>>>>>>>>", iterator, " out of", length, "websites complete.")
print("Time elapsed:", elapsed, "seconds.")
#Rate limit is 3 requests per min, so we need to back off until a minute has elapsed
if (iterator < length) and naptime:
print("Cooling off for", naptime, "seconds...")
time.sleep(naptime)
print()

overall_end = time.perf_counter()  # stop the timer
overall_elapsed = round(overall_end - overall_start)  # calculate overall time elapsed
print()
print("Process complete. Total time elapsed:", overall_elapsed, "seconds.")
#Hooray!
print("You may now close the program.")