# crawler.py
# Interactive page fetcher: reads URLs from stdin, downloads them, and
# searches the first downloaded page for a word.
import requests
def url_input():
    """Interactively collect URLs from standard input.

    Prompts for how many URLs will be entered, then prompts once per URL.
    A count that is not a whole number is rejected and the count prompt is
    repeated instead of aborting with ``ValueError``.

    Returns:
        list[str]: The entered URLs, in the order typed, with surrounding
        whitespace removed. Empty when the count is zero or negative.
    """
    while True:
        answer = input("Mention the no of inputs:")
        try:
            no_of_inputs = int(answer)
        except ValueError:
            print("Please enter a whole number.")
        else:
            break

    # range() of a non-positive count is empty, so no URL prompt is shown.
    return [input("Mention the url:").strip() for _ in range(no_of_inputs)]
def html_extractor(url_list, timeout=30):
    """Download every URL and return the raw response bodies.

    Args:
        url_list: Iterable of URL strings; each is fetched with an HTTP GET.
        timeout: Seconds ``requests`` waits on each request before raising
            ``requests.exceptions.Timeout``. Pass ``None`` to wait
            indefinitely (the behavior before this parameter existed).

    Returns:
        list: ``response.content`` (bytes) for each URL, in input order.
        Empty when ``url_list`` is empty.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests.get`` (connection failure, invalid URL, timeout).
    """
    # The same cookie is sent with every request, so build the dict once
    # instead of once per URL.
    # NOTE(review): this cookie goes to whatever URL the user typed; the
    # value looks like a placeholder -- confirm before real use.
    session_cookies = {'Sessionid': '1...'}
    return [
        requests.get(url, cookies=session_cookies, timeout=timeout).content
        for url in url_list
    ]
# Script driver: ask for the URLs, download each one, then echo the result.
url_list = url_input()
content_l = html_extractor(url_list)
# NOTE(review): the slice applies to the list of pages (up to 300 pages are
# printed in full), not to the page text -- confirm that a 300-character
# preview was not what was intended.
print(content_l[:300])
def word_finder(content_l, word=None):
    """Locate every occurrence of a search word in the first downloaded page.

    Args:
        content_l: Sequence of page bodies (``bytes`` as stored from
            ``response.content``; ``str`` entries are accepted as-is).
            Only the first entry is searched.
        word: Text to look for. When ``None`` (the default) the user is
            prompted with ``"Search word:"``.

    Returns:
        list[int]: Zero-based offsets at which ``word`` starts, ascending.
        Overlapping matches are all reported. Empty when there are no
        pages, the word is empty, or nothing matches.
    """
    if word is None:
        word = input("Search word:")
    word = str(word)

    # An empty page list has nothing to search; an empty word would match
    # at every offset, which is never a useful answer.
    if not content_l or not word:
        return []

    page = content_l[0]
    if isinstance(page, (bytes, bytearray)):
        # Latin-1 maps each byte to exactly one character, so decoding
        # cannot fail and the offsets equal byte offsets in the raw page.
        page = page.decode('Latin-1')

    pos_l = []
    hit = page.find(word)
    while hit != -1:
        pos_l.append(hit)
        # Resume one character later so overlapping matches are found too.
        hit = page.find(word, hit + 1)
    return pos_l
# Run the search over the downloaded pages and show the positions returned.
match_positions = word_finder(content_l)
print(match_positions)