forked from Allianzcortex/code_collection
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuild_search_engine.py
128 lines (100 loc) · 3.79 KB
/
build_search_engine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
from __future__ import print_function
# -*- coding:utf-8 -*-
#!/usr/bin/env python
"""
This script implements a simple search engine.
It is deliberately minimal: an MVP (Minimum Viable Product).
A real search engine would record how often each keyword occurs and rank the results by that priority.
In this case I leave that out and return all matching results in their natural order.
I build the data from IMDB movie webpages.
There is no ready-made dataset, so I must crawl it from scratch.
I use NLP to remove the stop words and index the remaining words.
I implemented all of the code myself; not a single line is copy-pasted from the Internet.
schedule_spider() -> crawl_page() -> nlp_analyze() -> build index()
⬆ keyword
search
"""
import re
from collections import deque
import requests
import nltk
from nltk.corpus import stopwords
# English stop words from the NLTK corpus, kept in a set for fast membership
# tests. NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# downloaded beforehand -- confirm for the target environment.
stop_words = set(stopwords.words('english'))
# Inverted index: keyword -> list of page titles recorded for that keyword.
# Filled by build_index() and read by search().
frequence_dict = {}
# Queue of URLs waiting to be crawled. It is consumed from the left (FIFO),
# which gives a breadth-first crawl; use a stack instead for a depth-first one.
webpage_list = deque()
# Page budget for one crawl; schdule_spider() checks its page counter against it.
crawl_web_page_num = 4
# Regexes that pull the indexed fields out of the raw IMDB HTML.
# They are written as raw strings so that regex escapes such as \d are not
# interpreted (and warned about) as Python string escapes.
# Page summary, taken from the <meta name="description"> tag.
content_pattern = re.compile(r'<meta name="description" content="(.*?)" />')
# Movie title, taken from the og:title meta tag.
title_pattern = re.compile(r'''<meta property='og:title' content="(.*?)" />''')
# Numeric id of every linked title page. The lazy ".*?" stops at the closing
# quote of each href, so several links on the same line are all captured.
url_pattern = re.compile(r' <a href="/title/tt(\d+)/.*?"')
def crawl_page(url):
    """
    Crawl a single webpage: queue its related links and index its description.

    Parsing HTML with regexes is only good enough for a simple demo;
    beautifulsoup/lxml would be the better choice.

    Up to five title links found on the page are appended to the global
    ``webpage_list`` so they can be crawled later. When the page exposes both
    a title and a description meta tag they are handed to ``nlp_analyze``;
    otherwise the page is reported and left out of the index.

    @param: url webpage link
    """
    # A timeout keeps one unresponsive server from hanging the whole crawl.
    content = requests.get(url, timeout=10).text

    # Always queue the new urls first so they can be retrieved later, even
    # when this page itself turns out not to be indexable.
    for new_url in url_pattern.findall(content)[:5]:
        webpage_list.append("https://www.imdb.com/title/tt{}/".format(new_url))

    title_match = title_pattern.search(content)
    description_match = content_pattern.search(content)
    if title_match is None or description_match is None:
        # Without this guard a page lacking the expected meta tags (error
        # page, changed layout) would raise IndexError and abort the crawl.
        print("Skip {} : title or description not found".format(url))
        return

    nlp_analyze(description_match.group(1), title_match.group(1))
def nlp_analyze(content, title):
    """
    Tokenize a page description and index its meaningful words.

    Tokens shorter than three characters and stop words are dropped; the
    remaining tokens are passed, together with the title, to ``build_index``.

    @param: content description text of the page
    @param: title page title the kept keywords are recorded under
    """
    tokens = nltk.word_tokenize(content)
    # The NLTK stop-word list is lower-case, so compare case-insensitively;
    # otherwise capitalised stop words ("The", "And") slip into the index.
    # The token itself is indexed with its original casing.
    keywords = [
        token for token in tokens
        if len(token) >= 3 and token.lower() not in stop_words
    ]
    build_index(keywords, title)
def schdule_spider(initial_url):
    """
    Drive the crawl: start at the seed and crawl at most
    ``crawl_web_page_num`` distinct pages.

    URLs are taken from the left of the global ``webpage_list`` queue
    (breadth-first); ``crawl_page`` appends newly discovered links to it.
    The crawl ends when the page budget is used up or the queue runs dry.

    @initial_url : seed url
    """
    webpage_list.append(initial_url)
    visited = set()
    index = 0
    # Stop at the page budget, or earlier when nothing is left to crawl.
    while webpage_list and index < crawl_web_page_num:
        # get the oldest queued url
        url = webpage_list.popleft()
        if url in visited:
            # Pages link back to each other; crawling one twice would only
            # repeat its title in the index and waste the page budget.
            continue
        visited.add(url)
        print("-------------Begin to Process {} webpage-------------".format(index+1))
        crawl_page(url)
        index += 1
def build_index(tags, title):
    """
    Record ``title`` under every keyword of ``tags`` in the inverted index.

    The index is the global ``frequence_dict`` mapping keyword -> list of
    titles. A title is listed at most once per keyword, even when the keyword
    occurs several times in ``tags``, so search results hold no duplicates.

    @param: tags keywords extracted from one page
    @param: title title of that page
    """
    for tag in tags:
        titles = frequence_dict.setdefault(tag, [])
        # Occurrence counts are deliberately not tracked, so a repeated
        # keyword must not add the same title a second time.
        if title not in titles:
            titles.append(title)
def search():
    """
    Interactive lookup loop over the inverted index.

    Prints the whole index once, then keeps prompting for a keyword and
    prints the titles stored for it (or a "no result" notice) until the user
    enters q.
    """
    print("Get Search Result is :")
    print(frequence_dict)
    print("-------------------------------------------------------")

    prompt = "Please enter the keyword , press q to quit\n"
    keyword = input(prompt)
    while keyword != 'q':
        # exact-match lookup: the keyword must equal an indexed token
        print(frequence_dict[keyword] if keyword in frequence_dict else "No Search result")
        keyword = input(prompt)
if __name__ == '__main__':
    # Seed page of the crawl: the IMDB entry of <The Rock>, the first action
    # movie the author watched. Point it at another title page to build a
    # different index.
    seed_url = "https://www.imdb.com/title/tt0117500/"
    schdule_spider(seed_url)
    search()