-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathscript.py
59 lines (50 loc) · 1.91 KB
/
script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#imports
import requests
from bs4 import BeautifulSoup
import pandas
import re
#variables
# Titles collected so far: get_original_title() appends to this list and
# main() writes it out as the "Title" column of the CSV.
movies_originals = []
# Path of the generated CSV file, relative to the current working directory.
csv_file = "./filmes.csv"
#functions
def read_links(soup):
    """Collect the movie links found in a parsed listing page.

    Only anchors carrying a ``data-movie-pk`` attribute are treated as movie
    links. Movie anchors without an ``href`` are skipped, since they would
    otherwise be returned as ``None`` entries.

    Args:
        soup: Parsed document exposing ``find_all("a")``; every element it
            returns must support ``.get(attribute_name)``.

    Returns:
        A list with the ``href`` value of each movie anchor, in the order the
        anchors were returned by ``find_all``.
    """
    links = []
    for anchor in soup.find_all("a"):
        if anchor.get('data-movie-pk') is None:
            continue
        href = anchor.get('href')
        if href is not None:
            links.append(href)
    return links
def get_original_title(links):
    """Fetch each movie page and record the title to export for it.

    For every link the movie page is downloaded and parsed. For each
    ``h2.movie-original-title`` heading found on the page:

    * if the heading text starts with a non-ASCII character, the entries
      under ``div.movie-other-titles`` are scanned and the title of every
      entry labelled "Estados Unidos da América" is recorded instead;
    * otherwise the heading text itself is recorded.

    Titles are appended to the module-level ``movies_originals`` list.

    Args:
        links: Iterable of site-relative movie paths; each one is appended to
            ``https://filmow.com`` to build the request URL.

    Returns:
        None.
    """
    for link in links:
        # A timeout keeps one stalled connection from hanging the whole run.
        html_doc = requests.get('https://filmow.com%s' % link, timeout=30)
        print("getting original title from link %s" % link)
        soup = BeautifulSoup(html_doc.text, 'html.parser')
        for heading in soup.find_all("h2", class_="movie-original-title"):
            original_title = heading.get_text()
            # re.match anchors at the start of the string, so only the first
            # character decides which branch is taken.
            if not re.match("[^\x00-\x7F]", original_title):
                movies_originals.append(original_title)
                continue
            # NOTE(review): when no US entry is listed, nothing is recorded
            # for this heading -- confirm that dropping the movie is intended.
            for titles_box in soup.find_all('div', class_="movie-other-titles"):
                for item in titles_box.find_all('li'):
                    # Entries lacking <em> or <strong> cannot be matched; skip
                    # them instead of failing with AttributeError.
                    if item.em is None or item.strong is None:
                        continue
                    if item.em.get_text() == "Estados Unidos da América":
                        movies_originals.append(item.strong.get_text())
def read_movies(user):
    """Walk the user's "ja-vi" (already watched) pages and collect the titles.

    Pages are requested in order starting at page 1. The walk stops at the
    first response whose ``ok`` flag is false. Each page is downloaded once:
    the same response is used both to decide whether to continue and to
    extract the movie links.

    Args:
        user: Filmow username, inserted as-is into the listing URL.

    Returns:
        None. Titles are recorded by ``get_original_title``.
    """
    page_url = 'https://filmow.com/usuario/%s/filmes/ja-vi/?pagina=%d'
    i = 1
    while True:
        # A timeout keeps one stalled connection from hanging the whole run.
        html_doc = requests.get(page_url % (user, i), timeout=30)
        if not html_doc.ok:
            break
        print("reading page %d" % i)
        i = i + 1
        soup = BeautifulSoup(html_doc.text, 'html.parser')
        links = read_links(soup)
        get_original_title(links)
def get_info():
    """Ask for the Filmow username and announce where the CSV will be saved.

    Returns:
        The username exactly as typed at the prompt.
    """
    username = input("Digite o seu nome de usuário no filmow: ")
    # sep="" reproduces the plain concatenation of the three pieces.
    print(
        "OK! A busca será feita no usuário ",
        username,
        ". Seu csv será salvo no diretório atual com o nome filmes.csv",
        sep="",
    )
    return username
def main():
    """Run the export: ask for the user, scrape the titles, write the CSV."""
    read_movies(get_info())
    # Write the collected titles as a single "Title" column, without the index.
    frame = pandas.DataFrame({"Title": movies_originals})
    frame.to_csv(csv_file, sep=',', index=False)
# Start the export only when this file is executed as a script.
if __name__ == "__main__":
    main()