-
Notifications
You must be signed in to change notification settings - Fork 0
/
newWork.py
53 lines (38 loc) · 1.46 KB
/
newWork.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#for grabbing web details
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv
import pandas as pd
import numpy as np
def parsePage(pageNumber, query='data scientist'):
    """Scrape one page of Workana job-search results.

    Side effects only (no return value): appends each project's title to
    the module-level ``title_col`` list and each project's description to
    the module-level ``project_col`` list.

    Args:
        pageNumber: 1-based results-page number to fetch.
        query: search terms; defaults to the original hard-coded
            'data scientist' query, so existing callers are unaffected.
    """
    quote_page = ('https://www.workana.com/jobs?query=' + query.replace(' ', '+')
                  + '&publication=any&language=en&page=' + str(pageNumber))
    # Close the HTTP response deterministically instead of leaking the socket.
    with urlopen(quote_page) as page:
        soup = BeautifulSoup(page, 'html.parser')
    # Extract project titles.  The original called .encode('utf-8'), which
    # yields bytes objects under Python 3 and makes pandas write rows as
    # "b'...'" literals; keep the text as str instead.
    for heading in soup.findAll('h2', attrs={'class': 'h2 project-title'}):
        title_col.append(heading.text.strip())
    # Extract project descriptions.
    for detail in soup.findAll('div', attrs={'class': 'html-desc project-details'}):
        project_col.append(detail.text.strip())
# --- Pagination: discover how many result pages exist. ---
# Requesting an out-of-range page (100) still renders the pagination bar,
# whose second-to-last <a> holds the true last page number (the final <a>
# is the "next" arrow).
farPage = 'https://www.workana.com/jobs?query=data+scientist&publication=any&language=en&page=100'
# Close the HTTP response deterministically instead of leaking the socket.
with urlopen(farPage) as endPage:
    endPageSoup = BeautifulSoup(endPage, 'html.parser')
page_links = endPageSoup.find('ul', attrs={'class': 'pagination'}).findAll('a')
endPageN = int(page_links[-2].string)

# Accumulators filled by parsePage() as a side effect.
title_col = []
project_col = []
# Off-by-one fix: range(1, endPageN) skipped the final results page.
for i in range(1, endPageN + 1):
    parsePage(i)

# --- Persist results. ---
# One row per scraped project: "title,description".  The original passed
# title_col as DataFrame's positional *index* argument; building explicit
# columns and writing with index=False emits the same rows, more clearly.
df = pd.DataFrame({'title': title_col, 'project': project_col})
df.to_csv('Workana_1.csv', header=False, index=False)