forked from iSchool-597PR/2021Fall_finals
-
Notifications
You must be signed in to change notification settings - Fork 2
/
scraper.py
84 lines (72 loc) · 3.03 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
"""
Analysis of Unemployment Rate in the United States
Authors: Divyaang Agarwal, Ankita Khiratkar
"""
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
def scrape_table(i: int, path: str):
    """
    Scrape one of the state job-gains/job-losses tables from the BLS article
    and store it as a CSV file at the given path.

    :param i: table number specified on the website; used to scrape the particular table
              0: job gains table; 1: job losses table
    :param path: name of the csv file where scraped data is stored
    :return: None
    :raises requests.HTTPError: if the page request does not return a success status

    >>> scrape_table(0, './ScraperDoctestFiles/job_gains.csv')
    >>> if 'job_gains.csv' in os.listdir('./ScraperDoctestFiles/'): print('data scraped successfully')
    data scraped successfully
    >>> scrape_table(1, './ScraperDoctestFiles/job_losses.csv')
    >>> if 'job_losses.csv' in os.listdir('./ScraperDoctestFiles/'): print('data scraped successfully')
    data scraped successfully
    """
    # Calling a get request to the desired url
    response = requests.get(
        'https://www.bls.gov/opub/ted/2021/job-gains-and-losses-by-state-from-march-2019-to-march-2021.htm')
    # Fail fast on HTTP errors instead of crashing later with an opaque
    # IndexError when the expected tables are missing from the parsed page
    response.raise_for_status()
    data = BeautifulSoup(response.text, 'html.parser')
    # Extracting the required data: each <tr> becomes one row, and
    # stripped_strings yields the whitespace-trimmed cell texts in order
    tables = data.find_all(class_='article-table')
    table_data = [list(row.stripped_strings) for row in tables[i].find_all('tr')]
    # First scraped row is the header; the rest are data rows
    df = pd.DataFrame(data=table_data[1:],
                      columns=table_data[0])
    df.set_index('State', inplace=True)
    # Drop separator/footer rows that scrape as entirely empty
    df.dropna(how='all', inplace=True)
    # Storing the scraped data in a csv file
    df.to_csv(path)
def scrape_state_unemp(res_path: str):
    """
    Scrape the state unemployment-rates table from the BLS charts page and
    store it as a CSV file at the given path.

    :param res_path: name of the csv file where scraped data is stored
    :return: None
    :raises requests.HTTPError: if the page request does not return a success status

    >>> scrape_state_unemp('./ScraperDoctestFiles/state_unemployment_11_21.csv')
    >>> if 'state_unemployment_11_21.csv' in os.listdir('./ScraperDoctestFiles/'): print('data scraped successfully')
    data scraped successfully
    """
    # Calling a get request to the desired url
    response = requests.get(
        'https://www.bls.gov/charts/state-employment-and-unemployment/state-unemployment-rates-animated.htm')
    # Fail fast on HTTP errors instead of crashing later with an opaque
    # AttributeError when the expected table id is missing from the page
    response.raise_for_status()
    data = BeautifulSoup(response.text, 'html.parser')
    # Extracting the required data: each <tr> becomes one row, and
    # stripped_strings yields the whitespace-trimmed cell texts in order
    table = data.find(id='lau_rc_unmapanim')
    table_data = [list(row.stripped_strings) for row in table.find_all('tr')]
    # First scraped row is the header; the rest are data rows
    df = pd.DataFrame(data=table_data[1:],
                      columns=table_data[0])
    df.set_index('State', inplace=True)
    # Storing the scraped data in a csv file
    df.to_csv(res_path)