-
Notifications
You must be signed in to change notification settings - Fork 1
/
scrape_mars.py
141 lines (113 loc) · 4.21 KB
/
scrape_mars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
from bs4 import BeautifulSoup
import pandas as pd
import time
from splinter import Browser
import requests
def init_browser(executable_path='/usr/local/bin/chromedriver', *, headless=False):
    """Create and return a splinter Chrome ``Browser``.

    Args:
        executable_path: Filesystem path to the chromedriver binary that is
            handed to splinter. Defaults to the location this module has
            always used.
        headless: Forwarded to splinter's ``headless`` option. Defaults to
            ``False``, matching the previous hard-coded value.

    Returns:
        The ``Browser`` instance; the caller is responsible for calling
        ``quit()`` on it when finished.
    """
    return Browser('chrome', executable_path=executable_path, headless=headless)
def scrape():
    """Scrape the latest Mars data from several public web pages.

    A browser is opened with ``init_browser()``, driven through each source
    page in turn, and always closed again before this function exits, even
    when one of the scraping steps raises.

    Returns:
        dict: A dictionary with these keys:

            * ``"mars_news"``: dict with ``"news_title"`` and ``"news_p"``
              taken from the first ``list_text`` entry on the news page.
            * ``"mars_img"``: absolute URL of the featured image.
            * ``"mars_weather"``: text of the first tweet span mentioning
              "InSight", or ``None`` when no such span is found.
            * ``"mars_fact"``: HTML string of the facts table.
            * ``"mars_hemisphere"``: list of ``{"title", "img_url"}`` dicts,
              one per hemisphere result.
    """
    browser = init_browser()
    try:
        news_title, news_p = _scrape_news(browser)
        featured_image_url = _scrape_featured_image(browser)
        mars_weather = _scrape_weather(browser)
        mars_df = _scrape_facts()
        hemisphere_image_urls = _scrape_hemispheres(browser)

        return {
            "mars_news": {
                "news_title": news_title,
                "news_p": news_p,
            },
            "mars_img": featured_image_url,
            "mars_weather": mars_weather,
            "mars_fact": mars_df,
            "mars_hemisphere": hemisphere_image_urls,
        }
    finally:
        # Close the browser on every exit path so a failed scrape does not
        # leave a Chrome process behind.
        browser.quit()


def _visit_and_parse(browser, url, wait_seconds=4):
    """Open *url* in *browser*, pause *wait_seconds*, and return the parsed page."""
    browser.visit(url)
    # Fixed pause before reading the HTML (presumably to let the page finish
    # loading its scripted content).
    time.sleep(wait_seconds)
    return BeautifulSoup(browser.html, 'html.parser')


def _scrape_news(browser):
    """Return ``(title, teaser)`` of the first article listed on the NASA Mars news page."""
    soup = _visit_and_parse(browser, 'https://mars.nasa.gov/news/')
    latest = soup.find('div', class_="list_text")
    title = latest.a.text
    teaser = latest.find('div', class_="article_teaser_body").text
    return title, teaser


def _scrape_featured_image(browser):
    """Return the absolute URL of the featured image on the JPL space images page."""
    soup = _visit_and_parse(
        browser, 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    )
    # The image path is embedded in the carousel item's inline CSS
    # (background-image: url('...');), so strip the CSS wrapper around it.
    style = soup.find('article', class_="carousel_item")['style']
    relative_url = style.replace("background-image: url('", "").replace("');", "")
    return f"https://www.jpl.nasa.gov{relative_url}"


def _scrape_weather(browser):
    """Return the text of the first tweet span mentioning "InSight", or ``None``."""
    soup = _visit_and_parse(browser, 'https://twitter.com/marswxreport?lang=en')
    span_class = "css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0"
    spans = soup.body.find_all("span", class_=span_class)
    # Fall back to None instead of leaving the value undefined when the page
    # contains no matching span.
    return next((span.text for span in spans if "InSight" in span.text), None)


def _scrape_facts():
    """Return the Mars facts table from space-facts.com rendered as an HTML string."""
    from io import StringIO

    facts_url = 'https://space-facts.com/mars/'
    response = requests.get(facts_url, timeout=30)
    # Wrap the markup in StringIO: recent pandas versions deprecate passing
    # literal HTML strings to read_html.
    tables = pd.read_html(StringIO(response.text))
    # The table at index 2 is the one used as the planet facts table.
    facts = tables[2]
    facts.columns = ["Description", "Value"]
    return facts.set_index("Description").to_html(border="1", justify="left")


def _scrape_hemispheres(browser):
    """Return a list of ``{"title", "img_url"}`` dicts, one per hemisphere search result."""
    base_url = "https://astrogeology.usgs.gov"
    search_url = (
        'https://astrogeology.usgs.gov/search/results'
        '?q=hemisphere+enhanced&k1=target&v1=Mars'
    )
    listing = _visit_and_parse(browser, search_url)

    hemispheres = []
    for item in listing.find_all("div", class_="item"):
        # Each search result links to a detail page holding the wide image.
        detail_url = f"{base_url}{item.find('a', class_='itemLink')['href']}"
        detail = _visit_and_parse(browser, detail_url)
        img_src = detail.find('img', class_="wide-image")['src']
        title = detail.find('h2', class_="title").text
        hemispheres.append({"title": title, "img_url": f"{base_url}{img_src}"})
    return hemispheres