Skip to content

Commit

Permalink
Update GKE EOL scraping approach
Browse files Browse the repository at this point in the history
Use Beautiful soup to pull data and parse in the queries/web module

renamed queries/html to web (this could be used to pull other static web
content not related to GKE)

Fix: 328239034

Change-Id: I6bc5ab086a9a79f11262fa2368346163543192d8
GitOrigin-RevId: b4bd193daabc388bd46d6506f684d0a29b4d411e
  • Loading branch information
ebenezergraham authored and copybara-github committed Oct 1, 2024
1 parent d2b18a5 commit 2e96494
Show file tree
Hide file tree
Showing 17 changed files with 346 additions and 116 deletions.
6 changes: 0 additions & 6 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,6 @@ repos:
^gcpdiag/runbook/[^/]+/snapshots/|
^gcpdiag/runbook/[^/]+/(constants|command|flags|util).py|
^gcpdiag/runbook/(constants|command|flags|exceptions|report|util).py
- repo: local
hooks:
- id: gke-eol-file
name: Check that gke eol.yaml file is up to date and contains fresh EOL data
entry: ./bin/precommit-gke-eol-file.sh
language: script
- repo: local
hooks:
- id: todo-annotations
Expand Down
3 changes: 3 additions & 0 deletions bin/precommit-gke-eol-file.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#!/bin/sh

# WARNING: This script is deprecated and may be removed in a future release.
# Please use 'gcpdiag/queries/gke.get_release_schedule' instead.

# Checks if gcpdiag/lint/gke/eol.yaml file contains the up to date list of
# GKE Releases with corresponding EOL (end-of-life) dates

Expand Down
14 changes: 10 additions & 4 deletions gcpdiag/lint/datafusion/snapshots/WARN_2024_004.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,15 @@
projects/gcpdiag-datafusion1-aaaa/instances/my-instance/computeProfiles/dataproc
(No imageVersion defined)

- projects/gcpdiag-datafusion1-aaaa/locations/us-central1/instances/my-instance [ OK ]
projects/gcpdiag-datafusion1-aaaa/instances/my-instance/computeProfiles/namespace_profile
Datafusion version : 6.7.1
Dataproc version : 2.0
- projects/gcpdiag-datafusion1-aaaa/locations/us-central1/instances/my-instance [FAIL]
projects/gcpdiag-datafusion1-aaaa/instances/my-instance/computeProfiles/namespace_profile
Datafusion version : 6.7.1
Dataproc version : 2.0


The version of your Cloud Data Fusion environment might not be compatible
with the version of your Dataproc cluster from the corresponding compute
profiles.

https://gcpdiag.dev/rules/datafusion/WARN/2024_004

33 changes: 7 additions & 26 deletions gcpdiag/lint/gke/bp_2022_003_cluster_eol.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,9 @@
"""

from datetime import date, timedelta
from os.path import dirname
from typing import Dict

from boltons.iterutils import get_path
from yaml import safe_load

from gcpdiag import lint, models
from gcpdiag.queries import gke
Expand Down Expand Up @@ -64,14 +62,6 @@ def _estimate_oss_release_date(version: Version) -> date:
MINOR_RELEASE_PACE_IN_DAYS)


def _get_date(str_or_date) -> date:
# Handle incomplete dates in 'YYYY-MM' form
if str_or_date and isinstance(str_or_date,
str) and len(str_or_date) == len('YYYY-MM'):
return date.fromisoformat(f'{str_or_date}-15')
return str_or_date


def _estimate_gke_eol_date(version: Version, eol_schedule: Dict):
"""
Estimate End Of Life date for a given GKE version
Expand All @@ -83,12 +73,10 @@ def _estimate_gke_eol_date(version: Version, eol_schedule: Dict):

short_version = f'{version.major}.{version.minor}'

regular_release = _get_date(
get_path(eol_schedule, (short_version, 'regular_avail'), None))
rapid_release = _get_date(
get_path(eol_schedule, (short_version, 'rapid_avail'), None))
oss_release = _get_date(
get_path(eol_schedule, (short_version, 'oss_release'), None))
regular_release = get_path(eol_schedule, (short_version, 'regular_avail'),
None)
rapid_release = get_path(eol_schedule, (short_version, 'rapid_avail'), None)
oss_release = get_path(eol_schedule, (short_version, 'oss_release'), None)

if regular_release and regular_release != TBD:
return regular_release + timedelta(days=GKE_REGULAR_SUPPORT_PERIOD_IN_DAYS)
Expand Down Expand Up @@ -125,15 +113,15 @@ def _notification_required(version: Version, eol_schedule: Dict) -> bool:
# Update the EOL date in the `eol_schedule` dict
eol_schedule[short_version] = {'eol': eol_date, 'estimated': True}
else:
eol_date = _get_date(eol_schedule[short_version]['eol'])
eol_date = eol_schedule[short_version]['eol']

return date.today() > eol_date - timedelta(days=NOTIFY_PERIOD_IN_DAYS)


def _get_notification_msg(version: Version, eol_schedule: Dict) -> str:
short_version = f'{version.major}.{version.minor}'
msg = f'''GKE version {short_version}\n
scheduled end of life: {_get_date(eol_schedule[short_version]["eol"])}'''
scheduled end of life: {eol_schedule[short_version]["eol"]}'''
if 'estimated' in eol_schedule[short_version]:
msg += ' (estimation)'
return msg
Expand All @@ -144,14 +132,7 @@ def run_rule(context: models.Context, report: lint.LintReportRuleInterface):
if not clusters:
report.add_skipped(None, 'no clusters found')

# This file should be updated regularly by `eol_parser.sh` or by `make`:
# <root_prj_folder>$ make gke-eol-file
try:
with open(f'{dirname(__file__)}/eol.yaml', encoding='utf-8') as eol_file:
eol_schedule = safe_load(eol_file)
except OSError:
# Ignore absence of the file, estimations will be used
eol_schedule = {}
eol_schedule = gke.get_release_schedule()

for _, c in sorted(clusters.items()):
if c.release_channel:
Expand Down
38 changes: 0 additions & 38 deletions gcpdiag/lint/gke/eol.yaml

This file was deleted.

3 changes: 3 additions & 0 deletions gcpdiag/lint/gke/eol_parser.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#!/bin/bash

# WARNING: This script is deprecated and may be removed in a future release.
# Please use 'gcpdiag/queries/gke.get_release_schedule' instead.

# Parses the public GKE Schedule page and extracts EOL (end-of-life) dates
# for currently available GKE versions

Expand Down
3 changes: 2 additions & 1 deletion gcpdiag/lint/snapshot_test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,11 @@

from gcpdiag import lint, models
from gcpdiag.lint.output import terminal_output
from gcpdiag.queries import apis_stub, kubectl_stub
from gcpdiag.queries import apis_stub, kubectl_stub, web_stub
from gcpdiag.queries.generic_api.api_build import generic_api_stub


@mock.patch('gcpdiag.queries.web.get', new=web_stub.get)
@mock.patch('gcpdiag.queries.apis.get_api', new=apis_stub.get_api_stub)
@mock.patch('gcpdiag.queries.kubectl.verify_auth', new=kubectl_stub.verify_auth)
@mock.patch(
Expand Down
14 changes: 7 additions & 7 deletions gcpdiag/queries/datafusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
import requests

from gcpdiag import caching, config, models, utils
from gcpdiag.queries import apis, crm, html, network
from gcpdiag.queries import apis, crm, network, web
from gcpdiag.queries.generic_api.api_build import get_generic
from gcpdiag.utils import Version

Expand Down Expand Up @@ -228,9 +228,9 @@ def extract_support_datafusion_version() -> Dict[str, str]:
page_url = 'https://cloud.google.com/data-fusion/docs/support/version-support-policy'

try:
data_fusion_table = html.fetch_and_extract_table(page_url,
tag='h2',
tag_id='support_timelines')
data_fusion_table = web.fetch_and_extract_table(page_url,
tag='h2',
tag_id='support_timelines')
if data_fusion_table:
versions = []
support_end_dates = []
Expand Down Expand Up @@ -400,9 +400,9 @@ def extract_datafusion_dataproc_version() -> Dict[str, list[str]]:
page_url = 'https://cloud.google.com/data-fusion/docs/concepts/configure-clusters'

try:
table = html.fetch_and_extract_table(page_url,
tag='h2',
tag_id='version-compatibility')
table = web.fetch_and_extract_table(page_url,
tag='h2',
tag_id='version-compatibility')
if table:
rows = table.find_all('tr')[1:] #Skip the header row
version_dict = {}
Expand Down
24 changes: 4 additions & 20 deletions gcpdiag/queries/datafusion_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from unittest import mock

from gcpdiag import models
from gcpdiag.queries import apis_stub, datafusion
from gcpdiag.queries import apis_stub, datafusion, web_stub
from gcpdiag.queries.generic_api.api_build import generic_api_stub

DUMMY_REGION = 'us-central1'
Expand Down Expand Up @@ -73,31 +73,15 @@ def test_is_private_ip(self):
True) in [(i.name, i.is_private) for k, i in instances.items()]


@mock.patch('gcpdiag.queries.html.requests.get', autospec=True)
@mock.patch('gcpdiag.queries.web.get', new=web_stub.get)
class TestExtractVersionPolicyDict:
"""Test html content."""

def test_extract_support_datafusion_version(self, mock_get):

with open(
'test-data/datafusion1/html-content/'
'version_support_policy.html',
encoding='utf-8') as fh:
mock_get.return_value.content = fh.read().encode('utf-8')
mock_get.return_value.status_code = 200

def test_extract_support_datafusion_version(self):
response_dict = datafusion.extract_support_datafusion_version()
assert response_dict == SUPPORTED_VERSIONS_DICT

def test_extract_datafusion_dataproc_version(self, mock_get):

with open(
'test-data/datafusion1/html-content/'
'version_compatability.html',
encoding='utf-8') as fh:
mock_get.return_value.content = fh.read().encode('utf-8')
mock_get.return_value.status_code = 200

def test_extract_datafusion_dataproc_version(self):
response_dict = datafusion.extract_datafusion_dataproc_version()
assert response_dict == DATAFUSION_DATAPROC_VERSIONS_DICT

Expand Down
84 changes: 83 additions & 1 deletion gcpdiag/queries/gke.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,20 @@
# Lint as: python3
"""Queries related to GCP Kubernetes Engine clusters."""

import datetime
import functools
import ipaddress
import logging
import re
from typing import Dict, Iterable, List, Mapping, Optional, Union

import bs4
import googleapiclient.errors
import requests
from boltons.iterutils import get_path

from gcpdiag import caching, config, models, utils
from gcpdiag.queries import apis, crm, gce, network
from gcpdiag.queries import apis, crm, gce, network, web
from gcpdiag.utils import Version

# To avoid name conflict with L342
Expand Down Expand Up @@ -599,3 +602,82 @@ def get_node_by_instance_id(context: models.Context, instance_id: str) -> Node:
except AttributeError as err:
raise KeyError from err
return None


@caching.cached_api_call
def get_release_schedule() -> Dict:
  """Extract the release schedule for GKE clusters.

  Scrapes the public GKE release-schedule page and parses, for every minor
  version row, the channel availability and end-of-standard-support dates.

  Returns:
    A dict keyed by minor version string (e.g. '1.29'); each value is a dict
    with 'rapid_avail', 'regular_avail', 'stable_avail', 'extended_avail'
    and 'eol' entries, each a datetime.date or None when not announced
    ('N/A'). On a fetch/parse error the rows collected so far (possibly an
    empty dict) are returned.
  """
  page_url = 'https://cloud.google.com/kubernetes-engine/docs/release-schedule'
  release_data: Dict = {}
  # Approximate each quarter by the first month of that quarter.
  quarter_start_month = {'Q1': 1, 'Q2': 4, 'Q3': 7, 'Q4': 10}
  try:
    table = web.fetch_and_extract_table(page_url,
                                        tag='table',
                                        class_name='gke-release-schedule')

    def parse_date(date_str) -> Optional[datetime.date]:
      """Parse a (possibly partial) date string; return None for 'N/A' etc."""
      p = r'(?P<year>\d{4})-(?:(?P<quarter>Q[1-4])|(?P<month>[0-9]{1,2}))(?:-(?P<day>[0-9]{1,2}))?'
      match = re.search(p, date_str)
      if not match:
        # Anything unparseable (for example 'N/A') means the date is unknown.
        return None
      year = int(match.group('year'))
      # Quarter-year approximations (for example, 2025-Q3) that are updated
      # by Google when the exact date becomes known.
      # https://cloud.google.com/kubernetes-engine/docs/release-schedule.md#fn6
      # NOTE: build the date from integers rather than via fromisoformat():
      # f'{year}-{month}-01' with a single-digit month ('2025-1-01') is not
      # valid ISO 8601 and would raise ValueError.
      if match.group('quarter'):
        return datetime.date(year, quarter_start_month[match.group('quarter')],
                             1)
      month = int(match.group('month'))
      if match.group('day'):
        return datetime.date(year, month, int(match.group('day')))
      # Handle incomplete dates in 'YYYY-MM' form: assume mid-month.
      return datetime.date(year, month, 15)

    def find_date_str_in_td(e):
      """Recursively find a date string in a td."""
      if isinstance(e, str):
        return e
      if isinstance(e, bs4.element.Tag):
        return find_date_str_in_td(e.next)
      return None

    # Find all table rows within tbody.
    rows = table.find('tbody').find_all('tr')

    # Iterate over each row and extract the data.
    for row in rows:
      # Extract all the columns (td elements).
      cols = row.find_all('td')

      # Column layout: version, rapid, (skip), regular, (skip), stable,
      # (skip), extended, (skip), end of standard support.
      minor_version = cols[0].next.strip()
      rapid_avail = parse_date(find_date_str_in_td(cols[1].next))
      regular_avail = parse_date(find_date_str_in_td(cols[3].next))
      stable_avail = parse_date(find_date_str_in_td(cols[5].next))
      extended_avail = parse_date(find_date_str_in_td(cols[7].next))
      end_of_standard_support = parse_date(find_date_str_in_td(cols[9].next))

      # Add the extracted data into the dictionary in the desired format.
      release_data[minor_version] = {
          'rapid_avail': rapid_avail,
          'regular_avail': regular_avail,
          'stable_avail': stable_avail,
          'extended_avail': extended_avail,
          'eol': end_of_standard_support,
      }
    return release_data
  except (
      requests.exceptions.RequestException,
      AttributeError,
      TypeError,
      ValueError,
      IndexError,
  ) as e:
    logging.error('Error in extracting gke release schedule: %s', e)
    return release_data
Loading

0 comments on commit 2e96494

Please sign in to comment.