Skip to content

Commit

Permalink
Update GKE EOL scraping approach
Browse files Browse the repository at this point in the history
Use Beautiful soup to pull data and parse in the queries/web module

renamed queries/html to web (this could be used to pull other static web
content not related to GKE)

Fix: 328239034

Change-Id: I6bc5ab086a9a79f11262fa2368346163543192d8
GitOrigin-RevId: b4bd193daabc388bd46d6506f684d0a29b4d411e
  • Loading branch information
ebenezergraham authored and copybara-github committed Oct 1, 2024
1 parent d2b18a5 commit 2e96494
Show file tree
Hide file tree
Showing 17 changed files with 346 additions and 116 deletions.
6 changes: 0 additions & 6 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,6 @@ repos:
^gcpdiag/runbook/[^/]+/snapshots/|
^gcpdiag/runbook/[^/]+/(constants|command|flags|util).py|
^gcpdiag/runbook/(constants|command|flags|exceptions|report|util).py
- repo: local
hooks:
- id: gke-eol-file
name: Check that gke eol.yaml file is up to date and contains fresh EOL data
entry: ./bin/precommit-gke-eol-file.sh
language: script
- repo: local
hooks:
- id: todo-annotations
Expand Down
3 changes: 3 additions & 0 deletions bin/precommit-gke-eol-file.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#!/bin/sh

# WARNING: This script is deprecated and may be removed in a future release.
# Please use 'gcpdiag/queries/gke.get_release_schedule' instead.

# Checks if gcpdiag/lint/gke/eol.yaml file contains the up to date list of
# GKE Releases with corresponding EOL (end-of-life) dates

Expand Down
14 changes: 10 additions & 4 deletions gcpdiag/lint/datafusion/snapshots/WARN_2024_004.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,15 @@
projects/gcpdiag-datafusion1-aaaa/instances/my-instance/computeProfiles/dataproc
(No imageVersion defined)

- projects/gcpdiag-datafusion1-aaaa/locations/us-central1/instances/my-instance [ OK ]
projects/gcpdiag-datafusion1-aaaa/instances/my-instance/computeProfiles/namespace_profile
Datafusion version : 6.7.1
Dataproc version : 2.0
- projects/gcpdiag-datafusion1-aaaa/locations/us-central1/instances/my-instance [FAIL]
projects/gcpdiag-datafusion1-aaaa/instances/my-instance/computeProfiles/namespace_profile
Datafusion version : 6.7.1
Dataproc version : 2.0


The version of your Cloud Data Fusion environment might not be compatible
with the version of your Dataproc cluster from the corresponding compute
profiles.

https://gcpdiag.dev/rules/datafusion/WARN/2024_004

33 changes: 7 additions & 26 deletions gcpdiag/lint/gke/bp_2022_003_cluster_eol.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,9 @@
"""

from datetime import date, timedelta
from os.path import dirname
from typing import Dict

from boltons.iterutils import get_path
from yaml import safe_load

from gcpdiag import lint, models
from gcpdiag.queries import gke
Expand Down Expand Up @@ -64,14 +62,6 @@ def _estimate_oss_release_date(version: Version) -> date:
MINOR_RELEASE_PACE_IN_DAYS)


def _get_date(str_or_date) -> date:
# Handle incomplete dates in 'YYYY-MM' form
if str_or_date and isinstance(str_or_date,
str) and len(str_or_date) == len('YYYY-MM'):
return date.fromisoformat(f'{str_or_date}-15')
return str_or_date


def _estimate_gke_eol_date(version: Version, eol_schedule: Dict):
"""
Estimate End Of Life date for a given GKE version
Expand All @@ -83,12 +73,10 @@ def _estimate_gke_eol_date(version: Version, eol_schedule: Dict):

short_version = f'{version.major}.{version.minor}'

regular_release = _get_date(
get_path(eol_schedule, (short_version, 'regular_avail'), None))
rapid_release = _get_date(
get_path(eol_schedule, (short_version, 'rapid_avail'), None))
oss_release = _get_date(
get_path(eol_schedule, (short_version, 'oss_release'), None))
regular_release = get_path(eol_schedule, (short_version, 'regular_avail'),
None)
rapid_release = get_path(eol_schedule, (short_version, 'rapid_avail'), None)
oss_release = get_path(eol_schedule, (short_version, 'oss_release'), None)

if regular_release and regular_release != TBD:
return regular_release + timedelta(days=GKE_REGULAR_SUPPORT_PERIOD_IN_DAYS)
Expand Down Expand Up @@ -125,15 +113,15 @@ def _notification_required(version: Version, eol_schedule: Dict) -> bool:
# Update the EOL date in the `eol_schedule` dict
eol_schedule[short_version] = {'eol': eol_date, 'estimated': True}
else:
eol_date = _get_date(eol_schedule[short_version]['eol'])
eol_date = eol_schedule[short_version]['eol']

return date.today() > eol_date - timedelta(days=NOTIFY_PERIOD_IN_DAYS)


def _get_notification_msg(version: Version, eol_schedule: Dict) -> str:
short_version = f'{version.major}.{version.minor}'
msg = f'''GKE version {short_version}\n
scheduled end of life: {_get_date(eol_schedule[short_version]["eol"])}'''
scheduled end of life: {eol_schedule[short_version]["eol"]}'''
if 'estimated' in eol_schedule[short_version]:
msg += ' (estimation)'
return msg
Expand All @@ -144,14 +132,7 @@ def run_rule(context: models.Context, report: lint.LintReportRuleInterface):
if not clusters:
report.add_skipped(None, 'no clusters found')

# This file should be updated regularly by `eol_parser.sh` or by `make`:
# <root_prj_folder>$ make gke-eol-file
try:
with open(f'{dirname(__file__)}/eol.yaml', encoding='utf-8') as eol_file:
eol_schedule = safe_load(eol_file)
except OSError:
# Ignore absence of the file, estimations will be used
eol_schedule = {}
eol_schedule = gke.get_release_schedule()

for _, c in sorted(clusters.items()):
if c.release_channel:
Expand Down
38 changes: 0 additions & 38 deletions gcpdiag/lint/gke/eol.yaml

This file was deleted.

3 changes: 3 additions & 0 deletions gcpdiag/lint/gke/eol_parser.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#!/bin/bash

# WARNING: This script is deprecated and may be removed in a future release.
# Please use 'gcpdiag/queries/gke.get_release_schedule' instead.

# Parses the public GKE Schedule page and extracts EOL (end-of-life) dates
# for currently available GKE versions

Expand Down
3 changes: 2 additions & 1 deletion gcpdiag/lint/snapshot_test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,11 @@

from gcpdiag import lint, models
from gcpdiag.lint.output import terminal_output
from gcpdiag.queries import apis_stub, kubectl_stub
from gcpdiag.queries import apis_stub, kubectl_stub, web_stub
from gcpdiag.queries.generic_api.api_build import generic_api_stub


@mock.patch('gcpdiag.queries.web.get', new=web_stub.get)
@mock.patch('gcpdiag.queries.apis.get_api', new=apis_stub.get_api_stub)
@mock.patch('gcpdiag.queries.kubectl.verify_auth', new=kubectl_stub.verify_auth)
@mock.patch(
Expand Down
14 changes: 7 additions & 7 deletions gcpdiag/queries/datafusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
import requests

from gcpdiag import caching, config, models, utils
from gcpdiag.queries import apis, crm, html, network
from gcpdiag.queries import apis, crm, network, web
from gcpdiag.queries.generic_api.api_build import get_generic
from gcpdiag.utils import Version

Expand Down Expand Up @@ -228,9 +228,9 @@ def extract_support_datafusion_version() -> Dict[str, str]:
page_url = 'https://cloud.google.com/data-fusion/docs/support/version-support-policy'

try:
data_fusion_table = html.fetch_and_extract_table(page_url,
tag='h2',
tag_id='support_timelines')
data_fusion_table = web.fetch_and_extract_table(page_url,
tag='h2',
tag_id='support_timelines')
if data_fusion_table:
versions = []
support_end_dates = []
Expand Down Expand Up @@ -400,9 +400,9 @@ def extract_datafusion_dataproc_version() -> Dict[str, list[str]]:
page_url = 'https://cloud.google.com/data-fusion/docs/concepts/configure-clusters'

try:
table = html.fetch_and_extract_table(page_url,
tag='h2',
tag_id='version-compatibility')
table = web.fetch_and_extract_table(page_url,
tag='h2',
tag_id='version-compatibility')
if table:
rows = table.find_all('tr')[1:] #Skip the header row
version_dict = {}
Expand Down
24 changes: 4 additions & 20 deletions gcpdiag/queries/datafusion_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from unittest import mock

from gcpdiag import models
from gcpdiag.queries import apis_stub, datafusion
from gcpdiag.queries import apis_stub, datafusion, web_stub
from gcpdiag.queries.generic_api.api_build import generic_api_stub

DUMMY_REGION = 'us-central1'
Expand Down Expand Up @@ -73,31 +73,15 @@ def test_is_private_ip(self):
True) in [(i.name, i.is_private) for k, i in instances.items()]


@mock.patch('gcpdiag.queries.html.requests.get', autospec=True)
@mock.patch('gcpdiag.queries.web.get', new=web_stub.get)
class TestExtractVersionPolicyDict:
"""Test html content."""

def test_extract_support_datafusion_version(self, mock_get):

with open(
'test-data/datafusion1/html-content/'
'version_support_policy.html',
encoding='utf-8') as fh:
mock_get.return_value.content = fh.read().encode('utf-8')
mock_get.return_value.status_code = 200

def test_extract_support_datafusion_version(self):
response_dict = datafusion.extract_support_datafusion_version()
assert response_dict == SUPPORTED_VERSIONS_DICT

def test_extract_datafusion_dataproc_version(self, mock_get):

with open(
'test-data/datafusion1/html-content/'
'version_compatability.html',
encoding='utf-8') as fh:
mock_get.return_value.content = fh.read().encode('utf-8')
mock_get.return_value.status_code = 200

def test_extract_datafusion_dataproc_version(self):
response_dict = datafusion.extract_datafusion_dataproc_version()
assert response_dict == DATAFUSION_DATAPROC_VERSIONS_DICT

Expand Down
84 changes: 83 additions & 1 deletion gcpdiag/queries/gke.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,20 @@
# Lint as: python3
"""Queries related to GCP Kubernetes Engine clusters."""

import datetime
import functools
import ipaddress
import logging
import re
from typing import Dict, Iterable, List, Mapping, Optional, Union

import bs4
import googleapiclient.errors
import requests
from boltons.iterutils import get_path

from gcpdiag import caching, config, models, utils
from gcpdiag.queries import apis, crm, gce, network
from gcpdiag.queries import apis, crm, gce, network, web
from gcpdiag.utils import Version

# To avoid name conflict with L342
Expand Down Expand Up @@ -599,3 +602,82 @@ def get_node_by_instance_id(context: models.Context, instance_id: str) -> Node:
except AttributeError as err:
raise KeyError from err
return None


@caching.cached_api_call
def get_release_schedule() -> Dict:
  """Extract the release schedule for GKE clusters.

  Scrapes the public GKE release-schedule page and parses, for every minor
  version row, the channel availability and end-of-standard-support dates.

  Returns:
    A dict keyed by minor version string (e.g. '1.29'); each value is a dict
    with 'rapid_avail', 'regular_avail', 'stable_avail', 'extended_avail'
    and 'eol' entries, each a datetime.date or None when not announced
    ('N/A'). On a fetch/parse error the rows collected so far (possibly an
    empty dict) are returned.
  """
  page_url = 'https://cloud.google.com/kubernetes-engine/docs/release-schedule'
  release_data: Dict = {}
  # Approximate each quarter by the first month of that quarter.
  quarter_start_month = {'Q1': 1, 'Q2': 4, 'Q3': 7, 'Q4': 10}
  try:
    table = web.fetch_and_extract_table(page_url,
                                        tag='table',
                                        class_name='gke-release-schedule')

    def parse_date(date_str) -> Optional[datetime.date]:
      """Parse a (possibly partial) date string; return None for 'N/A' etc."""
      p = r'(?P<year>\d{4})-(?:(?P<quarter>Q[1-4])|(?P<month>[0-9]{1,2}))(?:-(?P<day>[0-9]{1,2}))?'
      match = re.search(p, date_str)
      if not match:
        # Anything unparseable (for example 'N/A') means the date is unknown.
        return None
      year = int(match.group('year'))
      # Quarter-year approximations (for example, 2025-Q3) that are updated
      # by Google when the exact date becomes known.
      # https://cloud.google.com/kubernetes-engine/docs/release-schedule.md#fn6
      # NOTE: build the date from integers rather than via fromisoformat():
      # f'{year}-{month}-01' with a single-digit month ('2025-1-01') is not
      # valid ISO 8601 and would raise ValueError.
      if match.group('quarter'):
        return datetime.date(year, quarter_start_month[match.group('quarter')],
                             1)
      month = int(match.group('month'))
      if match.group('day'):
        return datetime.date(year, month, int(match.group('day')))
      # Handle incomplete dates in 'YYYY-MM' form: assume mid-month.
      return datetime.date(year, month, 15)

    def find_date_str_in_td(e):
      """Recursively find a date string in a td."""
      if isinstance(e, str):
        return e
      if isinstance(e, bs4.element.Tag):
        return find_date_str_in_td(e.next)
      return None

    # Find all table rows within tbody.
    rows = table.find('tbody').find_all('tr')

    # Iterate over each row and extract the data.
    for row in rows:
      # Extract all the columns (td elements).
      cols = row.find_all('td')

      # Column layout: version, rapid, (skip), regular, (skip), stable,
      # (skip), extended, (skip), end of standard support.
      minor_version = cols[0].next.strip()
      rapid_avail = parse_date(find_date_str_in_td(cols[1].next))
      regular_avail = parse_date(find_date_str_in_td(cols[3].next))
      stable_avail = parse_date(find_date_str_in_td(cols[5].next))
      extended_avail = parse_date(find_date_str_in_td(cols[7].next))
      end_of_standard_support = parse_date(find_date_str_in_td(cols[9].next))

      # Add the extracted data into the dictionary in the desired format.
      release_data[minor_version] = {
          'rapid_avail': rapid_avail,
          'regular_avail': regular_avail,
          'stable_avail': stable_avail,
          'extended_avail': extended_avail,
          'eol': end_of_standard_support,
      }
    return release_data
  except (
      requests.exceptions.RequestException,
      AttributeError,
      TypeError,
      ValueError,
      IndexError,
  ) as e:
    logging.error('Error in extracting gke release schedule: %s', e)
    return release_data
Loading

0 comments on commit 2e96494

Please sign in to comment.