From 7f14e434c87d3c3ffe403ecf84d10f1ca7686446 Mon Sep 17 00:00:00 2001
From: Maxime Beauchemin <maximebeauchemin@gmail.com>
Date: Tue, 13 May 2025 08:36:12 -0700
Subject: [PATCH] fix: loading examples in CI returns http error "too many
 requests" (#33412)

---
 superset/examples/bart_lines.py              |  8 ++---
 superset/examples/birth_names.py             |  6 ++--
 superset/examples/country_map.py             |  8 ++---
 superset/examples/energy.py                  |  6 ++--
 superset/examples/flights.py                 | 12 ++++---
 superset/examples/helpers.py                 | 34 ++++++++++++++++++++
 superset/examples/long_lat.py                |  8 ++---
 superset/examples/multiformat_time_series.py |  8 +++--
 superset/examples/paris.py                   |  6 ++--
 superset/examples/random_time_series.py      |  5 ++-
 superset/examples/sf_population_polygons.py  |  6 ++--
 superset/examples/world_bank.py              |  5 ++-
 12 files changed, 71 insertions(+), 41 deletions(-)

diff --git a/superset/examples/bart_lines.py b/superset/examples/bart_lines.py
index 2005ace22b..37f41e7556 100644
--- a/superset/examples/bart_lines.py
+++ b/superset/examples/bart_lines.py
@@ -16,7 +16,6 @@
 # under the License.
 import logging
 
-import pandas as pd
 import polyline
 from sqlalchemy import inspect, String, Text
 
@@ -25,7 +24,7 @@ from superset.sql_parse import Table
 from superset.utils import json
 
 from ..utils.database import get_example_database
-from .helpers import get_example_url, get_table_connector_registry
+from .helpers import get_table_connector_registry, read_example_data
 
 logger = logging.getLogger(__name__)
 
@@ -38,8 +37,9 @@ def load_bart_lines(only_metadata: bool = False, force: bool = False) -> None:
     table_exists = database.has_table(Table(tbl_name, schema))
 
     if not only_metadata and (not table_exists or force):
-        url = get_example_url("bart-lines.json.gz")
-        df = pd.read_json(url, encoding="latin-1", compression="gzip")
+        df = read_example_data(
+            "bart-lines.json.gz", encoding="latin-1", compression="gzip"
+        )
         df["path_json"] = df.path.map(json.dumps)
         df["polyline"] = df.path.map(polyline.encode)
         del df["path"]
diff --git a/superset/examples/birth_names.py b/superset/examples/birth_names.py
index d94353219c..62eb357a3b 100644
--- a/superset/examples/birth_names.py
+++ b/superset/examples/birth_names.py
@@ -33,11 +33,11 @@ from superset.utils.core import DatasourceType
 
 from ..utils.database import get_example_database
 from .helpers import (
-    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
     misc_dash_slices,
+    read_example_data,
     update_slice_ids,
 )
 
@@ -57,8 +57,8 @@ def gen_filter(
 
 
 def load_data(tbl_name: str, database: Database, sample: bool = False) -> None:
-    url = get_example_url("birth_names2.json.gz")
-    pdf = pd.read_json(url, compression="gzip")
+    pdf = read_example_data("birth_names2.json.gz", compression="gzip")
+
     # TODO(bkyryliuk): move load examples data into the pytest fixture
     if database.backend == "presto":
         pdf.ds = pd.to_datetime(pdf.ds, unit="ms")
diff --git a/superset/examples/country_map.py b/superset/examples/country_map.py
index a093044b70..83b880a319 100644
--- a/superset/examples/country_map.py
+++ b/superset/examples/country_map.py
@@ -17,7 +17,6 @@
 import datetime
 import logging
 
-import pandas as pd
 from sqlalchemy import BigInteger, Date, inspect, String
 from sqlalchemy.sql import column
 
@@ -29,11 +28,11 @@ from superset.sql_parse import Table
 from superset.utils.core import DatasourceType
 
 from .helpers import (
-    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
     misc_dash_slices,
+    read_example_data,
 )
 
 logger = logging.getLogger(__name__)
@@ -49,8 +48,9 @@ def load_country_map_data(only_metadata: bool = False, force: bool = False) -> None:
     table_exists = database.has_table(Table(tbl_name, schema))
 
     if not only_metadata and (not table_exists or force):
-        url = get_example_url("birth_france_data_for_country_map.csv")
-        data = pd.read_csv(url, encoding="utf-8")
+        data = read_example_data(
+            "birth_france_data_for_country_map.csv", encoding="utf-8"
+        )
         data["dttm"] = datetime.datetime.now().date()
         data.to_sql(
             tbl_name,
diff --git a/superset/examples/energy.py b/superset/examples/energy.py
index 1b2bfa72a2..9dfd199731 100644
--- a/superset/examples/energy.py
+++ b/superset/examples/energy.py
@@ -17,7 +17,6 @@
 import logging
 import textwrap
 
-import pandas as pd
 from sqlalchemy import Float, inspect, String
 from sqlalchemy.sql import column
 
@@ -29,11 +28,11 @@ from superset.sql_parse import Table
 from superset.utils.core import DatasourceType
 
 from .helpers import (
-    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
     misc_dash_slices,
+    read_example_data,
 )
 
 logger = logging.getLogger(__name__)
@@ -51,8 +50,7 @@ def load_energy(
     table_exists = database.has_table(Table(tbl_name, schema))
 
     if not only_metadata and (not table_exists or force):
-        url = get_example_url("energy.json.gz")
-        pdf = pd.read_json(url, compression="gzip")
+        pdf = read_example_data("energy.json.gz", compression="gzip")
         pdf = pdf.head(100) if sample else pdf
         pdf.to_sql(
             tbl_name,
diff --git a/superset/examples/flights.py b/superset/examples/flights.py
index 81a31bb970..4d8b04e42a 100644
--- a/superset/examples/flights.py
+++ b/superset/examples/flights.py
@@ -23,7 +23,7 @@ import superset.utils.database as database_utils
 from superset import db
 from superset.sql_parse import Table
 
-from .helpers import get_example_url, get_table_connector_registry
+from .helpers import get_table_connector_registry, read_example_data
 
 logger = logging.getLogger(__name__)
 
@@ -37,12 +37,14 @@ def load_flights(only_metadata: bool = False, force: bool = False) -> None:
     table_exists = database.has_table(Table(tbl_name, schema))
 
     if not only_metadata and (not table_exists or force):
-        flight_data_url = get_example_url("flight_data.csv.gz")
-        pdf = pd.read_csv(flight_data_url, encoding="latin-1", compression="gzip")
+        pdf = read_example_data(
+            "flight_data.csv.gz", encoding="latin-1", compression="gzip"
+        )
 
         # Loading airports info to join and get lat/long
-        airports_url = get_example_url("airports.csv.gz")
-        airports = pd.read_csv(airports_url, encoding="latin-1", compression="gzip")
+        airports = read_example_data(
+            "airports.csv.gz", encoding="latin-1", compression="gzip"
+        )
         airports = airports.set_index("IATA_CODE")
 
         pdf[  # pylint: disable=unsupported-assignment-operation,useless-suppression
diff --git a/superset/examples/helpers.py b/superset/examples/helpers.py
index 908142ec77..c0673a8f77 100644
--- a/superset/examples/helpers.py
+++ b/superset/examples/helpers.py
@@ -43,7 +43,11 @@ Environment knobs
 from __future__ import annotations
 
 import os
+import time
 from typing import Any
+from urllib.error import HTTPError
+
+import pandas as pd
 
 from superset import app, db
 from superset.connectors.sqla.models import SqlaTable
@@ -119,3 +123,33 @@ def get_example_url(filepath: str) -> str:
     paths like ``datasets/examples/slack/messages.csv``.
""" return f"{BASE_URL}{filepath}" + + +def read_example_data( + filepath: str, + max_attempts: int = 5, + wait_seconds: float = 60, + **kwargs: Any, +) -> pd.DataFrame: + """Load CSV or JSON from example data mirror with retry/backoff.""" + from superset.examples.helpers import get_example_url + + url = get_example_url(filepath) + is_json = filepath.endswith(".json") or filepath.endswith(".json.gz") + + for attempt in range(1, max_attempts + 1): + try: + if is_json: + return pd.read_json(url, **kwargs) + return pd.read_csv(url, **kwargs) + except HTTPError as e: + if e.code == 429 and attempt < max_attempts: + sleep_time = wait_seconds * (2 ** (attempt - 1)) + print( + f"HTTP 429 received from {url}. ", + f"Retrying in {sleep_time:.1f}s ", + f"(attempt {attempt}/{max_attempts})...", + ) + time.sleep(sleep_time) + else: + raise diff --git a/superset/examples/long_lat.py b/superset/examples/long_lat.py index 87bcbd9fdc..9108bb52c5 100644 --- a/superset/examples/long_lat.py +++ b/superset/examples/long_lat.py @@ -19,7 +19,6 @@ import logging import random import geohash -import pandas as pd from sqlalchemy import DateTime, Float, inspect, String import superset.utils.database as database_utils @@ -29,11 +28,11 @@ from superset.sql_parse import Table from superset.utils.core import DatasourceType from .helpers import ( - get_example_url, get_slice_json, get_table_connector_registry, merge_slice, misc_dash_slices, + read_example_data, ) logger = logging.getLogger(__name__) @@ -48,8 +47,9 @@ def load_long_lat_data(only_metadata: bool = False, force: bool = False) -> None table_exists = database.has_table(Table(tbl_name, schema)) if not only_metadata and (not table_exists or force): - url = get_example_url("san_francisco.csv.gz") - pdf = pd.read_csv(url, encoding="utf-8", compression="gzip") + pdf = read_example_data( + "san_francisco.csv.gz", encoding="utf-8", compression="gzip" + ) start = datetime.datetime.now().replace( hour=0, minute=0, second=0, microsecond=0 ) diff --git a/superset/examples/multiformat_time_series.py b/superset/examples/multiformat_time_series.py index af357385f9..e788c19ee8 100644 --- a/superset/examples/multiformat_time_series.py +++ b/superset/examples/multiformat_time_series.py @@ -27,11 +27,11 @@ from superset.utils.core import DatasourceType from ..utils.database import get_example_database from .helpers import ( - get_example_url, get_slice_json, get_table_connector_registry, merge_slice, misc_dash_slices, + read_example_data, ) logger = logging.getLogger(__name__) @@ -48,8 +48,10 @@ def load_multiformat_time_series( # pylint: disable=too-many-locals table_exists = database.has_table(Table(tbl_name, schema)) if not only_metadata and (not table_exists or force): - url = get_example_url("multiformat_time_series.json.gz") - pdf = pd.read_json(url, compression="gzip") + pdf = read_example_data( + "multiformat_time_series.json.gz", compression="gzip" + ) + # TODO(bkyryliuk): move load examples data into the pytest fixture if database.backend == "presto": pdf.ds = pd.to_datetime(pdf.ds, unit="s") diff --git a/superset/examples/paris.py b/superset/examples/paris.py index 56a6affe43..002b129734 100644 --- a/superset/examples/paris.py +++ b/superset/examples/paris.py @@ -17,7 +17,6 @@ import logging -import pandas as pd from sqlalchemy import inspect, String, Text import superset.utils.database as database_utils @@ -25,7 +24,7 @@ from superset import db from superset.sql_parse import Table from superset.utils import json -from .helpers import get_example_url, 
+from .helpers import get_table_connector_registry, read_example_data
 
 logger = logging.getLogger(__name__)
 
@@ -38,8 +37,7 @@ def load_paris_iris_geojson(only_metadata: bool = False, force: bool = False) -> None:
     table_exists = database.has_table(Table(tbl_name, schema))
 
     if not only_metadata and (not table_exists or force):
-        url = get_example_url("paris_iris.json.gz")
-        df = pd.read_json(url, compression="gzip")
+        df = read_example_data("paris_iris.json.gz", compression="gzip")
 
         df["features"] = df.features.map(json.dumps)
         df.to_sql(
diff --git a/superset/examples/random_time_series.py b/superset/examples/random_time_series.py
index f473565235..47ffd1dcd1 100644
--- a/superset/examples/random_time_series.py
+++ b/superset/examples/random_time_series.py
@@ -26,10 +26,10 @@ from superset.sql_parse import Table
 from superset.utils.core import DatasourceType
 
 from .helpers import (
-    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
+    read_example_data,
 )
 
 logger = logging.getLogger(__name__)
@@ -46,8 +46,7 @@ def load_random_time_series_data(
     table_exists = database.has_table(Table(tbl_name, schema))
 
     if not only_metadata and (not table_exists or force):
-        url = get_example_url("random_time_series.json.gz")
-        pdf = pd.read_json(url, compression="gzip")
+        pdf = read_example_data("random_time_series.json.gz", compression="gzip")
         if database.backend == "presto":
             pdf.ds = pd.to_datetime(pdf.ds, unit="s")
             pdf.ds = pdf.ds.dt.strftime("%Y-%m-%d %H:%M%:%S")
diff --git a/superset/examples/sf_population_polygons.py b/superset/examples/sf_population_polygons.py
index 239ab04a17..b89bc98c5a 100644
--- a/superset/examples/sf_population_polygons.py
+++ b/superset/examples/sf_population_polygons.py
@@ -17,7 +17,6 @@
 
 import logging
 
-import pandas as pd
 from sqlalchemy import BigInteger, Float, inspect, Text
 
 import superset.utils.database as database_utils
@@ -25,7 +24,7 @@ from superset import db
 from superset.sql_parse import Table
 from superset.utils import json
 
-from .helpers import get_example_url, get_table_connector_registry
+from .helpers import get_table_connector_registry, read_example_data
 
 logger = logging.getLogger(__name__)
 
@@ -40,8 +39,7 @@ def load_sf_population_polygons(
     table_exists = database.has_table(Table(tbl_name, schema))
 
     if not only_metadata and (not table_exists or force):
-        url = get_example_url("sf_population.json.gz")
-        df = pd.read_json(url, compression="gzip")
+        df = read_example_data("sf_population.json.gz", compression="gzip")
 
         df["contour"] = df.contour.map(json.dumps)
         df.to_sql(
diff --git a/superset/examples/world_bank.py b/superset/examples/world_bank.py
index b683d8fb91..16b8b9854c 100644
--- a/superset/examples/world_bank.py
+++ b/superset/examples/world_bank.py
@@ -25,12 +25,12 @@ import superset.utils.database
 from superset import app, db
 from superset.connectors.sqla.models import BaseDatasource, SqlMetric
 from superset.examples.helpers import (
-    get_example_url,
     get_examples_folder,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
     misc_dash_slices,
+    read_example_data,
     update_slice_ids,
 )
 from superset.models.dashboard import Dashboard
@@ -55,8 +55,7 @@ def load_world_bank_health_n_pop(  # pylint: disable=too-many-locals
     table_exists = database.has_table(Table(tbl_name, schema))
 
     if not only_metadata and (not table_exists or force):
-        url = get_example_url("countries.json.gz")
-        pdf = pd.read_json(url, compression="gzip")
+        pdf = read_example_data("countries.json.gz", compression="gzip")
         pdf.columns = [col.replace(".", "_") for col in pdf.columns]
         if database.backend == "presto":
             pdf.year = pd.to_datetime(pdf.year)
-- 
GitLab
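
Reviewer note (not part of the patch): the new helper is a drop-in replacement for the pd.read_csv / pd.read_json calls it removes, so each call site changes shape but not behavior. A minimal usage sketch follows, assuming a working Superset checkout where superset.examples.helpers imports cleanly; the dataset name is one already referenced in the patch, and the loop merely restates the backoff schedule implied by the helper's defaults:

    from superset.examples.helpers import read_example_data

    # Same call shape as the bart_lines.py hunk: the ".json.gz" suffix routes
    # to pd.read_json, and the remaining kwargs pass through to pandas.
    df = read_example_data(
        "bart-lines.json.gz", encoding="latin-1", compression="gzip"
    )

    # With the defaults (max_attempts=5, wait_seconds=60), a persistent
    # HTTP 429 sleeps 60s, 120s, 240s, and 480s between the five attempts,
    # after which the final HTTPError is re-raised.
    for attempt in range(1, 5):
        print(f"attempt {attempt}: back off {60 * 2 ** (attempt - 1)}s")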