diff --git a/superset/examples/bart_lines.py b/superset/examples/bart_lines.py
index 2005ace22b15ea7a387d07342739c9344f14092f..37f41e7556dc9c42f5d05d5f4d3157b7f9251d3a 100644
--- a/superset/examples/bart_lines.py
+++ b/superset/examples/bart_lines.py
@@ -16,7 +16,6 @@
 # under the License.
 import logging
 
-import pandas as pd
 import polyline
 from sqlalchemy import inspect, String, Text
 
@@ -25,7 +24,7 @@ from superset.sql_parse import Table
 from superset.utils import json
 
 from ..utils.database import get_example_database
-from .helpers import get_example_url, get_table_connector_registry
+from .helpers import get_table_connector_registry, read_example_data
 
 logger = logging.getLogger(__name__)
 
@@ -38,8 +37,9 @@ def load_bart_lines(only_metadata: bool = False, force: bool = False) -> None:
     table_exists = database.has_table(Table(tbl_name, schema))
 
     if not only_metadata and (not table_exists or force):
-        url = get_example_url("bart-lines.json.gz")
-        df = pd.read_json(url, encoding="latin-1", compression="gzip")
+        df = read_example_data(
+            "bart-lines.json.gz", encoding="latin-1", compression="gzip"
+        )
         df["path_json"] = df.path.map(json.dumps)
         df["polyline"] = df.path.map(polyline.encode)
         del df["path"]
diff --git a/superset/examples/birth_names.py b/superset/examples/birth_names.py
index d94353219cfacd2db70e09a3ccf55ce2880c6722..62eb357a3b3d262f88460158d356bd50d0c44ce0 100644
--- a/superset/examples/birth_names.py
+++ b/superset/examples/birth_names.py
@@ -33,11 +33,11 @@ from superset.utils.core import DatasourceType
 
 from ..utils.database import get_example_database
 from .helpers import (
-    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
     misc_dash_slices,
+    read_example_data,
     update_slice_ids,
 )
 
@@ -57,8 +57,8 @@ def gen_filter(
 
 
 def load_data(tbl_name: str, database: Database, sample: bool = False) -> None:
-    url = get_example_url("birth_names2.json.gz")
-    pdf = pd.read_json(url, compression="gzip")
+    pdf = read_example_data("birth_names2.json.gz", compression="gzip")
+
     # TODO(bkyryliuk): move load examples data into the pytest fixture
     if database.backend == "presto":
         pdf.ds = pd.to_datetime(pdf.ds, unit="ms")
diff --git a/superset/examples/country_map.py b/superset/examples/country_map.py
index a093044b7091599fae5d165da692a0f16da5b86d..83b880a319572a11c2070da5553049650a10edf9 100644
--- a/superset/examples/country_map.py
+++ b/superset/examples/country_map.py
@@ -17,7 +17,6 @@
 import datetime
 import logging
 
-import pandas as pd
 from sqlalchemy import BigInteger, Date, inspect, String
 from sqlalchemy.sql import column
 
@@ -29,11 +28,11 @@ from superset.sql_parse import Table
 from superset.utils.core import DatasourceType
 
 from .helpers import (
-    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
     misc_dash_slices,
+    read_example_data,
 )
 
 logger = logging.getLogger(__name__)
@@ -49,8 +48,9 @@ def load_country_map_data(only_metadata: bool = False, force: bool = False) -> None:
     table_exists = database.has_table(Table(tbl_name, schema))
 
     if not only_metadata and (not table_exists or force):
-        url = get_example_url("birth_france_data_for_country_map.csv")
-        data = pd.read_csv(url, encoding="utf-8")
+        data = read_example_data(
+            "birth_france_data_for_country_map.csv", encoding="utf-8"
+        )
         data["dttm"] = datetime.datetime.now().date()
         data.to_sql(
             tbl_name,
diff --git a/superset/examples/energy.py b/superset/examples/energy.py
index 1b2bfa72a204c405df767ceeda05dfdb6d03e8e3..9dfd199731c4c245bffbac10e9eebcf141e61ab2 100644
--- a/superset/examples/energy.py
+++ b/superset/examples/energy.py
@@ -17,7 +17,6 @@
 import logging
 import textwrap
 
-import pandas as pd
 from sqlalchemy import Float, inspect, String
 from sqlalchemy.sql import column
 
@@ -29,11 +28,11 @@ from superset.sql_parse import Table
 from superset.utils.core import DatasourceType
 
 from .helpers import (
-    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
     misc_dash_slices,
+    read_example_data,
 )
 
 logger = logging.getLogger(__name__)
@@ -51,8 +50,7 @@ def load_energy(
     table_exists = database.has_table(Table(tbl_name, schema))
 
     if not only_metadata and (not table_exists or force):
-        url = get_example_url("energy.json.gz")
-        pdf = pd.read_json(url, compression="gzip")
+        pdf = read_example_data("energy.json.gz", compression="gzip")
         pdf = pdf.head(100) if sample else pdf
         pdf.to_sql(
             tbl_name,
diff --git a/superset/examples/flights.py b/superset/examples/flights.py
index 81a31bb970aec5f057cf639edb55f4b872308ad1..4d8b04e42a0091c74d3648b712a7d18cb6d1d161 100644
--- a/superset/examples/flights.py
+++ b/superset/examples/flights.py
@@ -23,7 +23,7 @@ import superset.utils.database as database_utils
 from superset import db
 from superset.sql_parse import Table
 
-from .helpers import get_example_url, get_table_connector_registry
+from .helpers import get_table_connector_registry, read_example_data
 
 logger = logging.getLogger(__name__)
 
@@ -37,12 +37,14 @@ def load_flights(only_metadata: bool = False, force: bool = False) -> None:
     table_exists = database.has_table(Table(tbl_name, schema))
 
     if not only_metadata and (not table_exists or force):
-        flight_data_url = get_example_url("flight_data.csv.gz")
-        pdf = pd.read_csv(flight_data_url, encoding="latin-1", compression="gzip")
+        pdf = read_example_data(
+            "flight_data.csv.gz", encoding="latin-1", compression="gzip"
+        )
 
         # Loading airports info to join and get lat/long
-        airports_url = get_example_url("airports.csv.gz")
-        airports = pd.read_csv(airports_url, encoding="latin-1", compression="gzip")
+        airports = read_example_data(
+            "airports.csv.gz", encoding="latin-1", compression="gzip"
+        )
         airports = airports.set_index("IATA_CODE")
 
         pdf[  # pylint: disable=unsupported-assignment-operation,useless-suppression
diff --git a/superset/examples/helpers.py b/superset/examples/helpers.py
index 908142ec778e38aa4164953e1299a6ed6999a426..c0673a8f77abebb212870aa500cd40816af065f5 100644
--- a/superset/examples/helpers.py
+++ b/superset/examples/helpers.py
@@ -43,7 +43,11 @@ Environment knobs
 from __future__ import annotations
 
 import os
+import time
 from typing import Any
+from urllib.error import HTTPError
+
+import pandas as pd
 
 from superset import app, db
 from superset.connectors.sqla.models import SqlaTable
@@ -119,3 +123,33 @@ def get_example_url(filepath: str) -> str:
     paths like ``datasets/examples/slack/messages.csv``.
     """
     return f"{BASE_URL}{filepath}"
+
+
+def read_example_data(
+    filepath: str,
+    max_attempts: int = 5,
+    wait_seconds: float = 60,
+    **kwargs: Any,
+) -> pd.DataFrame:
+    """Load CSV or JSON from example data mirror with retry/backoff."""
+    url = get_example_url(filepath)
+    # Treat .json and .json.gz as JSON; everything else is read as CSV.
+    is_json = filepath.endswith(".json") or filepath.endswith(".json.gz")
+
+    for attempt in range(1, max_attempts + 1):
+        try:
+            if is_json:
+                return pd.read_json(url, **kwargs)
+            return pd.read_csv(url, **kwargs)
+        except HTTPError as e:
+            if e.code == 429 and attempt < max_attempts:
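+                # Exponential backoff: sleep wait_seconds * 2 ** (attempt - 1) seconds.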
", + f"Retrying in {sleep_time:.1f}s ", + f"(attempt {attempt}/{max_attempts})...", + ) + time.sleep(sleep_time) + else: + raise diff --git a/superset/examples/long_lat.py b/superset/examples/long_lat.py index 87bcbd9fdc5127b4002de7c1ec2d5a69eceb4aa1..9108bb52c5d55f500463fe91fa57f473dfba99ec 100644 --- a/superset/examples/long_lat.py +++ b/superset/examples/long_lat.py @@ -19,7 +19,6 @@ import logging import random import geohash -import pandas as pd from sqlalchemy import DateTime, Float, inspect, String import superset.utils.database as database_utils @@ -29,11 +28,11 @@ from superset.sql_parse import Table from superset.utils.core import DatasourceType from .helpers import ( - get_example_url, get_slice_json, get_table_connector_registry, merge_slice, misc_dash_slices, + read_example_data, ) logger = logging.getLogger(__name__) @@ -48,8 +47,9 @@ def load_long_lat_data(only_metadata: bool = False, force: bool = False) -> None table_exists = database.has_table(Table(tbl_name, schema)) if not only_metadata and (not table_exists or force): - url = get_example_url("san_francisco.csv.gz") - pdf = pd.read_csv(url, encoding="utf-8", compression="gzip") + pdf = read_example_data( + "san_francisco.csv.gz", encoding="utf-8", compression="gzip" + ) start = datetime.datetime.now().replace( hour=0, minute=0, second=0, microsecond=0 ) diff --git a/superset/examples/multiformat_time_series.py b/superset/examples/multiformat_time_series.py index af357385f97f5a5af44bff04ec813eb33b72fe73..e788c19ee8757fb591e2bc3265d0f5411aea89e6 100644 --- a/superset/examples/multiformat_time_series.py +++ b/superset/examples/multiformat_time_series.py @@ -27,11 +27,11 @@ from superset.utils.core import DatasourceType from ..utils.database import get_example_database from .helpers import ( - get_example_url, get_slice_json, get_table_connector_registry, merge_slice, misc_dash_slices, + read_example_data, ) logger = logging.getLogger(__name__) @@ -48,8 +48,10 @@ def load_multiformat_time_series( # pylint: disable=too-many-locals table_exists = database.has_table(Table(tbl_name, schema)) if not only_metadata and (not table_exists or force): - url = get_example_url("multiformat_time_series.json.gz") - pdf = pd.read_json(url, compression="gzip") + pdf = read_example_data( + "multiformat_time_series.json.gz", compression="gzip" + ) + # TODO(bkyryliuk): move load examples data into the pytest fixture if database.backend == "presto": pdf.ds = pd.to_datetime(pdf.ds, unit="s") diff --git a/superset/examples/paris.py b/superset/examples/paris.py index 56a6affe432b7f6cf968ae9cd55e90765b934358..002b129734f2c17190ec8d91311b1897dddb7945 100644 --- a/superset/examples/paris.py +++ b/superset/examples/paris.py @@ -17,7 +17,6 @@ import logging -import pandas as pd from sqlalchemy import inspect, String, Text import superset.utils.database as database_utils @@ -25,7 +24,7 @@ from superset import db from superset.sql_parse import Table from superset.utils import json -from .helpers import get_example_url, get_table_connector_registry +from .helpers import get_table_connector_registry, read_example_data logger = logging.getLogger(__name__) @@ -38,8 +37,7 @@ def load_paris_iris_geojson(only_metadata: bool = False, force: bool = False) -> table_exists = database.has_table(Table(tbl_name, schema)) if not only_metadata and (not table_exists or force): - url = get_example_url("paris_iris.json.gz") - df = pd.read_json(url, compression="gzip") + df = read_example_data("paris_iris.json.gz", compression="gzip") df["features"] = 
         df["features"] = df.features.map(json.dumps)
 
         df.to_sql(
diff --git a/superset/examples/random_time_series.py b/superset/examples/random_time_series.py
index f473565235ae90edd2e6e381cf8ecd8fe6377b2a..47ffd1dcd19d34a7ebc2761a8ce0f6d0098e94f7 100644
--- a/superset/examples/random_time_series.py
+++ b/superset/examples/random_time_series.py
@@ -26,10 +26,10 @@ from superset.sql_parse import Table
 from superset.utils.core import DatasourceType
 
 from .helpers import (
-    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
+    read_example_data,
 )
 
 logger = logging.getLogger(__name__)
@@ -46,8 +46,7 @@ def load_random_time_series_data(
     table_exists = database.has_table(Table(tbl_name, schema))
 
     if not only_metadata and (not table_exists or force):
-        url = get_example_url("random_time_series.json.gz")
-        pdf = pd.read_json(url, compression="gzip")
+        pdf = read_example_data("random_time_series.json.gz", compression="gzip")
         if database.backend == "presto":
             pdf.ds = pd.to_datetime(pdf.ds, unit="s")
             pdf.ds = pdf.ds.dt.strftime("%Y-%m-%d %H:%M%:%S")
diff --git a/superset/examples/sf_population_polygons.py b/superset/examples/sf_population_polygons.py
index 239ab04a17a6d3189eee57138fd0344210b42226..b89bc98c5a1ccef23e8fc02a7d00ade7fa4b7e22 100644
--- a/superset/examples/sf_population_polygons.py
+++ b/superset/examples/sf_population_polygons.py
@@ -17,7 +17,6 @@
 
 import logging
 
-import pandas as pd
 from sqlalchemy import BigInteger, Float, inspect, Text
 
 import superset.utils.database as database_utils
@@ -25,7 +24,7 @@ from superset import db
 from superset.sql_parse import Table
 from superset.utils import json
 
-from .helpers import get_example_url, get_table_connector_registry
+from .helpers import get_table_connector_registry, read_example_data
 
 logger = logging.getLogger(__name__)
 
@@ -40,8 +39,7 @@ def load_sf_population_polygons(
     table_exists = database.has_table(Table(tbl_name, schema))
 
     if not only_metadata and (not table_exists or force):
-        url = get_example_url("sf_population.json.gz")
-        df = pd.read_json(url, compression="gzip")
+        df = read_example_data("sf_population.json.gz", compression="gzip")
         df["contour"] = df.contour.map(json.dumps)
 
         df.to_sql(
diff --git a/superset/examples/world_bank.py b/superset/examples/world_bank.py
index b683d8fb911df41d25b73baa435005cc31e56b4b..16b8b9854ca7572243df57894cd2b2faf5c58a39 100644
--- a/superset/examples/world_bank.py
+++ b/superset/examples/world_bank.py
@@ -25,12 +25,12 @@ import superset.utils.database
 from superset import app, db
 from superset.connectors.sqla.models import BaseDatasource, SqlMetric
 from superset.examples.helpers import (
-    get_example_url,
     get_examples_folder,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
     misc_dash_slices,
+    read_example_data,
     update_slice_ids,
 )
 from superset.models.dashboard import Dashboard
@@ -55,8 +55,7 @@ def load_world_bank_health_n_pop(  # pylint: disable=too-many-locals
     table_exists = database.has_table(Table(tbl_name, schema))
 
     if not only_metadata and (not table_exists or force):
-        url = get_example_url("countries.json.gz")
-        pdf = pd.read_json(url, compression="gzip")
+        pdf = read_example_data("countries.json.gz", compression="gzip")
         pdf.columns = [col.replace(".", "_") for col in pdf.columns]
         if database.backend == "presto":
             pdf.year = pd.to_datetime(pdf.year)