Data¶

`data` ¶

Example datasets and data getting utilities.

This module contains functions for either reading in sample datasets or pulling data from external data providers.

`coordinates_from_incidence(incidence)` ¶

Extract model coordinates from an incidence pandas DataFrame.

Parameters:

Name	Type	Description	Default
`incidence`	`DataFrame`	A formatted incidence pandas DataFrame.	required

Returns:

Type	Description
`dict[Literal['season', 'region', 'strata', 'observation'], list[str]]`	A dictionary of coordinates that can be provided to xarray.

Source code in src/vaxflux/data.py

def coordinates_from_incidence(
    incidence: pd.DataFrame,
) -> dict[Literal["season", "region", "strata", "observation"], list[str]]:
    """
    Build the model coordinate labels described by an incidence DataFrame.

    Args:
        incidence: A formatted incidence pandas DataFrame containing the columns
            'season', 'region', and 'strata'.

    Returns:
        A dictionary that can be provided to xarray. The 'season', 'region', and
        'strata' entries hold the sorted unique values of the matching column, and
        the 'observation' entry holds one stringified positional index per row.

    """
    dimensions: tuple[Literal["season", "region", "strata"], ...] = (
        "season",
        "region",
        "strata",
    )
    coords: dict[Literal["season", "region", "strata", "observation"], list[str]] = {}
    for dimension in dimensions:
        # Sorted unique labels give a stable coordinate order for each dimension.
        coords[dimension] = np.sort(incidence[dimension].unique()).tolist()
    # Observations are labelled by row position ("0", "1", ...), not by the index.
    coords["observation"] = np.arange(len(incidence)).astype(str).tolist()
    return coords

`create_logistic_sample_dataset(parameters, time, epsilon, error='gamma', seed=0)` ¶

Create a synthetic logistic incidence dataset.

Parameters:

Name	Type	Description	Default
`parameters`	`DataFrame`	A pandas DataFrame with the columns 'season', 'strata', 'region', 'm', 'r', and 's'.	required
`time`	`NDArray[float64]`	A numpy array of the time steps to generate a dataset for.	required
`epsilon`	`float`	The standard deviation to use in the resulting observations.	required
`error`	`Literal['gamma', 'normal'] \| None`	The error distribution to use in generating the observed incidences or `None` for no noise added to the dataset.	`'gamma'`
`seed`	`int`	An integer corresponding to the random seed to use when generating a dataset for consistency across calls.	`0`

Returns:

Type	Description
`DataFrame`	A formatted incidence dataset.

Examples:

>>> import numpy as np
>>> import pandas as pd
>>> from vaxflux.data import create_logistic_sample_dataset
>>> parameters = pd.DataFrame(
...     data={
...         "season": ["2023/24"],
...         "strata": ["All stratas"],
...         "region": ["All regions"],
...         "m": [0.5],
...         "r": [0.3],
...         "s": [20.0],
...     },
... )
>>> parameters
    season       strata       region    m    r     s
0  2023/24  All stratas  All regions  0.5  0.3  20.0
>>> time = np.arange(40, step=3)
>>> create_logistic_sample_dataset(parameters, time, 0.001)
     season       strata       region  time     value
0   2023/24  All stratas  All regions   0.0  0.000128
1   2023/24  All stratas  All regions   3.0  0.001984
2   2023/24  All stratas  All regions   6.0  0.005459
3   2023/24  All stratas  All regions   9.0  0.007348
4   2023/24  All stratas  All regions  12.0  0.014066
5   2023/24  All stratas  All regions  15.0  0.027984
6   2023/24  All stratas  All regions  18.0  0.044186
7   2023/24  All stratas  All regions  21.0  0.046088
8   2023/24  All stratas  All regions  24.0  0.033544
9   2023/24  All stratas  All regions  27.0  0.019666
10  2023/24  All stratas  All regions  30.0  0.008194
11  2023/24  All stratas  All regions  33.0  0.001570
12  2023/24  All stratas  All regions  36.0  0.001995
13  2023/24  All stratas  All regions  39.0  0.000210

Source code in src/vaxflux/data.py

def create_logistic_sample_dataset(
    parameters: pd.DataFrame,
    time: npt.NDArray[np.float64],
    epsilon: float,
    error: Literal["gamma", "normal"] | None = "gamma",
    seed: int = 0,
) -> pd.DataFrame:
    """
    Generate a synthetic incidence dataset from logistic curve parameters.

    One block of observations is produced per row of `parameters`, evaluated at
    every entry of `time`.

    Args:
        parameters: A pandas DataFrame with the columns 'season', 'strata', 'region',
            'm', 'r', and 's'.
        time: A numpy array of the time steps to generate a dataset for.
        epsilon: The standard deviation to use in the resulting observations.
        error: The error distribution to use in generating the observed incidences.
            Any value other than 'gamma' or 'normal' (including `None`) adds no
            noise to the dataset.
        seed: An integer corresponding to the random seed to use when generating a
            dataset for consistency across calls.

    Returns:
        A formatted incidence dataset with the columns 'season', 'strata',
        'region', 'time', and 'value'.

    Examples:
        >>> import numpy as np
        >>> import pandas as pd
        >>> from vaxflux.data import create_logistic_sample_dataset
        >>> parameters = pd.DataFrame(
        ...     data={
        ...         "season": ["2023/24"],
        ...         "strata": ["All stratas"],
        ...         "region": ["All regions"],
        ...         "m": [0.5],
        ...         "r": [0.3],
        ...         "s": [20.0],
        ...     },
        ... )
        >>> parameters
            season       strata       region    m    r     s
        0  2023/24  All stratas  All regions  0.5  0.3  20.0
        >>> time = np.arange(40, step=3)
        >>> create_logistic_sample_dataset(parameters, time, 0.001)
             season       strata       region  time     value
        0   2023/24  All stratas  All regions   0.0  0.000128
        1   2023/24  All stratas  All regions   3.0  0.001984
        2   2023/24  All stratas  All regions   6.0  0.005459
        3   2023/24  All stratas  All regions   9.0  0.007348
        4   2023/24  All stratas  All regions  12.0  0.014066
        5   2023/24  All stratas  All regions  15.0  0.027984
        6   2023/24  All stratas  All regions  18.0  0.044186
        7   2023/24  All stratas  All regions  21.0  0.046088
        8   2023/24  All stratas  All regions  24.0  0.033544
        9   2023/24  All stratas  All regions  27.0  0.019666
        10  2023/24  All stratas  All regions  30.0  0.008194
        11  2023/24  All stratas  All regions  33.0  0.001570
        12  2023/24  All stratas  All regions  36.0  0.001995
        13  2023/24  All stratas  All regions  39.0  0.000210

    """
    random_state = np.random.RandomState(seed)
    n_steps = len(time)
    records = cast("list[ParametersRow]", parameters.to_dict(orient="records"))

    frames = []
    for record in records:
        # Mean incidence: the derivative of a logistic curve with rate 'r' and
        # shift 's', scaled by expit('m').
        decay = np.exp(-record["r"] * (time - record["s"]))
        mean = expit(record["m"]) * record["r"] * decay * np.power(1.0 + decay, -2.0)

        if error == "gamma":
            # Shape/scale chosen so the draw has mean `mean` and standard
            # deviation `epsilon`.
            observed = random_state.gamma(
                shape=np.power(mean / epsilon, 2.0),
                scale=(epsilon**2.0) / mean,
            )
        elif error == "normal":
            # Normal noise can go negative, so floor the observations at zero.
            observed = np.maximum(random_state.normal(loc=mean, scale=epsilon), 0.0)
        else:
            observed = mean

        frame = pd.DataFrame(
            data={
                "season": n_steps * [record["season"]],
                "strata": n_steps * [record["strata"]],
                "region": n_steps * [record["region"]],
                "time": time,
                "incidence": observed,
            },
        )
        frames.append(frame)

    combined = format_incidence_dataframe(pd.concat(frames, ignore_index=True))
    return combined.rename(columns={"incidence": "value"})

`format_incidence_dataframe(incidence)` ¶

Format an incidence pandas DataFrame.

Parameters:

Name	Type	Description	Default
`incidence`	`DataFrame`	A DataFrame with at least the columns 'time' and 'incidence' and optionally 'season', 'strata', 'region'.	required

Returns:

Type	Description
`DataFrame`	A pandas DataFrame with the columns 'season', 'strata', 'region', 'time', and 'incidence'.

Examples:

>>> import pandas as pd
>>> df = pd.DataFrame(
...     data={
...         "time": [1.0, 1.5, 2.0],
...         "incidence": [0.01, 0.02, 0.015],
...     }
... )
>>> df
   time  incidence
0   1.0      0.010
1   1.5      0.020
2   2.0      0.015
>>> format_incidence_dataframe(df)
        season       strata       region  time  incidence
0  All Seasons  All Stratas  All Regions   1.0      0.010
1  All Seasons  All Stratas  All Regions   1.5      0.020
2  All Seasons  All Stratas  All Regions   2.0      0.015

Source code in src/vaxflux/data.py

def format_incidence_dataframe(incidence: pd.DataFrame) -> pd.DataFrame:
    """
    Format an incidence pandas DataFrame.

    The input is copied, never modified. The 'time' and 'incidence' columns are
    converted to float64, and the 'season', 'strata', and 'region' columns are
    converted to the pandas "string" dtype. Any of those three that is absent is
    filled with a default label ('All Seasons', 'All Stratas', 'All Regions').

    Args:
        incidence: A DataFrame with at least the columns 'time' and 'incidence' and
            optionally 'season', 'strata', 'region'.

    Returns:
        A pandas DataFrame with the columns 'season', 'strata', 'region', 'time',
        and 'incidence', in that order, keeping the index of the input.

    Raises:
        ValueError: If `incidence` is missing the 'time' or 'incidence' column.

    Examples:
        >>> import pandas as pd
        >>> df = pd.DataFrame(
        ...     data={
        ...         "time": [1.0, 1.5, 2.0],
        ...         "incidence": [0.01, 0.02, 0.015],
        ...     }
        ... )
        >>> df
           time  incidence
        0   1.0      0.010
        1   1.5      0.020
        2   2.0      0.015
        >>> format_incidence_dataframe(df)
                season       strata       region  time  incidence
        0  All Seasons  All Stratas  All Regions   1.0      0.010
        1  All Seasons  All Stratas  All Regions   1.5      0.020
        2  All Seasons  All Stratas  All Regions   2.0      0.015

    """
    incidence = incidence.copy()
    incidence_columns = set(incidence.columns.tolist())

    if missing_columns := {"time", "incidence"} - incidence_columns:
        # Sort so the message is deterministic when both columns are missing.
        msg = (
            "The `incidence` provided is missing required columns: "
            f"""'{"', '".join(sorted(missing_columns))}'."""
        )
        raise ValueError(
            msg,
        )

    for column in ("time", "incidence"):
        incidence[column] = pd.to_numeric(incidence[column]).astype("float64")

    for column in ("season", "strata", "region"):
        if column not in incidence_columns:
            # Build the default on the frame's own index. Column assignment aligns
            # on index labels, so a Series with a fresh RangeIndex would turn into
            # missing values whenever `incidence` is not indexed 0..n-1.
            incidence[column] = pd.Series(
                data=len(incidence) * [f"All {column.capitalize()}s"],
                index=incidence.index,
                dtype="string",
            )
        else:
            incidence[column] = incidence[column].astype("string")

    return incidence[["season", "strata", "region", "time", "incidence"]]

`get_ncird_nis_frvm_flu_vaccination_coverage(*, include_age_groups=False)` ¶

Get NCIRD NIS/FRVM flu vaccination coverage formatted for VaxfluxModel.

This uses the CDC dataset [Weekly Influenza Vaccination Coverage and Intent for Vaccination Among Adults 18 Years and Older](https://data.cdc.gov/Flu-Vaccinations/Weekly-Influenza-Vaccination-Coverage-and-Intent-f/sw5n-wg2p/about_data).

Parameters:

Name	Type	Description	Default
`include_age_groups`	`bool`	Whether to return an age-stratified output. When `False`, only overall national rows are returned.	`False`

Returns:

Type	Description
`DataFrame`	A pandas DataFrame with columns required by `VaxfluxModel.add_observations`: `season`, `season_start_date`, `season_end_date`, `start_date`, `end_date`, `report_date`, `type`, and `value`. Includes `age` when `include_age_groups=True`.

Source code in src/vaxflux/data.py

def get_ncird_nis_frvm_flu_vaccination_coverage(
    *,
    include_age_groups: bool = False,
) -> pd.DataFrame:
    """
    Download NCIRD NIS/FRVM flu vaccination coverage for use with `VaxfluxModel`.

    The data comes from the CDC dataset "Weekly Influenza Vaccination Coverage and
    Intent for Vaccination Among Adults 18 Years and Older":
    https://data.cdc.gov/Flu-Vaccinations/Weekly-Influenza-Vaccination-Coverage-and-Intent-f/sw5n-wg2p/about_data

    Only national rows with the "Up-to-date" indicator label are kept.

    Args:
        include_age_groups: Whether to return an age-stratified output. When
            `True`, rows for the "18-49 years", "50-64 years", and "65+ years"
            age groups are kept. When `False`, only overall national rows are
            returned.

    Returns:
        A pandas DataFrame with columns required by
        `VaxfluxModel.add_observations`: `season`, `season_start_date`,
        `season_end_date`, `start_date`, `end_date`, `report_date`, `type`, and
        `value`. Includes `age` when `include_age_groups=True`.

    Raises:
        ValueError: If no rows remain after applying the filters.
    """
    requested_at = datetime.now(UTC)
    cache_bust = time.mktime(requested_at.timetuple())
    date = requested_at.strftime("%Y%m%d")
    url = (
        "https://data.cdc.gov/api/views/sw5n-wg2p/rows.csv?fourfour=sw5n-wg2p"
        f"&cacheBust={cache_bust}&date={date}&accessType=DOWNLOAD"
    )
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    raw = pd.read_csv(io.BytesIO(response.content))

    # Normalise headers to snake_case: trim, lower-case, collapse every run of
    # non-alphanumeric characters into "_", then drop leading/trailing "_".
    headers = raw.columns.str.strip().str.lower()
    headers = headers.str.replace(r"[^a-z0-9]+", "_", regex=True)
    raw.columns = headers.str.strip("_")

    is_national = raw["geographic_level"] == "National"
    is_up_to_date = raw["indicator_label"] == "Up-to-date"
    selected = raw[is_national & is_up_to_date].copy()

    if not include_age_groups:
        selected = selected[selected["demographic_level"] == "Overall"].copy()
        group_cols = ["season"]
    else:
        age_categories = ("18-49 years", "50-64 years", "65+ years")
        is_age_row = selected["demographic_level"] == "Age"
        is_known_age = selected["demographic_name"].isin(age_categories)
        selected = selected[is_age_row & is_known_age].copy()
        selected["age"] = selected["demographic_name"].astype("string")
        group_cols = ["season", "age"]

    if selected.empty:
        msg = "No rows found for requested NCIRD NIS/FRVM filters."
        raise ValueError(msg)

    return _prepare_nis_frvm_observations(
        selected,
        group_cols=group_cols,
        include_age_groups=include_age_groups,
    )

`get_ncird_weekly_cumulative_vaccination_coverage()` ¶

Get weekly cumulative vaccination coverage data provided by NCIRD.

More information about this data can be found on the CDC data page for this dataset: [Weekly Cumulative Influenza Vaccination Coverage, Adults 18 and Older, United States](https://data.cdc.gov/Flu-Vaccinations/Weekly-Cumulative-Influenza-Vaccination-Coverage-A/2v3t-r3np/about_data).

Returns:

Type	Description
`DataFrame`	A pandas DataFrame with the columns 'geographic_level', 'geographic_name', 'demographic_level', 'demographic_name', 'indicator_label', 'indicator_category_label', 'month_week', 'nd_weekly_estimate', 'ci_half_width_95pct', 'n_unweighted', 'suppression_flag', 'current_season_week_ending', 'influenza_season', 'legend', 'indicator_category_label_sort', 'demographic_level_sort', 'demographic_name_sort', 'geographic_sort', 'season_sort', 'legend_sort', '95_ci_lower', and '95_ci_upper'.

Source code in src/vaxflux/data.py

def get_ncird_weekly_cumulative_vaccination_coverage() -> pd.DataFrame:
    """
    Get weekly cumulative vaccination coverage data provided by NCIRD.

    More information about this data can be found on the CDC data page for this
    dataset: `Weekly Cumulative Influenza Vaccination Coverage, Adults 18 and Older, United States <https://data.cdc.gov/Flu-Vaccinations/Weekly-Cumulative-Influenza-Vaccination-Coverage-A/2v3t-r3np/about_data>`_.

    The CSV is downloaded on every call. Columns are renamed to snake_case, the
    '95 CI (%)' text column is split into numeric lower/upper bounds, and the
    'week_ending' column (noted as empty in the source data) is dropped.

    Returns:
        A pandas DataFrame with the columns 'geographic_level', 'geographic_name',
        'demographic_level', 'demographic_name', 'indicator_label',
        'indicator_category_label', 'month_week', 'nd_weekly_estimate',
        'ci_half_width_95pct', 'n_unweighted', 'suppression_flag',
        'current_season_week_ending', 'influenza_season', 'legend',
        'indicator_category_label_sort', 'demographic_level_sort',
        'demographic_name_sort', 'geographic_sort', 'season_sort',
        'legend_sort', '95_ci_lower', and '95_ci_upper'.

    Raises:
        requests.HTTPError: If the CDC endpoint responds with an error status.

    """  # noqa: E501
    now = datetime.now(UTC)
    cache_bust = time.mktime(now.timetuple())
    date = now.strftime("%Y%m%d")
    url = (
        "https://data.cdc.gov/api/views/2v3t-r3np/rows.csv?fourfour=2v3t-r3np"
        f"&cacheBust={cache_bust}&date={date}&accessType=DOWNLOAD"
    )
    # Get and parse the data
    resp = requests.get(url, timeout=30)
    # Fail fast on an HTTP error rather than trying to parse an error page as CSV.
    resp.raise_for_status()
    ncird_df = pd.read_csv(
        io.BytesIO(resp.content),
        dtype={
            "Geographic_Level": "string",
            "Geographic_Name": "string",
            "Demographic_Level": "string",
            "Demographic_Name": "string",
            "Indicator_Label": "string",
            "Indicator_Category_Label": "string",
            "Month_Week": "string",
            "Week_Ending": "string",  # empty
            "ND_Weekly_Estimate": "Float64",
            "CI_Half_width_95pct": "Float64",
            "n_unweighted": "UInt64",
            "Suppression_Flag": "boolean",  # 1/0 for True/False
            "Current_Season_Week_Ending": "string",  # datetime string
            "Influenza_Season": "string",
            "Legend": "string",
            "95 CI (%)": "string",  # 2 numbers with a dash
            "Indicator_Category_Label_Sort": "UInt64",
            "Demographic_Level_Sort": "UInt64",
            "Demographic_Name_Sort": "UInt64",
            "Geographic_Sort": "UInt64",
            "Season_Sort": "UInt64",
            "Legend_Sort": "UInt64",
        },
    )
    # Format the data
    ncird_df = ncird_df.rename(
        columns={
            "Geographic_Level": "geographic_level",
            "Geographic_Name": "geographic_name",
            "Demographic_Level": "demographic_level",
            "Demographic_Name": "demographic_name",
            "Indicator_Label": "indicator_label",
            "Indicator_Category_Label": "indicator_category_label",
            "Month_Week": "month_week",
            "Week_Ending": "week_ending",
            "ND_Weekly_Estimate": "nd_weekly_estimate",
            "CI_Half_width_95pct": "ci_half_width_95pct",
            "n_unweighted": "n_unweighted",
            "Suppression_Flag": "suppression_flag",
            "Current_Season_Week_Ending": "current_season_week_ending",
            "Influenza_Season": "influenza_season",
            "Legend": "legend",
            "95 CI (%)": "95_ci",
            "Indicator_Category_Label_Sort": "indicator_category_label_sort",
            "Demographic_Level_Sort": "demographic_level_sort",
            "Demographic_Name_Sort": "demographic_name_sort",
            "Geographic_Sort": "geographic_sort",
            "Season_Sort": "season_sort",
            "Legend_Sort": "legend_sort",
        },
    )
    # Special handling for select columns
    ncird_df["suppression_flag"] = ncird_df["suppression_flag"].astype("boolean")
    # The timestamps carry an AM/PM marker, so the hour must be parsed with %I
    # (12-hour clock). With %H the %p marker is matched but ignored, which would
    # turn "12:00:00 AM" into noon instead of midnight.
    ncird_df["current_season_week_ending"] = pd.to_datetime(
        ncird_df["current_season_week_ending"],
        format="%m/%d/%Y %I:%M:%S %p",
    )
    # "95 CI (%)" holds two numbers separated by a dash; split on the first dash.
    ncird_df[["95_ci_lower", "95_ci_upper"]] = ncird_df["95_ci"].str.split(
        "-",
        n=1,
        expand=True,
    )
    ncird_df["95_ci_lower"] = pd.to_numeric(ncird_df["95_ci_lower"].str.strip()).astype(
        "Float64",
    )
    ncird_df["95_ci_upper"] = pd.to_numeric(ncird_df["95_ci_upper"].str.strip()).astype(
        "Float64",
    )
    return ncird_df.drop(columns=["week_ending", "95_ci"])

`sample_dataset(curve, season_ranges, date_ranges, covariate_categories, parameters, epsilon, noise='gamma', random_seed=1)` ¶

Generate a sample dataset from the given incidence curve.

Parameters:

Name	Type	Description	Default
`curve`	`Curve`	The incidence curve to sample from.	required
`season_ranges`	`list[SeasonRange]`	The season ranges to sample from.	required
`date_ranges`	`list[DateRange]`	The date ranges to generate observations for.	required
`covariate_categories`	`list[CovariateCategories]`	The covariate categories to sample from.	required
`parameters`	`list[tuple[str \| float, ...]]`	The parameters to sample from. List of tuples with the first element being the curve parameter name, the second element being the season, and the following being the covariate categories and the last element being the value.	required
`epsilon`	`float`	The standard deviation to use in the resulting observations.	required
`noise`	`Literal['gamma', 'normal']`	The noise distribution to apply to the daily incidence values.	`'gamma'`
`random_seed`	`int`	The random seed to use for reproducibility.	`1`

Returns:

Type	Description
`DataFrame`	A pandas DataFrame of observations with the columns 'season', 'season_start_date', 'season_end_date', 'start_date', 'end_date', 'report_date', 'type', and 'value', as well as one column for each covariate name in the covariate categories.

Source code in src/vaxflux/data.py

def sample_dataset(
    curve: Curve,
    season_ranges: list[SeasonRange],
    date_ranges: list[DateRange],
    covariate_categories: list[CovariateCategories],
    parameters: list[tuple[str | float, ...]],
    epsilon: float,
    noise: Literal["gamma", "normal"] = "gamma",
    random_seed: int = 1,
) -> pd.DataFrame:
    """
    Draw a synthetic observation dataset from an incidence curve.

    One observation is produced for every pairing of a date range with a
    combination of covariate categories. Its value is the sum, over the days in
    the date range, of the (optionally noised) daily values returned by
    `curve.prevalence_difference`, each clipped to the interval [0, 1].

    Args:
        curve: The incidence curve to sample from.
        season_ranges: The season ranges to sample from.
        date_ranges: The date ranges to generate observations for.
        covariate_categories: The covariate categories to sample from.
        parameters: The parameters to sample from. List of tuples with the first
            element being the curve parameter name, the second element being the
            season, the following elements being the covariate categories, and the
            last element being the value. A tuple naming only the season acts as a
            fallback for combinations without a more specific entry.
        epsilon: The standard deviation to use in the resulting observations. No
            noise is applied unless this is greater than zero.
        noise: The noise distribution to apply to the daily incidence values.
        random_seed: The random seed to use for reproducibility.

    Returns:
        A pandas DataFrame of observations with the columns 'season',
        'season_start_date', 'season_end_date', 'start_date', 'end_date',
        'report_date', 'type', and 'value', as well as one column for each
        covariate name in the covariate categories.

    Raises:
        ValueError: If `epsilon` is positive and `noise` is not 'gamma' or
            'normal'.

    """
    rng = np.random.default_rng(seed=random_seed)
    seasons_by_name = {entry.season: entry for entry in season_ranges}
    # Without covariates, still emit one record per date range.
    combinations = _covariate_categories_product(covariate_categories) or [{}]

    rows = []
    for date_range in date_ranges:
        for combination in combinations:
            season_range = seasons_by_name[date_range.season]

            # Resolve the curve parameters for this season/covariate combination.
            specific_key = [season_range.season, *combination.values()]
            season_key = [season_range.season]
            curve_kwargs = {}
            for parameter in parameters:
                key = list(parameter)[1:-1]
                if key == specific_key:
                    # An exact match always wins over a season-only fallback.
                    curve_kwargs[str(parameter[0])] = np.array(float(parameter[-1]))
                elif key == season_key:
                    curve_kwargs.setdefault(
                        str(parameter[0]),
                        np.array(float(parameter[-1])),
                    )

            # Daily intervals [t, t + 1), measured in days since the season start
            # and covering the inclusive date range.
            first_day = (date_range.start_date - season_range.start_date).days
            day_after_last = (date_range.end_date - season_range.start_date).days + 1.0
            t0 = np.arange(int(first_day), int(day_after_last), dtype=float)
            t1 = t0 + 1.0

            increments = curve.prevalence_difference(t0, t1, **curve_kwargs)
            if hasattr(increments, "eval"):
                # The curve may hand back an object that needs an explicit
                # `.eval()` call to obtain concrete values.
                increments = increments.eval()
            daily: npt.NDArray[np.float64] = np.asarray(increments, dtype=float)

            if epsilon > 0:
                if noise == "gamma":
                    # Shape/scale chosen so each draw has mean `daily` and
                    # standard deviation `epsilon`.
                    daily = rng.gamma(
                        shape=np.power(daily / epsilon, 2.0),
                        scale=(epsilon**2.0) / daily,
                    )
                elif noise == "normal":
                    daily = rng.normal(loc=daily, scale=epsilon)
                else:
                    msg = f"Unknown noise model: {noise}."
                    raise ValueError(msg)

            daily = np.clip(daily, 0.0, 1.0)
            rows.append(
                {
                    "season": date_range.season,
                    "season_start_date": season_range.start_date,
                    "season_end_date": season_range.end_date,
                    "start_date": date_range.start_date,
                    "end_date": date_range.end_date,
                    "report_date": date_range.report_date,
                    **combination,
                    "type": "incidence",
                    "value": float(daily.sum()),
                },
            )

    observations = pd.DataFrame.from_records(rows)
    for text_column in ("season", "type"):
        observations[text_column] = observations[text_column].astype("string")
    date_columns = (
        "season_start_date",
        "season_end_date",
        "start_date",
        "end_date",
        "report_date",
    )
    for date_column in date_columns:
        observations[date_column] = pd.to_datetime(observations[date_column])
    return observations

Data¶

data ¶

coordinates_from_incidence(incidence) ¶

create_logistic_sample_dataset(parameters, time, epsilon, error='gamma', seed=0) ¶

format_incidence_dataframe(incidence) ¶

get_ncird_nis_frvm_flu_vaccination_coverage(*, include_age_groups=False) ¶

get_ncird_weekly_cumulative_vaccination_coverage() ¶

sample_dataset(curve, season_ranges, date_ranges, covariate_categories, parameters, epsilon, noise='gamma', random_seed=1) ¶

`data` ¶

`coordinates_from_incidence(incidence)` ¶

`create_logistic_sample_dataset(parameters, time, epsilon, error='gamma', seed=0)` ¶

`format_incidence_dataframe(incidence)` ¶

`get_ncird_nis_frvm_flu_vaccination_coverage(*, include_age_groups=False)` ¶

`get_ncird_weekly_cumulative_vaccination_coverage()` ¶

`sample_dataset(curve, season_ranges, date_ranges, covariate_categories, parameters, epsilon, noise='gamma', random_seed=1)` ¶