
Technoecon ETL

Extract data from REMIND, transform it into PyPSA format, and write it to files.

make_pypsa_like_costs(frames)

Translate the REMIND costs into PyPSA format for a single region.

Parameters:

    frames (dict[pd.DataFrame]): Dictionary with the REMIND data tables to be
        transformed, keyed by table name (e.g. "capex", "tech_data"). The tables
        must already be filtered to a single region. Required.

Returns: pd.DataFrame: DataFrame containing cost data for a region.

Source code in src/rpycpl/technoecon_etl.py
def make_pypsa_like_costs(
    frames: dict[pd.DataFrame],
) -> pd.DataFrame:
    """translate the REMIND costs into pypsa format for a single region.

    Args:
        frames: dictionary with the REMIND data tables to be transformed. Region-filtered
    Returns:
        pd.DataFrame: DataFrame containing cost data for a region.
    """

    # check single region or region already removed
    regions_filtered = not any(["region" in df.columns for df in frames.values()])
    if not regions_filtered and any(
        [df.region.nunique() > 1 for df in frames.values() if "region" in df.columns]
    ):
        raise Warning("The dataframes are not region-filtered. Not supported.")
    elif not regions_filtered:
        frames.update(
            {k: df.drop(columns=["region"]) for k, df in frames.items() if "region" in df.columns}
        )

    years = frames["capex"].year.unique()
    capex = transform_capex(frames["capex"])

    # transform the data
    vom = transform_vom(frames["tech_data"].query("parameter == 'omv'"))
    fom = transform_fom(frames["tech_data"].query("parameter == 'omf'"))
    lifetime = transform_lifetime(frames["tech_data"].query("parameter == 'lifetime'"))

    co2_intens = transform_co2_intensity(frames["co2_intensity"], years)
    eta = transform_efficiency(frames["eta"], years)
    fuel_costs = transform_fuels(frames["fuel_costs"])
    discount_rate = transform_discount_rate(frames["discount_r"])

    del frames

    # stitch together in pypsa format
    cost_frames = {
        "capex": capex,
        "efficiency": eta,
        "fuel": fuel_costs,
        "co2": co2_intens,
        "lifetime": lifetime,
        "vom": vom,
        "fom": fom,
        "discount_rate": discount_rate,
    }

    # TODO Can do more efficient operations with join
    # add years to table with time-indep data
    for label, frame in cost_frames.items():
        if "year" not in frame.columns:
            cost_frames[label] = expand_years(frame, capex.year.unique())
    # add missing techs for tech agnostic data
    for label, frame in cost_frames.items():
        if "technology" not in frame.columns:
            cost_frames[label] = pd.concat(
                [frame.assign(technology=tech) for tech in capex.technology.unique()]
            )
    column_order = ["technology", "year", "parameter", "value", "unit", "source"]

    # merge the dataframes for the region
    costs_remind = pd.concat(
        [frame[column_order] for frame in cost_frames.values()], axis=0
    ).reset_index(drop=True)
    costs_remind.sort_values(by=["technology", "year", "parameter"], key=key_sort, inplace=True)

    return costs_remind.query("year in @years")
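
A minimal usage sketch, assuming region-filtered REMIND exports saved as CSV files (the file names are hypothetical; the dictionary keys match the tables accessed by the function):

import pandas as pd

from rpycpl.technoecon_etl import make_pypsa_like_costs

# Hypothetical, region-filtered REMIND exports; keys match the tables used above.
frame_files = {
    "capex": "remind_capex.csv",
    "tech_data": "remind_tech_data.csv",        # rows with parameter omv / omf / lifetime
    "co2_intensity": "remind_co2_intensity.csv",
    "eta": "remind_eta.csv",
    "fuel_costs": "remind_fuel_costs.csv",
    "discount_r": "remind_discount_rate.csv",
}
frames = {name: pd.read_csv(path) for name, path in frame_files.items()}

costs_remind = make_pypsa_like_costs(frames)
costs_remind.to_csv("costs_remind.csv", index=False)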

map_to_pypsa_tech(remind_costs_formatted, pypsa_costs, mappings, weights, years=None, currency_conversion=1)

Map the REMIND technology names to PyPSA technologies using the conversions specified in the map config.

Parameters:

    remind_costs_formatted (pd.DataFrame): DataFrame containing REMIND cost data. Required.
    pypsa_costs (pd.DataFrame): DataFrame containing PyPSA cost data. Required.
    mappings (pd.DataFrame): DataFrame containing the mapping funcs and names from
        REMIND to PyPSA technologies. Required.
    weights (pd.DataFrame): DataFrame containing the weights used for the weighted
        technology baskets. Required.
    years (Iterable, optional): Years to be used. Defaults to None (use the years
        present in the REMIND data).
    currency_conversion (float, optional): Conversion factor for currency
        (PyPSA to REMIND). Defaults to 1.

Returns: pd.DataFrame: DataFrame with mapped technology names.

Source code in src/rpycpl/technoecon_etl.py
def map_to_pypsa_tech(
    remind_costs_formatted: pd.DataFrame,
    pypsa_costs: pd.DataFrame,
    mappings: pd.DataFrame,
    weights: pd.DataFrame,
    years: list | Iterable = None,
    currency_conversion: float = 1,
) -> pd.DataFrame:
    """Map the REMIND technology names to pypsa technoloies using the conversions specified in the
    map config

    Args:
        remind_costs_formatted (pd.DataFrame): DataFrame containing REMIND cost data.
        pypsa_costs (pd.DataFrame): DataFrame containing pypsa cost data.
        mappings (pd.DataFrame): DataFrame containing the mapping funcs and names from
            REMIND to pypsa technologies.
        weights (pd.DataFrame): DataFrame containing the weights.
        years (Iterable, optional): years to be used. Defaults to None (use the REMIND data years).
        currency_conversion (float, optional): conversion factor for currency (PyPSA to REMIND).
    Returns:
        pd.DataFrame: DataFrame with mapped technology names.
    """
    if years is None:
        years = remind_costs_formatted.year.unique()
    else:
        years = pd.Index(years, dtype=int)

    # direct mapping of remind
    use_remind = (
        mappings.query("mapper == 'use_remind'")
        .drop("unit", axis=1)
        .merge(
            remind_costs_formatted,
            left_on=["reference", "parameter"],
            right_on=["technology", "parameter"],
            how="left",
        )
    )
    use_remind.drop(columns=["technology"], inplace=True)

    direct_input = mappings.query("mapper == 'set_value'").rename(columns={"reference": "value"})
    direct_input = direct_input.assign(source="direct_input from coupling mapping")
    direct_input = expand_years(direct_input, years)

    # pypsa values
    from_pypsa = _use_pypsa(mappings, pypsa_costs, years, "constant", currency_conversion)
    from_pypsa.drop(columns=["technology"], inplace=True)

    # techs with proxy learning
    proxy_learning = _learn_investment_from_proxy(
        mappings, pypsa_costs, remind_costs_formatted, ref_year=years.min()
    )
    if not proxy_learning.empty:
        proxy_learning.loc[:, "further description"] = "proxy learning from REMIND"
    # TODO check weighing is by right quantities
    # weighed by remind tech basket
    weighed_basket = _weigh_remind_by(remind_costs_formatted, weights, mappings)
    # format for output
    direct_input.rename(
        columns={"PyPSA_tech": "technology", "comment": "further description"},
        inplace=True,
    )
    use_remind.rename(
        columns={"PyPSA_tech": "technology", "comment": "further description"},
        inplace=True,
    )
    from_pypsa.rename(
        columns={"PyPSA_tech": "technology", "comment": "further description"},
        inplace=True,
    )
    proxy_learning.rename(
        columns={"PyPSA_tech": "technology", "comment": "further description"},
        inplace=True,
    )
    weighed_basket.rename(
        columns={"PyPSA_tech": "technology", "comment": "further description"},
        inplace=True,
    )

    output_frames = [
        direct_input,
        use_remind,
        from_pypsa,
        proxy_learning,
        weighed_basket,
    ]
    output = pd.concat([df[OUTP_COLS] for df in output_frames if not df.empty], axis=0)
    output = output.assign(year=output.year.astype(int))
    return output.sort_values(["year", "technology", "parameter"], key=key_sort).reset_index(
        drop=True
    )
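
For example, the mapping step can be sketched as below. File names, years and the currency factor are illustrative assumptions; costs_remind is the output of make_pypsa_like_costs:

import pandas as pd

from rpycpl.technoecon_etl import map_to_pypsa_tech, validate_mappings

costs_remind = pd.read_csv("costs_remind.csv")       # output of make_pypsa_like_costs
pypsa_costs = pd.read_csv("pypsa_costs.csv")         # hypothetical PyPSA cost table
mappings = pd.read_csv("remind_pypsa_tech_map.csv")  # hypothetical mapping config
weights = pd.read_csv("remind_capacities.csv")       # hypothetical weights for tech baskets

validate_mappings(mappings)
mapped_costs = map_to_pypsa_tech(
    remind_costs_formatted=costs_remind,
    pypsa_costs=pypsa_costs,
    mappings=mappings,
    weights=weights,
    years=[2030, 2035, 2040],      # defaults to the years in costs_remind if omitted
    currency_conversion=1.0,       # PyPSA-to-REMIND currency factor, assumption
)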

transform_capex(capex)

Transform the CAPEX data from REMIND to PyPSA format.

Parameters:

    capex (pd.DataFrame): DataFrame containing REMIND capex data. Required.

Returns: pd.DataFrame: Transformed capex data.

Source code in src/rpycpl/technoecon_etl.py
def transform_capex(capex: pd.DataFrame) -> pd.DataFrame:
    """Transform the CAPEX data from REMIND to pypsa.

    Args:
        capex (pd.DataFrame): DataFrame containing REMIND capex data.
    Returns:
        pd.DataFrame: Transformed capex data.
    """
    capex.loc[:, "value"] *= UNIT_CONVERSION["capex"]
    capex = capex.assign(source="REMIND " + capex.technology, parameter="investment", unit="USD/MW")
    store_techs = STOR_TECHS
    for stor in store_techs:
        capex.loc[capex["technology"] == stor, "unit"] = "USD/MWh"
    return capex
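
The other transform_* helpers follow the same pattern. A minimal sketch with made-up technology codes and values; the columns mirror what the code above accesses:

import pandas as pd

from rpycpl.technoecon_etl import transform_capex

capex_raw = pd.DataFrame(
    {
        "technology": ["spv", "windon"],   # hypothetical REMIND technology codes
        "year": [2030, 2030],
        "value": [0.6, 1.1],               # REMIND units, scaled by UNIT_CONVERSION["capex"]
    }
)
capex = transform_capex(capex_raw)
# adds parameter="investment" and unit="USD/MW"
# (the unit becomes "USD/MWh" for technologies listed in STOR_TECHS)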

transform_co2_intensity(co2_intensity, years)

Transform the CO2 intensity data from REMIND to PyPSA format.

Parameters:

    co2_intensity (pd.DataFrame): DataFrame containing REMIND CO2 intensity data. Required.
    years (list | pd.Index): Relevant years. Required.

Returns: pd.DataFrame: Transformed CO2 intensity data.

Source code in src/rpycpl/technoecon_etl.py
def transform_co2_intensity(co2_intensity: pd.DataFrame, years: list | pd.Index) -> pd.DataFrame:
    """Transform the CO2 intensity data from REMIND to pypsa.

    Args:
        co2_intensity (pd.DataFrame): DataFrame containing REMIND CO2 intensity data.
        years (list | pd.Index): relevant years data.

    Returns:
        pd.DataFrame: Transformed CO2 intensity data.
    """
    # TODO Co2 equivalent
    co2_intens = co2_intensity.rename(
        columns={
            "carrier": "from_carrier",
            "all_enty_1": "to_carrier",
            "all_enty_2": "emission_type",
            "all_enty.1": "to_carrier",
            "all_enty.2": "emission_type",
        },
    )
    co2_intens = co2_intens.query("to_carrier == 'seel' & emission_type == 'co2' & year in @years")
    co2_intens = co2_intens.assign(
        parameter="CO2 intensity",
        unit="t_CO2/MWh_th",
        source=co2_intens.technology + " REMIND",
    )
    co2_intens.loc[:, "value"] *= UNIT_CONVERSION["co2_intensity"]
    return co2_intens
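
A sketch of the expected input shape, based on the renames and filters above. Technology codes, carriers and values here are illustrative:

import pandas as pd

from rpycpl.technoecon_etl import transform_co2_intensity

co2_raw = pd.DataFrame(
    {
        "technology": ["pc", "ngcc"],
        "carrier": ["pecoal", "pegas"],   # renamed to from_carrier
        "all_enty_1": ["seel", "seel"],   # renamed to to_carrier; only 'seel' rows kept
        "all_enty_2": ["co2", "co2"],     # renamed to emission_type; only 'co2' rows kept
        "year": [2030, 2030],
        "value": [26.1, 15.3],            # REMIND units, scaled by UNIT_CONVERSION["co2_intensity"]
    }
)
co2_intens = transform_co2_intensity(co2_raw, years=[2030])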

transform_efficiency(eff_data, years)

Transform the efficiency data from REMIND to PyPSA format.

Parameters:

    eff_data (pd.DataFrame): DataFrame containing REMIND efficiency data. Required.
    years (list | pd.Index): Relevant years. Required.

Returns: pd.DataFrame: Transformed efficiency data.

Source code in src/rpycpl/technoecon_etl.py
def transform_efficiency(eff_data: pd.DataFrame, years: list | pd.Index) -> pd.DataFrame:
    """Transform the efficiency data from REMIND to pypsa.

    Args:
        eff_data (pd.DataFrame): DataFrame containing REMIND efficiency data.
        years (list | pd.Index): relevant years.
    Returns:
        pd.DataFrame: Transformed efficiency data.
    """
    eta = eff_data.query("year in @years")
    eta = eta.assign(source=eta.technology + " REMIND", unit="p.u.", parameter="efficiency")

    # Special treatment for nuclear: Efficiencies are in TWa/Mt=8760 TWh/Tg_U
    #  -> convert to MWh/g_U to match with fuel costs in USD/g_U
    eta.loc[eta["technology"].isin(["fnrs", "tnrs"]), "value"] *= 8760 / 1e6
    eta.loc[eta["technology"].isin(["fnrs", "tnrs"]), "unit"] = "MWh/g_U"
    # Special treatment for battery: Efficiencies in costs.csv should be roundtrip
    eta.loc[eta["technology"] == "btin", "value"] **= 2

    return eta
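
A worked sketch of the two special cases above; technology codes and numbers are illustrative only:

import pandas as pd

from rpycpl.technoecon_etl import transform_efficiency

eta_raw = pd.DataFrame(
    {
        "technology": ["ngcc", "tnrs", "btin"],
        "year": [2030, 2030, 2030],
        "value": [0.60, 1.25, 0.95],
    }
)
eta = transform_efficiency(eta_raw, years=[2030])
# ngcc: 0.60 p.u., unchanged
# tnrs: 1.25 TWa/Mt -> 1.25 * 8760 / 1e6 = 0.01095 MWh/g_U
# btin: 0.95 one-way -> 0.95 ** 2 = 0.9025 round-trip efficiency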

transform_fom(fom)

Transform the Fixed Operational Maintenance (FOM) data from REMIND to PyPSA format.

Parameters:

    fom (pd.DataFrame): DataFrame containing REMIND FOM data. Required.

Returns: pd.DataFrame: Transformed FOM data.

Source code in src/rpycpl/technoecon_etl.py
def transform_fom(fom: pd.DataFrame) -> pd.DataFrame:
    """Transform the Fixed Operational Maintenance data from REMIND to pypsa.

    Args:
        fom (pd.DataFrame): DataFrame containing REMIND FOM data.
    Returns:
        pd.DataFrame: Transformed FOM data.
    """
    fom.loc[:, "value"] *= UNIT_CONVERSION["FOM"]
    fom = fom.assign(source=fom.technology + " REMIND")
    fom = fom.assign(unit="percent", parameter="FOM")

    return fom

transform_lifetime(lifetime)

Transform the lifetime data from REMIND to PyPSA format.

Parameters:

    lifetime (pd.DataFrame): DataFrame containing REMIND lifetime data. Required.

Returns: pd.DataFrame: Transformed lifetime data.

Source code in src/rpycpl/technoecon_etl.py
def transform_lifetime(lifetime: pd.DataFrame) -> pd.DataFrame:
    """Transform the lifetime data from REMIND to pypsa.

    Args:
        lifetime (pd.DataFrame): DataFrame containing REMIND lifetime data.
    Returns:
        pd.DataFrame: Transformed lifetime data.
    """
    # note: DataFrame.assign has no inplace argument; passing inplace=True would add a literal column
    lifetime = lifetime.assign(unit="years", source=lifetime.technology + " REMIND")
    return lifetime

transform_vom(vom)

Transform the Variable Operational Maintenance (VOM) data from REMIND to PyPSA format.

Parameters:

    vom (pd.DataFrame): DataFrame containing REMIND VOM data. Required.

Returns: pd.DataFrame: Transformed VOM data.

Source code in src/rpycpl/technoecon_etl.py
def transform_vom(vom: pd.DataFrame) -> pd.DataFrame:
    """Transform the Variable Operational Maintenance data from REMIND to pypsa.
    Args:
        vom (pd.DataFrame): DataFrame containing REMIND VOM data.
    Returns:
        pd.DataFrame: Transformed VOM data.
    """
    vom.loc[:, "value"] *= UNIT_CONVERSION["VOM"]
    vom = vom.assign(unit="USD/MWh", source=vom.technology + " REMIND", parameter="VOM")
    return vom

validate_mappings(mappings)

Validate the mapping of the technologies to PyPSA technologies.

Parameters:

    mappings (pd.DataFrame): DataFrame containing the mapping funcs and names from
        REMIND to PyPSA technologies. Required.

Raises:

    ValueError: if mappers are not allowed.
    ValueError: if columns are not as expected.
    ValueError: if proxy learning (use_remind_with_learning_from) is used for
        something other than investment.

Source code in src/rpycpl/technoecon_etl.py
def validate_mappings(mappings: pd.DataFrame):
    """validate the mapping of the technologies to pypsa technologies
    Args:
        mappings (pd.DataFrame): DataFrame containing the mapping funcs and names
            from REMIND to pypsa technologies.
    Raises:
        ValueError: if mappers not allowed
        ValueError: if columns not expected
        ValueError: if proxy learning (use_remind_with_learning_from) is used
            for something other than invest

    """

    # validate columns
    EXPECTED_COLUMNS = [
        "PyPSA_tech",
        "parameter",
        "mapper",
        "reference",
        "unit",
        "comment",
    ]
    if not sorted(mappings.columns) == sorted(EXPECTED_COLUMNS):
        raise ValueError(f"Invalid mapping. Allowed columns are: {EXPECTED_COLUMNS}")

    # validate mappers allowed
    forbidden_mappers = set(mappings.mapper.unique()).difference(MAPPING_FUNCTIONS)
    if forbidden_mappers:
        raise ValueError(f"Forbidden mappers found in mappings: {forbidden_mappers}")

    # validate proxy learning
    proxy_learning = mappings.query("mapper == 'use_remind_with_learning_from'")
    proxy_params = set(proxy_learning.parameter)
    if proxy_params.difference({"investment"}):
        raise ValueError(f"Proxy learning is only allowed for investment but Found: {proxy_params}")

    # validate numeric
    set_vals = mappings.query("mapper == 'set_value'")["reference"]
    try:
        set_vals.astype(float)
    except ValueError as e:
        raise ValueError(f"set_value reference values must be numeric but: {e}")

    # check uniqueness
    counts = mappings.groupby(["PyPSA_tech", "parameter"]).count()
    repeats = counts[counts.values > 1]
    if len(repeats):
        raise ValueError(f"Mappings are not unique: n repeats:\n {repeats} ")
    # should validate that remind references are actually in the remind export

    if "value" in mappings.columns:
        raise ValueError("Mapping has an unexpected 'value' column")

validate_output(df_out, costs_remind)

Validate the output data.

Parameters:

    df_out (pd.DataFrame): DataFrame containing the output data. Required.
    costs_remind (pd.DataFrame): DataFrame containing the formatted REMIND data. Required.

Source code in src/rpycpl/technoecon_etl.py
def validate_output(df_out: pd.DataFrame, costs_remind: pd.DataFrame):
    """validate the output data
    Args:
        df_out (pd.DataFrame): DataFrame containing the output data
        costs_remind (pd.DataFrame): DataFrame containing the formatted remind data
    """

    missing_vals = df_out.value.isna().any()
    if missing_vals:
        raise ValueError(f"Missing values or nans in output data: {df_out[df_out.value.isna()]}")

    n_expected = costs_remind.technology.nunique() * costs_remind.year.nunique()

validate_remind_data(costs_remind, mappings)

Validate the REMIND cost data against the mapping table.

Parameters:

    costs_remind (pd.DataFrame): DataFrame containing the formatted REMIND cost data. Required.
    mappings (pd.DataFrame): DataFrame containing the mapping funcs and names from
        REMIND to PyPSA technologies. Required.

Source code in src/rpycpl/technoecon_etl.py
def validate_remind_data(costs_remind: pd.DataFrame, mappings: pd.DataFrame):
    """validate the remind cost data
    Args:
        remind_data (pd.DataFrame): DataFrame containing the remind data
    """
    requested_data = mappings.query("mapper.str.contains('remind')")[
        ["PyPSA_tech", "parameter", "reference"]
    ].explode("reference")
    data = requested_data.explode("reference").merge(
        costs_remind.rename(columns={"technology": "reference"}),
        on=["parameter", "reference"],
        how="left",
    )
    data = data[["PyPSA_tech", "reference", "year", "parameter", "value"]]
    missing = data[(data.isna()).any(axis=1)]
    if not missing.empty:
        raise ValueError(
            f"Missing data in REMIND for (first <10 rows)\n{missing.drop_duplicates().head(10)}"
            "\nCheck the mappings and the remind data."
            " Hint: are your reference lists consistently separated by ',' or ', '?"
        )
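
Typical usage, reusing the costs_remind and mappings frames from the sketches above; the call raises if a mapping references REMIND data that is missing from the formatted cost table:

from rpycpl.technoecon_etl import validate_remind_data

validate_remind_data(costs_remind, mappings)  # frames from the earlier sketches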