Build population gridcell map

`build_gridded_population(prov_pop_path, pop_density_raster_path, cutout_path, province_shape_path, gridded_pop_out)`

Build a gridded population DataFrame by matching population density to the cutout grid cells. This DataFrame is a sparse matrix of the population and shape BusesxCutout_gridcells where buses are the provinces

Parameters:

Name	Type	Description	Default
`prov_pop_path`	`PathLike`	Path to the province population count file (hdf5).	required
`pop_density_raster_path`	`PathLike`	Path to the population density raster file.	required
`cutout_path`	`PathLike`	Path to the cutout file containing the grid.	required
`province_shape_path`	`PathLike`	Path to the province shape file.	required
`gridded_pop_out`	`PathLike`	output file path.	required

Source code in workflow/scripts/build_population_gridcell_map.py

def build_gridded_population(
    prov_pop_path: PathLike,
    pop_density_raster_path: PathLike,
    cutout_path: PathLike,
    province_shape_path: PathLike,
    gridded_pop_out: PathLike,
):
    """Build a gridded population DataFrame by matching population density to the cutout grid cells.
    This DataFrame is a sparse matrix of the population and shape BusesxCutout_gridcells
      where buses are the provinces

    Args:
        prov_pop_path (PathLike): Path to the province population count file (hdf5).
        pop_density_raster_path (PathLike): Path to the population density raster file.
        cutout_path (PathLike): Path to the cutout file containing the grid.
        province_shape_path (PathLike): Path to the province shape file.
        gridded_pop_out (PathLike): output file path.
    """

    with pd.HDFStore(prov_pop_path, mode="r") as store:
        pop_province = store["population"]

    prov_poly = read_province_shapes(province_shape_path)
    pop_density = read_pop_density(pop_density_raster_path, prov_poly, crs=CRS)

    cutout = atlite.Cutout(cutout_path)
    grid_points = cutout.grid
    # this is in polygons but need points for sjoin with pop dnesity to work
    grid_points.to_crs(3857, inplace=True)
    grid_points["geometry"] = grid_points.centroid
    grid_points.to_crs(CRS, inplace=True)

    # match cutout grid to province
    # cutout_pts_in_prov = gpd.tools.sjoin(grid_points, prov_poly,
    # how="left", predicate="intersects")
    # TODO: do you want to dropna here?
    cutout_pts_in_prov = gpd.tools.sjoin(
        grid_points, prov_poly, how="left", predicate="intersects"
    )  # .dropna()
    cutout_pts_in_prov.rename(
        columns={"index_right": "province_index", "province": "province_name"},
        inplace=True,
    )

    # match cutout grid to province
    cutout_pts_in_prov = gpd.tools.sjoin(grid_points, prov_poly, how="left", predicate="intersects")
    cutout_pts_in_prov.rename(
        columns={"index_right": "province_index", "province": "province_name"},
        inplace=True,
    )
    # cutout_pts_in_prov.dropna(inplace=True)

    # TODO CRS, think about whether this makes sense or need grid interp
    merged = gpd.tools.sjoin_nearest(
        cutout_pts_in_prov.to_crs(3857), pop_density.to_crs(3857), how="inner"
    )
    merged = merged.to_crs(CRS)
    # points outside china are NaN, need to rename to keep the index cutout after agg
    # otherwise the spare matrix will not match the cutoutpoints
    #  (smarter would be to change the cutout)
    merged.fillna({"province_name": "OutsideChina"}, inplace=True)

    points_in_provinces = pd.DataFrame(index=cutout_pts_in_prov.index)
    # normalise pop per province and make a loc_id/province table
    points_in_provinces = (
        merged.groupby("province_name")["pop_density"]
        .apply(lambda x: x / x.sum())
        .unstack(fill_value=0.0)
        .T
    )
    # now get rid of the outside china "province"
    points_in_provinces.drop(columns="OutsideChina", inplace=True)
    points_in_provinces.index.name = ""
    points_in_provinces.fillna(0.0, inplace=True)

    points_in_provinces *= pop_province

    with pd.HDFStore(gridded_pop_out, mode="w", complevel=4) as store:
        store["population_gridcell_map"] = points_in_provinces

`build_population_map(prov_pop_path, pop_density_raster_path, cutout_path, province_shape_path, gridded_pop_out)`

Build a gridded population DataFrame by matching population density to the cutout grid cells. This DataFrame is a sparse matrix of the population and shape BusesxCutout_gridcells where buses are the provinces

Parameters:

Name	Type	Description	Default
`prov_pop_path`	`PathLike`	Path to the province population count file (hdf5).	required
`pop_density_raster_path`	`PathLike`	Path to the population density raster file.	required
`cutout_path`	`PathLike`	Path to the cutout file containing the grid.	required
`province_shape_path`	`PathLike`	Path to the province shape file.	required
`gridded_pop_out`	`PathLike`	output file path.	required

Source code in workflow/scripts/build_population_gridcell_map.py

def build_population_map(
    prov_pop_path: PathLike,
    pop_density_raster_path: PathLike,
    cutout_path: PathLike,
    province_shape_path: PathLike,
    gridded_pop_out: PathLike,
):
    """Build a gridded population DataFrame by matching population density to the cutout grid cells.
    This DataFrame is a sparse matrix of the population and shape BusesxCutout_gridcells
      where buses are the provinces

    Args:
        prov_pop_path (PathLike): Path to the province population count file (hdf5).
        pop_density_raster_path (PathLike): Path to the population density raster file.
        cutout_path (PathLike): Path to the cutout file containing the grid.
        province_shape_path (PathLike): Path to the province shape file.
        gridded_pop_out (PathLike): output file path.
    """

    # =============== load data ===================
    with pd.HDFStore(prov_pop_path, mode="r") as store:
        pop_province_count = store["population"]

    # CFSR points and Provinces
    pop_ww = load_cfrs_data(pop_density_raster_path)

    prov_poly = gpd.read_file(province_shape_path)[["province", "geometry"]]
    prov_poly.set_index("province", inplace=True)
    prov_poly = prov_poly.reindex(PROV_NAMES)
    prov_poly.reset_index(inplace=True)

    # load renewable profiles & grid & extract gridpoints
    cutout = atlite.Cutout(cutout_path)
    grid_points = cutout.grid
    grid_points.to_crs(3857, inplace=True)
    grid_points["geometry"] = grid_points.centroid
    grid_points.to_crs(CRS, inplace=True)

    # match cutout grid to province
    cutout_pts_in_prov = gpd.tools.sjoin(grid_points, prov_poly, how="left", predicate="intersects")
    cutout_pts_in_prov.rename(
        columns={"index_right": "province_index", "province": "province_name"},
        inplace=True,
    )

    # Province masks merged with population density
    # TODO: THIS REQUIRES EXPLANATION - can't just use random crs :||
    cutout_pts_in_prov = cutout_pts_in_prov.to_crs(3857)
    pop_ww = pop_ww.to_crs(3857)

    merged = gpd.tools.sjoin_nearest(cutout_pts_in_prov, pop_ww, how="inner")
    merged = merged.to_crs(CRS)

    # normalised pop distribution per province
    # need an extra province for points not in the province, otherwise lose cutout grid index
    merged.fillna({"province_name": "OutsideChina"}, inplace=True)
    points_in_provinces = pd.DataFrame(index=cutout_pts_in_prov.index)
    points_in_provinces = (
        merged.groupby("province_name")["pop_density"]
        .apply(lambda x: x / x.sum())
        .unstack(fill_value=0.0)
        .T
    )
    # Cleanup the matrix: get rid of the outside china "province" et
    points_in_provinces.drop(columns="OutsideChina", inplace=True)
    points_in_provinces.index.name = ""
    points_in_provinces.fillna(0.0, inplace=True)

    # go from normalised distribution to head count
    points_in_provinces *= pop_province_count
    with pd.HDFStore(gridded_pop_out, mode="w", complevel=4) as store:
        store["population_gridcell_map"] = points_in_provinces

`load_cfrs_data(target)`

load CFRS_grid.nc type files into a geodatafram

Parameters:

Name	Type	Description	Default
`target`	`PathLike`	the abs path	required

Returns:

Type	Description
`GeoDataFrame`	gpd.GeoDataFrame: the data in gdf

Source code in workflow/scripts/build_population_gridcell_map.py

def load_cfrs_data(target: PathLike) -> gpd.GeoDataFrame:
    """load  CFRS_grid.nc type files into a geodatafram

    Args:
        target (PathLike): the abs path

    Returns:
        gpd.GeoDataFrame: the data in gdf
    """
    pop_density = xr.open_dataarray(target).to_dataset(name="pop_density")
    pop_ww = xarr_to_gdf(pop_density, var_name="pop_density")  # TODO is the CRS correct?

    return pop_ww

`xarr_to_gdf(xarr, var_name, x_var='x', y_var='y', crs=CRS)`

convert an xarray to GDF

Parameters:

Name	Type	Description	Default
`xarr`	`DataArray`	the input array	required
`var_name`	`str`	the array variable to be converted.	required
`x_var`	`str`	the x dimension. Defaults to "x".	`'x'`
`y_var`	`str`	the y dimension. Defaults to "y".	`'y'`
`crs`	`_type_`	the crs. Defaults to CRS.	`CRS`

Returns:

Type	Description
`GeoDataFrame`	gpd.GeoDataFrame: geodata frame in chosen CRS

Source code in workflow/scripts/build_population_gridcell_map.py

def xarr_to_gdf(
    xarr: xr.DataArray, var_name: str, x_var="x", y_var="y", crs=CRS
) -> gpd.GeoDataFrame:
    """convert an xarray to GDF

    Args:
        xarr (xr.DataArray): the input array
        var_name (str): the array variable to be converted.
        x_var (str, optional): the x dimension. Defaults to "x".
        y_var (str, optional): the y dimension. Defaults to "y".
        crs (_type_, optional): the crs. Defaults to CRS.

    Returns:
        gpd.GeoDataFrame: geodata frame in chosen CRS
    """
    df = xarr.to_dataframe()
    df.reset_index(inplace=True)
    return gpd.GeoDataFrame(
        df[var_name], geometry=gpd.points_from_xy(df[x_var], df[y_var]), crs=crs
    )