CAMELS Forcing Ensemble

The CAMELS dataset provides a large ensemble of hydrologic model simulations for 671 basins across the contiguous United States. However, the forcing data are spread across many small per-basin text files, one per ensemble member, which makes them cumbersome to work with. This code snippet demonstrates how to process the forcing data into a single NetCDF file by taking the mean over the ensemble members for each basin.

First, download the Daymet forcing data from here and extract the contents of the zip file into a directory called data. Then install cytoolz, pandas, and xarray.
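
Before running the full script, it can help to confirm that the extracted archive matches the directory layout the script expects. This is only a quick sanity check; the path fragments are taken from the glob used later in the script, and daymet_dir is just a local name used here for illustration:

from pathlib import Path

data_dir = Path("data")
daymet_dir = Path(data_dir, "model_output_daymet/model_output/flow_timeseries/daymet")
# Print a few of the per-basin model output files that the main script will process.
print(sorted(daymet_dir.glob("*/*model_output.txt"))[:3])

If this prints a few *_model_output.txt paths, the layout is correct and you can run the following code: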

from collections.abc import Generator
from pathlib import Path

import cytoolz.curried as tlz
import pandas as pd
import xarray as xr


def read_camels(filename: Path) -> pd.DataFrame:
    """Read a single CAMELS model output file into a date-indexed dataframe."""
    try:
        # Keep only the date columns and the six variables renamed below;
        # the other columns in the file are skipped.
        forcing = pd.read_csv(filename, usecols=[0, 1, 2, 4, 5, 6, 7, 8, 9], sep=r"\s+")
    except pd.errors.EmptyDataError:
        print(f"{filename} is empty")
        return pd.DataFrame()
    forcing = forcing.rename(
        columns={
            "YR": "year",
            "MNTH": "month",
            "DY": "day",
            "SWE": "swe",
            "PRCP": "prcp",
            "RAIM": "liquid",
            "TAIR": "tas",
            "PET": "pet",
            "ET": "et",
        }
    )
    # Build a datetime index from the year/month/day columns, then drop them.
    date_cols = ["year", "month", "day"]
    forcing.index = pd.to_datetime(forcing[date_cols])
    return forcing.drop(columns=date_cols)


def ensemble_mean(
    df_list: Generator[pd.DataFrame, None, None], station_id: str
) -> xr.Dataset:
    """Average the ensemble members of one station and return an annotated dataset."""
    # Stack the non-empty members along a new "ensemble" dimension and average it out.
    ensemble = xr.concat(
        (df.to_xarray() for df in df_list if len(df) > 0), dim="ensemble"
    ).mean("ensemble")
    # Rename the dataframe index to "time" and add a station_id dimension for merging.
    ensemble = ensemble.rename({"index": "time"}).expand_dims(
        {"station_id": [station_id]}
    )
    ensemble["swe"].attrs = {"units": "kg/m2", "long_name": "Snow Water Equivalent"}
    ensemble["prcp"].attrs = {"units": "mm", "long_name": "Precipitation"}
    ensemble["liquid"].attrs = {"units": "mm", "long_name": "Liquid Precipitation"}
    ensemble["tas"].attrs = {"units": "C", "long_name": "Air Temperature"}
    ensemble["pet"].attrs = {"units": "mm", "long_name": "Potential Evapotranspiration"}
    ensemble["et"].attrs = {"units": "mm", "long_name": "Evapotranspiration"}
    return ensemble


data_dir = Path("data")
# Each basin has several ensemble members stored as whitespace-delimited text files.
files = Path(data_dir, "model_output_daymet/model_output/flow_timeseries/daymet").glob(
    "*/*model_output.txt"
)
# Group the files by station ID (the first underscore-separated part of the file name).
station_files = tlz.groupby(lambda x: x.stem.split("_")[0], files)
# Process the stations in chunks of 100 and write each chunk to an intermediate
# NetCDF file to keep memory usage manageable.
for counter, chunk in enumerate(tlz.partition_all(100, station_files.items()), start=1):
    clm = xr.merge(
        ensemble_mean((read_camels(f) for f in flist), sid) for sid, flist in chunk
    )
    clm.to_netcdf(Path(data_dir, f"camels_daymet_ensemble_{counter}.nc"))
# Combine the intermediate files into a single NetCDF file; load_dataset reads each
# file into memory and closes it, so the intermediate files can be removed safely.
clm = xr.merge(
    (xr.load_dataset(f) for f in data_dir.glob("camels_daymet_ensemble_*.nc"))
)
clm.transpose("time", "station_id").to_netcdf(
    Path(data_dir, "camels_daymet_ensemble.nc")
)
for f in data_dir.glob("camels_daymet_ensemble_*.nc"):
    f.unlink()
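
Once the script finishes, the combined file can be inspected to verify the result. A minimal sketch (the variable and dimension names match those set in ensemble_mean above; the station ID shown is just an example and should be replaced with one present in your data):

import xarray as xr

# Open the merged file and check its dimensions and variables.
clm = xr.open_dataset("data/camels_daymet_ensemble.nc")
print(clm.sizes)  # expected dimensions: time and station_id

# Daily ensemble-mean precipitation for a single (example) basin.
prcp = clm["prcp"].sel(station_id="01013500")
print(prcp.head())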