Source code for ocean_model_skill_assessor.stats

"""
Statistics functions.
"""

from typing import Union

import numpy as np
import pandas as pd
import xarray as xr
import yaml

from .paths import Paths


[docs] def compute_bias(obs: Union[pd.Series, xr.DataArray], model: xr.DataArray) -> float: """Given obs and model signals return bias.""" assert isinstance(obs, (pd.Series, xr.DataArray)) assert isinstance(model, xr.DataArray) # # easier to consistently check model for this # # if 3D, assume we should calculate metrics over time dimension # if model.squeeze().ndim == 3: # dim = "T" # out = (model - obs).cf.mean(dim=dim) # else: out = float((model - obs).cf.mean()) return out
[docs] def compute_correlation_coefficient( obs: Union[pd.Series, xr.DataArray], model: xr.DataArray ) -> float: """Given obs and model signals, return Pearson product-moment correlation coefficient""" assert isinstance(obs, (pd.Series, xr.DataArray)) assert isinstance(model, xr.DataArray) # # easier to consistently check model for this # # if 3D, assume we should calculate metrics over time dimension # if model.squeeze().ndim == 3: # # can't figure this one out and doesn't seem high priority # out = np.nan # else: # can't send nan's in inds = obs.notnull().values * model.notnull().values inds = inds.squeeze() # out = float((model - obs).cf.mean()) out = float(np.corrcoef(np.array(obs)[inds], np.array(model)[inds])[0, 1]) return out
[docs] def compute_index_of_agreement( obs: Union[pd.Series, xr.DataArray], model: xr.DataArray ) -> float: """Given obs and model signals, return Index of Agreement (Willmott 1981)""" assert isinstance(obs, (pd.Series, xr.DataArray)) assert isinstance(model, xr.DataArray) # # easier to consistently check model for this # # if 3D, assume we should calculate metrics over time dimension # if model.squeeze().ndim == 3: # dim = "T" # ref_mean = obs.cf.mean(dim=dim) # num = ((obs - model) ** 2).cf.sum(dim=dim) # denom_a = np.abs(model - ref_mean) # denom_b = np.abs(obs - ref_mean) # # denom_a = np.abs(np.array(model - ref_mean)) # # denom_b = np.abs(np.array(obs - ref_mean)) # denom = ((denom_a + denom_b) ** 2).cf.sum(dim=dim) # out = 1 - num / denom # else: ref_mean = obs.mean() num = ((obs - model) ** 2).sum() denom_a = np.abs(np.array(model - ref_mean)) denom_b = np.abs(np.array(obs - ref_mean)) denom = ((denom_a + denom_b) ** 2).sum() out = float(1 - num / denom) # handle underfloat if denom < 1e-16: return 1 return out
[docs] def compute_mean_square_error( obs: Union[pd.Series, xr.DataArray], model: xr.DataArray, centered=False ) -> float: """Given obs and model signals, return mean squared error (MSE)""" assert isinstance(obs, (pd.Series, xr.DataArray)) assert isinstance(model, xr.DataArray) error = obs - model # # easier to consistently check model for this # # if 3D, assume we should calculate metrics over time dimension # if model.squeeze().ndim == 3: # dim = "T" # if centered: # raise NotImplementedError("Centered not implemented for 3D") # out = (error**2).cf.mean(dim=dim) # else: if centered: error += float(-obs.mean() + model.mean()) out = float((error**2).mean()) return out
[docs] def compute_murphy_skill_score( obs: Union[pd.Series, xr.DataArray], model: xr.DataArray, obs_model=None ) -> float: """Given obs and model signals, return Murphy Skill Score (Murphy 1988)""" assert isinstance(obs, (pd.Series, xr.DataArray)) assert isinstance(model, xr.DataArray) # # easier to consistently check model for this # # if 3D, assume we should calculate metrics over time dimension # if model.squeeze().ndim == 3: # dim = "T" # if not obs_model: # obs_model = obs.copy() # # # Have default solution be to skip the climatology # # obs_model[:] = 0 # # if a obs forecast is not available, use mean of the *original* observations # obs_model[:] = obs.cf.mean(dim=dim) # out = 1 - ((obs - model) ** 2).cf.sum(dim=dim) / ((obs - obs_model) ** 2).cf.sum(dim=dim) # else: if not obs_model: obs_model = obs.copy() # # Have default solution be to skip the climatology # obs_model[:] = 0 # if a obs forecast is not available, use mean of the *original* observations obs_model[:] = obs.mean() out = float(1 - ((obs - model) ** 2).sum() / ((obs - obs_model) ** 2).sum()) # # jesse's # mse_model = compute_mean_square_error(obs, model, centered=False) # # fake the name for the column # obs.name = "model" # mse_obs_model = compute_mean_square_error(obs_model, obs, centered=False) # if mse_obs_model <= 0: # return -1 # return float(1 - mse_model / mse_obs_model) # 1-((obs - model)**2).sum()/(obs**2).sum() return out
[docs] def compute_root_mean_square_error( obs: Union[pd.Series, xr.DataArray], model: xr.DataArray, centered=False ) -> float: """Given obs and model signals, return Root Mean Square Error (RMSE)""" assert isinstance(obs, (pd.Series, xr.DataArray)) assert isinstance(model, xr.DataArray) # # easier to consistently check model for this # # if 3D, assume we should calculate metrics over time dimension # if model.squeeze().ndim == 3: # mse = compute_mean_square_error(obs, model, centered=centered) # out = np.sqrt(mse) # else: mse = compute_mean_square_error(obs, model, centered=centered) out = float(np.sqrt(mse)) return out
[docs] def compute_descriptive_statistics(model: xr.DataArray, ddof=0) -> list: """Given obs and model signals, return the standard deviation""" assert isinstance(model, xr.DataArray) # # easier to consistently check model for this # # if 3D, assume we should calculate metrics over time dimension # if model.squeeze().ndim == 3: # dim = "T" # out = list( # [ # model.cf.max(dim=dim), # model.cf.min(dim=dim), # model.cf.mean(dim=dim), # model.cf.std(ddof=ddof, dim=dim), # ] # ) # else: out = list( [ float(model.max()), float(model.min()), float(model.mean()), float(model.std(ddof=ddof)), ] ) return out
[docs] def compute_stats(obs: Union[pd.Series, xr.DataArray], model: xr.DataArray) -> dict: """Compute stats and return as DataFrame""" assert isinstance(obs, (pd.Series, xr.DataArray)) assert isinstance(model, xr.DataArray) return { "bias": compute_bias(obs, model), "corr": compute_correlation_coefficient(obs, model), "ioa": compute_index_of_agreement(obs, model), "mse": compute_mean_square_error(obs, model), "ss": compute_murphy_skill_score(obs, model), "rmse": compute_root_mean_square_error(obs, model), "descriptive": compute_descriptive_statistics(model), }
[docs] def save_stats( source_name: str, stats: dict, key_variable: str, paths: Paths, filename=None ): """Save computed stats to file.""" stats["bias"] = { "value": stats["bias"], "name": "Bias", "long_name": "Bias or MSD", } stats["corr"] = { "value": stats["corr"], "name": "Correlation Coefficient", "long_name": "Pearson product-moment correlation coefficient", } stats["ioa"] = { "value": stats["ioa"], "name": "Index of Agreement", "long_name": "Index of Agreement (Willmott 1981)", } stats["mse"] = { "value": stats["mse"], "name": "Mean Squared Error", "long_name": "Mean Squared Error (MSE)", } stats["ss"] = { "value": stats["ss"], "name": "Skill Score", "long_name": "Skill Score (Bogden 1996)", } stats["rmse"] = { "value": stats["rmse"], "name": "RMSE", "long_name": "Root Mean Square Error (RMSE)", } stats["descriptive"] = { "value": stats["descriptive"], "name": "Descriptive Statistics", "long_name": "Max, Min, Mean, Standard Deviation", } if "dist" in stats: stats["dist"] = { "value": stats["dist"], "name": "Distance", "long_name": "Distance in km from data location to selected model location", } if filename is None: filename = paths.PROJ_DIR / f"stats_{source_name}_{key_variable}.yaml" with open(filename, "w") as outfile: yaml.dump(stats, outfile, default_flow_style=False)