"""
Data object module
===================
The :class:`.Data` is the backbone of the library that allows one to compute easily the different quantities needed to
postprocess forecasts.
Basically, it is a Numpy :class:`~numpy.ndarray` with the first 5 dimensions allocated to a
dedicated and fixed meaning. These first axis of the data represent:
* Axis 0th: predictor number (:math:`p`)
* Axis 1st: observation/realization number (:math:`n`)
* Axis 2nd: ensemble member number (:math:`m`)
* Axis 3rd: variable or label number (:math:`v`) **[Not used/implemented for the moment!]**
* Axis 4th: lead time (:math:`t`)
These 5 first dimensions are called the *data index* (see :attr:`Data.index_shape`).
As such, they represent the data as a multi-dimensional array :math:`\\mathcal{D}_{p,n,m,v} (t)` where :math:`t`
is the lead time.
The extra dimensions possibly trailing in the array are the intrinsic dimensions of the data itself.
For instance, an array of total dimension 7 represents 2D data (e.g. fields).
If only 5 dimensions are present on total, then the data is a scalar.
The main operations of the :class:`.Data` are broadcasted over these extra-dimension, making the data object directly compliant
with multi-dimensional forecast data.
Examples
--------
Here is an example showing how the :class:`Data` object works:
>>> import numpy as np
>>> from core.data import Data
>>> a = np.random.randn(2, 3, 10, 1, 60, 20, 20)
>>> data = Data(a)
>>> data.number_of_predictors
2
>>> data.number_of_observations
3
>>> data.number_of_members
10
>>> data.number_of_variables
1
>>> data.number_of_time_steps
60
>>> data.shape
(20, 20)
>>> data.index_shape
(2, 3, 10, 1, 60)
Notes
-----
* The methods of the :class:`Data` object return as much as possible another :class:`Data` object. If it is not possible
to format the ouptut according to the shape described above, a :class:`~numpy.ndarray` is returned. For example, matrix
derived from the data are returned as NumPy arrays.
* By convention, if a method or operation reduces or returns a :class:`Data` with one of the index missing, the corresponding index of
the object is set to zero to preserve index shape of the object. For example, for a :class:`Data` object
:math:`\\mathcal{D}_{p,n,m,v} (t)` of :attr:`~Data.index_shape` `(P, N, M, V, T)`, the :attr:`Data.ensemble_max`
method returns :math:`\\max_m \\mathcal{D}_{p,n,m,v} (t)` as a :class:`Data` object of shape `(P, N, 1, V, T)`.
E.g.:
>>> import numpy as np
>>> from core.data import Data
>>> a = np.random.randn(2, 3, 10, 1, 60, 20, 20)
>>> data = Data(a)
>>> data.index_shape
(2, 3, 10, 1, 60)
>>> maxi = data.ensemble_max
>>> maxi.index_shape
(2, 3, 1, 1, 60)
In the following, in such a case we will use the notation :math:`\\mathcal{D}_{p,n,v} (t) \\equiv \\mathcal{D}_{p,n,0,v} (t)`.
* Missing values in :class:`Data` objects can be marked as :data:`numpy.nan`. The various averages, summation and methods will automatically **ignore** the missing values.
As a consequence, it means that these averages and summations will include less terms. For example, if at one lead time, an ensemble member value is missing,
the ensemble mean is done on the rest of the ensemble at this precise lead time and obviously does not include this member.
References
----------
.. bibliography:: ../ref.bib
:labelprefix: DATA-
:keyprefix: data-
Warnings
--------
Several properties and definitions inside the Data object are not yet fixed or well-defined. Usages and standards might still
evolve.
.. _quantiles: https://en.wikipedia.org/wiki/Quantile
"""
# TODO : - Timestamps as a _total_shape array of np.datetime64 ?
# - Metadata management
# - data consistency check
import numpy as np
from scipy.stats import norm
import pickle
import warnings
import datetime
try:
import matplotlib.pyplot as plt
from core.plotting import std_plot, minmax_plot
except ModuleNotFoundError:
warnings.warn('Unable to import matplotlib, plotting methods will not work.', ImportWarning)
plt = None
try:
import pandas as pd
except ModuleNotFoundError:
warnings.warn('Unable to import pandas, loading pandas DataFrame will not work.', ImportWarning)
pd = None
[docs]class Data(object):
"""Main data structure of the library.
Parameters
----------
data: None or ~numpy.ndarray
The data array. If not `None`, should be an array of shape
(:attr:`~.Data.number_of_predictors`, :attr:`~.Data.number_of_observations`, :attr:`~.Data.number_of_members`, :attr:`~.Data.number_of_variables`, :attr:`~.Data.number_of_time_steps`)
Default to `None`.
timestamps: None or ~numpy.ndarray(~datetime.datetime) or list(~numpy.ndarray(~datetime.datetime))
The timestamps of the forecast data.
Can be a 1D :class:`~numpy.ndarray` of :class:`~datetime.datetime` timestamps (one per lead time). In that case, the same timestamps vector is
attributed to all the predictors and the observations provided by `data`.
Can also be a list of 1D :class:`~numpy.ndarray` of :class:`~datetime.datetime` timestamps (one list entry per observation). It allows one to set a different timestamps per
observation/realization.
If None, no timestamp is set. Default to `None`.
metadata: object or ~numpy.ndarray(object) or list(~numpy.ndarray(object))
Object(s) describing the metadata of the data (not implemented yet). Can be an object, a :class:`~numpy.ndarray` of objects
(one per observation/realization), or a list of 1D :class:`~numpy.ndarray` of objects (one list entry per predictor, one array component
per observation/realization). If a single array is provided, it can be of shape (:attr:`~.Data.number_of_predictors`, :attr:`~.Data.number_of_observations`) to
specify the metadata of each predictor and observation/realization separately. It can also be a 1D array for which each component corresponds
to an observation/realization. In this case, the same metadata object is used for each predictor. Default to the `None` object.
dtype: ~numpy.dtype
The data type of the data being stored. Default to :class:`numpy.float64`.
Attributes
----------
data: ~numpy.ndarray
The data array.
timestamps: (~numpy.ndarray(~numpy.ndarray(~datetime.datetime))
The timestamps of the data, stored as :class:`~numpy.ndarray` of :class:`~datetime.datetime` and with shape corresponding to (:attr:`~.Data.number_of_predictors`, :attr:`~.Data.number_of_observations`).
metadata: ~numpy.ndarray(object)
Object describing the metadata of the data (not specified yet).
"""
# First, some builtin standards
_axis_name_dict = {"predictor": 0,
"observation": 1,
"obs": 1,
"realization": 1,
"member": 2,
"ensemble": 2,
"label": 3,
"variable": 3,
"time": 4,
"lead": 4}
def __init__(self, data=None, metadata=None, timestamps=None, dtype=np.float64):
if data is not None:
self.data = data.astype(dtype)
self._dtype = dtype
self.metadata = self._create_metadata(data, metadata)
self.timestamps = self._create_timestamps(data, data, timestamps)
else:
self.data = data
self._dtype = dtype
self.metadata = metadata
self.timestamps = timestamps
[docs] def load_timestamps(self, timestamps):
"""Load timestamps data.
Parameters
----------
timestamps: ~numpy.ndarray(~datetime.datetime) or list(~numpy.ndarray(~datetime.datetime))
The timestamps of the forecast data.
Can be a 1D :class:`~numpy.ndarray` of :class:`~datetime.datetime` timestamps (one per lead time). In that case, the same timestamps vector is
attributed to all the predictors and the observations provided by `data`.
Can also be a list of 1D :class:`~numpy.ndarray` of :class:`~datetime.datetime` timestamps (one list entry per observation). It allows one to set a different timestamps per
observation/realization.
"""
self.timestamps = self._create_timestamps(self.data, self.data, timestamps)
[docs] def clear_data(self):
"""Reset the Data object."""
self.data = None
self.metadata = None
self.timestamps = None
[docs] def set_dtype(self, dtype):
"""Set the data type.
Parameters
----------
dtype: ~numpy.dtype
The Numpy data type of the data.
"""
self.data = self.data.astype(dtype)
self._dtype = dtype
@property
def dtype(self):
"""~numpy.dtype: The data type."""
return self._dtype
@property
def number_of_observations(self):
"""int: The number of observations stored in the data object."""
if self.data is None:
return None
else:
return self.index_shape[1]
@property
def number_of_variables(self):
"""int: The number of variables stored in the data object."""
if self.data is None:
return None
else:
return self.index_shape[3]
@property
def number_of_time_steps(self):
"""int: The number of time steps stored in the data object."""
if self.data is None:
return None
else:
return self.index_shape[4]
@property
def number_of_predictors(self):
"""int: The number of predictors stored in the data object."""
if self.data is None:
return None
else:
return self.index_shape[0]
@property
def number_of_members(self):
"""int: The number of ensemble members stored in the data object."""
if self.data is None:
return None
else:
return self.index_shape[2]
@property
def ndim(self):
"""int: The dimension of the data."""
if self.data is None:
return None
else:
return self.data.ndim-5
@property
def index_shape(self):
"""tuple(int): The shape of the data index."""
if self.data is None:
return None
else:
return self.data.shape[:5]
@property
def _total_shape(self):
"""tuple(int): The total shape of the data array."""
if self.data is None:
return None
else:
return self.data.shape
@property
def shape(self):
"""tuple(int): The shape of the data."""
if self.data is None:
return None
else:
return self.data.shape[5:]
@property
def _scalars(self):
if self.data is None:
return False
else:
return self.ndim == 0
@property
def _vectors(self):
if self.data is None:
return False
else:
return self.ndim == 1
@property
def _fields(self):
if self.data is None:
return False
else:
return self.ndim == 2
[docs] def is_scalar(self):
"""bool: Return true if the data stored are scalars."""
return self._scalars
[docs] def is_vector(self):
"""bool: Return true if the data stored are vectors."""
return self._vectors
[docs] def is_field(self):
"""bool: Return true if the data stored are fields."""
return self._fields
[docs] def is_empty(self):
"""bool: Return true if there is no data stored."""
return self.data is None
def __repr__(self):
return self.data.__repr__()
def __str(self):
return self.data.__str()
def __getitem__(self, index):
return self.data[index[:5]]
def __setitem__(self, index, value):
self.data[index[:5]] = value
def __add__(self, other):
try:
res = self.data + other.data
except:
try:
res = self.data + other
except:
res = None
return Data(res, self.metadata, self.timestamps, self.dtype)
def __radd__(self, other):
self.__add__(other)
def __sub__(self, other):
try:
res = self.data - other.data
except:
try:
res = self.data - other
except:
res = None
return Data(res, self.metadata, self.timestamps, self.dtype)
def __rsub__(self, other):
return self.__sub__(other)
def __mul__(self, other):
try:
res = self.data * other.data
except:
try:
res = self.data * other
except:
res = None
return Data(res, self.metadata, self.timestamps, self.dtype)
def __truediv__(self, other):
try:
res = self.data / other.data
except:
try:
res = self.data / other
except:
res = None
return Data(res, self.metadata, self.timestamps, self.dtype)
def __pow__(self, power, modulo=None):
res = self.data.__pow__(power, modulo)
return Data(res, self.metadata, self.timestamps, self.dtype)
def __rmul__(self, other):
return self.__mul__(other)
def __rtruediv__(self, other):
return self.__truediv__(other)
def __abs__(self):
return Data(np.abs(self.data), self.metadata, self.timestamps, self.dtype)
[docs] def get_value(self, index):
"""Get the value(s) of a particular data index.
Parameters
----------
index: tuple(int)
The data index of the value.
Returns
-------
values: ~numpy.ndarray
The values corresponding to the index.
"""
return self.data[index]
[docs] def get_data(self):
"""Return the whole data array.
Returns
-------
~numpy.ndarray
The whole data array.
"""
return self.data
[docs] def copy(self):
"""Return a (shallow) copy of the Data object.
Returns
-------
Data
A copy of the Data object.
"""
return Data(self.data.copy(), self.metadata, self.timestamps, self.dtype)
@property
def observational_mean(self):
"""Data: Mean over the observation index :math:`n`:
:math:`\\mu^{\\rm obs}_{p,m,v} (t) = \langle \mathcal{D}_{p,n,m,v} (t) \\rangle_n`."""
return Data(np.nanmean(self.data, axis=1)[:, np.newaxis, ...], self.metadata, None, self.dtype)
@property
def observational_median(self):
"""Data: Median over the observation index :math:`n`."""
return Data(np.nanmedian(self.data, axis=1)[:, np.newaxis, ...], self.metadata, None, self.dtype)
@property
def observational_var(self):
"""Data: Variance over the observation index :math:`n`:
:math:`\\sigma^{\\rm obs}_{p,m,v} (t)^2 = \\left\\langle (\\mathcal{D}_{p,n,m,v} (t) - \\mu^{\\rm obs}_{p,m,v} (t))^2 \\right\\rangle_n`."""
return Data(np.nanvar(self.data, axis=1, ddof=1)[:, np.newaxis, ...], self.metadata, None, self.dtype)
@property
def observational_std(self):
"""Data: Standard deviation over the observation index :math:`n`:
:math:`\sigma^{\\rm obs}_{p,m,v} (t)`."""
return Data(np.nanstd(self.data, axis=1, ddof=1)[:, np.newaxis, ...], self.metadata, None, self.dtype)
@property
def observational_min(self):
"""Data: Observational minimum over the ensemble index :math:`n`:
:math:`\min_n \mathcal{D}_{p,n,m,v} (t)`."""
return Data(np.nanmin(self.data, axis=1)[:, :, np.newaxis, ...], self.metadata, None, self.dtype)
@property
def observational_max(self):
"""Data: Observational maximum over the ensemble index :math:`n`:
:math:`\max_n \mathcal{D}_{p,n,m,v} (t)`."""
return Data(np.nanmax(self.data, axis=1)[:, :, np.newaxis, ...], self.metadata, None, self.dtype)
[docs] def observational_quantiles(self, q, interpolation='linear'):
"""Return the observational `quantiles`_ of the data.
Parameters
----------
q: array_like(float)
Quantile or sequence of quantiles to compute, which must be between 0 and 1 inclusive.
interpolation: str, optional
This optional parameter specifies the interpolation method to use when the desired quantile lies between
two data points. See :func:`numpy.quantile` for more information.
Returns
-------
Data
The observational quantiles, stored along the observation axis (1st axis).
"""
return Data(np.moveaxis(np.nanquantile(self.data, q, axis=1, interpolation=interpolation), 0, 1),
self.metadata, None, self.dtype)
@property
def ensemble_mean(self):
"""Data: Ensemble mean. Mean over the ensemble index :math:`m`:
:math:`\\mu^{\\rm{ens}}_{p,n,v} (t) = \\langle \\mathcal{D}_{p,n,m,v} (t) \\rangle_m`."""
return Data(np.nanmean(self.data, axis=2)[:, :, np.newaxis, ...], self.metadata, self.timestamps, self.dtype)
@property
def ensemble_median(self):
"""Data: Ensemble median. Median over the ensemble index :math:`m`."""
return Data(np.nanmedian(self.data, axis=2)[:, :, np.newaxis, ...], self.metadata, self.timestamps, self.dtype)
@property
def ensemble_var(self):
"""Data: Ensemble variance over the ensemble index :math:`m`:
:math:`\\sigma^{\\rm{ens}}_{p,n,v} (t)^2 = \\left\\langle \\left( \\mathcal{D}_{p,n,m,v} (t) - \\mu^{\\rm{ens}}_{p,n,v} (t) \\right)^2 \\right\\rangle_m`."""
return Data(np.nanvar(self.data, axis=2, ddof=1)[:, :, np.newaxis, ...], self.metadata, self.timestamps, self.dtype)
@property
def ensemble_std(self):
"""Data: Ensemble standard deviation over the ensemble index :math:`m`:
:math:`\\sigma^{\\rm{ens}}_{p,n,v} (t)`."""
return Data(np.nanstd(self.data, axis=2, ddof=1)[:, :, np.newaxis, ...], self.metadata, self.timestamps, self.dtype)
@property
def ensemble_min(self):
"""Data: Ensemble minimum over the ensemble index :math:`m`:
:math:`\\min_m \\mathcal{D}_{p,n,m,v} (t)`."""
return Data(np.nanmin(self.data, axis=2)[:, :, np.newaxis, ...], self.metadata, self.timestamps, self.dtype)
@property
def ensemble_max(self):
"""Data: Ensemble maximum over the ensemble index :math:`m`:
:math:`\\max_m \\mathcal{D}_{p,n,m,v} (t)`."""
return Data(np.nanmax(self.data, axis=2)[:, :, np.newaxis, ...], self.metadata, self.timestamps, self.dtype)
[docs] def ensemble_quantiles(self, q, interpolation='linear'):
"""Return the ensemble `quantiles`_ of the data.
Parameters
----------
q: array_like(float)
Quantile or sequence of quantiles to compute, which must be between 0 and 1 inclusive.
interpolation: str, optional
This optional parameter specifies the interpolation method to use when the desired quantile lies between
two data points. See :func:`numpy.quantile` for more information.
Returns
-------
Data
The ensemble quantiles, stored along the ensemble member number axis (1st axis).
"""
return Data(np.moveaxis(np.nanquantile(self.data, q, axis=2, interpolation=interpolation), 0, 2),
self.metadata, self.timestamps, self.dtype)
@property
def ensemble_members_distance(self):
"""~numpy.ndarray: Distance between ensemble members:
:math:`d^{\\rm{MBM}}_{p,n,m_1,m_2,v} (t) = |\mathcal{D}_{p,n,m_1,v} (t)- \mathcal{D}_{p,n,m_2,v} (t) |`"""
return np.abs(self.data[:, :, :, np.newaxis, ...] - self.data[:, :, np.newaxis, :, ...])
@property
def observational_distance(self):
"""~numpy.ndarray: Distance between observations:
:math:`d^{\\rm{obs}}_{p,n_1,n_2,m,v} (t) = |\\mathcal{D}_{p,n_1,m,v} (t)- \\mathcal{D}_{p,n_2,m,v} (t)|`"""
return np.abs(self.data[:, :, np.newaxis, ...] - self.data[:, np.newaxis, :, ...])
@property
def delta(self):
"""Data: Average over the ensemble members of the :attr:`ensemble_members_distance`:
:math:`\\delta_{p,n,v} (t) = \\left\\langle d^{\\rm{MBM}}_{p,n,m_1,m_2,v} (t) \\right\\rangle_{m_1, m_2}`
"""
return Data(np.nanmean(np.nanmean(self.ensemble_members_distance, axis=2), axis=2)[:, :, np.newaxis, ...], self.metadata, self.timestamps, self.dtype)
@property
def uncertainty(self):
"""Data: Average over the observations of the :attr:`observational_distance` divided by 2:
:math:`\\langle d^{\\rm{obs}}_{p,n_1,n_2,m,v} (t)\\rangle_{n_1,n_2} / 2`.
Sometimes called the uncertainty contribution of the CRPS.
See :cite:`data-hersbach2000decomposition` for more details."""
return Data(np.nanmean(np.nanmean(self.observational_distance, axis=1), axis=1)[:, np.newaxis, ...] / 2, self.metadata, self.timestamps, self.dtype)
@property
def centered_ensemble(self):
"""Data: Returns an ensemble centered on its :attr:`ensemble_mean`:
:math:`\\bar{\\mathcal{D}}^{\\rm ens}_{p,n,m,v} (t) = \\mathcal{D}_{p,n,m,v} (t) - \\mu^{\\rm ens}_{p,n,v} (t)`."""
return self - self.ensemble_mean
@property
def centered_observation(self):
"""Data: Returns an ensemble centered on its :attr:`observational_mean`:
:math:`\\bar{\\mathcal{D}}^{\\rm obs}_{p,n,m,v} (t) = \\mathcal{D}_{p,n,m,v} (t) - \\mu^{\\rm obs}_{p,n,v} (t)`."""
return self - self.observational_mean
[docs] def ensemble_distance(self, other):
"""Data: Averaged distance between ensemble member and another Data object:
:math:`d^{\\rm{ens}}_{p,n,v} [\\mathcal{O}] (t) = \\langle|\\mathcal{D}_{p,n,m,v} (t)- \\mathcal{O}_{p,n,m,v} (t)|\\rangle_m` where :math:`\\mathcal{O}` is the
other Data object."""
return abs(self - other).ensemble_mean
def _CRPS_ab(self, other):
data = np.sort(self.data, axis=2)
d = np.diff(data, axis=2)
a = np.where(data[:, :, 1:, ...] < other.data, d, 0)
b = np.where(data[:, :, :-1, ...] > other.data, d, 0)
a = np.where(np.logical_and(data[:, :, :-1, ...] < other.data, other.data < data[:, :, 1:, ...]),
other.data - data[:, :, :-1, ...], a)
b = np.where(np.logical_and(data[:, :, :-1, ...] < other.data, other.data < data[:, :, 1:, ...]),
data[:, :, 1:, ...] - other.data, b)
outb = np.where(other.data < self.ensemble_min.data, self.ensemble_min.data - other.data, 0)
outa = np.where(other.data > self.ensemble_max.data, other.data - self.ensemble_max.data, 0)
a = np.concatenate((a, outa), axis=2)
b = np.concatenate((outb, b), axis=2)
return Data(a, self.metadata, self.timestamps, self.dtype), Data(b, self.metadata, self.timestamps, self.dtype)
def _CRPS_go(self, other):
a, b = self._CRPS_ab(other)
aa = a.observational_mean
bb = b.observational_mean
g = aa + bb
o = bb / g
o.data[:, 0, 0, ...] = np.nanmean(np.where(self.ensemble_min.data - other.data > 0, 1, 0), axis=(1, 2))
o.data[:, 0, -1, ...] = np.nanmean(np.where(self.ensemble_max.data - other.data > 0, 1, 0), axis=(1, 2))
g.data[:, 0, 0, ...] = bb.data[:, 0, 0, ...] / o.data[:, 0, 0, ...]
g.data[:, 0, -1, ...] = aa[:, 0, -1, ...] / (1 - o.data[:, 0, -1, ...])
return g, o
[docs] def CRPS_relipot(self, other):
"""Return the decomposition of CRPS scores with another Data object :math:`\\mathcal{O}` (typically containing
observations) according to the fomula:
:math:`{\\rm CRPS}_{p,v} (t) = {\\rm Reli}_{p,v} (t) + {\\rm CRPS}^{\\rm pot}_{p,v} (t)`
where :math:`{\\rm Reli}_{p,v} (t)` and :math:`{\\rm CRPS}^{\\rm pot}_{p,v} (t)` are
respectively the reliability and potential CRPS, i.e. the CRPS one would obtain with a perfectly reliable ensemble.
See :cite:`data-hersbach2000decomposition`, pp. 564 for more details.
Parameters
----------
other: Data
Another Data object with observations in it.
Returns
-------
tuple(Data)
The decomposition of the CRPS score into the reliability and the potential CRPS.
"""
p = np.arange(1, self.number_of_members+1) / self.number_of_members
g, o = self._CRPS_go(other)
reli = np.moveaxis(np.moveaxis(g.data, 2, -1) * (np.moveaxis(o.data, 2, -1) - p)**2, -1, 2)
pot = g.data * o.data * (1 - o.data)
return Data((np.nansum(reli, axis=2))[:, :, np.newaxis, ...], self.metadata, self.timestamps, self.dtype), \
Data((np.nansum(pot, axis=2))[:, :, np.newaxis, ...], self.metadata, self.timestamps, self.dtype)
[docs] def CRPS_decomposition(self, other):
"""Return the decomposition of CRPS scores with another Data object :math:`\\mathcal{O}` (typically containing
observations) according to the fomula:
:math:`{\\rm CRPS}_{p,v} (t) = {\\rm Reli}_{p,v} (t) - {\\rm Resol}_{p,v} (t) + {\\rm Unc}_{p,v} (t)`
where :math:`{\\rm Reli}_{p,v} (t)`, :math:`{\\rm Resol}_{p,v} (t)` and :math:`{\\rm Unc}_{p,v} (t)` are
respectively the reliability, the resolution and the uncertainty contribution to the CRPS.
See :cite:`data-hersbach2000decomposition`, pp. 565 for more details.
Parameters
----------
other: Data
Another Data object with observations in it.
Returns
-------
tuple(Data)
The decomposition of the CRPS score into the reliability, the resolution and the uncertainty contribution.
"""
reli, pot = self.CRPS_relipot(other)
uncert = other.uncertainty
resol = uncert - pot
return reli, resol, uncert
[docs] def CRPS(self, other):
"""Return the CRPS scores with another Data object :math:`\\mathcal{O}` (typically containing
observations). This score is computed according to :cite:`data-hersbach2000decomposition` (see pp. 563-564).
Parameters
----------
other: Data
Another Data object with observations in it.
Returns
-------
Data
The CRPS score.
"""
p = np.arange(1, self.number_of_members+1) / self.number_of_members
a, b = self._CRPS_ab(other)
c = np.moveaxis(np.moveaxis(a.data, 2, -1) * p**2 + np.moveaxis(b.data, 2, -1) * (1 - p)**2, -1, 2)
return Data((c.sum(axis=2))[:, :, np.newaxis, ...], self.metadata, self.timestamps, self.dtype).observational_mean
[docs] def Abs_CRPS(self, other):
"""Return the Absolute norm CRPS scores with another Data object :math:`\\mathcal{O}` (typically containing
observations). This score is computed with the analytical formula:
:math:`{\\rm CRPS}^{\\rm Abs}_{p,v} (t) = \\left\\langle d^{\\rm ens}_{p,n,v} [\\mathcal{O}] (t) - \\delta_{p,n,v} (t) /2 \\right\\rangle_n`
where :math:`d^{\\rm{ens}}_{p,n,v} [\\mathcal{O}]` is the :meth:`~.Data.ensemble_distance` with the observations object :math:`\\mathcal{O}`,
and :math:`\\delta_{p,n,v}` is :attr:`delta`, obtained by taking the average of the :attr:`ensemble_members_distance` over the ensemble members.
See :cite:`data-van2015ensemble` and :cite:`data-gneiting2007strictly` for more details.
Parameters
----------
other: Data
Another Data object with observations in it.
Returns
-------
Data
The Absolute norm CRPS score.
"""
return (self.ensemble_distance(other) - 0.5 * self.delta).observational_mean
[docs] def Ngr_CRPS(self, other):
"""Return the Non-homogeneous Gaussian Regression (NGR) CRPS scores with another Data object :math:`\\mathcal{O}`
(typically containing observations). This score is computed with the analytical formula:
:math:`{\\rm CRPS}^{\\rm Ngr}_{p,v} (t) = \\left\\langle\\sigma^{\\rm ens}_{p,n,v} (t) \\left(z_{p,n,v}(t)(2\\Phi(z_{p,n,v}(t)) -1) + 2\\phi(z_{p,n,v}(t)) - \\pi^{-1/2}\\right)\\right\\rangle_n`
where :math:`\\phi` is the normal distribution, :math:`\\Phi` is its cumulative distribution function and
:math:`z_{p,n,v} = \\left(\\mathcal{O}_{p,n,v} (t) -\\mu^{\\rm{ens}}_{p,n,v} (t)\\right)/\\sigma^{\\rm{ens}}_{p,n,v} (t)` is the
standardized error with respect to the `other` data (where :math:`\\mu^{\\rm{ens}}` and :math:`\\sigma^{\\rm{ens}}` are respectively the :attr:`ensemble_mean` and the :attr:`ensemble_std`).
See :cite:`data-van2015ensemble` and :cite:`data-gneiting2005calibrated` for more details.
Parameters
----------
other: Data
Another Data object with observations in it. Must have the same shape as the Data object, except along the members :math:`m` dimensions (2nd axis).
Returns
-------
Data
The NGR CRPS score.
"""
z = (other - self).ensemble_mean.data / self.ensemble_std.data
return (self.ensemble_std * (z * (2 * norm.cdf(z) - 1) + 2 * norm.pdf(z) - np.pi**(-0.5))).observational_mean
[docs] def bias(self, other):
"""Return the bias :math:`\\left\\langle\\mu^{\\rm{ens}}_{p,n,v} (t) - \\mathcal{O}_{p,n,v} (t)\\right\\rangle_n`
with another Data object :math:`\\mathcal{O}` (typically containing observations).
Parameters
----------
other: Data
Another Data object with observations in it. Must have the same shape as the Data object, except along the members :math:`m` dimensions (2nd axis).
Returns
-------
Data
The bias.
"""
return (self - other).ensemble_mean.observational_mean
[docs] def ensemble_mean_MSE(self, other):
"""Return the Mean Square Error of the ensemble mean :math:`\\left\\langle\\left(\\mu^{\\rm{ens}}_{p,n,v} (t) - \\mathcal{O}_{p,n,v} (t)\\right)^2\\right\\rangle_n`
with another Data object :math:`\\mathcal{O}` (typically containing observations).
Parameters
----------
other: Data
Another Data object with observations in it. Must have the same shape as the Data object, except along the members :math:`m` dimensions (2nd axis).
Returns
-------
Data
The ensemble mean Mean Square Error.
"""
return ((self.ensemble_mean - other)**2).observational_mean
[docs] def ensemble_mean_RMSE(self, other):
"""Return the Root Mean Square Error of the ensemble mean :math:`\\sqrt{\\left\\langle\\left(\\mu^{\\rm{ens}}_{p,n,v} (t) - \\mathcal{O}_{p,n,v} (t)\\right)^2\\right\\rangle_n}`
with another Data object :math:`\\mathcal{O}` (typically containing observations).
Parameters
----------
other: Data
Another Data object with observations in it. Must have the same shape as the Data object, except along the members :math:`m` dimensions (2nd axis).
Returns
-------
Data
The ensemble mean Root Mean Square Error.
"""
return self.ensemble_mean_MSE(other)**0.5
[docs] def plot(self, predictor=0, variable=0, ax=None, timestamps=None, global_label=None, grid_point=None, **kwargs):
"""Plot the data as a function of time.
Parameters
----------
predictor: int, optional
The predictor index to use. Default is 0.
variable: int, optional
The variable index to use.
ax: ~matplotlib.axes.Axes, optional
An axes on which to plot.
timestamps: None or ~numpy.ndarray(~datetime.datetime), optional
An array containing the timestamp of the data. If None, try to use the data `timestamps` and in last resort a numbered time index.
Default to `None`.
global_label: None or str or list(str), optional
Label to represent all the data (str), or all the data of one observation (list of str) in the legend.
grid_point: tuple(int, int), optional
If the data are fields, specifies which grid point to plot.
kwargs: dict
Argument to be passed to the plotting routine.
Returns
-------
ax: ~matplotlib.axes.Axes
An axes where the data were plotted.
"""
if plt is None:
warnings.warn('Matplotlib not loaded, cannot plot !', ImportWarning)
return None
p = predictor
v = variable
if grid_point is None:
ni = None
nj = None
else:
ni = grid_point[0]
nj = grid_point[1]
if ax is None:
fig = plt.figure()
ax = fig.gca()
if timestamps is None:
timestamps = self._get_timestamps(predictor)
elif not isinstance(timestamps, list):
timestamps = [timestamps] * self.number_of_observations
opo = False
lab = False
if isinstance(global_label, list):
opo = True
lab = True
elif isinstance(global_label, str):
opo = False
lab = True
if self._scalars:
first = True
for i in range(self.number_of_observations):
if opo:
first = True
for j in range(self.number_of_members):
if lab and first:
if opo:
ax.plot(timestamps[i], self.data[p, i, j, v, :], label=global_label[i], **kwargs)
else:
ax.plot(timestamps[i], self.data[p, i, j, v, :], label=global_label, **kwargs)
first = False
else:
ax.plot(timestamps[i], self.data[p, i, j, v, :], **kwargs)
elif self._fields:
if ni is None or nj is None:
warnings.warn('You must specify which grid point to plot, cannot plot !', UserWarning)
return None
first = True
for i in range(self.number_of_observations):
if opo:
first = True
for j in range(self.number_of_members):
if lab and first:
if opo:
ax.plot(timestamps[i], self.data[p, i, j, v, :, ni, nj], label=global_label[i], **kwargs)
else:
ax.plot(timestamps[i], self.data[p, i, j, v, :, ni, nj], label=global_label, **kwargs)
first = False
else:
ax.plot(timestamps[i], self.data[p, i, j, v, :, ni, nj], **kwargs)
labels = ax.get_xticklabels()
plt.setp(labels, rotation=30, horizontalalignment='center')
return ax
[docs] def plot_ensemble_mean(self, predictor=0, variable=0, ax=None, timestamps=None, grid_point=None, **kwargs):
"""Plot the data ensemble mean :attr:`ensemble_mean` as a function of time.
Parameters
----------
predictor: int, optional
The predictor index to use. Default is 0.
variable: int, optional
The variable index to use.
ax: ~matplotlib.axes.Axes, optional
An axes on which to plot.
timestamps: None or ~numpy.ndarray(~datetime.datetime), optional
An array containing the timestamp of the data. If None, try to use the data `timestamps` and in last resort a numbered time index.
Default to `None`.
grid_point: tuple(int, int)
If the data are fields, specifies which grid point to plot.
kwargs: dict
Argument to be passed to the plotting routine.
Returns
-------
ax: ~matplotlib.axes.Axes
An axes where the data were plotted.
"""
em = self.ensemble_mean
ax = em.plot(predictor, variable, ax, timestamps, grid_point=grid_point, **kwargs)
return ax
[docs] def plot_ensemble_std(self, predictor=0, variable=0, ax=None, timestamps=None, grid_point=None, **kwargs):
"""Plot the data ensemble standard deviation :attr:`ensemble_std` as a function of time.
Parameters
----------
predictor: int, optional
The predictor index to use. Default is 0.
variable: int, optional
The variable index to use.
ax: ~matplotlib.axes.Axes, optional
An axes on which to plot.
timestamps: None or ~numpy.ndarray(~datetime.datetime), optional
An array containing the timestamp of the data. If None, try to use the data `timestamps` and in last resort a numbered time index.
Default to `None`.
grid_point: tuple(int, int)
If the data are fields, specifies which grid point to plot.
kwargs: dict
Argument to be passed to the plotting routine.
Returns
-------
ax: ~matplotlib.axes.Axes
An axes where the data were plotted.
"""
if plt is None:
warnings.warn('Matplotlib not loaded, cannot plot !', ImportWarning)
return None
if grid_point is None:
ni = None
nj = None
else:
ni = grid_point[0]
nj = grid_point[1]
if ax is None:
fig = plt.figure()
ax = fig.gca()
if timestamps is None:
timestamps = self._get_timestamps(predictor)
elif not isinstance(timestamps, list):
timestamps = [timestamps] * self.number_of_observations
em = self.ensemble_mean
std = np.sqrt(self.ensemble_var.data)
if self._scalars:
for i in range(self.number_of_observations):
std_plot(timestamps[i], em[predictor, i, 0, variable, :], std[predictor, i, 0, variable, :], ax=ax, **kwargs)
elif self._fields:
if ni is None or nj is None:
warnings.warn('You must specify which grid point to plot, cannot plot !', UserWarning)
return None
for i in range(self.number_of_observations):
std_plot(timestamps[i], em[predictor, i, 0, variable, :, ni, nj], std[predictor, i, 0, variable, :, ni, nj], ax=ax, **kwargs)
labels = ax.get_xticklabels()
plt.setp(labels, rotation=30, horizontalalignment='center')
return ax
[docs] def plot_ensemble_quantiles(self, q, low_interpolation='linear', high_interpolation='linear',
predictor=0, variable=0, ax=None, timestamps=None, grid_point=None, alpha=0.1, **kwargs):
"""Plot the data ensemble quantiles :attr:`ensemble_quantiles` as a function of time.
Parameters
----------
q: array_like(float)
Quantile or sequence of quantiles to compute, which must be between 0 and 0.5 exclusive.
A symmetric quantile with respect to 0.5 will also be computed.
low_interpolation: str, optional
This optional parameter specifies the interpolation method to use when the desired lower quantile (q<0.5)
lies between two data points. See :func:`numpy.quantile` for more information.
high_interpolation: str, optional
This optional parameter specifies the interpolation method to use when the desired higher quantile (q>0.5)
lies between two data points. See :func:`numpy.quantile` for more information.
predictor: int, optional
The predictor index to use. Default is 0.
variable: int, optional
The variable index to use.
ax: ~matplotlib.axes.Axes, optional
An axes on which to plot.
timestamps: None or ~numpy.ndarray(~datetime.datetime), optional
An array containing the timestamp of the data. If None, try to use the data `timestamps` and in last resort a numbered time index.
Default to `None`.
grid_point: tuple(int, int)
If the data are fields, specifies which grid point to plot.
alpha: float
Base level of transparency for the highest and lowest quantiles.
kwargs: dict
Argument to be passed to the plotting routine.
Returns
-------
ax: ~matplotlib.axes.Axes
An axes where the data were plotted.
"""
if plt is None:
warnings.warn('Matplotlib not loaded, cannot plot !', ImportWarning)
return None
if grid_point is None:
ni = None
nj = None
else:
ni = grid_point[0]
nj = grid_point[1]
if ax is None:
fig = plt.figure()
ax = fig.gca()
if timestamps is None:
timestamps = self._get_timestamps(predictor)
elif not isinstance(timestamps, list):
timestamps = [timestamps] * self.number_of_observations
qq = list()
for qi in q:
qq.append(1.-qi)
low_quant = self.ensemble_quantiles(q, interpolation=low_interpolation)
high_quant = self.ensemble_quantiles(qq, interpolation=high_interpolation)
if self._scalars:
for i in range(self.number_of_observations):
for qi in range(len(q)):
ax.fill_between(timestamps[i], low_quant[predictor, i, qi, variable, :],
high_quant[predictor, i, qi, variable, :],
alpha=alpha+q[qi], **kwargs)
elif self._fields:
if ni is None or nj is None:
warnings.warn('You must specify which grid point to plot, cannot plot !', UserWarning)
return None
for i in range(self.number_of_observations):
for qi in range(len(q)):
ax.fill_between(timestamps[i], low_quant[predictor, i, qi, variable, :, ni, nj],
high_quant[predictor, i, qi, variable, :, ni, nj],
alpha=alpha+q[qi], **kwargs)
return ax
[docs] def plot_ensemble_minmax(self, predictor=0, variable=0, ax=None, timestamps=None, grid_point=None, **kwargs):
"""Plot the data ensemble minimum :attr:`ensemble_min` and maximum :attr:`ensemble_max` as a function of time.
Parameters
----------
predictor: int, optional
The predictor index to use. Default is 0.
variable: int, optional
The variable index to use.
ax: ~matplotlib.axes.Axes, optional
An axes on which to plot.
timestamps: None or ~numpy.ndarray(~datetime.datetime), optional
An array containing the timestamp of the data. If None, try to use the data `timestamps` and in last resort a numbered time index.
Default to `None`.
grid_point: tuple(int, int)
If the data are fields, specifies which grid point to plot.
kwargs: dict
Argument to be passed to the plotting routine.
Returns
-------
ax: ~matplotlib.axes.Axes, optional
An axes where the data were plotted.
"""
if plt is None:
warnings.warn('Matplotlib not loaded, cannot plot !', ImportWarning)
return None
if grid_point is None:
ni = None
nj = None
else:
ni = grid_point[0]
nj = grid_point[1]
if ax is None:
fig = plt.figure()
ax = fig.gca()
if timestamps is None:
timestamps = self._get_timestamps(predictor)
elif not isinstance(timestamps, list):
timestamps = [timestamps] * self.number_of_observations
mini = self.ensemble_min
maxi = self.ensemble_max
if self._scalars:
for i in range(self.number_of_observations):
minmax_plot(timestamps[i], mini[predictor, i, 0, variable, :], maxi[predictor, i, 0, variable, :], ax=ax, **kwargs)
elif self._fields:
if ni is None or nj is None:
warnings.warn('You must specify which grid point to plot, cannot plot !', UserWarning)
return None
for i in range(self.number_of_observations):
minmax_plot(timestamps[i], mini[predictor, i, 0, variable, :], maxi[predictor, i, 0, variable, :, ni, nj], ax=ax, **kwargs)
labels = ax.get_xticklabels()
plt.setp(labels, rotation=30, horizontalalignment='center')
return ax
[docs] def plot_CRPS(self, other, predictor=0, variable=0, ax=None, timestamps=None, grid_point=None, **kwargs):
"""Plot the data CRPS :meth:`CRPS` score with respect to observation data (other) a function of time.
Parameters
----------
other: Data
Another data structure holding the observations.
predictor: int, optional
The predictor index to use. Default is 0.
variable: int, optional
The variable index to use.
ax: ~matplotlib.axes.Axes, optional
An axes on which to plot.
timestamps: None or ~numpy.ndarray(~datetime.datetime), optional
An array containing the timestamp of the data. If None, try to use the data `timestamps` and in last resort a numbered time index.
Default to `None`.
grid_point: tuple(int, int)
If the data are fields, specifies which grid point to plot.
kwargs: dict
Argument to be passed to the plotting routine.
Returns
-------
ax: ~matplotlib.axes.Axes
An axes where the CRPS data were plotted.
"""
crps = self.CRPS(other)
ax = crps.plot(predictor, variable, ax, timestamps, grid_point=grid_point, **kwargs)
return ax
[docs] def plot_Abs_CRPS(self, other, predictor=0, variable=0, ax=None, timestamps=None, grid_point=None, **kwargs):
"""Plot the data Absolute norm CRPS :meth:`Abs_CRPS` score with respect to observation data (other) a function of time.
Parameters
----------
other: Data
Another data structure holding the observations.
predictor: int, optional
The predictor index to use. Default is 0.
variable: int, optional
The variable index to use.
ax: ~matplotlib.axes.Axes, optional
An axes on which to plot.
timestamps: None or ~numpy.ndarray(~datetime.datetime), optional
An array containing the timestamp of the data. If None, try to use the data `timestamps` and in last resort a numbered time index.
Default to `None`.
grid_point: tuple(int, int)
If the data are fields, specifies which grid point to plot.
kwargs: dict
Argument to be passed to the plotting routine.
Returns
-------
ax: ~matplotlib.axes.Axes
An axes where the CRPS data were plotted.
"""
abs_crps = self.Abs_CRPS(other)
ax = abs_crps.plot(predictor, variable, ax, timestamps, grid_point=grid_point, **kwargs)
return ax
[docs] def plot_Ngr_CRPS(self, other, predictor=0, variable=0, ax=None, timestamps=None, grid_point=None, **kwargs):
"""Plot the data Non-homogeneous Gaussian Regression (NGR) CRPS :meth:`Ngr_CRPS` score with respect to observation data (other) a function of time.
Parameters
----------
other: Data
Another data structure holding the observations.
predictor: int, optional
The predictor index to use. Default is 0.
variable: int, optional
The variable index to use.
timestamps: None or ~numpy.ndarray(~datetime.datetime), optional
An array containing the timestamp of the data. If None, try to use the data `timestamps` and in last resort a numbered time index.
Default to `None`.
grid_point: tuple(int, int)
If the data are fields, specifies which grid point to plot.
ax: ~matplotlib.axes.Axes, optional
An axes on which to plot.
kwargs: dict
Argument to be passed to the plotting routine.
Returns
-------
ax: ~matplotlib.axes.Axes
An axes where the CRPS data were plotted.
"""
if timestamps is None:
timestamps = self.timestamps
ngr_crps = self.Ngr_CRPS(other)
ax = ngr_crps.plot(predictor, variable, ax, timestamps, grid_point=grid_point, **kwargs)
return ax
[docs] def plot_observational_mean(self, predictor=0, variable=0, ax=None, timestamps=None, grid_point=None, **kwargs):
"""Plot the data observational mean :attr:`observational_mean` as a function of time.
Parameters
----------
predictor: int, optional
The predictor index to use. Default is 0.
variable: int, optional
The variable index to use.
ax: ~matplotlib.axes.Axes, optional
An axes on which to plot.
timestamps: None or ~numpy.ndarray(~datetime.datetime), optional
An array containing the timestamp of the data. If None, try to use the data `timestamps` and in last resort a numbered time index.
Default to `None`.
grid_point: tuple(int, int)
If the data are fields, specifies which grid point to plot.
kwargs: dict
Argument to be passed to the plotting routine.
Returns
-------
ax: ~matplotlib.axes.Axes
An axes where the data were plotted.
"""
om = self.observational_mean
ax = om.plot(predictor, variable, ax, timestamps, grid_point=grid_point, **kwargs)
return ax
[docs] def plot_observational_std(self, predictor=0, variable=0, ax=None, timestamps=None, grid_point=None, **kwargs):
"""Plot the data observational standard deviation :attr:`observational_std` as a function of time.
Parameters
----------
predictor: int, optional
The predictor index to use. Default is 0.
variable: int, optional
The variable index to use.
ax: ~matplotlib.axes.Axes, optional
An axes on which to plot.
timestamps: None or ~numpy.ndarray(~datetime.datetime), optional
An array containing the timestamp of the data. If None, try to use the data `timestamps` and in last resort a numbered time index.
Default to `None`.
grid_point: tuple(int, int)
If the data are fields, specifies which grid point to plot.
kwargs: dict
Argument to be passed to the plotting routine.
Returns
-------
ax: ~matplotlib.axes.Axes
An axes where the data were plotted.
"""
if plt is None:
warnings.warn('Matplotlib not loaded, cannot plot !', ImportWarning)
return None
if grid_point is None:
ni = None
nj = None
else:
ni = grid_point[0]
nj = grid_point[1]
if ax is None:
fig = plt.figure()
ax = fig.gca()
em = self.observational_median
std = np.sqrt(self.observational_var.data)
if timestamps is None:
timestamps = em._get_timestamps(predictor)
timestamps = timestamps[predictor, 0]
if self._scalars:
std_plot(timestamps, em[predictor, 0, 0, variable, :], std[predictor, 0, 0, variable, :], ax=ax, **kwargs)
elif self._fields:
if ni is None or nj is None:
warnings.warn('You must specify which grid point to plot, cannot plot !', UserWarning)
return None
std_plot(timestamps, em[predictor, 0, 0, variable, :, ni, nj], std[predictor, 0, 0, variable, :, ni, nj], ax=ax, **kwargs)
labels = ax.get_xticklabels()
plt.setp(labels, rotation=30, horizontalalignment='center')
return ax
[docs] def plot_observational_quantiles(self, q, low_interpolation='linear', high_interpolation='linear',
predictor=0, variable=0, ax=None, timestamps=None, grid_point=None, alpha=0.1, **kwargs):
"""Plot the data observational quantiles :attr:`observational_quantiles` as a function of time.
Parameters
----------
q: array_like(float)
Quantile or sequence of quantiles to compute, which must be between 0 and 0.5 exclusive.
A symmetric quantile with respect to 0.5 will also be computed.
low_interpolation: str, optional
This optional parameter specifies the interpolation method to use when the desired lower quantile (q<0.5)
lies between two data points. See :func:`numpy.quantile` for more information.
high_interpolation: str, optional
This optional parameter specifies the interpolation method to use when the desired higher quantile (q>0.5)
lies between two data points. See :func:`numpy.quantile` for more information.
predictor: int, optional
The predictor index to use. Default is 0.
variable: int, optional
The variable index to use.
ax: ~matplotlib.axes.Axes, optional
An axes on which to plot.
timestamps: None or ~numpy.ndarray(~datetime.datetime), optional
An array containing the timestamp of the data. If None, try to use the data `timestamps` and in last resort a numbered time index.
Default to `None`.
grid_point: tuple(int, int)
If the data are fields, specifies which grid point to plot.
alpha: float
Base level of transparency for the highest and lowest quantiles.
kwargs: dict
Argument to be passed to the plotting routine.
Returns
-------
ax: ~matplotlib.axes.Axes
An axes where the data were plotted.
"""
if plt is None:
warnings.warn('Matplotlib not loaded, cannot plot !', ImportWarning)
return None
if grid_point is None:
ni = None
nj = None
else:
ni = grid_point[0]
nj = grid_point[1]
if ax is None:
fig = plt.figure()
ax = fig.gca()
qq = list()
for qi in q:
qq.append(1.-qi)
low_quant = self.observational_quantiles(q, interpolation=low_interpolation)
high_quant = self.observational_quantiles(qq, interpolation=high_interpolation)
if timestamps is None:
timestamps = low_quant._get_timestamps(predictor)
timestamps = timestamps[0]
if self._scalars:
for qi in range(len(q)):
ax.fill_between(timestamps, low_quant[predictor, 0, qi, variable, :],
high_quant[predictor, 0, qi, variable, :],
alpha=alpha+q[qi], **kwargs)
elif self._fields:
if ni is None or nj is None:
warnings.warn('You must specify which grid point to plot, cannot plot !', UserWarning)
return None
for qi in range(len(q)):
ax.fill_between(timestamps, low_quant[predictor, 0, qi, variable, :, ni, nj],
high_quant[predictor, 0, qi, variable, :, ni, nj],
alpha=alpha+q[qi], **kwargs)
return ax
[docs] def plot_observational_minmax(self, predictor=0, variable=0, ax=None, timestamps=None, grid_point=None, **kwargs):
"""Plot the data observational minimum :attr:`observational_min` and maximum :attr:`observational_max` as a function of time.
Parameters
----------
predictor: int, optional
The predictor index to use. Default is 0.
variable: int, optional
The variable index to use.
ax: ~matplotlib.axes.Axes, optional
An axes on which to plot.
timestamps: None or ~numpy.ndarray(~datetime.datetime), optional
An array containing the timestamp of the data. If None, try to use the data `timestamps` and in last resort a numbered time index.
Default to `None`.
grid_point: tuple(int, int)
If the data are fields, specifies which grid point to plot.
kwargs: dict
Argument to be passed to the plotting routine.
Returns
-------
ax: ~matplotlib.axes.Axes, optional
An axes where the data were plotted.
"""
if plt is None:
warnings.warn('Matplotlib not loaded, cannot plot !', ImportWarning)
return None
if grid_point is None:
ni = None
nj = None
else:
ni = grid_point[0]
nj = grid_point[1]
if ax is None:
fig = plt.figure()
ax = fig.gca()
mini = self.observational_min
maxi = self.observational_max
if timestamps is None:
timestamps = mini._get_timestamps(predictor)
timestamps = timestamps[0]
if self._scalars:
minmax_plot(timestamps, mini[predictor, 0, 0, variable, :], maxi[predictor, 0, 0, variable, :], ax=ax, **kwargs)
elif self._fields:
if ni is None or nj is None:
warnings.warn('You must specify which grid point to plot, cannot plot !', UserWarning)
return None
minmax_plot(timestamps, mini[predictor, 0, 0, variable, :], maxi[predictor, 0, 0, variable, :, ni, nj], ax=ax, **kwargs)
labels = ax.get_xticklabels()
plt.setp(labels, rotation=30, horizontalalignment='center')
return ax
[docs] def load_scalars(self, data, metadata=None, timestamps=None, load_axis=1, concat_axis=1, columns=0, replace_timestamps=False):
"""Load scalar data in the Data object. For the moment, only Pandas dataframe and NumPy arrays are accepted.
Parameters
----------
data: ~pandas.DataFrame or ~numpy.ndarray or list(~pandas.DataFrame) or list(~numpy.ndarray)
The data to load in the object, packed along the load_axis.
If ~numpy.ndarray are provided, they can be at most 2-dimensional and their last axis is always identified with the lead time.
The remaining axis will be identified to an axis of the Data object given by `load_axis`
If ~pandas.DataFrame are provided, there row axis is expected to be identified with the lead time,
while the columns axis will be identified to an axis of the Data object given by `load_axis`
In both cases, a list of data can be provided instead, the list items will be loaded along the axis provided by the first element of `load_axis`,
which must thus be a 2-tuple. If the list elements are 2D, their first axis are loaded along the axis of the Data object corresponding to the second element of
`load_axis`, the last axis being loaded along the lead time axis. If the list elements are 1D, they are loaded along the lead time axis.
Finally, if there are already data inside the object, the data provided will be appended to them along the `concat_axis`.
metadata: object or list(object)
The metadata of the provided data. If a list of data is provided, then a list of metadata object can be provided. Otherwise, the same
metadata object will be used for all the data items in the list.
timestamps: ~numpy.ndarray(~datetime.datetime) or list(~numpy.ndarray(~datetime.datetime))
The timestamps array(s) of the provided data, as ~datetime.datetime object. If a list of data is provided, then a list of timestamps arrays can be provided.
Otherwise, the same timestamps array will be used for all the data items in the list.
load_axis: int or str or tuple(int) or tuple(str)
Axis over which the provided data are loaded in the Data object.
Equal to 1 by default to match the observation index.
Can be a number if `data` is a ~numpy.ndarray or a ~pandas.DataFrame. Have to be 2-tuple if `data` is a list (see above).
Can also be a string like i.e. 'obs' to load along the observation index, or 'members' to load along the ensemble member index.
concat_axis: int or str
Axis over which the data have to be concatenated. Can be a number or
a string like i.e. 'obs' to concatenate along the observation index, or 'members' to concatenate along the ensemble member index.
columns: int or str or list(int) or list(str), optional
Allow to specify the column of the ~pandas.DataFrame to load along `load_axis`. Only works with pandas ~pandas.DataFrame.
replace_timestamps: bool
Replace the timestamps possibly already present in the Data object if `concat_axis` is not 1. Default to `False`.
"""
# checking axis input argument
load_axis = self._determine_axis(load_axis)
concat_axis = self._determine_axis(concat_axis)
if load_axis is None:
warnings.warn('Wrong axis argument. Data not loaded.', SyntaxWarning)
return
if isinstance(load_axis, int):
if load_axis == 0:
warnings.warn('Cannot load a predictor with this method. Data not loaded.', UserWarning)
return
else:
if 0 in load_axis:
warnings.warn('Cannot load a predictor with this method. Data not loaded.', UserWarning)
return
# checking the data type of the data input argument
data_type = self._determine_data_type(data)
if data_type in ["unknown", "none"]:
s = ""
if pd is None:
s += "\nPlease note that pandas is not loaded. That might cause the error."
warnings.warn('Wrong data type for the data argument. Data not loaded.' + s, UserWarning)
return
# checking the columns input argument
column_type, columns = self._determine_columns(columns)
if column_type == "unknown":
warnings.warn('Wrong columns argument. Data not loaded.', SyntaxWarning)
return
# considering the case where the input data are pandas dataframes
if "pandas" in data_type:
# loading the data array first
# case where only one dataframe is provided
if data_type == "pandas":
# loading the input data in a numpy array
new_data = self._create_scalars_data_from_pandas(data, load_axis, columns, column_type, self._dtype)
# case where a list of data frame is provided
elif data_type == "pandas_list":
# loading the input data in a list of numpy array
new_data = self._create_scalars_data_from_pandas_list(data, load_axis, columns, column_type, self._dtype)
else:
warnings.warn('Wrong pandas type. Data not loaded.', UserWarning)
new_data = None
if new_data is not None:
# loading metadata
new_metadata = self._create_metadata(new_data, metadata)
# loading timestamps
new_timestamps = self._create_timestamps(new_data, data, timestamps)
# considering the case where the input data are numpy arrays
elif "numpy" in data_type:
# loading the data array first
# case where only one array is provided
if data_type == "numpy":
new_data = self._create_scalars_data_from_ndarray(data, load_axis)
# case where a list of array is provided
elif data_type == "numpy_list":
new_data = self._create_scalars_data_from_ndarray_list(data, load_axis)
else:
warnings.warn('Wrong numpy type. Data not loaded.', UserWarning)
new_data = None
if new_data is not None:
# loading metadata
new_metadata = self._create_metadata(new_data, metadata)
# loading timestamps
new_timestamps = self._create_timestamps(new_data, data, timestamps)
else:
new_data = None
new_timestamps = None
new_metadata = None
if new_data is not None:
# case if the data object is empty
if self.data is None:
self.data = new_data
self.timestamps = new_timestamps
self.metadata = new_metadata
# otherwise try to concatenate with current data
else:
if not self._scalars:
warnings.warn('Existing Data object are not scalars. Data not loaded.', UserWarning)
return
try:
self.data = np.concatenate((self.data, new_data), axis=concat_axis)
except:
warnings.warn('Problem to concatenate the new data with one already present in the Data object. Data not loaded', UserWarning)
return
if load_axis == 1:
try:
self.timestamps = np.concatenate((self.timestamps, new_timestamps), axis=concat_axis)
except:
warnings.warn('Problem to concatenate the new timestamps with one already present in the Data object. Timestamps not loaded', UserWarning)
try:
self.metadata = np.concatenate((self.metadata, new_metadata), axis=concat_axis)
except:
warnings.warn('Problem to concatenate the new metadata with one already present in the Data object. Metadata not loaded', UserWarning)
elif replace_timestamps:
self.timestamps = new_timestamps
self.metadata = new_metadata
else:
warnings.warn('Warning. No timestamps or metadata loaded with the new data!', UserWarning)
def _create_scalars_data_from_pandas(self, data, axis, columns, column_type, output_dtype):
if column_type == 'label':
data_np = data.loc[:, columns].values.T.astype(dtype=output_dtype)
elif column_type == 'number':
data_np = data.iloc[:, columns].values.T.astype(dtype=output_dtype)
elif column_type == "all":
data_np = data.values.T.astype(dtype=output_dtype)
else:
warnings.warn('Wrong columns argument. Data not loaded.', SyntaxWarning)
return None
new_data = self._create_scalars_data_from_ndarray(data_np, axis)
return new_data
@staticmethod
def _create_scalars_data_from_ndarray(data_np, axis):
if not isinstance(axis, int):
warnings.warn('Wrong axis argument. Data not loaded.', SyntaxWarning)
return None
if len(data_np.shape) == 1:
data_np = data_np[np.newaxis, :]
# determining the shape of the data object being created
shape = [1 for i in range(axis)] + [data_np.shape[0]] + [1 for i in range(axis + 1, 4)] + [data_np.shape[1]]
try:
ret_data = data_np.reshape(shape)
except:
warnings.warn('Impossible to reshape properly the input data. Data not loaded.', UserWarning)
ret_data = None
return ret_data
def _create_scalars_data_from_pandas_list(self, data, axis, columns, column_type, output_dtype):
data_np = list()
if column_type == 'label':
for d in data:
data_np.append(d.loc[:, columns].values.T.astype(dtype=output_dtype))
if len(data_np[-1].shape) == 1:
data_np[-1] = data_np[-1][np.newaxis, :]
elif column_type == 'number':
for d in data:
data_np.append(d.iloc[:, columns].values.T.astype(dtype=output_dtype))
if len(data_np[-1].shape) == 1:
data_np[-1] = data_np[-1][np.newaxis, :]
elif column_type == "all":
for d in data:
data_np.append(d.values.T.astype(dtype=output_dtype))
if len(data_np[-1].shape) == 1:
data_np[-1] = data_np[-1][np.newaxis, :]
else:
warnings.warn('Wrong columns argument. Data not loaded.', SyntaxWarning)
return None
# check the shape of the provided arrays
for d in data_np[1:]:
if d.shape != data_np[0].shape:
warnings.warn('Incompatible pandas dataframes in the list provided. Data not loaded.', SyntaxWarning)
return None
new_data = self._create_scalars_data_from_ndarray_list(data_np, axis)
return new_data
@staticmethod
def _create_scalars_data_from_ndarray_list(data_np, axis):
# determining the shape of the data object being created
if not isinstance(axis, (list, tuple)):
warnings.warn('Wrong axis argument. Data not loaded.', SyntaxWarning)
return None
elif len(axis) > 2:
warnings.warn('Too many axis provided. Only considering the first 2.', SyntaxWarning)
axis = axis[:2]
sh = [1, data_np[0].shape[0]]
if axis[0] == axis[1]:
warnings.warn('Wrong axis argument. Data not loaded.', SyntaxWarning)
return None
elif axis[0] < axis[1]:
sa = axis
else:
sa = axis[::-1]
sh = sh[::-1]
shape = [1 for i in range(sa[0])] + [sh[0]] + [1 for i in range(sa[0] + 1, sa[1])] + [sh[1]] + \
[1 for i in range(sa[1] + 1, 4)] + [data_np[0].shape[1]]
try:
new_data = data_np[0].reshape(shape)
for d in data_np[1:]:
new_data = np.concatenate((new_data, d.reshape(shape)), axis=axis[0])
except:
warnings.warn('Impossible to reshape and concatenate properly the input data. Data not loaded.', UserWarning)
new_data = None
return new_data
@staticmethod
def _create_metadata(new_data, metadata):
if metadata is False: # private way to disable the metadata
return None
new_metadata = np.empty(new_data.shape[:2], dtype=object)
if not isinstance(metadata, (list, tuple)):
# warnings.warn('List of data provided, but only one metadata object provided. Using the same metadata for each member of the list', UserWarning)
for i in range(new_metadata.shape[1]):
new_metadata[0, i] = metadata
else:
if len(metadata) == new_metadata.shape[1]:
for i in range(new_metadata.shape[1]):
new_metadata[0, i] = metadata[i]
else:
warnings.warn('Incoherent number of data elements and metadata elements. Metadata not loaded.', UserWarning)
return new_metadata
@staticmethod
def _create_timestamps(new_data, data, timestamps):
if timestamps is False: # private way to disable the timestamps
return None
# check if a timestamp vector list has been provided
new_timestamps = np.empty(new_data.shape[:2], dtype=object)
if not isinstance(timestamps, (list, tuple)):
# warnings.warn('List of data provided, but only one timestamps vectors provided. Using the same vector for each member of the list', UserWarning)
if timestamps is not None:
# check the provided timestamp
if not isinstance(timestamps, np.ndarray):
warnings.warn('The provided timestamps do not form a numpy array. Timestamps not loaded.', UserWarning)
timestamps = None
else:
# if a valid timestamps array is provided just pass it
if timestamps.shape == new_data.shape[:2] and timestamps.dtype == np.dtype(object):
return timestamps
for t in timestamps:
if not isinstance(t, datetime.datetime):
raise Exception # TODO: Debug
warnings.warn('The provided timestamps are not datetime object. Timestamps not loaded.', UserWarning)
timestamps = None
break
for i in range(new_timestamps.shape[1]):
new_timestamps[0, i] = timestamps
else:
if isinstance(data, (list, tuple)):
timestamps = list()
for d in data:
try:
timestamps.append(d.index.to_pydatetime())
except:
timestamps.append(None)
if len(timestamps) != new_timestamps.shape[1]:
timestamps = [None] * new_timestamps.shape[1]
for i in range(new_timestamps.shape[1]):
new_timestamps[0, i] = timestamps[i]
else:
try:
timestamps = data.index.to_pydatetime()
except:
timestamps = None
for i in range(new_timestamps.shape[1]):
new_timestamps[0, i] = timestamps
else:
if len(timestamps) == new_timestamps.shape[1]:
for i in range(new_timestamps.shape[1]):
new_timestamps[0, i] = timestamps[i]
else:
warnings.warn('Incoherent number of data elements and timestamps elements. Timestamps not loaded.', UserWarning)
return new_timestamps
[docs] def append_predictors(self, data):
"""Append a predictors Data object to the current ones (i.e. along the 0th axis).
Parameters
----------
data: Data
The data object of the predictors to append. Must be compatible/broadcastable.
If the initial Data object is empty, simply copy the `data` object.
"""
try:
if isinstance(data, Data):
if self.data is not None:
self.data = np.concatenate((self.data, data.data))
if self.metadata is not None:
self.metadata = np.concatenate((self.metadata, data.metadata))
if self.timestamps is not None:
self.timestamps = np.concatenate((self.timestamps, data.timestamps))
else:
self.data = data.data.copy()
self.metadata = data.metadata.copy()
self.timestamps = data.timestamps.copy()
except:
warnings.warn('Unable to append the provided predictors.', UserWarning)
[docs] def append_realizations(self, data):
"""Append a realizations Data object to the current ones (i.e. along the 1st axis).
Parameters
----------
data: Data
The data object of the realizations to append. Must be compatible/broadcastable.
If the initial Data object is empty, simply copy the `data` object.
"""
try:
if isinstance(data, Data):
if self.data is not None:
self.data = np.concatenate((self.data, data.data), axis=1)
if self.metadata is not None:
self.metadata = np.concatenate((self.metadata, data.metadata), axis=1)
if self.timestamps is not None:
self.timestamps = np.concatenate((self.timestamps, data.timestamps), axis=1)
else:
self.data = data.data.copy()
self.metadata = data.metadata.copy()
self.timestamps = data.timestamps.copy()
except:
warnings.warn('Unable to append the provided realizations.', UserWarning)
[docs] def append_observations(self, data):
"""Append a observations Data object to the current ones (i.e. along the 1st axis). Alias for
:meth:`.append_realization`.
Parameters
----------
data: Data
The data object of the observations to append. Must be compatible/broadcastable.
If the initial Data object is empty, simply copy the `data` object.
"""
try:
if isinstance(data, Data):
if self.data is not None:
self.data = np.concatenate((self.data, data.data), axis=1)
if self.metadata is not None:
self.metadata = np.concatenate((self.metadata, data.metadata), axis=1)
if self.timestamps is not None:
self.timestamps = np.concatenate((self.timestamps, data.timestamps), axis=1)
else:
self.data = data.data.copy()
self.metadata = data.metadata.copy()
self.timestamps = data.timestamps.copy()
except:
warnings.warn('Unable to append the provided observations.', UserWarning)
[docs] def append_members(self, data):
"""Append a members Data object to the current ones (i.e. along the 2nd axis).
Parameters
----------
data: Data
The data object of the members to append. Must be compatible/broadcastable.
If the initial Data object is empty, simply copy the `data` object.
"""
try:
if isinstance(data, Data):
if self.data is not None:
self.data = np.concatenate((self.data, data.data), axis=2)
else:
self.data = data.data.copy()
self.metadata = data.metadata.copy()
self.timestamps = data.timestamps.copy()
except:
warnings.warn('Unable to append the provided members.', UserWarning)
def append_variable(self, data):
warnings.warn('Method not yet implemented.', UserWarning)
# if isinstance(data, np.ndarray):
# if self.data is None:
# self.data = data
# else:
# self.data = np.concatenate((self.data, data), axis=3)
# elif isinstance(data, Data):
# if self.data is None:
# if data.data is not None:
# self.data = data.data
# else:
# if data.data is not None:
# self.data = np.concatenate((self.data, data.data), axis=3)
@property
def ensemble_mean_observational_self_covariance(self):
"""numpy.ndarray: Ensemble mean observational covariance matrix:
:math:`{\\rm Cov}^{\\rm obs}_{p_1, p_2, v} [\\mu^{\\rm ens}, \\mu^{\\rm ens}] (t)= \\left\\langle \\bar{\\mu}^{\\rm ens}_{p_1,n,v} (t) \, \\bar{\\mu}^{\\rm ens}_{p_2,n,v} (t) \\right\\rangle_n`
where :math:`\\bar{\\mu}^{\\rm ens}_{p,n,v} (t) = \\mu^{\\rm ens}_{p,n,v}(t) - \\langle \\mu^{\\rm ens}_{p,n',v}(t) \\rangle_{n'}` and
where :math:`\\mu^{\\rm ens}_{p,n,v}(t)` is the :attr:`ensemble_mean`.
"""
em = self.ensemble_mean.centered_observation.data
res = np.nanmean(em[:, np.newaxis, ...] * em[np.newaxis, ...], axis=2)[:, :, np.newaxis, ...]
return res
[docs] def ensemble_mean_observational_covariance(self, other):
"""Observational covariance matrix of the ensemble mean with another Data object :math:`\\mathcal{O}`:
:math:`{\\rm Cov}^{\\rm obs}_{p_1, p_2, v} [\\bar{\\mathcal{O}}^{\\rm obs}, \\mu^{\\rm ens}] (t)= \\left\\langle \\left\\langle\\bar{\\mathcal{O}}^{\\rm obs}_{p_1,n,m,v} (t)\\right\\rangle_m \, \, \\bar{\\mu}^{\\rm ens}_{p_2,n,v} (t) \\right\\rangle_n`
where :math:`\\bar{\\mu}^{\\rm ens}_{p,n,v} (t) = \\mu^{\\rm ens}_{p,n,v}(t) - \\langle \\mu^{\\rm ens}_{p,n',v}(t) \\rangle_{n'}` and
where :math:`\\mu^{\\rm ens}_{p,n,v}(t)` is the :attr:`ensemble_mean`. :math:`\\bar{\\mathcal{O}}^{\\rm obs}_{p,n,m,v} (t)` is the :attr:`centered_observation` of the other Data object.
Parameters
----------
other: Data
Another Data object with observations in it.
Returns
-------
Data
The variance vector.
"""
em = self.ensemble_mean.centered_observation.data
om = other.ensemble_mean.centered_observation.data
return np.nanmean(om[:, np.newaxis, ...] * em[np.newaxis, ...], axis=2)[:, :, np.newaxis, ...]
[docs] def load_from_file(self, filename, **kwargs):
"""Function to load previously saved data with the method :meth:`save_to_file`.
Parameters
----------
filename: str
The file name where the Data object was saved.
kwargs: dict
Keyword arguments to pass to the pickle module method.
"""
with open(filename, 'rb') as f:
tmp_dict = pickle.load(f, **kwargs)
self.__dict__.clear()
self.__dict__.update(tmp_dict)
[docs] def save_to_file(self, filename, **kwargs):
"""Function to save the data to a file with the :mod:`pickle` module.
Parameters
----------
filename: str
The file name where to save the Data object.
kwargs: dict
Keyword arguments to pass to the pickle module method.
"""
with open(filename, 'wb') as f:
pickle.dump(self.__dict__, f, **kwargs)
def _get_timestamps(self, predictor):
timestamps = list()
for n in range(self.number_of_observations):
if self.timestamps[predictor, n] is not None:
timestamps.append(self.timestamps[predictor, n])
else:
timestamps.append(np.array(range(self.number_of_time_steps)))
return timestamps
def _determine_axis(self, axis):
if isinstance(axis, str):
if axis in self._axis_name_dict.keys():
axis = self._axis_name_dict[axis]
return axis
else:
return None
elif isinstance(axis, int):
return axis
elif isinstance(axis, (tuple, list)):
a = list()
for axe in axis:
if isinstance(axe, str):
if axe in self._axis_name_dict.keys():
a.append(self._axis_name_dict[axe])
else:
return None
elif isinstance(axe, int):
a.append(axe)
else:
return None
return a
else:
return None
@staticmethod
def _determine_data_type(data):
if data is None:
return "none"
data_type = ""
if pd is not None:
if isinstance(data, pd.DataFrame):
data_type = "pandas"
if isinstance(data, np.ndarray):
data_type = "numpy"
if isinstance(data, (list, tuple)):
if pd is not None:
if isinstance(data[0], pd.DataFrame):
data_type = "pandas_list"
if isinstance(data[0], np.ndarray):
data_type = "numpy_list"
if data_type:
return data_type
else:
return "unknown"
@staticmethod
def _determine_columns(columns):
column_type = ""
if isinstance(columns, int):
column_type = "number"
elif isinstance(columns, slice):
column_type = "number"
elif isinstance(columns, str):
if columns == "all":
column_type = "all"
else:
column_type = "label"
elif isinstance(columns, (list, tuple)):
if len(columns) > 0:
if isinstance(columns[0], int):
column_type = "number"
elif isinstance(columns[0], bool):
column_type = "label"
elif isinstance(columns[0], str):
column_type = "label"
else:
column_type = "unknown"
else:
column_type = "unknown"
else:
column_type = "unknown"
return column_type, columns
[docs] def full_like(self, value, **kwargs):
"""Like :func:`numpy.full_like`, returns a full :class:`Data` object with the same :attr:`index_shape` and
:attr:`shape` and type as the initial one.
Parameters
----------
value:
The fill value to use.
kwargs: dict
The argument to pass to :func:`numpy.full_like`.
Returns
-------
Data:
The full :class:`Data` object
"""
data = np.full_like(self.data, value, self._dtype, **kwargs)
return Data(data, self.metadata, self.timestamps, self._dtype)
[docs] def zeros_like(self, **kwargs):
"""Like :func:`numpy.zeros_like`, returns a :class:`Data` object with the same :attr:`index_shape` and
:attr:`shape` and type as the initial one, but filled with zeros.
Parameters
----------
kwargs: dict
The argument to pass to :func:`numpy.zeros_like`.
Returns
-------
Data:
The zeros :class:`Data` object
"""
data = np.zeros_like(self.data, self._dtype, **kwargs)
return Data(data, self.metadata, self.timestamps, self._dtype)