MPAS-JEDI
stat_utils.py
Go to the documentation of this file.
1 #!/usr/bin/env python3
2 
3 from collections import defaultdict
4 import logging
5 from netCDF4 import Dataset
6 import numpy as np
7 import os
8 import pandas as pd
9 
# module-level logger; handlers/level are configured by the importing application
_logger = logging.getLogger(__name__)

#==========================================================
# utilities for statistics, aggregation, and bootstrapping
#==========================================================

# single-precision for floating point storage
# (rKindStore is the in-memory dtype; rKindNC is the matching netCDF type code)
rKindStore = np.float32
rKindNC = 'f4'

# double-precision for floating point storage
# (alternative configuration; both lines must be switched together so the
#  in-memory dtype and netCDF type code stay consistent)
#rKindStore = np.float64
#rKindNC = 'f8'

# double-precision for floating point calculation
# (intermediate arithmetic uses higher precision than storage to limit
#  accumulated round-off error during aggregation)
rKindCompute = np.float64


#============
# statistics
#============

# ddof - Delta Degrees of Freedom used in standard deviation calculations
# Note: must be identical across experiments for the two stages:
# + statistics generation
# + plot generation
ddof = int(0)

# Ordered list of statistics available in ASCII stats_* files...
# (1) that can be aggregated across subpopulations (see aggStatsDict)
aggregatableFileStats = ['Count','Mean','MS','RMS','STD','Min','Max']

# currently every file statistic is aggregatable
allFileStats = aggregatableFileStats

# dtype of each statistic when stored/read (storage precision)
statDtypes = {
  'Count': int,
  'Mean': rKindStore,
  'MS': rKindStore,
  'RMS': rKindStore,
  'STD': rKindStore,
  'Min': rKindStore,
  'Max': rKindStore,
}

# dtype of each statistic during aggregation arithmetic; Min/Max are never
# summed, so they do not need the extra precision
statDtypesCompute = {
  'Count': int,
  'Mean': rKindCompute,
  'MS': rKindCompute,
  'RMS': rKindCompute,
  'STD': rKindCompute,
  'Min': rKindStore,
  'Max': rKindStore,
}

# (2) that can be sampled with bootstrap
sampleableAggStats = ['Count','Mean','MS','RMS']

# string-valued descriptive columns stored alongside the statistics
fileStatAttributes = ['DiagSpaceGrp','varName','varUnits','diagName',
  'binMethod','binVar','binVal','binUnits']

# common prefix for per-DiagSpace statistics file names
statsFilePrefix = 'stats_'
71 
72 
###############################################################################
def calcStats(array_f):
  '''
  Calculate population statistics for a 1-d array, accounting for NaN values.

  INPUT:
    array_f[np.array(float)] - 1-d array of floating point values for which
                               statistics will be calculated

  RETURNS:
    STATS[dict] - population statistics; every statistic other than 'Count'
                  is NaN when the array contains no finite values
  '''
  # only finite (non-NaN, non-inf) values contribute to 'Count'
  STATS = {}
  STATS['Count'] = np.isfinite(array_f).sum()
  if STATS['Count'] > 0:
    STATS['Mean'] = np.nanmean(array_f)
    STATS['MS'] = np.nanmean(np.square(array_f))
    STATS['RMS'] = np.sqrt(STATS['MS'])
    # ddof is the module-level constant; must match between the statistics
    # generation and plot generation stages
    STATS['STD'] = np.nanstd(array_f, ddof=ddof)
    STATS['Min'] = np.nanmin(array_f)
    STATS['Max'] = np.nanmax(array_f)
  else:
    # empty population: all statistics except Count are undefined
    # (np.nan, not np.NaN: the np.NaN alias was removed in NumPy 2.0)
    for stat in allFileStats:
      if stat != 'Count':
        STATS[stat] = np.nan

  return STATS
98 
99 
###############################################################################
def write_stats_nc(statSpace, statsDict):
  '''
  Write one DiagSpace's statistics to a netCDF4 file named
  statsFilePrefix+statSpace+'.nc', replacing any pre-existing file.
  '''
  # TODO: This data is hierarchical. Compare
  # memory/storage requirements when using
  # Pandas dataframe and hdf5 statistics files
  # instead of dict of lists and nc4
  outFile = statsFilePrefix+statSpace+'.nc'
  if os.path.exists(outFile):
    os.remove(outFile)

  ncid = Dataset(outFile, 'w', format='NETCDF4')
  ncid.description = statSpace+" diagnostic statistics"

  # one row per (variable, diagnostic, bin) combination
  nrows = len(statsDict[fileStatAttributes[0]])
  ncid.createDimension('nrows', nrows)

  # string-valued descriptive columns
  for attribName in fileStatAttributes:
    ncid.createVariable(attribName, str, 'nrows')[:] = \
      np.array(statsDict[attribName], dtype=object)

  # floating-point statistic columns, stored at rKindNC precision
  for statName in allFileStats:
    ncid.createVariable(statName, rKindNC, 'nrows')[:] = statsDict[statName]

  ncid.close()
126 
127 
###############################################################################
def read_stats_nc(statsFile):
  '''
  Read a statistics file written by write_stats_nc and return its contents
  as a dict of 1-d arrays (attributes as str arrays, statistics per statDtypes).
  '''
  ncid = Dataset(statsFile, 'r')
  ncid.set_auto_mask(False)

  # string-valued descriptive columns
  statsDict = {
    attribName: np.asarray(ncid.variables[attribName][:])
    for attribName in fileStatAttributes
  }

  # statistic columns, converted to their configured storage dtypes
  for statName in allFileStats:
    statsDict[statName] = np.asarray(ncid.variables[statName][:], statDtypes[statName])

  ncid.close()

  return statsDict
144 
145 
146 #===================
147 # basic aggregation
148 #===================
149 
###############################################################################
def aggStatsDF(x):
  '''
  Aggregate a DataFrame containing aggregatableFileStats.

  INPUT:   x[pd.DataFrame] - subpopulation statistics
  RETURNS: y[pd.Series]    - aggregated statistics
  '''
  # converting to a dict of lists first speeds up the memory access
  aggregated = aggStatsDict(x.to_dict('list'))
  return pd.Series(aggregated, index=aggregatableFileStats)
160 
def aggStatsDict(x_): #, stats = aggregatableFileStats):
  '''
  Aggregate a dictionary containing aggregatableFileStats.

  INPUT:
    x_[dict] - subpopulation statistics; each aggregatableFileStats key maps
               to a list/array with one entry per subpopulation
  RETURNS:
    y[dict] - aggregated whole-population statistics (one scalar per key)
  '''
  counts = np.array(x_['Count'])
  # only subpopulations with valid, non-empty counts participate
  mask = np.logical_and(np.greater(counts, 0),
                        np.isfinite(counts))

  # convert lists to masked np.array to enable math functions
  x = {}
  for stat in aggregatableFileStats:
    x[stat] = np.array(x_[stat], dtype=statDtypesCompute[stat])[mask]

  # default every statistic to NaN
  # (np.nan, not np.NaN: the np.NaN alias was removed in NumPy 2.0)
  y = {stat: np.nan for stat in aggregatableFileStats}

  ## Count
  Count = int(np.nansum(x['Count']))
  y['Count'] = Count

  if Count > 0 and np.isfinite(Count):
    # Note: arrays are sorted in ascending order before summation to reduce
    # floating-point precision loss
    nCompute = x['Count'].astype(rKindCompute)

    ## Mean: count-weighted average of subpopulation means
    v1_ = np.sort(np.multiply(x['Mean'], nCompute))
    Mean = np.nansum(v1_) / rKindCompute(Count)
    y['Mean'] = rKindStore(Mean)

    ## MS: count-weighted average of subpopulation mean squares
    v1_ = np.sort(np.multiply(x['MS'], nCompute))
    ms = np.nansum(v1_) / rKindCompute(Count)
    y['MS'] = rKindStore(ms)

    ## RMS
    y['RMS'] = rKindStore(np.sqrt(ms))

    ## STD
    # Pooled variance Formula as described here:
    # https://en.wikipedia.org/wiki/Pooled_variance#Sample-based_statistics
    # https://stats.stackexchange.com/questions/43159/how-to-calculate-pooled-variance-of-two-or-more-groups-given-known-group-varianc
    v1_ = np.multiply(np.square(x['STD']), np.subtract(nCompute, ddof))
    v2_ = np.multiply(np.square(x['Mean']), nCompute)
    v12 = np.nansum(np.sort(np.append(v1_, v2_)))
    v3 = np.square(Mean) * rKindCompute(Count)
    # guard: round-off can make v12 slightly smaller than v3, which would
    # produce sqrt of a negative number; leave STD as NaN in that case
    if v12 >= v3:
      y['STD'] = rKindStore(np.sqrt((v12 - v3) / rKindCompute(Count - ddof)))

    ## Min
    y['Min'] = np.nanmin(x['Min'])

    ## Max
    y['Max'] = np.nanmax(x['Max'])

  return y
218 
219 
220 #============================================
221 # bootstrapping for confidence intervals (CI)
222 #============================================
223 
# dictionary keys for the three CI traits returned by the bootStrap* functions
cimean = 'VALUE' # central (median) bootstrap estimate
cimin = 'LO'     # lower confidence bound
cimax = 'HI'     # upper confidence bound
ciTraits = [cimean, cimin, cimax]
228 
################################################################################
def identityFunc(x):
  '''Return the argument unchanged (no-op vecFunc for bootStrapVectorFunc).'''
  return x
232 
233 
################################################################################
def rmsFunc(x, axis=0):
  '''Root-mean-square of x along the given axis, ignoring NaN values.'''
  meanSquare = np.nanmean(np.square(x), axis=axis)
  return np.sqrt(meanSquare)
237 
238 
################################################################################
def bootStrapVector(X, alpha=0.05, n_samples=8000, weights=None):
  '''
  Compute bootstrap confidence intervals (CI) on a vector of data.

  INPUTS:
    X[np.array] - values to be bootstrapped. For linear quantities (e.g., Mean, MS),
      these can also be subpopulation quantities, so long as the weights are set
      accordingly. See bootStrapClusterFunc for an example.
    alpha[float] - confidence interval (CI) percentile (e.g., 0.05 for 95%), optional
    n_samples[int or list(int) or np.ndarray(int)] - number(s) of bootstrap samples, optional
    weights[np.array] - same length as X, adjusts re-sampling rates for each value, optional

  RETURNS:
    ciVals - dict containing lists of mean, CI min, and CI max
             (one list entry per requested sample size)
  '''
  # normalize n_samples to a list of sample counts
  # (the original `type(n_samples) is np.array` test could never be True,
  #  because np.array is a function, not a type)
  if isinstance(n_samples, list):
    nsSamples = n_samples
  elif isinstance(n_samples, np.ndarray):
    nsSamples = list(n_samples)
  else:
    nsSamples = [n_samples]
  max_samples = np.max(nsSamples)

  ## number of "data points" must be > 0
  # could be aggregated over a time series, space, or any other binning characteristic
  Ndata = len(X)

  # resample with replacement (optionally weighted), one column per replicate
  if weights is None:
    iResample = np.random.choice(Ndata, (Ndata, max_samples))
  else:
    iResample = np.random.choice(Ndata, (Ndata, max_samples), p = weights)

  XResample = X[iResample]
  Expect = np.nanmean(XResample, axis=0)

  ciVals = defaultdict(list)

  for nSamples in nsSamples:
    # NaN replicates sort to the end and are excluded via nonNaNSamples
    sampleVals = np.sort(Expect[0:nSamples])
    nonNaNSamples = np.isfinite(sampleVals).sum()

    iMid = int(np.around(0.5 * rKindCompute(nonNaNSamples)))
    iLeft = int(np.around(0.5 * alpha * rKindCompute(nonNaNSamples)))
    iRight = int(np.around((1 - 0.5 * alpha) * rKindCompute(nonNaNSamples)))

    # clamp to the valid index range: rounding can reach nonNaNSamples when
    # the sample count is small (< 1/alpha), which previously raised IndexError
    iMax = max(nonNaNSamples - 1, 0)
    iMid = min(iMid, iMax)
    iRight = min(iRight, iMax)

    ciVals[cimean].append(rKindStore(sampleVals[iMid]))
    ciVals[cimin].append(rKindStore(sampleVals[iLeft]))
    ciVals[cimax].append(rKindStore(sampleVals[iRight]))

  return ciVals
290 
291 
################################################################################
def bootStrapVectorRMSFunc(X, Y, statFunc=np.subtract,
                           alpha=0.05, n_samples=8000):
  '''
  Compute bootstrap confidence intervals on statFunc(RMS(X), RMS(Y)).

  INPUTS:
    X, Y - intrinsic values for the whole population (same length)
    statFunc - function f(RMS(X), RMS(Y)), optional, default is np.subtract
    alpha[float] - confidence interval (CI) percentile (e.g., 0.05 for 95%), optional
    n_samples[int or list(int) or np.ndarray(int)] - number(s) of bootstrap samples, optional

  RETURNS:
    ciVals - dict containing lists of mean, CI min, and CI max
  '''
  # normalize n_samples to a list (isinstance instead of the dead
  # `type(n_samples) is np.array` check; np.array is a function, not a type)
  if isinstance(n_samples, list):
    nsSamples = n_samples
  elif isinstance(n_samples, np.ndarray):
    nsSamples = list(n_samples)
  else:
    nsSamples = [n_samples]
  max_samples = np.max(nsSamples)

  ## number of "data points" must be > 0
  # could be aggregated over a time series, space, or any other binning characteristic
  Ndata = len(X)

  # identical resampling indices are applied to X and Y to preserve pairing
  iResample = np.random.choice(Ndata, (Ndata, max_samples))

  XResample = rmsFunc(X[iResample], axis=0)
  YResample = rmsFunc(Y[iResample], axis=0)

  Expect = statFunc(XResample, YResample)

  ciVals = defaultdict(list)

  for nSamples in nsSamples:
    sampleVals = np.sort(Expect[0:nSamples])
    nonNaNSamples = np.isfinite(sampleVals).sum()

    iMid = int(np.around(0.5 * rKindCompute(nonNaNSamples)))
    iLeft = int(np.around(0.5 * alpha * rKindCompute(nonNaNSamples)))
    iRight = int(np.around((1 - 0.5 * alpha) * rKindCompute(nonNaNSamples)))

    # clamp: rounding can reach nonNaNSamples when the sample count is
    # small (< 1/alpha), which previously raised IndexError
    iMax = max(nonNaNSamples - 1, 0)
    iMid = min(iMid, iMax)
    iRight = min(iRight, iMax)

    ciVals[cimean].append(rKindStore(sampleVals[iMid]))
    ciVals[cimin].append(rKindStore(sampleVals[iLeft]))
    ciVals[cimax].append(rKindStore(sampleVals[iRight]))

  return ciVals
341 
342 
################################################################################
def bootStrapAggRMSFunc(X, Y, Ns, statFunc=np.subtract,
                        alpha=0.05, n_samples=8000):
  '''
  Compute bootstrap confidence intervals on a function of aggregated RMS
  of two arrays of subpopulation RMSs.

  INPUTS:
    X[np.array] - one set of RMS for multiple subpopulations
    Y[np.array] - second set of RMS for the same subpopulations
    Ns[np.array] - counts of data associated for the same subpopulations
    statFunc - function f(agg(X), agg(Y)), optional, default is np.subtract
    alpha[float] - confidence interval (CI) percentile (e.g., 0.05 for 95%), optional
    n_samples[int or list(int) or np.ndarray(int)] - number(s) of bootstrap samples, optional

  RETURNS:
    ciVals - dict containing lists of mean, CI min, and CI max
  '''
  # normalize n_samples to a list (isinstance instead of the dead
  # `type(n_samples) is np.array` check; np.array is a function, not a type)
  if isinstance(n_samples, list):
    nsSamples = n_samples
  elif isinstance(n_samples, np.ndarray):
    nsSamples = list(n_samples)
  else:
    nsSamples = [n_samples]
  max_samples = np.max(nsSamples)

  # remove invalid clusters
  mask = np.logical_and(np.isfinite(X), np.isfinite(Y))
  mask = np.logical_and(mask, Ns > 0.)
  if mask.sum() == 0:
    _logger.error("\n\nERROR: no valid subpopulation data")
    # NOTE(review): hard process exit, consistent with the file's other
    # fatal-error handling; consider raising an exception instead
    os._exit(1)

  X_ = np.asarray(X[mask], dtype=rKindCompute)
  Y_ = np.asarray(Y[mask], dtype=rKindCompute)
  Ns_ = np.asarray(Ns[mask], dtype=rKindCompute)

  # number of "data points" (clusters); must be > 0 after masking
  Ndata = len(X_)

  iResample = np.random.choice(Ndata, (Ndata, max_samples))

  # count-weighted aggregate RMS for each bootstrap replicate
  NsResample = Ns_[iResample]
  XResample = np.nansum(np.multiply(np.square(X_[iResample]), NsResample), axis=0)
  YResample = np.nansum(np.multiply(np.square(Y_[iResample]), NsResample), axis=0)

  NaggResample = np.nansum(NsResample, axis=0)
  XaggResample = np.sqrt(np.divide(XResample, NaggResample))
  YaggResample = np.sqrt(np.divide(YResample, NaggResample))

  Expect = statFunc(XaggResample, YaggResample)

  ciVals = defaultdict(list)

  for nSamples in nsSamples:
    sampleVals = np.sort(Expect[0:nSamples])
    nonNaNSamples = np.isfinite(sampleVals).sum()

    iMid = int(np.around(0.5 * rKindCompute(nonNaNSamples)))
    iLeft = int(np.around(0.5 * alpha * rKindCompute(nonNaNSamples)))
    iRight = int(np.around((1 - 0.5 * alpha) * rKindCompute(nonNaNSamples)))

    # clamp: rounding can reach nonNaNSamples when the sample count is
    # small (< 1/alpha), which previously raised IndexError
    iMax = max(nonNaNSamples - 1, 0)
    iMid = min(iMid, iMax)
    iRight = min(iRight, iMax)

    ciVals[cimean].append(rKindStore(sampleVals[iMid]))
    ciVals[cimin].append(rKindStore(sampleVals[iLeft]))
    ciVals[cimax].append(rKindStore(sampleVals[iRight]))

  return ciVals
409 
410 
################################################################################
def bootStrapVectorFunc(X, Y, alpha=0.05,
                        n_samples=5000,
                        vecFuncs=[identityFunc],
                        bootFuncs=[bootStrapVector],
                        statFunc=np.subtract):
  '''
  Compute bootstrap confidence intervals (bootFunc) on
  E[statFunc(vecFunc(X), vecFunc(Y))] using two vectors of data, X and Y,
  which are unique realizations of the same whole population of data.

  INPUTS:
    X[np.array] - one set of values for the whole population
    Y[np.array] - second set of values for the whole population
    alpha[float] - confidence interval (CI) percentile (e.g., 0.05 for 95%), optional
    n_samples[int or list(int) or np.ndarray(int)] - number(s) of bootstrap samples, optional
    vecFuncs[list(function)] - functions applied independently to X and Y, e.g., np.square, optional
    bootFuncs[list(function)] - bootstrap driver for each vecFuncs member (same length), optional
    statFunc[function] - function f(vecFunc(X), vecFunc(Y)), optional, default is np.subtract

  RETURNS:
    funcCIVals - dict (keyed by vecFunc position) of median, low CI, and high CI
                 of bootstrapped stats

  Note: the default vecFuncs/bootFuncs lists are read-only here, so the
  mutable-default-argument pitfall does not apply.
  '''
  # normalize n_samples to a list (isinstance instead of the dead
  # `type(n_samples) is np.array` check; np.array is a function, not a type)
  if isinstance(n_samples, list):
    nsSamples = n_samples
  elif isinstance(n_samples, np.ndarray):
    nsSamples = list(n_samples)
  else:
    nsSamples = [n_samples]

  # restrict to locations where both realizations are finite
  mask = np.logical_and(np.isfinite(X), np.isfinite(Y))
  X_ = X[mask]
  Y_ = Y[mask]

  funcCIVals = {}
  for ifunc, func in enumerate(vecFuncs):
    bootFunc = bootFuncs[ifunc]
    # alpha is now forwarded to the bootstrap drivers; previously it was
    # accepted but silently ignored
    if bootFunc is bootStrapVector:
      ciVals = bootFunc(
        statFunc(func(X_), func(Y_)),
        alpha=alpha,
        n_samples=nsSamples)
    elif bootFunc is bootStrapVectorRMSFunc:
      ciVals = bootFunc(
        X_, Y_, statFunc,
        alpha=alpha,
        n_samples=nsSamples)
    else:
      # previously fell through and raised NameError on ciVals
      raise ValueError('bootStrapVectorFunc: unsupported bootFunc')

    funcCIVals[ifunc] = {}
    for trait in ciTraits:
      funcCIVals[ifunc][trait] = ciVals[trait]

  return funcCIVals
462 
463 
################################################################################
def bootStrapClusterFunc(X, Y, alpha=0.05,
                         n_samples=5000,
                         statFunc=np.subtract,
                         statNames=sampleableAggStats):
  '''
  Compute bootstrap confidence intervals on E[statFunc(X, Y)], where X and Y
  contain statistics from subgroups of a whole population.

  INPUTS:
    X[pd.DataFrame] - one set of aggregatableFileStats for all subgroups
    Y[pd.DataFrame] - second set of aggregatableFileStats for same subgroups
    alpha[float] - confidence interval (CI) percentile (e.g., 0.05 for 95%), optional
    n_samples[int or list(int) or np.ndarray(int)] - number(s) of bootstrap samples, optional
    statFunc[function] - function f(agg(X), agg(Y)), optional, default is np.subtract
    statNames[list(str)] - the statistics for which CI's will be produced

  RETURNS:
    statCIVals - nested dict containing median, low CI, and high CI
                 of all statNames
  '''
  ## initialize output with NaN so skipped/invalid stats are well-defined
  ## (np.nan, not np.NaN: the np.NaN alias was removed in NumPy 2.0)
  statCIVals = {}
  for stat in statNames:
    statCIVals[stat] = {}
    for trait in ciTraits:
      statCIVals[stat][trait] = [np.nan]

  ## number of "data points" must be > 0
  # could be aggregated over a time series, space, or any other binning characteristic
  nClust = len(X)
  if nClust > 0:

    # normalize n_samples to a list (isinstance instead of the dead
    # `type(n_samples) is np.array` check; np.array is a function, not a type)
    if isinstance(n_samples, list):
      nsSamples = n_samples
    elif isinstance(n_samples, np.ndarray):
      nsSamples = list(n_samples)
    else:
      nsSamples = [n_samples]

    # cluster counts are the same for every statistic; read them once
    NsAll = X.loc[:,'Count'].to_numpy(dtype=rKindCompute)

    for stat in statNames:
      if stat == 'Count': continue
      X_ = X.loc[:,stat].to_numpy(dtype=rKindCompute)
      Y_ = Y.loc[:,stat].to_numpy(dtype=rKindCompute)

      # remove zero-size and non-finite clusters
      mask = np.logical_and(np.isfinite(X_), np.isfinite(Y_))
      mask = np.logical_and(mask, NsAll > 0.)
      if mask.sum() == 0: continue
      Ns = NsAll[mask]
      X_ = X_[mask]
      Y_ = Y_[mask]

      # alpha is now forwarded to the bootstrap drivers; previously it was
      # accepted but silently ignored
      if stat == 'Mean' or stat == 'MS':
        # linear statistics: bootstrap the per-cluster differences,
        # re-sampling in proportion to cluster size
        weights = np.divide(Ns,np.nansum(Ns))
        ciVals = bootStrapVector(
          statFunc(X_,Y_),
          alpha=alpha,
          weights=weights,
          n_samples=nsSamples)
      elif stat == 'RMS':
        ciVals = bootStrapAggRMSFunc(
          X_, Y_, Ns, statFunc,
          alpha=alpha,
          n_samples=nsSamples)
      else:
        # lazy %-style logging: the original passed stat as an extra
        # positional arg to a message with no placeholder, so it was dropped
        _logger.error("\n\nERROR: stat not implemented: %s", stat)
        os._exit(1)

      for trait in ciTraits:
        statCIVals[stat][trait] = ciVals[trait]

  return statCIVals
def read_stats_nc(statsFile)
Definition: stat_utils.py:129
def aggStatsDict(x_)
Definition: stat_utils.py:161
def rmsFunc(x, axis=0)
Definition: stat_utils.py:235
def identityFunc(x)
Definition: stat_utils.py:230
def aggStatsDF(x)
Definition: stat_utils.py:151
def bootStrapVectorFunc(X, Y, alpha=0.05, n_samples=5000, vecFuncs=[identityFunc], bootFuncs=[bootStrapVector], statFunc=np.subtract)
Definition: stat_utils.py:416
def bootStrapClusterFunc(X, Y, alpha=0.05, n_samples=5000, statFunc=np.subtract, statNames=sampleableAggStats)
Definition: stat_utils.py:468
def bootStrapVector(X, alpha=0.05, n_samples=8000, weights=None)
Definition: stat_utils.py:240
def calcStats(array_f)
Definition: stat_utils.py:74
def bootStrapAggRMSFunc(X, Y, Ns, statFunc=np.subtract, alpha=0.05, n_samples=8000)
Definition: stat_utils.py:345
def write_stats_nc(statSpace, statsDict)
Definition: stat_utils.py:101
def bootStrapVectorRMSFunc(X, Y, statFunc=np.subtract, alpha=0.05, n_samples=8000)
Definition: stat_utils.py:294