MPAS-JEDI
bootstrap_demo.py
Go to the documentation of this file.
1 #!/usr/bin/env python3
2 
3 #import datetime as dt
4 #import getopt
5 from copy import deepcopy
6 import matplotlib
7 import matplotlib.pyplot as plt
8 #from matplotlib.dates import DateFormatter, AutoDateLocator
9 import numpy as np
10 import os
11 import pandas as pd
12 #import sys
13 
14 import plot_utils as pu
15 import stat_utils as su
16 
17 
19 
20  NOMINAL_GROUP_SIZE = 500
21 
22  NOMINAL_NUMBER_OF_GROUPS = 60
23 
24 
25  N_SUPER_SAMPLE = NOMINAL_GROUP_SIZE * NOMINAL_NUMBER_OF_GROUPS
26 
27  # number of bootstrap samples
28  nBootSamples = np.array([ np.around(10.0 ** x).astype(int) \
29  for x in list(np.arange(1.5,4.5,0.25))])
30 
31 # nBootSamples = np.array([ np.around(10.0 ** x).astype(int) \
32 # for x in list(np.arange(1.5,2.75,0.25))])
33 
34 
35  # general settings for plotting
36  fullColor = pu.plotColors[1]
37  clustColor = pu.plotColors[2]
38 
39  # establish f(g(X),g(Y)) for full population sampling
40  #g(x)
41  vecFuncs = [su.identityFunc, np.square, su.rmsFunc]
42  bootFuncs = [su.bootStrapVector,su.bootStrapVector,su.bootStrapVectorRMSFunc]
43  # vecFuncs = [np.square]
44  nFuncs = len(vecFuncs)
45 
46  #f(x)
47  statFunc = np.subtract
48 
49  # draw data from N(XMEAN, XVAR):
50  randSample1 = np.random.randn(N_SUPER_SAMPLE)
51  X1MEAN = 2.0 ; X1STD = 1.5
52  X1_SAMPLE = X1MEAN + X1STD*randSample1
53 
54  randSample2 = np.random.randn(N_SUPER_SAMPLE)
55  X2MEAN = 1.85 ; X2STD = 1.65
56  X2_SAMPLE = X2MEAN + X2STD*randSample2
57 
58  print("\n\nX1_SAMPLE stats:")
59  print(su.calcStats(X1_SAMPLE))
60 
61  print("\n\nX2_SAMPLE stats:")
62  print(su.calcStats(X2_SAMPLE))
63 
64  print("\n\nstatFunc(vecFunc(X2),vecFunc(X1)) stats:")
65  sampleStats = []
66  for ifunc, vecFunc in enumerate(vecFuncs):
67  if vecFunc is su.rmsFunc:
68  sampleStats.append({'Mean': statFunc(vecFunc(X2_SAMPLE),vecFunc(X1_SAMPLE))})
69 
70  else:
71  sampleStats.append(su.calcStats(statFunc(vecFunc(X2_SAMPLE),vecFunc(X1_SAMPLE))))
72 
73  print(sampleStats)
74 
75  # Bootstrap on mean of point-by-point statistics for the full population
76  # MEAN(statFunc(vecFunc(X2),vecFunc(X1)))
77 
78  # Bootstrap on full population
79  statsCIFullSamples = su.bootStrapVectorFunc( \
80  X2_SAMPLE, X1_SAMPLE,
81  n_samples = nBootSamples, \
82  vecFuncs = vecFuncs, \
83  bootFuncs = bootFuncs, \
84  statFunc = statFunc)
85 
86  # Loop over variations in cluster sizes
87 # SIGMANS = [5, 15, 50, 150, 400]
88  SIGMANS = np.around(np.multiply(NOMINAL_GROUP_SIZE, \
89  [0.005, 0.015, 0.050, 0.150, 0.400])).astype(int)
90 
91 
92  for SIGMAN in SIGMANS:
93 
94  #Create DataFrames containing cluster statistics for both experiments
95  # establish clusters
96  clustsN = []
97  clustsStart = []
98  clustsEnd = []
99 
100 ## alternative way to initialize cluster sizes
101 # clustsStart.append(0)
102 # nremain = N_SUPER_SAMPLE
103 # while nremain > 0:
104 # clustN = int(max([np.around(NOMINAL_GROUP_SIZE + np.random.randn(1)*SIGMAN), 1.0]))
105 # if np.sum(clustsN) + clustN > N_SUPER_SAMPLE - NOMINAL_GROUP_SIZE/4:
106 # clustN = N_SUPER_SAMPLE - np.sum(clustsN)
107 # clustsN.append( clustN )
108 # nremain = nremain - clustN
109 # ntotal = np.sum(clustsN)
110 # clustsEnd.append(ntotal)
111 # clustsStart.append(clustsEnd[-1])
112 #
113 
114  clustsN = np.multiply(NOMINAL_GROUP_SIZE,np.ones(NOMINAL_NUMBER_OF_GROUPS,dtype=int))
115  for i in list(range(0,int(NOMINAL_NUMBER_OF_GROUPS/2 - 1))):
116  delta = NOMINAL_GROUP_SIZE - \
117  int(max([np.around(NOMINAL_GROUP_SIZE + np.random.randn(1)*SIGMAN), 1.0]))
118 
119  delta = np.abs(delta)
120 
121  ### note: always causes non-negligible difference in CI
122  clustsN[i] = np.max([clustsN[i] + delta, 1])
123  clustsN[-(i+1)] = np.max([clustsN[-(i+1)] - delta, 1])
124 
125 
126  # clustsN[i] = np.max([clustsN[i] + delta*2, 1])
127  # clustsN[i+1] = np.max([clustsN[i+1] - delta, 1])
128  # clustsN[-(i+1)] = np.max([clustsN[-(i+1)] - delta, 1])
129 
130  clustsN = np.random.choice(clustsN, NOMINAL_NUMBER_OF_GROUPS, replace = False )
131 
132  clustsStart.append(0)
133 
134  for i in list(range(0,NOMINAL_NUMBER_OF_GROUPS)):
135  clustsEnd.append(np.sum(clustsN[0:i+1]))
136  clustsStart.append(clustsEnd[-1])
137 
138  print("\n\nclustsN = ",clustsN)
139 
140 
141  #X1_SAMPLE
142  X1ClustsValues = []
143  X1ClustsStats = {}
144  for stat in su.aggregatableFileStats:
145  X1ClustsStats[stat] = []
146 
147  for ic, clusts in enumerate(clustsN):
148  X1ClustsValues.append(X1_SAMPLE[clustsStart[ic]:clustsEnd[ic]])
149  clustStats = su.calcStats(X1ClustsValues[ic])
150  for stat in su.aggregatableFileStats:
151  X1ClustsStats[stat].append(clustStats[stat])
152 
153  X1ClustsStatsDF = pd.DataFrame.from_dict(X1ClustsStats)
154  X1ClustsStatsDF.sort_index()
155 
156  # print("\n\nX1 cluster stats:")
157  # print(X1ClustsStatsDF)
158 
159 
160  print("\n\nX1 aggregated stats:")
161  aggClustStats = su.aggStatsDF(X1ClustsStatsDF)
162  print(aggClustStats)
163 
164 
165  #X2_SAMPLE
166  X2ClustsValues = []
167  X2ClustsStats = {}
168  for stat in su.aggregatableFileStats:
169  X2ClustsStats[stat] = []
170 
171  for ic, clusts in enumerate(clustsN):
172  X2ClustsValues.append(X2_SAMPLE[clustsStart[ic]:clustsEnd[ic]])
173  clustStats = su.calcStats(X2ClustsValues[ic])
174  for stat in su.aggregatableFileStats:
175  X2ClustsStats[stat].append(clustStats[stat])
176 
177  X2ClustsStatsDF = pd.DataFrame.from_dict(X2ClustsStats)
178  X2ClustsStatsDF.sort_index()
179 
180 
181  # print("\n\nX2 cluster stats:")
182  # print(X2ClustsStatsDF)
183 
184 
185  print("\n\nX2 aggregated stats:")
186  aggClustStats = su.aggStatsDF(X2ClustsStatsDF)
187  print(aggClustStats)
188 
189 
190  print("\n\nPerforming bootstrap for sigmaN = ",SIGMAN)
191 
192  # Bootstrap on cluster subpopulations
193  statsCIClustSamples = su.bootStrapClusterFunc( \
194  X2ClustsStatsDF, X1ClustsStatsDF, \
195  n_samples = nBootSamples, \
196  statFunc = statFunc)
197 
198  #
199  # Results for point-by-point
200  #
201  xVals = nBootSamples
202  ny = nFuncs
203  nx = 2
204 
205  fig = pu.setup_fig(nx, ny, inch_width=2.2)
206 
207  for ifunc, vecFunc in enumerate(vecFuncs):
208  bootFunc = bootFuncs[ifunc]
209  if vecFunc is su.identityFunc:
210  sampleAggStat = "Mean"
211  elif vecFunc is np.square:
212  sampleAggStat = "MS"
213  elif vecFunc is su.rmsFunc:
214  sampleAggStat = "RMS"
215  else:
216  print("ERROR: vecFunc has no equivalent in aggregated cluster bootstrap")
217  os._exit(1)
218 
219 
220  ax1 = fig.add_subplot(ny, nx, nx*ifunc+1)
221 
222  plotVals = []
223 
224  #Full Sampling
225  lineVals = deepcopy(statsCIFullSamples[ifunc]['VALUE'])
226  lineValsMin = deepcopy(statsCIFullSamples[ifunc]['LO'])
227  lineValsMax = deepcopy(statsCIFullSamples[ifunc]['HI'])
228  plotVals.append(lineVals)
229  plotVals.append(lineValsMin)
230  plotVals.append(lineValsMax)
231 
232 
233  ax1.plot(xVals, lineVals, \
234  label='std bootstrap', \
235  color=fullColor, \
236  ls=pu.plotLineStyles[1], \
237  linewidth=0.7)
238 
239  ax1.plot(xVals, lineValsMin, \
240  label=None, \
241  color=fullColor, \
242  alpha=0.7, \
243  ls='-', \
244  linewidth=0.7)
245  ax1.plot(xVals, lineValsMax, \
246  label=None, \
247  color=fullColor, \
248  alpha=0.7, \
249  ls='-', \
250  linewidth=0.7)
251 
252  ax1.plot([xVals[0], xVals[-1]], np.multiply(lineValsMin[-1],[1., 1.]), \
253  ls="--", c=fullColor, \
254  alpha=0.4, \
255  linewidth=0.5,markersize=0)
256  ax1.plot([xVals[0], xVals[-1]], np.multiply(lineValsMax[-1],[1., 1.]), \
257  ls="--", c=fullColor, \
258  alpha=0.4, \
259  linewidth=0.5,markersize=0)
260 
261 
262  #Cluster Sampling
263  lineVals = deepcopy(statsCIClustSamples[sampleAggStat]['VALUE'])
264  lineValsMin = deepcopy(statsCIClustSamples[sampleAggStat]['LO'])
265  lineValsMax = deepcopy(statsCIClustSamples[sampleAggStat]['HI'])
266  plotVals.append(lineVals)
267  plotVals.append(lineValsMin)
268  plotVals.append(lineValsMax)
269 
270  ax1.plot(xVals, lineVals, \
271  label='cluster bootstrap', \
272  color=clustColor, \
273  ls=pu.plotLineStyles[1], \
274  linewidth=0.7)
275 
276  ax1.plot(xVals, lineValsMin, \
277  label=None, \
278  color=clustColor, \
279  alpha=0.6, \
280  ls='-', \
281  linewidth=0.7)
282  ax1.plot(xVals, lineValsMax, \
283  label=None, \
284  color=clustColor, \
285  alpha=0.6, \
286  ls='-', \
287  linewidth=0.7)
288 
289  ax1.plot([xVals[0], xVals[-1]], np.multiply(lineValsMin[-1],[1., 1.]), \
290  ls="--", c=clustColor, \
291  alpha=0.4, \
292  linewidth=0.5,markersize=0)
293  ax1.plot([xVals[0], xVals[-1]], np.multiply(lineValsMax[-1],[1., 1.]), \
294  ls="--", c=clustColor, \
295  alpha=0.4, \
296  linewidth=0.5,markersize=0)
297 
298 
299  # Population Statistics
300  ax1.plot([xVals[0], xVals[-1]], np.multiply(sampleStats[ifunc]['Mean'],[1., 1.]), \
301  ls="--", c=".2", \
302  linewidth=0.9,markersize=0,label='sample pop.')
303 
304  lh = ax1.legend(loc='best',fontsize=3,frameon=True,\
305  framealpha=0.4,ncol=2)
306  lh.get_frame().set_linewidth(0.0)
307 
308  ax1.set_ylabel(sampleAggStat+"_2 - "+sampleAggStat+"_1",fontsize=4)
309  ax1.set_xlabel('# bootstrap samples',fontsize=4)
310  ax1.set_xscale('log')
311  ax1.xaxis.set_tick_params(labelsize=3)
312  ax1.yaxis.set_tick_params(labelsize=3)
313 
314  minyval, maxyval = pu.get_clean_ax_limits(plotVals=plotVals,symmetric=False)
315  ax1.set_ylim(minyval,maxyval)
316  #ax1.grid(True)
317 
318 
319  ax2 = fig.add_subplot(ny, nx, nx*ifunc+2)
320  ax2.hist(clustsN,bins=int(1.2*np.floor(np.sqrt(NOMINAL_NUMBER_OF_GROUPS))))
321  ax2.set_ylabel("Count",fontsize=4)
322  ax2.set_xlabel('Cluster Size',fontsize=4)
323  ax2.grid(True)
324 
325  filename = 'cluster_bootstrap_test/comparison' \
326  +'_Nmean='+str(NOMINAL_GROUP_SIZE) \
327  +'_Nstd='+str(SIGMAN) \
328  +'_Nt='+str(N_SUPER_SAMPLE)
329 
330  pu.finalize_fig(fig, filename, 'png', True)
331 
332 
333 def main():
335 
336 if __name__ == '__main__': main()