IODA Bundle
subset_files.py
Go to the documentation of this file.
1 #!/usr/bin/env python3
2 import argparse
3 from multiprocessing import Pool
4 import glob
5 import numpy as np
6 import random
7 import sys
8 import netCDF4 as nc
9 import os
10 
11 
12 def subset(infile, nlocsout, suffix, geofile, diagfile):
13  print('Processing:', infile)
14  outfile = infile[:-4]+suffix
15  ncin = nc.Dataset(infile)
16  ncout = nc.Dataset(outfile, 'w')
17  # I think this has to be called before each call to random.sample to make it the same random set
18  random.seed(5)
19  # Get number of records (nrecsin)
20  recnum = ncin.variables['record_number@MetaData'][:]
21  recnum_uniq = np.unique(recnum)
22  nrecsin = len(recnum_uniq)
23  nlocsin = len(ncin.dimensions['nlocs'])
24  nvars = len(ncin.dimensions['nvars'])
25  if nrecsin > 100.:
26  nsamples = int(nlocsout/10.) # just picked 10 randomly
27  nsamples = max(1, nsamples)
28  npossible = nrecsin
29  recs = True
30  else:
31  nsamples = nlocsout
32  npossible = nlocsin
33  recs = False
34  if nlocsout > nlocsin:
35  nsamples = npossible
36  flag = random.sample(list(np.arange(0, npossible)), nsamples)
37  flag = sorted(flag)
38  # get the nlocs where flag is true so consistent for geovals too
39  if recs:
40  nrecsout = len(flag)
41  flag2 = flag
42  flag = np.isin(ncin.variables['record_number@MetaData'][:], flag)
43  nlocsout = len(ncin.variables['record_number@MetaData'][flag, ...])
44  else:
45  flag2 = [0]
46  nrecsout = 1
47  nlocsout = len(flag)
48  # process observation file
49  # copy global attributes
50  for aname in ncin.ncattrs():
51  avalue = ncin.getncattr(aname)
52  ncout.setncattr(aname, avalue)
53  # redo nlocs
54  ncout.setncattr("nlocs", np.int32(nlocsout))
55  # copy dimensions
56  for dim in ncin.dimensions.values():
57  if dim.name == 'nlocs':
58  d_size = nlocsout
59  else:
60  d_size = len(dim)
61  ncout.createDimension(dim.name, d_size)
62  # copy variables
63  for var in ncin.variables.values():
64  vname = var.name
65  vdata = ncin.variables[vname]
66  var_out = ncout.createVariable(vname, var.dtype, var.dimensions)
67  if (var.dimensions[0] == 'nlocs'):
68  var_out[...] = vdata[flag, ...]
69  else:
70  var_out[:] = vdata[:]
71  # variable attributes
72  for aname in var.ncattrs():
73  avalue = var.getncattr(aname)
74  var_out.setncattr_string(aname, avalue)
75  ncin.close()
76  ncout.close()
77 
78  print('wrote obs to:', outfile)
79  # now process geoval file if necessary
80  if geofile:
81  outfile = geofile[:-4]+suffix
82  ncin = nc.Dataset(geofile)
83  ncout = nc.Dataset(outfile, 'w')
84  # attributes
85  for aname in ncin.ncattrs():
86  avalue = ncin.getncattr(aname)
87  ncout.setncattr(aname, avalue)
88  # redo nlocs
89  ncout.setncattr("nlocs", np.int32(nlocsout))
90  # copy dimensions
91  for dim in ncin.dimensions.values():
92  if dim.name == 'nlocs':
93  d_size = nlocsout
94  else:
95  d_size = len(dim)
96  ncout.createDimension(dim.name, d_size)
97  # copy variables
98  for var in ncin.variables.values():
99  vname = var.name
100  vdata = ncin.variables[vname]
101  var_out = ncout.createVariable(vname, var.dtype, var.dimensions)
102  if (var.dimensions[0] == 'nlocs'):
103  var_out[...] = vdata[flag, ...]
104  else:
105  var_out[:] = vdata[:]
106  ncin.close()
107  ncout.close()
108 
109  print('wrote geovals to:', outfile)
110 
111  # now process obsdiag file if necessary
112  if diagfile:
113  outfile = diagfile[:-4]+suffix
114  ncin = nc.Dataset(diagfile)
115  ncout = nc.Dataset(outfile, 'w')
116  # attributes
117  for aname in ncin.ncattrs():
118  avalue = ncin.getncattr(aname)
119  ncout.setncattr(aname, avalue)
120  # redo nlocs
121  ncout.setncattr("nlocs", np.int32(nlocsout))
122  # copy dimensions
123  for dim in ncin.dimensions.values():
124  if dim.name == 'nlocs':
125  d_size = nlocsout
126  else:
127  d_size = len(dim)
128  ncout.createDimension(dim.name, d_size)
129  # copy variables
130  for var in ncin.variables.values():
131  vname = var.name
132  vdata = ncin.variables[vname]
133  var_out = ncout.createVariable(vname, var.dtype, var.dimensions)
134  if (var.dimensions[0] == 'nlocs'):
135  var_out[...] = vdata[flag, ...]
136  else:
137  var_out[:] = vdata[:]
138  ncin.close()
139  ncout.close()
140 
141  print('wrote obsdiag to:', outfile)
142 
143 # main script ##############
144 
145 
146 # parse command line
147 ap = argparse.ArgumentParser()
148 ap.add_argument("-m", "--medium", action='store_true',
149  help="Subset to 5 records or 100 obs")
150 ap.add_argument("-s", "--single", action='store_true',
151  help="Output single observation or record")
152 ap.add_argument("-g", "--geovals", help="Path to geoval directory")
153 ap.add_argument("-d", "--obsdiag", help="Path to obsdiag directory")
154 ap.add_argument("filedir", help="Path to obs files to process")
155 ap.add_argument("-n", "--nprocs",
156  help="Number of tasks/processors for multiprocessing")
157 
158 MyArgs = ap.parse_args()
159 
160 if MyArgs.nprocs:
161  nprocs = int(MyArgs.nprocs)
162 else:
163  nprocs = 1
164 
165 if MyArgs.medium:
166  nobs = 100
167  suffix = '_m.nc4'
168 elif MyArgs.single:
169  nobs = 1
170  suffix = '_s.nc4'
171 else:
172  print('need either -m or -s, exiting...')
173  sys.exit(1)
174 
175 InDir = MyArgs.filedir
176 
177 infiles = glob.glob(InDir+'/*.nc4')
178 obspool = Pool(processes=nprocs)
179 
180 for infile in infiles:
181  if os.path.getsize(infile) < 10000:
182  continue
183  if infile[-6:] in ['_m.nc4', '_s.nc4']:
184  # print('skipping',infile)
185  continue
186  # get geofile
187  geofile = False
188  if MyArgs.geovals:
189  inob = infile.split('/')[-1]
190  ingeo = inob.replace('obs', 'geoval')
191  geofile = MyArgs.geovals+'/'+ingeo
192  # get diagfile
193  diagfile = False
194  if MyArgs.obsdiag:
195  inob = infile.split('/')[-1]
196  indiag = inob.replace('obs', 'obsdiag')
197  diagfile = MyArgs.obsdiag+'/'+indiag
198  res = obspool.apply_async(subset, args=(infile, nobs, suffix, geofile, diagfile))
199 obspool.close()
200 obspool.join()
def subset(infile, nlocsout, suffix, geofile, diagfile)
Definition: subset_files.py:12