IODA Bundle
combine_files.py
Go to the documentation of this file.
1 #!/usr/bin/env python3
2 # combine_files.py
3 # combine conventional obs and GOESIR (currently ahi_himawari8) from IODA format into
4 # one output file with matching corresponding locations
5 # and missing data where applicable
6 
7 import sys
8 import netCDF4 as nc
9 import numpy as np
10 import argparse
11 from collections import defaultdict, OrderedDict
12 import datetime as dt
13 from pathlib import Path
14 
15 IODA_CONV_PATH = Path(__file__).parent/"@SCRIPT_LIB_PATH@"
16 if not IODA_CONV_PATH.is_dir():
17  IODA_CONV_PATH = Path(__file__).parent/'..'/'lib-python'
18 sys.path.append(str(IODA_CONV_PATH.resolve()))
19 
20 import ioda_conv_ncio as iconv
21 from orddicts import DefaultOrderedDict
22 
23 vtypedict = {
24  'int32': 'integer',
25  '|S1': 'string',
26  'float32': 'float',
27 }
28 
29 
30 def concat_ioda(FileList, OutFile, GeoDir):
31  outdata = defaultdict(lambda: DefaultOrderedDict(OrderedDict))
32  loc_mdata = defaultdict(lambda: DefaultOrderedDict(OrderedDict))
33  var_mdata = defaultdict(lambda: DefaultOrderedDict(OrderedDict))
34  AttrData = {}
35  DataVarNames = []
36  DataVars = []
37  VarUnits = {}
38  DataVType = []
39  MetaVars = []
40  MetaVarNames = []
41  MetaVType = []
42  VarMetaVars = []
43  LocKeyList = []
44  MetaInAll = {}
45  # get lists of variables
46  for f in FileList:
47  ncf = nc.Dataset(f, mode='r')
48  for key, value in ncf.variables.items():
49  vs = key.split('@')
50  if vs[1] not in ['MetaData', 'VarMetaData', 'TestReference']:
51  if vs[0] not in DataVarNames:
52  DataVarNames.append(vs[0])
53  if vs[0] in DataVarNames:
54  if key not in DataVars:
55  DataVars.append(key)
56  vtype = ncf.variables[key].dtype
57  DataVType.append(vtype)
58  try:
59  units = ncf.variables[vs[0]+"@ObsValue"].getncattr("units")
60  VarUnits[vs[0]] = units
61  except AttributeError:
62  pass
63 
64  else:
65  if vs[1] == 'MetaData' and key not in MetaVars and vs[0] not in ['time', 'ObsIndex']:
66  MetaVars.append(key)
67  MetaVarNames.append(tuple(vs))
68  vtype = ncf.variables[key].dtype
69  MetaVType.append(vtype)
70  try:
71  units = ncf.variables[vs[0]+"@MetaData"].getncattr("units")
72  VarUnits[vs[0]] = units
73  except AttributeError:
74  pass
75  vtypestr = vtypedict[str(vtype)]
76  if vs[1] == 'datetime':
77  vtypestr = 'datetime'
78  LocKeyList.append((vs[0], vtypestr))
79  elif vs[1] == 'VarMetaData' and key not in VarMetaVars and vs[0] not in ['variable_names']:
80  VarMetaVars.append(key)
81  ncf.close()
82 
83  # determine the obstype. If the obstype is a GOESIR type, also get the sensor name and the satellite name
84  # an example is: inob="ahi_himawari8_obs_2018041500.nc4", obstype="ahi_himawari8", sensor="ahi"
85  # satellite="himawari8"
86  inob = f.split('/')[-1]
87  obstype = inob[:inob.rfind("obs")-1]
88  if obstype == "ahi_himawari8":
89  sensor = obstype.split('_')[0]
90  satellite = obstype.split('_')[-1]
91 
92  # determine which metadata is in all files
93  for v in MetaVars:
94  MetaInAll[v] = True
95  for f in FileList:
96  ncf = nc.Dataset(f, mode='r')
97  try:
98  a = np.array(ncf.variables[v])[0, ...]
99  except KeyError:
100  MetaInAll[v] = False
101  ncf.close()
102  # extract metadata and generate a numpy array
103  MetaVarData = []
104  bad_idxs = []
105  for v in MetaVars:
106  tmpvardata = []
107  if MetaInAll[v]:
108  for f in FileList:
109  ncf = nc.Dataset(f, mode='r')
110  tmpdata = np.array(ncf.variables[v])
111  tmpvardata.append(tmpdata)
112  ncf.close()
113  try:
114  ashape = tmpdata.shape[1]
115  tmpvardata = np.vstack(tmpvardata)
116  tmpvardata = np.array([b''.join(td) for td in tmpvardata])
117  except (IndexError, ValueError):
118  if len(tmpvardata):
119  tmpvardata = np.hstack(tmpvardata)
120  if len(tmpvardata):
121  MetaVarData.append(tmpvardata)
122  else:
123  bad_idxs.append(MetaVars.index(v))
124  for i in sorted(bad_idxs, reverse=True):
125  del MetaVars[i]
126  del MetaVarNames[i]
127  del MetaVType[i]
128  MetaVarData = np.vstack(MetaVarData)
129  MetaVarUnique, idx, inv, cnt = np.unique(MetaVarData, return_index=True, return_inverse=True, return_counts=True, axis=1)
130  # grab variables to write out
131  DataVarData = []
132  DataVarIdx = []
133  for idx2, v in enumerate(DataVars):
134  tmpvardata = []
135  for f in FileList:
136  ncf = nc.Dataset(f, mode='r')
137  try:
138  tmpdata = np.array(ncf.variables[v])
139  tmpvardata.append(tmpdata)
140  except KeyError:
141  tmpdata = np.ones_like(np.array(ncf.variables['record_number@MetaData'])).astype(DataVType[idx2])
142  if DataVType[idx2] == np.int32:
143  tmpdata = tmpdata * nc.default_fillvals['i4']
144  else:
145  tmpdata = tmpdata * np.abs(nc.default_fillvals['f4'])
146  tmpvardata.append(tmpdata)
147  validtime = dt.datetime.strptime(str(ncf.getncattr('date_time')), "%Y%m%d%H")
148  ncf.close()
149  tmpvardata = np.hstack(tmpvardata)
150  DataVarData.append(tmpvardata)
151  DataVarData = np.vstack(DataVarData)
152  DataVarUnique = np.ones((len(DataVarData), len(idx))) * nc.default_fillvals['f4']
153  for ii in range(DataVarData.shape[0]):
154  mask = ~((DataVarData[ii, :] == nc.default_fillvals['i4']) | (DataVarData[ii, :] == np.abs(nc.default_fillvals['f4'])))
155  DataVarUnique[ii, inv[mask]] = DataVarData[ii, mask]
156 
157  # set up things for the Ncwriter
158  if obstype != "ahi_himawari8":
159  ridx = np.argwhere(np.array(MetaVars) == 'record_number@MetaData')[0][0]
160  nlocs = MetaVarUnique.shape[-1]
161  writer = iconv.NcWriter(OutFile, LocKeyList)
162  var_mdata['variable_names'] = writer.FillNcVector(DataVarNames, "string")
163  # TODO add RecMetaData for Station ID, etc...
164  AttrData["date_time_string"] = validtime.strftime("%Y-%m-%dT%H:%M:%SZ")
165  writer._nvars = len(DataVarNames)
166  writer._nlocs = nlocs
167 
168  for idx3, vname in enumerate(MetaVarNames):
169  if vname[0] in ['datetime', 'station_id']:
170  # TODO Speed up this section creating a character array
171  tmp = MetaVarUnique[idx3, :]
172  tmp = tmp.astype(str)
173  if vname[0] == 'datetime':
174  jj = 20
175  else:
176  jj = 50
177  tmp2 = np.empty((len(tmp), jj), dtype=str)
178  # loop and replace chars
179  for i in range(len(tmp)):
180  s = tmp[i]
181  tmp2[i, :len(s)] = list(s)
182  loc_mdata[vname[0]] = tmp2.astype(MetaVType[idx3])
183  else:
184  loc_mdata[vname[0]] = MetaVarUnique[idx3, ...].astype(MetaVType[idx3])
185  for idx3, vname in enumerate(DataVars):
186  tmp = DataVarUnique[idx3, ...]
187  tmp = tmp.astype(DataVType[idx3])
188  if DataVType[idx3] == 'int32':
189  tmp[tmp < -1e5] = nc.default_fillvals['i4']
190  outdata[tuple(vname.split('@'))] = tmp
191  if obstype == "ahi_himawari8":
192  var_mdata['sensor_channel'] = np.asarray(list(range(7, 17)))
193  AttrData["satellite"] = satellite
194  AttrData["sensor"] = sensor
195  writer.BuildNetcdf(outdata, loc_mdata, var_mdata, AttrData, VarUnits)
196 
197  # now write out combined GeoVaLs file
198  if GeoDir:
199  # get list of geoval files
200  GeoFileList = []
201  GeoVarNames2 = []
202  GeoVarTypes2 = []
203  GeoVarNames3 = []
204  GeoVarTypes3 = []
205  GeoVarNames31 = []
206  GeoVarTypes31 = []
207  for f in FileList:
208  inob = f.split('/')[-1]
209  ingeo = inob.replace('obs', 'geoval')
210  g = GeoDir+'/'+ingeo
211  GeoFileList.append(g)
212  # figure out possible variable names
213  for f in GeoFileList:
214  ncf = nc.Dataset(f, mode='r')
215  for key, value in ncf.variables.items():
216  if key not in GeoVarNames2 and key not in GeoVarNames3 and key not in GeoVarNames31:
217  if value.ndim == 1:
218  GeoVarNames2.append(key)
219  GeoVarTypes2.append(ncf.variables[key].dtype)
220  else:
221  if 'nlevs' in value.dimensions:
222  GeoVarNames3.append(key)
223  GeoVarTypes3.append(ncf.variables[key].dtype)
224  else:
225  GeoVarNames31.append(key)
226  GeoVarTypes31.append(ncf.variables[key].dtype)
227 
228  ncf.close()
229  GeoVarData2 = []
230  GeoVarIdx2 = []
231  GeoVarData3 = []
232  GeoVarIdx3 = []
233  GeoVarData31 = []
234  GeoVarIdx31 = []
235  for idx2, v in enumerate(GeoVarNames2):
236  tmpgeodata = []
237  tmpgeoidx = []
238  for f in GeoFileList:
239  ncf = nc.Dataset(f, mode='r')
240  try:
241  tmpdata = np.array(ncf.variables[v])
242  tmpgeodata.append(tmpdata)
243  tmpgeoidx.append(np.ones_like(tmpdata).astype(int)*int(idx2))
244  except KeyError:
245  tmpdata = np.ones_like(np.array(ncf.variables['time'])).astype(GeoVarTypes2[idx2])
246  if GeoVarTypes2[idx2] == np.int32:
247  tmpdata = tmpdata * nc.default_fillvals['i4']
248  else:
249  tmpdata = tmpdata * np.abs(nc.default_fillvals['f4'])
250  tmpgeodata.append(tmpdata)
251  tmpgeoidx.append(np.ones_like(tmpdata).astype(int)*int(idx2))
252  ncf.close()
253  tmpgeodata = np.hstack(tmpgeodata)
254  tmpgeoidx = np.hstack(tmpgeoidx)
255  GeoVarData2.append(tmpgeodata)
256  GeoVarIdx2.append(tmpgeoidx)
257  GeoVarData2 = np.vstack(GeoVarData2)
258  GeoVarIdx2 = np.vstack(GeoVarIdx2)
259  GeoVarData2 = np.transpose(GeoVarData2)
260  GeoVarIdx2 = np.transpose(GeoVarIdx2)
261  GeoVarUnique2 = np.ones((len(idx), len(GeoVarData2[0])))*np.abs(nc.default_fillvals['f4'])
262  for idx2, v in enumerate(GeoVarNames3):
263  tmpgeodata = []
264  tmpgeoidx = []
265  for f in GeoFileList:
266  ncf = nc.Dataset(f, mode='r')
267  try:
268  tmpdata = np.array(ncf.variables[v])
269  tmpgeodata.append(tmpdata)
270  tmpgeoidx.append(np.ones_like(tmpdata).astype(int)*int(idx2))
271  except KeyError:
272  try:
273  tmpdata = np.ones_like(np.array(ncf.variables['air_pressure'])).astype(GeoVarTypes3[idx2])
274  except KeyError:
275  tmpdata = np.ones_like(np.array(ncf.variables['atmosphere_ln_pressure_coordinate'])).astype(GeoVarTypes3[idx2])
276  if GeoVarTypes3[idx2] == np.int32:
277  tmpdata = tmpdata * nc.default_fillvals['i4']
278  else:
279  tmpdata = tmpdata * np.abs(nc.default_fillvals['f4'])
280  tmpgeodata.append(tmpdata)
281  tmpgeoidx.append(np.ones_like(tmpdata).astype(int)*int(idx2))
282  tmpgeodata = np.vstack(tmpgeodata)
283  tmpgeoidx = np.vstack(tmpgeoidx)
284  GeoVarData3.append(tmpgeodata)
285  GeoVarIdx3.append(tmpgeoidx)
286  GeoVarData3 = np.dstack(GeoVarData3)
287  GeoVarIdx3 = np.dstack(GeoVarIdx3)
288  GeoVarUnique3 = np.ones((len(idx), len(GeoVarData3[0]), len(GeoVarData3[0, 0, :])))*np.abs(nc.default_fillvals['f4'])
289  for idx2, v in enumerate(GeoVarNames31):
290  tmpgeodata = []
291  tmpgeoidx = []
292  for f in GeoFileList:
293  ncf = nc.Dataset(f, mode='r')
294  try:
295  tmpdata = np.array(ncf.variables[v])
296  tmpgeodata.append(tmpdata)
297  tmpgeoidx.append(np.ones_like(tmpdata).astype(int)*int(idx2))
298  except KeyError:
299  try:
300  tmpdata = np.ones_like(np.array(ncf.variables['air_pressure_levels'])).astype(GeoVarTypes31[idx2])
301  except KeyError:
302  tmpdata = np.ones_like(np.array(ncf.variables['atmosphere_ln_pressure_interface_coordinate'])).astype(GeoVarTypes31[idx2])
303  if GeoVarTypes31[idx2] == np.int32:
304  tmpdata = tmpdata * nc.default_fillvals['i4']
305  else:
306  tmpdata = tmpdata * np.abs(nc.default_fillvals['f4'])
307  tmpgeodata.append(tmpdata)
308  tmpgeoidx.append(np.ones_like(tmpdata).astype(int)*int(idx2))
309  tmpgeodata = np.vstack(tmpgeodata)
310  tmpgeoidx = np.vstack(tmpgeoidx)
311  GeoVarData31.append(tmpgeodata)
312  GeoVarIdx31.append(tmpgeoidx)
313  GeoVarData31 = np.dstack(GeoVarData31)
314  GeoVarIdx31 = np.dstack(GeoVarIdx31)
315  GeoVarUnique31 = np.ones((len(idx), len(GeoVarData31[0]), len(GeoVarData31[0, 0, :])))*np.abs(nc.default_fillvals['f4'])
316  for ii, jj in np.ndindex(GeoVarData2.shape):
317  j = GeoVarIdx2[ii, jj]
318  i = inv[ii]
319  if GeoVarData2[ii, jj] != nc.default_fillvals['i4'] and GeoVarData2[ii, jj] != np.abs(nc.default_fillvals['f4']):
320  GeoVarUnique2[i, j] = GeoVarData2[ii, jj]
321  for ii, kk, jj in np.ndindex(GeoVarData3.shape):
322  j = GeoVarIdx3[ii, kk, jj]
323  i = inv[ii]
324  if GeoVarData3[ii, kk, jj] != nc.default_fillvals['i4'] and GeoVarData3[ii, kk, jj] != np.abs(nc.default_fillvals['f4']):
325  GeoVarUnique3[i, kk, j] = GeoVarData3[ii, kk, jj]
326  for ii, kk, jj in np.ndindex(GeoVarData31.shape):
327  j = GeoVarIdx31[ii, kk, jj]
328  i = inv[ii]
329  if GeoVarData31[ii, kk, jj] != nc.default_fillvals['i4'] and GeoVarData31[ii, kk, jj] != np.abs(nc.default_fillvals['f4']):
330  GeoVarUnique31[i, kk, j] = GeoVarData31[ii, kk, jj]
331  OutGeoFile = OutFile.replace('obs', 'geoval')
332  of = nc.Dataset(OutGeoFile, 'w', format='NETCDF4')
333  of.setncattr("date_time", ncf.getncattr("date_time"))
334  nlocs = len(GeoVarUnique3)
335  nlevs = len(GeoVarUnique3[0])
336  ninterfaces = len(GeoVarUnique31[0])
337  of.createDimension("nlocs", nlocs)
338  of.createDimension("nlevs", nlevs)
339  of.createDimension("ninterfaces", ninterfaces)
340  for ivar, var in enumerate(GeoVarNames2):
341  dims = ("nlocs", )
342  var_out = of.createVariable(var, GeoVarTypes2[ivar], dims)
343  var_out[...] = GeoVarUnique2[..., ivar]
344  for ivar, var in enumerate(GeoVarNames3):
345  dims = ("nlocs", "nlevs")
346  var_out = of.createVariable(var, GeoVarTypes3[ivar], dims)
347  var_out[...] = GeoVarUnique3[..., ivar]
348  for ivar, var in enumerate(GeoVarNames31):
349  dims = ("nlocs", "ninterfaces")
350  var_out = of.createVariable(var, GeoVarTypes31[ivar], dims)
351  var_out[...] = GeoVarUnique31[..., ivar]
352 
353 
354 ######################################################
355 ######################################################
356 if __name__ == '__main__':
357  parser = argparse.ArgumentParser(
358  description='Combine conventional obs and GOESIR (currently ahi_himawari8) in IODA format into one output file',
359  formatter_class=argparse.ArgumentDefaultsHelpFormatter)
360  parser.add_argument(
361  '-i', '--input', help='list of the input files to combine',
362  type=str, nargs='+', required=True)
363  parser.add_argument(
364  '-o', '--output', help='name of the output IODA observation file',
365  type=str, required=True, default=None)
366  parser.add_argument('-g', '--geovals', help='path to where matching geovals should be')
367 
368  args = parser.parse_args()
369 
370  FileList = args.input
371  OutFile = args.output
372 
373  GeoDir = False
374  if args.geovals:
375  GeoDir = args.geovals
376 
377  concat_ioda(FileList, OutFile, GeoDir)
def concat_ioda(FileList, OutFile, GeoDir)