Loading [MathJax]/extensions/tex2jax.js
IODA
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
upgrade.cpp
Go to the documentation of this file.
1 /*
2  * (C) Copyright 2021 UCAR
3  *
4  * This software is licensed under the terms of the Apache Licence Version 2.0
5  * which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
6  */
7 /*! @file upgrade.cpp
8 * @brief A program to upgrade ioda files to a newer format.
9 *
10 * Call program as: ioda-upgrade.x [-n] input_file output_file
11 */
12 
13 #include <algorithm>
14 #include <cctype>
15 #include <cstdlib>
16 #include <exception>
17 #include <iostream>
18 #include <numeric>
19 #include <set>
20 #include <string>
21 #include <utility>
22 #include <vector>
23 
24 #include "ioda/Engines/Factory.h"
25 #include "ioda/Engines/HH.h"
26 #include "ioda/Exception.h"
27 #include "ioda/Group.h"
28 #include "ioda/ObsGroup.h"
30 #include "ioda/Misc/StringFuncs.h"
31 
32 // Annoying header junk on Windows. Who in their right mind defines a macro with
33 // the same name as a standard library function?
34 #ifdef max
35 #undef max
36 #endif
37 #ifdef min
38 #undef min
39 #endif
40 
/*! @brief Heuristic check for whether a variable name @b might belong to a dimension scale.
 *
 * @details The answer is only a hint. Regular variables are expected to carry
 * either a "@" or a "/" somewhere in their names, whereas dimension scales are
 * expected to carry neither, so any name free of both characters is reported
 * as a possible scale.
 * @param name is the variable name
 * @returns true if the name contains neither "@" nor "/", false otherwise.
 */
bool isPossiblyScale(const std::string& name)
{
  const bool hasAtSign = name.find('@') != std::string::npos;
  const bool hasSlash = name.find('/') != std::string::npos;
  return !hasAtSign && !hasSlash;
}
56 
57 /*! @brief Sort variable names in a preferential way so that likely scales end up first. For speed.
58 * @param allVars is an unordered vector of all variables.
59 * @returns an ordered list. "nlocs" is first, then all potential scales, then all other variables.
60 */
61 std::list<std::string> preferentialSortVariableNames(const std::vector<std::string>& allVars) {
62  std::list<std::string> sortedAllVars;
63  for (const auto& name : allVars) {
64  if (sortedAllVars.empty()) {
65  sortedAllVars.push_back(name);
66  } else {
67  if (isPossiblyScale(name)) {
68  auto second = sortedAllVars.begin();
69  second++;
70  if (sortedAllVars.front() == "nlocs") {
71  sortedAllVars.insert(second, name);
72  } else {
73  sortedAllVars.push_front(name);
74  }
75  } else {
76  sortedAllVars.push_back(name);
77  }
78  }
79  }
80  return sortedAllVars;
81 }
82 
83 
84 
85 typedef std::vector<ioda::Named_Variable> Vec_Named_Variable;
86 typedef std::map<ioda::Named_Variable, Vec_Named_Variable> VarDimMap;
87 
88 /// @brief Traverse file structure and determine dimension scales and regular variables. Also
89 /// determine which dimensions are attached to which variables at which dimension numbers.
90 /// @param[in] obsGroup is the incoming group. Really any group works.
91 /// @param[out] varList is the list of variables (not dimension scales).
92 /// @param[out] dimVarList is the list of dimension scales.
93 /// @param[out] dimsAttachedToVars is the mapping of the scales attached to each variable.
94 /// @param[out] maxVarSize0 is the max dimension length (nlocs). Unused here, but used in ioda.
95 void collectVarDimInfo(const ioda::Group& obsGroup, Vec_Named_Variable& varList,
96  Vec_Named_Variable& dimVarList, VarDimMap& dimsAttachedToVars,
97  ioda::Dimensions_t& maxVarSize0) {
98  using namespace ioda;
99  // We really want to maximize performance here and avoid excessive variable
100  // re-opens and closures that would kill the HDF5 backend.
101  // We want to:
102  // 1) separate the dimension scales from the regular variables.
103  // 2) determine the maximum size along the 0-th dimension.
104  // 3) determine which dimensions are attached to which variable axes.
105 
106  // Retrieve all variable names from the input file. Argument to listObjects is bool
107  // and when true will cause listObjects to recurse through the entire Group hierarchy.
108  std::vector<std::string> allVars = obsGroup.listObjects<ObjectType::Variable>(true);
109 
110  // A sorted list of all variable names that will help optimize the actual processing.
111  std::list<std::string> sortedAllVars = preferentialSortVariableNames(allVars);
112 
113  // TODO(ryan): refactor
114  // GeoVaLs fix: all variables appear at the same level, and this is problematic.
115  // Detect these files and do some extra sorting.
116  if (obsGroup.list().empty()) { // No Groups under the ObsGroup
117  std::list<std::string> fix_known_scales, fix_known_nonscales;
118  for (const auto& vname : sortedAllVars) {
119  Named_Variable v{vname, obsGroup.vars.open(vname)};
120  if (v.var.isDimensionScale()) {
121  (v.name == "nlocs") // true / false ternary
122  ? fix_known_scales.push_front(v.name)
123  : fix_known_scales.push_back(v.name);
124  } else
125  fix_known_nonscales.push_back(v.name);
126  }
127  sortedAllVars.clear();
128  for (const auto& e : fix_known_scales) sortedAllVars.push_back(e);
129  for (const auto& e : fix_known_nonscales) sortedAllVars.push_back(e);
130  }
131 
132  // Now for the main processing loop.
133  // We separate dimension scales from non-dimension scale variables.
134  // We record the maximum sizes of variables.
135  // We construct the in-memory mapping of dimension scales and variable axes.
136  // Keep track of these to avoid re-opening the scales repeatedly.
137  std::list<Named_Variable> dimension_scales;
138 
139  varList.reserve(allVars.size());
140  dimVarList.reserve(allVars.size());
141  maxVarSize0 = 0;
142  for (const auto& vname : sortedAllVars) {
143  Named_Variable v{vname, obsGroup.vars.open(vname)};
144  const auto dims = v.var.getDimensions();
145  if (dims.dimensionality >= 1) {
146  maxVarSize0 = std::max(maxVarSize0, dims.dimsCur[0]);
147  }
148 
149  // Expensive function call.
150  // Only 1-D variables can be scales. Also pre-filter based on name.
151  if (dims.dimensionality == 1 && isPossiblyScale(vname)) {
152  if (v.var.isDimensionScale()) {
153  (v.name == "nlocs") // true / false ternary
154  ? dimension_scales.push_front(v)
155  : dimension_scales.push_back(v);
156  dimVarList.push_back(v);
157 
158  //std::cout << "Dimension: " << v.name << " - size " << dims.numElements << "\n";
159  continue; // Move on to next variable in the for loop.
160  }
161  }
162 
163  // See above block. By this point in execution, we know that this variable
164  // is not a dimension scale.
165  varList.push_back(v);
166 
167  // Let's figure out which scales are attached to which dimensions.
168  auto attached_dimensions = v.var.getDimensionScaleMappings(dimension_scales);
169  std::vector<Named_Variable> dimVars;
170  dimVars.reserve(dims.dimensionality);
171  for (const auto& dim_scales_along_axis : attached_dimensions) {
172  if (dim_scales_along_axis.empty()) {
173  throw Exception("Unexpected size of dim_scales_along_axis", ioda_Here());
174  }
175  dimVars.push_back(dim_scales_along_axis[0]);
176  }
177  //std::cout << "\nVar " << v.name << ": |";
178  // for (const auto& i : dimVars) std::cout << " " << i.name << " |";
179 
180  dimsAttachedToVars.emplace(v, dimVars);
181  }
182  //std::cout << std::endl;
183 }
184 
/// @brief Locate the "_<number>" channel suffix at the end of a variable name.
/// @details Only the last underscore in the name is considered, so when a name holds
///   several "_<number>" pieces the final one is taken as the channel number indicator.
///   The suffix is recognized only if at least one character follows that underscore
///   and every character after it is a decimal digit.
/// @param[in] name is the variable name to examine.
/// @returns the index of the underscore that starts the channel suffix, or
///   std::string::npos if the name has no such suffix.
std::size_t getChanSuffixPos(const std::string & name) {
  const std::size_t underscore = name.rfind('_');
  if (underscore == std::string::npos) return std::string::npos;

  // A bare trailing underscore carries no channel number, so it is not a suffix.
  const std::size_t firstDigit = underscore + 1;
  if (firstDigit >= name.length()) return std::string::npos;

  // Anything other than digits after the underscore disqualifies the suffix.
  const bool digitsOnly
    = name.find_first_not_of("0123456789", firstDigit) == std::string::npos;
  return digitsOnly ? underscore : std::string::npos;
}
209 
210 /// @brief Determine which variables may be grouped.
211 /// @param[in] inVarList is the list of all variables.
212 /// @param[out] similarVariables is the collection of similar variables, grouped by similarity and sorted numerically.
213 /// @param[out] dissimilarVariables are all variables that are not "similar".
214 void identifySimilarVariables(const Vec_Named_Variable& inVarList, VarDimMap& similarVariables,
215  Vec_Named_Variable &dissimilarVariables) {
216  using namespace ioda;
217  using namespace std;
218  dissimilarVariables.reserve(inVarList.size());
219 
220  // Transform names to new format so that all groups come first. Then, sort so that similar
221  // variables are lexically related.
222  Vec_Named_Variable sortedNames = inVarList;
223  for (auto& v : sortedNames) v.name = convertV1PathToV2Path(v.name);
224  sort(sortedNames.begin(), sortedNames.end());
225 
226  // Iterate through the sorted list. Find ranges of *similar* variable names where the
227  // variables end in '_' + a number.
228  auto varsAreSimilar = [](const std::string& lhs, const std::string& rhs) -> bool {
229  // Don't allow variables in the meta data groups to be associated. These variables
230  // should always be vectors dimensioned by the axis they describe.
231  if ((lhs.find("MetaData/") != string::npos) || (rhs.find("MetaData/") != string::npos)) {
232  return false;
233  }
234  // Split if statements to avoid an out-of-bounds bug that could otherwise occur with
235  // substr(0, found_range+1).
236  if (lhs.find_first_of("0123456789") == string::npos
237  && rhs.find_first_of("0123456789") == string::npos)
238  return lhs == rhs;
239  string id_lhs = lhs.substr(0, getChanSuffixPos(lhs));
240  string id_rhs = rhs.substr(0, getChanSuffixPos(rhs));
241  return id_lhs == id_rhs;
242  };
243 
244  // Collect up similar named variables and place them under their "base" name. If the
245  // variable name is unique and doesn't have a channel suffix, then place it in the
246  // dissimilarVariables list. Otherwise record the name variants under the base name
247  // in the similarVariables list.
248  auto collect = [&dissimilarVariables, &similarVariables](Vec_Named_Variable::const_iterator start, Vec_Named_Variable::const_iterator end) {
249  // End of a range. If range has only one variable, check if it has channel suffix
250  if (start == end) {
251  if (start->name.find("MetaData/") != string::npos ||
252  getChanSuffixPos(start->name) == std::string::npos) {
253  // Metadata variable, or no channel suffix: save as unique variable
254  cout << " Unique variable: " << start->name << ".\n";
255  dissimilarVariables.push_back(*start);
256  } else {
257  // Not Metadata varaible and channel suffix: figure out the new name.
258  string rangeName
259  = start->name.substr(0, getChanSuffixPos(start->name));
260  cout << " Grouping 1 variable into: " << rangeName << ".\n";
261  similarVariables[Named_Variable{rangeName, Variable()}] = {*start};
262  }
263  }
264  // If a range has multiple variables, sort and group.
265  else {
266  // A range has been found. Pack into similarVariables.
267  Vec_Named_Variable range(start, end+1);
268  // Sort this range based on a true numeric sort. The usual lexical sort is problematic
269  // because variable suffixes have different lengths.
270  sort(range.begin(), range.end(),
271  [](const Named_Variable& lhs, const Named_Variable& rhs) -> bool {
272  string sidnum_lhs = lhs.name.substr(getChanSuffixPos(lhs.name) + 1);
273  string sidnum_rhs = rhs.name.substr(getChanSuffixPos(rhs.name) + 1);
274  int idnum_lhs = std::atoi(sidnum_lhs.c_str());
275  int idnum_rhs = std::atoi(sidnum_rhs.c_str());
276  return idnum_lhs < idnum_rhs;
277  });
278 
279  // Figure out the new name.
280  string rangeName
281  = start->name.substr(0, getChanSuffixPos(start->name));
282  cout << " Grouping " << range.size() << " variables into: " << rangeName << ".\n";
283  similarVariables[Named_Variable{rangeName, Variable()}] = std::move(range);
284  }
285  };
286 
287  auto rangeStart = sortedNames.cbegin();
288  auto rangeEnd = rangeStart;
289  for (auto it = sortedNames.cbegin() + 1; it != sortedNames.cend(); ++it) {
290  if (varsAreSimilar(rangeStart->name, it->name)) {
291  rangeEnd = it;
292  } else {
293  collect(rangeStart, rangeEnd);
294  rangeStart = it;
295  rangeEnd = it;
296  }
297 
298  // Special case terminating the variable sequence.
299  if ((it + 1) == sortedNames.cend()) {
300  collect(rangeStart, rangeEnd);
301  }
302  }
303 }
304 
305 
306 
307 
308 /*
309 /// @brief Swap out new dimension scale names
310 /// @param oldDimsAttachedToVars
311 /// @param newDimList
312 /// @return A variable -> dimension map that references the new dimension scales. Old variable mappings are used.
313 VarDimMap translateToNewDims(
314  const Vec_Named_Variable& newDimList,
315  const VarDimMap& oldDimsAttachedToVars) {
316  std::map<std::string, ioda::Named_Variable> newDims;
317  for (const auto& d : newDimList) newDims[d.name] = d;
318 
319  VarDimMap res;
320 
321  for (const auto& oldVar : oldDimsAttachedToVars) {
322  const auto& oldVec = oldVar.second;
323  Vec_Named_Variable newVec(oldVec);
324  for (auto& s : newVec) s.var = newDims.at(s.name).var;
325  res[ioda::Named_Variable(oldVar.first.name, ioda::Variable())] = newVec;
326  }
327 
328  return res;
329 }
330 */
331 
332 /// @brief Copy data from oldvar into newvar. Offsets are supported for variable combination.
333 /// @param oldvar is the old variable.
334 /// @param newvar is the new variable.
335 /// @param base is the ObsGroup root object. Used in detecting ioda file versions.
336 /// @todo Add offset, oldvar_dims, newvar_dims.
337 void copyData(const Vec_Named_Variable& old, ioda::Variable& newvar, const ioda::ObsGroup& base,
338  const std::string & newVarName, const std::map<int, int> & chanNumToIndex) {
339  using namespace ioda;
340  using namespace std;
341 
342  // DEBUG(ryan): Add the old variable names as a string attribute to ensure proper alignment.
343  //vector<string> var_alignment;
344  //for (const auto& v : old) var_alignment.push_back(v.name);
345  //newvar.atts.add<string>("check_copy_vars", var_alignment);
346 
347  // Loop over each variable in old and apply to the appropriate place in newvar.
348  for (size_t i = 0; i < old.size(); ++i) {
349  const Variable oldvar = old[i].var;
350 
351  Dimensions oldvar_dims = oldvar.getDimensions();
352  Dimensions newvar_dims = newvar.getDimensions();
353  size_t sz_type_in_bytes = oldvar.getType().getSize();
354  if (oldvar.isA<std::string>()) {
355  // Some old ioda files have really odd string formats. We detect these here and
356  // repack the strings appropriately.
357  vector<string> buf_in;
358  oldvar.read<string>(buf_in);
359  if (oldvar_dims.numElements == newvar_dims.numElements) {
360  newvar.write<string>(buf_in);
361  } else {
362  if (oldvar_dims.dimensionality > 0) {
363  vector<string> buf_out;
364  buf_out.reserve(gsl::narrow<size_t>(newvar_dims.numElements));
365  // Look at the last dimension in oldvar_dims. Group according to this size.
366  size_t group_sz
367  = gsl::narrow<size_t>(oldvar_dims.dimsCur[oldvar_dims.dimensionality - 1]);
368  vector<char> new_str(group_sz + 1, '\0');
369  for (size_t i = 0; i < buf_in.size(); ++i) {
370  size_t idx = i % group_sz;
371  new_str[idx] = buf_in[i][0];
372 
373  string str(new_str.data());
374  // In-place right trim
375  str.erase(std::find_if(str.rbegin(), str.rend(),
376  [](unsigned char ch) { return !std::isspace(ch); }).base(), str.end());
377 
378  if (idx + 1 == group_sz) buf_out.push_back(str);
379  }
380  newvar.write<string>(buf_out);
381  }
382  }
383  } else {
384  vector<char> buf(oldvar_dims.numElements * sz_type_in_bytes);
385  oldvar.read(gsl::make_span<char>(buf.data(), buf.size()), oldvar.getType());
386  if (old.size() == 1) {
387  // We are writing out the entire variable.
388  newvar.write(gsl::make_span<char>(buf.data(), buf.size()), newvar.getType());
389  } else {
390  // If the chanNumToIndex is not empty, extract the channel number from the
391  // var name suffix and use the corresponding index for writing the variable.
392  int chanIndex = -1;
393  if (!chanNumToIndex.empty()) {
394  string oldVarName = old[i].name;
395  if (oldVarName.find(newVarName) == 0) {
396  // have a name with a channel suffix
397  int pos = newVarName.length() + 1;
398  int chanNum = stoi(oldVarName.substr(pos));
399  chanIndex = chanNumToIndex.at(chanNum);
400  }
401  }
402 
403  // We are writing a selection. Needs start, count, stride, block.
404  Selection::VecDimensions_t extent_ioda = newvar_dims.dimsCur;
405  Selection::VecDimensions_t extent_mem = newvar_dims.dimsCur;
406  *extent_mem.rbegin() = 1;
407 
408  Selection::VecDimensions_t start_mem(newvar_dims.dimensionality);
409  Selection::VecDimensions_t start_ioda(newvar_dims.dimensionality);
410  if (chanIndex >= 0) {
411  *start_ioda.rbegin() = chanIndex;
412  } else {
413  *start_ioda.rbegin() = i;
414  }
415  Selection::VecDimensions_t count = newvar_dims.dimsCur;
416  *count.rbegin() = 1;
417  Selection::VecDimensions_t stride(newvar_dims.dimensionality, 1);
418  Selection::VecDimensions_t block(newvar_dims.dimensionality, 1);
419 
420  Selection::SingleSelection sel_mem(SelectionOperator::SET, start_mem, count, stride, block);
421  Selection mem_selection(extent_mem);
422  mem_selection.select(sel_mem);
423 
424  Selection::SingleSelection sel_ioda(SelectionOperator::SET, start_ioda, count, stride, block);
425  Selection ioda_selection(extent_ioda);
426  ioda_selection.select(sel_ioda);
427 
428  newvar.write(gsl::make_span<char>(buf.data(), buf.size()), newvar.getType(), mem_selection,
429  ioda_selection);
430  }
431  }
432  }
433 }
434 
435 /// @brief Copy attributes from src to dest. Ignore duplicates and dimension scales.
436 /// @param src is the source.
437 /// @param dest is the destination.
439  using namespace ioda;
440  using namespace std;
441  vector<pair<string, Attribute>> srcAtts = src.openAll();
442 
443  for (const auto &s : srcAtts) {
444  // This set contains the names of atttributes that need to be stripped off of
445  // variables coming from the input file. The items in the list are related to
446  // dimension scales and will confuse the netcdf API and tools if allowed to be
447  // copied to the output file variables.
448  //
449  // In other words, these attributes assist the netcdf API in navigating the
450  // association of variables with dimension scales and have meaning to the netcdf API.
451  // These represent the associations in the input file and need to be stripped off
452  // since the associations in the output file will be re-created (and will not
453  // necessarily match the associations in the input file).
454  const set<string> ignored_names{
455  "CLASS",
456  "DIMENSION_LIST",
457  "NAME",
458  "REFERENCE_LIST",
459  "_Netcdf4Coordinates",
460  "_Netcdf4Dimid",
461  "_nc3_strict"
462  };
463  if (ignored_names.count(s.first)) continue;
464  if (dest.exists(s.first)) continue;
465 
466  Dimensions dims = s.second.getDimensions();
467  Type typ = s.second.getType();
468  size_t sz_type_in_bytes = typ.getSize();
469 
470  // Some variable attributes consist of an empty string in which case
471  // numElements is zero. In this is the case, create an empty string in the
472  // destination output, but make it consist of the null byte.
473  if (dims.numElements == 0) {
474  vector<char> buf(1, '\0');
475  Attribute newatt = dest.create(s.first, typ, { 1 });
476  newatt.write(gsl::make_span<char>(buf.data(), buf.size()), typ);
477  } else {
478  // copy from src attribute to dest attribute
479  vector<char> buf(dims.numElements * sz_type_in_bytes);
480  s.second.read(gsl::make_span<char>(buf.data(), buf.size()), typ);
481 
482  Attribute newatt = dest.create(s.first, typ, dims.dimsCur);
483  newatt.write(gsl::make_span<char>(buf.data(), buf.size()), typ);
484  }
485  }
486 }
487 
/// @brief Options that steer a single upgradeFile() run.
// NOTE(review): the opening line of this struct was absent from the reviewed listing.
// It is restored here from the way the type is used - confirm against upstream.
struct UpgradeParameters {
  /// When true (the default), upgradeFile() calls identifySimilarVariables() to combine
  /// similar variables; when false, every variable is kept on its own.
  bool groupSimilarVariables = true;
};
491 
/// @brief Read one input file and write its contents to a new output file.
/// @details The steps below follow the order of the code:
///   - open the input with Engines::HH::openMemoryFile and collect its variables and
///     dimension scales with collectVarDimInfo;
///   - if params.groupSimilarVariables is set, split the variables into grouped and
///     ungrouped ones with identifySimilarVariables;
///   - recreate only those dimension scales that are attached to some variable and, when
///     any group exists, add an "nchans" scale holding the channel numbers;
///   - create the output file and its ObsGroup, copy attributes, create all variables,
///     attach the scales, and finally copy the data with copyData.
/// @param inputName is the path of the file to read.
/// @param outputName is the path of the file to create.
/// @param params holds the upgrade options.
/// @returns true. The visible code has no other return statement.
// NOTE(review): the reviewed listing lacks a few lines of this function (flagged below),
// so only comments are changed in this block.
492 bool upgradeFile(const std::string& inputName, const std::string& outputName, const UpgradeParameters &params) {
493  // Open file, determine dimension scales and variables.
494  using namespace ioda;
495  using namespace std;
496  const Group in = Engines::HH::openMemoryFile(inputName);
497 
498  Vec_Named_Variable varList, dimVarList;
499  VarDimMap dimsAttachedToVars;
500  Dimensions_t maxVarSize0;
501 
502  collectVarDimInfo(in, varList, dimVarList, dimsAttachedToVars, maxVarSize0);
503 
504  // Figure out which variables can be combined
505  Vec_Named_Variable ungrouped_varList;
506  VarDimMap old_grouped_vars;
 // NOTE(review): the local below is never read; the branch that follows consults
 // params.groupSimilarVariables instead.
507  const bool groupSimilarVariables = false;
508  if (params.groupSimilarVariables)
509  identifySimilarVariables(varList, old_grouped_vars, ungrouped_varList);
510  else
511  ungrouped_varList = varList;
512 
513  // Create the output file
514 
515  // TODO(ryan): Fix this odd workaround where the map searches fail oddly.
516  map<string, Vec_Named_Variable> dimsAttachedToVars_bystring;
517  for (const auto& val : dimsAttachedToVars)
518  dimsAttachedToVars_bystring[convertV1PathToV2Path(val.first.name)] = val.second;
519 
520  // Construct the ObsGroup with the same scales as the input file.
521  //
522  // There are some cases where extraneous dimensions get included. An extraneous
523  // dimension is one that is not attached to any variable in the file. Exclude defining
524  // extraneous dimensions in the output file. To help with this, create a set
525  // of dim names and use this to mark which dimensions are being used.
526  set<string> attachedDims;
527  for (const auto & ivar : dimsAttachedToVars_bystring) {
528  for (const auto & idim : dimsAttachedToVars_bystring.at(ivar.first)) {
529  attachedDims.insert(idim.name);
530  }
531  }
532 
533  NewDimensionScales_t newdims;
534  for (const auto& dim : dimVarList) {
535  // GMI data bug: nchans already exists. Suppress creation of this scale if
536  // we are grouping new data to nchans (below).
537  // Also suppress creation of any scales not being used in the input file.
538  if (!(dim.name == "nchans" && old_grouped_vars.size()) &&
539  (attachedDims.find(dim.name) != attachedDims.end()))
540  newdims.push_back(
541  NewDimensionScale(dim.name, dim.var, ScaleSizes{Unspecified, Unspecified, 100}));
542  }
543  if (old_grouped_vars.size()) {
544  cout << " Creating nchans variable.\n";
545  // Extract the channel numbers
546  //
547  // First, find the variable with the maximum number of channels and use that
548  // as a template for the others. This covers cases where some of the channel variables
549  // are missing in some groups. These variables will end up with missing data for the
550  // channels they don't have.
551  VarDimMap::iterator chanTemplate;
552  int maxChanSize = 0;
553  for (VarDimMap::iterator ivar = old_grouped_vars.begin();
554  ivar != old_grouped_vars.end(); ++ivar) {
555  if (ivar->second.size() > maxChanSize) {
556  chanTemplate = ivar;
557  maxChanSize = ivar->second.size();
558  }
559  }
560 
 // NOTE(review): the channel number is taken from two characters past the last character
 // that is neither '_' nor a digit. That looks like it disagrees with getChanSuffixPos()
 // when the base name itself ends in a digit (e.g. "x2_12") - confirm.
561  vector<int32_t> channels(chanTemplate->second.size());
562  for (size_t i = 0; i < chanTemplate->second.size(); ++i) {
563  string schan = chanTemplate->second[i].name.substr(
564  chanTemplate->second[i].name.find_last_not_of("_0123456789") + 2);
565  channels[i] = std::atoi(schan.c_str());
566  }
567 
568  // Limited dimension. Channels are chunked together.
569  auto nds = NewDimensionScale<int32_t>("nchans", gsl::narrow<Dimensions_t>(channels.size()),
570  gsl::narrow<Dimensions_t>(channels.size()),
571  gsl::narrow<Dimensions_t>(channels.size()));
572  nds->initdata_ = channels; // Pass initial channel data.
573  newdims.push_back(nds);
574  }
575 
576  Group g_out = Engines::HH::createFile(outputName,
 // NOTE(review): listing lines 577-579, which hold the remaining arguments and the end
 // of the createFile call above, are absent from the reviewed listing.
580  ObsGroup out = ObsGroup::generate(g_out, newdims);
581 
582  // Copy attributes from the root group
583  copyAttributes(in.atts, out.atts);
584 
585  // Open all new scales
586  map<string, Variable> newscales, newvars;
587  for (const auto& dim : newdims) newscales[dim->name_] = out.vars[dim->name_];
588  // Copy missing attributes from old scales.
589  for (const Named_Variable& d : dimVarList) {
590  if (attachedDims.find(d.name) != attachedDims.end()) {
591  copyAttributes(d.var.atts, newscales.at(d.name).atts);
592  }
593  }
594 
595 
596  // Make all variables and store handles. Do not attach dimension scales yet.
597  // Loop is split for ungrouped vs grouped vars.
 // Note: the lambda parameter named params is a VariableCreationParameters object and
 // hides the UpgradeParameters argument of upgradeFile inside the lambda body.
598  auto makeNewVar = [&newvars,&out](const Named_Variable &oldVar, const Dimensions &dims, const VariableCreationParameters &params) {
599  // Check if we are creating a string variable. If so, determine if we are upgrading
600  // the string format. This is also relevant for the copyData function, which checks
601  // the re-mapping of dimensions to see if a string repack is needed.
602  if (oldVar.var.isA<string>()) {
603  // In the really old format, fixed-length strings each have a size of one byte.
604  // We use this as the discriminator to signify that these strings need conversion.
605  size_t sz_bytes = oldVar.var.getType().getSize();
606  Dimensions mod_dims = dims;
607  VariableCreationParameters adjustedParams = params;
608  if (sz_bytes == 1 && mod_dims.dimensionality > 1) {
609  mod_dims.dimensionality -= 1;
610  mod_dims.dimsCur.resize(mod_dims.dimsCur.size() - 1);
611  mod_dims.dimsMax.resize(mod_dims.dimsMax.size() - 1);
612  mod_dims.numElements = (mod_dims.dimensionality == 0) ? 0 : 1;
613  for (const auto& d : mod_dims.dimsCur) mod_dims.numElements *= d;
614 
615  adjustedParams.chunks = mod_dims.dimsCur; // A suggestion.
616  }
617 
618  // Set the fill value to an empty string. The calls to getCreationParameters()
619  // on the ioda v1 variables that preceed the call to this function set the fill
620  // value to a null character (\0) since the ioda v1 format for strings is a
621  // character array style. We are going to convert that character array to a vector
622  // of strings and the fill value needs to use the special string container instead
623  // of the union (which the character uses).
624  adjustedParams.setFillValue<string>("");
625 
626  cout << " Converting old-format string variable: " << oldVar.name << "\n";
627 
628  newvars[oldVar.name] = out.vars.create<string>(oldVar.name, mod_dims, adjustedParams);
629  return newvars[oldVar.name];
630  } else {
631  // TODO(ryan): turn on chunking and compression everywhere relevant.
632  VariableCreationParameters adjustedParams = params;
633  adjustedParams.chunk = true;
634  {
635  // Ideal chunking is a bit complicated.
636  // Start with using all dimensions. If this is greater than 6400,
637  // reduce the rightmost dimension. If rightmost dimension equals 1,
638  // then target the second-to-last dimension, and so on.
639  adjustedParams.chunks = dims.dimsCur; // Initial suggestion.
640  auto& c = adjustedParams.chunks;
641  const Dimensions_t max_chunk_size = 6400;
642  while (accumulate(c.begin(), c.end(),
643  static_cast<Dimensions_t>(1), multiplies<Dimensions_t>())> max_chunk_size)
644  {
645  auto dim = c.rbegin();
646  while (*dim == 1) dim++;
647  *dim /= 2;
648  }
649  }
650  adjustedParams.compressWithGZIP();
651 
652  newvars[oldVar.name]
653  = out.vars.create(oldVar.name, oldVar.var.getType(), dims, adjustedParams);
654  return newvars[oldVar.name];
655  }
656  };
657  VarDimMap dimsForNewVars;
658 
659 
660  // create vars in the ungrouped list, including copy of their attributes
661  for (const auto& oldVar : ungrouped_varList) {
662  Dimensions dims = oldVar.var.getDimensions();
663  // TODO(ryan): copy over other attributes?
664  VariableCreationParameters params = oldVar.var.getCreationParameters(false, false);
665  auto newvar = makeNewVar(oldVar, dims, params);
666  copyAttributes(oldVar.var.atts, newvar.atts);
667  const Vec_Named_Variable old_attached_dims
668  = dimsAttachedToVars_bystring.at(convertV1PathToV2Path(oldVar.name));
669  dimsForNewVars[Named_Variable{oldVar.name, newvar}] = old_attached_dims;
670  }
671 
672  const Dimensions_t suggested_chan_chunking
673  = (newscales.count("nchans")) ? newscales["nchans"].atts["suggested_chunk_dim"].read<Dimensions_t>() : 100;
674  map<string, Named_Variable> new_grouped_vars;
 // numChans is assigned only when grouped variables exist; it is read only inside the
 // loop over old_grouped_vars below.
675  Dimensions_t numChans;
676  if (old_grouped_vars.size() > 0) {
677  numChans = out.vars.open("nchans").getDimensions().dimsCur[0];
678  }
679  for (const auto& oldGroup : old_grouped_vars) {
680  Dimensions dims = oldGroup.second.begin()->var.getDimensions();
681  Dimensions_t n = gsl::narrow<Dimensions_t>(oldGroup.second.size());
682  if (n > 1) {
683  n = numChans;
684  }
685  dims.dimensionality++;
686  dims.dimsCur.push_back(n);
687  dims.dimsMax.push_back(n);
688  dims.numElements *= n;
689 
 // NOTE(review): listing line 690, which should begin the declaration that the
 // initializer on the next line belongs to, is absent from the reviewed listing.
691  = oldGroup.second.begin()->var.getCreationParameters(false, false);
692  params.chunks.push_back(suggested_chan_chunking);
693 
694  Named_Variable proto_var{oldGroup.first.name, oldGroup.second.begin()->var};
695  auto createdVar = makeNewVar(proto_var, dims, params);
696  // Copy attributes from all old variables.
697  for (const auto& src : oldGroup.second) copyAttributes(src.var.atts, createdVar.atts);
698 
699  // Also add in a new entry in dimsAttachedToVars for this variable grouping.
700  Named_Variable created{proto_var.name, createdVar};
701  Vec_Named_Variable ungrouped_scales
702  = dimsAttachedToVars_bystring.at(convertV1PathToV2Path(oldGroup.second.begin()->name));
703  Vec_Named_Variable grouped_scales = ungrouped_scales;
704  grouped_scales.push_back(Named_Variable{"nchans", newscales["nchans"]});
705  dimsForNewVars[created] = grouped_scales;
706  new_grouped_vars[oldGroup.first.name] = created;
707  }
708 
709 
710 
711  // Attach all dimension scales to all variables.
712  // We separate this from the variable creation (above) since we might want to implement a
713  // collective call.
714  {
715  vector<pair<Variable, vector<Variable>>> out_dimsAttachedToVars;
716  auto make_out_dimsAttachedToVars
717  = [&newvars, &newscales, &out_dimsAttachedToVars](const Vec_Named_Variable& olddims,
718  const Named_Variable& m) {
719  Variable newvar{newvars[m.name]};
720  vector<Variable> newdims;
721  for (const auto& d : olddims)
722  newdims.emplace_back(newscales[d.name]);
723  // Check for an old-format string. If found, drop the last dimension.
724  if (m.var.isA<string>()) {
725  if (m.var.getType().getSize() == 1) {
726  newdims.pop_back();
727  }
728  }
729  out_dimsAttachedToVars.emplace_back(make_pair(newvar, newdims));
730  };
731  for (const auto& m : ungrouped_varList) {
732  make_out_dimsAttachedToVars(dimsForNewVars.at(m), m);
733  }
734  for (const auto& m : new_grouped_vars) {
735  make_out_dimsAttachedToVars(dimsForNewVars.at(m.second), m.second);
736  }
737  out.vars.attachDimensionScales(out_dimsAttachedToVars);
738  }
739 
740  cout << "\n Copying data:\n";
741 
742  // Copy over all data.
743  // Do this for both variables and scales!
744  for (const auto& oldvar : ungrouped_varList) {
745  cout << " " << oldvar.name << "\n";
746  copyData(Vec_Named_Variable{oldvar}, newvars[oldvar.name], out, string(""), { });
747  }
748  // If we have grouped variables, create a map going from channel number to channel index
749  if (old_grouped_vars.size() > 0) {
750  std::map<int, int> chanNumToIndex;
751  std::vector<int> chanNums;
752  out.vars.open("nchans").read<int>(chanNums);
753  for (size_t i = 0; i < chanNums.size(); ++i) {
754  chanNumToIndex[chanNums[i]] = i;
755  }
756 
757  for (const auto& v : old_grouped_vars) {
758  cout << " " << v.first.name << "\n";
759  copyData(v.second, newvars[v.first.name], out, v.first.name, chanNumToIndex);
760  }
761  }
762 
763 
764  return true;
765 }
766 
767 int main(int argc, char** argv) {
768  using namespace std;
769  try {
770  // Program options
771  auto doHelp = []() {
772  cerr << "Usage: ioda-upgrade.x [-n] input_file output_file\n"
773  << " -n: do not group similar variables into one 2D varible\n";
774  exit(1);
775  };
776  // quick and dirty argument parsing meant to hold us over until the YAML
777  // configuration is implemented
778  string sInputFile;
779  string sOutputFile;
780  bool groupVariables = true;
781  if (argc == 3) {
782  sInputFile = argv[1];
783  sOutputFile = argv[2];
784  } else if ((argc == 4) && (strcmp(argv[1],"-n") == 0)) {
785  sInputFile = argv[2];
786  sOutputFile = argv[3];
787  groupVariables = false;
788  } else {
789  doHelp();
790  }
791 
792  // Parse YAML file here
793  // Unimplemented
794 
795  cout << "Input: " << sInputFile << "\nOutput: " << sOutputFile << endl;
797  params.groupSimilarVariables = groupVariables;
798  upgradeFile(sInputFile, sOutputFile, params);
799  cout << " Success!\n";
800 
801  } catch (const std::exception& e) {
802  cerr << "Exception: " << e.what() << endl << endl;
803  return 1;
804  } catch (...) {
805  cerr << "An uncaught exception occurred." << endl << endl;
806  return 1;
807  }
808  return 0;
809 }
Convenience classes for constructing ObsSpaces and setting up new Dimension Scales.
IODA's error system.
Definitions for setting up backends with file and memory I/O.
Interfaces for ioda::Group and related classes.
HDF5 engine.
Interfaces for ioda::ObsGroup and related classes.
This class represents attributes, which may be attached to both Variables and Groups.
Definition: Attribute.h:493
The ioda exception class.
Definition: Exception.h:54
Groups are a new implementation of ObsSpaces.
Definition: Group.h:159
This class exists inside of ioda::Group or ioda::Variable and provides the interface to manipulating ...
An ObsGroup is a specialization of a ioda::Group. It provides convenience functions and guarantees th...
Definition: ObsGroup.h:32
static ObsGroup generate(Group &emptyGroup, const NewDimensionScales_t &fundamentalDims, std::shared_ptr< const detail::DataLayoutPolicy > layout=nullptr)
Create an empty ObsGroup and populate it with the fundamental dimensions.
Definition: ObsGroup.cpp:72
A Selection represents the bounds of the data, in ioda or in userspace, that you are reading or writi...
Definition: Selection.h:48
Represents the "type" (i.e. integer, string, float) of a piece of data.
Definition: Type.h:123
std::type_index getType() const
Definition: Type.h:136
Variables store data!
Definition: Variable.h:680
virtual Attribute_Implementation write(gsl::span< char > data, const Type &type)
The fundamental write function. Backends overload this function to implement all write operations.
Definition: Attribute.cpp:65
Has_Attributes atts
Use this to access the metadata for the group / ObsSpace.
Definition: Group.h:120
Has_Variables vars
Use this to access variables.
Definition: Group.h:123
virtual std::map< ObjectType, std::vector< std::string > > listObjects(ObjectType filter=ObjectType::Ignored, bool recurse=false) const
List all objects (groups + variables) within this group.
Definition: Group.cpp:53
std::vector< std::string > list() const
List all one-level child groups in this group.
Definition: Group.cpp:43
virtual Attribute create(const std::string &attrname, const Type &in_memory_dataType, const std::vector< Dimensions_t > &dimensions={1})
Create an Attribute without setting its data.
virtual std::vector< std::pair< std::string, Attribute > > openAll() const
Open all attributes in an object.
virtual bool exists(const std::string &attname) const
Does an Attribute with the specified name exist?
virtual Variable open(const std::string &name) const
Open a Variable by name.
virtual size_t getSize() const
Get the size of a single element of a type, in bytes.
Definition: Type.cpp:67
virtual Type getType() const
Get type.
Definition: Variable.cpp:49
bool isA() const
Convenience function to check a Variable's storage type.
Definition: Variable.h:99
virtual Dimensions getDimensions() const
Definition: Variable.cpp:160
virtual Variable read(gsl::span< char > data, const Type &in_memory_dataType, const Selection &mem_selection=Selection::all, const Selection &file_selection=Selection::all) const
Read the Variable - as char array. Ordering is row-major.
Definition: Variable.cpp:330
virtual Variable write(gsl::span< char > data, const Type &in_memory_dataType, const Selection &mem_selection=Selection::all, const Selection &file_selection=Selection::all)
The fundamental write function. Backends overload this function to implement all write operations.
Definition: Variable.cpp:317
IODA_DL Group createFile(const std::string &filename, BackendCreateModes mode, HDF5_Version_Range compat=defaultVersionRange())
Create a ioda::Group backed by an HDF5 file.
Definition: HH.cpp:120
IODA_DL Group openMemoryFile(const std::string &filename, BackendOpenModes mode=BackendOpenModes::Read_Only, bool flush_on_close=false, size_t increment_len_bytes=1000000, HDF5_Version_Range compat=defaultVersionRange())
Map an HDF5 file in memory and open a ioda::Group.
Definition: HH.cpp:171
std::pair< HDF5_Version, HDF5_Version > HDF5_Version_Range
Definition: HH.h:42
@ V18
Use the latest HDF5 v1.8 format for storing objects.
@ Truncate_If_Exists
If the file already exists, overwrite it.
std::vector< Dimensions_t > VecDimensions_t
Definition: Selection.h:50
Selection & select(const SingleSelection &s)
Append a new selection.
Definition: Selection.h:103
IODA_DL std::string convertV1PathToV2Path(const std::string &path)
Split path into substrings separated by @ characters, then concatenate them in reverse order,...
Definition: StringFuncs.cpp:85
std::vector< std::shared_ptr< NewDimensionScale_Base > > NewDimensionScales_t
std::map< std::string, std::vector< std::string > > VarDimMap
typedef for holding dim names attached to variables
Definition: IodaUtils.h:36
std::shared_ptr< NewDimensionScale_Object< DataType > > NewDimensionScale(const std::string &name, Dimensions_t size, Dimensions_t maxSize=Unspecified, Dimensions_t chunkingSize=Unspecified)
Wrapper function used when listing new dimension scales to construct.
void collectVarDimInfo(const ObsGroup &obsGroup, VarNameObjectList &varObjectList, VarNameObjectList &dimVarObjectList, VarDimMap &dimsAttachedToVars, Dimensions_t &maxVarSize0)
collect variable and dimension information from a ioda ObsGroup
Definition: IodaUtils.cc:125
#define ioda_Here()
Describes the dimensions of an Attribute or Variable.
Definition: Dimensions.h:22
std::vector< Dimensions_t > dimsCur
The dimensions of the data.
Definition: Dimensions.h:23
Dimensions_t numElements
Definition: Dimensions.h:26
Dimensions_t dimensionality
The dimensionality (rank) of the data.
Definition: Dimensions.h:25
std::vector< Dimensions_t > dimsMax
This must always equal dimsCur for Attribute.
Definition: Dimensions.h:24
A named pair of (variable_name, ioda::Variable).
Definition: Variable.h:752
std::string name
Definition: Variable.h:753
Represents a hyperslab or a series of points in a selection, coupled with a SelectionOperator "action...
Definition: Selection.h:67
Used to specify Variable creation-time properties.
Definition: Has_Variables.h:57
std::vector< Dimensions_t > chunks
Manually specify the chunks. Never directly use. Use getChunks(...) instead.
Definition: Has_Variables.h:87
VariableCreationParameters & setFillValue(DataType fill)
Definition: Has_Variables.h:69
bool chunk
Do we chunk this variable? Required for extendible / compressible Variables.
Definition: Has_Variables.h:84
int main(int argc, char **argv)
Definition: upgrade.cpp:767
void identifySimilarVariables(const Vec_Named_Variable &inVarList, VarDimMap &similarVariables, Vec_Named_Variable &dissimilarVariables)
Determine which variables may be grouped.
Definition: upgrade.cpp:214
void collectVarDimInfo(const ioda::Group &obsGroup, Vec_Named_Variable &varList, Vec_Named_Variable &dimVarList, VarDimMap &dimsAttachedToVars, ioda::Dimensions_t &maxVarSize0)
Traverse file structure and determine dimension scales and regular variables. Also determine which di...
Definition: upgrade.cpp:95
bool isPossiblyScale(const std::string &name)
Convenience function to hint if a variable might be a scale.
Definition: upgrade.cpp:51
std::list< std::string > preferentialSortVariableNames(const std::vector< std::string > &allVars)
Sort variable names in a preferential way so that likely scales end up first.
Definition: upgrade.cpp:61
void copyData(const Vec_Named_Variable &old, ioda::Variable &newvar, const ioda::ObsGroup &base, const std::string &newVarName, const std::map< int, int > &chanNumToIndex)
Copy data from oldvar into newvar. Offsets are supported for variable combination.
Definition: upgrade.cpp:337
void copyAttributes(const ioda::Has_Attributes &src, ioda::Has_Attributes &dest)
Copy attributes from src to dest. Ignore duplicates and dimension scales.
Definition: upgrade.cpp:438
std::map< ioda::Named_Variable, Vec_Named_Variable > VarDimMap
Definition: upgrade.cpp:86
std::size_t getChanSuffixPos(const std::string &name)
Determine which variables may be grouped.
Definition: upgrade.cpp:187
std::vector< ioda::Named_Variable > Vec_Named_Variable
Definition: upgrade.cpp:85
bool upgradeFile(const std::string &inputName, const std::string &outputName, const UpgradeParameters &params)
Definition: upgrade.cpp:492