UFO
PoissonDiskThinning.cc
Go to the documentation of this file.
1 /*
2  * (C) Copyright 2019 Met Office UK
3  *
4  * This software is licensed under the terms of the Apache Licence Version 2.0
5  * which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
6  */
7 
9 
10 #include <algorithm>
11 #include <cmath>
12 #include <limits>
13 #include <string>
14 #include <utility>
15 #include <vector>
16 
17 #include "eckit/config/Configuration.h"
18 #include "eckit/container/KDTree.h"
19 #include "ioda/ObsDataVector.h"
20 #include "ioda/ObsSpace.h"
21 #include "oops/base/Variables.h"
22 #include "oops/util/DateTime.h"
23 #include "oops/util/Duration.h"
24 #include "oops/util/IsAnyPointInVolumeInterior.h"
25 #include "oops/util/Logger.h"
29 #include "ufo/utils/Constants.h"
31 
32 namespace ufo {
33 
34 namespace {
35 
36 /// \brief Abstract interface of a container storing point sets and able to answer spatial queries
37 /// needed by PoissonDiskThinning ("does any point lie in the interior of an axis-aligned
38 /// ellipsoid/cylinder?").
39 template <int numDims_>
40 class PointIndex {
41  public:
42  typedef float CoordType;
43  static const int numDims = numDims_;
44 
45  typedef std::array<CoordType, numDims> Point;
46  typedef std::array<CoordType, numDims> Extent;
47 
48  virtual ~PointIndex() {}
49 
50  virtual void insert(const Point &point) = 0;
51 
52  virtual bool isAnyPointInCylinderInterior(const Point &center,
53  const Extent &semiAxes,
54  int numSpatialDims) const = 0;
55 
56  virtual bool isAnyPointInEllipsoidInterior(const Point &center,
57  const Extent &semiAxes) const = 0;
58 };
59 
60 /// \brief An implementation of PointIndex storing the point set in a kd-tree.
61 template <int numDims_>
62 class KDTree : public PointIndex<numDims_> {
63  public:
65 
66  typedef typename Base::CoordType CoordType;
67  typedef typename Base::Point Point;
68  typedef typename Base::Extent Extent;
69 
70  static const int numDims = Base::numDims;
71 
72  void insert(const Point &point) override;
73 
74  bool isAnyPointInCylinderInterior(const Point &center,
75  const Extent &semiAxes,
76  int numSpatialDims) const override;
77 
78  bool isAnyPointInEllipsoidInterior(const Point &center,
79  const Extent &semiAxes) const override;
80 
81  private:
82  struct EmptyPayload {};
83 
84  struct TreeTraits {
85  typedef eckit::geometry::KPoint<numDims> Point;
87  };
88 
89  typedef eckit::KDTreeMemory<TreeTraits> KDTreeImpl;
90  typedef typename KDTreeImpl::Alloc Alloc;
91  typedef typename KDTreeImpl::Node Node;
92  typedef typename KDTreeImpl::Point KPoint;
93  typedef typename KDTreeImpl::Value Value;
94 
96 };
97 
98 template <int numDims_>
100  const Point &point) {
101  tree_.insert(Value(KPoint(point), EmptyPayload()));
102 }
103 
104 template <int numDims_>
106  const Point &center, const Extent &semiAxes) const {
107  KPoint lbound, ubound;
108  for (int d = 0; d < numDims; ++d) {
109  lbound.data()[d] = center[d] - semiAxes[d];
110  ubound.data()[d] = center[d] + semiAxes[d];
111  }
112  return util::isAnyPointInEllipsoidInterior(tree_, lbound, ubound);
113 }
114 
115 template <int numDims_>
117  const Point &center, const Extent &semiAxes, int numSpatialDims) const {
118  KPoint lbound, ubound;
119  for (int d = 0; d < numDims; ++d) {
120  lbound.data()[d] = center[d] - semiAxes[d];
121  ubound.data()[d] = center[d] + semiAxes[d];
122  }
123  return util::isAnyPointInCylinderInterior(tree_, lbound, ubound, numSpatialDims);
124 }
125 
126 } // namespace
127 
129 {
130  boost::optional<util::ScalarOrMap<int, float>> minHorizontalSpacings;
131  boost::optional<std::vector<float>> latitudes;
132  boost::optional<std::vector<float>> longitudes;
133 
134  boost::optional<util::ScalarOrMap<int, float>> minVerticalSpacings;
135  boost::optional<std::vector<float>> pressures;
136 
137  boost::optional<util::ScalarOrMap<int, util::Duration>> minTimeSpacings;
138  boost::optional<std::vector<util::DateTime>> times;
139 
140  boost::optional<std::vector<int>> priorities;
141 
142  // Total number of observations held by all MPI tasks
143  size_t totalNumObs = 0;
144 };
145 
147  const Parameters_ &parameters,
148  std::shared_ptr<ioda::ObsDataVector<int> > flags,
149  std::shared_ptr<ioda::ObsDataVector<float> > obserr)
150  : FilterBase(obsdb, parameters, flags, obserr), options_(parameters)
151 {
152  oops::Log::debug() << "PoissonDiskThinning: config = " << options_ << std::endl;
153 }
154 
155 // Required for the correct destruction of options_.
157 {}
158 
159 void PoissonDiskThinning::applyFilter(const std::vector<bool> & apply,
160  const Variables & filtervars,
161  std::vector<std::vector<bool>> & flagged) const {
162  ObsAccessor obsAccessor = createObsAccessor();
163 
164  const std::vector<size_t> validObsIds = getValidObservationIds(apply, filtervars, obsAccessor);
165 
166  int numSpatialDims, numNonspatialDims;
167  ObsData obsData = getObsData(obsAccessor, numSpatialDims, numNonspatialDims);
168 
169  std::vector<bool> isThinned(obsData.totalNumObs, false);
170 
171  if (options_.shuffle) {
172  // If the same observations will be processed on multiple ranks, they must use random number
173  // generators seeded with the same value because they need to reject the same observations. If
174  // all ranks process disjoint sets of observations, synchronisation of random number generators
175  // isn't necessary (and in fact somewhat undesirable). The sets of observations processed by
176  // different ranks will be disjoint if both of the following conditions are met:
177  // * the category_variable option is set to the variable used previously to split the
178  // observation space into records
179  // * each record is held on a single rank only (i.e. the observation space isn't using the
180  // InefficientDistribution).
181  // Unfortunately the second condition can't be checked without breaking the encapsulation
182  // of ioda::Distribution (and reintroducing the isDistributed() method).
183  //
184  // So to stay on the safe side we synchronise the random number generators whenever the shuffle
185  // option is selected.
187  }
188 
189  // Thin points from each category separately.
190  RecursiveSplitter categorySplitter =
191  obsAccessor.splitObservationsIntoIndependentGroups(validObsIds);
192  for (auto categoryGroup : categorySplitter.multiElementGroups()) {
193  std::vector<size_t> obsIdsInCategory;
194  for (size_t validObsIndex : categoryGroup) {
195  obsIdsInCategory.push_back(validObsIds[validObsIndex]);
196  }
197 
198  // Within each category, sort points by descending priority and then (if requested)
199  // randomly shuffle points of equal priority.
200  RecursiveSplitter prioritySplitter(obsIdsInCategory.size());
201  groupObservationsByPriority(obsIdsInCategory, obsAccessor, prioritySplitter);
202  if (options_.shuffle)
203  prioritySplitter.shuffleGroups();
204 
205  // Select points to retain within the category.
206  thinCategory(obsData, obsIdsInCategory, prioritySplitter, numSpatialDims, numNonspatialDims,
207  isThinned);
208  }
209 
210  obsAccessor.flagRejectedObservations(isThinned, flagged);
211 }
212 
214  if (options_.categoryVariable.value() != boost::none) {
216  obsdb_, *options_.categoryVariable.value());
217  } else {
219  }
220 }
221 
223  const ObsAccessor &obsAccessor, int &numSpatialDims, int &numNonspatialDims) const
224 {
225  ObsData obsData;
226 
227  numSpatialDims = 0;
228  numNonspatialDims = 0;
229 
231  if (obsData.minHorizontalSpacings != boost::none) {
232  validateSpacings(*obsData.minHorizontalSpacings, "min_horizontal_spacing");
233  obsData.latitudes = obsAccessor.getFloatVariableFromObsSpace("MetaData", "latitude");
234  obsData.longitudes = obsAccessor.getFloatVariableFromObsSpace("MetaData", "longitude");
235  numSpatialDims = 3;
236  }
237 
239  if (obsData.minVerticalSpacings != boost::none) {
240  validateSpacings(*obsData.minVerticalSpacings, "min_vertical_spacing");
241  obsData.pressures = obsAccessor.getFloatVariableFromObsSpace("MetaData", "air_pressure");
242  ++numNonspatialDims;
243  }
244 
245  obsData.minTimeSpacings = options_.minTimeSpacing.value();
246  if (obsData.minTimeSpacings != boost::none) {
247  validateSpacings(*obsData.minTimeSpacings, "min_time_spacing");
248  obsData.times = obsAccessor.getDateTimeVariableFromObsSpace("MetaData", "datetime");
249  ++numNonspatialDims;
250  }
251 
252  const boost::optional<Variable> priorityVariable = options_.priorityVariable;
253  if (priorityVariable != boost::none) {
254  obsData.priorities = obsAccessor.getIntVariableFromObsSpace(
255  priorityVariable.get().group(), priorityVariable.get().variable());
256  }
257 
258  obsData.totalNumObs = obsAccessor.totalNumObservations();
259 
260  return obsData;
261 }
262 
263 template <typename ValueType>
265  const util::ScalarOrMap<int, ValueType> &spacingsByPriority,
266  const std::string &parameterName) const {
267  if (spacingsByPriority.isScalar())
268  return;
269 
270  if (spacingsByPriority.begin() == spacingsByPriority.end())
271  throw eckit::BadParameter(parameterName + " must be a scalar or a non-empty map");
272 
273  // The map is ordered by increasing priority, so the spacing of every item must be
274  // no larger than that of the previous item
275  ValueType prevSpacing = spacingsByPriority.begin()->second;
276  for (const auto &priorityAndSpacing : spacingsByPriority) {
277  const ValueType &spacing = priorityAndSpacing.second;
278  if (spacing > prevSpacing)
279  throw eckit::BadParameter(parameterName +
280  ": exclusion volumes of lower-priority observations must be "
281  "at least as large as those of higher-priority ones.");
282  }
283 }
284 
286  const std::vector<bool> & apply,
287  const Variables & filtervars,
288  const ObsAccessor &obsAccessor) const {
289  std::vector<size_t> validObsIds = obsAccessor.getValidObservationIds(apply, *flags_, filtervars);
290 
291  if (!options_.shuffle) {
292  // The user wants to process observations in fixed (non-random) order. Ensure the filter
293  // produces the same results regardless of the number of MPI ranks by ordering the observations
294  // to be processed as if we were running in serial: by record ID.
295  const std::vector<size_t> recordIds = obsAccessor.getRecordIds();
296  std::stable_sort(validObsIds.begin(), validObsIds.end(),
297  [&recordIds](size_t obsIdA, size_t obsIdB)
298  { return recordIds[obsIdA] < recordIds[obsIdB]; });
299  }
300 
301  return validObsIds;
302 }
303 
304 void PoissonDiskThinning::synchroniseRandomNumberGenerators(const eckit::mpi::Comm &comm) const
305 {
306  const size_t rootRank = 0;
307 
308  size_t seed;
309  if (options_.randomSeed.value() != boost::none) {
310  seed = *options_.randomSeed.value();
311  } else {
312  if (comm.rank() == rootRank)
313  // Perhaps oops could provide a function returning this default seed.
314  seed = static_cast<std::uint32_t>(std::time(nullptr));
315  comm.broadcast(seed, rootRank);
316  }
317 
318  RecursiveSplitter splitter(1);
319  splitter.setSeed(seed, true /* force? */);
320 }
321 
323  const std::vector<size_t> &validObsIds,
324  const ObsAccessor &obsAccessor,
325  RecursiveSplitter &splitter) const {
326  boost::optional<Variable> priorityVariable = options_.priorityVariable;
327  if (priorityVariable == boost::none)
328  return;
329 
330  // TODO(wsmigaj): reuse the priority vector from obsData.
331  std::vector<int> priority = obsAccessor.getIntVariableFromObsSpace(
332  priorityVariable.get().group(), priorityVariable.get().variable());
333 
334  auto reverse = [](int i) {
335  return -i - std::numeric_limits<int>::lowest() + std::numeric_limits<int>::max();
336  };
337 
338  std::vector<int> validObsPriorities(validObsIds.size());
339  for (size_t validObsIndex = 0; validObsIndex < validObsIds.size(); ++validObsIndex)
340  // reversing because we want to start with the highest-priority items
341  validObsPriorities[validObsIndex] = reverse(priority[validObsIds[validObsIndex]]);
342  splitter.groupBy(validObsPriorities);
343 }
344 
346  const std::vector<size_t> &obsIdsInCategory,
347  const RecursiveSplitter &prioritySplitter,
348  int numSpatialDims,
349  int numNonspatialDims,
350  std::vector<bool> &isThinned) const {
351  switch (numSpatialDims + numNonspatialDims) {
352  case 0:
353  return; // nothing to do
354  case 1:
355  return thinCategory<1>(obsData, obsIdsInCategory, prioritySplitter, numSpatialDims, isThinned);
356  case 2:
357  return thinCategory<2>(obsData, obsIdsInCategory, prioritySplitter, numSpatialDims, isThinned);
358  case 3:
359  return thinCategory<3>(obsData, obsIdsInCategory, prioritySplitter, numSpatialDims, isThinned);
360  case 4:
361  return thinCategory<4>(obsData, obsIdsInCategory, prioritySplitter, numSpatialDims, isThinned);
362  case 5:
363  return thinCategory<5>(obsData, obsIdsInCategory, prioritySplitter, numSpatialDims, isThinned);
364  }
365 
366  ABORT("Unexpected number of thinning dimensions");
367 }
368 
369 template <int numDims>
371  const std::vector<size_t> &obsIdsInCategory,
372  const RecursiveSplitter &prioritySplitter,
373  int numSpatialDims,
374  std::vector<bool> &isThinned) const {
375  KDTree<numDims> pointIndex;
376 
377  for (auto priorityGroup : prioritySplitter.groups()) {
378  for (size_t obsIndex : priorityGroup) {
379  const size_t obsId = obsIdsInCategory[obsIndex];
380  std::array<float, numDims> point = getObservationPosition<numDims>(obsId, obsData);
381  std::array<float, numDims> semiAxes = getExclusionVolumeSemiAxes<numDims>(obsId, obsData);
383  pointIndex.isAnyPointInCylinderInterior(point, semiAxes, numSpatialDims)) ||
385  pointIndex.isAnyPointInEllipsoidInterior(point, semiAxes))) {
386  isThinned[obsId] = true;
387  } else {
388  pointIndex.insert(point);
389  }
390  }
391  }
392 }
393 
394 template <int numDims>
396  size_t obsId, const ObsData &obsData) const {
397  std::array<float, numDims> position;
398 
399  unsigned int dim = 0;
400 
401  if (obsData.latitudes && obsData.longitudes) {
402  const float deg2rad = static_cast<float>(M_PI / 180.0);
403  const float earthRadius = Constants::mean_earth_rad;
404 
405  const float lon = deg2rad * (*obsData.longitudes)[obsId];
406  const float lat = deg2rad * (*obsData.latitudes)[obsId];
407  const float sinLat = std::sin(lat);
408  const float cosLat = std::cos(lat);
409  const float sinLon = std::sin(lon);
410  const float cosLon = std::cos(lon);
411 
412  position[dim++] = earthRadius * cosLat * cosLon;
413  position[dim++] = earthRadius * cosLat * sinLon;
414  position[dim++] = earthRadius * sinLat;
415  }
416 
417  if (obsData.pressures) {
418  position[dim++] = (*obsData.pressures)[obsId];
419  }
420 
421  if (obsData.times) {
422  // We choose obsData.times[0] as the reference time when converting datetimes to floats.
423  // Maybe there's a better way.
424  position[dim++] = ((*obsData.times)[obsId] - (*obsData.times)[0]).toSeconds();
425  }
426 
427  return position;
428 }
429 
430 template <int numDims>
432  size_t obsId, const ObsData &obsData) const {
433 
434  std::array<float, numDims> semiAxes;
435 
436  const int priority = obsData.priorities == boost::none ? 0 : (*obsData.priorities)[obsId];
437 
438  unsigned int dim = 0;
439 
440  if (obsData.minHorizontalSpacings) {
441  const float earthDiameter = 2 * Constants::mean_earth_rad;
442  const float invEarthDiameter = 1 / earthDiameter;
443 
444  const float minGeodesicDistance = obsData.minHorizontalSpacings->at(priority);
445  const float minEuclideanDistance =
446  earthDiameter * std::sin(minGeodesicDistance * invEarthDiameter);
447 
448  semiAxes[dim++] = minEuclideanDistance;
449  semiAxes[dim++] = minEuclideanDistance;
450  semiAxes[dim++] = minEuclideanDistance;
451  }
452 
453  if (obsData.minVerticalSpacings) {
454  semiAxes[dim++] = obsData.minVerticalSpacings->at(priority);
455  }
456 
457  if (obsData.minTimeSpacings) {
458  semiAxes[dim++] = obsData.minTimeSpacings->at(priority).toSeconds();
459  }
460 
461  return semiAxes;
462 }
463 
464 void PoissonDiskThinning::print(std::ostream & os) const {
465  os << "PoissonDiskThinning: config = " << options_ << std::endl;
466 }
467 
468 // -----------------------------------------------------------------------------
469 
470 } // namespace ufo
Base class for UFO QC filters.
Definition: FilterBase.h:45
This class provides access to observations that may be held on multiple MPI ranks.
Definition: ObsAccessor.h:56
RecursiveSplitter splitObservationsIntoIndependentGroups(const std::vector< size_t > &validObsIds, bool opsCompatibilityMode=false) const
Definition: ObsAccessor.cc:197
std::vector< size_t > getValidObservationIds(const std::vector< bool > &apply, const ioda::ObsDataVector< int > &flags, const Variables &filtervars, bool validIfAnyFilterVariablePassedQC=true) const
Return the IDs of observation locations that should be treated as valid by a filter.
Definition: ObsAccessor.cc:99
std::vector< int > getIntVariableFromObsSpace(const std::string &group, const std::string &variable) const
Return the values of the specified variable at successive observation locations.
Definition: ObsAccessor.cc:139
static ObsAccessor toAllObservations(const ioda::ObsSpace &obsdb)
Create an accessor to observations from the observation space obsdb, assuming that the whole set of o...
Definition: ObsAccessor.cc:84
std::vector< size_t > getRecordIds() const
Return the vector of IDs of records successive observation locations belong to.
Definition: ObsAccessor.cc:164
static ObsAccessor toObservationsSplitIntoIndependentGroupsByVariable(const ioda::ObsSpace &obsdb, const Variable &variable)
Create an accessor to the collection of observations held in obsdb, assuming that observations with d...
Definition: ObsAccessor.cc:94
std::vector< float > getFloatVariableFromObsSpace(const std::string &group, const std::string &variable) const
Definition: ObsAccessor.cc:144
std::vector< util::DateTime > getDateTimeVariableFromObsSpace(const std::string &group, const std::string &variable) const
Definition: ObsAccessor.cc:159
void flagRejectedObservations(const std::vector< bool > &isRejected, std::vector< std::vector< bool > > &flagged) const
Update flags of observations held on the current MPI rank.
Definition: ObsAccessor.cc:243
size_t totalNumObservations() const
Definition: ObsAccessor.cc:170
ioda::ObsSpace & obsdb_
std::shared_ptr< ioda::ObsDataVector< int > > flags_
void thinCategory(const ObsData &obsData, const std::vector< size_t > &obsIdsInCategory, const RecursiveSplitter &prioritySplitter, int numSpatialDims, int numNonspatialDims, std::vector< bool > &isThinned) const
std::array< float, numDims > getObservationPosition(size_t obsId, const ObsData &obsData) const
void print(std::ostream &) const override
void applyFilter(const std::vector< bool > &, const Variables &, std::vector< std::vector< bool >> &) const override
void synchroniseRandomNumberGenerators(const eckit::mpi::Comm &comm) const
ObsAccessor createObsAccessor() const
PoissonDiskThinning(ioda::ObsSpace &obsdb, const Parameters_ &parameters, std::shared_ptr< ioda::ObsDataVector< int > > flags, std::shared_ptr< ioda::ObsDataVector< float > > obserr)
std::vector< size_t > getValidObservationIds(const std::vector< bool > &apply, const Variables &filtervars, const ObsAccessor &obsAccessor) const
void groupObservationsByPriority(const std::vector< size_t > &validObsIds, const ObsAccessor &obsAccessor, RecursiveSplitter &splitter) const
std::array< float, numDims > getExclusionVolumeSemiAxes(size_t obsId, const ObsData &obsData) const
ObsData getObsData(const ObsAccessor &obsAccessor, int &numSpatialDims, int &numNonspatialDims) const
Collect all observation data components used for thinning.
void validateSpacings(const util::ScalarOrMap< int, ValueType > &spacingsByPriority, const std::string &parameterName) const
Options controlling the operation of the PoissonDiskThinning filter.
oops::OptionalParameter< util::ScalarOrMap< Priority, util::Duration > > minTimeSpacing
oops::OptionalParameter< Variable > categoryVariable
oops::OptionalParameter< util::ScalarOrMap< Priority, float > > minVerticalSpacing
oops::OptionalParameter< Variable > priorityVariable
oops::OptionalParameter< util::ScalarOrMap< Priority, float > > minHorizontalSpacing
oops::Parameter< ExclusionVolumeShape > exclusionVolumeShape
Partitions an array into groups of elements equivalent according to certain criteria.
GroupRange groups() const
Return the range of equivalence classes.
void groupBy(const std::vector< size_t > &categories)
Split existing equivalence classes according to a new criterion.
MultiElementGroupRange multiElementGroups() const
Return the range of equivalence classes consisting of more than one element.
void shuffleGroups()
Randomly shuffle the elements of each equivalence class.
void setSeed(unsigned int seed, bool force)
Initialise the random number generator used by shuffleGroups() with a seed.
An implementation of PointIndex storing the point set in a kd-tree.
Abstract interface of a container storing point sets and able to answer spatial queries needed by Poi...
virtual bool isAnyPointInEllipsoidInterior(const Point &center, const Extent &semiAxes) const =0
virtual bool isAnyPointInCylinderInterior(const Point &center, const Extent &semiAxes, int numSpatialDims) const =0
std::array< float, 3 > Point
ObsPair reverse(const ObsPair &pair)
real(kind_real), parameter, public deg2rad
Definition: RunCRTM.h:27
static constexpr double mean_earth_rad
Definition: Constants.h:40
boost::optional< std::vector< float > > longitudes
boost::optional< util::ScalarOrMap< int, float > > minVerticalSpacings
boost::optional< std::vector< int > > priorities
boost::optional< util::ScalarOrMap< int, float > > minHorizontalSpacings
boost::optional< std::vector< float > > pressures
boost::optional< util::ScalarOrMap< int, util::Duration > > minTimeSpacings
boost::optional< std::vector< float > > latitudes
boost::optional< std::vector< util::DateTime > > times