UFO
src/ufo/utils/RecursiveSplitter.h
Go to the documentation of this file.
1 /*
2  * (C) Copyright 2019 Met Office UK
3  *
4  * This software is licensed under the terms of the Apache Licence Version 2.0
5  * which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
6  */
7 
8 #ifndef UFO_UTILS_RECURSIVESPLITTER_H_
9 #define UFO_UTILS_RECURSIVESPLITTER_H_
10 
11 #include <algorithm>
12 #include <cassert>
13 #include <cstddef> // for size_t
14 #include <string>
15 #include <vector>
16 
17 #include "ufo/utils/ArrowProxy.h"
19 
20 namespace ufo {
21 
22 /// \brief Partitions an array into groups of elements equivalent according to certain criteria.
23 ///
24 /// Example:
25 /// \code
26 /// RecursiveSplitter splitter(8);
27 /// // Tell the splitter that according to a certain criterion, some elements belong to category #1
28 /// // and others to category #2
29 /// splitter.groupBy(std::vector<int>({1, 2, 1, 2, 1, 2, 1, 2}));
30 /// std::cout << "After first split:" << std::endl;
31 /// for (const auto &group : splitter.groups()) {
32 /// std::cout << " Elements with these indices are equivalent: ";
33 /// for (const auto &index : group)
34 /// std::cout << index << ", ";
35 /// std::cout << std::endl;
36 /// }
37 /// // Tell the splitter that according to a different criterion, there is a different split between
38 /// // categories
39 /// splitter.groupBy(std::vector<int>({1, 1, 1, 1, 2, 2, 2, 2}));
40 /// std::cout << "After second split:" << std::endl;
41 /// for (const auto &group : splitter.groups()) {
42 /// std::cout << " Elements with these indices are equivalent: ";
43 /// for (const auto &index : group)
44 /// std::cout << index << ", ";
45 /// std::cout << std::endl;
46 /// }
47 /// \endcode
48 ///
49 /// Output:
50 /// \code{.unparsed}
51 /// After first split:
52 /// Elements with these indices are equivalent: 0, 2, 4, 6,
53 /// Elements with these indices are equivalent: 1, 3, 5, 7,
54 /// After second split:
55 /// Elements with these indices are equivalent: 0, 2,
56 /// Elements with these indices are equivalent: 4, 6,
57 /// Elements with these indices are equivalent: 1, 3,
58 /// Elements with these indices are equivalent: 5, 7,
59 /// \endcode
60 ///
61 /// \internal In the implementation, indices into the partitioned array are referred to as _ids_.
62 /// The term _index_ denotes an index into the vector \c orderedIds_.
64 {
65  public:
66  /// \brief A range of indices of all array elements belonging to a particular equivalence class.
67  class Group
68  {
69  public:
70  Group(const std::vector<size_t>::const_iterator &begin,
71  const std::vector<size_t>::const_iterator &end)
72  : begin_(begin), end_(end)
73  {}
74 
75  std::vector<size_t>::const_iterator begin() const
76  {
77  return begin_;
78  }
79 
80  std::vector<size_t>::const_iterator end() const
81  {
82  return end_;
83  }
84 
85  private:
86  std::vector<size_t>::const_iterator begin_;
87  std::vector<size_t>::const_iterator end_;
88  };
89 
90  /// \brief An iterator over equivalence classes consisting of more than one element.
92  {
93  public:
94  typedef ptrdiff_t difference_type;
95  typedef Group value_type;
97  typedef std::forward_iterator_tag iterator_category;
98 
99  struct BeginTag {};
100  struct EndTag {};
101 
103  : splitter_(splitter) {
104  if (splitter_.encodedGroups_.empty()) {
105  firstIndexInGroup_ = 0;
106  } else {
108  }
109  }
110 
112  : splitter_(splitter) {
114  }
115 
116  Group operator*() const {
117  assert(!isSentinel());
118  size_t lastIndexInGroup = splitter_.encodedGroups_[firstIndexInGroup_ + 1];
120  splitter_.orderedIds_.begin() + lastIndexInGroup + 1);
121  }
122 
124  return ArrowProxy<Group>(operator*());
125  }
126 
128  const size_t lastIndexInGroup = splitter_.encodedGroups_[firstIndexInGroup_ + 1];
129  if (lastIndexInGroup + 1 < splitter_.encodedGroups_.size()) {
130  firstIndexInGroup_ = splitter_.encodedGroups_[lastIndexInGroup + 1];
131  } else {
133  }
134  return *this;
135  }
136 
137  bool operator==(const MultiElementGroupIterator& other) const {
138  // In principle we should also check splitters for equality. We don't do it for efficiency.
139  return firstIndexInGroup_ == other.firstIndexInGroup_;
140  }
141 
142  bool operator!=(const MultiElementGroupIterator& other) const {
143  return !operator==(other);
144  }
145 
146  protected:
147  bool isSentinel() const {
149  }
150 
151  protected:
154  };
155 
156  /// \brief An iterator over all equivalence classes.
158  public:
163 
166 
167  explicit GroupIterator(const RecursiveSplitter &splitter, BeginTag)
169  {}
170 
171  explicit GroupIterator(const RecursiveSplitter &splitter, EndTag)
173  {}
174 
175  Group operator*() const {
176  assert(!isSentinel());
179  } else {
180  return Group(splitter_.orderedIds_.begin() + currentIndex_,
181  splitter_.orderedIds_.begin() + currentIndex_ + 1);
182  }
183  }
184 
186  return ArrowProxy<Group>(operator*());
187  }
188 
191  const size_t lastIndexInGroup = splitter_.encodedGroups_[firstIndexInGroup_ + 1];
192  if (lastIndexInGroup + 1 < splitter_.encodedGroups_.size()) {
193  firstIndexInGroup_ = splitter_.encodedGroups_[lastIndexInGroup + 1];
194  } else {
196  }
197  currentIndex_ = lastIndexInGroup + 1;
198  } else {
199  ++currentIndex_;
200  }
201  return *this;
202  }
203 
204  bool operator==(const GroupIterator& other) const {
205  // In principle we should also check splitters for equality. We don't do it for efficiency.
206  return firstIndexInGroup_ == other.firstIndexInGroup_ &&
207  currentIndex_ == other.currentIndex_;
208  }
209 
210  bool operator!=(const GroupIterator& other) const {
211  return !operator==(other);
212  }
213 
214  private:
215  bool isSentinel() const {
216  return currentIndex_ == splitter_.encodedGroups_.size();
217  }
218 
219  private:
221  };
222 
223  /// \brief A range of equivalence classes.
225  {
226  public:
227  explicit GroupRange(const RecursiveSplitter& splitter)
228  : splitter_(splitter)
229  {}
230 
233  }
234 
235  GroupIterator end() const {
237  }
238  private:
240  };
241 
242  /// \brief A range of equivalence classes consisting of more than one element.
244  {
245  public:
246  explicit MultiElementGroupRange(const RecursiveSplitter& splitter)
247  : splitter_(splitter)
248  {}
249 
252  }
253 
256  }
257  private:
259  };
260 
261  /// \brief Initialize partitioning of an array of \p numIds elements.
262  ///
263  /// Initially, all elements are assumed to belong to the same equivalence class.
264  ///
265  /// By default, all sorting operations performed by this class are done using a stable sorting
266  /// algorithm, but if \p opsCompatibilityMode is true, the same algorithm as in the Met Office
267  /// OPS system (heap sort) is used instead. This has an effect on the order of elements in each
268  /// equivalence class.
269  explicit RecursiveSplitter(size_t numIds, bool opsCompatibilityMode = false);
270 
271  /// \brief Split existing equivalence classes according to a new criterion.
272  ///
273  /// \param categories
274  /// A vector assigning each element of the partitioned array to a unique category.
275  ///
276  /// Each existing equivalence class E consisting of N_E elements with indices E_i (0 <= i < N_E)
277  /// is split into one or more classes according to the equivalence relation
278  ///
279  /// E_i'th element is equivalent to E_j'th element if and only if
280  /// categories[E_i] == categories[E_j].
281  void groupBy(const std::vector<size_t> &categories) {
282  return groupByImpl(categories);
283  }
284 
285  /// \overload
286  void groupBy(const std::vector<int> &categories) {
287  return groupByImpl(categories);
288  }
289 
290  /// \overload
291  void groupBy(const std::vector<std::string> &categories) {
292  return groupByImpl(categories);
293  }
294 
295  /// \brief Return the range of equivalence classes.
296  GroupRange groups() const { return GroupRange(*this); }
297 
298  /// \brief Return the range of equivalence classes consisting of more than one element.
300 
301  /// \brief Sort the elements in each equivalence class in ascending order.
302  ///
303  /// The elements are ranked by keys produced by the unary function \p key taking the index
304  /// of an element of the partitioned array.
305  template <typename UnaryOperation>
306  void sortGroupsBy(const UnaryOperation &key);
307 
308  /// \brief Randomly shuffle the elements of each equivalence class.
309  void shuffleGroups();
310 
311  /// \brief Initialise the random number generator used by shuffleGroups() with a seed.
312  ///
313  /// \param seed
314  /// Seed with which to initialise the generator.
315  /// \param force
316  /// If false, the seed will only be reset if the program has not made any calls to
317  /// util::shuffle() yet.
318  void setSeed(unsigned int seed, bool force);
319 
320  private:
322 
323  private:
324  template <typename T>
325  void groupByImpl(const std::vector<T> &categories);
326 
328  /// Indices of elements of the partitioned array ordered by equivalence class.
329  std::vector<size_t> orderedIds_;
330  /// Encoded locations of multi-element equivalence classes in orderedIds_.
331  std::vector<size_t> encodedGroups_;
332 };
333 
334 template <typename UnaryOperation>
335 void RecursiveSplitter::sortGroupsBy(const UnaryOperation &key) {
336  for (Group group : multiElementGroups()) {
337  std::vector<size_t>::iterator nonConstGroupBegin =
338  orderedIds_.begin() + (group.begin() - orderedIds_.cbegin());
339  std::vector<size_t>::iterator nonConstGroupEnd =
340  orderedIds_.begin() + (group.end() - orderedIds_.cbegin());
342  metOfficeSort(nonConstGroupBegin, nonConstGroupEnd, key);
343  else
344  std::stable_sort(nonConstGroupBegin, nonConstGroupEnd,
345  [&key] (size_t a, size_t b) { return key(a) < key(b); });
346  }
347 }
348 
349 } // namespace ufo
350 
351 #endif // UFO_UTILS_RECURSIVESPLITTER_H_
Utility class used in overloads of operator-> in forward iterators.
Definition: ArrowProxy.h:17
A range of indices of all array elements belonging to a particular equivalence class.
std::vector< size_t >::const_iterator end() const
std::vector< size_t >::const_iterator begin_
std::vector< size_t >::const_iterator begin() const
std::vector< size_t >::const_iterator end_
Group(const std::vector< size_t >::const_iterator &begin, const std::vector< size_t >::const_iterator &end)
An iterator over all equivalence classes.
GroupIterator(const RecursiveSplitter &splitter, EndTag)
bool operator==(const GroupIterator &other) const
MultiElementGroupIterator::difference_type difference_type
bool operator!=(const GroupIterator &other) const
GroupIterator(const RecursiveSplitter &splitter, BeginTag)
MultiElementGroupIterator::value_type value_type
MultiElementGroupIterator::BeginTag BeginTag
MultiElementGroupIterator::iterator_category iterator_category
MultiElementGroupIterator::reference reference
GroupRange(const RecursiveSplitter &splitter)
An iterator over equivalence classes consisting of more than one element.
MultiElementGroupIterator(const RecursiveSplitter &splitter, EndTag)
bool operator!=(const MultiElementGroupIterator &other) const
MultiElementGroupIterator(const RecursiveSplitter &splitter, BeginTag)
bool operator==(const MultiElementGroupIterator &other) const
A range of equivalence classes consisting of more than one element.
Partitions an array into groups of elements equivalent according to certain criteria.
GroupRange groups() const
Return the range of equivalence classes.
void groupBy(const std::vector< size_t > &categories)
Split existing equivalence classes according to a new criterion.
void groupByImpl(const std::vector< T > &categories)
void groupBy(const std::vector< std::string > &categories)
This is an overloaded member function, provided for convenience. It differs from the above function o...
MultiElementGroupRange multiElementGroups() const
Return the range of equivalence classes consisting of more than one element.
void shuffleGroups()
Randomly shuffle the elements of each equivalence class.
void sortGroupsBy(const UnaryOperation &key)
Sort the elements in each equivalence class in ascending order.
std::vector< size_t > orderedIds_
Indices of elements of the partitioned array ordered by equivalence class.
RecursiveSplitter(size_t numIds, bool opsCompatibilityMode=false)
Initialize partitioning of an array of numIds elements.
std::vector< size_t > encodedGroups_
Encoded locations of multi-element equivalence classes in orderedIds_.
void groupBy(const std::vector< int > &categories)
This is an overloaded member function, provided for convenience. It differs from the above function o...
void setSeed(unsigned int seed, bool force)
Initialise the random number generator used by shuffleGroups() with a seed.
std::set< size_t > Group
Definition: RunCRTM.h:27
void metOfficeSort(RandomIt first, RandomIt last, const UnaryOperation &key)
Sort the range [first, last) in the order of ascending keys using the same algorithm as the Ops_Integ...