UFO
DataExtractorCSVBackend.cc
Go to the documentation of this file.
1 /*
2  * (C) Crown copyright 2021, Met Office
3  *
4  * This software is licensed under the terms of the Apache Licence Version 2.0
5  * which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
6  */
7 
8 #include <string>
9 #include <unordered_map>
10 #include <utility> // for move
11 #include <vector>
12 
13 #include <boost/multi_array.hpp>
14 #include <boost/variant.hpp>
15 
16 #include "eckit/exception/Exceptions.h"
17 #include "eckit/parser/CSVParser.h"
18 #include "eckit/utils/StringTools.h"
19 
20 #include "ioda/Misc/StringFuncs.h" // for convertV1PathToV2Path
21 #include "ioda/ObsSpace.h" // for ObsDtype
22 
23 #include "oops/util/Logger.h"
24 
27 
28 namespace ufo
29 {
30 
31 namespace {
32 
/// Number of header rows in CSV files (one with column names, one with data types).
const size_t numHeaderRows = 2;

/// Representation of missing values in CSV files (same as in NetCDF's CDL).
///
/// Both the characters and the pointer are const, so the placeholder cannot be re-pointed at
/// a different string at run time.
const char *const missingValuePlaceholder = "_";
38 
39 /// Visitor whose operator() takes a vector and appends to it the value passed to the constructor
40 /// (treating "_" as a placeholder for "missing value").
41 class AppendValueVisitor : public boost::static_visitor<void> {
42  public:
43  explicit AppendValueVisitor(const eckit::Value &value) :
44  value_(value)
45  {}
46 
47  void operator()(std::vector<int> &values) const {
48  int value;
49  if (value_.as<std::string>() == missingValuePlaceholder)
50  value = util::missingValue(value);
51  else
52  // NOLINTNEXTLINE(runtime/int): It's not our fault that eckit uses the 'long long' type...
53  value = static_cast<int>(value_.as<long long>());
54  values.push_back(value);
55  }
56 
57  void operator()(std::vector<float> &values) const {
58  float value;
59  if (value_.as<std::string>() == missingValuePlaceholder)
60  value = util::missingValue(value);
61  else
62  value = static_cast<float>(value_.as<double>());
63  values.push_back(value);
64  }
65 
66  void operator()(std::vector<std::string> &values) const {
67  std::string value = value_.as<std::string>();
68  if (value == missingValuePlaceholder)
69  value = util::missingValue(value);
70  values.push_back(value);
71  }
72 
73  private:
74  const eckit::Value &value_;
75 };
76 
77 template <typename Source, typename Destination>
78 void convertVectorToColumnArray(const std::vector<Source> &source,
79  boost::multi_array<Destination, 3> &destination) {
80  const Source missingSource = util::missingValue(Source());
81  const Destination missingDestination = util::missingValue(Destination());
82  destination.resize(boost::extents[source.size()][1][1]);
83  for (size_t i = 0; i < source.size(); ++i)
84  if (source[i] != missingSource)
85  destination[i][0][0] = static_cast<Destination>(source[i]);
86  else
87  destination[i][0][0] = missingDestination;
88 }
89 
90 /// Visitor that converts an std::vector to a boost::multi_array with one column.
91 template <typename ExtractedValue>
92 class ConvertToBoostMultiArrayVisitor : public boost::static_visitor<void> {
93  public:
94  explicit ConvertToBoostMultiArrayVisitor(boost::multi_array<ExtractedValue, 3> &output) :
95  output_(output)
96  {}
97 
98  template <typename T,
99  typename std::enable_if<std::is_convertible<T, ExtractedValue>::value, bool>::type
100  = true>
101  void operator()(const std::vector<T> &values) {
102  output_.resize(boost::extents[values.size()][1][1]);
103  for (size_t i = 0; i < values.size(); ++i)
104  output_[i][0][0] = values[i];
105  }
106 
107  template <typename T,
108  typename std::enable_if<!std::is_convertible<T, ExtractedValue>::value, bool>::type
109  = true>
110  void operator()(const std::vector<T> &) {
111  // Should never be called
112  throw eckit::NotImplemented(Here());
113  }
114 
115  private:
116  boost::multi_array<ExtractedValue, 3> &output_;
117 };
118 
119 /// \brief Find the index of the column whose name ends with `@` followed by `payloadGroup`
120 /// or begins with `payloadGroup` followed by `/`.
121 ///
122 /// Throw an exception if there's no such column or there's more than one.
123 size_t findPayloadColumn(const std::vector<std::string> &columnNames,
124  const std::string &payloadGroup) {
125  const std::string prefix = payloadGroup + '/';
126  const std::string suffix = '@' + payloadGroup;
127  auto isInPayloadGroup = [&prefix, &suffix](const std::string &name) {
128  return eckit::StringTools::beginsWith(name, prefix) ||
129  eckit::StringTools::endsWith(name, suffix);
130  };
131  auto payloadColumnIt = std::find_if(columnNames.begin(), columnNames.end(), isInPayloadGroup);
132  if (payloadColumnIt == columnNames.end())
133  throw eckit::UserError("No payload column found: no column name begins with '" + prefix +
134  "' or ends with '" + suffix + "'",
135  Here());
136  if (std::any_of(payloadColumnIt + 1, columnNames.end(), isInPayloadGroup))
137  throw eckit::UserError("Multiple payload candidates found: "
138  "more than one column name begins with '" + prefix +
139  "' or ends with '" + suffix + "'", Here());
140  return payloadColumnIt - columnNames.begin();
141 }
142 
/// \brief Return an empty vector of `T` with storage reserved for `numValues` elements.
template <typename T>
std::vector<T> createColumn(size_t numValues) {
  std::vector<T> emptyColumn{};
  emptyColumn.reserve(numValues);
  return emptyColumn;
}
149 
150 /// \brief Throw an exception if contents of columns of type `type` can't be converted to values
151 /// of type `ExtractedValue`.
152 template <typename ExtractedValue>
153 void checkPayloadColumnType(const std::string &type);
154 
155 template <>
156 void checkPayloadColumnType<float>(const std::string &type) {
157  if (type != "float" && type != "int")
158  throw eckit::UserError("The payload column must contain numeric data", Here());
159 }
160 
161 template <>
162 void checkPayloadColumnType<int>(const std::string &type) {
163  if (type != "float" && type != "int")
164  throw eckit::UserError("The payload column must contain numeric data", Here());
165 }
166 
167 template <>
168 void checkPayloadColumnType<std::string>(const std::string &type) {
169  if (type != "string" && type != "datetime")
170  throw eckit::UserError("The payload column must contain strings or datetimes", Here());
171 }
172 
173 } // namespace
174 
175 template <typename ExtractedValue>
177  : filepath_(filepath)
178 {}
179 
180 template <typename ExtractedValue>
182  const std::string &interpolatedArrayGroup) const {
184 
185  const eckit::Value contents = eckit::CSVParser::decodeFile(filepath_, false /* hasHeader? */);
186  const size_t numRows = contents.size();
187  // Ensure we have at least three lines:
188  // * column names
189  // * data types
190  // * one row of values.
191  if (numRows <= numHeaderRows)
192  throw eckit::UserError("No data could be loaded from the file '" + filepath_ + "'", Here());
193  const size_t numValues = numRows - numHeaderRows;
194 
195  // Read column names from the first line
196  const eckit::Value nameHeader = contents[0];
197  const size_t numColumns = nameHeader.size();
198  std::vector<std::string> columnNames(numColumns);
199  columnNames.reserve(numColumns);
200  for (size_t column = 0; column < numColumns; ++column)
201  columnNames[column] = nameHeader[column].as<std::string>();
202 
203  const size_t payloadColumnIndex = findPayloadColumn(columnNames, interpolatedArrayGroup);
204 
205  // Now that we won't need to include column names in any further error messages, convert
206  // them to the ioda-v2 convention (Group/var rather than var@Group)
207  for (std::string &columnName : columnNames)
208  columnName = ioda::convertV1PathToV2Path(columnName);
209 
210  // Read data types from the second line
211  const eckit::Value typeHeader = contents[1];
212  if (typeHeader.size() != numColumns)
213  throw eckit::UserError("The number of columns in line 2 differs from that in line 1", Here());
214 
215  // Allocate vectors for values to be loaded from subsequent lines
216  std::vector<DataExtractorInputBase::Coordinate> columns(numColumns);
217  for (size_t column = 0; column < numColumns; ++column) {
218  const std::string type = typeHeader[column];
219  if (column == payloadColumnIndex)
220  checkPayloadColumnType<ExtractedValue>(type);
221  if (type == "string" || type == "datetime") {
222  columns[column] = createColumn<std::string>(numValues);
223  } else if (type == "int" || type == "integer") {
224  columns[column] = createColumn<int>(numValues);
225  } else if (type == "float") {
226  columns[column] = createColumn<float>(numValues);
227  } else {
228  throw eckit::UserError("Unsupported data type '" + type + "'", Here());
229  }
230  }
231 
232  // Load values from the rest of the CSV file
233  for (size_t row = numHeaderRows; row < numRows; ++row) {
234  const eckit::Value rowContents = contents[row];
235  if (rowContents.size() == 1 && rowContents[0] == "")
236  continue; // empty line
237  if (rowContents.size() != numColumns)
238  throw eckit::UserError("The number of columns in line " + std::to_string(1 + row) +
239  " differs from that in line 1", Here());
240  for (size_t column = 0; column < numColumns; ++column)
241  boost::apply_visitor(AppendValueVisitor(rowContents[column]), columns[column]);
242  }
243 
244  // Store the loaded data in the result object
245  const int firstDim = 0;
246  result.dim2CoordMapping.resize(1);
247  for (size_t column = 0; column < numColumns; ++column) {
248  if (column == payloadColumnIndex) {
249  ConvertToBoostMultiArrayVisitor<ExtractedValue> visitor(result.payloadArray);
250  boost::apply_visitor(visitor, columns[column]);
251  } else {
252  result.coordsVals[columnNames[column]] = std::move(columns[column]);
253  result.coord2DimMapping[columnNames[column]] = firstDim;
254  result.dim2CoordMapping[firstDim].push_back(columnNames[column]);
255  }
256  }
257 
258  if (result.payloadArray.shape()[0] == 0)
259  throw eckit::UserError("No data could be loaded from the file '" + filepath_ + "'", Here());
260 
261  return result;
262 }
263 
264 // Explicit instantiations
265 template class DataExtractorCSVBackend<float>;
266 template class DataExtractorCSVBackend<int>;
268 
269 } // namespace ufo
Produces input for a DataExtractor by loading data from a CSV file.
DataExtractorInput< ExtractedValue > loadData(const std::string &payloadGroup) const override
Load data for subsequent extraction.
DataExtractorCSVBackend(const std::string &filepath)
Create a new instance.
Visitor that converts an std::vector to a boost::multi_array with one column.
size_t findPayloadColumn(const std::vector< std::string > &columnNames, const std::string &payloadGroup)
Find the index of the column whose name ends with @ followed by payloadGroup or begins with payloadGroup followed by /.
void checkPayloadColumnType(const std::string &type)
Throw an exception if contents of columns of type `type` can't be converted to values of type ExtractedValue.
void convertVectorToColumnArray(const std::vector< Source > &source, boost::multi_array< Destination, 3 > &destination)
const char * missingValuePlaceholder
Representation of missing values in CSV files (same as in NetCDF's CDL).
const size_t numHeaderRows
Number of header rows in CSV files.
Definition: RunCRTM.h:27
Coordinates coordsVals
Coordinates indexing the payload array.
std::unordered_map< std::string, int > coord2DimMapping
Maps coordinate names to dimensions (0 or 1) of the payload array.
std::vector< std::vector< std::string > > dim2CoordMapping
Maps dimensions of the payload array (0 or 1) to coordinate names.
Input data for the DataExtractor.
boost::multi_array< ExtractedValue, 3 > payloadArray
Array from which values will be extracted.