IODA Bundle
String.h
Go to the documentation of this file.
1 /*
2  * (C) Copyright 1996-2012 ECMWF.
3  *
4  * This software is licensed under the terms of the Apache Licence Version 2.0
5  * which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
6  * In applying this licence, ECMWF does not waive the privileges and immunities
7  * granted to it by virtue of its status as an intergovernmental organisation nor
8  * does it submit to any jurisdiction.
9  */
10 
11 #ifndef odc_core_codec_String_H
12 #define odc_core_codec_String_H
13 
14 #include "odc/core/Codec.h"
15 #include "odc/codec/Integer.h"
16 
17 namespace odc {
18 namespace codec {
19 
20 //----------------------------------------------------------------------------------------------------------------------
21 
22 /// @note CodecChars is _only_ used as an intermediate codec. It encodes data during the
23 /// normal Writer phase that is then _reencoded_ using Int16String,...
24 /// We should NEVER see 'chars' in the output data.
25 
/// Codec that stores fixed-width character data in units of doubles.
///
/// Each value occupies decodedSizeDoubles_ * sizeof(double) bytes. While gathering
/// statistics the codec also builds a table of the distinct strings it has seen
/// (strings_) together with a reverse lookup from string to table index
/// (stringLookup_); the derived integer-backed codecs in this file use that table.
///
/// @tparam ByteOrder byte-order policy forwarded to core::DataStreamCodec
26 template<typename ByteOrder>
27 class CodecChars : public core::DataStreamCodec<ByteOrder> {
28 
29 public: // definitions
30 
// Name under which this codec is identified; also the default constructor argument.
31  constexpr static const char* codec_name() { return "chars"; }
32 
33 public: // methods
34 
35  CodecChars(api::ColumnType type, const std::string& name=codec_name());
36  ~CodecChars() override {}
37 
// NOTE(review): listing lines 38-39 are absent from this extract.
// load() expects, and save() writes, an int32 string count of zero: no string table is stored by this class.
40  void load(core::DataStream<ByteOrder>& ds) override;
41  void save(core::DataStream<ByteOrder>& ds) override;
42 
43 protected: // methods
44 
// Copies the string table, the lookup map and the decoded width onto the clone.
45  std::unique_ptr<core::Codec> clone() override;
46 
47 private: // methods
48 
// encode/decode/skip all move decodedSizeDoubles_ * sizeof(double) raw bytes.
49  unsigned char* encode(unsigned char* p, const double& d) override;
50  void decode(double* out) override;
51  void skip() override;
// Registers the string starting at &v in strings_/stringLookup_ if it has not been seen before.
52  void gatherStats(const double& v) override;
53 
54  size_t numStrings() const override { return strings_.size(); }
// Copies strings_ and stringLookup_ from rhs, which must be a CodecChars<ByteOrder>.
55  void copyStrings(core::Codec& rhs) override;
56 
// Getter/setter for the decoded width of one value, measured in doubles.
57  size_t dataSizeDoubles() const override { return decodedSizeDoubles_; }
58  void dataSizeDoubles(size_t count) override { decodedSizeDoubles_ = count; }
59 
60  void print(std::ostream &s) const override;
61 
62 protected: // members
63 
// string -> index into strings_
64  std::map<std::string, int64_t> stringLookup_;
// distinct strings, in order of first appearance
65  std::vector<std::string> strings_;
// NOTE(review): listing line 66 is absent from this extract; the cross-reference index
// of this document places `size_t decodedSizeDoubles_` at String.h:66.
67 };
68 
69 
70 //----------------------------------------------------------------------------------------------------------------------
71 
72 
73 template<typename ByteOrder, typename InternalCodec>
74 class IntStringCodecBase : public CodecChars<ByteOrder> {
75 
76  static_assert(std::is_same<typename InternalCodec::value_type, int64_t>::value, "Safety check");
77  using InternalInt = typename InternalCodec::value_type;
78 
79 public: // methods
80 
82  CodecChars<ByteOrder>(type, name),
83  intCodec_(api::INTEGER) {
84 
85  this->min_ = odc::MDI::integerMDI();
86  this->max_ = this->min_;
87  this->missingValue_ = this->min_;
88  intCodec_.min(0);
89  }
90  ~IntStringCodecBase() override {}
91 
92 private: // methods
93 
94  std::unique_ptr<core::Codec> clone() override {
95  std::unique_ptr<core::Codec> cdc = CodecChars<ByteOrder>::clone();
96  auto& c = static_cast<IntStringCodecBase<ByteOrder, InternalCodec>&>(*cdc);
97  c.intCodec_.min(intCodec_.min());
98  c.intCodec_.max(intCodec_.max());
99  c.min(this->min());
100  c.max(this->max());
101  return cdc;
102  }
103 
// NOTE(review): listing lines 105-107 (including the function signature) are absent from
// this extract. The cross-reference index of this document lists
// `void setDataStream(core::DataStream<ByteOrder>& ds) override` at String.h:106.
104  /// Ensure that data streams are passed through to the internal coder
// Attach the same data stream to the embedded integer codec.
108  intCodec_.setDataStream(ds);
109  }
110 
/// Detach the data stream from the embedded integer codec.
// NOTE(review): listing line 112 is absent from this extract, so the first
// statement of this function is not visible here.
111  void clearDataStream() override {
113  intCodec_.clearDataStream();
114  }
115 
116  unsigned char* encode(unsigned char* p, const double& d) override {
117 
118  /// n.b. Yes this is ugly. This is a hack into the existing API - and it assumes
119  /// that the double& provided actually is the first element of a longer string.
120 
121  size_t len = ::strnlen(reinterpret_cast<const char*>(&d), this->decodedSizeDoubles_*sizeof(double));
122  std::string s(reinterpret_cast<const char*>(&d), len);
123 
124  auto it = this->stringLookup_.find(s);
125  ASSERT(it != this->stringLookup_.end());
126 
127  // n.b. Reinterpret cast is yucky, but is for backward compatibility with old interface.
128  // CodecInt*<, int64_t> undoes that internally.
129  // WARNING: This is very type unsafe
130  InternalInt internal = it->second;
131  return static_cast<core::Codec&>(intCodec_).encode(p, reinterpret_cast<const double&>(internal));
132  }
133 
134  void decode(double* out) override {
135 
136  // n.b. Reinterpret cast is yucky, but is for backward compatibility with old interface.
137  // CodecInt*<, int64_t> undoes that internally.
138  // WARNING: This is very type unsafe
139 
140  InternalInt i;
141  static_cast<core::Codec&>(intCodec_).decode(reinterpret_cast<double*>(&i));
142 
143  ASSERT(i < long(this->strings_.size()));
144  const std::string& s(this->strings_[i]);
145 
146  ::memset(out, 0, this->decodedSizeDoubles_*sizeof(double));
147  ::memcpy(reinterpret_cast<char*>(out), &s[0], std::min(s.length(), this->decodedSizeDoubles_*sizeof(double)));
148  }
149 
150  void skip() override {
151  static_cast<core::Codec&>(intCodec_).skip();
152  }
153 
157 
// NOTE(review): listing lines 153-156 (the signature and anything preceding this point
// in the body) are absent from this extract. The cross-reference index of this document
// lists `void load(core::DataStream<ByteOrder>& ds) override` at String.h:155.
158  // Load the table of strings
159  // This is based on the old-style hash-table storage, so it isn't a trivial list of strings
160  int32_t numStrings;
161  ds.read(numStrings);
162  ASSERT(numStrings >= 0);
163 
164  this->strings_.resize(numStrings);
165 
166  // How many doubles-worth of memory is needed to decode the largest string?
167  this->decodedSizeDoubles_ = 1;
168 
// Each table entry is read as (string, int32 cnt, int32 index); cnt is read and discarded.
169  for (size_t i = 0; i < size_t(numStrings); i++) {
170  std::string s;
171  ds.read(s);
172 
173  int32_t cnt;
174  ds.read(cnt);
175 
176  int32_t index;
177  ds.read(index);
178 
// NOTE(review): only the upper bound is checked; a negative index read from the
// stream would pass this assertion and write outside strings_.
179  ASSERT(index < numStrings);
180  this->strings_[index] = s;
181 
// Round the string length up to a whole number of doubles and keep the maximum.
182  if (s.length() != 0) {
183  this->decodedSizeDoubles_ = std::max(this->decodedSizeDoubles_, ((s.length()-1)/sizeof(double))+1);
184  }
185  }
186 
187  // Ensure that the string lookup is EMPTY. We don't use it after reading
188  ASSERT(this->stringLookup_.size() == 0);
189  }
190 
193 
195 
// NOTE(review): listing lines 191-192 and 194 (including the function signature) are absent
// from this extract. The cross-reference index of this document lists
// `void save(core::DataStream<ByteOrder>& ds) override` at String.h:192.
// Write the string table: an int32 entry count, then (string, int32 0, int32 index) per entry.
196  ds.write(static_cast<int32_t>(this->strings_.size()));
197 
198  for (size_t i = 0; i < this->strings_.size(); i++) {
199  ds.write(this->strings_[i]);
200  ds.write(static_cast<int32_t>(0)); // "cnt" field is not used.
201  ds.write(static_cast<int32_t>(i));
202  }
203  }
204 
205 private: // members
206 
207  InternalCodec intCodec_;
208 };
209 
210 //----------------------------------------------------------------------------------------------------------------------
211 
212 
/// String codec whose string-table indices are encoded by CodecInt8<ByteOrder, int64_t>.
// NOTE(review): listing line 216 is absent from this extract; the cross-reference index
// of this document lists `CodecInt8String(api::ColumnType type)` at String.h:216.
213 template<typename ByteOrder>
214 struct CodecInt8String : public IntStringCodecBase<ByteOrder, CodecInt8<ByteOrder, int64_t>> {
// Name under which this codec is identified.
215  constexpr static const char* codec_name() { return "int8_string"; }
217  ~CodecInt8String() override {}
218 };
219 
220 
/// String codec whose string-table indices are encoded by CodecInt16<ByteOrder, int64_t>.
// NOTE(review): listing line 224 is absent from this extract; the cross-reference index
// of this document lists `CodecInt16String(api::ColumnType type)` at String.h:224.
221 template<typename ByteOrder>
222 struct CodecInt16String : public IntStringCodecBase<ByteOrder, CodecInt16<ByteOrder, int64_t>> {
// Name under which this codec is identified.
223  constexpr static const char* codec_name() { return "int16_string"; }
225  ~CodecInt16String() override {}
226 };
227 
228 
229 //----------------------------------------------------------------------------------------------------------------------
230 
231 // Implementation
232 
233 template<typename ByteOrder>
235  core::DataStreamCodec<ByteOrder>(name, type),
236  decodedSizeDoubles_(1) {}
237 
238 template<typename ByteOrder>
239 unsigned char* CodecChars<ByteOrder>::encode(unsigned char* p, const double& s) {
240 
241  /// n.b. Yes this is ugly. This is a hack into the existing API - and it assumes
242  /// that the double& provided actually is the first element of a longer string.
243 
244  memcpy(p, &s, decodedSizeDoubles_*sizeof(double));
245  return p + (decodedSizeDoubles_*sizeof(double));
246 }
247 
248 template<typename ByteOrder>
249 void CodecChars<ByteOrder>::decode(double* out) {
250 
251  this->ds().readBytes(out, sizeof(double)*decodedSizeDoubles_);
252 }
253 
254 template <typename ByteOrder>
256  this->ds().advance(sizeof(double) * decodedSizeDoubles_);
257 }
258 
259 template<typename ByteOrder>
260 void CodecChars<ByteOrder>::gatherStats(const double& v) {
261 
262  size_t len = ::strnlen(reinterpret_cast<const char*>(&v), decodedSizeDoubles_*sizeof(double));
263  std::string s(reinterpret_cast<const char*>(&v), len);
264 
265  char buf[255];
266  memcpy(buf, &v, sizeof(double));
267  buf[sizeof(double)] = 0;
268 
269  if (stringLookup_.find(s) == stringLookup_.end()) {
270  size_t index = strings_.size();
271  strings_.push_back(s);
272  stringLookup_[s] = index;
273  }
274 
275  // In case the column is const, the const value will be copied and used by the optimized codec.
276  this->min_ = v;
277 
278 }
279 
280 
// NOTE(review): listing lines 282-283 (including the function signature) are absent from
// this extract; the class declaration shows `void load(core::DataStream<ByteOrder>& ds) override`.
281 template<typename ByteOrder>
// Read the int32 string count and require it to be zero.
284  int32_t nStrings;
285  ds.read(nStrings);
286  ASSERT(nStrings == 0); // No string table stored
287 }
288 
289 
// NOTE(review): listing lines 291 and 293 (including the function signature) are absent from
// this extract; the class declaration shows `void save(core::DataStream<ByteOrder>& ds) override`.
290 template<typename ByteOrder>
292  // String table only stored in derived int-storing types
// Write an int32 string count of zero.
294  ds.write(static_cast<int32_t>(0));
295 }
296 
297 template<typename ByteOrder>
298 std::unique_ptr<core::Codec> CodecChars<ByteOrder>::clone() {
299 
300  std::unique_ptr<core::Codec> cdc = core::Codec::clone();
301  auto& c = static_cast<CodecChars&>(*cdc);
302  c.stringLookup_ = stringLookup_;
303  c.strings_ = strings_;
304  c.decodedSizeDoubles_ = decodedSizeDoubles_;
305  ASSERT(c.min() == this->min_);
306  ASSERT(c.max() == this->max_);
307  return cdc;
308 }
309 
310 template<typename ByteOrder>
312  CodecChars<ByteOrder>* c = dynamic_cast<CodecChars<ByteOrder>*>(&rhs);
313  ASSERT(c);
314  strings_ = c->strings_;
315  stringLookup_ = c->stringLookup_;
316 }
317 
318 template<typename ByteOrder>
319 void CodecChars<ByteOrder>::print(std::ostream& s) const {
320  s << this->name_
321  << ", width=" << (decodedSizeDoubles_ * sizeof(double))
322  << ", #words=" << strings_.size();
323 }
324 
325 //----------------------------------------------------------------------------------------------------------------------
326 
327 } // namespace codec
328 } // namespace odc
329 
330 #endif
331 
static void count(void *counter, const double *data, size_t n)
Definition: UnitTests.cc:531
static double integerMDI()
Definition: MDI.h:22
void print(std::ostream &s) const override
Definition: String.h:319
std::vector< std::string > strings_
Definition: String.h:65
void dataSizeDoubles(size_t count) override
Definition: String.h:58
std::unique_ptr< core::Codec > clone() override
Definition: String.h:298
unsigned char * encode(unsigned char *p, const double &d) override
Definition: String.h:239
void skip() override
Definition: String.h:255
void decode(double *out) override
Definition: String.h:249
size_t decodedSizeDoubles_
Definition: String.h:66
constexpr static const char * codec_name()
Definition: String.h:31
std::map< std::string, int64_t > stringLookup_
Definition: String.h:64
void gatherStats(const double &v) override
Definition: String.h:260
CodecChars(api::ColumnType type, const std::string &name=codec_name())
Definition: String.h:234
size_t numStrings() const override
Definition: String.h:54
void copyStrings(core::Codec &rhs) override
Definition: String.h:311
size_t dataSizeDoubles() const override
Definition: String.h:57
~CodecChars() override
Definition: String.h:36
typename InternalCodec::value_type InternalInt
Definition: String.h:77
void skip() override
Definition: String.h:150
void clearDataStream() override
Definition: String.h:111
unsigned char * encode(unsigned char *p, const double &d) override
Definition: String.h:116
~IntStringCodecBase() override
Definition: String.h:90
std::unique_ptr< core::Codec > clone() override
Definition: String.h:94
IntStringCodecBase(api::ColumnType type, const std::string &name)
Definition: String.h:81
void setDataStream(core::DataStream< ByteOrder > &ds) override
Definition: String.h:106
void load(core::DataStream< ByteOrder > &ds) override
Definition: String.h:155
void save(core::DataStream< ByteOrder > &ds) override
Definition: String.h:192
void decode(double *out) override
Definition: String.h:134
const std::string & name() const
Definition: Codec.h:40
double min() const
Definition: Codec.h:67
double missingValue_
Definition: Codec.h:99
double max_
Definition: Codec.h:101
double min_
Definition: Codec.h:100
virtual std::unique_ptr< Codec > clone()
Definition: Codec.cc:32
double max() const
Definition: Codec.h:70
void clearDataStream() override
Definition: Codec.h:131
void setDataStream(GeneralDataStream &ds)
Definition: Codec.cc:44
void save(GeneralDataStream &ds)
Definition: Codec.cc:76
void load(GeneralDataStream &ds)
Definition: Codec.cc:60
DataStream< ByteOrder > & ds()
Definition: Codec.h:157
void read(T &elem)
Definition: DataStream.h:205
void write(const T &elem)
Definition: DataStream.h:276
Definition: ColumnInfo.h:23
CodecInt16String(api::ColumnType type)
Definition: String.h:224
~CodecInt16String() override
Definition: String.h:225
constexpr static const char * codec_name()
Definition: String.h:223
~CodecInt8String() override
Definition: String.h:217
constexpr static const char * codec_name()
Definition: String.h:215
CodecInt8String(api::ColumnType type)
Definition: String.h:216