IODA Bundle
SplitTool.cc
Go to the documentation of this file.
1 /*
2  * (C) Copyright 1996-2012 ECMWF.
3  *
4  * This software is licensed under the terms of the Apache Licence Version 2.0
5  * which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
6  * In applying this licence, ECMWF does not waive the privileges and immunities
7  * granted to it by virtue of its status as an intergovernmental organisation nor
8  * does it submit to any jurisdiction.
9  */
10 
11 #include "SplitTool.h"
12 
13 #include <ostream>
14 
15 #include "eckit/filesystem/PathName.h"
16 #include "eckit/io/PartFileHandle.h"
17 #include "eckit/types/Types.h"
18 
19 #include "odc/core/TablesReader.h"
20 #include "odc/DispatchingWriter.h"
21 #include "odc/Reader.h"
22 #include "odc/Select.h"
23 #include "odc/TemplateParameters.h"
24 
25 using namespace eckit;
26 using namespace std;
27 
28 namespace odc {
29 namespace tool {
30 
31 SplitTool::SplitTool (int argc, char *argv[])
32 : Tool(argc, argv),
33  sort_(false),
34  maxOpenFiles_(200)
35 {
36  registerOptionWithArgument("-maxopenfiles");
37 }
38 
40 {
41  if (parameters().size() != 3)
42  {
43  Log::error() << "Usage: ";
45  Log::error() << endl;
46  return;
47  }
48 
49  if (optionIsSet("-sort")) sort_ = true;
50 
51  maxOpenFiles_ = optionArgument("-maxopenfiles", maxOpenFiles_);
52  Log::debug() << "SplitTool: maxOpenFiles_ = " << maxOpenFiles_ << endl;
53 
54  PathName inFile (parameters(1));
55  string outFileTemplate (parameters(2));
56 
57  if (sort_)
58  presortAndSplit(inFile, outFileTemplate);
59  else
60  split(inFile, outFileTemplate, maxOpenFiles_, !optionIsSet("-no_verification"));
61 }
62 
63 /**
64  * @param maxExpandedSize maximum size of the data in chunks after decoding
65 */
66 vector<pair<Offset,Length> > SplitTool::getChunks(const PathName& inFile, size_t maxExpandedSize)
67 {
68  ostream &L(Log::debug());
69  L << "SplitTool::getChunks: " << endl;
70 
71  vector<pair<Offset,Length> > r;
72 
73  core::TablesReader reader(inFile);
74  auto it(reader.begin()), end(reader.end());
75 
76  Offset currentOffset(0);
77  Length currentLength(0);
78  size_t currentSize (0);
79 
80  for(; it != end; ++it)
81  {
82  Offset offset(it->startPosition());
83  Length length(it->nextPosition() - it->startPosition());
84  size_t numberOfRows (it->rowCount());
85  size_t numberOfColumns (it->columnCount());
86 
87  L << "SplitTool::getChunks: " << offset << " " << length << endl;
88 
89  size_t size (numberOfRows * numberOfColumns * sizeof(double));
90  if (currentSize + size > maxExpandedSize)
91  {
92  L << "SplitTool::getChunks: collect " << currentOffset << " " << currentLength << endl;
93  r.push_back(make_pair(currentOffset, currentLength));
94  currentOffset = offset;
95  currentLength = length;
96  } else {
97  currentLength += length;
98  currentSize += numberOfRows * numberOfColumns * sizeof(double);
99  }
100  }
101  if (r.size() == 0 || r.back().first != currentOffset)
102  r.push_back(make_pair(currentOffset, currentLength));
103  return r;
104 }
105 
106 std::string SplitTool::genOrderBySelect(const std::string& inFile, const std::string& outFileTemplate)
107 {
108  core::TablesReader reader(inFile);
109  auto it = reader.begin();
110  TemplateParameters templateParameters;
111  TemplateParameters::parse(outFileTemplate, templateParameters, it->columns());
112  std::stringstream ss;
113  ss << "select * order by ";
114  for (size_t i = 0; i < templateParameters.size(); ++i)
115  {
116  if (i) ss << ",";
117  ss << templateParameters[i]->name;
118  }
119  std::string sql (ss.str());
120  Log::info() << "SplitTool::genOrderBySelect: sql: '" << sql << "'" << endl;
121  return sql;
122 }
123 
124 void SplitTool::presortAndSplit(const PathName& inFile, const std::string& outFileTemplate)
125 {
126  odc::DispatchingWriter out(outFileTemplate, 1);
128 
129  string sql(genOrderBySelect(inFile, outFileTemplate));
130 
131  vector<std::pair<Offset,Length> > chunks(getChunks(inFile));
132  for(size_t i=0; i < chunks.size(); ++i)
133  {
134  PartFileHandle h(inFile, chunks[i].first, chunks[i].second);
135  h.openForRead();
136  odc::Select in(sql, h);
137  outIt->pass1(in.begin(), in.end());
138  }
139 }
140 
141 void SplitTool::split(const PathName& inFile, const std::string& outFileTemplate, size_t maxOpenFiles, bool verify)
142 {
143  odc::Reader in(inFile);
144  odc::DispatchingWriter out(outFileTemplate, maxOpenFiles);
145 
147  outIt->pass1(in.begin(), in.end());
148 
149  odc::Reader input(inFile);
150  odc::Reader::iterator begin(input.begin());
151  odc::Reader::iterator end(input.end());
152  outIt->close();
153  if (verify) (**outIt).verify(begin, end);
154 }
155 
156 } // namespace tool
157 } // namespace odc
158 
const iterator end() const
Definition: Reader.cc:81
iterator begin()
Definition: Reader.cc:74
unsigned long pass1(T b, const T e)
void close()
const iterator end()
Definition: Select.cc:77
iterator begin()
Definition: Select.cc:81
static TemplateParameters & parse(const std::string &fileNameTemplate, TemplateParameters &, const core::MetaData &=nullMD)
bool optionIsSet(const std::string &)
T optionArgument(const std::string &, T defaultValue)
void registerOptionWithArgument(const std::string &)
const std::vector< std::string > parameters()
static void split(const eckit::PathName &, const std::string &, size_t, bool verify=true)
Definition: SplitTool.cc:141
static std::vector< std::pair< eckit::Offset, eckit::Length > > getChunks(const eckit::PathName &, size_t maxExpandedSize=100 *1024 *1024)
Definition: SplitTool.cc:66
static void usage(const std::string &name, std::ostream &o)
Definition: SplitTool.h:36
static std::string genOrderBySelect(const std::string &, const std::string &)
Definition: SplitTool.cc:106
static void presortAndSplit(const eckit::PathName &, const std::string &)
Definition: SplitTool.cc:124
Definition: ColumnInfo.h:23
Definition: encode.cc:30