10 #include <string_view>
13 #include "hebench/modules/general/include/hebench_utilities.h"
17 namespace DataLoader {
49 static bool isComment(std::string_view s_view);
50 static bool isEmpty(std::string_view s_view);
56 hebench::Utilities::ltrim(s_view);
57 return !s_view.empty() && s_view.front() ==
'#';
62 hebench::Utilities::trim(s_view);
63 return s_view.empty();
70 std::uint64_t m_pad_to_length;
72 std::filesystem::path m_working_path;
95 static void readDataBlock(std::vector<std::vector<std::vector<T>>> &out_data,
97 const std::vector<std::string_view> &control_line_tokens,
98 const std::filesystem::path &working_path,
99 std::uint64_t &line_num);
112 const std::filesystem::path &working_path);
138 void readLocalDataBlock(std::vector<std::vector<T>> &out_data,
140 std::uint64_t line_offset,
141 std::uint64_t num_samples,
142 std::uint64_t &line_num);
169 void readCSVDataBlock(std::vector<std::vector<T>> &out_data,
171 std::uint64_t line_offset,
172 std::uint64_t num_samples,
173 std::uint64_t &line_num);
187 std::vector<T> parseDataSample(std::string_view s_csv_line_sample);
200 static std::vector<T> parseTokenAsString(std::string_view sv_token);
203 template <
typename T>
206 const std::vector<std::string_view> &control_line_tokens,
207 const std::filesystem::path &working_path,
208 std::uint64_t &line_num)
210 assert(control_line_tokens.size() >= EDLHelper::ControlLine::ControlTokenSize);
212 assert(std::filesystem::exists(working_path) && std::filesystem::is_directory(working_path));
216 std::uint64_t component_index;
217 std::uint64_t num_samples;
218 std::uint64_t pad_to_length = 0;
219 T pad_value =
static_cast<T
>(0);
220 std::stringstream ss;
226 std::string s_component_index(control_line_tokens[EDLHelper::ControlLine::Index_ControlComponentIndex]);
227 ss = std::stringstream(s_component_index);
228 if (!(ss >> component_index))
229 throw std::runtime_error(
"Invalid value \"" + s_component_index +
"\".");
231 catch (std::exception &ex)
233 ss = std::stringstream();
234 ss <<
": " << ex.what();
241 if (!err_msg.empty())
243 ss = std::stringstream();
244 ss <<
"Error reading <component_index> from control line" << err_msg;
245 throw std::runtime_error(ss.str());
251 std::string s_num_samples(control_line_tokens[EDLHelper::ControlLine::Index_ControlNumSamples]);
252 ss = std::stringstream(s_num_samples);
253 if (!(ss >> num_samples))
254 throw std::runtime_error(
"Invalid value \"" + s_num_samples +
"\".");
256 catch (std::exception &ex)
258 ss = std::stringstream();
259 ss <<
": " << ex.what();
266 if (!err_msg.empty())
268 ss = std::stringstream();
269 ss <<
"Error reading <num_samples> from control line" << err_msg;
270 throw std::runtime_error(ss.str());
276 if (control_line_tokens.size() > EDLHelper::ControlLine::Index_ControlPadToLength)
278 std::string s_pad_to_len(control_line_tokens[EDLHelper::ControlLine::Index_ControlPadToLength]);
279 ss = std::stringstream(s_pad_to_len);
280 if (!(ss >> pad_to_length))
281 throw std::runtime_error(
"Invalid value \"" + s_pad_to_len +
"\".");
284 catch (std::exception &ex)
286 ss = std::stringstream();
287 ss <<
": " << ex.what();
294 if (!err_msg.empty())
296 ss = std::stringstream();
297 ss <<
"Error reading <pad_to_length> from control line" << err_msg;
298 throw std::runtime_error(ss.str());
304 if (control_line_tokens.size() > EDLHelper::ControlLine::Index_ControlPadValue)
306 std::string s_pad_value(control_line_tokens[EDLHelper::ControlLine::Index_ControlPadValue]);
307 ss = std::stringstream(s_pad_value);
308 if (!(ss >> pad_value))
309 throw std::runtime_error(
"Invalid value \"" + s_pad_value +
"\".");
312 catch (std::exception &ex)
314 ss = std::stringstream();
315 ss <<
": " << ex.what();
322 if (!err_msg.empty())
324 ss = std::stringstream();
325 ss <<
"Error reading <pad_value> from control line" << err_msg;
326 throw std::runtime_error(ss.str());
329 while (out_data.size() <= component_index)
330 out_data.emplace_back(std::vector<std::vector<T>>());
337 helper.readLocalDataBlock(out_data[component_index],
343 helper.readCSVDataBlock(out_data[component_index],
349 ss = std::stringstream();
350 ss <<
"Invalid control line kind: \"" << control_line_tokens[EDLHelper::ControlLine::Index_ControlKind] <<
"\".";
351 throw std::runtime_error(ss.str());
355 template <
typename T>
358 const std::filesystem::path &working_path) :
359 m_pad_to_length(pad_to_length),
360 m_pad_value(pad_value)
362 m_working_path = std::filesystem::canonical(working_path);
// Reads samples line by line from the stream `is` and appends each parsed
// sample to `out_data`.
// NOTE(review): several lines of this definition are elided in this listing
// (the `is` parameter declaration, the declaration of `s_line`, and the
// bodies that update `line_num` / `samples_read`), so the comments below
// describe only what the visible statements establish.
//
// out_data:    destination; one std::vector<T> is appended per parsed sample.
// line_offset: the first loop keeps reading while `line_num` is below this value.
// num_samples: the second loop keeps reading while fewer samples than this were read.
// line_num:    in/out line counter, taken by reference.
365 template <
typename T>
366 void EDLTypedHelper<T>::readLocalDataBlock(std::vector<std::vector<T>> &out_data,
368 std::uint64_t line_offset,
369 std::uint64_t num_samples,
370 std::uint64_t &line_num)
374 std::uint64_t samples_read = 0;
// Phase 1: consume lines while the stream is good and `line_num` has not
// reached `line_offset` (presumably advancing `line_num` per line read --
// the increment is not visible here).
377 while (is && line_num < line_offset)
380 if (std::getline(is, s_line))
// Phase 2: read lines until `num_samples` samples were collected or the
// stream stops being good.
385 while (is && samples_read < num_samples)
388 if (std::getline(is, s_line))
// Parse the line into a vector of T and move it into the output.
393 auto sample = parseDataSample(s_line);
394 out_data.emplace_back(std::move(sample));
// Reads a block whose lines are CSV samples of the form
// `<filename>[, <from_line>[, <num_samples>]]` (format taken from the error
// message below). For each such line, the referenced file is opened and its
// samples are loaded into `out_data` through a new helper's readLocalDataBlock().
// NOTE(review): several lines of this definition are elided in this listing
// (the `is` parameter, declarations of `s_line` / `err_msg`, the `try` openers
// and the counter updates); comments describe only the visible statements.
//
// Throws std::runtime_error when a CSV sample has too few tokens, when the
// referenced file cannot be opened, or when <from_line> / <num_samples>
// cannot be parsed as unsigned integers.
402 template <
typename T>
403 void EDLTypedHelper<T>::readCSVDataBlock(std::vector<std::vector<T>> &out_data,
405 std::uint64_t line_offset,
406 std::uint64_t num_samples,
407 std::uint64_t &line_num)
412 std::uint64_t samples_read = 0;
// Phase 1: consume lines while the stream is good and `line_num` is below
// `line_offset`.
415 while (is && line_num < line_offset)
418 if (std::getline(is, s_line))
// Phase 2: process CSV sample lines until `num_samples` were handled or the
// stream stops being good.
423 while (is && samples_read < num_samples)
426 if (std::getline(is, s_line))
// Split the line into tokens and reject it if it has fewer than the
// required number of tokens.
431 auto csv_sample_tokens = hebench::Utilities::CSVTokenizer::tokenizeLine(s_line);
432 if (csv_sample_tokens.size() < EDLHelper::CSVLine::CSVTokenSize)
434 std::stringstream ss;
435 ss <<
"Invalid CSV sample detected. A CSV sample must follow the following format: <filename>[, <from_line>[, <num_samples>]]";
436 throw std::runtime_error(ss.str());
// Defaults used when the optional tokens are absent: start at line 0 and
// read up to the maximum representable number of samples.
439 std::uint64_t csv_line_offset = 0;
440 std::uint64_t csv_num_samples = std::numeric_limits<std::uint64_t>::max();
// Relative file names are resolved against this helper's working path.
441 std::filesystem::path csv_path = csv_sample_tokens[EDLHelper::CSVLine::Index_CSVFilename];
442 if (csv_path.is_relative())
443 csv_path = this->m_working_path / csv_path;
// NOTE(review): std::filesystem::canonical throws std::filesystem::filesystem_error
// when the path does not exist; this call appears to precede the try/catch
// below, so that error would not get the "Error occurred attempting to parse
// CSV sample" prefix -- confirm against the full source.
444 csv_path = std::filesystem::canonical(csv_path);
446 std::ifstream fnum_csv;
// Open the referenced file; a failure is captured into `err_msg` by the
// catch handler and re-thrown as std::runtime_error further down.
450 fnum_csv.open(csv_path, std::ifstream::in);
451 if (!fnum_csv.is_open())
// NOTE(review): std::string(csv_path) only compiles where
// std::filesystem::path::string_type is std::string; csv_path.string()
// would be the portable spelling.
452 throw std::ios_base::failure(
"Unable to open file for reading: " + std::string(csv_path));
454 catch (std::exception &ex)
456 err_msg = std::string(
": ") + ex.what();
462 if (!err_msg.empty())
464 std::stringstream ss;
465 ss <<
"Error occurred attempting to parse CSV sample" << err_msg;
466 throw std::runtime_error(ss.str());
// Optional <from_line> token: parsed as an unsigned 64-bit integer.
469 if (csv_sample_tokens.size() > EDLHelper::CSVLine::Index_CSVFromLine)
475 const std::string &s_csv_line_offset = csv_sample_tokens[EDLHelper::CSVLine::Index_CSVFromLine];
476 std::stringstream ss = std::stringstream(s_csv_line_offset);
477 if (!(ss >> csv_line_offset))
478 throw std::runtime_error(
"Invalid value \"" + s_csv_line_offset +
"\".");
480 catch (std::exception &ex)
482 err_msg = std::string(
": ") + ex.what();
488 if (!err_msg.empty())
490 std::stringstream ss;
491 ss <<
"Error reading <from_line> from CSV sample" << err_msg;
492 throw std::runtime_error(ss.str());
// Optional <num_samples> token: parsed as an unsigned 64-bit integer.
496 if (csv_sample_tokens.size() > EDLHelper::CSVLine::Index_CSVNumSamples)
501 const std::string &s_csv_num_samples = csv_sample_tokens[EDLHelper::CSVLine::Index_CSVNumSamples];
502 std::stringstream ss = std::stringstream(s_csv_num_samples);
503 if (!(ss >> csv_num_samples))
504 throw std::runtime_error(
"Invalid value \"" + s_csv_num_samples +
"\".");
506 catch (std::exception &ex)
508 err_msg = std::string(
": ") + ex.what();
514 if (!err_msg.empty())
516 std::stringstream ss;
517 ss <<
"Error reading <num_samples> from CSV sample" << err_msg;
518 throw std::runtime_error(ss.str());
// Delegate the actual reading to a fresh helper that shares this helper's
// padding settings and uses the CSV file's directory as its working path.
// The referenced file gets its own line counter starting at 0.
524 EDLTypedHelper<T> helper(m_pad_to_length, m_pad_value, csv_path.parent_path());
525 std::uint64_t csv_line_num = 0;
526 helper.readLocalDataBlock(out_data,
528 csv_line_offset, csv_num_samples,
// Parses one CSV line into a flat vector of T.
// Tokens are obtained with CSVTokenizer::tokenizeLine(). Empty tokens produce
// padding built from m_pad_value; non-empty tokens are converted to T and
// appended. After the last token the result is padded again when
// m_pad_to_length is non-zero.
// NOTE(review): brace / else / try lines of this definition are elided in
// this listing, so the exact branching between some statements is inferred
// and marked as such below.
538 template <
typename T>
539 std::vector<T> EDLTypedHelper<T>::parseDataSample(std::string_view s_csv_line_sample)
541 std::vector<T> retval;
542 auto csv_tokens = hebench::Utilities::CSVTokenizer::tokenizeLine(s_csv_line_sample);
// Number of elements appended since the last padding step.
544 std::uint64_t size_since_last_pad = 0;
545 for (std::size_t i = 0; i < csv_tokens.size(); ++i)
// Empty token: emit padding.
547 if (csv_tokens[i].empty())
// With a pad length configured, append m_pad_value until the elements
// appended since the last pad reach m_pad_to_length.
550 if (m_pad_to_length > 0)
552 while (size_since_last_pad < m_pad_to_length)
554 retval.emplace_back(m_pad_value);
555 ++size_since_last_pad;
// Presumably the else-branch (no pad length configured): a single pad value
// stands in for the empty token -- confirm against the full source.
559 retval.emplace_back(m_pad_value);
// Padding done for this run; restart the count.
561 size_since_last_pad = 0;
// Non-empty token: try stream extraction into a single T first.
566 std::vector<T> value(1);
567 std::stringstream ss = std::stringstream(csv_tokens[i]);
570 if (!(ss >> value.front()))
571 throw std::runtime_error(
"Invalid value.");
// Alternative conversion: one element of T per character of the token.
// NOTE(review): the construct selecting between extraction and this call
// (fallback handler or compile-time branch) is not visible here.
576 value = EDLTypedHelper<T>::parseTokenAsString(csv_tokens[i]);
// Append the converted element(s) and account for them in the pad counter.
579 retval.insert(retval.end(), value.begin(), value.end());
580 size_since_last_pad += value.size();
// Trailing padding after the last token, applied only when a pad length is set.
585 if (m_pad_to_length > 0)
586 while (size_since_last_pad < m_pad_to_length)
588 retval.emplace_back(m_pad_value);
589 ++size_since_last_pad;
595 template <
typename T>
596 std::vector<T> EDLTypedHelper<T>::parseTokenAsString(std::string_view sv_token)
598 std::vector<T> retval;
600 hebench::Utilities::trim(sv_token);
602 if (!sv_token.empty())
605 if (sv_token.front() ==
'\"')
606 sv_token.remove_prefix(1);
607 if (sv_token.back() ==
'\"')
608 sv_token.remove_suffix(1);
610 retval.reserve(sv_token.length());
611 for (
auto it = sv_token.begin(); it != sv_token.end(); ++it)
612 retval.emplace_back(
static_cast<T
>(*it));
622 template <
typename T,
typename E>
624 std::uint64_t max_loaded_size)
626 std::uint64_t line_num = 0;
631 std::filesystem::path filepath = std::filesystem::canonical(filename);
633 fnum.open(filepath, std::ifstream::in);
635 throw std::ios_base::failure(
"Unable to open file for reading: " + std::string(filepath));
646 if (std::getline(fnum, s_line))
652 auto csv_tokens = hebench::Utilities::CSVTokenizer::tokenizeLineInPlace(s_line);
653 if (csv_tokens.size() < EDLHelper::ControlLine::ControlTokenSize)
655 std::stringstream ss;
656 ss <<
"ExternalDatasetLoader: in file " << filename <<
":" << line_num <<
": Invalid number of items in control line. Expected "
657 << EDLHelper::ControlLine::ControlTokenSize <<
", but " << csv_tokens.size() <<
" found.";
658 throw std::runtime_error(ss.str());
660 std::vector<std::vector<std::vector<T>>> *data =
nullptr;
667 std::stringstream ss;
668 ss <<
"ExternalDatasetLoader: in file " << filename <<
":" << line_num <<
": Invalid control line identifier: \""
669 << csv_tokens[EDLHelper::ControlLine::Index_ControlIdentifier] <<
"\".";
670 throw std::runtime_error(ss.str());
675 filepath.parent_path(),
681 catch (std::exception &ex)
683 err_msg = std::string(
": ") + ex.what();
687 err_msg = std::string(
" line.");
689 if (!err_msg.empty())
691 std::stringstream ss;
692 ss <<
"ExternalDatasetLoader: in file " << filepath <<
":" << line_num
693 <<
": Error occurred while parsing" << err_msg;
694 throw std::runtime_error(ss.str());
700 std::uint64_t loaded_size = 0;
701 if (max_loaded_size > 0)
703 for (std::size_t param_i = 0; param_i < retval.
inputs.size(); ++param_i)
704 for (std::size_t sample_i = 0; sample_i < retval.
inputs[param_i].size(); ++sample_i)
705 loaded_size +=
sizeof(T) * retval.
inputs[param_i][sample_i].size();
706 for (std::size_t result_i = 0; result_i < retval.
outputs.size(); ++result_i)
707 for (std::size_t sample_i = 0; sample_i < retval.
outputs[result_i].size(); ++sample_i)
708 loaded_size +=
sizeof(T) * retval.
outputs[result_i][sample_i].size();
711 if (loaded_size > max_loaded_size)
713 std::stringstream ss;
714 ss <<
"ExternalDatasetLoader: in file " << filepath <<
": Loaded data exceeds maximum available size. "
715 <<
"Maximum size expected was " << max_loaded_size <<
" bytes, but " << loaded_size <<
" bytes read.";
716 throw std::runtime_error(ss.str());
constexpr static const char * ControlLineKindLocal
static bool isCommentOrEmpty(std::string_view s_view)
constexpr static const char * ControlLineKindCSV
static bool isComment(std::string_view s_view)
@ Index_ControlComponentIndex
@ Index_ControlNumSamples
@ Index_ControlIdentifier
@ Index_ControlPadToLength
constexpr static const char * ControlLineOutput
constexpr static const char * ControlLineInput
static bool isEmpty(std::string_view s_view)
static void readDataBlock(std::vector< std::vector< std::vector< T >>> &out_data, std::istream &is, const std::vector< std::string_view > &control_line_tokens, const std::filesystem::path &working_path, std::uint64_t &line_num)
Reads the next data block as described by the control line from the specified stream.
static ExternalDataset< T > loadFromCSV(const std::string &filename, std::uint64_t max_loaded_size=0)
Loads a dataset from an external csv file that follows the defined structure.
std::vector< std::vector< std::vector< T > > > inputs
Contains the samples for each input parameter as loaded from external source.
std::vector< std::vector< std::vector< T > > > outputs
Contains the samples for each result component as loaded from external source.