HEBench
hebench_dataset_loader.cpp
Go to the documentation of this file.
1 // Copyright (C) 2021 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
3 
4 #include <cassert>
5 #include <filesystem>
6 #include <fstream>
7 #include <limits>
8 #include <sstream>
9 #include <string>
10 #include <string_view>
11 #include <vector>
12 
13 #include "hebench/modules/general/include/hebench_utilities.h"
14 #include "hebench_dataset_loader.h"
15 
16 namespace hebench {
17 namespace DataLoader {
18 
/// Constants and small text predicates shared by the external dataset loader:
/// token positions inside a control line and inside a CSV sample line, the
/// keywords accepted in a control line, and tests for lines that carry no data.
class EDLHelper
{
public:
    /// Token positions within a control line.
    // NOTE(review): the Index_* enumerators were absent from this listing and are
    // reconstructed from how the rest of the file uses them (mandatory tokens sit
    // below ControlTokenSize, optional padding tokens start at ControlTokenSize).
    // The relative order of the four mandatory tokens is an assumption -- confirm
    // against the upstream header/documentation before relying on it.
    enum ControlLine : std::uint64_t
    {
        Index_ControlIdentifier = 0,
        Index_ControlComponentIndex,
        Index_ControlNumSamples,
        Index_ControlKind,
        ControlTokenSize, // number of mandatory tokens in the control line
        Index_ControlPadToLength = ControlTokenSize,
        Index_ControlPadValue,
        ControlMaxTokenSize, // maximum number of tokens in the control line
    };

    /// Token positions within a CSV sample line:
    /// `<filename>[, <from_line>[, <num_samples>]]`.
    // NOTE(review): Index_* enumerators reconstructed as above; the order follows
    // the format string used in the CSV error message of this file.
    enum CSVLine : std::uint64_t
    {
        Index_CSVFilename = 0,
        CSVTokenSize, // number of mandatory tokens in the CSV line
        Index_CSVFromLine = CSVTokenSize,
        Index_CSVNumSamples,
        CSVMaxTokenSize // maximum number of tokens in the CSV line
    };

    // Accepted values for the control line identifier token.
    constexpr static const char *ControlLineInput  = "input";
    constexpr static const char *ControlLineOutput = "output";
    // Accepted values for the control line kind token.
    constexpr static const char *ControlLineKindLocal = "local";
    constexpr static const char *ControlLineKindCSV   = "csv";

public:
    /// True when the first character left after `ltrim` is '#'.
    static bool isComment(std::string_view s_view);
    /// True when nothing is left of the view after `trim`.
    static bool isEmpty(std::string_view s_view);
    /// True when the line is a comment or is empty, i.e. carries no data.
    static bool isCommentOrEmpty(std::string_view s_view) { return isComment(s_view) || isEmpty(s_view); }
};
53 
54 bool EDLHelper::isComment(std::string_view s_view)
55 {
56  hebench::Utilities::ltrim(s_view);
57  return !s_view.empty() && s_view.front() == '#';
58 }
59 
60 bool EDLHelper::isEmpty(std::string_view s_view)
61 {
62  hebench::Utilities::trim(s_view);
63  return s_view.empty();
64 }
65 
/// Element-type-specific worker behind the dataset loader. An instance carries
/// the padding settings and the directory against which relative CSV file
/// names are resolved; instances are only created internally by
/// readDataBlock() and readCSVDataBlock().
/// @tparam T Type of each value stored in a loaded sample.
template <typename T>
class EDLTypedHelper
{
private:
    std::uint64_t m_pad_to_length; // target length of a padded section; 0 disables padding to length
    T m_pad_value; // value appended when padding and used for empty tokens
    std::filesystem::path m_working_path; // canonical directory used to resolve relative CSV paths

public:
    /// Reads the next data block as described by the control line from the
    /// specified stream.
    /// @param out_data Per-component sample collections; grown as needed so
    /// that the component index named in the control line exists. Samples read
    /// are appended to that component.
    /// @param is Stream to read the block lines from.
    /// @param control_line_tokens Tokens of the already tokenized control line.
    /// @param working_path Existing directory used to resolve relative paths.
    /// @param line_num Incremented for every line consumed from `is`.
    /// @throws std::runtime_error if a control line token cannot be parsed or
    /// the kind is neither `local` nor `csv`.
    static void readDataBlock(std::vector<std::vector<std::vector<T>>> &out_data,
                              std::istream &is,
                              const std::vector<std::string_view> &control_line_tokens,
                              const std::filesystem::path &working_path,
                              std::uint64_t &line_num);

private:
    /// Stores the padding settings and the canonical form of `working_path`.
    EDLTypedHelper(std::uint64_t pad_to_length,
                   const T &pad_value,
                   const std::filesystem::path &working_path);

    /// Reads samples stored directly in the stream, one per data line.
    /// Lines are skipped until `line_num` reaches `line_offset`; then up to
    /// `num_samples` non-comment, non-empty lines are parsed and appended to
    /// `out_data`. Reading stops early when the stream is exhausted.
    void readLocalDataBlock(std::vector<std::vector<T>> &out_data,
                            std::istream &is,
                            std::uint64_t line_offset,
                            std::uint64_t num_samples,
                            std::uint64_t &line_num);
    /// Reads up to `num_samples` CSV sample lines of the form
    /// `<filename>[, <from_line>[, <num_samples>]]` and appends the samples
    /// found in each referenced file to `out_data`.
    void readCSVDataBlock(std::vector<std::vector<T>> &out_data,
                          std::istream &is,
                          std::uint64_t line_offset,
                          std::uint64_t num_samples,
                          std::uint64_t &line_num);

    /// Converts one CSV line into a sample, applying the padding settings.
    std::vector<T> parseDataSample(std::string_view s_csv_line_sample);
    /// Converts a token into one value per character, after removing
    /// surrounding double quotes.
    static std::vector<T> parseTokenAsString(std::string_view sv_token);
};
202 
203 template <typename T>
204 void EDLTypedHelper<T>::readDataBlock(std::vector<std::vector<std::vector<T>>> &out_data,
205  std::istream &is,
206  const std::vector<std::string_view> &control_line_tokens,
207  const std::filesystem::path &working_path,
208  std::uint64_t &line_num)
209 {
210  assert(control_line_tokens.size() >= EDLHelper::ControlLine::ControlTokenSize);
211  assert(is.good());
212  assert(std::filesystem::exists(working_path) && std::filesystem::is_directory(working_path));
213 
214  // parse the control line
215 
216  std::uint64_t component_index;
217  std::uint64_t num_samples;
218  std::uint64_t pad_to_length = 0;
219  T pad_value = static_cast<T>(0);
220  std::stringstream ss;
221  std::string err_msg;
222 
223  err_msg.clear();
224  try
225  {
226  std::string s_component_index(control_line_tokens[EDLHelper::ControlLine::Index_ControlComponentIndex]);
227  ss = std::stringstream(s_component_index);
228  if (!(ss >> component_index))
229  throw std::runtime_error("Invalid value \"" + s_component_index + "\".");
230  }
231  catch (std::exception &ex)
232  {
233  ss = std::stringstream();
234  ss << ": " << ex.what();
235  err_msg = ss.str();
236  }
237  catch (...)
238  {
239  err_msg = ".";
240  }
241  if (!err_msg.empty())
242  {
243  ss = std::stringstream();
244  ss << "Error reading <component_index> from control line" << err_msg;
245  throw std::runtime_error(ss.str());
246  } // end if
247 
248  err_msg.clear();
249  try
250  {
251  std::string s_num_samples(control_line_tokens[EDLHelper::ControlLine::Index_ControlNumSamples]);
252  ss = std::stringstream(s_num_samples);
253  if (!(ss >> num_samples))
254  throw std::runtime_error("Invalid value \"" + s_num_samples + "\".");
255  }
256  catch (std::exception &ex)
257  {
258  ss = std::stringstream();
259  ss << ": " << ex.what();
260  err_msg = ss.str();
261  }
262  catch (...)
263  {
264  err_msg = ".";
265  }
266  if (!err_msg.empty())
267  {
268  ss = std::stringstream();
269  ss << "Error reading <num_samples> from control line" << err_msg;
270  throw std::runtime_error(ss.str());
271  } // end if
272 
273  err_msg.clear();
274  try
275  {
276  if (control_line_tokens.size() > EDLHelper::ControlLine::Index_ControlPadToLength)
277  {
278  std::string s_pad_to_len(control_line_tokens[EDLHelper::ControlLine::Index_ControlPadToLength]);
279  ss = std::stringstream(s_pad_to_len);
280  if (!(ss >> pad_to_length))
281  throw std::runtime_error("Invalid value \"" + s_pad_to_len + "\".");
282  } // end if
283  }
284  catch (std::exception &ex)
285  {
286  ss = std::stringstream();
287  ss << ": " << ex.what();
288  err_msg = ss.str();
289  }
290  catch (...)
291  {
292  err_msg = ".";
293  }
294  if (!err_msg.empty())
295  {
296  ss = std::stringstream();
297  ss << "Error reading <pad_to_length> from control line" << err_msg;
298  throw std::runtime_error(ss.str());
299  } // end if
300 
301  err_msg.clear();
302  try
303  {
304  if (control_line_tokens.size() > EDLHelper::ControlLine::Index_ControlPadValue)
305  {
306  std::string s_pad_value(control_line_tokens[EDLHelper::ControlLine::Index_ControlPadValue]);
307  ss = std::stringstream(s_pad_value);
308  if (!(ss >> pad_value))
309  throw std::runtime_error("Invalid value \"" + s_pad_value + "\".");
310  } // end if
311  }
312  catch (std::exception &ex)
313  {
314  ss = std::stringstream();
315  ss << ": " << ex.what();
316  err_msg = ss.str();
317  }
318  catch (...)
319  {
320  err_msg = ".";
321  }
322  if (!err_msg.empty())
323  {
324  ss = std::stringstream();
325  ss << "Error reading <pad_value> from control line" << err_msg;
326  throw std::runtime_error(ss.str());
327  } // end if
328 
329  while (out_data.size() <= component_index)
330  out_data.emplace_back(std::vector<std::vector<T>>());
331 
332  EDLTypedHelper<T> helper(pad_to_length, pad_value, working_path);
333 
334  // identify `kind` of data: `local` or `csv`
335  if (control_line_tokens[EDLHelper::ControlLine::Index_ControlKind] == EDLHelper::ControlLineKindLocal)
336  {
337  helper.readLocalDataBlock(out_data[component_index],
338  is, 0, num_samples,
339  line_num);
340  } // end if
341  else if (control_line_tokens[EDLHelper::ControlLine::Index_ControlKind] == EDLHelper::ControlLineKindCSV)
342  {
343  helper.readCSVDataBlock(out_data[component_index],
344  is, 0, num_samples,
345  line_num);
346  } // end else if
347  else
348  {
349  ss = std::stringstream();
350  ss << "Invalid control line kind: \"" << control_line_tokens[EDLHelper::ControlLine::Index_ControlKind] << "\".";
351  throw std::runtime_error(ss.str());
352  } // end else
353 }
354 
355 template <typename T>
356 EDLTypedHelper<T>::EDLTypedHelper(std::uint64_t pad_to_length,
357  const T &pad_value,
358  const std::filesystem::path &working_path) :
359  m_pad_to_length(pad_to_length),
360  m_pad_value(pad_value)
361 {
362  m_working_path = std::filesystem::canonical(working_path);
363 }
364 
365 template <typename T>
366 void EDLTypedHelper<T>::readLocalDataBlock(std::vector<std::vector<T>> &out_data,
367  std::istream &is,
368  std::uint64_t line_offset,
369  std::uint64_t num_samples,
370  std::uint64_t &line_num)
371 {
372  assert(is.good());
373 
374  std::uint64_t samples_read = 0;
375 
376  // skip the line offset
377  while (is && line_num < line_offset)
378  {
379  std::string s_line;
380  if (std::getline(is, s_line))
381  ++line_num;
382  } // end while
383 
384  // read samples until we reach number of samples requested or end of file
385  while (is && samples_read < num_samples)
386  {
387  std::string s_line;
388  if (std::getline(is, s_line))
389  {
390  ++line_num;
391  if (!EDLHelper::isCommentOrEmpty(s_line))
392  {
393  auto sample = parseDataSample(s_line);
394  out_data.emplace_back(std::move(sample));
395 
396  ++samples_read;
397  } // end if
398  } // end if
399  } // end while
400 }
401 
402 template <typename T>
403 void EDLTypedHelper<T>::readCSVDataBlock(std::vector<std::vector<T>> &out_data,
404  std::istream &is,
405  std::uint64_t line_offset,
406  std::uint64_t num_samples,
407  std::uint64_t &line_num)
408 {
409  assert(is.good());
410 
411  std::string err_msg;
412  std::uint64_t samples_read = 0;
413 
414  // skip the line offset
415  while (is && line_num < line_offset)
416  {
417  std::string s_line;
418  if (std::getline(is, s_line))
419  ++line_num;
420  } // end while
421 
422  // read samples until we reach number of samples requested or end of file
423  while (is && samples_read < num_samples)
424  {
425  std::string s_line;
426  if (std::getline(is, s_line))
427  {
428  ++line_num;
429  if (!EDLHelper::isCommentOrEmpty(s_line))
430  {
431  auto csv_sample_tokens = hebench::Utilities::CSVTokenizer::tokenizeLine(s_line);
432  if (csv_sample_tokens.size() < EDLHelper::CSVLine::CSVTokenSize)
433  {
434  std::stringstream ss;
435  ss << "Invalid CSV sample detected. A CSV sample must follow the following format: <filename>[, <from_line>[, <num_samples>]]";
436  throw std::runtime_error(ss.str());
437  } // end if
438 
439  std::uint64_t csv_line_offset = 0;
440  std::uint64_t csv_num_samples = std::numeric_limits<std::uint64_t>::max();
441  std::filesystem::path csv_path = csv_sample_tokens[EDLHelper::CSVLine::Index_CSVFilename];
442  if (csv_path.is_relative())
443  csv_path = this->m_working_path / csv_path;
444  csv_path = std::filesystem::canonical(csv_path);
445 
446  std::ifstream fnum_csv;
447  err_msg.clear();
448  try
449  {
450  fnum_csv.open(csv_path, std::ifstream::in);
451  if (!fnum_csv.is_open())
452  throw std::ios_base::failure("Unable to open file for reading: " + std::string(csv_path));
453  }
454  catch (std::exception &ex)
455  {
456  err_msg = std::string(": ") + ex.what();
457  }
458  catch (...)
459  {
460  err_msg = ".";
461  }
462  if (!err_msg.empty())
463  {
464  std::stringstream ss;
465  ss << "Error occurred attempting to parse CSV sample" << err_msg;
466  throw std::runtime_error(ss.str());
467  } // end if
468 
469  if (csv_sample_tokens.size() > EDLHelper::CSVLine::Index_CSVFromLine)
470  {
471  // read custom batch sizes
472  err_msg.clear();
473  try
474  {
475  const std::string &s_csv_line_offset = csv_sample_tokens[EDLHelper::CSVLine::Index_CSVFromLine];
476  std::stringstream ss = std::stringstream(s_csv_line_offset);
477  if (!(ss >> csv_line_offset))
478  throw std::runtime_error("Invalid value \"" + s_csv_line_offset + "\".");
479  }
480  catch (std::exception &ex)
481  {
482  err_msg = std::string(": ") + ex.what();
483  }
484  catch (...)
485  {
486  err_msg = ".";
487  }
488  if (!err_msg.empty())
489  {
490  std::stringstream ss;
491  ss << "Error reading <from_line> from CSV sample" << err_msg;
492  throw std::runtime_error(ss.str());
493  } // end if
494  } // end if
495 
496  if (csv_sample_tokens.size() > EDLHelper::CSVLine::Index_CSVNumSamples)
497  {
498  err_msg.clear();
499  try
500  {
501  const std::string &s_csv_num_samples = csv_sample_tokens[EDLHelper::CSVLine::Index_CSVNumSamples];
502  std::stringstream ss = std::stringstream(s_csv_num_samples);
503  if (!(ss >> csv_num_samples))
504  throw std::runtime_error("Invalid value \"" + s_csv_num_samples + "\".");
505  }
506  catch (std::exception &ex)
507  {
508  err_msg = std::string(": ") + ex.what();
509  }
510  catch (...)
511  {
512  err_msg = ".";
513  }
514  if (!err_msg.empty())
515  {
516  std::stringstream ss;
517  ss << "Error reading <num_samples> from CSV sample" << err_msg;
518  throw std::runtime_error(ss.str());
519  } // end if
520  } // end if
521 
522  // CSV file specified in line successfully opened.
523  // Treat contents of CSV file as local data block (with header already parsed).
524  EDLTypedHelper<T> helper(m_pad_to_length, m_pad_value, csv_path.parent_path());
525  std::uint64_t csv_line_num = 0;
526  helper.readLocalDataBlock(out_data,
527  fnum_csv,
528  csv_line_offset, csv_num_samples,
529  csv_line_num);
530 
531  // CSV sample line read
532  ++samples_read;
533  } // end if
534  } // end if
535  } // end while
536 }
537 
538 template <typename T>
539 std::vector<T> EDLTypedHelper<T>::parseDataSample(std::string_view s_csv_line_sample)
540 {
541  std::vector<T> retval;
542  auto csv_tokens = hebench::Utilities::CSVTokenizer::tokenizeLine(s_csv_line_sample);
543 
544  std::uint64_t size_since_last_pad = 0;
545  for (std::size_t i = 0; i < csv_tokens.size(); ++i)
546  {
547  if (csv_tokens[i].empty())
548  {
549  // add padding as requested
550  if (m_pad_to_length > 0)
551  // if pad requested and size has been satisfied, no padding is added
552  while (size_since_last_pad < m_pad_to_length)
553  {
554  retval.emplace_back(m_pad_value);
555  ++size_since_last_pad;
556  } // end while
557  else
558  // if no pad requested, empty values are set to the pad value
559  retval.emplace_back(m_pad_value);
560  // reset pad to start a new section
561  size_since_last_pad = 0;
562  } // end if
563  else
564  {
565  // read the next value
566  std::vector<T> value(1);
567  std::stringstream ss = std::stringstream(csv_tokens[i]);
568  try
569  {
570  if (!(ss >> value.front()))
571  throw std::runtime_error("Invalid value.");
572  }
573  catch (...)
574  {
575  // error reading value, attempt to read it as a string
576  value = EDLTypedHelper<T>::parseTokenAsString(csv_tokens[i]);
577  }
578 
579  retval.insert(retval.end(), value.begin(), value.end());
580  size_since_last_pad += value.size();
581  } // end else
582  } // end for
583 
584  // pad if we still have leftover space
585  if (m_pad_to_length > 0)
586  while (size_since_last_pad < m_pad_to_length)
587  {
588  retval.emplace_back(m_pad_value);
589  ++size_since_last_pad;
590  } // end while
591 
592  return retval;
593 }
594 
595 template <typename T>
596 std::vector<T> EDLTypedHelper<T>::parseTokenAsString(std::string_view sv_token)
597 {
598  std::vector<T> retval;
599 
600  hebench::Utilities::trim(sv_token);
601 
602  if (!sv_token.empty())
603  {
604  // remove surrounding quotations, if any
605  if (sv_token.front() == '\"')
606  sv_token.remove_prefix(1);
607  if (sv_token.back() == '\"')
608  sv_token.remove_suffix(1);
609 
610  retval.reserve(sv_token.length());
611  for (auto it = sv_token.begin(); it != sv_token.end(); ++it)
612  retval.emplace_back(static_cast<T>(*it));
613  } // end if
614 
615  return retval;
616 }
617 
618 //------------------------------
619 // class ExternalDatasetLoader
620 //------------------------------
621 
622 template <typename T, typename E>
624  std::uint64_t max_loaded_size)
625 {
626  std::uint64_t line_num = 0;
627  ExternalDataset<T> retval;
628 
629  std::ifstream fnum;
630 
631  std::filesystem::path filepath = std::filesystem::canonical(filename);
632 
633  fnum.open(filepath, std::ifstream::in);
634  if (!fnum.is_open())
635  throw std::ios_base::failure("Unable to open file for reading: " + std::string(filepath));
636 
637  //EDLTypedHelper<T> helper(pad_to_length, pad_value, filename);
638 
639  std::string err_msg;
640  try
641  {
642 
643  while (fnum)
644  {
645  std::string s_line;
646  if (std::getline(fnum, s_line))
647  {
648  ++line_num; // new line
649  if (!EDLHelper::isCommentOrEmpty(s_line))
650  {
651  // read control line
652  auto csv_tokens = hebench::Utilities::CSVTokenizer::tokenizeLineInPlace(s_line);
653  if (csv_tokens.size() < EDLHelper::ControlLine::ControlTokenSize)
654  {
655  std::stringstream ss;
656  ss << "ExternalDatasetLoader: in file " << filename << ":" << line_num << ": Invalid number of items in control line. Expected "
657  << EDLHelper::ControlLine::ControlTokenSize << ", but " << csv_tokens.size() << " found.";
658  throw std::runtime_error(ss.str());
659  } // end if
660  std::vector<std::vector<std::vector<T>>> *data = nullptr;
661  if (csv_tokens[EDLHelper::ControlLine::Index_ControlIdentifier] == EDLHelper::ControlLineInput)
662  data = &retval.inputs;
663  else if (csv_tokens[EDLHelper::ControlLine::Index_ControlIdentifier] == EDLHelper::ControlLineOutput)
664  data = &retval.outputs;
665  else
666  {
667  std::stringstream ss;
668  ss << "ExternalDatasetLoader: in file " << filename << ":" << line_num << ": Invalid control line identifier: \""
669  << csv_tokens[EDLHelper::ControlLine::Index_ControlIdentifier] << "\".";
670  throw std::runtime_error(ss.str());
671  } // end else
673  fnum,
674  csv_tokens,
675  filepath.parent_path(),
676  line_num);
677  } // end if
678  } // end if
679  } // end while
680  }
681  catch (std::exception &ex)
682  {
683  err_msg = std::string(": ") + ex.what();
684  }
685  catch (...)
686  {
687  err_msg = std::string(" line.");
688  }
689  if (!err_msg.empty())
690  {
691  std::stringstream ss;
692  ss << "ExternalDatasetLoader: in file " << filepath << ":" << line_num
693  << ": Error occurred while parsing" << err_msg;
694  throw std::runtime_error(ss.str());
695  } // end if
696 
697  fnum.close();
698 
699  // validate read size
700  std::uint64_t loaded_size = 0;
701  if (max_loaded_size > 0)
702  {
703  for (std::size_t param_i = 0; param_i < retval.inputs.size(); ++param_i)
704  for (std::size_t sample_i = 0; sample_i < retval.inputs[param_i].size(); ++sample_i)
705  loaded_size += sizeof(T) * retval.inputs[param_i][sample_i].size();
706  for (std::size_t result_i = 0; result_i < retval.outputs.size(); ++result_i)
707  for (std::size_t sample_i = 0; sample_i < retval.outputs[result_i].size(); ++sample_i)
708  loaded_size += sizeof(T) * retval.outputs[result_i][sample_i].size();
709  } // end if
710 
711  if (loaded_size > max_loaded_size)
712  {
713  std::stringstream ss;
714  ss << "ExternalDatasetLoader: in file " << filepath << ": Loaded data exceeds maximum available size. "
715  << "Maximum size expected was " << max_loaded_size << " bytes, but " << loaded_size << " bytes read.";
716  throw std::runtime_error(ss.str());
717  } // end if
718 
719  return retval;
720 }
721 
722 } // namespace DataLoader
723 } // namespace hebench
constexpr static const char * ControlLineKindLocal
static bool isCommentOrEmpty(std::string_view s_view)
constexpr static const char * ControlLineKindCSV
static bool isComment(std::string_view s_view)
constexpr static const char * ControlLineOutput
constexpr static const char * ControlLineInput
static bool isEmpty(std::string_view s_view)
static void readDataBlock(std::vector< std::vector< std::vector< T >>> &out_data, std::istream &is, const std::vector< std::string_view > &control_line_tokens, const std::filesystem::path &working_path, std::uint64_t &line_num)
Reads the next data block as described by the control line from the specified stream.
static ExternalDataset< T > loadFromCSV(const std::string &filename, std::uint64_t max_loaded_size=0)
Loads a dataset from an external csv file that follows the defined structure.
std::vector< std::vector< std::vector< T > > > inputs
Contains the samples for each input parameter as loaded from external source.
std::vector< std::vector< std::vector< T > > > outputs
Contains the samples for each result component as loaded from external source.