bes Updated for version 3.20.13
build_dmrpp_util.cc
1// -*- mode: c++; c-basic-offset:4 -*-
2
3// This file is part of the Hyrax data server.
4
5// Copyright (c) 2022 OPeNDAP, Inc.
6// Author: James Gallagher <jgallagher@opendap.org>
7//
8// This library is free software; you can redistribute it and/or
9// modify it under the terms of the GNU Lesser General Public
10// License as published by the Free Software Foundation; either
11// version 2.1 of the License, or (at your option) any later version.
12//
13// This library is distributed in the hope that it will be useful,
14// but WITHOUT ANY WARRANTY; without even the implied warranty of
15// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16// Lesser General Public License for more details.
17//
18// You should have received a copy of the GNU Lesser General Public
19// License along with this library; if not, write to the Free Software
20// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21//
22// You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
23
24#include <iostream>
25#include <sstream>
26#include <memory>
27#include <iterator>
28
29#include <cstdlib>
30
31#include <H5Ppublic.h>
32#include <H5Dpublic.h>
33#include <H5Epublic.h>
34#include <H5Zpublic.h> // Constants for compression filters
35#include <H5Spublic.h>
36#include <H5Tpublic.h>
37
38#include "h5common.h" // This is in the hdf5 handler
39
40#include <libdap/Array.h>
41#include <libdap/util.h>
42#include <libdap/D4Attributes.h>
43
44#include <BESDebug.h>
45#include <BESNotFoundError.h>
46#include <BESInternalError.h>
47
48#include "DMRpp.h"
49#include "DmrppTypeFactory.h"
50#include "DmrppD4Group.h"
51
52#if 0
53#define H5S_MAX_RANK 32
54#define H5O_LAYOUT_NDIMS (H5S_MAX_RANK+1)
55
56/*
57 * "Generic" chunk record. Each chunk is keyed by the minimum logical
58 * N-dimensional coordinates and the datatype size of the chunk.
59 * The fastest-varying dimension is assumed to reference individual bytes of
60 * the array, so a 100-element 1-D array of 4-byte integers would really be a
61 * 2-D array with the slow varying dimension of size 100 and the fast varying
62 * dimension of size 4 (the storage dimensionality has very little to do with
63 * the real dimensionality).
64 *
65 * The chunk's file address, filter mask and size on disk are not key values.
66 */
67typedef struct H5D_chunk_rec_t {
68 hsize_t scaled[H5O_LAYOUT_NDIMS]; /* Logical offset to start */
69 uint32_t nbytes; /* Size of stored data */
70 uint32_t filter_mask; /* Excluded filters */
71 haddr_t chunk_addr; /* Address of chunk in file */
72} H5D_chunk_rec_t;
73#endif
74
75using namespace std;
76using namespace libdap;
77using namespace dmrpp;
78
79namespace build_dmrpp_util {
80
81bool verbose = false; // Optionally set by build_dmrpp's main().
82
83#define VERBOSE(x) do { if (verbose) (x); } while(false)
84
85// FYI: Filter IDs
86// H5Z_FILTER_ERROR (-1) no filter
87// H5Z_FILTER_NONE 0 reserved indefinitely
88// H5Z_FILTER_DEFLATE 1 deflation like gzip
89// H5Z_FILTER_SHUFFLE 2 shuffle the data
90// H5Z_FILTER_FLETCHER32 3 fletcher32 checksum of EDC
91// H5Z_FILTER_SZIP 4 szip compression
92// H5Z_FILTER_NBIT 5 nbit compression
93// H5Z_FILTER_SCALEOFFSET 6 scale+offset compression
94// H5Z_FILTER_RESERVED 256 filter ids below this value are reserved for library use
95// FYI: Filter IDs
96// H5Z_FILTER_ERROR (-1) no filter
97// H5Z_FILTER_NONE 0 reserved indefinitely
98// H5Z_FILTER_DEFLATE 1 deflation like gzip
99// H5Z_FILTER_SHUFFLE 2 shuffle the data
100// H5Z_FILTER_FLETCHER32 3 fletcher32 checksum of EDC
101// H5Z_FILTER_SZIP 4 szip compression
102// H5Z_FILTER_NBIT 5 nbit compression
103// H5Z_FILTER_SCALEOFFSET 6 scale+offset compression
104// H5Z_FILTER_RESERVED 256 filter ids below this value are reserved for library use
105
106string h5_filter_name(int type) {
107 string name;
108 switch(type) {
109 case H5Z_FILTER_NONE:
110 name = "H5Z_FILTER_NONE";
111 break;
112 case H5Z_FILTER_DEFLATE:
113 name = "H5Z_FILTER_DEFLATE";
114 break;
115 case H5Z_FILTER_SHUFFLE:
116 name = "H5Z_FILTER_SHUFFLE";
117 break;
118 case H5Z_FILTER_FLETCHER32:
119 name = "H5Z_FILTER_FLETCHER32";
120 break;
121 case H5Z_FILTER_SZIP:
122 name = "H5Z_FILTER_SZIP";
123 break;
124 case H5Z_FILTER_NBIT:
125 name = "H5Z_FILTER_NBIT";
126 break;
127 case H5Z_FILTER_SCALEOFFSET:
128 name = "H5Z_FILTER_SCALEOFFSET";
129 break;
130 default:
131 {
132 ostringstream oss("ERROR! Unknown HDF5 FILTER! type: ", std::ios::ate);
133 oss << type;
134 name = oss.str();
135 break;
136 }
137 }
138 return name;
139}
140
147static void set_filter_information(hid_t dataset_id, DmrppCommon *dc) {
148 hid_t plist_id = H5Dget_create_plist(dataset_id);
149
150 try {
151 int numfilt = H5Pget_nfilters(plist_id);
152 VERBOSE(cerr << "Number of filters associated with dataset: " << numfilt << endl);
153 string filters;
154
155 for (int filter = 0; filter < numfilt; filter++) {
156 size_t nelmts = 0;
157 unsigned int flags, filter_info;
158 H5Z_filter_t filter_type = H5Pget_filter2(plist_id, filter, &flags, &nelmts,
159 nullptr, 0, nullptr, &filter_info);
160 VERBOSE(cerr << "Found H5 Filter Type: " << h5_filter_name(filter_type) << " (" << filter_type << ")" << endl);
161 switch (filter_type) {
162 case H5Z_FILTER_DEFLATE:
163 filters.append("deflate ");
164 break;
165 case H5Z_FILTER_SHUFFLE:
166 filters.append("shuffle ");
167 break;
168 case H5Z_FILTER_FLETCHER32:
169 filters.append("fletcher32 ");
170 break;
171 default:
172 ostringstream oss("Unsupported HDF5 filter: ", std::ios::ate);
173 oss << filter_type;
174 oss << " (" << h5_filter_name(filter_type) << ")";
175 throw BESInternalError(oss.str(), __FILE__, __LINE__);
176 }
177 }
178 //trimming trailing space from compression (aka filter) string
179 filters = filters.substr(0, filters.length() - 1);
180 dc->set_filter(filters);
181 }
182 catch (...) {
183 H5Pclose(plist_id);
184 throw;
185 }
186
187 H5Pclose(plist_id);
188}
189
190bool
191is_hdf5_fill_value_defined(hid_t dataset_id)
192{
193 hid_t plist_id;
194
195 // Suppress errors to stderr.
196 H5Eset_auto2(H5E_DEFAULT, nullptr, nullptr);
197
198 // Get creation properties list
199 if ( (plist_id = H5Dget_create_plist(dataset_id)) < 0 )
200 throw BESInternalError("Unable to open HDF5 dataset id.", __FILE__, __LINE__);
201
202 // How the fill value is defined?
203 H5D_fill_value_t status;
204 if ( (H5Pfill_value_defined(plist_id, &status)) < 0 ) {
205 H5Pclose(plist_id);
206 throw BESInternalError("Unable to access HDF5 Fillvalue information.", __FILE__, __LINE__);
207 }
208
209 H5Pclose(plist_id);
210
211 return status != H5D_FILL_VALUE_UNDEFINED;
212}
213
224string
225get_value_as_string(hid_t h5_type_id, vector<char> &value)
226{
227 H5T_class_t class_type = H5Tget_class(h5_type_id);
228 int sign;
229 switch (class_type) {
230 case H5T_INTEGER:
231 sign = H5Tget_sign(h5_type_id);
232 switch (H5Tget_size(h5_type_id)) {
233 case 1:
234 if (sign == H5T_SGN_2)
235 return to_string(*(int8_t *)(value.data()));
236 else
237 return to_string(*(uint8_t *)(value.data()));
238
239 case 2:
240 if (sign == H5T_SGN_2)
241 return to_string(*(int16_t *)(value.data()));
242 else
243 return to_string(*(uint16_t *)(value.data()));
244
245 case 4:
246 if (sign == H5T_SGN_2)
247 return to_string(*(int32_t *)(value.data()));
248 else
249 return to_string(*(uint32_t *)(value.data()));
250
251 case 8:
252 if (sign == H5T_SGN_2)
253 return to_string(*(int64_t *)(value.data()));
254 else
255 return to_string(*(uint64_t *)(value.data()));
256
257 default:
258 throw BESInternalError("Unable extract integer fill value.", __FILE__, __LINE__);
259 }
260
261
262 case H5T_FLOAT: {
263 ostringstream oss;
264 switch (H5Tget_size(h5_type_id)) {
265 case 4:
266 oss << *(float *) (value.data());
267 return oss.str();
268
269 case 8:
270 oss << *(double *) (value.data());
271 return oss.str();
272
273 default:
274 throw BESInternalError("Unable extract float fill value.", __FILE__, __LINE__);
275 }
276 }
277
278 // TODO jhrg 4/22/22
279 case H5T_STRING:
280 return "unsupported-string";
281 case H5T_ARRAY:
282 return "unsupported-array";
283 case H5T_COMPOUND:
284 return "unsupported-compound";
285
286 case H5T_REFERENCE:
287 default:
288 throw BESInternalError("Unable extract fill value.", __FILE__, __LINE__);
289 }
290}
291
297string get_hdf5_fill_value(hid_t dataset_id)
298{
299 // Suppress errors to stderr.
300 H5Eset_auto2(H5E_DEFAULT, nullptr, nullptr);
301
302 // Get creation properties list
303 hid_t plist_id = H5Dget_create_plist(dataset_id);
304 if (plist_id < 0 )
305 throw BESInternalError("Unable to open HDF5 dataset id.", __FILE__, __LINE__);
306
307 try {
308 hid_t dtype_id = H5Dget_type(dataset_id);
309 if (dtype_id < 0)
310 throw BESInternalError("Unable to get HDF5 dataset type id.", __FILE__, __LINE__);
311
312 vector<char> value(H5Tget_size(dtype_id));
313 if (H5Pget_fill_value(plist_id, dtype_id, value.data()) < 0)
314 throw BESInternalError("Unable to access HDF5 Fill Value.", __FILE__, __LINE__);
315
316 H5Pclose(plist_id);
317
318 return get_value_as_string(dtype_id, value);
319 }
320 catch (...) {
321 H5Pclose(plist_id);
322 throw;
323 }
324}
325
336static void get_variable_chunk_info(hid_t dataset, DmrppCommon *dc) {
337 std::string byteOrder;
338 H5T_order_t byte_order;
339
340 // Added support for HDF5 Fill Value. jhrg 4/22/22
341 bool fill_value_defined = is_hdf5_fill_value_defined(dataset);
342 if (fill_value_defined) {
343 string fill_value = get_hdf5_fill_value(dataset);
344 dc->set_uses_fill_value(fill_value_defined);
346 }
347
348 try {
349 hid_t dcpl = H5Dget_create_plist(dataset);
350 uint8_t layout_type = H5Pget_layout(dcpl);
351
352 hid_t fspace_id = H5Dget_space(dataset);
353 hid_t dtypeid = H5Dget_type(dataset);
354
355 byte_order = H5Tget_order(dtypeid);
356 switch (byte_order) {
357 case H5T_ORDER_LE:
358 byteOrder = "LE";
359 break;
360 case H5T_ORDER_BE:
361 byteOrder = "BE";
362 break;
363 case H5T_ORDER_NONE:
364 break;
365 default:
366 // unsupported enumerations: H5T_ORDER_[ERROR,VAX,MIXED]
367 ostringstream oss("Unsupported HDF5 dataset byteOrder: ", std::ios::ate);
368 oss << byte_order << ".";
369 throw BESInternalError(oss.str(), __FILE__, __LINE__);
370 }
371
372 int dataset_rank = H5Sget_simple_extent_ndims(fspace_id);
373
374 size_t dsize = H5Tget_size(dtypeid);
375
376 /* layout_type: 1 contiguous 2 chunk 3 compact */
377 switch (layout_type) {
378
379 case H5D_CONTIGUOUS: { /* Contiguous storage */
380 VERBOSE(cerr << "Storage: contiguous" << endl);
381
382 haddr_t cont_addr = H5Dget_offset(dataset);
383 hsize_t cont_size = H5Dget_storage_size(dataset);
384
385 VERBOSE(cerr << " Addr: " << cont_addr << endl);
386 VERBOSE(cerr << " Size: " << cont_size << endl);
387 VERBOSE(cerr << "byteOrder: " << byteOrder << endl);
388
389 if (cont_size > 0 && dc) {
390 dc->add_chunk(byteOrder, cont_size, cont_addr, "" /*pos in array*/);
391 }
392 break;
393 }
394 case H5D_CHUNKED: { /*chunking storage */
395 hsize_t num_chunks = 0;
396 herr_t status = H5Dget_num_chunks(dataset, fspace_id, &num_chunks);
397 if (status < 0) {
398 throw BESInternalError("Could not get the number of chunks", __FILE__, __LINE__);
399 }
400
401 VERBOSE(cerr << "Storage: chunked." << endl);
402 VERBOSE(cerr << "Number of chunks is: " << num_chunks << endl);
403
404 if (dc)
405 set_filter_information(dataset, dc);
406
407 // Get chunking information: rank and dimensions
408 vector<hsize_t> chunk_dims(dataset_rank);
409 unsigned int chunk_rank = H5Pget_chunk(dcpl, dataset_rank, chunk_dims.data());
410 if (chunk_rank != dataset_rank)
411 throw BESNotFoundError(
412 "Found a chunk with rank different than the dataset's (aka variables') rank", __FILE__,
413 __LINE__);
414
415 if (dc) dc->set_chunk_dimension_sizes(chunk_dims);
416
417 for (unsigned int i = 0; i < num_chunks; ++i) {
418 vector<hsize_t> chunk_coords(dataset_rank);
419 haddr_t addr = 0;
420 hsize_t size = 0;
421
422 status = H5Dget_chunk_info(dataset, fspace_id, i, chunk_coords.data(),
423 nullptr, &addr, &size);
424 if (status < 0) {
425 VERBOSE(cerr << "ERROR" << endl);
426 throw BESInternalError("Cannot get HDF5 dataset storage info.", __FILE__, __LINE__);
427 }
428
429 VERBOSE(cerr << "chk_idk: " << i << ", addr: " << addr << ", size: " << size << endl);
430 if (dc) dc->add_chunk(byteOrder, size, addr, chunk_coords);
431 }
432
433 break;
434 }
435
436 case H5D_COMPACT: { /* Compact storage */
437 VERBOSE(cerr << "Storage: compact" << endl);
438
439 size_t comp_size = H5Dget_storage_size(dataset);
440 VERBOSE(cerr << " Size: " << comp_size << endl);
441
442 if (comp_size == 0) {
443 throw BESInternalError("Cannot obtain the compact storage size.", __FILE__, __LINE__);
444 }
445
446 vector<uint8_t> values;
447
448 auto btp = dynamic_cast<Array *>(dc);
449 if (btp != nullptr) {
450 dc->set_compact(true);
451 size_t memRequired = btp->length() * dsize;
452
453 if (comp_size != memRequired) {
454 throw BESInternalError("Compact storage size does not match D4Array.", __FILE__, __LINE__);
455 }
456
457 switch (btp->var()->type()) {
458 case dods_byte_c:
459 case dods_char_c:
460 case dods_int8_c:
461 case dods_uint8_c:
462 case dods_int16_c:
463 case dods_uint16_c:
464 case dods_int32_c:
465 case dods_uint32_c:
466 case dods_float32_c:
467 case dods_float64_c:
468 case dods_int64_c:
469 case dods_uint64_c: {
470 values.resize(memRequired);
471 get_data(dataset, reinterpret_cast<void *>(values.data()));
472 btp->set_read_p(true);
473 btp->val2buf(reinterpret_cast<void *>(values.data()));
474 break;
475
476 }
477
478 case dods_str_c: {
479 if (H5Tis_variable_str(dtypeid) > 0) {
480 vector<string> finstrval = {""}; // passed by reference to read_vlen_string
481 read_vlen_string(dataset, 1, nullptr, nullptr, nullptr, finstrval);
482 btp->set_value(finstrval, (int)finstrval.size());
483 btp->set_read_p(true);
484 }
485 else {
486 // For this case, the Array is really a single string - check for that
487 // with the following assert - but is an Array because the string data
488 // is stored as an array of chars (hello, FORTRAN). Read the chars, make
489 // a string and load that into a vector<string> (which will be a vector
490 // of length one). Set that as the value of the Array. Really, this
491 // value could be stored as a scalar, but that's complicated and client
492 // software might be expecting an array, so better to handle it this way.
493 // jhrg 9/17/20
494 assert(btp->length() == 1);
495 values.resize(memRequired);
496 get_data(dataset, reinterpret_cast<void *>(values.data()));
497 string str(values.begin(), values.end());
498 vector<string> strings = {str};
499 btp->set_value(strings, (int)strings.size());
500 btp->set_read_p(true);
501 }
502 break;
503 }
504
505 default:
506 throw BESInternalError("Unsupported compact storage variable type.", __FILE__, __LINE__);
507 }
508
509 }
510 else {
511 throw BESInternalError("Compact storage variable is not a D4Array.",
512 __FILE__, __LINE__);
513 }
514 break;
515 }
516
517 default:
518 ostringstream oss("Unsupported HDF5 dataset layout type: ", std::ios::ate);
519 oss << layout_type << ".";
520 throw BESInternalError(oss.str(), __FILE__, __LINE__);
521 }
522 }
523 catch (...) {
524 H5Dclose(dataset);
525 throw;
526 }
527
528 H5Dclose(dataset);
529}
530
538void get_chunks_for_all_variables(hid_t file, D4Group *group) {
539 // variables in the group
540 for (auto v = group->var_begin(), ve = group->var_end(); v != ve; ++v) {
541 // if this variable has a 'fullnamepath' attribute, use that and not the
542 // FQN value.
543 D4Attributes *d4_attrs = (*v)->attributes();
544 if (!d4_attrs)
545 throw BESInternalError("Expected to find an attribute table for " + (*v)->name() + " but did not.",
546 __FILE__, __LINE__);
547
548 // Look for the full name path for this variable
549 // If one was not given via an attribute, use BaseType::FQN() which
550 // relies on the variable's position in the DAP dataset hierarchy.
551 const D4Attribute *attr = d4_attrs->get("fullnamepath");
552 // I believe the logic is more clear in this way:
553 // If fullnamepath exists and the H5Dopen2 fails to open, it should throw an error.
554 // If fullnamepath doesn't exist, we should ignore the error as the reason described below:
555 // (However, we should suppress the HDF5 dataset open error message.) KY 2019-12-02
556 // It's not an error if a DAP variable in a DMR from the hdf5 handler
557 // doesn't exist in the file _if_ there's no 'fullnamepath' because
558 // that variable was synthesized (likely for CF compliance)
559 hid_t dataset;
560 if (attr) {
561 string FQN;
562 if (attr->num_values() == 1)
563 FQN = attr->value(0);
564 else
565 FQN = (*v)->FQN();
566 BESDEBUG("dmrpp", "Working on: " << FQN << endl);
567 dataset = H5Dopen2(file, FQN.c_str(), H5P_DEFAULT);
568 if (dataset < 0)
569 throw BESInternalError("HDF5 dataset '" + FQN + "' cannot be opened.", __FILE__, __LINE__);
570
571 } else {
572 // The current design seems to still prefer to open the dataset when the fullnamepath doesn't exist
573 // So go ahead to open the dataset. Continue even if the dataset cannot be open. KY 2019-12-02
574 //
575 // A comment from an older version of the code:
576 // It's not an error if a DAP variable in a DMR from the hdf5 handler
577 // doesn't exist in the file _if_ there's no 'fullnamepath' because
578 // that variable was synthesized (likely for CF compliance)
579 H5Eset_auto2(H5E_DEFAULT, nullptr, nullptr);
580 string FQN = (*v)->FQN();
581 BESDEBUG("dmrpp", "Working on: " << FQN << endl);
582 dataset = H5Dopen2(file, FQN.c_str(), H5P_DEFAULT);
583 if (dataset < 0)
584 continue;
585 }
586
587 get_variable_chunk_info(dataset, dynamic_cast<DmrppCommon *>(*v));
588 }
589
590 // all groups in the group
591 for (auto g = group->grp_begin(), ge = group->grp_end(); g != ge; ++g)
592 get_chunks_for_all_variables(file, *g);
593}
594
600void add_chunk_information(const string &h5_file_name, DMRpp *dmrpp)
601{
602 // Open the hdf5 file
603 hid_t file = H5Fopen(h5_file_name.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT);
604 if (file < 0) {
605 throw BESNotFoundError(string("Error: HDF5 file '").append(h5_file_name).append("' cannot be opened."), __FILE__, __LINE__);
606 }
607
608 // iterate over all the variables in the DMR
609 try {
610 get_chunks_for_all_variables(file, dmrpp->root());
611 H5Fclose(file);
612 }
613 catch (...) {
614 H5Fclose(file);
615 throw;
616 }
617}
618
619} // namespace build_dmrpp_util
exception thrown if internal error encountered
error thrown if the resource requested cannot be found
Provide a way to print the DMR++ response.
Definition: DMRpp.h:44
Size and offset information of data included in DMR++ files.
Definition: DmrppCommon.h:94
virtual void set_fill_value_string(const std::string &fv)
Set the fill value (using a string)
Definition: DmrppCommon.h:214
virtual void set_uses_fill_value(bool ufv)
Set the uses_fill_value property.
Definition: DmrppCommon.h:211
void set_chunk_dimension_sizes(const std::vector< unsigned long long > &chunk_dims)
Set the value of the chunk dimension sizes given a vector of HDF5 hsize_t.
Definition: DmrppCommon.h:235
virtual unsigned long add_chunk(std::shared_ptr< http::url > d_data_url, const std::string &byte_order, unsigned long long size, unsigned long long offset, const std::string &position_in_array)
Adds a chunk to the vector of chunk refs (byteStreams) and returns the size of the chunks internal ve...
Definition: DmrppCommon.cc:208
void set_filter(const std::string &value)
Set the value of the filters property.
Definition: DmrppCommon.cc:102
void set_compact(bool value)
Set the value of the compact property.
Definition: DmrppCommon.h:163
void get_data(hid_t dset, void *buf)
Definition: h5common.cc:50