bes Updated for version 3.20.13
merge_dmrpp.cc
1#include <iostream>
2#include <fstream>
3#include <sstream>
4#include <string>
5#include <vector>
6#include <set>
7#include <algorithm>
8
9using namespace std;
10
11// The following block of functions retrieve the "missing" variable type, variable name and data value information.
12bool obtain_var_info(const string & miss_dmrpp_info,vector<string> & var_types, vector<string>&var_names,vector<string>&chunk_info_list,bool & is_chunk_mark1);
13bool find_var_name(const string &str,size_t &str_pos,size_t &var_name_pos_start,size_t &var_name_pos_end);
14bool find_end_var_block(const string&str, const string&var_type, const size_t &str_pos, size_t &var_end_pos);
15bool find_chunk_info(const string &str,const size_t&str_pos,size_t &chunk_info_pos_start, size_t &chunk_info_pos_end,const size_t&var_end_pos,bool & is_mark1);
16
17// The following block of functions add the file address(mostly the absolute path of the HDF5 file that stores the data value) to the chunk block.
18bool add_faddr_chunk_info(const string& miss_dmrpp_info,vector<string>&chunk_info_list,bool is_dmrpp_mark1,const string faddr_source = "");
19bool add_faddr_contig_line(string &chunk_info,const string &file_addr);
20bool add_faddr_chunk_comp_lines(string & chunk_info,const string &file_addr);
21
22// The following block of functions merge the "missing" variable data value information to the original dmrpp file.
23bool add_missing_info_to_file(const string &fname2,const vector<string> &var_types,const vector<string> &var_names,const vector<string> &chunk_info_list);
24void gen_block(const vector<string>&var_type_list,const vector<string>&var_name_list,vector<string>&block_begin,vector<string>&block_end);
25bool check_overlap_intervals(const vector<size_t> &sort_block_pos, const vector<size_t> & block_pos_start);
26void obtain_bindex_in_modified_string(const vector<size_t>& block_pos_start, vector<int>& block_index);
27bool split_string(const string & str, vector<string> &str_vec,const vector<string> &block_begin, const vector<string> &block_end,vector<int> &block_index);
28bool convert_dmrppstr_to_vec(const string &dmrpp_str,vector<string> &dmrpp_str_vec,const vector<string> &var_types,const vector<string> &var_names,vector<int> & block_index);
29void add_missing_info_to_vec(vector<string> &dmrpp_str_vec,const vector<string> &chunk_info_list,const vector<int>&block_index);
30void write_vec_to_file(const string &fname,const vector<string> &dmrpp_str_vec);
31
32// The following two functions are helper functions
33void file_to_string(const string &filename, string & out);
34void string_tokenize(const string &in_str,const char delim,vector<string>&out_vec);
35
36
37int main (int argc,char**argv)
38{
39 string dmrpp_line;
40 vector<string>var_types;
41 vector<string>var_names;
42 vector<string>chunk_info_list;
43
44 bool add_dmrpp_info = false;
45 bool is_chunk_mark1 = true;
46
47 string missing_dmrpp_str;
48
49 if(argc != 5) {
50 cout<<"Please provide four arguments: "<< endl;
51 cout<<" The first is the dmrpp file that contains the missing variable value information. "<<endl;
52 cout<<" The second is the original dmrpp file. "<<endl;
53 cout<<" An third one is the href to the missing variables HDF5 file. "<<endl;
54 cout<<" The fourth one is the text file that includes the missing variable information. "<<endl;
55 return 0;
56 }
57
58 // Obtain the dmrpp file name that contains the missing variable value.
59 string fname(argv[1]);
60
61 // Read the "missing dmrpp file" to a string
62 file_to_string(fname,missing_dmrpp_str);
63
64 // Obtain the missing chunk information from the dmrpp file.
65 add_dmrpp_info = obtain_var_info(missing_dmrpp_str,var_types,var_names,chunk_info_list,is_chunk_mark1);
66
67 // Just output a warning that there is no chunk info, in the supplemental dmrpp file.
68 if(false == add_dmrpp_info) {
69 cout<<"Cannot find corresponding chunk info. from the supplemental dmrpp file."<<endl;
70 cout<<"You may need to check if there is any variable in the dmrpp file. "<<endl;
71 cout<<"The dmrpp file is "<<fname <<endl;
72 }
73
74 if(var_types.size() !=var_names.size() || var_names.size() != chunk_info_list.size()) {
75 cout <<"Var type, var name and chunk_info must have the same number of sizes. "<<endl;
76 cout <<"The dmrpp file is "<<fname <<endl;
77 return 0;
78 }
79
80#if 0
81
82 for (size_t i =0; i<var_names.size();i++) {
83//cout<<"var type["<<i<<"]"<< var_types[i]<<endl;
84//cout<<"var name["<<i<<"]"<< var_names[i]<<endl;
85//cout<<"chunk_info_list["<<i<<"]"<< endl;
86
87 }
88#endif
89
90 // We need to erase those variables that are not really missing but are added by the generation program
91 string mvar_fname(argv[4]);
92 string missing_vname_str;
93
94 // Read the missing variable names to a string and tokenize the string to a vector of string.
95 file_to_string(mvar_fname,missing_vname_str);
96
97 vector<string> missing_vname_list;
98 char delim=',';
99 string_tokenize(missing_vname_str,delim,missing_vname_list);
100
101#if 0
102 for(size_t i = 0;i<missing_vname_list.size();i++)
103 cout <<"missing_vname_list["<<i<<"]"<<missing_vname_list[i]<<endl;
104#endif
105
106 // Remove the additional variables added by the filenetCDF-4 module.
107 vector<string>new_var_types;
108 vector<string>new_var_names;
109 vector<string>new_chunk_info_list;
110
111 for (size_t i =0; i<var_names.size();i++) {
112 for(size_t j = 0; j<missing_vname_list.size();j++) {
113 if(var_names[i] == missing_vname_list[j]) {
114 new_var_names.push_back(var_names[i]);
115 new_var_types.push_back(var_types[i]);
116 new_chunk_info_list.push_back(chunk_info_list[i]);
117 break;
118 }
119 }
120 }
121
122 // Add file address to each chunk. Mostly the file address is the absolute path of the HDF5 files.
123 string fadd_source(argv[3]);
124 add_faddr_chunk_info(missing_dmrpp_str,new_chunk_info_list,is_chunk_mark1,fadd_source);
125
126#if 0
127for (size_t i =0; i<new_var_types.size();i++) {
128cout<<"new chunk_info_list["<<i<<"]"<< endl;
129cout<<new_chunk_info_list[i]<<endl;
130}
131#endif
132
133 //string dmrpp_str;
134 string fname2(argv[2]);
135
136 // Add the missing chunk info to the original dmrpp file.
137 bool well_formed = add_missing_info_to_file(fname2,new_var_types,new_var_names,new_chunk_info_list);
138
139 if(false == well_formed) {
140 cout <<"The dmrpp file to be modified is either not well-formed or contains nested variable blocks that cannot be supported by this routine" <<endl;
141 cout <<"The dmrpp file is "<<fname2<<endl;
142
143 }
144
145 return 0;
146
147}
148
149// Obtain the var info from the supplemental(missing) dmrpp file. The variable types we checked are limited to DAP2 data types plus 64-bit integers.
150bool obtain_var_info(const string & miss_dmrpp_info,vector<string> & var_types, vector<string>&var_names,vector<string>&chunk_info_list,bool & is_chunk_mark1) {
151
152 bool ret = false;
153 vector<string> var_type_list;
154 var_type_list.push_back("Float32");
155 var_type_list.push_back("Int32");
156 var_type_list.push_back("Float64");
157 var_type_list.push_back("Byte");
158 var_type_list.push_back("Int16");
159 var_type_list.push_back("UInt16");
160 var_type_list.push_back("String");
161 var_type_list.push_back("UInt32");
162 var_type_list.push_back("Int8");
163 var_type_list.push_back("Int64");
164 var_type_list.push_back("UInt64");
165 var_type_list.push_back("UInt8");
166 var_type_list.push_back("Char");
167
168 size_t var_type_pos_start =0;
169 size_t var_name_pos_start = 0;
170 size_t var_name_pos_end = 0;
171 size_t chunk_pos_start = 0;
172 size_t chunk_pos_end = 0;
173 size_t var_end_pos= 0;
174 size_t str_pos = 0;
175
176
177 if(miss_dmrpp_info.empty())
178 return ret;
179
180 size_t str_last_char_pos = miss_dmrpp_info.size()-1;
181 bool well_formed = true;
182
183 // Go through the whole missing dmrpp string
184 while (str_pos <=str_last_char_pos && well_formed) {
185
186 size_t i = 0;
187 string var_sign;
188 string temp_var_sign;
189 size_t temp_var_type_pos_start=string::npos;
190 int var_type_index = -1;
191
192 // Go through the var_type_list to obtain the var data type
193 // We need to find the index in the var_type_list to
194 // obtain the correct var datatype.
195 while(i <var_type_list.size()) {
196 var_sign = "<"+var_type_list[i]+" name=\"";
197 var_type_pos_start = miss_dmrpp_info.find(var_sign,str_pos);
198 if(var_type_pos_start ==string::npos) {
199 i++;
200 continue;
201 }
202 else {
203 // We want to make sure we don't skip any vars.
204 if(temp_var_type_pos_start>var_type_pos_start){
205 temp_var_type_pos_start = var_type_pos_start;
206 var_type_index = i;
207 temp_var_sign = var_sign;
208 }
209 i++;
210 }
211
212 }
213
214 // Ensure all variables are scanned.
215 if(temp_var_type_pos_start !=string::npos) {
216 var_type_pos_start = temp_var_type_pos_start;
217 var_sign = temp_var_sign;
218
219 }
220
221 // This line will ignore datatypes that are not in the var_type_list
222 if(var_type_pos_start == string::npos) {
223 str_pos = string::npos;
224 continue;
225 }
226 else
227 str_pos = var_type_pos_start+var_sign.size();
228
229 // Now we can retrieve var name, var type and the corresponding chunk info
230 // Sanity check is also applied.
231 if(false == find_var_name(miss_dmrpp_info,str_pos,var_name_pos_start,var_name_pos_end))
232 well_formed = false;
233 else if(false == find_end_var_block(miss_dmrpp_info,var_type_list[var_type_index],str_pos,var_end_pos))
234 well_formed = false;
235 else if(false == find_chunk_info(miss_dmrpp_info,str_pos,chunk_pos_start,chunk_pos_end,var_end_pos,is_chunk_mark1))
236 well_formed = false;
237 else {
238 // Move the string search pos to the next block
239 str_pos = var_end_pos+1;
240 // Obtain var type, var name and chunk info. and save them to vectors.
241 var_types.push_back(var_type_list[var_type_index]);
242 var_names.push_back(miss_dmrpp_info.substr(var_name_pos_start,var_name_pos_end-var_name_pos_start));
243 string temp_chunk_info = miss_dmrpp_info.substr(chunk_pos_start,chunk_pos_end-chunk_pos_start);
244 if(true == is_chunk_mark1)
245 temp_chunk_info +="</dmrpp:chunks>";
246 else
247 temp_chunk_info +="/>";
248 chunk_info_list.push_back(temp_chunk_info);
249 }
250
251 }
252 return well_formed;
253
254}
255
// Locate a variable name inside str. The name starts at str_pos and runs up to (not
// including) the next double quote, e.g. the temperature in name="temperature".
//
// var_name_pos_start  output; always set to the incoming str_pos.
// var_name_pos_end    output; position of the closing quote, or string::npos if there is none.
// str_pos             advanced to the closing quote on success, left untouched on failure.
//
// Returns true when the closing quote is found.
bool find_var_name(const string &str,size_t &str_pos,size_t &var_name_pos_start,size_t &var_name_pos_end) {

    var_name_pos_start = str_pos;
    var_name_pos_end = str.find('"',str_pos);

    if(string::npos == var_name_pos_end)
        return false;

    str_pos = var_name_pos_end;
    return true;
}
274
// Find the closing tag of a variable block, e.g. </Float32> for the var_type Float32,
// searching str from str_pos onwards.
//
// var_end_pos  output; position of the closing tag, or string::npos when it is absent.
//
// Returns true when the closing tag is found.
bool find_end_var_block(const string&str, const string&var_type, const size_t &str_pos, size_t &var_end_pos) {

    string closing_tag("</");
    closing_tag += var_type;
    closing_tag += '>';

    var_end_pos = str.find(closing_tag,str_pos);
    return string::npos != var_end_pos;

}
287
// Find the chunk information of a variable block.
// The chunk info must be confined by either <dmrpp:chunks and </dmrpp:chunks>
// or <dmrpp:chunk and />.
//
// str                   the supplemental dmrpp document.
// str_pos               where to start searching (inside the variable block).
// chunk_info_pos_start  output; start of the chunk info, moved back over the blanks
//                       that directly precede the opening mark.
// chunk_info_pos_end    output; position of the closing mark (the mark itself is excluded).
// var_end_pos           position of the closing tag of the variable block.
// is_mark1              output; true when the <dmrpp:chunks ... </dmrpp:chunks> form is
//                       used, false otherwise (also false when no opening mark is found).
//
// Returns true only when an opening mark and its closing mark are found and the closing
// mark lies before var_end_pos. When false is returned because a mark is missing, the
// missing position is string::npos.
//
// Note: <dmrpp:chunks found anywhere at or after str_pos takes precedence over
// <dmrpp:chunk , even if it belongs to a later variable block; the var_end_pos check
// then rejects it.
bool find_chunk_info(const string &str,const size_t&str_pos,size_t &chunk_info_pos_start, size_t &chunk_info_pos_end,const size_t&var_end_pos,bool & is_mark1){

    const string chunk_start_mark1 = "<dmrpp:chunks";
    const string chunk_end_mark1 = "</dmrpp:chunks>";
    const string chunk_start_mark2 = "<dmrpp:chunk ";
    const string chunk_end_mark2 = "/>";
    const char wspace=' ';

    size_t mark_pos = str.find(chunk_start_mark1,str_pos);
    is_mark1 = (string::npos != mark_pos);
    if(false == is_mark1)
        mark_pos = str.find(chunk_start_mark2,str_pos);

    // Neither form of the opening mark exists: report it instead of doing
    // arithmetic on string::npos.
    if(string::npos == mark_pos) {
        chunk_info_pos_start = string::npos;
        chunk_info_pos_end = string::npos;
        return false;
    }

    // The closing mark must come after the opening mark. Searching from the opening
    // mark avoids picking up an earlier "/>" such as the one of a <Dim .../> element.
    chunk_info_pos_end = str.find(is_mark1?chunk_end_mark1:chunk_end_mark2,mark_pos);

    // Include the blanks in front of the opening mark so that the indentation is kept.
    chunk_info_pos_start = mark_pos;
    while(chunk_info_pos_start > 0 && wspace == str[chunk_info_pos_start-1])
        chunk_info_pos_start--;

    if(string::npos == chunk_info_pos_end)
        return false;

    // The chunk info must be inside the current variable block.
    return chunk_info_pos_end < var_end_pos;
}
333
334// We need to add the supplemental file path to the chunk info.
335// The file name usually starts with "name= ..." and the path usually starts with dmrpp:href="
336bool add_faddr_chunk_info(const string &str,vector<string>& chunk_info,bool is_dmrpp_mark1, const string faddr_source) {
337
338 bool well_formed= true;
339 if(chunk_info.size()==0)
340 return true;
341 string addr_mark = "dmrpp:href=\"";
342
343 // The missing DMRPP file can have file address specified along with chunk info.
344 // But we assume if they do this for one chunk, they should do this for all chunks.
345 // If this is the case, no need to find address.
346 if(chunk_info[0].find(addr_mark)!=string::npos)
347 return true;
348
349 // retrieve name and reference
350 string hdf5_fname;
351 string hdf5_faddr;
352 string name_mark = " name=\"";
353 string end_delim1 ="\"";
354
355 // We must find a valid hdf5 file name.
356 size_t hdf5_fname_start_pos = str.find(name_mark);
357 if(hdf5_fname_start_pos == string::npos)
358 well_formed = false;
359 size_t hdf5_fname_end_pos = str.find(end_delim1,hdf5_fname_start_pos+name_mark.size());
360 if(hdf5_fname_end_pos == string::npos)
361 well_formed = false;
362 hdf5_fname = str.substr(hdf5_fname_start_pos+name_mark.size(),hdf5_fname_end_pos-hdf5_fname_start_pos-name_mark.size());
363 if(hdf5_fname=="")
364 well_formed = false;
365
366 // We also must find a valid file location .
367 size_t hdf5_faddr_start_pos = str.find(addr_mark);
368 if(hdf5_faddr_start_pos != string::npos) {
369 size_t hdf5_faddr_end_pos = str.find(end_delim1,hdf5_faddr_start_pos+addr_mark.size());
370 if(hdf5_faddr_end_pos == string::npos)
371 well_formed = false;
372 hdf5_faddr = str.substr(hdf5_faddr_start_pos+addr_mark.size(),hdf5_faddr_end_pos-hdf5_faddr_start_pos-addr_mark.size());
373 }
374
375 // The string for use in each missing_variable <chunk href:"value" >
376 hdf5_faddr = " href=\"" + faddr_source + end_delim1;
377
378 /*if (hdf5_faddr.rfind(hdf5_fname) == string::npos) {
379 //trim hdf5 file address.
380 hdf5_faddr = " href=\"" +hdf5_faddr+'/'+hdf5_fname+end_delim1;
381 }
382 else {
383 hdf5_faddr = " href=\"" +hdf5_faddr+end_delim1;
384 }*/
385
386//cout<<"hdf5_faddr is "<<hdf5_faddr <<endl;
387
388 for (size_t i = 0;i<chunk_info.size();i++) {
389
390 //If is_dmrpp_mark1 is true,
391 //add hdf5_faddr to each chunk line(The chunk line should have offset==)
392 //However, the variable may also use the contiguous storage.
393 //That chunk line marks with (nbyte==). Essentially it is not a chunk but
394 //the dmrpp still starts with the dmrpp:chunk.
395 if(true == is_dmrpp_mark1)
396 add_faddr_chunk_comp_lines(chunk_info[i],hdf5_faddr);
397 else
398 add_faddr_contig_line(chunk_info[i],hdf5_faddr);
399
400 }
401 return well_formed;
402
403}
404
// Add the file address to every chunk line of a <dmrpp:chunks> ... </dmrpp:chunks> block.
//
// chunk_info  the chunk info of one variable; modified in place.
// file_addr   the text inserted right in front of the "/>" of every line that starts
//             with <dmrpp:chunk offset= .
//
// Every rewritten line except the last gets file_addr directly in front of "/>"; for the
// last one a blank is put between file_addr and "/>".
//
// Returns true when at least one chunk line was rewritten, every chunk line ends with
// "/>" and the block is terminated by </dmrpp:chunks>. When no chunk line can be
// rewritten, chunk_info is left unchanged. When a later chunk line or the closing
// </dmrpp:chunks> is missing, false is returned but the remaining text is still kept, so
// chunk_info is never left truncated.
bool add_faddr_chunk_comp_lines(string & chunk_info,const string &file_addr) {

    const string chunk_line_mark = "<dmrpp:chunk offset=";
    const string chunk_line_end_mark = "/>";
    const string chunk_stop_mark = "</dmrpp:chunks>";

    string patched;
    size_t copied_to = 0;   // everything in front of this position is already in patched
    bool found_chunk_line = false;
    bool well_formed = true;

    for(;;) {
        const size_t line_pos = chunk_info.find(chunk_line_mark,copied_to);
        if(string::npos == line_pos) {
            // No more chunk lines: what is left must hold the closing </dmrpp:chunks>.
            if(string::npos == chunk_info.find(chunk_stop_mark,copied_to))
                well_formed = false;
            break;
        }

        // Each chunk offset line must end with "/>".
        const size_t line_end_pos = chunk_info.find(chunk_line_end_mark,line_pos);
        if(string::npos == line_end_pos) {
            well_formed = false;
            break;
        }

        patched.append(chunk_info,copied_to,line_end_pos-copied_to);
        patched += file_addr;
        copied_to = line_end_pos;
        found_chunk_line = true;
    }

    if(false == found_chunk_line)
        return false;

    // Add the last part of the chunk info, even when it is malformed, so that no text
    // is lost. Note: a blank goes between the file address and the "/>".
    patched += ' ';
    patched.append(chunk_info,copied_to,string::npos);
    chunk_info = patched;

    return well_formed;

}
453
// Add the file address to a single-line chunk info (the <dmrpp:chunk .../> form).
//
// chunk_info  the chunk info of one variable; modified in place.
// file_addr   the text to add.
//
// file_addr followed by one blank is placed right in front of the first "/>" found in
// chunk_info. Returns false, leaving chunk_info untouched, when there is no "/>".
bool add_faddr_contig_line(string &chunk_info,const string &file_addr) {

    const size_t close_pos = chunk_info.find("/>");
    if(string::npos == close_pos)
        return false;

    chunk_info.insert(close_pos,file_addr+' ');
    return true;
}
475
476// Add the missing info to the original dmrpp file.
477bool add_missing_info_to_file(const string &fname,const vector<string> &var_types,const vector<string> &var_names,const vector<string> &chunk_info_list) {
478
479 bool well_formed = true;
480 string dmrpp_str;
481
482 // The original dmrpp file to string
483 file_to_string(fname,dmrpp_str);
484 vector<string>dmrpp_str_vec;
485 vector <int> block_index;
486
487 // Convert the original DMRPP string to vector according to var_types and var_names.
488 // We need to remember the block index of the missing variables
489 // since the missing variable order in the supplemental dmrpp
490 // may be different than the original one..
491 well_formed = convert_dmrppstr_to_vec(dmrpp_str,dmrpp_str_vec,var_types,var_names,block_index);
492
493 // Release the memory of dmpstr. For a >10MB dmrpp file, this is not a small value.
494 string().swap(dmrpp_str);
495
496 // adding the missing chunk info to the dmrpp vector and then write back to the file.
497 if(true == well_formed) {
498 add_missing_info_to_vec(dmrpp_str_vec,chunk_info_list,block_index);
499 write_vec_to_file(fname,dmrpp_str_vec);
500 }
501 return well_formed;
502
503}
504
505// Convert the original dmrpp to vectors according to the *missing* variables.
506// Here we should NOT tokenize the orginal dmrpp according to every variable in it.
507// We only care about feeding those variables that miss the value information.
508bool convert_dmrppstr_to_vec(const string &dmrpp_str,vector<string> &dmrpp_str_vec,const vector<string> &var_types,const vector<string> &var_names,vector<int>&block_index){
509
510 vector<string>block_begin;
511 block_begin.resize(var_types.size());
512 vector<string>block_end;
513 block_end.resize(var_types.size());
514 gen_block(var_types,var_names,block_begin,block_end);
515
516#if 0
517for(size_t i =0; i<block_begin.size();i++)
518{
519cout<<"block_begin["<<i<<"]= "<<block_begin[i]<<endl;
520cout<<"block_end["<<i<<"]= "<<block_end[i]<<endl;
521
522}
523#endif
524
525 bool well_formed = split_string(dmrpp_str,dmrpp_str_vec,block_begin,block_end,block_index);
526 return well_formed;
527
528}
529
// Insert the chunk info into the variable blocks of the split dmrpp document.
//
// dmrpp_str_vec    the pieces of the original document. The odd entries (1, 3, 5, ...)
//                  are the variable blocks that miss the value information; the even
//                  entries are the text around them. An example:
//                    The original string: Moses gre up i Egypt.
//                    The missing information is w in 'gre' and n in 'i'.
//                    The pieces are "Moses ","gre"," up ","i"," Egypt." and the final
//                    string can then be "Moses grew up in Egypt."
// chunk_info_list  the chunk info of the missing variables.
// block_index      block_index[i] selects the chunk_info_list entry for the i-th block.
//
// A newline followed by the chunk info is inserted right after the second to last '>'
// of each block, i.e. in front of the text that ends with the last '>'.
void add_missing_info_to_vec(vector<string> &dmrpp_str_vec,const vector<string> &chunk_info_list,const vector<int> &block_index) {

    const char tag_close = '>';

    for (size_t n = 0; n < block_index.size(); ++n) {

        string &var_block = dmrpp_str_vec[2*n+1];

        // The last '>' closes the block; the '>' in front of it marks the insert position.
        const size_t final_close = var_block.rfind(tag_close);
        const size_t prev_close = var_block.rfind(tag_close,final_close-1);

        // The block_index[n] will ensure the right chunk info.
        var_block.insert(prev_close+1,'\n'+chunk_info_list[block_index[n]]);
    }

}
561
// The final step: write the pieces of the dmrpp document, one after another and without
// any separator, to the file fname. Since the dmrpp is relatively small, rewriting
// the whole file is still the fast way.
void write_vec_to_file(const string &fname,const vector<string> &dmrpp_str_vec) {

    ofstream out_file(fname.c_str());

    for (vector<string>::const_iterator piece = dmrpp_str_vec.begin(); piece != dmrpp_str_vec.end(); ++piece)
        out_file << *piece;

    out_file.close();

}
576
// Build the opening and the closing tag of every variable block.
// For the type Float32 and the name lat these are <Float32 name="lat"> and </Float32>.
//
// block_begin and block_end are outputs; they must already hold at least as many
// elements as var_type_list, because the entries are assigned by index.
void gen_block(const vector<string>&var_type_list,const vector<string>&var_name_list,vector<string>&block_begin,vector<string>&block_end) {

    for (size_t n = 0; n < var_type_list.size(); ++n) {

        string opening_tag("<");
        opening_tag += var_type_list[n];
        opening_tag += " name=\"";
        opening_tag += var_name_list[n];
        opening_tag += "\">";
        block_begin[n] = opening_tag;

        string closing_tag("</");
        closing_tag += var_type_list[n];
        closing_tag += '>';
        block_end[n] = closing_tag;
    }
}
585
586// Split the string into different blocks.
587bool split_string(const string & str, vector<string> &str_vec,const vector<string> &block_begin, const vector<string>&block_end,vector<int>&block_index) {
588
589 bool well_formed = true;
590 vector<size_t> block_begin_pos;
591 vector<size_t> block_end_pos;
592 block_begin_pos.resize(block_begin.size());
593 block_end_pos.resize(block_end.size());
594
595 // Note:
596 // 1) We just want to split the string according to the variables that miss values.
597 // 2) block_begin_pos in the orginal dmrpp file may NOT be sorted.
598 // However, when we read back the string vector, we want to read from beginnng to the end.
599 // So we need to remember the index of each <var block> of the supplemental dmrpp file
600 // in the original dmrpp file so that the correct chunk info can be given to the var block that misses the values.
601 for(size_t i = 0; i<block_begin.size();i++) {
602 block_begin_pos[i] = str.find(block_begin[i]);
603 block_end_pos[i] = str.find(block_end[i],block_begin_pos[i])+(block_end[i].size());
604 }
605
606 obtain_bindex_in_modified_string(block_begin_pos,block_index);
607
608#if 0
609for(size_t i = 0; i<block_index.size();i++)
610cout<<"block_index["<<i<<"] is: "<<block_index[i] <<endl;
611#endif
612 vector<size_t>block_pos;
613 block_pos.resize(2*block_begin_pos.size());
614 for (size_t i = 0; i<block_begin.size();i++) {
615 block_pos[2*i] = block_begin_pos[i];
616 block_pos[2*i+1] = block_end_pos[i];
617 }
618
619 // This will ensure the string vector is kept from beginning to the end.
620 sort(block_pos.begin(),block_pos.end());
621
622 // Use a set: resume a different set, compare with the previous one. set_difference
623 // This will ensure that each <var block> doesn't overlap with others.
624 // It is a sanity check.
625 well_formed = check_overlap_intervals(block_pos,block_begin_pos);
626
627 // We need to consider the starting and the ending of the string
628 // So the string vector size is block_size + 1.
629 // Examples:
630 // string: Moses grew up in Egypt. It has four space intervals but five substrings.
631 if(true == well_formed) {
632 size_t str_block_pos = 0;
633 str_vec.resize(block_pos.size()+1);
634 for (size_t i =0; i<block_pos.size(); i++) {
635 str_vec[i] = str.substr(str_block_pos,block_pos[i]-str_block_pos);
636 str_block_pos = block_pos[i];
637 }
638 str_vec[block_pos.size()] = str.substr(str_block_pos);
639
640#if 0
641for(size_t i = 0; i <str_vec.size();i++)
642 cout<<"str_vec["<<i<<"] is: "<<str_vec[i] <<endl;
643#endif
644 }
645 return well_formed;
646
647}
648
// Check that no two var blocks overlap.
//
// sort_block_pos   the begin and end positions of all blocks, sorted in ascending order;
//                  it must hold at least two entries per block.
// block_pos_start  the begin positions of the blocks (unsorted).
//
// If the blocks do not overlap, begin and end positions alternate in the sorted
// sequence, so the entries at the even indexes are exactly the begin positions.
// Comparing these two sets takes O(nlogn) rather than O(n*n) time.
//
// Returns true when there is no overlap.
bool check_overlap_intervals(const vector<size_t> &sort_block_pos, const vector<size_t>&block_pos_start){

    const set<size_t> begin_positions(block_pos_start.begin(),block_pos_start.end());

    set<size_t> even_slot_positions;
    for (size_t slot = 0; slot < 2*block_pos_start.size(); slot += 2)
        even_slot_positions.insert(sort_block_pos[slot]);

    return begin_positions == even_slot_positions;

}
664
// Obtain the block index of the var blocks of the supplemental dmrpp file.
// We need to remember the index of a var block in the supplemental dmrpp file to correctly
// match the same var block in the original dmrpp file.
// An example:
// ex.h5.dmrpp has the variables in the order: ex1,ex2,lon,ex3,fakedim,lat.
// It misses the values of lon,fakedim,lat.
// In the supplemental dmrpp that has the value information, the variable order is lat,lon,fakedim.
// So the index of lat is 0, lon is 1 and fakedim is 2. Ordered by their position in
// ex.h5.dmrpp (lon,fakedim,lat) the indexes are 1,2,0. While adding the value info of the
// missing variables to ex.h5.dmrpp, the index identifies which chunk info to fill in,
// without explicitly searching the string again.
//
// block_pos_start  block_pos_start[i] is the position, in the original document, of block i.
// block_index      output; the indexes i are appended in ascending order of
//                  block_pos_start[i] (equal positions are ordered by index).
void obtain_bindex_in_modified_string(const vector<size_t>& block_pos_start, vector<int>& block_index) {

    typedef pair<size_t,int> PosAndIndex;

    vector<PosAndIndex> ordered;
    ordered.reserve(block_pos_start.size());
    for (size_t n = 0; n < block_pos_start.size(); ++n)
        ordered.push_back(PosAndIndex(block_pos_start[n],n));

    // Pairs are compared by the position first.
    sort(ordered.begin(),ordered.end());

    for (vector<PosAndIndex>::const_iterator it = ordered.begin(); it != ordered.end(); ++it)
        block_index.push_back(it->second);
}
691
// Helper function: read the whole content of the file filename into out_str.
// There is no error reporting here; out_str is simply assigned whatever could be read
// from the stream.
void file_to_string(const string &filename, string &out_str) {

    ifstream in_file(filename.c_str());

    // Copy the stream buffer of the file in one go and hand over the collected text.
    ostringstream collected;
    collected << in_file.rdbuf();
    out_str = collected.str();

    in_file.close();

}
706
// Tokenize in_str into a vector of strings according to the delimiter delim.
// The tokens are appended to out_vec. Two adjacent delimiters yield an empty token, but
// nothing is appended for an empty in_str or for the empty text behind a trailing delimiter.
void string_tokenize(const string &in_str,const char delim,vector<string>&out_vec) {

    const size_t length = in_str.size();
    size_t token_start = 0;

    while (token_start < length) {
        const size_t delim_pos = in_str.find(delim,token_start);
        if(string::npos == delim_pos) {
            // The last token runs to the end of the string.
            out_vec.push_back(in_str.substr(token_start));
            break;
        }
        out_vec.push_back(in_str.substr(token_start,delim_pos-token_start));
        token_start = delim_pos+1;
    }
}
715