bes Updated for version 3.20.13
ScanElement.cc
1
2// This file is part of the "NcML Module" project, a BES module designed
// to allow NcML files to be used as a wrapper to add
4// AIS to existing datasets of any format.
5//
6// Copyright (c) 2009 OPeNDAP, Inc.
7// Author: Michael Johnson <m.johnson@opendap.org>
8//
9// For more information, please also see the main website: http://opendap.org/
10//
11// This library is free software; you can redistribute it and/or
12// modify it under the terms of the GNU Lesser General Public
13// License as published by the Free Software Foundation; either
14// version 2.1 of the License, or (at your option) any later version.
15//
16// This library is distributed in the hope that it will be useful,
17// but WITHOUT ANY WARRANTY; without even the implied warranty of
18// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19// Lesser General Public License for more details.
20//
21// You should have received a copy of the GNU Lesser General Public
22// License along with this library; if not, write to the Free Software
23// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24//
// Please see the files COPYING and COPYRIGHT for more information on the LGPL.
26//
27// You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
#include "config.h"

#include "ScanElement.h"

#include <algorithm> // std::sort
#include <cerrno>
#include <cstring>
#include <ctime> // gmtime_r, strftime
#include <dirent.h>
#include <iostream>
#include <sstream>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>

#include "AggregationElement.h"
#include "DirectoryUtil.h" // agg_util
#include "NCMLDebug.h"
#include "NCMLParser.h"
#include "NetcdfElement.h"
#include "RCObject.h"
#include "SimpleTimeParser.h"
#include "XMLHelpers.h"

#include <libdap/Error.h> // libdap

// ICU includes for the SimpleDateFormat used in this file only
#include <unicode/smpdtfmt.h> // class SimpleDateFormat
#include <unicode/timezone.h> // class TimeZone
57
60
61namespace ncml_module {
62const string ScanElement::_sTypeName = "scan";
63const vector<string> ScanElement::_sValidAttrs = getValidAttributes();
64
65// The rep for the opaque pointer in the header.
66struct ScanElement::DateFormatters {
67 DateFormatters() :
68 _pDateFormat(0), _pISO8601(0), _markPos(0), _sdfLen(0)
69 {
70 }
71 ~DateFormatters()
72 {
73 SAFE_DELETE(_pDateFormat);
74 SAFE_DELETE(_pISO8601);
75 }
76
77 // If we have a _dateFormatMark, we'll create a single
78 // instance of the icu SimpleDateFormat in order
79 // to process each file.
80 icu::SimpleDateFormat* _pDateFormat;
81 // We also will create a single instance of a format
82 // for ISO 8601 times for output into the coordinate
83 icu::SimpleDateFormat* _pISO8601;
84
85 // The position of the # mark in the date format string
86 // We match the preceding characters with the filename.
87 size_t _markPos;
88
89 // The length of the pattern we actually use for the
90 // simple date format. Thus the SDF pattern is
91 // the portion of the string from _markPos+1 to _sdfLen.
92 size_t _sdfLen;
93};
94
95ScanElement::ScanElement() :
96 RCObjectInterface(), NCMLElement(0), _location(""), _suffix(""), _regExp(""), _subdirs(""), _olderThan(""), _dateFormatMark(
97 ""), _enhance(""), _ncoords(""), _pParent(0), _pDateFormatters(0)
98{
99}
100
101ScanElement::ScanElement(const ScanElement& proto) :
102 RCObjectInterface(), NCMLElement(0), _location(proto._location), _suffix(proto._suffix), _regExp(proto._regExp), _subdirs(
103 proto._subdirs), _olderThan(proto._olderThan), _dateFormatMark(proto._dateFormatMark), _enhance(proto._enhance), _ncoords(
104 proto._ncoords), _pParent(proto._pParent) // weak ref so this is fair...
105 , _pDateFormatters(0)
106{
107 if (!_dateFormatMark.empty()) {
108 initSimpleDateFormats(_dateFormatMark);
109 }
110}
111
112ScanElement::~ScanElement()
113{
114 deleteDateFormats();
115 _pParent = 0;
116}
117
118AggregationElement*
119ScanElement::getParent() const
120{
121 return _pParent;
122}
123
124void ScanElement::setParent(AggregationElement* pParent)
125{
126 _pParent = pParent;
127}
128
129const string&
130ScanElement::getTypeName() const
131{
132 return _sTypeName;
133}
134
// Returns a new heap-allocated ScanElement copy-constructed from this one.
// NOTE(review): the return-type line of this definition is not present in
// this listing -- confirm it against the declaration in ScanElement.h.
ScanElement::clone() const
{
    return new ScanElement(*this);
}
140
141void ScanElement::setAttributes(const XMLAttributeMap& attrs)
142{
143 _location = attrs.getValueForLocalNameOrDefault("location", "");
144 _suffix = attrs.getValueForLocalNameOrDefault("suffix", "");
145 _regExp = attrs.getValueForLocalNameOrDefault("regExp", "");
146 _subdirs = attrs.getValueForLocalNameOrDefault("subdirs", "true");
147 _olderThan = attrs.getValueForLocalNameOrDefault("olderThan", "");
148 _dateFormatMark = attrs.getValueForLocalNameOrDefault("dateFormatMark", "");
149 _enhance = attrs.getValueForLocalNameOrDefault("enhance", "");
150 _ncoords = attrs.getValueForLocalNameOrDefault("ncoords", "");
151
152 // default is to print errors and throw which we want.
153 validateAttributes(attrs, _sValidAttrs);
154
155 // Until we implement them, we'll throw parse errors for those not yet implemented.
156 throwOnUnhandledAttributes();
157
158 // Create the SimpleDateFormat's if we have a _dateFormatMark
159 if (!_dateFormatMark.empty()) {
160 initSimpleDateFormats(_dateFormatMark);
161 }
162}
163
164void ScanElement::handleBegin()
165{
166 if (!_parser->isScopeAggregation()) {
167 THROW_NCML_PARSE_ERROR(line(), "ScanElement: " + toString() + " "
168 "was not the direct child of an <aggregation> element as required!");
169 }
170}
171
172void ScanElement::handleContent(const string& content)
173{
174 // shouldn't be any, use the super impl to throw if not whitespace.
175 NCMLElement::handleContent(content);
176}
177
178void ScanElement::handleEnd()
179{
180 // Get to the our parent aggregation so we can add
181 NetcdfElement* pCurrentDataset = _parser->getCurrentDataset();
182 VALID_PTR(pCurrentDataset);
183 AggregationElement* pParentAgg = pCurrentDataset->getChildAggregation();
184 NCML_ASSERT_MSG(pParentAgg, "ScanElement::handleEnd(): Couldn't"
185 " find the the child aggregation of the current dataset, which is "
186 "supposed to be our parent!");
187 pParentAgg->addScanElement(this);
188}
189
190string ScanElement::toString() const
191{
192 return "<" + _sTypeName + " " + "location=\"" + _location + "\" "
193 + // always print this one even in empty.
194 printAttributeIfNotEmpty("suffix", _suffix) + printAttributeIfNotEmpty("regExp", _regExp)
195 + printAttributeIfNotEmpty("subdirs", _subdirs) + printAttributeIfNotEmpty("olderThan", _olderThan)
196 + printAttributeIfNotEmpty("dateFormatMark", _dateFormatMark) + printAttributeIfNotEmpty("ncoords", _ncoords)
197 + ">";
198}
199
200const string&
201ScanElement::ncoords() const
202{
203 return _ncoords;
204}
205
206bool ScanElement::shouldScanSubdirs() const
207{
208 return (_subdirs == "true");
209}
210
211long ScanElement::getOlderThanAsSeconds() const
212{
213 if (_olderThan.empty()) {
214 return 0L;
215 }
216
217 long secs = 0;
218 bool success = agg_util::SimpleTimeParser::parseIntoSeconds(secs, _olderThan);
219 if (!success) {
220 THROW_NCML_PARSE_ERROR(line(), "Couldn't parse the olderThan attribute! Expect a string of the form: "
221 "\"%d %units\" where %d is a number and %units is a time unit string such as "
222 " \"hours\" or \"s\".");
223 }
224 else {
225 return secs;
226 }
227}
228
229void ScanElement::getDatasetList(vector<NetcdfElement*>& datasets) const
230{
231 // Use BES root as our root
232 DirectoryUtil scanner;
233 scanner.setRootDir(scanner.getBESRootDir());
234
235 BESDEBUG("ncml", "Scan will be relative to the BES root data path = " << scanner.getRootDir() << endl);
236
237 setupFilters(scanner);
238
239 vector<FileInfo> files;
240 //vector<FileInfo> dirs;
241 try // catch BES errors to give more context,,,,
242 {
243 // Call the right version depending on setting of subtree recursion.
244 if (shouldScanSubdirs()) {
245 scanner.getListingOfRegularFilesRecursive(_location, files);
246 }
247 else {
248 scanner.getListingForPath(_location, &files, 0);
249 }
250 }
251 catch (BESNotFoundError& ex) {
252 ostringstream oss;
253 oss << "In processing " << toString() << " we got a BESNotFoundError with msg=";
254 ex.dump(oss);
255 oss << " Perhaps a path is incorrect?" << endl;
256 THROW_NCML_PARSE_ERROR(line(), oss.str());
257 }
258
259 // Let the other exceptions percolate up... Internal errors
260 // and Forbidden are pretty clear and likely not a typo
261 // in the NCML like NotFound could be.
262
263 BESDEBUG("ncml", "Scan " << toString() << " returned matching regular files: " << endl);
264 if (files.empty()) {
265 BESDEBUG("ncml", "WARNING: No matching files found!" << endl);
266 }
267 else {
269 }
270
271 // Let the user know we're performing syntactic sugar with ncoords
272 // We'll let the other context decide whether its proper to use it.
273 if (!_ncoords.empty()) {
274 BESDEBUG("ncml",
275 "Scan has ncoords attribute specified: ncoords=" << _ncoords << " Will be inherited by all matching datasets!" << endl);
276 }
277
278 // Adapt the file list into a temp vector of NetcdfElements
279 // created from the parser's factory so they
280 // get added to its memory pool
281 // We use a temp vector since we need to sort the datasets
282 // before appending them to the output dataset vector.
283 XMLAttributeMap attrs;
284 vector<NetcdfElement*> scannedDatasets;
285 scannedDatasets.reserve(files.size());
286 // Now add them...
287 for (vector<FileInfo>::const_iterator it = files.begin(); it != files.end(); ++it) {
288 // start fresh
289 attrs.clear();
290
291 // The path to the file, relative to the BES root as needed.
292 attrs.addAttribute(XMLAttribute("location", it->getFullPath()));
293
294 // If the user has specified the ncoords sugar,
295 // pass it down into the netcdf element.
296 if (!_ncoords.empty()) {
297 attrs.addAttribute(XMLAttribute("ncoords", _ncoords));
298 }
299
300 // If there's a dateFormatMark, pull out the coordVal
301 // and add it to the attrs map since we want to use that and
302 // not the location for the new map vector.
303 if (!_dateFormatMark.empty()) {
304 string timeCoord = extractTimeFromFilename(it->basename());
305 BESDEBUG("ncml", "Got an ISO 8601 time from dateFormatMark: " << timeCoord << endl);
306 attrs.addAttribute(XMLAttribute("coordValue", timeCoord));
307 }
308
309 // Make the dataset using the parser so it's in the parser memory pool.
310 RCPtr<NCMLElement> dataset = _parser->_elementFactory.makeElement("netcdf", attrs, *_parser);
311 VALID_PTR(dataset.get());
312
313 // Up the ref count (since it's in an RCPtr) and add to the result vector
314 scannedDatasets.push_back(static_cast<NetcdfElement*>(dataset.refAndGet()));
315 }
316
317 // We have the scanned datasets in scannedDatasets vector now, so sort
318 // on location() or coordValue() depending on whether we have a dateFormatMark...
319 if (_dateFormatMark.empty()) // sort by location()
320 {
321 BESDEBUG("ncml", "Sorting scanned datasets by location()..." << endl);
322 std::sort(scannedDatasets.begin(), scannedDatasets.end(), NetcdfElement::isLocationLexicographicallyLessThan);
323 }
324 else // sort by coordValue()
325 {
326 BESDEBUG("ncml",
327 "Sorting scanned datasets by coordValue() since we got a dateFormatMark" " and the coordValue are ISO 8601 dates..." << endl);
328 std::sort(scannedDatasets.begin(), scannedDatasets.end(), NetcdfElement::isCoordValueLexicographicallyLessThan);
329 }
330
331 // Also, if there's a dateFormatMark, we want to specify that a new
332 // _CoordinateAxisType attribute be added with value "Time" (according to NcML Aggregations page)
333 if (!_dateFormatMark.empty()) {
334 VALID_PTR(getParent());
335 getParent()->setAggregationVariableCoordinateAxisType("Time");
336 }
337
338 // Now we can append the sorted local vector of datasets to the output.
339 // We need not worry about reference counts since the scannedDatasets
340 // has them red'd already and won't deref when it goes out of scope.
341 // We are merely transferring them to the ouput, so the
342 // refcount is still correct as is.
343 BESDEBUG("ncml", "Adding the sorted scanned datasets to the current aggregation list..." << endl);
344 datasets.reserve(datasets.size() + scannedDatasets.size());
345 datasets.insert(datasets.end(), scannedDatasets.begin(), scannedDatasets.end());
346}
347
348void ScanElement::setupFilters(agg_util::DirectoryUtil& scanner) const
349{
350 // If we have a suffix, set the filter.
351 if (!_suffix.empty()) {
352 BESDEBUG("ncml", "Scan will filter against suffix=\"" << _suffix << "\"" << endl);
353 scanner.setFilterSuffix(_suffix);
354 }
355
356 if (!_regExp.empty()) {
357 BESDEBUG("ncml", "Scan will filter against the regExp=\"" << _regExp << "\"" << endl);
358
359 // If there's a problem compiling it, we'll know now.
360 // So catch it and wrap it as a parse error, which tecnically it is.
361 try {
362 scanner.setFilterRegExp(_regExp);
363 }
364 catch (libdap::Error& err) {
365 THROW_NCML_PARSE_ERROR(line(),
366 "There was a problem compiling the regExp=\"" + _regExp + "\" : " + err.get_error_message());
367 }
368 }
369
370 if (!_olderThan.empty()) {
371 long secs = getOlderThanAsSeconds();
372 struct timeval tvNow;
373 gettimeofday(&tvNow, 0);
374 long cutoffTime = tvNow.tv_sec - secs;
375 scanner.setFilterModTimeOlderThan(static_cast<time_t>(cutoffTime));
376 BESDEBUG("ncml",
377 "Setting scan filter modification time using duration: " << secs << " from the olderThan attribute=\"" << _olderThan << "\"" " The cutoff modification time based on now is: " << getTimeAsString(cutoffTime) << endl);
378 }
379}
380
// SimpleDateFormat pattern that renders a date as ISO 8601 with a literal
// 'T' separator and a literal trailing 'Z', e.g. 2009-12-31T23:59:59Z.
static const std::string ISO_8601_FORMAT("yyyy-MM-dd'T'HH:mm:ss'Z'");
383
388static bool convertUnicodeStringToStdString(std::string& toString, const icu::UnicodeString& fromUniString)
389{
390 // This call exists in 4.2 but not 4.0 or 3.6
391 // TODO use this call if we up our minimum ICU version
392 // fromUniString.toUTF8String(toString);
393
394 toString = ""; // empty it in case of error
395 vector<char> buffer; // std::string element[0] isn't guaranteed contiguous like vectors, so we need a temp...
396 buffer.resize(fromUniString.length() + 1); // +1 for NULL terminator
397 UErrorCode errorCode = U_ZERO_ERROR;
398 int32_t patternLen = fromUniString.extract(buffer.data(), buffer.size(), 0, errorCode);
399 if (patternLen >= static_cast<int32_t>(buffer.size()) || U_FAILURE(errorCode)) {
400 return false;
401 }
402 else {
403 toString = std::string(buffer.data());
404 return true;
405 }
406}
407
408void ScanElement::initSimpleDateFormats(const std::string& dateFormatMark)
409{
410 // Make sure no accidental leaks
411 deleteDateFormats();
412 _pDateFormatters = new DateFormatters;
413 VALID_PTR(_pDateFormatters);
414
415 _pDateFormatters->_markPos = dateFormatMark.find_last_of("#");
416 if (_pDateFormatters->_markPos == string::npos) {
417 THROW_NCML_PARSE_ERROR(line(), "The scan@dateFormatMark attribute did not contain"
418 " a marking # character before the date format!"
419 " dateFormatMark=\"" + dateFormatMark + "\"");
420 }
421
422 // Get just the portion that is the SDF string
423 string dateFormat = dateFormatMark.substr(_pDateFormatters->_markPos + 1, string::npos);
424 BESDEBUG("ncml", "Using a date format of: " << dateFormat << endl);
425 icu::UnicodeString usDateFormat(dateFormat.c_str());
426
427 // Cache the length of the pattern for later substr calcs.
428 _pDateFormatters->_sdfLen = dateFormat.size();
429
430 // Try to make the formatter from the user given string
431 UErrorCode success = U_ZERO_ERROR;
432 _pDateFormatters->_pDateFormat = new icu::SimpleDateFormat(usDateFormat, success);
433 if (U_FAILURE(success)) {
434 THROW_NCML_PARSE_ERROR(line(), "Scan element failed to parse the SimpleDateFormat pattern: " + dateFormat);
435 }
436 VALID_PTR(_pDateFormatters->_pDateFormat);
437 // Set it to the GMT timezone since we expect UTC times by default.
438 _pDateFormatters->_pDateFormat->setTimeZone(*(icu::TimeZone::getGMT()));
439
440 // Also create an ISO 8601 formatter for creating the coordValue's
441 // from the parsed UDate's.
442 _pDateFormatters->_pISO8601 = new icu::SimpleDateFormat(success);
443 if (U_FAILURE(success)) {
444 THROW_NCML_PARSE_ERROR(line(), "Scan element failed to create the ISO 8601 SimpleDateFormat"
445 " using the pattern " + ISO_8601_FORMAT);
446 }
447 VALID_PTR(_pDateFormatters->_pISO8601);
448 // We want to output UTC, so GMT as well.
449 _pDateFormatters->_pISO8601->setTimeZone(*(icu::TimeZone::getGMT()));
450 _pDateFormatters->_pISO8601->applyPattern(ISO_8601_FORMAT.c_str());
451}
452
453std::string ScanElement::extractTimeFromFilename(const std::string& filename) const
454{
455 VALID_PTR(_pDateFormatters);
456 VALID_PTR(_pDateFormatters->_pDateFormat);
457 VALID_PTR(_pDateFormatters->_pISO8601);
458
459 // Skip the first set of chars before the # mark (we don't care that
460 // they match, just the quantity).
461 string sdfPortion = filename.substr(_pDateFormatters->_markPos, _pDateFormatters->_sdfLen);
462
463 icu::UnicodeString usPattern;
464 _pDateFormatters->_pDateFormat->toPattern(usPattern);
465 string sdfPattern;
466 bool conversionSuccess = convertUnicodeStringToStdString(sdfPattern, usPattern);
467 NCML_ASSERT_MSG(conversionSuccess,
468 "ScanElement::extractTimeFromFilename: couldn't convert the UnicodeString date pattern to a std::string!");
469
470 BESDEBUG("ncml",
471 "Scan is now matching the date portion of the filename " << sdfPortion << " to the SimpleDateFormat=" "\"" << sdfPattern << "\"" << endl);
472
473 UErrorCode status = U_ZERO_ERROR;
474 UDate theDate = _pDateFormatters->_pDateFormat->parse(sdfPortion.c_str(), status);
475 if (U_FAILURE(status)) {
476 THROW_NCML_PARSE_ERROR(line(), "SimpleDateFormat could not parse the pattern="
477 "\"" + sdfPattern + "\""
478 " on the filename portion=" + "\"" + sdfPortion + "\""
479 " of the filename=" + "\"" + filename + "\""
480 " Either the pattern was invalid or the filename did not match.");
481 }
482
483 icu::UnicodeString usISODate;
484 _pDateFormatters->_pISO8601->format(theDate, usISODate);
485 string result;
486 conversionSuccess = convertUnicodeStringToStdString(result, usISODate);
487 NCML_ASSERT_MSG(conversionSuccess,
488 "ScanElement::extractTimeFromFilename: failed to convert the UnicodeString ISO date to a std::string!");
489 // usISODate.toUTF8String(result);
490 return result;
491}
492
493void ScanElement::deleteDateFormats() noexcept
494{
495 SAFE_DELETE(_pDateFormatters);
496}
497
498vector<string> ScanElement::getValidAttributes()
499{
500 vector<string> attrs;
501 attrs.push_back("location");
502 attrs.push_back("suffix");
503 attrs.push_back("regExp");
504 attrs.push_back("subdirs");
505 attrs.push_back("olderThan");
506 attrs.push_back("dateFormatMark");
507
508 // it's in the schema, but we don't support it yet.
509 // Will throw later.
510 attrs.push_back("enhance");
511
512 // OPeNDAP extension, syntactic sugar applied to all matches.
513 attrs.push_back("ncoords");
514
515 return attrs;
516}
517
518void ScanElement::throwOnUnhandledAttributes()
519{
520 if (!_enhance.empty()) {
521 THROW_NCML_PARSE_ERROR(line(), "ScanElement: Sorry, enhance attribute is not yet supported.");
522 }
523}
524
525std::string ScanElement::getTimeAsString(time_t theTime)
526{
527 struct tm* pTM = gmtime(&theTime);
528 char buf[128];
529 // this should be "Year-Month-Day Hour:Minute:Second"
530 strftime(buf, 128, "%F %T", pTM);
531 return string(buf);
532}
533
534}
error thrown if the resource requested cannot be found
void dump(std::ostream &strm) const override
Displays debug information about this object.
static std::string getBESRootDir()
void setFilterRegExp(const std::string &regexp)
static void printFileInfoList(std::ostream &os, const std::vector< FileInfo > &listing)
void setRootDir(const std::string &rootDir, bool allowRelativePaths=false, bool allowSymLinks=false)
void getListingOfRegularFilesRecursive(const std::string &path, std::vector< FileInfo > &rRegularFiles)
void setFilterSuffix(const std::string &suffix)
void getListingForPath(const std::string &path, std::vector< FileInfo > *pRegularFiles, std::vector< FileInfo > *pDirectories)
void setFilterModTimeOlderThan(time_t newestModTime)
const std::string & getRootDir() const
A reference to an RCObject which automatically ref() and deref() on creation and destruction.
Definition: RCObject.h:284
T * refAndGet() const
Definition: RCObject.h:353
static bool parseIntoSeconds(long &seconds, const std::string &duration)
void addScanElement(ScanElement *pScanner)
Concrete class for NcML <netcdf> element.
Definition: NetcdfElement.h:64
AggregationElement * getChildAggregation() const
const std::string getValueForLocalNameOrDefault(const std::string &localname, const std::string &defVal="") const
Definition: XMLHelpers.cc:181
void addAttribute(const XMLAttribute &attribute)
Definition: XMLHelpers.cc:167
NcML Parser for adding/modifying/removing metadata (attributes) to existing local datasets using NcML...