bes Updated for version 3.20.13
url_impl.cc
1
2// -*- mode: c++; c-basic-offset:4 -*-
3
4// This file is part of the BES http package, part of the Hyrax data server.
5
6// Copyright (c) 2020 OPeNDAP, Inc.
7// Author: Nathan Potter <ndp@opendap.org>
8//
9// This library is free software; you can redistribute it and/or
10// modify it under the terms of the GNU Lesser General Public
11// License as published by the Free Software Foundation; either
12// version 2.1 of the License, or (at your option) any later version.
13//
14// This library is distributed in the hope that it will be useful,
15// but WITHOUT ANY WARRANTY; without even the implied warranty of
16// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17// Lesser General Public License for more details.
18//
19// You should have received a copy of the GNU Lesser General Public
20// License along with this library; if not, write to the Free Software
21// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22//
23// You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
24
25// Authors:
26// ndp Nathan Potter <ndp@opendap.org>
27
28#include "config.h"
29
30#include <string>
31#include <sstream>
32#include <map>
33#include <vector>
34#include <algorithm>
35#include <cctype>
36#include <functional>
37#include <time.h>
38
39#include "BESDebug.h"
40#include "BESUtil.h"
41#include "BESCatalogList.h"
42#include "HttpNames.h"
43
44#include "url_impl.h"
45
46using namespace std;
47using std::chrono::system_clock;
48
49#define MODULE HTTP_MODULE
50#define prolog string("url::").append(__func__).append("() - ")
51
52#define PROTOCOL_KEY "http_url_protocol"
53#define HOST_KEY "http_url_host"
54#define PATH_KEY "http_url_path"
55#define QUERY_KEY "http_url_query"
56#define SOURCE_URL_KEY "http_url_target_url"
57#define INGEST_TIME_KEY "http_url_ingest_time"
58
59
60namespace http {
61
62#if 0
67url::url(const map<string,string> &kvp)
68{
69 map<string,string> kvp_copy = kvp;
70 map<string,string>::const_iterator it;
71 map<string,string>::const_iterator itc;
72
73 it = kvp.find(PROTOCOL_KEY);
74 itc = kvp_copy.find(PROTOCOL_KEY);
75 if(it != kvp.end() && itc != kvp_copy.end()){
76 d_protocol = it->second;
77 kvp_copy.erase(it->first);
78 BESDEBUG(MODULE, prolog << "Located PROTOCOL_KEY(" << PROTOCOL_KEY << ") value: " << d_protocol << endl);
79 }
80 it = kvp.find(HOST_KEY);
81 itc = kvp_copy.find(HOST_KEY);
82 if(it != kvp.end() && itc != kvp_copy.end()){
83 d_host = it->second;
84 kvp_copy.erase(it->first);
85 BESDEBUG(MODULE, prolog << "Located HOST_KEY(" << HOST_KEY << ") value: " << d_host << endl);
86 }
87 it = kvp.find(PATH_KEY);
88 itc = kvp_copy.find(PATH_KEY);
89 if(it != kvp.end() && itc != kvp_copy.end()){
90 d_path = it->second;
91 kvp_copy.erase(it->first);
92 BESDEBUG(MODULE, prolog << "Located PATH_KEY(" << PATH_KEY << ") value: " << d_path << endl);
93 }
94 it = kvp.find(QUERY_KEY);
95 itc = kvp_copy.find(QUERY_KEY);
96 if(it != kvp.end() && itc != kvp_copy.end()){
97 d_query = it->second;
98 kvp_copy.erase(it->first);
99 BESDEBUG(MODULE, prolog << "Located QUERY_KEY(" << QUERY_KEY << ") value: " << d_query << endl);
100 }
101 it = kvp.find(SOURCE_URL_KEY);
102 itc = kvp_copy.find(SOURCE_URL_KEY);
103 if(it != kvp.end() && itc != kvp_copy.end()){
104 d_source_url_str = it->second;
105 kvp_copy.erase(it->first);
106 BESDEBUG(MODULE, prolog << "Located SOURCE_URL_KEY(" << SOURCE_URL_KEY << ") value: " << d_source_url_str << endl);
107 }
108
109 for(itc = kvp_copy.begin(); itc != kvp_copy.end(); itc++){
110 string key = itc->first;
111 string value = itc->second;
112 map<string, vector<string>* >::const_iterator record_it;
113 record_it = d_query_kvp.find(key);
114 if(record_it != d_query_kvp.end()){
115 vector<string> *values = record_it->second;
116 values->push_back(value);
117 }
118 else {
119 vector<string> *values = new vector<string>();
120 values->push_back(value);
121 d_query_kvp.insert(pair<string, vector<string>*>(key, values));
122 }
123 }
124
125}
126#endif
127
131url::~url()
132{
133 if(!d_query_kvp.empty()){
134 map<string, vector<string>* >::const_iterator it;
135 for(it = d_query_kvp.begin() ; it != d_query_kvp.end(); it++){
136 delete it->second;
137 }
138 }
139}
140
141
148void url::parse() {
149 const string protocol_end("://");
150 BESDEBUG(MODULE, prolog << "BEGIN (parsing: '" << d_source_url_str << "')" << endl);
151
152 // If the supplied string does not start with a protocol, we assume it must be a
153 // path relative the BES.Catalog.catalog.RootDirectory because that's the only
154 // thing we are going to allow, even when it starts with slash '/'. Basically
155 // we force it to be in the BES.Catalog.catalog.RootDirectory tree.
156 if(d_source_url_str.find(protocol_end) == string::npos){
157 // Since we want a valid path in the file system tree for data, we make it so by adding
158 // the file path that starts with the catalog root dir.
160 string default_catalog_name = bcl->default_catalog_name();
161 BESDEBUG(MODULE, prolog << "Searching for catalog: " << default_catalog_name << endl);
162 BESCatalog *bcat = bcl->find_catalog(default_catalog_name);
163 if (bcat) {
164 BESDEBUG(MODULE, prolog << "Found catalog: " << bcat->get_catalog_name() << endl);
165 } else {
166 string msg = "OUCH! Unable to locate default catalog!";
167 BESDEBUG(MODULE, prolog << msg << endl);
168 throw BESInternalError(msg, __FILE__, __LINE__);
169 }
170 string catalog_root = bcat->get_root();
171 BESDEBUG(MODULE, prolog << "Catalog root: " << catalog_root << endl);
172
173 string file_path = BESUtil::pathConcat(catalog_root,d_source_url_str);
174 if(file_path[0] != '/')
175 file_path = "/" + file_path;
176 d_source_url_str = FILE_PROTOCOL + file_path;
177 }
178
179 const string parse_url_target(d_source_url_str);
180
181 string::const_iterator prot_i = search(parse_url_target.begin(), parse_url_target.end(),
182 protocol_end.begin(), protocol_end.end());
183
184 if (prot_i != parse_url_target.end())
185 advance(prot_i, protocol_end.length());
186
187 d_protocol.reserve(distance(parse_url_target.begin(), prot_i));
188 transform(parse_url_target.begin(), prot_i,
189 back_inserter(d_protocol),
190 [](int c) { return tolower(c); }); // protocol is icase
191 if (prot_i == parse_url_target.end())
192 return;
193
194 if (d_protocol == FILE_PROTOCOL) {
195 d_path = parse_url_target.substr(d_protocol.length());
196 BESDEBUG(MODULE, prolog << "FILE_PROTOCOL d_path: " << d_path << endl);
197 }
198 else if( d_protocol == HTTP_PROTOCOL || d_protocol == HTTPS_PROTOCOL){
199 string::const_iterator path_i = find(prot_i, parse_url_target.end(), '/');
200 d_host.reserve(distance(prot_i, path_i));
201 transform(prot_i, path_i,
202 back_inserter(d_host),
203 [](int c) { return tolower(c); });// host is icase
204 string::const_iterator query_i = find(path_i, parse_url_target.end(), '?');
205 d_path.assign(path_i, query_i);
206 if (query_i != parse_url_target.end())
207 ++query_i;
208 d_query.assign(query_i, parse_url_target.end());
209
210 if (!d_query.empty()) {
211 vector<string> records;
212 string delimiters = "&";
213 BESUtil::tokenize(d_query, records, delimiters);
214 vector<string>::iterator i = records.begin();
215 for (; i != records.end(); i++) {
216 size_t index = i->find('=');
217 if (index != string::npos) {
218 string key = i->substr(0, index);
219 string value = i->substr(index + 1);
220 BESDEBUG(MODULE, prolog << "key: " << key << " value: " << value << endl);
221 map<string, vector<string> *>::const_iterator record_it;
222 record_it = d_query_kvp.find(key);
223 if (record_it != d_query_kvp.end()) {
224 vector<string> *values = record_it->second;
225 values->push_back(value);
226 } else {
227 vector<string> *values = new vector<string>();
228 values->push_back(value);
229 d_query_kvp.insert(pair<string, vector<string> *>(key, values));
230 }
231 }
232 }
233 }
234 }
235 else {
236 stringstream msg;
237 msg << prolog << "Unsupported URL protocol " << d_protocol << " found in URL: " << d_source_url_str;
238 BESDEBUG(MODULE, msg.str() << endl);
239 throw BESInternalError(msg.str(), __FILE__, __LINE__);
240 }
241 BESDEBUG(MODULE, prolog << "END (parsing: '" << d_source_url_str << "')" << endl);
242
243}
244
245
251string url::query_parameter_value(const string &key) const
252{
253 string value;
254 map<string, vector<string>* >::const_iterator it;
255 it = d_query_kvp.find(key);
256 if(it != d_query_kvp.end()){
257 vector<string> *values = it->second;
258 if(!values->empty()){
259 value = (*values)[0];
260 }
261 }
262 return value;
263}
264
270void url::query_parameter_values(const string &key, vector<string> &values) const
271{
272 map<string, vector<string>* >::const_iterator it;
273 it = d_query_kvp.find(key);
274 if(it != d_query_kvp.end()){
275 values = *it->second;
276 }
277}
278
279#if 0
280
285void url::kvp(map<string,string> &kvp){
286 stringstream ss;
287
288 // Do the basic stuff
289 kvp.insert(pair<string,string>(PROTOCOL_KEY, d_protocol));
290 kvp.insert(pair<string,string>(HOST_KEY, d_host));
291 kvp.insert(pair<string,string>(PATH_KEY, d_path));
292 kvp.insert(pair<string,string>(QUERY_KEY, d_query));
293 kvp.insert(pair<string,string>(SOURCE_URL_KEY, d_source_url_str));
294 ss << d_ingest_time;
295 kvp.insert(pair<string,string>(INGEST_TIME_KEY,ss.str()));
296
297 // Now grab the query string. Only the first value of multi valued keys is used.
298 map<string, vector<string>* >::const_iterator it;
299 for(it=d_query_kvp.begin(); it != d_query_kvp.end(); it++){
300 kvp.insert(pair<string,string>(it->first,(*it->second)[0]));
301 }
302}
303#endif
304
312{
313
314 bool stale;
315 std::time_t now = system_clock::to_time_t(system_clock::now());
316
317 BESDEBUG(MODULE, prolog << "now: " << now << endl);
318 // We set the expiration time to the default, in case other avenues don't work out so well.
319 std::time_t expires_time = ingest_time() + HTTP_EFFECTIVE_URL_DEFAULT_EXPIRES_INTERVAL;
320
321 string cf_expires = query_parameter_value(CLOUDFRONT_EXPIRES_HEADER_KEY);
322 string aws_expires_str = query_parameter_value(AMS_EXPIRES_HEADER_KEY);
323
324 if(!cf_expires.empty()){ // CloudFront expires header?
325 std::istringstream(cf_expires) >> expires_time;
326 BESDEBUG(MODULE, prolog << "Using "<< CLOUDFRONT_EXPIRES_HEADER_KEY << ": " << expires_time << endl);
327 }
328 else if(!aws_expires_str.empty()){
329
330 long long aws_expires;
331 std::istringstream(aws_expires_str) >> aws_expires;
332 // AWS Expires header?
333 //
334 // By default we'll use the time we made the URL object, ingest_time
335 std::time_t aws_start_time = ingest_time();
336
337 // But if there's an AWS Date we'll parse that and compute the time
338 // @TODO move to NgapApi::decompose_url() and add the result to the map
339 string aws_date = query_parameter_value(AWS_DATE_HEADER_KEY);
340
341 if(!aws_date.empty()){
342
343 string date = aws_date; // 20200624T175046Z
344 string year = date.substr(0,4);
345 string month = date.substr(4,2);
346 string day = date.substr(6,2);
347 string hour = date.substr(9,2);
348 string minute = date.substr(11,2);
349 string second = date.substr(13,2);
350
351 BESDEBUG(MODULE, prolog << "date: "<< date <<
352 " year: " << year << " month: " << month << " day: " << day <<
353 " hour: " << hour << " minute: " << minute << " second: " << second << endl);
354
355 std::time_t old_now;
356 time(&old_now); /* get current time; same as: timer = time(NULL) */
357 BESDEBUG(MODULE, prolog << "old_now: " << old_now << endl);
358 struct tm *ti = gmtime(&old_now);
359 ti->tm_year = stoll(year) - 1900;
360 ti->tm_mon = stoll(month) - 1;
361 ti->tm_mday = stoll(day);
362 ti->tm_hour = stoll(hour);
363 ti->tm_min = stoll(minute);
364 ti->tm_sec = stoll(second);
365
366 BESDEBUG(MODULE, prolog << "ti->tm_year: "<< ti->tm_year <<
367 " ti->tm_mon: " << ti->tm_mon <<
368 " ti->tm_mday: " << ti->tm_mday <<
369 " ti->tm_hour: " << ti->tm_hour <<
370 " ti->tm_min: " << ti->tm_min <<
371 " ti->tm_sec: " << ti->tm_sec << endl);
372
373
374 aws_start_time = mktime(ti);
375 BESDEBUG(MODULE, prolog << "AWS start_time (computed): " << aws_start_time << endl);
376 }
377
378 expires_time = aws_start_time + aws_expires;
379 BESDEBUG(MODULE, prolog << "Using "<< AMS_EXPIRES_HEADER_KEY << ": " << aws_expires <<
380 " (expires_time: " << expires_time << ")" << endl);
381 }
382 std::time_t remaining = expires_time - now;
383 BESDEBUG(MODULE, prolog << "expires_time: " << expires_time <<
384 " remaining: " << remaining <<
385 " threshold: " << HTTP_URL_REFRESH_THRESHOLD << endl);
386
387 stale = remaining < HTTP_URL_REFRESH_THRESHOLD;
388 BESDEBUG(MODULE, prolog << "stale: " << (stale?"true":"false") << endl);
389
390 return stale;
391}
392
397string url::dump(){
398 stringstream ss;
399 string indent_inc = " ";
400 string indent = indent_inc;
401
402 ss << "http::url [" << this << "] " << endl;
403 ss << indent << "d_source_url_str: " << d_source_url_str << endl;
404 ss << indent << "d_protocol: " << d_protocol << endl;
405 ss << indent << "d_host: " << d_host << endl;
406 ss << indent << "d_path: " << d_path << endl;
407 ss << indent << "d_query: " << d_query << endl;
408
409 std::map<std::string, std::vector<std::string>* >::iterator it;
410
411 string idt = indent+indent_inc;
412 for(it=d_query_kvp.begin(); it !=d_query_kvp.end(); it++){
413 ss << indent << "d_query_kvp["<<it->first<<"]: " << endl;
414 std::vector<std::string> *values = it->second;
415 for(size_t i=0; i<values->size(); i++){
416 ss << idt << "value[" << i << "]: " << (*values)[i] << endl;
417 }
418 }
419 ss << indent << "d_ingest_time: " << d_ingest_time.time_since_epoch().count() << endl;
420 return ss.str();
421}
422
423} // namespace http
List of all registered catalogs.
virtual std::string default_catalog_name() const
The name of the default catalog.
static BESCatalogList * TheCatalogList()
Get the singleton BESCatalogList instance.
Catalogs provide a hierarchical organization for data.
Definition: BESCatalog.h:51
virtual std::string get_root() const =0
virtual std::string get_catalog_name() const
Get the name for this catalog.
Definition: BESCatalog.h:102
exception thrown if internal error encountered
static void tokenize(const std::string &str, std::vector< std::string > &tokens, const std::string &delimiters="/")
Definition: BESUtil.cc:992
static std::string pathConcat(const std::string &firstPart, const std::string &secondPart, char separator='/')
Concatenate path fragments making sure that they are separated by a single '/' character.
Definition: BESUtil.cc:751
virtual void query_parameter_values(const std::string &key, std::vector< std::string > &values) const
Definition: url_impl.cc:270
virtual std::string query_parameter_value(const std::string &key) const
Definition: url_impl.cc:251
virtual std::string dump()
Definition: url_impl.cc:397
virtual bool is_expired()
Definition: url_impl.cc:311
utility class for the HTTP catalog module
Definition: AllowedHosts.cc:55