41#include "BESInternalError.h"
42#include "BESForbiddenError.h"
43#include "BESSyntaxUserError.h"
44#include "BESNotFoundError.h"
45#include "BESTimeoutError.h"
54#include "RemoteResource.h"
55#include "TheBESKeys.h"
56#include "BESStopWatch.h"
// Key string for the catalog root directory setting; it is quoted in the
// "is not set" error messages built by the constructors in this file.
61#define BES_CATALOG_ROOT_KEY "BES.Catalog.catalog.RootDirectory"
// Message prefix of the form "RemoteResource::<function name>() - ".
63#define prolog std::string("RemoteResource::").append(__func__).append("() - ")
// First argument passed to the BESDEBUG calls in this file.
64#define MODULE HTTP_MODULE
// Constructs a RemoteResource from a URL object.
//
// target_url      - moved into d_remoteResourceUrl.
// uid             - NOTE(review): not referenced in the visible lines, yet
//                   d_uid is read below; presumably assigned from uid in a
//                   line this listing omits -- confirm.
// expiredInterval - stored in d_expires_interval.
//
// NOTE(review): the leading numbers look like line numbers from a source
// listing and they skip values (72 -> 75, 91 -> 98, 118 -> 124, ...), so
// closing braces and some statements are not visible here. The comments below
// describe only what the visible lines do.
68RemoteResource::RemoteResource(
69 std::shared_ptr<http::url> target_url,
70 const std::string &uid,
71 long long expiredInterval)
72 : d_remoteResourceUrl(std::move(target_url)){
75 d_initialized =
false;
79 d_resourceCacheFileName.clear();
// Both header containers are allocated with new; d_response_headers is deleted
// in the destructor.
80 d_response_headers =
new vector<string>();
81 d_http_response_headers =
new map<string, string>();
83 d_expires_interval = expiredInterval;
// FILE protocol: the URL path is used directly as the cache file name.
86 if(d_remoteResourceUrl->protocol() == FILE_PROTOCOL){
87 BESDEBUG(MODULE,prolog <<
"Found FILE protocol." << endl);
88 d_resourceCacheFileName = d_remoteResourceUrl->path();
// Drops the last character of the name; the condition guarding this is not visible.
91 d_resourceCacheFileName = d_resourceCacheFileName.substr(0,d_resourceCacheFileName.length()-1);
// Error for a missing catalog root setting (the lookup and test are not visible).
// NOTE(review): no space precedes "is not set", so the key name and the text
// run together in the message.
98 throw BESInternalError( prolog +
"ERROR - "+ BES_CATALOG_ROOT_KEY +
"is not set",__FILE__,__LINE__);
// True when the file name does not begin with catalog_root; the action taken
// in that case is not visible.
100 if(d_resourceCacheFileName.find(catalog_root) !=0 ){
103 BESDEBUG(MODULE,
"d_resourceCacheFileName: " << d_resourceCacheFileName << endl);
// HTTP / HTTPS protocol: build the request header list.
106 else if( d_remoteResourceUrl->protocol() == HTTPS_PROTOCOL || d_remoteResourceUrl->protocol() == HTTP_PROTOCOL ){
107 BESDEBUG(MODULE, prolog <<
"URL: " << d_remoteResourceUrl->str() << endl);
// A "User-Id" header is always added.
111 string client_id_hdr =
"User-Id: " + d_uid;
112 BESDEBUG(MODULE, prolog << client_id_hdr << endl);
113 d_request_headers.push_back(client_id_hdr);
// An "Echo-Token" header is added only when d_echo_token is non-empty.
115 if (!d_echo_token.empty()){
116 string echo_token_hdr =
"Echo-Token: " + d_echo_token;
117 BESDEBUG(MODULE, prolog << echo_token_hdr << endl);
118 d_request_headers.push_back(echo_token_hdr);
// Message for any other protocol; how it is reported is not visible.
124 string err = prolog +
"Unsupported protocol: " + d_remoteResourceUrl->protocol();
// Constructs a RemoteResource from a URL given as a string.
//
// url        - the resource URL; its leading protocol text selects the branch below.
// uid        - NOTE(review): not referenced in the visible lines, yet d_uid is
//              read below; presumably assigned in an omitted line -- confirm.
// echo_token - stored in d_echo_token.
//
// NOTE(review): the listing numbers skip values (142 -> 145, 152 -> 155,
// 162 -> 169, ...), so guards, braces and some statements are not visible.
// In this constructor d_request_headers is a pointer (allocated with new and
// used via ->) and d_remoteResourceUrl is assigned a std::string, unlike the
// other constructor in this listing -- the two appear to come from different
// revisions of the class; confirm which one is current.
139 RemoteResource::RemoteResource(
const std::string &url,
const std::string &uid,
const std::string &echo_token) {
142 d_initialized =
false;
145 d_echo_token = echo_token;
149 d_resourceCacheFileName.clear();
150 d_response_headers =
new vector<string>();
151 d_request_headers =
new vector<string>();
152 d_http_response_headers =
new map<string, string>();
// Error for an empty URL (the test itself is not visible).
155 throw BESInternalError(prolog +
"Remote resource URL is empty.", __FILE__, __LINE__);
// URL starts with FILE_PROTOCOL: the remainder of the URL is the cache file name.
158 if(url.find(FILE_PROTOCOL) == 0){
159 d_resourceCacheFileName = url.substr(strlen(FILE_PROTOCOL));
// Drops the last character of the name; the condition guarding this is not visible.
162 d_resourceCacheFileName = d_resourceCacheFileName.substr(0,d_resourceCacheFileName.length()-1);
// Error for a missing catalog root setting (the lookup and test are not visible).
// NOTE(review): no space precedes "is not set" in the message.
169 throw BESInternalError( prolog +
"ERROR - "+ BES_CATALOG_ROOT_KEY +
"is not set",__FILE__,__LINE__);
// True when the file name does not begin with catalog_root; the action taken
// in that case is not visible.
171 if(d_resourceCacheFileName.find(catalog_root) !=0 ){
// URL starts with HTTPS_PROTOCOL or HTTP_PROTOCOL: remember it and build the
// request header list.
176 else if(url.find(HTTPS_PROTOCOL) == 0 || url.find(HTTP_PROTOCOL) == 0){
177 d_remoteResourceUrl = url;
178 BESDEBUG(MODULE, prolog <<
"URL: " << d_remoteResourceUrl << endl);
// A "User-Id" header is always added.
181 string client_id_hdr =
"User-Id: " + d_uid;
182 BESDEBUG(MODULE, prolog << client_id_hdr << endl);
183 d_request_headers->push_back(client_id_hdr);
// An "Echo-Token" header is added only when d_echo_token is non-empty.
185 if (!d_echo_token.empty()){
186 string echo_token_hdr =
"Echo-Token: " + d_echo_token;
187 BESDEBUG(MODULE, prolog << echo_token_hdr << endl);
188 d_request_headers->push_back(echo_token_hdr);
// Message for any other URL; how it is reported is not visible.
192 string err = prolog +
"Unsupported protocol: " + url;
// Destructor: frees d_response_headers and, when a cache file name is set,
// obtains the HttpCache instance and clears the name.
//
// NOTE(review): listing lines 216-217 are not visible; the debug message
// suggests the cache file is closed and unlocked there -- confirm.
// NOTE(review): d_http_response_headers is allocated with new in the
// constructors but no delete for it appears in the visible lines -- confirm
// it is released.
207RemoteResource::~RemoteResource() {
208 BESDEBUG(MODULE, prolog <<
"BEGIN resourceURL: " << d_remoteResourceUrl->str() << endl);
210 delete d_response_headers;
211 d_response_headers = 0;
212 BESDEBUG(MODULE, prolog <<
"Deleted d_response_headers." << endl);
214 if (!d_resourceCacheFileName.empty()) {
215 HttpCache *cache = HttpCache::get_instance();
218 BESDEBUG(MODULE, prolog <<
"Closed and unlocked " << d_resourceCacheFileName << endl);
219 d_resourceCacheFileName.clear();
222 BESDEBUG(MODULE, prolog <<
"END" << endl);
// Returns d_resourceCacheFileName.
// Throws BESInternalError when d_initialized is false, i.e. before the
// resource has been retrieved.
// NOTE(review): the closing braces (listing lines 233 and 235) are not visible.
229std::string RemoteResource::getCacheFileName() {
230 if (!d_initialized) {
231 throw BESInternalError(prolog +
"STATE ERROR: Remote Resource " + d_remoteResourceUrl->str() +
232 " has Not Been Retrieved.", __FILE__, __LINE__);
234 return d_resourceCacheFileName;
// Convenience overload: calls retrieveResource(content_filters) with an
// empty filter map.
244void RemoteResource::retrieveResource() {
245 std::map<std::string, std::string> content_filters;
246 retrieveResource(content_filters);
// Makes the resource available in the local HTTP cache and marks this object
// initialized.
//
// content_filters - forwarded unchanged to update_file_and_headers().
//
// NOTE(review): many listing lines are not visible (262-265, 267-269,
// 284-290, 310-315, 324-327, 330-333, ...). The conditions, cache lock calls
// and try/catch scaffolding implied by the debug messages cannot be seen, so
// the comments below follow the messages and visible calls only.
260void RemoteResource::retrieveResource(
const std::map<std::string, std::string> &content_filters) {
261 BESDEBUG(MODULE, prolog <<
"BEGIN resourceURL: " << d_remoteResourceUrl->str() << endl);
// Message logged on the already-initialized path (the test is not visible).
266 BESDEBUG(MODULE, prolog <<
"END Already initialized." << endl);
270 HttpCache *cache = HttpCache::get_instance();
// Failure message for a missing cache instance (the test and the
// declaration of oss are not visible).
273 oss << prolog <<
"FAILED to get local cache. ";
274 oss <<
"Unable to proceed with request for " << this->d_remoteResourceUrl->str();
275 oss <<
" The server MUST have a valid HTTP cache configuration to operate." << endl;
276 BESDEBUG(MODULE, oss.str());
// The cache file name is derived from d_uid and the URL string; the
// declaration of mangle is not visible.
282 d_resourceCacheFileName = cache->
get_cache_file_name(d_uid, d_remoteResourceUrl->str(), mangle);
283 BESDEBUG(MODULE, prolog <<
"d_resourceCacheFileName: " << d_resourceCacheFileName << endl);
// Initial type guess taken from the URL, written to d_type.
291 http::get_type_from_url(d_remoteResourceUrl->str(), d_type);
292 BESDEBUG(MODULE, prolog <<
"d_type: " << d_type << endl);
// Path for a resource already in the cache: refresh it if expired, otherwise
// load the stored headers.
297 prolog <<
"Remote resource is already in cache. cache_file_name: " << d_resourceCacheFileName
300 if (cached_resource_is_expired()) {
301 BESDEBUG(MODULE, prolog <<
"EXISTS - UPDATING " << endl);
302 update_file_and_headers(content_filters);
305 BESDEBUG(MODULE, prolog <<
"EXISTS - LOADING " << endl);
307 load_hdrs_from_file();
309 d_initialized =
true;
// Path for a resource not yet in the cache: fetch it, then load the headers
// that were written alongside it.
316 BESDEBUG(MODULE, prolog <<
"DOESN'T EXIST - CREATING " << endl);
317 update_file_and_headers(content_filters);
319 BESDEBUG(MODULE, prolog <<
" WAS CREATED - LOADING " << endl);
321 load_hdrs_from_file();
323 d_initialized =
true;
// Message for failing to get a cache read lock; how it is reported is not visible.
328 msg << prolog +
"Failed to acquire cache read lock for remote resource: '";
329 msg << d_remoteResourceUrl->str() << endl;
// Messages logged from the exception handlers; per the text, the cache is
// unlocked and the exception re-thrown (those statements are not visible).
334 BESDEBUG(MODULE, prolog <<
"Caught BESError. type: " << besError.
get_bes_error_type() <<
337 " Will unlock cache and re-throw." << endl);
342 BESDEBUG(MODULE, prolog <<
"Caught unknown exception. Will unlock cache and re-throw." << endl);
// Convenience overload: calls update_file_and_headers(content_filters) with
// an empty filter map.
352void RemoteResource::update_file_and_headers(){
353 std::map<std::string, std::string> content_filters;
354 update_file_and_headers(content_filters);
// Fetches the resource into the cache file, applies the content filters,
// writes the response headers to "<cache file>.hdrs" and updates the cache
// bookkeeping.
//
// content_filters - forwarded unchanged to filter_retrieved_resource().
//
// NOTE(review): listing lines such as 372-376, 378-380, 395-399 and 402-409
// are not visible, so the try/catch blocks and conditions around the unlink()
// calls cannot be seen; they look like cleanup on failure -- confirm.
362void RemoteResource::update_file_and_headers(
const std::map<std::string, std::string> &content_filters){
365 HttpCache *cache = HttpCache::get_instance();
// Failure message for a missing cache instance (the test and the
// declaration of oss are not visible).
368 oss << prolog <<
"FAILED to get local cache. ";
369 oss <<
"Unable to proceed with request for " << this->d_remoteResourceUrl->str();
370 oss <<
" The server MUST have a valid HTTP cache configuration to operate." << endl;
371 BESDEBUG(MODULE, oss.str());
// Download the resource into the file open on d_fd.
377 writeResourceToFile(d_fd);
381 unlink(d_resourceCacheFileName.c_str());
387 filter_retrieved_resource(content_filters);
// Write one response header per line to the companion ".hdrs" file.
390 string hdr_filename = d_resourceCacheFileName +
".hdrs";
391 std::ofstream hdr_out(hdr_filename.c_str());
393 for (
size_t i = 0; i < this->d_response_headers->size(); i++) {
394 hdr_out << (*d_response_headers)[i] << endl;
400 unlink(hdr_filename.c_str());
401 unlink(d_resourceCacheFileName.c_str());
// Downgrade the lock held on d_fd from exclusive to shared.
410 cache->exclusive_to_shared_lock(d_fd);
411 BESDEBUG(MODULE, prolog <<
"Converted exclusive cache lock to shared lock." << endl);
// Record the new file with the cache and purge when the cache reports it is too big.
416 unsigned long long size = cache->update_cache_info(d_resourceCacheFileName);
417 BESDEBUG(MODULE, prolog <<
"Updated cache info" << endl);
419 if (cache->cache_too_big(size)) {
420 cache->update_and_purge(d_resourceCacheFileName);
421 BESDEBUG(MODULE, prolog <<
"Updated and purged cache." << endl);
423 BESDEBUG(MODULE, prolog <<
"END" << endl);
// Reads "<cache file>.hdrs" line by line, appending each line to
// d_response_headers, then calls ingest_http_headers_and_type().
//
// NOTE(review): lines are appended without clearing d_response_headers first;
// confirm the vector is empty when this is called.
// NOTE(review): how the "could not be opened" error is reported (listing
// lines 439-441) is not visible.
431void RemoteResource::load_hdrs_from_file(){
432 string hdr_filename = d_resourceCacheFileName +
".hdrs";
433 std::ifstream hdr_ifs(hdr_filename.c_str());
435 if(!hdr_ifs.is_open()){
437 msg <<
"ERROR. Internal state error. The headers file: " << hdr_filename <<
" could not be opened for reading.";
438 BESDEBUG(MODULE, prolog << msg.str() << endl);
442 BESDEBUG(MODULE, prolog <<
"Reading response headers from: " << hdr_filename << endl);
443 for (std::string line; std::getline(hdr_ifs, line);) {
444 (*d_response_headers).push_back(line);
445 BESDEBUG(MODULE, prolog <<
"header: " << line << endl);
447 ingest_http_headers_and_type();
// Reports whether the cache file is older than d_expires_interval seconds,
// comparing the file's st_ctime from stat() with the current time.
//
// NOTE(review): the return statements and the handling of a failed stat()
// (listing lines 462-463, 475-477, 479-) are not visible; the debug messages
// indicate refresh = TRUE when the age exceeds the interval.
457bool RemoteResource::cached_resource_is_expired(){
458 BESDEBUG(MODULE, prolog <<
"BEGIN" << endl);
461 if (stat(d_resourceCacheFileName.c_str(), &statbuf) == -1){
464 BESDEBUG(MODULE, prolog <<
"File exists" << endl);
// NOTE(review): the message below calls this the creation time, but POSIX
// defines st_ctime as the time of last status change -- confirm this is the
// intended timestamp.
466 time_t cacheTime = statbuf.st_ctime;
467 BESDEBUG(MODULE, prolog <<
"Cache file creation time: " << cacheTime << endl);
468 time_t nowTime = time(0);
469 BESDEBUG(MODULE, prolog <<
"Time now: " << nowTime << endl);
470 double diffSeconds = difftime(nowTime,cacheTime);
471 BESDEBUG(MODULE, prolog <<
"Time difference between cacheTime and nowTime: " << diffSeconds << endl);
473 if (diffSeconds > d_expires_interval){
474 BESDEBUG(MODULE, prolog <<
" refresh = TRUE " << endl);
478 BESDEBUG(MODULE, prolog <<
" refresh = FALSE " << endl);
// Downloads the resource at d_remoteResourceUrl into the open file
// descriptor fd, replacing any previous content, and then ingests the
// response headers.
//
// fd - descriptor of the cache file; it is rewound and truncated before the
//      download and rewound again afterwards.
//
// NOTE(review): the status tests guarding the throw statements (listing lines
// 502, 507, 521) and any try/catch around the transfer are not visible.
// NOTE(review): seek failures raise BESNotFoundError while the truncate
// failure raises BESInternalError, and none of the three messages uses
// prolog -- confirm this is intended.
491void RemoteResource::writeResourceToFile(
int fd) {
493 BESDEBUG(MODULE, prolog <<
"BEGIN" << endl);
498 besTimer.
start(prolog +
"source url: " + d_remoteResourceUrl->str());
// Rewind, then truncate, so the download overwrites the whole file.
501 int status = lseek(fd, 0, SEEK_SET);
503 throw BESNotFoundError(
"Could not seek within the response file.", __FILE__, __LINE__);
504 BESDEBUG(MODULE, prolog <<
"Reset file descriptor to start of file." << endl);
506 status = ftruncate(fd, 0);
508 throw BESInternalError(
"Could not truncate the file prior to updating from remote. ", __FILE__, __LINE__);
509 BESDEBUG(MODULE, prolog <<
"Truncated file, length is zero." << endl);
// NOTE(review): this message streams d_remoteResourceUrl itself, while the
// message after the transfer uses ->str(); if the member is the shared_ptr
// form this logs a pointer value rather than the URL -- confirm.
511 BESDEBUG(MODULE, prolog <<
"Saving resource " << d_remoteResourceUrl <<
" to cache file " << d_resourceCacheFileName << endl);
// Performs the transfer; response headers are collected in d_response_headers.
512 curl::http_get_and_write_resource(d_remoteResourceUrl, fd, d_response_headers);
514 BESDEBUG(MODULE, prolog <<
"Resource " << d_remoteResourceUrl->str() <<
" saved to cache file " << d_resourceCacheFileName << endl);
// Rewind again after the write.
520 status = lseek(fd, 0, SEEK_SET);
522 throw BESNotFoundError(
"Could not seek within the response file.", __FILE__, __LINE__);
523 BESDEBUG(MODULE, prolog <<
"Reset file descriptor to start of file." << endl);
526 ingest_http_headers_and_type();
531 BESDEBUG(MODULE, prolog <<
"END" << endl);
// Splits each entry of d_response_headers at the first ": " into a key/value
// pair stored in d_http_response_headers, then works out the data type from
// the content-disposition header, the content-type header and the URL.
//
// NOTE(review): the declarations of key and type, the continue/else after a
// header without ": ", the call that evaluates content-disposition, and the
// final assignment to d_type (listing lines 548-550, 559-562, 567-569,
// 596-599) are not visible. The lookups use lower-case header names, so key
// is presumably lower-cased where it is built -- confirm.
537void RemoteResource::ingest_http_headers_and_type() {
538 BESDEBUG(MODULE, prolog <<
"BEGIN" << endl);
540 const string colon_space =
": ";
541 for (
size_t i = 0; i < this->d_response_headers->size(); i++) {
542 string header = (*d_response_headers)[i];
543 BESDEBUG(MODULE, prolog <<
"Processing header " << header << endl);
544 size_t colon_index = header.find(colon_space);
// Headers without the ": " delimiter are reported as skipped.
545 if(colon_index == string::npos){
546 BESDEBUG(MODULE, prolog <<
"Unable to locate the colon space \": \" delimiter in the header " <<
547 "string: '" << header <<
"' SKIPPING!" << endl);
// The value is everything after the delimiter; a repeated key overwrites the
// earlier entry.
551 string value = header.substr(colon_index + colon_space.length());
552 BESDEBUG(MODULE, prolog <<
"key: " << key <<
" value: " << value << endl);
553 (*d_http_response_headers)[key] = value;
556 BESDEBUG(MODULE, prolog <<
"Ingested " << d_http_response_headers->size() <<
" response headers." << endl);
558 std::map<string, string>::iterator it;
// Type source 1: the content-disposition header, when present.
563 BESDEBUG(MODULE, prolog <<
"Checking Content-Disposition headers for type information." << endl);
564 string content_disp_hdr;
565 content_disp_hdr = get_http_response_header(
"content-disposition");
566 if (!content_disp_hdr.empty()) {
570 BESDEBUG(MODULE,prolog <<
"Evaluated content-disposition '" << content_disp_hdr <<
"' matched type: \"" << type <<
"\"" << endl);
// Type source 2: the content-type header, consulted only while type is still empty.
577 BESDEBUG(MODULE, prolog <<
"Checking Content-Type headers for type information." << endl);
578 string content_type = get_http_response_header(
"content-type");
579 if (type.empty() && !content_type.empty()) {
580 http::get_type_from_content_type(content_type, type);
581 BESDEBUG(MODULE,prolog <<
"Evaluated content-type '" << content_type <<
"' matched type \"" << type <<
"\"" << endl);
// Type source 3: the URL (the guarding condition is not visible).
586 BESDEBUG(MODULE, prolog <<
"Checking URL path for type information." << endl);
588 http::get_type_from_url(d_remoteResourceUrl->str(), type);
589 BESDEBUG(MODULE, prolog <<
"Evaluated url '" << d_remoteResourceUrl->str() <<
"' matched type: \"" << type <<
"\"" << endl);
// Fallback message when no source yields a type; per the text the type is
// then set to 'unknown' (that assignment is not visible).
593 string err = prolog +
"Unable to determine the type of data"
594 +
" returned from '" + d_remoteResourceUrl->str() +
"' Setting type to 'unknown'";
595 BESDEBUG(MODULE, err << endl);
600 BESDEBUG(MODULE, prolog <<
"END (dataset type: " << d_type <<
")" << endl);
// Looks up header_name in d_http_response_headers.
//
// header_name - the key to look for; taken by value.
//
// NOTE(review): the return type (listing line 608), the statement that
// assigns it (612) and both return statements (614-) are not visible. Callers
// in this file test the result with empty(), so an empty string is
// presumably returned when the header is absent -- confirm.
609RemoteResource::get_http_response_header(
const std::string header_name) {
611 std::map<string, string>::iterator it;
613 if (it != d_http_response_headers->end())
// Rewrites the cached resource file after substituting template strings.
// The whole file is read into memory, each (template, replacement) pair in
// content_filters is applied, and the result is written back to the same file.
//
// content_filters - map from template text (first) to replacement text (second).
//
// NOTE(review): the early return for an empty map (listing lines 633-635),
// the error reporting after a failed open (645-646, 666-667) and the
// replacement call that sets replace_count (654) are not visible.
629void RemoteResource::filter_retrieved_resource(
const std::map<std::string, std::string> &content_filters){
632 if(content_filters.empty()){
636 string resource_content;
638 std::stringstream buffer;
// Read the entire cached file into resource_content.
641 std::ifstream cr_istrm(d_resourceCacheFileName);
642 if (!cr_istrm.is_open()) {
643 string msg =
"Could not open '" + d_resourceCacheFileName +
"' to read cached response.";
644 BESDEBUG(MODULE, prolog << msg << endl);
647 buffer << cr_istrm.rdbuf();
650 resource_content = buffer.str();
// Apply every filter pair to the in-memory content.
653 for (
const auto& apair : content_filters) {
655 BESDEBUG(MODULE, prolog <<
"Replaced " << replace_count <<
" instance(s) of template(" <<
656 apair.first <<
") with " << apair.second <<
" in cached RemoteResource" << endl);
// Write the modified content over the cached file.
662 std::ofstream cr_ostrm(d_resourceCacheFileName);
663 if (!cr_ostrm.is_open()) {
664 string msg =
"Could not open '" + d_resourceCacheFileName +
"' to write modified cached response.";
665 BESDEBUG(MODULE, prolog << msg << endl);
668 cr_ostrm << resource_content;
// Returns the content of the cache file as a string.
//
// NOTE(review): the guard that triggers the "called prior to retrieving
// resource" error (listing lines 676-678), the return of the buffer (694-)
// and the error reporting for a failed open (700-) are not visible.
675std::string RemoteResource::get_response_as_string() {
679 msg <<
"ERROR. Internal state error. " << __PRETTY_FUNCTION__ <<
" was called prior to retrieving resource.";
680 BESDEBUG(MODULE, prolog << msg.str() << endl);
683 string cache_file = getCacheFileName();
// NOTE(review): the open mode is spelled std::ofstream::in although the
// stream is an ifstream -- confirm this is intentional.
686 std::ifstream file_istream(cache_file, std::ofstream::in);
689 if(file_istream.is_open()){
691 BESDEBUG(MODULE, prolog <<
"Using cached file: " << cache_file << endl);
692 std::stringstream buffer;
693 buffer << file_istream.rdbuf();
698 msg <<
"ERROR. Failed to open cache file " << cache_file <<
" for reading.";
699 BESDEBUG(MODULE, prolog << msg.str() << endl);
// Parses the cached response text into a rapidjson::Document.
//
// NOTE(review): the return statement (listing line 716) is not visible, and
// no check of the parse result appears in the visible lines -- confirm
// callers handle a document that failed to parse.
712rapidjson::Document RemoteResource::get_as_json() {
713 string response = get_response_as_string();
714 rapidjson::Document d;
715 d.Parse(response.c_str());
// Returns the d_response_headers pointer; the vector is deleted in the
// destructor, so the caller must not free it.
//
// NOTE(review): the condition guarding the throw (listing line 723) is not
// visible; by analogy with getCacheFileName() it is presumably
// !d_initialized -- confirm.
722vector<string> *RemoteResource::getResponseHeaders() {
724 throw BESInternalError(prolog +
"STATE ERROR: Remote Resource Has Not Been Retrieved.",__FILE__,__LINE__);
726 return d_response_headers;
// Works out the data type from a list of response header lines, trying the
// content-disposition header, then the content-type header, then the URL.
//
// resp_hdrs - header lines of the form "name: value".
//
// NOTE(review): the declarations of type, disp and ctype, the assignments
// made when a header is located, and the final assignment of the result
// (listing lines 734-742, 760, 762-763, 766-773, 799-) are not visible.
// NOTE(review): this function passes d_remoteResourceUrl directly as a string
// and calls HttpUtils::Get_type_from_*, unlike ingest_http_headers_and_type()
// which uses ->str() and http::get_type_from_*; it appears to come from an
// older revision -- confirm it is still in use.
731void RemoteResource::setType(
const vector<string> *resp_hdrs) {
733 BESDEBUG(MODULE, prolog <<
"BEGIN" << endl);
743 vector<string>::const_iterator i = resp_hdrs->begin();
744 vector<string>::const_iterator e = resp_hdrs->end();
745 for (; i != e; i++) {
746 string hdr_line = (*i);
748 BESDEBUG(MODULE, prolog <<
"Evaluating header: " << hdr_line << endl);
// NOTE(review): find() returns a size_t that is stored in an int, and no
// npos check is visible. For a line without ": " the name would become the
// whole line and the value would start at offset 1 -- confirm a guard exists
// in the lines not shown.
752 string colon_space =
": ";
753 int index = hdr_line.find(colon_space);
754 string hdr_name = hdr_line.substr(0, index);
755 string hdr_value = hdr_line.substr(index + colon_space.length());
757 BESDEBUG(MODULE, prolog <<
"hdr_name: '" << hdr_name <<
"' hdr_value: '" << hdr_value <<
"' " << endl);
// Substring matches against lower-case header names.
759 if (hdr_name.find(
"content-disposition") != string::npos) {
761 BESDEBUG(MODULE, prolog <<
"Located content-disposition header." << endl);
764 if (hdr_name.find(
"content-type") != string::npos) {
765 BESDEBUG(MODULE, prolog <<
"Located content-type header." << endl);
// Type source 1: the content-disposition value.
774 HttpUtils::Get_type_from_disposition(disp, type);
775 BESDEBUG(MODULE,prolog <<
"Evaluated content-disposition '" << disp <<
"' matched type: \"" << type <<
"\"" << endl);
// Type source 2: the content-type value, consulted only while type is still empty.
782 if (type.empty() && !ctype.empty()) {
783 HttpUtils::Get_type_from_content_type(ctype, type);
784 BESDEBUG(MODULE,prolog <<
"Evaluated content-type '" << ctype <<
"' matched type \"" << type <<
"\"" << endl);
// Type source 3: the URL (the guarding condition is not visible).
790 HttpUtils::Get_type_from_url(d_remoteResourceUrl, type);
791 BESDEBUG(MODULE,prolog <<
"Evaluated url '" << d_remoteResourceUrl <<
"' matched type: \"" << type <<
"\"" << endl);
// Fallback message when no source yields a type; per the text the type is
// then set to 'unknown' (that assignment is not visible).
796 string err = prolog +
"Unable to determine the type of data"
797 +
" returned from '" + d_remoteResourceUrl +
"' Setting type to 'unknown'";
798 BESDEBUG(MODULE, err << endl);
static bool IsSet(const std::string &flagName)
see if the debug context flagName is set to true
Base exception class for the BES with basic string message.
unsigned int get_line() const
get the line number where the exception was thrown
unsigned int get_bes_error_type() const
Return the return code for this error class.
std::string get_file() const
get the file name where the exception was thrown
std::string get_message() const
get the error message for this exception
virtual void unlock_cache()
virtual void unlock_and_close(const std::string &target)
virtual bool create_and_lock(const std::string &target, int &fd)
Create a file in the cache and lock it for write access.
virtual void exclusive_to_shared_lock(int fd)
Transfer from an exclusive lock to a shared lock.
virtual bool get_read_lock(const std::string &target, int &fd)
Get a read-only lock on the file if it exists.
virtual bool get_exclusive_lock(const std::string &target, int &fd)
exception thrown if internal error encountered
error thrown if the resource requested cannot be found
virtual bool start(std::string name)
static bool endsWith(std::string const &fullString, std::string const &ending)
static unsigned int replace_all(std::string &s, std::string find_this, std::string replace_with_this)
Operates on the string 's' to replace every occurrence of the value of the string 'find_this' with the value of the string 'replace_with_this'.
static std::string lowercase(const std::string &s)
static std::string pathConcat(const std::string &firstPart, const std::string &secondPart, char separator='/')
Concatenate path fragments making sure that they are separated by a single '/' character.
void get_value(const std::string &s, std::string &val, bool &found)
Retrieve the value of a given key, if set.
static TheBESKeys * TheKeys()
A cache for content accessed via HTTP.
virtual std::string get_cache_file_name(const std::string &uid, const std::string &src, bool mangle=true)
utility class for the HTTP catalog module
void get_type_from_disposition(const string &disp, string &type)