bes Updated for version 3.20.13
EffectiveUrlCache.cc
1// -*- mode: c++; c-basic-offset:4 -*-
2
3// This file is part of the BES http package, part of the Hyrax data server.
4
5// Copyright (c) 2020 OPeNDAP, Inc.
6// Author: Nathan Potter <ndp@opendap.org>
7//
8// This library is free software; you can redistribute it and/or
9// modify it under the terms of the GNU Lesser General Public
10// License as published by the Free Software Foundation; either
11// version 2.1 of the License, or (at your option) any later version.
12//
13// This library is distributed in the hope that it will be useful,
14// but WITHOUT ANY WARRANTY; without even the implied warranty of
15// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16// Lesser General Public License for more details.
17//
18// You should have received a copy of the GNU Lesser General Public
19// License along with this library; if not, write to the Free Software
20// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21//
22// You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
23
24// Authors:
25// ndp Nathan Potter <ndp@opendap.org>
26
27#include "config.h"
28
29#ifdef HAVE_STDLIB_H
30#include <cstdlib>
31#endif
32
33#include <mutex>
34
35#include <sstream>
36#include <string>
37
38#include "EffectiveUrlCache.h"
39
40#include "BESSyntaxUserError.h"
41#include "TheBESKeys.h"
42#include "BESDebug.h"
43#include "BESStopWatch.h"
44#include "BESUtil.h"
45#include "BESLog.h"
46#include "CurlUtils.h"
47#include "HttpNames.h"
48#include "EffectiveUrl.h"
49
50using namespace std;
51
52#define MODULE "euc"
53#define MODULE_DUMPER "euc:dump"
54#define prolog std::string("EffectiveUrlCache::").append(__func__).append("() - ")
55
56namespace http {
57
58EffectiveUrlCache *EffectiveUrlCache::d_instance = nullptr;
59static std::once_flag d_euc_init_once;
60
69EffectiveUrlCache *
71{
72 std::call_once(d_euc_init_once,EffectiveUrlCache::initialize_instance);
73
74 return d_instance;
75}
76
81void EffectiveUrlCache::initialize_instance()
82{
83
84 d_instance = new EffectiveUrlCache;
85#ifdef HAVE_ATEXIT
86 atexit(delete_instance);
87#endif
88
89}
90
94void EffectiveUrlCache::delete_instance()
95{
96 delete d_instance;
97 d_instance = 0;
98}
99
100
105EffectiveUrlCache::~EffectiveUrlCache()
106{
107 d_effective_urls.clear();
108
109 if(d_skip_regex){
110 delete d_skip_regex;
111 d_skip_regex = 0;
112 }
113}
114
115
123void EffectiveUrlCache::dump(ostream &strm) const
124{
125 strm << BESIndent::LMarg << prolog << "(this: " << (void *) this << ")" << endl;
126 BESIndent::Indent();
127 strm << BESIndent::LMarg << "d_skip_regex: " << (d_skip_regex?d_skip_regex->pattern():"WAS NOT SET") << endl;
128 if (!d_effective_urls.empty()) {
129 strm << BESIndent::LMarg << "effective url list:" << endl;
130 BESIndent::Indent();
131 auto it = d_effective_urls.begin();
132 while( it!= d_effective_urls.end()){
133 strm << BESIndent::LMarg << (*it).first << " --> " << (*it).second->str();
134 it++;
135 }
136 BESIndent::UnIndent();
137 }
138 else {
139 strm << BESIndent::LMarg << "effective url list: EMPTY" << endl;
140 }
141 BESIndent::UnIndent();
142}
143
152{
153 stringstream sstrm;
154 dump(sstrm);
155 return sstrm.str();
156}
157
158
163shared_ptr<http::EffectiveUrl> EffectiveUrlCache::get_cached_eurl(string const &url_key){
164 shared_ptr<http::EffectiveUrl> effective_url(nullptr);
165 auto it = d_effective_urls.find(url_key);
166 if(it!=d_effective_urls.end()){
167 effective_url = (*it).second;
168 }
169 return effective_url;
170}
171
172
173//########################################################################################
174//########################################################################################
175//########################################################################################
176
177
185shared_ptr<http::EffectiveUrl> EffectiveUrlCache::get_effective_url(shared_ptr<http::url> source_url) {
186
187 // This lock is a RAII implementation. It will block until the mutex is
188 // available and the lock will be released when the instance is destroyed.
189 std::lock_guard<std::mutex> lock_me(d_cache_lock_mutex);
190
191 BESDEBUG(MODULE, prolog << "BEGIN url: " << source_url->str() << endl);
192 BESDEBUG(MODULE_DUMPER, prolog << "dump: " << endl << dump() << endl);
193
194 if (!is_enabled()) {
195 BESDEBUG(MODULE, prolog << "CACHE IS DISABLED." << endl);
196 return shared_ptr<http::EffectiveUrl>(new http::EffectiveUrl(source_url));
197
198 }
199
200 // if it's not an HTTP url there is nothing to cache.
201 if (source_url->str().find(HTTP_PROTOCOL) != 0 && source_url->str().find(HTTPS_PROTOCOL) != 0) {
202 BESDEBUG(MODULE, prolog << "END Not an HTTP request, SKIPPING." << endl);
203 return shared_ptr<http::EffectiveUrl>(new http::EffectiveUrl(source_url));
204 }
205
206 BESRegex *skip_regex = get_skip_regex();
207 if( skip_regex ) {
208 size_t match_length = 0;
209 match_length = skip_regex->match(source_url->str().c_str(), source_url->str().length());
210 if (match_length == source_url->str().length()) {
211 BESDEBUG(MODULE, prolog << "END Candidate url matches the "
212 "no_redirects_regex_pattern [" << skip_regex->pattern() <<
213 "][match_length=" << match_length << "] SKIPPING." << endl);
214 return shared_ptr<http::EffectiveUrl>(new http::EffectiveUrl(source_url));
215 }
216 BESDEBUG(MODULE, prolog << "Candidate url: '" << source_url->str() << "' does NOT match the "
217 "skip_regex pattern [" << skip_regex->pattern() << "]" << endl);
218 }
219 else {
220 BESDEBUG(MODULE, prolog << "The cache_effective_urls_skip_regex() was NOT SET "<< endl);
221 }
222
223 shared_ptr<http::EffectiveUrl> effective_url = get_cached_eurl(source_url->str());
224
225 // If the source_url does not have an associated EffectiveUrl instance in the cache
226 // then we know we have to get one.
227 bool retrieve_and_cache = !effective_url;
228
229 // But, if there is a value in the cache, we must check to see
230 // if it is expired, in which case we will retrieve and cache it.
231 if(effective_url){
232 // It was in the cache. w00t. But, is it expired?.
233 BESDEBUG(MODULE, prolog << "Cache hit for: " << source_url->str() << endl);
234 retrieve_and_cache = effective_url->is_expired();
235 BESDEBUG(MODULE, prolog << "Cached target URL is " << (retrieve_and_cache?"":"not ") << "expired." << endl);
236 }
237
238 // It not found or expired, reload.
239 if(retrieve_and_cache){
240 BESDEBUG(MODULE, prolog << "Acquiring effective URL for " << source_url->str() << endl);
241 {
242 BESStopWatch sw;
243 if(BESDebug::IsSet(MODULE) || BESDebug::IsSet(TIMING_LOG_KEY))
244 sw.start(prolog + "Retrieve and cache effective url for source url: " + source_url->str());
245 effective_url = curl::retrieve_effective_url(source_url);
246 }
247 BESDEBUG(MODULE, prolog << " source_url: " << source_url->str() << " (" << (source_url->is_trusted()?"":"NOT ") << "trusted)" << endl);
248 BESDEBUG(MODULE, prolog << "effective_url: " << effective_url->dump() << " (" << (source_url->is_trusted()?"":"NOT ") << "trusted)" << endl);
249
250 d_effective_urls[source_url->str()] = effective_url;
251
252 BESDEBUG(MODULE, prolog << "Updated record for "<< source_url->str() << " cache size: " << d_effective_urls.size() << endl);
253
254 // Since we don't want there to be a concurrency issue when we release the lock, we don't
255 // return the instance of shared_ptr<EffectiveUrl> that we placed in the cache. Rather
256 // we make a clone and return that. It will have it's own lifecycle independent of
257 // the instance we placed in the cache - it can be modified and the one in the cache
258 // is unchanged. Trusted state was established from source_url when effective_url was
259 // created in curl::retrieve_effective_url()
260 effective_url = shared_ptr<EffectiveUrl>(new EffectiveUrl(effective_url));
261 }
262 else {
263 // Here we have a !expired instance of a shared_ptr<EffectiveUrl> retrieved from the cache.
264 // Now we need to make a copy to return, inheriting trust from the
265 // requesting URL.
266 effective_url = shared_ptr<EffectiveUrl>(new EffectiveUrl(effective_url,source_url->is_trusted()));
267 }
268
269 BESDEBUG(MODULE_DUMPER, prolog << "dump: " << endl << dump() << endl);
270
271 BESDEBUG(MODULE, prolog << "END" << endl);
272
273 return effective_url;
274}// The lock is released when the point of execution reaches this brace and lock_me goes out of scope.
275
276
281bool EffectiveUrlCache::is_enabled()
282{
283 // The first time here, the value of d_enabled is -1. Once we check for it in TheBESKeys
284 // The value will be 0 (false) or 1 (true) and TheBESKeys will not be checked again.
285 if(d_enabled < 0){
286 bool found;
287 string value;
288 TheBESKeys::TheKeys()->get_value(HTTP_CACHE_EFFECTIVE_URLS_KEY,value,found);
289 BESDEBUG(MODULE, prolog << HTTP_CACHE_EFFECTIVE_URLS_KEY <<": '" << value << "'" << endl);
290 d_enabled = found && BESUtil::lowercase(value)=="true";
291 }
292 BESDEBUG(MODULE, prolog << "d_enabled: " << (d_enabled?"true":"false") << endl);
293 return d_enabled;
294}
295
300BESRegex *EffectiveUrlCache::get_skip_regex()
301{
302 if(!d_skip_regex){
303 bool found;
304 string value;
305 TheBESKeys::TheKeys()->get_value(HTTP_CACHE_EFFECTIVE_URLS_SKIP_REGEX_KEY, value, found);
306 if(found && value.length()){
307 BESDEBUG(MODULE, prolog << HTTP_CACHE_EFFECTIVE_URLS_SKIP_REGEX_KEY <<": " << value << endl);
308 d_skip_regex = new BESRegex(value.c_str());
309 }
310 }
311 BESDEBUG(MODULE, prolog << "d_skip_regex: " << (d_skip_regex?d_skip_regex->pattern():"Value has not been set.") << endl);
312 return d_skip_regex;
313}
314
315
316
317
318
319} // namespace http
static bool IsSet(const std::string &flagName)
see if the debug context flagName is set to true
Definition: BESDebug.h:168
Regular expression matching.
Definition: BESRegex.h:53
int match(const char *s, int len, int pos=0) const
Does the pattern match.
Definition: BESRegex.cc:127
virtual bool start(std::string name)
Definition: BESStopWatch.cc:67
static std::string lowercase(const std::string &s)
Definition: BESUtil.cc:254
void get_value(const std::string &s, std::string &val, bool &found)
Retrieve the value of a given key, if set.
Definition: TheBESKeys.cc:340
static TheBESKeys * TheKeys()
Definition: TheBESKeys.cc:71
static EffectiveUrlCache * TheCache()
Get the singleton EffectiveUrlCache instance.
virtual std::string dump() const
dumps information about this object
std::shared_ptr< EffectiveUrl > get_effective_url(std::shared_ptr< url > source_url)
utility class for the HTTP catalog module
Definition: AllowedHosts.cc:55