bes Updated for version 3.20.13
HttpdDirScraper.cc
1// -*- mode: c++; c-basic-offset:4 -*-
2//
3// This file is part of httpd_catalog_module, A C++ module that can be loaded in to
4// the OPeNDAP Back-End Server (BES) and is able to handle remote requests.
5//
6// Copyright (c) 2018 OPeNDAP, Inc.
7// Author: Nathan Potter <ndp@opendap.org>
8//
9// This library is free software; you can redistribute it and/or
10// modify it under the terms of the GNU Lesser General Public
11// License as published by the Free Software Foundation; either
12// version 2.1 of the License, or (at your option) any later version.
13//
14// This library is distributed in the hope that it will be useful,
15// but WITHOUT ANY WARRANTY; without even the implied warranty of
16// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17// Lesser General Public License for more details.
18//
19// You should have received a copy of the GNU Lesser General Public
20// License along with this library; if not, write to the Free Software
21// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22//
23// You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
24
25#include <iostream>
26#include <fstream>
27#include <sstream>
28#include <stdlib.h> /* atol */
29#include <ctype.h> /* isalpha and isdigit */
30#include <time.h> /* mktime */
31
32#include <BESDebug.h>
33#include <BESUtil.h>
34#include <BESRegex.h>
35#include <BESCatalogList.h>
36#include <BESCatalogUtils.h>
37#include <CatalogItem.h>
38
39#include "RemoteResource.h"
40#include "HttpdCatalogNames.h"
41
42#include "HttpdDirScraper.h"
43
44using namespace std;
46
// Log-message prefix used throughout this file: builds the string
// "HttpdDirScraper::<name of the enclosing function>() - " from __func__.
47#define prolog std::string("HttpdDirScraper::").append(__func__).append("() - ")
48
49namespace httpd_catalog {
50
51HttpdDirScraper::HttpdDirScraper()
52{
53 // There was probably a better way to make this association but this worked.
54 d_months.insert(pair<string, int>(string("jan"), 0));
55 d_months.insert(pair<string, int>(string("feb"), 1));
56 d_months.insert(pair<string, int>(string("mar"), 2));
57 d_months.insert(pair<string, int>(string("apr"), 3));
58 d_months.insert(pair<string, int>(string("may"), 4));
59 d_months.insert(pair<string, int>(string("jun"), 5));
60 d_months.insert(pair<string, int>(string("jul"), 6));
61 d_months.insert(pair<string, int>(string("aug"), 7));
62 d_months.insert(pair<string, int>(string("sep"), 8));
63 d_months.insert(pair<string, int>(string("oct"), 9));
64 d_months.insert(pair<string, int>(string("nov"), 10));
65 d_months.insert(pair<string, int>(string("dec"), 11));
66}
67
68/*
69 * @brief Converts an Apache httpd directory page "size" string (23K, 45M, 32G, etc)
70 * to an actual value, approximate though it may be.
71 */
72long HttpdDirScraper::get_size_val(const string size_str) const
73{
74 char scale_c = *size_str.rbegin();
75 long scale = 1;
76
77 switch (scale_c) {
78 case 'K':
79 scale = 1e3;
80 break;
81 case 'M':
82 scale = 1e6;
83 break;
84 case 'G':
85 scale = 1e9;
86 break;
87 case 'T':
88 scale = 1e12;
89 break;
90 case 'P':
91 scale = 1e15;
92 break;
93 default:
94 scale = 1;
95 break;
96 }
97 BESDEBUG(MODULE, prolog << "scale: " << scale << endl);
98
99 string result = size_str;
100 if (isalpha(scale_c)) result = size_str.substr(0, size_str.length() - 1);
101
102 long size = atol(result.c_str());
103 BESDEBUG(MODULE, prolog << "raw size: " << size << endl);
104
105 size *= scale;
106 BESDEBUG(MODULE, prolog << "scaled size: " << size << endl);
107 return size;
108}
109
/**
 * @brief Renders the nine standard fields of a tm struct as text, for debugging.
 * @param tms The broken-down time to display.
 * @return One "name: value" line per field, each terminated by a newline.
 */
string show_tm_struct(const tm tms)
{
    std::ostringstream text;
    text << "tm_sec: " << tms.tm_sec << '\n'
         << "tm_min: " << tms.tm_min << '\n'
         << "tm_hour: " << tms.tm_hour << '\n'
         << "tm_mday: " << tms.tm_mday << '\n'
         << "tm_mon: " << tms.tm_mon << '\n'
         << "tm_year: " << tms.tm_year << '\n'
         << "tm_wday: " << tms.tm_wday << '\n'
         << "tm_yday: " << tms.tm_yday << '\n'
         << "tm_isdst: " << tms.tm_isdst << '\n';
    return text.str();
}
127
/**
 * @brief Resets the nine standard fields of a tm struct to a known baseline.
 *
 * Every field is set to 0 except tm_mday, which is set to 1. Only these nine
 * fields are written.
 *
 * @param tms The struct to reset in place.
 */
void zero_tm_struct(tm &tms)
{
    // Time of day.
    tms.tm_hour = tms.tm_min = tms.tm_sec = 0;

    // Calendar date; the day of the month starts at 1, not 0.
    tms.tm_year = tms.tm_mon = 0;
    tms.tm_mday = 1;

    // Derived day counters and the daylight-saving flag.
    tms.tm_yday = tms.tm_wday = 0;
    tms.tm_isdst = 0;
}
143
144
145string HttpdDirScraper::httpd_time_to_iso_8601(const string httpd_time) const
146{
147 vector<string> tokens;
148 string delimiters = "- :";
149 BESUtil::tokenize(httpd_time, tokens, delimiters);
150
151 BESDEBUG(MODULE, prolog << "Found " << tokens.size() << " tokens." << endl);
152 vector<string>::iterator it = tokens.begin();
153 int i = 0;
154 if (BESDebug::IsSet(MODULE)) {
155 while (it != tokens.end()) {
156 BESDEBUG(MODULE, prolog << " token["<< i++ << "]: "<< *it << endl);
157 it++;
158 }
159 }
160
161 BESDEBUG(MODULE, prolog << "Second Field: "<< tokens[1] << endl);
162
163 const char *second_field = tokens[1].c_str();
164 bool is_alpha = true;
165 for(unsigned long i=0; is_alpha && i< tokens[1].length(); i++){
166 is_alpha = isalpha(second_field[i]);
167 }
168 time_t theTime;
169 if(is_alpha){
170 BESDEBUG(MODULE, prolog << "Detected Time Format A (\"DD-MM-YYY hh:mm\")" << endl);
171 theTime = parse_time_format_A(tokens);
172 }
173 else {
174 BESDEBUG(MODULE, prolog << "Detected Time Format B (\"YYYY-MM-DD hh:mm\")" << endl);
175 theTime = parse_time_format_B(tokens);
176 }
177 return BESUtil::get_time(theTime, false);
178
179}
180
186time_t HttpdDirScraper::parse_time_format_A(const vector<string> tokens) const
187{
188 // void BESUtil::tokenize(const string& str, vector<string>& tokens, const string& delimiters)
189 struct tm tm;
190 zero_tm_struct(tm);
191
192 if (tokens.size() > 2) {
193 std::istringstream(tokens[0]) >> tm.tm_mday;
194 BESDEBUG(MODULE, prolog << " tm.tm_mday: "<< tm.tm_mday << endl);
195
196 pair<string, int> mnth = *d_months.find(BESUtil::lowercase(tokens[1]));
197 BESDEBUG(MODULE, prolog << " mnth.first: "<< mnth.first << endl);
198 BESDEBUG(MODULE, prolog << " mnth.second: "<< mnth.second << endl);
199 tm.tm_mon = mnth.second;
200 BESDEBUG(MODULE, prolog << " tm.tm_mon: "<< tm.tm_mon << endl);
201
202 std::istringstream(tokens[2]) >> tm.tm_year;
203 tm.tm_year -= 1900;
204 BESDEBUG(MODULE, prolog << " tm.tm_year: "<< tm.tm_year << endl);
205
206 if (tokens.size() > 4) {
207 std::istringstream(tokens[3]) >> tm.tm_hour;
208 BESDEBUG(MODULE, prolog << " tm.tm_hour: "<< tm.tm_hour << endl);
209 std::istringstream(tokens[4]) >> tm.tm_min;
210 BESDEBUG(MODULE, prolog << " tm.tm_min: "<< tm.tm_min << endl);
211 }
212 }
213
214 BESDEBUG(MODULE, prolog << "tm struct: " << endl << show_tm_struct(tm));
215
216 time_t theTime = mktime(&tm);
217 BESDEBUG(MODULE, prolog << "theTime: " << theTime << endl);
218 return theTime;
219}
220
226time_t HttpdDirScraper::parse_time_format_B(const vector<string> tokens) const
227{
228 // void BESUtil::tokenize(const string& str, vector<string>& tokens, const string& delimiters)
229 struct tm tm;
230 zero_tm_struct(tm);
231
232 if (tokens.size() > 2) {
233 std::istringstream(tokens[0]) >> tm.tm_year;
234 tm.tm_year -= 1900;
235 BESDEBUG(MODULE, prolog << " tm.tm_year: "<< tm.tm_year << endl);
236
237 std::istringstream(tokens[1]) >> tm.tm_mon;
238 BESDEBUG(MODULE, prolog << " tm.tm_mon: "<< tm.tm_mon << endl);
239
240 std::istringstream(tokens[2]) >> tm.tm_mday;
241 BESDEBUG(MODULE, prolog << " tm.tm_mday: "<< tm.tm_mday << endl);
242
243 if (tokens.size() > 4) {
244 std::istringstream(tokens[3]) >> tm.tm_hour;
245 BESDEBUG(MODULE, prolog << " tm.tm_hour: "<< tm.tm_hour << endl);
246 std::istringstream(tokens[4]) >> tm.tm_min;
247 BESDEBUG(MODULE, prolog << " tm.tm_min: "<< tm.tm_min << endl);
248 }
249 }
250
251 BESDEBUG(MODULE, prolog << "tm struct: " << endl << show_tm_struct(tm));
252
253 time_t theTime = mktime(&tm);
254 BESDEBUG(MODULE, prolog << "ISO-8601 Time: " << theTime << endl);
255 return theTime;
256}
257
274void HttpdDirScraper::createHttpdDirectoryPageMap(std::string url, std::map<std::string, bes::CatalogItem *> &items) const
275{
276 const BESCatalogUtils *cat_utils = BESCatalogList::TheCatalogList()->find_catalog(BES_DEFAULT_CATALOG)->get_catalog_utils();
277
278 // Go get the text from the remote resource
279 std::shared_ptr<http::url> url_ptr(new http::url(url));
280 http::RemoteResource rhr(url_ptr);
281 rhr.retrieveResource();
282 stringstream buffer;
283
284 ifstream cache_file_is(rhr.getCacheFileName().c_str());
285 if(!cache_file_is.is_open()){
286 string msg = prolog + "ERROR - Failed to open cache file: " + rhr.getCacheFileName();
287 BESDEBUG(MODULE, msg << endl);
288 throw BESInternalError(msg ,__FILE__, __LINE__ );
289 }
290
291 buffer << cache_file_is.rdbuf();
292 string pageStr = buffer.str();
293 BESDEBUG(MODULE, prolog << "Page Content: " << endl << pageStr << endl);
294
295 // Does it look like an Apache httpd Index listing?
296 if(pageStr.find("<title>Index of ") == string::npos){
297 // Nope. Time to leave.
298 BESDEBUG(MODULE, prolog << "The url: " << url << " does not appear to reference an Apache httpd Index page." << endl);
299 return;
300 }
301
302 string aOpenStr = "<a ";
303 string aCloseStr = "</a>";
304 string hrefStr = "href=\"";
305 string tdOpenStr = "<td ";
306 string tdCloseStr = "</td>";
307
308 BESRegex hrefExcludeRegex("(^#.*$)|(^\\?C.*$)|(redirect\\/)|(^\\/$)|(^<img.*$)");
309 BESRegex nameExcludeRegex("^Parent Directory$");
310
311 bool done = false;
312 int next_start = 0;
313 while (!done) {
314 int aOpenIndex = pageStr.find(aOpenStr, next_start);
315 if (aOpenIndex < 0) {
316 done = true;
317 }
318 else {
319 int aCloseIndex = pageStr.find(aCloseStr, aOpenIndex + aOpenStr.length());
320 if (aCloseIndex < 0) {
321 done = true;
322 }
323 else {
324 int length;
325
326 // Locate the entire <a /> element
327 BESDEBUG(MODULE, prolog << "aOpenIndex: " << aOpenIndex << endl);
328 BESDEBUG(MODULE, prolog << "aCloseIndex: " << aCloseIndex << endl);
329 length = aCloseIndex + aCloseStr.length() - aOpenIndex;
330 string aElemStr = pageStr.substr(aOpenIndex, length);
331 BESDEBUG(MODULE, prolog << "Processing link: " << aElemStr << endl);
332
333 // Find the link text
334 int start = aElemStr.find(">") + 1;
335 int end = aElemStr.find("<", start);
336 length = end - start;
337 string linkText = aElemStr.substr(start, length);
338 BESDEBUG(MODULE, prolog << "Link Text: " << linkText << endl);
339
340 // Locate the href attribute
341 start = aElemStr.find(hrefStr) + hrefStr.length();
342 end = aElemStr.find("\"", start);
343 length = end - start;
344 string href = aElemStr.substr(start, length);
345 BESDEBUG(MODULE, prolog << "href: " << href << endl);
346
347 // attempt to get time string
348 string time_str;
349 int start_pos = getNextElementText(pageStr, "td", aCloseIndex + aCloseStr.length(), time_str);
350 BESDEBUG(MODULE, prolog << "time_str: '" << time_str << "'" << endl);
351
352 // attempt to get size string
353 string size_str;
354 start_pos = getNextElementText(pageStr, "td", start_pos, size_str);
355 BESDEBUG(MODULE, prolog << "size_str: '" << size_str << "'" << endl);
356
357 if ((linkText.find("<img") != string::npos) || !(linkText.length()) || (linkText.find("<<<") != string::npos)
358 || (linkText.find(">>>") != string::npos)) {
359 BESDEBUG(MODULE, prolog << "SKIPPING(image|copy|<<<|>>>): " << aElemStr << endl);
360 }
361 else {
362 if (href.length() == 0 || (((href.find("http://") == 0) || (href.find("https://") == 0)) && !(href.find(url) == 0))) {
363 // SKIPPING
364 BESDEBUG(MODULE, prolog << "SKIPPING(null or remote): " << href << endl);
365 }
366 else if (hrefExcludeRegex.match(href.c_str(), href.length(), 0) > 0) {
367 // SKIPPING
368 BESDEBUG(MODULE, prolog << "SKIPPING(hrefExcludeRegex) - href: '" << href << "'"<< endl);
369 }
370 else if (nameExcludeRegex.match(linkText.c_str(), linkText.length(), 0) > 0) {
371 // SKIPPING
372 BESDEBUG(MODULE, prolog << "SKIPPING(nameExcludeRegex) - name: '" << linkText << "'" << endl);
373 }
374 else if (BESUtil::endsWith(href, "/")) {
375 string node_name = href.substr(0, href.length() - 1);
376 // it's a directory aka a node
377 BESDEBUG(MODULE, prolog << "NODE: " << node_name << endl);
378 bes::CatalogItem *childNode = new bes::CatalogItem();
379 childNode->set_type(CatalogItem::node);
380 childNode->set_name(node_name);
381 childNode->set_is_data(false);
382 string iso_8601_time = httpd_time_to_iso_8601(time_str);
383 childNode->set_lmt(iso_8601_time);
384 // FIXME: For nodes the size should be the number of children, but how without crawling?
385 long size = get_size_val(size_str);
386 childNode->set_size(size);
387
388 items.insert(pair<std::string, bes::CatalogItem *>(node_name, childNode));
389 }
390 else {
391 // It's a file aka a leaf
392 BESDEBUG(MODULE, prolog << "LEAF: " << href << endl);
393 CatalogItem *leafItem = new CatalogItem();
394 leafItem->set_type(CatalogItem::leaf);
395 leafItem->set_name(href);
396 leafItem->set_is_data(cat_utils->is_data(href));
397 string iso_8601_time = httpd_time_to_iso_8601(time_str);
398 leafItem->set_lmt(iso_8601_time);
399 long size = get_size_val(size_str);
400 leafItem->set_size(size);
401
402 items.insert(pair<std::string, bes::CatalogItem *>(href, leafItem));
403 }
404 }
405 }
406 next_start = aCloseIndex + aCloseStr.length();
407 }
408 }
409}
410
423int HttpdDirScraper::getNextElementText(const string &page_str, const string element_name, int startIndex, string &resultText, bool trim) const
424{
425 string e_open_str = "<" + element_name + " ";
426 string e_close_str = "</" + element_name + ">";
427
428 // Locate the next "element_name" element
429 int start = page_str.find(e_open_str, startIndex);
430 int end = page_str.find(e_close_str, start + e_open_str.length());
431 if(start<0 || end<0 || end<start){
432 resultText="";
433 return startIndex;
434 }
435
436 int length = end + e_close_str.length() - start;
437 string element_str = page_str.substr(start, length);
438
439 // Find the text
440 start = element_str.find(">") + 1;
441 end = element_str.find("<", start);
442 length = end - start;
443 resultText = element_str.substr(start, length);
444
445 if (trim) BESUtil::removeLeadingAndTrailingBlanks(resultText);
446
447 BESDEBUG(MODULE, prolog << "resultText: '" << resultText << "'" << endl);
448 return startIndex + element_str.length();
449}
450
451/*
452 * @brief Returns the catalog node represented by the httpd directory page returned
453 * by dereferencing the passed url.
454 * @param url The url of the Apache httpd directory to process.
455 * @param path The path prefix that associates the location of this generated CatalogNode with it's
456 * correct position in the local service path.
457 */
458bes::CatalogNode *HttpdDirScraper::get_node(const string &url, const string &path) const
459{
460 BESDEBUG(MODULE, prolog << "Processing url: '" << url << "'"<< endl);
461 bes::CatalogNode *node = new bes::CatalogNode(path);
462
463 if (BESUtil::endsWith(url, "/")) {
464 // This always means the URL points to a node when coming from httpd
465 map<string, bes::CatalogItem *> items;
466 createHttpdDirectoryPageMap(url, items);
467
468 BESDEBUG(MODULE, prolog << "Found " << items.size() << " items." << endl);
469 map<string, bes::CatalogItem *>::iterator it;
470 it = items.begin();
471 while (it != items.end()) {
472 bes::CatalogItem *item = it->second;
473 BESDEBUG(MODULE, prolog << "Adding item: '" << item->get_name() << "'"<< endl);
474 if (item->get_type() == CatalogItem::node)
475 node->add_node(item);
476 else
477 node->add_leaf(item);
478 it++;
479 }
480 }
481 else {
482 // It's a leaf aka "item" response.
483 const BESCatalogUtils *cat_utils = BESCatalogList::TheCatalogList()->find_catalog(BES_DEFAULT_CATALOG)->get_catalog_utils();
484 std::vector<std::string> url_parts = BESUtil::split(url, '/', true);
485 string leaf_name = url_parts.back();
486
487 CatalogItem *item = new CatalogItem();
488 item->set_type(CatalogItem::leaf);
489 item->set_name(leaf_name);
490 item->set_is_data(cat_utils->is_data(leaf_name));
491
492 // FIXME: Find the Last Modified date? Head??
493 item->set_lmt(BESUtil::get_time(true));
494
495 // FIXME: Determine size of this thing? Do we "HEAD" all the leaves?
496 item->set_size(1);
497
498 node->set_leaf(item);
499 }
500 return node;
501}
502
// Disabled (#if 0) earlier version of HttpdDirScraper::get_node(); this region
// is never compiled. It calls createHttpdDirectoryPageMap() with separate sets
// of node and leaf names, a three-argument form that is not defined in the
// visible part of this file, and fills in placeholder time and size values for
// every item.
503#if 0
504
505bes::CatalogNode *HttpdDirScraper::get_node(const string &url, const string &path) const
506{
507 BESDEBUG(MODULE, prolog << "Processing url: '" << url << "'"<< endl);
508 bes::CatalogNode *node = new bes::CatalogNode(path);
509
510 if (BESUtil::endsWith(url, "/")) {
511
512 set<string> pageNodes;
513 set<string> pageLeaves;
514 createHttpdDirectoryPageMap(url, pageNodes, pageLeaves);
515
516 BESDEBUG(MODULE, prolog << "Found " << pageNodes.size() << " nodes." << endl);
517 BESDEBUG(MODULE, prolog << "Found " << pageLeaves.size() << " leaves." << endl);
518
519 set<string>::iterator it;
520
521 it = pageNodes.begin();
522 while (it != pageNodes.end()) {
523 string pageNode = *it;
524 if (BESUtil::endsWith(pageNode, "/")) pageNode = pageNode.substr(0, pageNode.length() - 1);
525
526 bes::CatalogItem *childNode = new bes::CatalogItem();
527 childNode->set_type(CatalogItem::node);
528
529 childNode->set_name(pageNode);
530 childNode->set_is_data(false);
531
532 // FIXME: Figure out the LMT if we can... HEAD?
533 childNode->set_lmt(BESUtil::get_time(true));
534
535 // FIXME: For nodes the size should be the number of children, but how without crawling?
536 childNode->set_size(0);
537
538 node->add_node(childNode);
539 it++;
540 }
541
542 it = pageLeaves.begin();
543 while (it != pageLeaves.end()) {
544 string leaf = *it;
545 CatalogItem *leafItem = new CatalogItem();
546 leafItem->set_type(CatalogItem::leaf);
547 leafItem->set_name(leaf);
548
549 // FIXME: wrangle up the Typematch and see if we think this thing is data or not.
550 leafItem->set_is_data(false);
551
552 // FIXME: Find the Last Modified date?
553 leafItem->set_lmt(BESUtil::get_time(true));
554
555 // FIXME: Determine size of this thing? Do we "HEAD" all the leaves?
556 leafItem->set_size(1);
557
558 node->add_leaf(leafItem);
559 it++;
560 }
561 }
562 else {
563 std::vector<std::string> url_parts = BESUtil::split(url,'/',true);
564 string leaf_name = url_parts.back();
565
566 CatalogItem *item = new CatalogItem();
567 item->set_type(CatalogItem::leaf);
568 item->set_name(leaf_name);
569 // FIXME: Find the Last Modified date?
570 item->set_lmt(BESUtil::get_time(true));
571
572 // FIXME: Determine size of this thing? Do we "HEAD" all the leaves?
573 item->set_size(1);
574
575 node->set_leaf(item);
576
577 }
578 return node;
579
580}
581#endif
582
583}
584 // namespace httpd_catalog
585
static BESCatalogList * TheCatalogList()
Get the singleton BESCatalogList instance.
bool is_data(const std::string &item) const
is there a handler that can process this
virtual BESCatalogUtils * get_catalog_utils() const
Get a pointer to the utilities, customized for this catalog.
Definition: BESCatalog.h:112
static bool IsSet(const std::string &flagName)
see if the debug context flagName is set to true
Definition: BESDebug.h:168
exception thrown if internal error encountered
Regular expression matching.
Definition: BESRegex.h:53
static std::vector< std::string > split(const std::string &s, char delim='/', bool skip_empty=true)
Splits the string s into the returned vector of tokens using the delimiter delim, skipping empty values when skip_empty is true.
Definition: BESUtil.cc:1065
static bool endsWith(std::string const &fullString, std::string const &ending)
Definition: BESUtil.cc:834
static void tokenize(const std::string &str, std::vector< std::string > &tokens, const std::string &delimiters="/")
Definition: BESUtil.cc:992
static std::string lowercase(const std::string &s)
Definition: BESUtil.cc:254
static void removeLeadingAndTrailingBlanks(std::string &key)
Definition: BESUtil.cc:445
static std::string get_time(bool use_local_time=false)
Definition: BESUtil.cc:1014
void set_name(std::string n)
Set the name of the item.
Definition: CatalogItem.h:135
std::string get_name() const
The name of this item in the node.
Definition: CatalogItem.h:133
void set_size(size_t s)
Set the size of the item.
Definition: CatalogItem.h:140
void set_is_data(bool id)
Is this item data that the BES should interpret?
Definition: CatalogItem.h:150
void set_lmt(std::string lmt)
Set the LMT for this item.
Definition: CatalogItem.h:145
item_type get_type() const
Get the type of this item (unknown, node or leaf)
Definition: CatalogItem.h:153
void set_type(item_type t)
Set the type for this item.
Definition: CatalogItem.h:155