35#include <BESCatalogList.h>
36#include <BESCatalogUtils.h>
37#include <CatalogItem.h>
39#include "RemoteResource.h"
40#include "HttpdCatalogNames.h"
42#include "HttpdDirScraper.h"
47#define prolog std::string("HttpdDirScraper::").append(__func__).append("() - ")
49namespace httpd_catalog {
// Constructor. Fills d_months with the twelve lower-case, three-letter English
// month abbreviations, each mapped to a zero-based month index
// (jan = 0 ... dec = 11).
// NOTE(review): the indices presumably follow the struct tm tm_mon convention
// (months since January). parse_time_format_A() assigns a looked-up value to
// tm.tm_mon, but the lookup itself is not visible in this listing -- confirm.
51HttpdDirScraper::HttpdDirScraper()
54 d_months.insert(pair<string, int>(
string(
"jan"), 0));
55 d_months.insert(pair<string, int>(
string(
"feb"), 1));
56 d_months.insert(pair<string, int>(
string(
"mar"), 2));
57 d_months.insert(pair<string, int>(
string(
"apr"), 3));
58 d_months.insert(pair<string, int>(
string(
"may"), 4));
59 d_months.insert(pair<string, int>(
string(
"jun"), 5));
60 d_months.insert(pair<string, int>(
string(
"jul"), 6));
61 d_months.insert(pair<string, int>(
string(
"aug"), 7));
62 d_months.insert(pair<string, int>(
string(
"sep"), 8));
63 d_months.insert(pair<string, int>(
string(
"oct"), 9));
64 d_months.insert(pair<string, int>(
string(
"nov"), 10));
65 d_months.insert(pair<string, int>(
string(
"dec"), 11));
// Turns the text of a size column into a long.
//
// @param size_str Size text; its last character may be an alphabetic scale
//                 suffix.
// @return NOTE(review): the return statement is not visible in this listing;
//         presumably the scaled size that is logged last -- confirm.
//
// NOTE(review): the lines that derive `scale` from the suffix and apply it to
// `size` are missing from this listing, so the supported suffixes and their
// multipliers cannot be stated from here.
72long HttpdDirScraper::get_size_val(
const string size_str)
const
// The last character is taken as the candidate scale suffix.
// NOTE(review): dereferencing rbegin() of an empty string is undefined
// behaviour and no emptiness guard is visible here -- confirm the full source
// has one, or that callers never pass an empty string.
74 char scale_c = *size_str.rbegin();
97 BESDEBUG(MODULE, prolog <<
"scale: " << scale << endl);
// Numeric part: the whole string, minus the final character when that
// character is alphabetic.
// NOTE(review): isalpha() is called with a plain char; for negative char
// values that is undefined unless the value is cast to unsigned char first.
99 string result = size_str;
100 if (isalpha(scale_c)) result = size_str.substr(0, size_str.length() - 1);
// atol() parses a leading integer and yields 0 when there is none, so a
// fractional part such as ".5" is ignored and non-numeric text becomes 0.
102 long size = atol(result.c_str());
103 BESDEBUG(MODULE, prolog <<
"raw size: " << size << endl);
106 BESDEBUG(MODULE, prolog <<
"scaled size: " << size << endl);
// Debugging helper: writes every field of a struct tm to the stream `ss`, one
// "name: value" pair per line, in the order tm_sec, tm_min, tm_hour, tm_mday,
// tm_mon, tm_year, tm_wday, tm_yday, tm_isdst.
//
// @param tms The broken-down time to dump (taken by value).
// @return NOTE(review): the declaration of `ss` and the return statement are
//         not visible in this listing; presumably the accumulated text of
//         `ss` is returned -- confirm.
113string show_tm_struct(
const tm tms)
116 ss <<
"tm_sec: " << tms.tm_sec << endl;
117 ss <<
"tm_min: " << tms.tm_min << endl;
118 ss <<
"tm_hour: " << tms.tm_hour << endl;
119 ss <<
"tm_mday: " << tms.tm_mday << endl;
120 ss <<
"tm_mon: " << tms.tm_mon << endl;
121 ss <<
"tm_year: " << tms.tm_year << endl;
122 ss <<
"tm_wday: " << tms.tm_wday << endl;
123 ss <<
"tm_yday: " << tms.tm_yday << endl;
124 ss <<
"tm_isdst: " << tms.tm_isdst << endl;
// Takes a struct tm by non-const reference, so it can modify the caller's
// object in place.
// NOTE(review): the body is not visible in this listing; going by the name it
// presumably resets the fields to zero -- confirm against the full source.
131void zero_tm_struct(tm &tms)
// Parses the last-modified text of a directory-listing row, choosing between
// two layouts based on the second token.
//
// @param httpd_time Time text from the listing page.
// @return NOTE(review): the code that turns `theTime` into the returned
//         string is not visible in this listing; going by the function name
//         it is presumably an ISO-8601 timestamp -- confirm.
145string HttpdDirScraper::httpd_time_to_iso_8601(
const string httpd_time)
const
// Separator characters are '-', ' ' and ':'.
// NOTE(review): the call that fills `tokens` is not visible here; presumably
// BESUtil::tokenize(httpd_time, tokens, delimiters) -- confirm.
147 vector<string> tokens;
148 string delimiters =
"- :";
151 BESDEBUG(MODULE, prolog <<
"Found " << tokens.size() <<
" tokens." << endl);
// Debug-only dump of each token. The declaration of `i` and the iterator
// increment are not visible in this listing.
152 vector<string>::iterator it = tokens.begin();
155 while (it != tokens.end()) {
156 BESDEBUG(MODULE, prolog <<
" token["<< i++ <<
"]: "<< *it << endl);
// NOTE(review): tokens[1] is read without a visible size check; indexing past
// the end of a vector is undefined behaviour -- confirm that a guard exists in
// the lines missing from this listing.
161 BESDEBUG(MODULE, prolog <<
"Second Field: "<< tokens[1] << endl);
// is_alpha ends up true only if every character of the second token is
// alphabetic (the loop stops at the first character that is not). An empty
// second token leaves it true.
// NOTE(review): isalpha() receives a plain char; negative values are undefined
// unless cast to unsigned char first.
163 const char *second_field = tokens[1].c_str();
164 bool is_alpha =
true;
165 for(
unsigned long i=0; is_alpha && i< tokens[1].length(); i++){
166 is_alpha = isalpha(second_field[i]);
// Layout A is handled by parse_time_format_A(), layout B by
// parse_time_format_B().
// NOTE(review): the branch condition is not visible in this listing;
// presumably layout A is chosen when is_alpha is true -- confirm. The year
// part of the layout A log text ("YYY") looks like a typo for "YYYY".
170 BESDEBUG(MODULE, prolog <<
"Detected Time Format A (\"DD-MM-YYY hh:mm\")" << endl);
171 theTime = parse_time_format_A(tokens);
174 BESDEBUG(MODULE, prolog <<
"Detected Time Format B (\"YYYY-MM-DD hh:mm\")" << endl);
175 theTime = parse_time_format_B(tokens);
// Builds a time_t from tokens laid out as: day, month, year, hour, minute.
//
// @param tokens Pre-split time fields (taken by value, so the vector is
//               copied on each call).
// @return NOTE(review): the return statement is not visible in this listing;
//         presumably `theTime`, the result of mktime() -- confirm.
//
// NOTE(review): the declaration and initialisation of the local `tm` are not
// visible here. Fields that are never assigned (for example tm_sec, tm_isdst)
// must be initialised before mktime() is called -- confirm.
186time_t HttpdDirScraper::parse_time_format_A(
const vector<string> tokens)
const
// The date fields are only read when at least three tokens are present.
192 if (tokens.size() > 2) {
// tokens[0] is the day of the month.
193 std::istringstream(tokens[0]) >> tm.tm_mday;
194 BESDEBUG(MODULE, prolog <<
" tm.tm_mday: "<< tm.tm_mday << endl);
// `mnth` is a (name, index) pair whose index becomes tm_mon.
// NOTE(review): how `mnth` is obtained is not visible in this listing;
// presumably a lookup of tokens[1] in d_months -- confirm, including what
// happens when the month name is not found.
197 BESDEBUG(MODULE, prolog <<
" mnth.first: "<< mnth.first << endl);
198 BESDEBUG(MODULE, prolog <<
" mnth.second: "<< mnth.second << endl);
199 tm.tm_mon = mnth.second;
200 BESDEBUG(MODULE, prolog <<
" tm.tm_mon: "<< tm.tm_mon << endl);
// tokens[2] is the year as written on the page.
// NOTE(review): struct tm stores years since 1900; the adjustment is not
// visible in this listing (a line is missing right after this one) -- confirm.
202 std::istringstream(tokens[2]) >> tm.tm_year;
204 BESDEBUG(MODULE, prolog <<
" tm.tm_year: "<< tm.tm_year << endl);
// Hour and minute are optional; they are read only when five or more tokens
// exist.
206 if (tokens.size() > 4) {
207 std::istringstream(tokens[3]) >> tm.tm_hour;
208 BESDEBUG(MODULE, prolog <<
" tm.tm_hour: "<< tm.tm_hour << endl);
209 std::istringstream(tokens[4]) >> tm.tm_min;
210 BESDEBUG(MODULE, prolog <<
" tm.tm_min: "<< tm.tm_min << endl);
214 BESDEBUG(MODULE, prolog <<
"tm struct: " << endl << show_tm_struct(tm));
// mktime() interprets the broken-down time as local time and may normalise
// the fields of `tm` in place.
216 time_t theTime = mktime(&tm);
217 BESDEBUG(MODULE, prolog <<
"theTime: " << theTime << endl);
// Builds a time_t from tokens laid out as: year, month, day, hour, minute,
// with every field numeric.
//
// @param tokens Pre-split time fields (taken by value, so the vector is
//               copied on each call).
// @return NOTE(review): the return statement is not visible in this listing;
//         presumably `theTime`, the result of mktime() -- confirm.
//
// NOTE(review): the declaration and initialisation of the local `tm` are not
// visible here. Fields that are never assigned (for example tm_sec, tm_isdst)
// must be initialised before mktime() is called -- confirm.
226time_t HttpdDirScraper::parse_time_format_B(
const vector<string> tokens)
const
// The date fields are only read when at least three tokens are present.
232 if (tokens.size() > 2) {
// tokens[0] is the year as written on the page.
// NOTE(review): struct tm stores years since 1900; the adjustment is not
// visible in this listing (a line is missing right after this one) -- confirm.
233 std::istringstream(tokens[0]) >> tm.tm_year;
235 BESDEBUG(MODULE, prolog <<
" tm.tm_year: "<< tm.tm_year << endl);
// tokens[1] is the month number as written on the page.
// NOTE(review): struct tm months are zero-based (0 = January). No adjustment
// is visible between this read and the debug line that follows it; if the
// page uses 1-12 the result would be one month late -- confirm against the
// full source.
237 std::istringstream(tokens[1]) >> tm.tm_mon;
238 BESDEBUG(MODULE, prolog <<
" tm.tm_mon: "<< tm.tm_mon << endl);
// tokens[2] is the day of the month.
240 std::istringstream(tokens[2]) >> tm.tm_mday;
241 BESDEBUG(MODULE, prolog <<
" tm.tm_mday: "<< tm.tm_mday << endl);
// Hour and minute are optional; they are read only when five or more tokens
// exist.
243 if (tokens.size() > 4) {
244 std::istringstream(tokens[3]) >> tm.tm_hour;
245 BESDEBUG(MODULE, prolog <<
" tm.tm_hour: "<< tm.tm_hour << endl);
246 std::istringstream(tokens[4]) >> tm.tm_min;
247 BESDEBUG(MODULE, prolog <<
" tm.tm_min: "<< tm.tm_min << endl);
251 BESDEBUG(MODULE, prolog <<
"tm struct: " << endl << show_tm_struct(tm));
// mktime() interprets the broken-down time as local time. The debug label
// below says "ISO-8601 Time", but the value printed is the raw time_t.
253 time_t theTime = mktime(&tm);
254 BESDEBUG(MODULE, prolog <<
"ISO-8601 Time: " << theTime << endl);
// Retrieves the page at `url`, scans it for <a ...>...</a> elements and, for
// each link that is not filtered out, inserts a bes::CatalogItem pointer into
// `items`.
//
// @param url   Address of the directory page (taken by value).
// @param items Output map. Node entries are keyed by the href without its
//              last character, leaf entries by the href itself.
//
// NOTE(review): this listing omits many original lines (the loop header, the
// declarations of rhr, buffer, next_start, length, time_str, size_str,
// childNode and leafItem, and whatever follows the error checks), so the
// comments below cover only the statements that are visible.
// NOTE(review): the map holds raw CatalogItem pointers and the allocation
// sites are not visible; presumably the caller takes ownership -- confirm.
274void HttpdDirScraper::createHttpdDirectoryPageMap(std::string url, std::map<std::string, bes::CatalogItem *> &items)
const
// Wrap the URL and fetch the resource. `rhr` is presumably an
// http::RemoteResource built from url_ptr (declaration not visible).
279 std::shared_ptr<http::url> url_ptr(
new http::url(url));
281 rhr.retrieveResource();
// Open the locally cached copy of the page.
284 ifstream cache_file_is(rhr.getCacheFileName().c_str());
// Failure to open is logged; what happens next (presumably an exception) is
// not visible in this listing.
285 if(!cache_file_is.is_open()){
286 string msg = prolog +
"ERROR - Failed to open cache file: " + rhr.getCacheFileName();
287 BESDEBUG(MODULE, msg << endl);
// Read the whole cache file into memory as one string.
291 buffer << cache_file_is.rdbuf();
292 string pageStr = buffer.str();
293 BESDEBUG(MODULE, prolog <<
"Page Content: " << endl << pageStr << endl);
// A page without the "<title>Index of " marker is reported as not being an
// Apache httpd index page; the follow-up action is not visible here.
296 if(pageStr.find(
"<title>Index of ") == string::npos){
298 BESDEBUG(MODULE, prolog <<
"The url: " << url <<
" does not appear to reference an Apache httpd Index page." << endl);
// Markup fragments used while scanning. tdOpenStr and tdCloseStr are not used
// by any statement visible in this listing.
302 string aOpenStr =
"<a ";
303 string aCloseStr =
"</a>";
304 string hrefStr =
"href=\"";
305 string tdOpenStr =
"<td ";
306 string tdCloseStr =
"</td>";
// hrefs to ignore: starting with '#', starting with "?C", containing
// "redirect/", exactly "/", or starting with "<img".
308 BESRegex hrefExcludeRegex(
"(^#.*$)|(^\\?C.*$)|(redirect\\/)|(^\\/$)|(^<img.*$)");
// Link text to ignore: exactly "Parent Directory".
309 BESRegex nameExcludeRegex(
"^Parent Directory$");
// Locate the next anchor element starting at next_start.
// NOTE(review): find() returns size_t; storing it in an int relies on
// string::npos converting to a negative value, and would misbehave for
// offsets above INT_MAX.
314 int aOpenIndex = pageStr.find(aOpenStr, next_start);
315 if (aOpenIndex < 0) {
319 int aCloseIndex = pageStr.find(aCloseStr, aOpenIndex + aOpenStr.length());
320 if (aCloseIndex < 0) {
327 BESDEBUG(MODULE, prolog <<
"aOpenIndex: " << aOpenIndex << endl);
328 BESDEBUG(MODULE, prolog <<
"aCloseIndex: " << aCloseIndex << endl);
// Extract the complete element, from "<a " through "</a>" inclusive.
329 length = aCloseIndex + aCloseStr.length() - aOpenIndex;
330 string aElemStr = pageStr.substr(aOpenIndex, length);
331 BESDEBUG(MODULE, prolog <<
"Processing link: " << aElemStr << endl);
// Link text: everything between the first '>' and the next '<'.
334 int start = aElemStr.find(
">") + 1;
335 int end = aElemStr.find(
"<", start);
336 length = end - start;
337 string linkText = aElemStr.substr(start, length);
338 BESDEBUG(MODULE, prolog <<
"Link Text: " << linkText << endl);
// href value: everything between href=" and the next double quote.
// NOTE(review): no check is visible for an anchor without href="; in that
// case find() yields npos and `start` is computed from it -- confirm that
// such anchors cannot reach this point.
341 start = aElemStr.find(hrefStr) + hrefStr.length();
342 end = aElemStr.find(
"\"", start);
343 length = end - start;
344 string href = aElemStr.substr(start, length);
345 BESDEBUG(MODULE, prolog <<
"href: " << href << endl);
// The next "td" element after the anchor supplies the time text, and the one
// after that supplies the size text.
349 int start_pos = getNextElementText(pageStr,
"td", aCloseIndex + aCloseStr.length(), time_str);
350 BESDEBUG(MODULE, prolog <<
"time_str: '" << time_str <<
"'" << endl);
354 start_pos = getNextElementText(pageStr,
"td", start_pos, size_str);
355 BESDEBUG(MODULE, prolog <<
"size_str: '" << size_str <<
"'" << endl);
// Filter 1: link text that contains "<img", is empty, or contains "<<<" or
// ">>>".
357 if ((linkText.find(
"<img") != string::npos) || !(linkText.length()) || (linkText.find(
"<<<") != string::npos)
358 || (linkText.find(
">>>") != string::npos)) {
359 BESDEBUG(MODULE, prolog <<
"SKIPPING(image|copy|<<<|>>>): " << aElemStr << endl);
// Filter 2: an empty href, or an absolute http(s) href that does not begin
// with `url`.
362 if (href.length() == 0 || (((href.find(
"http://") == 0) || (href.find(
"https://") == 0)) && !(href.find(url) == 0))) {
364 BESDEBUG(MODULE, prolog <<
"SKIPPING(null or remote): " << href << endl);
// Filter 3: href matches the exclusion pattern.
366 else if (hrefExcludeRegex.match(href.c_str(), href.length(), 0) > 0) {
368 BESDEBUG(MODULE, prolog <<
"SKIPPING(hrefExcludeRegex) - href: '" << href <<
"'"<< endl);
// Filter 4: link text matches the exclusion pattern.
370 else if (nameExcludeRegex.match(linkText.c_str(), linkText.length(), 0) > 0) {
372 BESDEBUG(MODULE, prolog <<
"SKIPPING(nameExcludeRegex) - name: '" << linkText <<
"'" << endl);
// Node entry: the name is the href with its final character removed.
// NOTE(review): the condition selecting this branch is not visible;
// presumably the href ends with "/" -- confirm.
375 string node_name = href.substr(0, href.length() - 1);
377 BESDEBUG(MODULE, prolog <<
"NODE: " << node_name << endl);
379 childNode->
set_type(CatalogItem::node);
382 string iso_8601_time = httpd_time_to_iso_8601(time_str);
383 childNode->
set_lmt(iso_8601_time);
385 long size = get_size_val(size_str);
// map::insert keeps an existing entry when the key is already present.
// NOTE(review): in that case the new pointer would not be stored -- confirm
// that duplicate names cannot occur or are handled.
388 items.insert(pair<std::string, bes::CatalogItem *>(node_name, childNode));
// Leaf entry: keyed by the href exactly as written on the page.
392 BESDEBUG(MODULE, prolog <<
"LEAF: " << href << endl);
394 leafItem->
set_type(CatalogItem::leaf);
397 string iso_8601_time = httpd_time_to_iso_8601(time_str);
398 leafItem->
set_lmt(iso_8601_time);
399 long size = get_size_val(size_str);
402 items.insert(pair<std::string, bes::CatalogItem *>(href, leafItem));
// Continue scanning just past the "</a>" that closed this anchor.
406 next_start = aCloseIndex + aCloseStr.length();
// Finds the next element named `element_name` in `page_str` at or after
// `startIndex` and stores the text between its first '>' and the following
// '<' in `resultText`.
//
// @param page_str     Text to search.
// @param element_name Tag name without angle brackets, e.g. "td".
// @param startIndex   Offset in page_str at which the search starts.
// @param resultText   Output: the extracted text.
// @param trim         NOTE(review): not used by any statement visible in this
//                     listing; presumably controls whitespace trimming of
//                     resultText -- confirm.
// @return startIndex plus the length of the matched element (opening tag
//         through closing tag).
//         NOTE(review): this is based on startIndex, not on the offset where
//         the element was found, so when other text precedes the element the
//         returned offset lies before the element's end -- confirm this is
//         intended.
423int HttpdDirScraper::getNextElementText(
const string &page_str,
const string element_name,
int startIndex,
string &resultText,
bool trim)
const
// The opening pattern is "<name " with a trailing space, so an opening tag
// written without attributes ("<name>") does not match.
425 string e_open_str =
"<" + element_name +
" ";
426 string e_close_str =
"</" + element_name +
">";
// NOTE(review): find() returns size_t; the int variables rely on string::npos
// converting to a negative value. The second find() runs even when the first
// one failed.
429 int start = page_str.find(e_open_str, startIndex);
430 int end = page_str.find(e_close_str, start + e_open_str.length());
// Failure case: no opening tag, no closing tag, or closing before opening.
// What is done in that case is not visible in this listing.
431 if(start<0 || end<0 || end<start){
// Whole element, opening tag through closing tag inclusive.
436 int length = end + e_close_str.length() - start;
437 string element_str = page_str.substr(start, length);
// From here on `start` and `end` are offsets inside element_str, not inside
// page_str.
440 start = element_str.find(
">") + 1;
441 end = element_str.find(
"<", start);
442 length = end - start;
443 resultText = element_str.substr(start, length);
447 BESDEBUG(MODULE, prolog <<
"resultText: '" << resultText <<
"'" << endl);
448 return startIndex + element_str.length();
// Builds a catalog node for `url` from the items produced by
// createHttpdDirectoryPageMap(url, items).
//
// @param url  Address to scrape.
// @param path NOTE(review): not used by any statement visible in this
//             listing; presumably the catalog path given to the new node --
//             confirm.
// @return NOTE(review): the creation of `node` and the return statement are
//         not visible in this listing; presumably the populated node --
//         confirm, including who owns it.
458bes::CatalogNode *HttpdDirScraper::get_node(
const string &url,
const string &path)
const
460 BESDEBUG(MODULE, prolog <<
"Processing url: '" << url <<
"'"<< endl);
// Collect every entry of the directory page, keyed by name.
465 map<string, bes::CatalogItem *> items;
466 createHttpdDirectoryPageMap(url, items);
468 BESDEBUG(MODULE, prolog <<
"Found " << items.size() <<
" items." << endl);
// Hand each item to the node: node-type items through add_node(), all others
// through add_leaf(). The initialisation and increment of `it`, and the
// assignment of `item`, are not visible in this listing.
469 map<string, bes::CatalogItem *>::iterator it;
471 while (it != items.end()) {
473 BESDEBUG(MODULE, prolog <<
"Adding item: '" << item->
get_name() <<
"'"<< endl);
474 if (item->
get_type() == CatalogItem::node)
475 node->add_node(item);
477 node->add_leaf(item);
// Single-item case: the last '/'-separated part of the URL is taken as the
// leaf name and one item is attached with set_leaf().
// NOTE(review): the condition selecting this path and the construction of
// `item` are not visible; presumably used when the URL does not name a
// directory -- confirm. back() on an empty vector is undefined behaviour, so
// confirm the URL always yields at least one part.
484 std::vector<std::string> url_parts =
BESUtil::split(url,
'/',
true);
485 string leaf_name = url_parts.back();
498 node->set_leaf(item);
// Second definition of get_node() with the same signature as the one above.
// It works from two sets of names (nodes and leaves) rather than a map of
// items.
// NOTE(review): two definitions with an identical signature cannot both be
// compiled; presumably one of them is disabled by preprocessor lines that are
// missing from this listing -- confirm which one is live.
//
// @param url  Address to scrape.
// @param path NOTE(review): not used by any statement visible in this
//             listing -- confirm its role.
// @return NOTE(review): the creation of `node` and the return statement are
//         not visible in this listing; presumably the populated node --
//         confirm.
505bes::CatalogNode *HttpdDirScraper::get_node(
const string &url,
const string &path)
const
507 BESDEBUG(MODULE, prolog <<
"Processing url: '" << url <<
"'"<< endl);
// Uses a three-argument createHttpdDirectoryPageMap() overload that is not
// visible in this listing; it fills the two name sets.
512 set<string> pageNodes;
513 set<string> pageLeaves;
514 createHttpdDirectoryPageMap(url, pageNodes, pageLeaves);
516 BESDEBUG(MODULE, prolog <<
"Found " << pageNodes.size() <<
" nodes." << endl);
517 BESDEBUG(MODULE, prolog <<
"Found " << pageLeaves.size() <<
" leaves." << endl);
519 set<string>::iterator it;
// Child nodes: a trailing "/" is removed from the name, then a node-type item
// is added to the result. The creation of childNode, its remaining setters
// and the iterator increment are not visible in this listing.
521 it = pageNodes.begin();
522 while (it != pageNodes.end()) {
523 string pageNode = *it;
524 if (
BESUtil::endsWith(pageNode,
"/")) pageNode = pageNode.substr(0, pageNode.length() - 1);
527 childNode->
set_type(CatalogItem::node);
538 node->add_node(childNode);
// Leaves: each name becomes a leaf-type item added to the result. The
// creation of leafItem and its remaining setters are not visible in this
// listing.
542 it = pageLeaves.begin();
543 while (it != pageLeaves.end()) {
546 leafItem->
set_type(CatalogItem::leaf);
558 node->add_leaf(leafItem);
// Single-item case: the last '/'-separated part of the URL is taken as the
// leaf name and one item is attached with set_leaf().
// NOTE(review): the selecting condition and the construction of `item` are
// not visible -- confirm. back() on an empty vector is undefined behaviour.
563 std::vector<std::string> url_parts =
BESUtil::split(url,
'/',
true);
564 string leaf_name = url_parts.back();
575 node->set_leaf(item);
static BESCatalogList * TheCatalogList()
Get the singleton BESCatalogList instance.
bool is_data(const std::string &item) const
Is there a handler that can process this item?
virtual BESCatalogUtils * get_catalog_utils() const
Get a pointer to the utilities, customized for this catalog.
static bool IsSet(const std::string &flagName)
see if the debug context flagName is set to true
exception thrown if internal error encountered
Regular expression matching.
static std::vector< std::string > split(const std::string &s, char delim='/', bool skip_empty=true)
Splits the string s into the returned vector of tokens using the delimiter delim, skipping empty values when skip_empty is true.
static bool endsWith(std::string const &fullString, std::string const &ending)
static void tokenize(const std::string &str, std::vector< std::string > &tokens, const std::string &delimiters="/")
static std::string lowercase(const std::string &s)
static void removeLeadingAndTrailingBlanks(std::string &key)
static std::string get_time(bool use_local_time=false)
void set_name(std::string n)
Set the name of the item.
std::string get_name() const
The name of this item in the node.
void set_size(size_t s)
Set the size of the item.
void set_is_data(bool id)
Is this item data that the BES should interpret?
void set_lmt(std::string lmt)
Set the LMT for this item.
item_type get_type() const
Get the type of this item (unknown, node or leaf)
void set_type(item_type t)
Set the type for this item.