module UrlScrubber

Constants

VERSION

Public Class Methods

find_identity_from_url(url) click to toggle source
# File lib/url_scrubber.rb, line 124
# Returns the last "/"-separated segment of the scrubbed form of +url+.
#
# url - the URL to inspect; blank values short-circuit to nil.
#
# Returns the trailing path segment as a String, or nil when +url+ is blank
# or scrubbing produces nothing.
def self.find_identity_from_url(url)
  return nil unless url.present?

  scrubbed = UrlScrubber.scrub(url)
  return nil unless scrubbed

  scrubbed.split("/").last
end
find_linkedin_identity_from_url(url) click to toggle source
# File lib/url_scrubber.rb, line 131
# Extracts the LinkedIn identifier from +url+: the company or profile slug,
# the first segment after "/pub/", the group slug, or the numeric group id.
#
# url - a URL String, or nil.
#
# Returns the identifier String, or nil when +url+ is nil, cannot be scrubbed,
# or matches none of the LinkedIn URL shapes checked below.
def self.find_linkedin_identity_from_url(url)
  return nil if url.nil?

  scrubbed_url = scrub(url)
  # scrub yields nil when the input contains no http(s) URL. The group branches
  # previously called include? on that nil and raised NoMethodError.
  return nil unless scrubbed_url

  if linkedin_company_url?(scrubbed_url) || scrubbed_url.include?('http://linkedin.com/in/')
    scrubbed_url.split("/").last
  elsif scrubbed_url.include?('http://linkedin.com/pub/')
    # For /pub/ URLs the identity is the first path segment after "/pub/".
    name_segment = scrubbed_url.partition('linkedin.com/pub/')[2].split('/').first
    name_segment ? drop_url_ampersand!(name_segment) : nil
  elsif scrubbed_url.include?('linkedin.com/groups/')
    scrubbed_url.split("/").last
  elsif scrubbed_url.include?('linkedin.com/groups?gid=')
    # Keep only the gid value; anything from the first "&" onwards is dropped.
    drop_url_ampersand!(scrubbed_url.partition('linkedin.com/groups?gid=')[2])
  end
end
ideal_form?(url) click to toggle source
# File lib/url_scrubber.rb, line 70
# Reports whether +url+, once scrubbed, is already in the canonical form this
# module expects for the service it belongs to.
#
# url - the URL String to check.
#
# Returns false when scrubbing yields nothing, the result of the per-service
# pattern check for known services, and true for any other service.
def self.ideal_form?(url)
  url = scrub(url)
  return false unless url

  case service_of(url)
  when :vkontakte
    !!url.match(%r{^http://vk\.com/[\w_]+$})
  when :weibo
    !!url.match(%r{^http://weibo\.com/[\w_-]+$})
  when :youtube
    !!url.match(%r{^http://youtube\.com/[\w_-]+$})
  when :twitter
    !!url.match(%r{^http://twitter\.com/[\w_]+$})
  when :facebook
    # The "?" of "profile.php?id=" must be escaped. Unescaped it made the second
    # "p" optional, so "profile.php?id=123" never matched while
    # "profile.phid=123" did.
    !!url.match(%r{^http://facebook\.com/(profile\.php\?id=\d+|[\w_\.-]+)$}) ||
      !!url.match(%r{^http://facebook\.com/groups/[\w_\.-]+$})
  when :linkedin
    !!url.match(%r{^http://linkedin\.com/pub/[\w-]+/[\w]+/[\w]+/[\w]+$}) ||
      !!url.match(%r{^http://linkedin\.com/in/[\w_-]+$}) ||
      !!url.match(%r{^http://linkedin\.com/(company/[\w_-]+|profile/view\?id=\d+)$}) ||
      !!url.match(%r{^http://linkedin\.com/(groups\?gid=[0-9]+)$}) ||
      !!url.match(%r{^http://linkedin\.com/(groups/[\w_-]+)$})
  when :google
    !!url.match(%r{^http://plus\.google\.com/(\+[\w_-]+|\d+)$}) ||
      !!url.match(%r{^http://plus\.google\.com/communities/\d+$})
  when :slideshare
    !!url.match(%r{^http://slideshare\.net/[\w_-]+$})
  when :flickr
    !!url.match(%r{^http://flickr\.com/[\w_\@-]+$}) ||
      !!url.match(%r{^http://flickr\.com/groups/[\w_\@\.-]+$})
  when :pinterest
    !!url.match(%r{^http://pinterest\.com/[\w_-]+$})
  when :yelp
    !!url.match(%r{^http://yelp\.com/[\w_-]+$})
  when :vimeo
    # A single path segment counts unless the URL ends in "/<digits>".
    (!!url.match(%r{^http://vimeo\.com/[\w_-]+$}) && !url.match(%r{/\d+$})) ||
      !!url.match(%r{^http://vimeo\.com/groups/[\w_\.-]+$})
  when :instagram
    !!url.match(%r{^http://instagram\.com/[\w_]+$})
  when :tumblr
    # Blog subdomain only, and never the "www." host.
    !!url.match(%r{^http://[\w_]+\.tumblr\.com$}) && !url.index("://www.")
  else
    true
  end
end
linkedin_company_url?(url) click to toggle source
# File lib/url_scrubber.rb, line 110
# True when the scrubbed form of +url+ contains the LinkedIn company prefix
# "http://linkedin.com/company/"; false otherwise, including when scrubbing
# yields nothing.
def self.linkedin_company_url?(url)
  scrubbed = scrub(url)
  scrubbed ? scrubbed.include?('http://linkedin.com/company/') : false
end
linkedin_personal_url?(url) click to toggle source
# File lib/url_scrubber.rb, line 117
# True when the scrubbed form of +url+ contains either LinkedIn personal
# profile prefix ("/in/" or "/pub/"); false otherwise, including when
# scrubbing yields nothing.
def self.linkedin_personal_url?(url)
  scrubbed = scrub(url)
  return false unless scrubbed

  ['http://linkedin.com/in/', 'http://linkedin.com/pub/'].any? do |prefix|
    scrubbed.include?(prefix)
  end
end
maps_to_public_url(url) click to toggle source
# File lib/url_scrubber.rb, line 163
# Maps a "business." Facebook or Google URL to its public counterpart.
#
# url - the URL String to translate.
#
# Returns the scrubbed URL with "http://business.facebook.com" replaced by
# "http://facebook.com", or "http://business.google.com" replaced by
# "http://plus.google.com". Returns nil for any other host, and for input that
# cannot be scrubbed or parsed.
def self.maps_to_public_url(url)
  scrubbed = scrub(url)
  return nil if scrubbed.nil?

  # URI.escape was removed in Ruby 3.0; URI::DEFAULT_PARSER.escape performs the
  # same percent-encoding.
  parsed = URI.parse(URI::DEFAULT_PARSER.escape(url))
  # No host (e.g. blank or scheme-less input) means nothing can be mapped.
  return nil unless parsed.host

  host = Domainatrix.parse(parsed.host)
  return nil unless host.subdomain == "business"

  case host.domain
  when "facebook"
    scrubbed.sub("http://business.facebook.com", "http://facebook.com")
  when "google"
    scrubbed.sub("http://business.google.com", "http://plus.google.com")
  end
rescue URI::InvalidURIError
  # Same handling as valid_url?: an unparseable URL is "no mapping", not a crash.
  nil
end
scrub(url) click to toggle source
# File lib/url_scrubber.rb, line 9
# Normalizes a URL into the module's canonical "http://..." form.
#
# url - the raw URL String (may contain surrounding text).
#
# Returns +url+ untouched when it is blank, is an "app://" URL, or is a TikTok
# business manager URL. Returns nil when no http(s)-like URL is found in the
# input. Otherwise returns a new String with the scheme forced to http,
# trailing "/", ";" and "," removed, the domain lower-cased, well-known
# subdomains and HTML tags stripped, service-specific rules applied, and any
# "#anchor" dropped.
def self.scrub(url)
  return url if url.blank?
  return url if /^app:\/\//.match(url)  # Do not scrub app-only URLs
  # Don't scrub tik tok business manager urls, quick fix until we can implement a different solution,
  # https://business.tiktok.com/manage/overview?org_id=6974497704617492482
  # The dots are escaped so only the literal "www." / "business.tiktok.com" hosts are exempted.
  return url if /^https?:\/\/(www\.)?business\.tiktok\.com\/manage\//.match(url)

  m = url.match(/(htt?ps?:\/\/\S+)/i)
  return nil unless m

  # m[1] is a fresh String, so the in-place edits below never touch the caller's argument.
  url = m[1]
  # Normalize every scheme spelling accepted above (http, https, htp, htps) to "http".
  # The former two-step substitution turned "htps" into "https" and left it that way.
  url.sub!(/\Ahtt?ps?/i, 'http')
  url.sub!(/\/+$/, '')
  url.sub!(/;+$/, '')
  url.sub!('#!/', '')
  url.sub!('%27', '\'')
  url = downcase_domain(url)
  remove_subdomain!(url)
  remove_html_tags!(url)
  # CHANGED we depend on the special case methods to decide if and when to drop the query string part of the URL
  url = drop_anchor!(special_cases(url))
  url.sub!(/,+$/, "")    # remove one or more trailing commas at the end of the URL
  url.gsub!(/\/+$/, '') # remove any trailing slashes (/) in the resulting URL
  url
end
service_of(url) click to toggle source
# File lib/url_scrubber.rb, line 37
# Identifies which known service +url+ belongs to.
#
# url - the URL String handed to Domainatrix.parse.
#
# Returns the service Symbol for a recognised registrable domain, :google for
# hosts ending in "plus.google.com", and :other for everything else. When the
# parsed URL has no host, "No Domain Match" is logged at debug level.
def self.service_of(url)
  services_by_domain = {
    'facebook'   => :facebook,
    'fb'         => :facebook,
    'flickr'     => :flickr,
    'instagram'  => :instagram,
    'linkedin'   => :linkedin,
    'pinterest'  => :pinterest,
    'slideshare' => :slideshare,
    'tumblr'     => :tumblr,
    'twitter'    => :twitter,
    'vimeo'      => :vimeo,
    'vk'         => :vkontakte,
    'weibo'      => :weibo,
    'yelp'       => :yelp,
    'youtube'    => :youtube
  }

  url_parts = Domainatrix.parse(url)
  unless url_parts.host.present?
    Rails.logger.debug "No Domain Match"
    return :other
  end

  service = services_by_domain[url_parts.domain]
  return service if service

  # Google+ is recognised by its full host, not by the registrable domain alone.
  return :google if url_parts.host =~ /\bplus\.google\.com$/

  :other
end
valid_url?(url) click to toggle source

Requirements:

  1. must have http/https scheme

  2. no “@” anywhere in the passed-in URL string

  3. valid URI as determined by Ruby's URI.parse (the code below does not call Addressable::URI)

# File lib/url_scrubber.rb, line 154
# Checks whether +url+ is an acceptable http/https URL.
#
# url - the candidate URL String.
#
# Returns true only when +url+ is a String, contains no "@", parses with
# URI.parse after percent-escaping, and has an http or https scheme.
# Returns false otherwise, including for nil input and unparseable URIs.
def self.valid_url?(url)
  return false unless url.is_a?(String)
  return false if url.include?("@")

  # URI.escape was removed in Ruby 3.0; URI::DEFAULT_PARSER.escape performs the
  # same percent-encoding.
  parsed = URI.parse(URI::DEFAULT_PARSER.escape(url))
  %w(http https).include?(parsed.scheme)
rescue URI::InvalidURIError
  false
end

Private Class Methods

check_for_facebook_redirection(uri_str, limit = 5) click to toggle source
# File lib/url_scrubber.rb, line 438
# Follows HTTP redirects for a URL and reports where it finally lands.
# Intended for Facebook URLs only.
#
# uri_str - the URL String to request. It is rewritten to "https://www." form first.
# limit   - the number of redirects still allowed (default 5).
#
# Returns a two-element Array [url_string, result], where result is one of:
# * the final Net::HTTPResponse;
# * a synthetic Net::HTTPClientError: 400 on timeout, 404 on any other request
#   error or a redirect to a known failure page, 401 on a redirect to a login page;
# * a CustomError when the URL cannot be parsed.
#
# Raises RuntimeError ('Too many HTTP redirects') once +limit+ reaches 0.
def self.check_for_facebook_redirection(uri_str, limit = 5)
  login_patterns = [
    # pages that require user logins
    %r{^.*/login[^/]*$}
  ]

  failure_patterns = [
    # pages that give 200 codes but actually indicate a not found
    %r{linkedin\.com/home\?report%2Efailure}i
  ]

  raise 'Too many HTTP redirects' if limit == 0

  uri_str_new = uri_str.sub('http://', 'https://')
  uri_str_new = uri_str_new.sub('https://', 'https://www.') unless uri_str_new.include?("https://www.")

  begin
    # URI.escape was removed in Ruby 3.0; URI::DEFAULT_PARSER.escape performs the same encoding.
    url = URI.parse(URI::DEFAULT_PARSER.escape(uri_str_new))
  rescue URI::InvalidURIError => e
    return [uri_str_new, CustomError.new(786, "Invalid URI #{uri_str_new} : #{e.message}")]
  end

  http = Net::HTTP.new(url.host, url.port)
  http.open_timeout = 7 # only wait up to 7 seconds for the connection to be established
  http.read_timeout = 10 # and up to 10 seconds for a response
  if url.port == 443
    http.use_ssl = true
    # NOTE(review): certificate verification is disabled here, so the peer is not
    # authenticated. Kept as-is to preserve behavior; confirm whether this is still wanted.
    http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  else
    http.use_ssl = false
  end
  request = Net::HTTP::Get.new(url.request_uri, { 'User-Agent' => USER_AGENT })

  begin
    response = http.request(request)
  rescue Timeout::Error
    return [uri_str_new, Net::HTTPClientError.new('1.1', '400', 'Unreachable')]
  rescue StandardError
    # Any other request failure is reported as "not found". StandardError (rather than
    # Exception) lets interrupts and exit signals propagate.
    return [uri_str_new, Net::HTTPClientError.new('1.1', '404', 'Not Found')]
  end

  return [uri_str_new, response] unless response.is_a?(Net::HTTPRedirection)

  location = response['location']
  # A redirect without a Location header cannot be followed; hand back what we got.
  return [uri_str_new, response] if location.nil? || location.empty?

  if location[0, 4] == "http"
    if failure_patterns.any? { |pattern| location.match(pattern) }
      # got redirected to a page indicating failure, so act like it's a 404
      return [uri_str_new, Net::HTTPClientError.new('1.1', '404', 'Not Found')]
    end

    # The original tested `redirected_url` here, a name not yet assigned at this point,
    # which raised NameError on every absolute redirect. The redirect target is what
    # has to be tested.
    if login_patterns.any? { |pattern| location.match(pattern) }
      # got redirected to a login page: report it as inaccessible, keeping the previous url
      return [uri_str_new, Net::HTTPClientError.new('1.1', '401', 'Inaccessible')]
    end

    check_for_facebook_redirection(location, limit - 1)
  else
    # Relative redirect: resolve it against the host that was just requested.
    check_for_facebook_redirection("http://#{url.host}#{location}", limit - 1)
  end
end
downcase_domain(url) click to toggle source
# File lib/url_scrubber.rb, line 183
# Lower-cases the "http://host" portion of +url+, leaving the rest as-is.
#
# Returns a new String made of the lower-cased first "http://<host>" match
# followed by whatever came after it (text before the match is not kept), or
# +url+ itself when no such match exists.
def self.downcase_domain(url)
  host_part = url.match(%r{http://[^/]+}i)
  return url unless host_part

  "#{host_part[0].downcase}#{host_part.post_match}"
end
drop_anchor!(url) click to toggle source
# File lib/url_scrubber.rb, line 242
# Removes everything from the first "#" to the end of that line from +url+,
# in place. Returns the same (mutated) String object.
def self.drop_anchor!(url)
  url.tap { |target| target.sub!(/#.*$/, '') }
end
drop_url_ampersand!(url) click to toggle source
# File lib/url_scrubber.rb, line 230
# Removes everything from the first "&" to the end of that line from +url+,
# in place. Returns the same (mutated) String object.
def self.drop_url_ampersand!(url)
  url.tap { |target| target.sub!(/\&.*$/, '') }
end
drop_url_query!(url) click to toggle source
# File lib/url_scrubber.rb, line 236
# Removes everything from the first "?" to the end of that line from +url+,
# in place. Returns the same (mutated) String object.
def self.drop_url_query!(url)
  url.tap { |target| target.sub!(/\?.*$/, '') }
end
remove_html_tags!(url) click to toggle source
# File lib/url_scrubber.rb, line 224
# Deletes every "<...>" / "</...>" tag from +url+, in place.
# Returns the same (mutated) String object.
def self.remove_html_tags!(url)
  url.tap { |target| target.gsub!(/<\/?[^>]+>/, '') }
end
remove_subdomain!(url) click to toggle source
# File lib/url_scrubber.rb, line 213
# Strips well-known presentation subdomains (www, m, mobile, touch, and
# mbasic.facebook.com) from the front of +url+, in place, rewriting the scheme
# to "http://" for each rule that matches. Rules are applied in order, each at
# most once. Returns the same (mutated) String object.
def self.remove_subdomain!(url)
  rewrite_rules = [
    [%r{^https?://www?w?\d*\.}i, 'http://'],
    [%r{^https?://m\d*\.}i, 'http://'],
    [%r{^https?://mobile\d*\.}i, 'http://'],
    [%r{^https?://touch\d*\.}i, 'http://'],
    [%r{^https?://mbasic\.facebook\.com}i, 'http://facebook.com']
  ]
  rewrite_rules.each { |pattern, replacement| url.sub!(pattern, replacement) }
  url
end
sc_facebook(url) click to toggle source

TODO This needs to be rewritten to be independent of the Facebook domain and public suffix used: e.g. facebook.com vs fb.com vs. fb.me

# File lib/url_scrubber.rb, line 294
# Applies the Facebook-specific normalization rules to +url+.
#
# The method works on copies (gsub / sub / reassignment), except that some
# branches pass the current string to drop_url_query!, which edits it in place.
# Steps, in order:
# 1. strip "_rdr=" redirect parameters;
# 2. drop a trailing "/posts/<id>" part;
# 3. run the URL through the branch cascade below (first match wins);
# 4. strip a re-introduced "https"/"www." prefix and any "?_rdr..." suffix.
#
# The regex2 and "profile.php?id=" branches call check_for_facebook_redirection,
# which performs a live HTTP request.
#
# Returns the normalized URL String.
def self.sc_facebook(url)

  # Remove "_rdr=...&" or a trailing "&_rdr=..." parameter.
  url = url.gsub(/(_rdr=.+&)|(&_rdr=.+$)/,"")

  # regex1: path (optionally under group(s)/page(s)/pg) that ends in "/<digits>" or "-<digits>".
  regex1  = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/(((?<group>groups?)|pages?|pg)\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
  # regex2: "profile.php?id=<digits>".
  regex2  = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
  # regex3: a single name segment (optionally under group(s)/page(s)/pg), optionally followed by a known sub page.
  regex3  = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/(((?<group>groups?)|pages?|pg)\/)*(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
  # regex4: legacy "home.php?#!/<name>" form.
  regex4  = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
  # regex5: bare numeric id directly after the host.
  regex5  = /^(?<url>(https?:\/\/)((business|www)\.)?facebook\.com\/(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
  # regex6: "home/accounts?business_id=<digits>".
  regex6  = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/home\/accounts\?business_id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i

  # If the user gives us a path to a Post, "http://facebook.com/LoansByJanet/posts/1691075027771418"
  # then drop the post part, "/posts/1691075027771418" to get the base url, "http://facebook.com/LoansByJanet/"
  if mdata = /^(?<base_url>.+)\/posts\/(?<postid>[0-9]+).*$/.match(url)
    url = mdata[:base_url]
  end

  # NOTE: uname, uid and http_response are assigned in several branches below but are
  # never read again inside this method.
  # NOTE: the regex4/regex5/regex6 branches are indented one level deeper than their
  # siblings, yet they belong to the same if/elsif chain.
  if url.match("/media/albums") || url.match("/media/set")
    # Album/set URLs: keep everything up to (not including) the first "&".
    url = url.match('\&') ? url.split('&',2)[0] : url
  elsif mdata = url.match(regex1)
    # "http://facebook.com/pages/Command-Canada/1434248516885065/timeline"
    url = mdata[:url]
    uname = mdata[:uname]
    uid = mdata[:uid]
  elsif mdata = url.match(regex2)
    # "https://www.facebook.com/profile.php?id=100009574328879"
    url, http_response = check_for_facebook_redirection(mdata[:url])
    uid = mdata[:uid]
  elsif mdata = url.match(regex4)
    # "http://facebook.com/home.php?#!/person.name"
      url = mdata[:url] + mdata[:uname]
      url = drop_url_query!(url)
    elsif mdata = url.match(regex5)
      # "https://www.facebook.com/100009574328879"
      url = "http://facebook.com/" + mdata[:uid]
      uid = mdata[:uid]
    elsif mdata = url.match(regex6)
      # "http://business.facebook.com/home/accounts?business_id=1145724702268347"
      url = mdata[:url]
      uid = mdata[:uid]
  elsif mdata = url.match(regex3)
    # "http://facebook.com/TonyMollHomeLoans/timeline"
    # "http://facebook.com/pg/TonyMollHomeLoans/timeline"
    # "https://www.facebook.com/groups/practicewithclaritygroup"
    # Only rebuild the URL when the captured name is not itself one of the section keywords.
    if ["group", "groups", "page", "pages", "pg"].exclude?(mdata[:uname])
      url = (mdata[:group] ? "http://facebook.com/groups/" : "http://facebook.com/") + mdata[:uname]
      uname = mdata[:uname]
    end
    url = drop_url_query!(url)
  elsif url.include?("facebook.com/profile.php?id=")
    # puts "profile.php"
    # these were being truncated, they do redirect, but typically a 301 response is generated
    # so the url is returned unchanged.  Better than truncation.
    url, http_response = check_for_facebook_redirection(url)
  else
    # puts "else"
    url = drop_url_query!(url)
  end

  # Due to the redirection check, "https" and "www." can be re-introduced
  url = url.sub(%r{^https?://www.}i, 'http://')
  url = url.sub(/\?_rdr.*/, '')
  url
end
sc_flickr(url) click to toggle source
# File lib/url_scrubber.rb, line 400
# Applies the Flickr-specific normalization rules to +url+.
#
# Group URLs ("flickr.com/groups/<name>/...") are cut back to
# "http://flickr.com/groups/<name>"; a group URL with no sub page is returned
# unchanged. Any other URL is reduced to "http://flickr.com/<user>", skipping a
# leading "photos/" or "people/" segment. Input that matches neither shape is
# returned as-is.
def self.sc_flickr(url)
  groups_marker = 'flickr.com/groups/'

  if url.include?(groups_marker)
    group_path = url.partition(groups_marker).last
    unless group_path == ""
      group_name, slash, _sub_page = group_path.partition('/')
      # A slash after the group name means there is sub page stuff to trim off.
      return slash == "" ? url : "http://flickr.com/groups/" + group_name
    end
  end

  user_name = url[%r{flickr\.com/(photos/|people/)?([^/]+)}, 2]
  user_name ? "http://flickr.com/#{user_name}" : url
end
sc_generic(url) click to toggle source
# File lib/url_scrubber.rb, line 432
# Fallback rule for services without a dedicated handler: drops the query
# string from +url+ in place and returns the same String object.
def self.sc_generic(url)
  url.tap { |target| drop_url_query!(target) }
end
sc_google_plus(url) click to toggle source
# File lib/url_scrubber.rb, line 384
# Applies the Google+ specific normalization rules to +url+.
#
# First removes, in place and at most once each, the "u/0/b/", "u/0/" and "b/"
# path prefixes and the "/photos", "/of" and "/albums" fragments. Then reduces
# the URL to "http://plus.google.com/<id>" or, for community pages, to
# "http://plus.google.com/communities/<id>". When the expected shape is not
# found, the (already edited) +url+ is returned.
def self.sc_google_plus(url)
  [
    ['com/u/0/b/', 'com/'],
    ['com/u/0/', 'com/'],
    ['com/b/', 'com/'],
    ['/photos', ''],
    ['/of', ''],
    ['/albums', '']
  ].each { |fragment, replacement| url.sub!(fragment, replacement) }

  if url.include?('plus.google.com/communities/')
    community_id = url[/^http:\/\/plus\.google\.com\/communities\/([^\/]+)/, 1]
    community_id ? "http://plus.google.com/communities/#{community_id}" : url
  else
    profile_id = url[/^http:\/\/plus\.google\.com\/([^\/]+)/, 1]
    profile_id ? "http://plus.google.com/#{profile_id}" : url
  end
end
sc_linkedin(url) click to toggle source

TODO This needs to be rewritten to be independent of the LinkedIn domain and public suffix used: e.g. linkedin.com vs lnkd.in vs linkedin.ca

# File lib/url_scrubber.rb, line 361
# Applies the LinkedIn-specific normalization rules to +url+.
#
# "companies/" is rewritten to "company/" in place. Company, /in/, /pub/ and
# "groups/<slug>" URLs then lose their query string; "groups?gid=" URLs lose
# everything from the first "&"; the "groups?home=&gid=" and
# "groups?homeNewMember=&gid=" variants are rebuilt as
# "http://linkedin.com/groups?gid=<id>". Anything else is returned unchanged.
def self.sc_linkedin(url)
  url.sub!('linkedin.com/companies/', 'linkedin.com/company/')

  path_patterns = [%r{com/company/}, %r{com/in/}, %r{com/pub/}]
  legacy_group_markers = [
    'linkedin.com/groups?home=&gid=',
    'linkedin.com/groups?homeNewMember=&gid='
  ]

  if path_patterns.any? { |pattern| url.match(pattern) } || url.include?('linkedin.com/groups/')
    drop_url_query!(url)
  elsif url.include?('linkedin.com/groups?gid=')
    drop_url_ampersand!(url)
  elsif (marker = legacy_group_markers.find { |candidate| url.include?(candidate) })
    group_id = drop_url_ampersand!(url.partition(marker)[2])
    url = "http://linkedin.com/groups?gid=" + group_id
  end

  url
end
sc_pinterest(url) click to toggle source
# File lib/url_scrubber.rb, line 420
# Pinterest rule: drops the query string from +url+ in place and returns the
# same String object.
def self.sc_pinterest(url)
  url.tap { |target| drop_url_query!(target) }
end
sc_twitter(url) click to toggle source
# File lib/url_scrubber.rb, line 274
# Applies the Twitter-specific normalization rules to +url+.
#
# Removes a leading "@" from the handle (in place), reduces
# "<handle>/statuses/<id>" URLs to the handle's profile, turns "@handle" search
# URLs into the handle's profile, and finally drops any query string.
#
# Returns the normalized URL String.
def self.sc_twitter(url)
  url.sub!('twitter.com/@', 'twitter.com/')

  # "twitter.com/<handle>/statuses/<id>" -> "twitter.com/<handle>"
  status_owner = url[%r{(twitter\.com/[^/]+)/statuses/\d+}, 1]
  url = "http://#{status_owner}" if status_owner

  # Searches for "@handle" (plain or "%40"-encoded) point at that handle's profile.
  searched_handle = url[%r{twitter\.com/search(?:/realtime)?(?:/|\?q=)(?:@|%40)(\S*)$}, 1]
  url = "http://twitter.com/#{searched_handle}" if searched_handle

  drop_url_query!(url)
end
sc_vimeo(url) click to toggle source
# File lib/url_scrubber.rb, line 259
# Applies the Vimeo-specific normalization rules to +url+.
#
# Group URLs with a sub page ("vimeo.com/groups/<name>/...") are cut back to
# "http://vimeo.com/groups/<name>"; every other URL is returned unchanged.
def self.sc_vimeo(url)
  groups_marker = 'vimeo.com/groups/'
  return url unless url.include?(groups_marker)

  group_name, slash, _sub_page = url.partition(groups_marker).last.partition('/')
  # Without a slash after the group name there is no sub page stuff to trim off.
  slash == "" ? url : "http://vimeo.com/groups/" + group_name
end
sc_yelp(url) click to toggle source
# File lib/url_scrubber.rb, line 426
# Yelp rule: drops the query string from +url+ in place and returns the same
# String object.
def self.sc_yelp(url)
  url.tap { |target| drop_url_query!(target) }
end
sc_youtube(url) click to toggle source
# File lib/url_scrubber.rb, line 249
# Applies the YouTube-specific normalization rules to +url+, in place:
# rewrites "profile?user=<name>" to "<name>" and drops any query string.
# Returns the same (mutated) String object.
#
# The "/user/" form of the URL is deliberately left alone: YouTube lets users
# have their own URL which is not a separate channel with its own customUrl.
def self.sc_youtube(url)
  url.tap do |target|
    target.sub!('youtube.com/profile?user=', 'youtube.com/')
    drop_url_query!(target)
  end
end
special_cases(url) click to toggle source
# File lib/url_scrubber.rb, line 193
# Routes +url+ to the normalization rule for the service it belongs to.
#
# Returns whatever the service-specific handler returns. For services without
# a dedicated handler, sc_generic is applied and +url+ itself is returned.
def self.special_cases(url)
  handlers = {
    youtube:   :sc_youtube,
    twitter:   :sc_twitter,
    facebook:  :sc_facebook,
    linkedin:  :sc_linkedin,
    google:    :sc_google_plus,
    flickr:    :sc_flickr,
    pinterest: :sc_pinterest,
    vimeo:     :sc_vimeo,
    yelp:      :sc_yelp
  }

  handler = handlers[service_of(url)]
  # send (not public_send) so the handlers can stay private class methods.
  return send(handler, url) if handler

  sc_generic(url)
  url
end