module PostRank::URI
Constants
- C14N
- URIREGEX
- VERSION
Public Instance Methods
c14n(uri, opts = {})
click to toggle source
# File lib/postrank-uri.rb, line 163
# Canonicalizes a URI: resolves embedded redirect targets, strips the query
# parameters listed in C14N, and applies the twitter.com / tumblr.com path
# rewrites below.
#
# @param uri  the URI to canonicalize (handed to #parse)
# @param opts [Hash] options forwarded to #parse
# @return the parsed URI object, modified in place
def c14n(uri, opts = {})
  u = embedded(parse(uri, opts))

  pairs = u.query_values(Array)
  if pairs
    # Drop parameters blacklisted globally or for a matching host pattern.
    pairs.delete_if do |key, _value|
      C14N[:global].include?(key) ||
        C14N[:hosts].any? { |host_re, params| u.host =~ host_re && params.include?(key) }
    end
  end
  # Assigned even when nil so the query is rewritten exactly as before.
  u.query_values = pairs

  if u.host =~ /^(mobile\.)?twitter\.com$/ && u.fragment
    hashbang = u.fragment.match(/^!(.*)/)
    if hashbang
      # Promote the "#!/path" fragment to the real path.
      u.fragment = nil
      u.path = hashbang[1]
    end
  end

  if u.host =~ /tumblr\.com$/ && u.path =~ /\/post\/\d+\//
    # Remove the trailing segment that follows the numeric post id.
    u.path = u.path.gsub(/[^\/]+$/, '')
  end

  u
end
clean(uri, opts = {})
click to toggle source
# File lib/postrank-uri.rb, line 145
# Runs the full cleanup pipeline on a URI: #unescape, then #c14n, then
# #normalize.
#
# @param uri  the URI to clean
# @param opts [Hash] forwarded to #c14n; a truthy :raw returns the URI
#   object produced by #normalize instead of its string form
# @return the normalized URI object when opts[:raw] is truthy, otherwise
#   the result of calling #to_s on it
def clean(uri, opts = {})
  unescaped = unescape(uri)
  canonical = c14n(unescaped, opts)
  cleaned = normalize(canonical)

  return cleaned if opts[:raw]

  cleaned.to_s
end
embedded(uri)
click to toggle source
# File lib/postrank-uri.rb, line 185
# Resolves redirect-style URLs that carry their real destination in a query
# parameter (news.google.com/news/url and xfruits.com use "url";
# myspace.com PostTo links use "u").
#
# @param uri  a URI object responding to #host, #path and #query_values
# @return the cleaned embedded URI (via #clean with :raw => true) when one
#   is present, otherwise +uri+ unchanged
def embedded(uri)
  param =
    if (uri.host == 'news.google.com' && uri.path == '/news/url') ||
       uri.host == 'xfruits.com'
      'url'
    elsif uri.host =~ /myspace\.com/ && uri.path =~ /PostTo/
      'u'
    end
  return uri unless param

  # query_values is nil when the URI has no query string at all; treat that
  # as "no embedded target" instead of raising NoMethodError on nil.
  target = (uri.query_values || {})[param]
  target ? clean(target, :raw => true) : uri
end
escape(uri)
click to toggle source
# File lib/postrank-uri.rb, line 127
# Percent-encodes every run of characters matched by URIREGEX[:escape]
# (using its first capture group) and then encodes remaining spaces as %20.
#
# @param uri [String] the text to escape
# @return [String] the escaped text, with upper-case hex digits
def escape(uri)
  encoded = uri.gsub(URIREGEX[:escape]) do
    run = Regexp.last_match(1)
    # One 'H2' directive per *byte*: String#size counts characters, which
    # would truncate multibyte characters and drop the tail of the run.
    '%' + run.unpack('H2' * run.bytesize).join('%').upcase
  end
  encoded.gsub(' ', '%20')
end
extract(text)
click to toggle source
# File lib/postrank-uri.rb, line 97
# Scans free text for URLs matching URIREGEX[:valid_url] and returns the
# cleaned form of each one whose domain PublicSuffix accepts.
#
# @param text  the text to scan; converted with #to_s
# @return [Array] cleaned URL strings, or [] when +text+ is nil or false
def extract(text)
  return [] unless text

  found = []
  text.to_s.scan(URIREGEX[:valid_url]) do |_all, _before, url, _protocol, domain, _path, _query|
    # Only keep the URL if the domain is valid
    next unless PublicSuffix.valid?(domain, default_rule: nil)

    found << clean(url).to_s
  end
  found.compact
end
extract_href(text, host = nil)
click to toggle source
# File lib/postrank-uri.rb, line 111
# Collects the absolute link targets of every <a> element in an HTML
# fragment, paired with the anchor text.
#
# @param text  HTML to parse with Nokogiri
# @param host  passed to #clean as the :host option
# @return [Array<Array>] [url_string, anchor_text] pairs
def extract_href(text, host = nil)
  Nokogiri.HTML(text).search('a').each_with_object([]) do |anchor, links|
    begin
      target = clean(anchor.attr('href'), :raw => true, :host => host)
      links << [target.to_s, anchor.text] if target.absolute?
    rescue
      # Best effort: any anchor that raises while being cleaned is skipped.
      next
    end
  end
end
hash(uri, opts = {})
click to toggle source
# File lib/postrank-uri.rb, line 150
# Returns the MD5 hex digest of a URI.
#
# @param uri  the value to digest
# @param opts [Hash] when opts[:clean] is exactly +true+ the URI is passed
#   through #clean first; any other value digests +uri+ as given
# @return [String] hexadecimal MD5 digest
def hash(uri, opts = {})
  subject =
    if opts[:clean] == true
      clean(uri)
    else
      uri
    end
  Digest::MD5.hexdigest(subject)
end
normalize(uri, opts = {})
click to toggle source
# File lib/postrank-uri.rb, line 154
# Normalizes a URI: replaces matches of
# URIREGEX[:double_slash_outside_scheme] in the path with a single slash,
# removes a trailing slash unless the path is one character long, drops an
# empty query and discards the fragment.
#
# @param uri  the URI to normalize (handed to #parse)
# @param opts [Hash] options forwarded to #parse
# @return the parsed URI object, modified in place
def normalize(uri, opts = {})
  parsed = parse(uri, opts)

  parsed.path = parsed.path.gsub(URIREGEX[:double_slash_outside_scheme], '/')
  parsed.path = parsed.path.chomp('/') unless parsed.path.size == 1

  query = parsed.query
  parsed.query = nil if query && query.empty?
  parsed.fragment = nil

  parsed
end
parse(uri, opts = {})
click to toggle source
# File lib/postrank-uri.rb, line 198
# Parses a string into an Addressable::URI, trying to recover a host for
# scheme-less or host-less input.
#
# @param uri  a string to parse, or an Addressable::URI (returned as-is)
# @param opts [Hash] opts[:host] supplies the host for host-less input
# @return [Addressable::URI] the result of #normalize! on the parsed URI
def parse(uri, opts = {})
  return uri if uri.is_a? Addressable::URI

  uri = Addressable::URI.parse(uri)

  # The alternation is grouped and anchored with \A...\z so that only the
  # exact schemes javascript, mailto and xmpp are exempt from host recovery.
  # Ungrouped, the anchors bound to the first and last alternatives only, so
  # any scheme merely containing "mailto" was exempt too.
  if !uri.host && uri.scheme !~ /\A(?:javascript|mailto|xmpp)\z/
    if uri.scheme
      # A scheme but no host: presumably the leading "host:" was read as a
      # scheme, so retry with an explicit http:// prefix.
      return parse("http://#{uri}", opts)
    end

    if opts[:host]
      uri.host = opts[:host]
    else
      # No scheme and no host: promote the first path segment to the host
      # when it looks like a domain.
      parts = uri.path.to_s.split(/[\/:]/)
      if parts.first =~ URIREGEX[:valid_domain]
        host = parts.shift
        uri.path = '/' + parts.join('/')
        uri.host = host
      end
    end
  end

  uri.scheme = 'http' if uri.host && !uri.scheme
  uri.normalize!
end
unescape(uri)
click to toggle source
# File lib/postrank-uri.rb, line 133
# Decodes percent-escapes in a URI, leaving alone any escape that matches
# URIREGEX[:reserved_characters]. Plus signs in the query are turned into
# spaces first.
#
# @param uri  the URI to unescape (handed to #parse)
# @return [String] the unescaped URI string
def unescape(uri)
  parsed = parse(uri)

  query = parsed.query
  parsed.query = query.tr('+', ' ') if query

  parsed.to_s.gsub(URIREGEX[:unescape]) do |token|
    if token.match(URIREGEX[:reserved_characters])
      # Reserved characters stay percent-encoded.
      token
    else
      # Strip the '%' signs and turn the remaining hex digits into bytes.
      [token.delete('%')].pack('H*')
    end
  end
end
valid?(uri)
click to toggle source
# File lib/postrank-uri.rb, line 225
# Reports whether a URI is non-nil, yields a host after #clean, and that
# host (converted to Unicode via Addressable::IDNA) is accepted by
# PublicSuffix.
#
# @param uri  the URI to check
# @return false when +uri+ is nil or has no host after cleaning, otherwise
#   the result of PublicSuffix.valid?
def valid?(uri)
  return false if uri.nil?

  host = clean(uri, :raw => true).host
  return false unless host

  PublicSuffix.valid?(Addressable::IDNA.to_unicode(host), default_rule: nil)
end