class HackerCurse::RedditNewsParser
Public Class Methods
new(config={})
click to toggle source
Calls superclass method
HackerCurse::AbstractSiteParser::new
# File lib/hacker/curse/redditnewsparser.rb, line 6
#
# Build a reddit parser. +config+ may carry :host, :subforum and :url;
# when :url is absent it defaults to the subforum's mobile page.
def initialize config={}
  @host = config[:host] || "https://www.reddit.com"
  @subforum = config[:subforum] || "unknown"
  config[:url] ||= "#{@host}/r/#{@subforum}/.mobile"
  super config
end
Public Instance Methods
_retrieve_comments(url)
click to toggle source
reddit @return a ForumArticle object wrapping the parsed comments
For each, you may retrieve +hash+ or individual items such as comment_text, points, age, age_text, submitter, head
# File lib/hacker/curse/redditnewsparser.rb, line 26
#
# Fetch and parse the comments page at +url+.
# Returns the object built by hash_to_comment_class from the parsed hash;
# individual items such as comment_text, points, age, age_text, submitter
# and head can be retrieved from it.
def _retrieve_comments url
  hash_to_comment_class(to_hash_comment(url))
end
_retrieve_page(url)
click to toggle source
# File lib/hacker/curse/redditnewsparser.rb, line 14
#
# Fetch the listing page at +url+ (must be a String) and convert it into
# a ForumPage via hash_to_class. Returns nil when the fetch failed and
# to_hash caught the exception.
def _retrieve_page url
  $stderr.puts "_retrieve_page got url #{url} "
  raise "url should be string" unless url.is_a? String
  parsed = to_hash url
  # to_hash returns nil when fetching the document raised and was caught
  return nil unless parsed
  #to_yml "#{@subforum}OLD.yml", parsed
  hash_to_class parsed
end
hash_to_class(h)
click to toggle source
# File lib/hacker/curse/redditnewsparser.rb, line 122
#
# Convert the hash produced by to_hash into a ForumPage whose :articles
# entries become ForumArticle objects parented to this parser.
def hash_to_class h
  page = ForumPage.new
  page.url         = h[:url]
  page.next_url    = h[:next_url]
  page.create_date = h[:create_date]
  page.subforum    = h[:subforum]
  #page.create_date_seconds = h[:create_date_seconds]
  page.articles = h[:articles].map do |row|
    article = ForumArticle.new row
    article.parent = self
    article
  end
  page
end
hash_to_comment_class(arr)
click to toggle source
# File lib/hacker/curse/redditnewsparser.rb, line 227
#
# Wrap the parsed comments hash in a single ForumArticle, which keeps the
# article header fields together with the nested comments.
def hash_to_comment_class arr
  ForumArticle.new arr
end
old_hash_to_comment_class(arr)
click to toggle source
This returns an array of ForumComment objects, but that means the article title
etc. is not there; if the output is saved, then that information may be required.
# File lib/hacker/curse/redditnewsparser.rb, line 233
#
# Older conversion: returns only an array of ForumComment objects, so the
# article title and other header fields are lost — a problem when the
# output is saved and that information is needed later.
def old_hash_to_comment_class arr
  arr[:comments].map { |h| ForumComment.new h }
end
to_hash(url)
click to toggle source
reddit parse to hash containing :url, :next_url and :articles (an array of hashes for each article)
# File lib/hacker/curse/redditnewsparser.rb, line 32
#
# reddit: parse the listing page at +url+ into a hash containing :url,
# :next_url, :create_date, :create_date_seconds, :subforum and :articles
# (an array of hashes, one per article row).
# Returns nil when get_doc_for_url could not fetch the document.
def to_hash url
  page = {}
  arr = Array.new
  doc = get_doc_for_url url
  return nil unless doc # exception was caught
  page[:url] = url
  now = Time.now
  page[:create_date_seconds] = now.to_i
  page[:create_date] = now
  page[:subforum] = @subforum
  #filename = "r.#{subr}.yml"
  # each "li div.link" is one article row in reddit's mobile markup
  links = doc.css("li div.link")
  links.each do |li|
    h = {}
    e = li.css("a.title")
    if !e.empty?
      e = e.first
      h[:title] = e.text
      h[:article_url] = e["href"]
    end
    e = li.css("a.domain")
    if !e.empty?
      e = e.first
      h[:domain] = e.text
      h[:domain_url] = e["href"]
    end
    e = li.css("a.author")
    if !e.empty?
      e = e.first
      h[:submitter] = e.text
      h[:submitter_url] = e["href"]
    end
    e = li.css("span.buttons > a")
    if !e.empty?
      e = e.first
      #h[:comment_count] = e.text.to_i
      # counts are right-justified to width 4 for aligned display
      h[:comment_count] = e.text.to_i.to_s.rjust(4)
      h[:comments_url] = e["href"]
    else
      # no comments link at all; placeholder keeps display alignment
      h[:comment_count] = " 0"
      h[:comments_url] = ""
    end
    byline = li.css("p.byline").text
    h[:byline] = byline
    # 2014-08-14 - 13:34 in some cases the byline just says "17 minutes ago" with no BAR or "by"
    # In one case in 'science' the name itself had BARs to the parse failed
    # In another case there was no comments, so parts[2] was nil !!
    # So rather than index into fixed positions, scan each |-separated part
    # for a keyword that identifies it.
    parts = byline.split("|")
    age = points = nil
    parts.each do |ppp|
      if ppp.index("points")
        points = ppp.strip
      elsif ppp.index("comments")
        # we've taken it already
      elsif ppp.index(" ago ")
        age = ppp.split("by").first.strip
      end
    end
    #age = parts.last.split("by").first.strip
    #age = parts[2].split("by").first.strip
    if age
      # fail loudly if the age text doesn't contain "<digits> <word>" —
      # indicates the byline format changed
      if age.scan(/\d+ \w/).first.nil?
        raise "Nil in age: #{age} , parts = #{parts}"
      end
    end
    h[:age_text]= age.scan(/\d+ \w/).first.rjust(4) if age
    #h[:age_text]= age
    h[:age] = human_age_to_unix(age) if age
    #h[:points]= points.to_i
    h[:points]= points.to_i.to_s.rjust(4)
    #puts points
    #puts age
    arr << h
  end
  # some cases like rising do not have next prev
  #next_prev_url= doc.css("p.nextprev").first.css("a").first["href"]
  next_prev_url= doc.css("p.nextprev").first
  if next_prev_url #&& !next_prev_url.empty?
    next_prev_url = next_prev_url.css("a").first["href"]
    page[:next_url] = next_prev_url
  end
  page[:articles] = arr
  #arr << { :next_prev_url => next_prev_url }
  #@more_url = next_prev_url
  return page
end
to_hash_comment(url)
click to toggle source
Returns a hash whose :comments key holds an array of hashes containing comment details.
# File lib/hacker/curse/redditnewsparser.rb, line 153
#
# Fetch the reddit comments page at +url+ and parse it into a hash.
# +url+ may also be a path to a saved HTML file (used in testing so we
# don't keep hitting the site); a bare path with no "http" is prefixed
# with @host. The returned hash carries the article header fields
# (:title, :article_url, :comment_count, :comments_url, :submitter,
# :submitter_url, :domain, :domain_url, :byline, :points, :age_text,
# :main_text) plus :comments, an array of hashes with per-comment details.
def to_hash_comment url
  # for testing i may send in a saved file, so i don't keep hitting HN
  # FIX: File.exists? was deprecated and removed in Ruby 3.2; use File.exist?
  if !File.exist? url
    unless url.index("http")
      url = @host + "/" + url
    end
  end
  # comments are nested and there is a div for that,
  # Also blockquotes for when commenter quotes another.
  # NOTE(review): Kernel#open on a URL requires open-uri; presumably
  # required elsewhere in this file — confirm.
  doc = Nokogiri::HTML(open(url))
  h = {}
  main = doc.css("li div.link")
  h[:main_text] = main.text
  # break the header line into its links; reddit's mobile markup orders
  # them: title, comment count, submitter, domain
  main.css("a").each_with_index do |l, i|
    case i
    when 0
      h[:title] = l.text
      h[:article_url] = l["href"]
    when 1
      h[:comment_count] = l.text
      h[:comments_url] = l["href"]
    when 2
      h[:submitter] = l.text
      h[:submitter_url] = l["href"]
    when 3
      h[:domain] = l.text
      h[:domain_url] = l["href"]
    end
  end
  byline = main.css("p.byline").text
  h[:byline] = byline
  h[:points] = byline.scan(/\d+ point/).first
  h[:age_text] = byline.scan(/\d+ \w+ ago/).first
  arr = []
  comments = doc.css("li div.comment")
  comments.each_with_index do |co, ix|
    hh = {}
    arr << hh
    hh[:comment_text] = co.css("div.md").text
    byline = co.css("p.byline")
    bytext = byline.text
    hh[:head] = bytext
    m = bytext.scan(/\d+ \w+ ago/)
    # FIX: guard against a byline with no age text — m.first would be nil
    # and the sub call raised NoMethodError
    if m.first
      hh[:age_text] = m.first.sub(/ago/,"")
      hh[:age] = human_age_to_unix(m.first)
    end
    link = byline.css("a").first
    if link
      hh[:submitter] = link.text
      hh[:submitter_url] = link["href"]
    end
    # best-effort: default to "" when the score span is missing
    points = byline.css("span.score").text rescue ""
    hh[:points] = points.sub(/points?/,"")
  end
  h[:comments] = arr
  return h
end