class HackerCurse::AbstractSiteParser
rn = RNParser.new [url]
rn.subreddit = "ruby"
resultset = rn.get_next_page :page => prevresultset, :number => 5
resultset.each do |art|
  art.title, art.points, art.comments
end
hn = HNewsParser @options
hn.subxxx = "news" / "newest"
redditnews.rb -s ruby --pages 2
hackernews.rb -s newest --pages 2 -d '|'
Attributes
should the html be saved
Public Class Methods
# Default host for the Hacker News site. The original used typographic
# (smart) quotes, which are not valid Ruby string delimiters and break the
# parse; replaced with straight quotes and frozen since it is a constant.
HOST = "news.ycombinator.com".freeze
# Sets up parser state from an options hash.
#
# Recognized keys:
#   :url         - starting URL for the forum/subforum
#   :save_html   - when truthy, fetched HTML is also written to disk
#   :htmloutfile - file name to save HTML under (optional)
#   :num_pages   - default number of pages fetched per call (defaults to 1)
def initialize options={}
  @options     = options
  @url         = options[:url]
  @save_html   = options[:save_html]
  @htmloutfile = options[:htmloutfile]
  @num_pages   = options[:num_pages] || 1
  # cursor for get_next_page; populated after the first fetch
  @more_url    = nil
end
Public Instance Methods
# Abstract hook: fetch and parse the comment thread at +url+.
# Concrete site parsers (HN, reddit) must override this.
# @raise [RuntimeError] always, in this abstract base
def _retrieve_comments url
  raise "Must be implemented by concrete class "
end
# Abstract hook: fetch and parse one listing page at +url+.
# Concrete site parsers (HN, reddit) must override this.
# @raise [RuntimeError] always, in this abstract base
def _retrieve_page url
  raise "must be implemented by concrete class"
end
# Fetches the parsed comments for the article at +index+ in the
# current article list.
# Returns an empty array when the entry has no comments URL.
def get_comments index
  url = get_comments_url(index)
  return [] unless url
  convert_comment_url url
end
# Looks up the comments URL for the article at +index+ in @arr.
# Returns nil when the index is out of range or the entry carries
# no :comments_url key.
def get_comments_url index
  entry = @arr[index]
  entry && entry.key?(:comments_url) ? entry[:comments_url] : nil
end
Returns a Nokogiri HTML doc, and writes the HTML to disk if required. Returns nil on HTTPError.
# Fetches +url+ and returns it parsed as a Nokogiri HTML document.
# On any fetch error the process exits with status 1 (the caller is
# expected to be a shelled-out client). When @save_html is set, the raw
# HTML is also written to @htmloutfile (default "<subforum>.html").
# NOTE(review): this relies on Kernel#open / open-uri; Kernel#open will
# also execute pipe commands for strings starting with "|" -- confirm
# callers never pass untrusted URLs, or switch to URI.open.
def get_doc_for_url url
  $stderr.puts "get_doc #{url} "
  doc = nil
  # 2016-03-20 - added check since sometimes server error was coming
  begin
    out = open(url)
  rescue StandardError => e
    $stderr.puts "\tError: #{e}"
    # 2016-03-20 - adding exit since it will go to client that shelled this command.
    exit 1
  else
    doc = Nokogiri::HTML(out)
    if @save_html
      fname = @htmloutfile || "#{@subforum || 'unknown'}.html"
      out.rewind
      File.open(fname, 'w') { |f| f.write(out.read) }
    end
  end
  doc
end
# Retrieves the first listing page for @url.
# 2016-03-20 - 23:45 the result can be nil if an HTTPError occurred.
def get_first_page
  _retrieve_page @url
end
# Fetches the next batch of listing pages, merging them into one page object.
#
# Options:
#   :page      - a previously returned page; continue from its next_url
#   :num_pages - how many pages to fetch (defaults to @num_pages, then 1)
#
# Remembers the last "more" URL in @more_url so a subsequent call with no
# :page continues where this one stopped. Returns the merged page, or nil
# if nothing could be fetched.
def get_next_page opts={}
  page      = opts[:page]
  num_pages = opts[:num_pages] || @num_pages || 1
  u = @more_url || @url
  u = page.next_url if page
  pages = nil
  num_pages.times do
    page = _retrieve_page u
    # FIX: _retrieve_page returns nil on HTTPError; the original then
    # crashed with NoMethodError on page.next_url. Stop and return what
    # we have so far instead.
    break if page.nil?
    if pages
      pages.merge_page page
    else
      pages = page
    end
    u = page.next_url
    break unless u # sometimes there is no next
    @more_url = u
  end
  pages
end
# Converts a human-readable age such as "3 hours ago" or "2 days" into an
# absolute unix timestamp (now minus the age). Returns 0 when no known
# time unit is found in the text.
def human_age_to_unix age_text
  # unit => seconds-per-unit, in the same precedence order as before
  # (so a multi-unit string like "1 hour 30 minutes" still matches "hour")
  unit_seconds = {
    "hour"   => 60 * 60,
    "second" => 1,
    "minute" => 60,
    "day"    => 60 * 60 * 24,
    "month"  => 60 * 60 * 24 * 30,
    "week"   => 60 * 60 * 24 * 7,
    "year"   => 60 * 60 * 24 * 365
  }
  unit = unit_seconds.keys.find { |name| age_text.index(name) }
  return 0 unless unit
  Time.now.to_i - age_text.to_i * unit_seconds[unit]
end
this is a test method so we don't keep hitting HN while testing out and getting IP blocked.
# Test helper: loads a previously saved article list from a YAML file
# instead of hitting the live site (avoids getting IP-blocked while
# testing). Restores @arr and derives @more_url from the last article's
# URL, prefixing @host when the stored URL is relative.
#
# FIX: the original passed an open File object to YAML.load and never
# closed it, leaking the file handle; read the file contents instead.
# NOTE(review): Psych 4 (Ruby 3.1+) makes YAML.load safe by default and
# rejects Symbol keys unless permitted -- confirm against the Ruby
# version this ships with.
def load_from_yml filename="hn.yml"
  @arr = YAML.load(File.read(filename))
  next_url = @arr.last[:article_url]
  next_url = @host + "/" + next_url unless next_url.index("http")
  @more_url = next_url
end
retrieves the comments for a url and stores in outputfile in YML format
# Retrieves the comment thread at +url+ and, if anything came back,
# serializes its hash form to +outputfile+ as YAML.
def save_comments_as_yml outputfile, url
  pages = _retrieve_comments url
  to_yml(outputfile, pages.hash) if pages
end
After calling get_next_page, one may pass its return value to this method to convert it into an array of hashes and store it as a YAML file. It's a bit silly: first we break the hash down into this structure, and then deconstruct the whole thing.
# Flattens +page+ (a result of get_next_page) into a plain hash --
# url, next_url, subforum, create_date plus an :articles array of
# per-article hashes -- and writes it to +outputfile+ as YAML.
def save_page_as_yml outputfile, page
  h = {
    :url         => page.url,
    :next_url    => page.next_url,
    :subforum    => page.subforum,
    :create_date => page.create_date
  }
  articles = []
  page.each { |a| articles << a.hash }
  h[:articles] = articles
  to_yml outputfile, h
end
Write as YAML. This doesn't work for multiple pages, since we call it once per page and each call overwrites the previous file. This should be called by the final (concrete) class.
# Serializes +arr+ (defaulting to the current article list @arr) to
# +outfile+ as YAML, replacing any existing file.
def to_yml outfile, arr = @arr
  require 'yaml' # lazy-loaded: only needed when actually dumping
  # NOTE: cannot just convert "/" to "__" in the filename, since any
  # directory path would get converted too.
  File.open(outfile, 'w') do |f|
    f << YAML.dump(arr)
  end
end