class HackerCurse::AbstractSiteParser

Example usage with a concrete parser:

  rn = RNParser.new :url => url
  rn.subreddit = "ruby"
  resultset = rn.get_next_page :page => prevresultset, :num_pages => 5
  resultset.each do |art|
    puts art.title, art.points
    puts art.comments
  end

Similarly for Hacker News:

  hn = HNewsParser.new @options
  hn.subforum = "news"          # or "newest"

Command-line equivalents:

  redditnews.rb -s ruby --pages 2
  hackernews.rb -s newest --pages 2 -d '|'

Attributes

host[RW]
htmloutfile[RW]
more_url[R]
num_pages[RW]
save_html[RW]

whether the fetched HTML should be saved

subforum[RW]
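
A hedged usage sketch (hn and HNewsParser as in the intro example): the [RW] accessors can be set after construction, while more_url is read-only:

  hn = HNewsParser.new
  hn.num_pages   = 2             # pages fetched per get_next_page call
  hn.save_html   = true          # keep a copy of the fetched HTML
  hn.htmloutfile = "news.html"   # where that copy is written
  hn.subforum    = "newest"      # which listing to scrape
  hn.more_url                    # readable, but set internally while paging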

Public Class Methods

new(options={})

HOST = "news.ycombinator.com"

# File lib/hacker/curse/abstractsiteparser.rb, line 139
def initialize options={}
  @options = options
  @url = @options[:url]
  @save_html = @options[:save_html]
  @htmloutfile = @options[:htmloutfile]
  @num_pages = @options[:num_pages] || 1
  @more_url = nil
  #puts "initialize: url is #{@url} "
end
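
The same settings can be passed as options at construction. A minimal sketch, assuming the HNewsParser subclass from the intro example and an illustrative URL; the option keys are exactly those read in initialize:

  hn = HNewsParser.new :url         => "https://news.ycombinator.com/newest",
                       :num_pages   => 2,
                       :save_html   => true,
                       :htmloutfile => "news.html"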

Public Instance Methods

_retrieve_comments(url)
# File lib/hacker/curse/abstractsiteparser.rb, line 250
def _retrieve_comments url
  raise "Must be implemented by concrete class "
end
_retrieve_page(url)
# File lib/hacker/curse/abstractsiteparser.rb, line 176
def _retrieve_page url
  raise "must be implemented by concrete class"
end
get_comments(index)
# File lib/hacker/curse/abstractsiteparser.rb, line 265
def get_comments index
  url = get_comments_url index
  if url
    #puts url
    comments = convert_comment_url url
    return comments
  #else
    #puts "Sorry no url for #{index} "
  end
  return []
end
Also aliased as: get_comments_for_link
get_comments_url(index)
# File lib/hacker/curse/abstractsiteparser.rb, line 254
def get_comments_url index
  arr = @arr
  entry = arr[index]
  if entry
    if entry.key? :comments_url
      return entry[:comments_url]
    end
  end
  return nil
end
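A hedged sketch of the two lookups together, assuming the concrete class's _retrieve_page has populated the internal article array (@arr):

  hn.get_first_page               # fills the internal article array
  url = hn.get_comments_url 0     # comments URL of the first article, or nil
  comments = hn.get_comments 0    # parsed comments, or [] when there is no URL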
get_doc_for_url(url)

Returns a Nokogiri HTML doc and writes the HTML to a file if required. Returns nil on HTTPError.

# File lib/hacker/curse/abstractsiteparser.rb, line 218
def get_doc_for_url url
  $stderr.puts "get_doc #{url} "
  doc = nil
  # 2016-03-20 - added check since sometimes server error was coming
  begin
    out = open(url)   # Kernel#open fetches URLs once 'open-uri' is required
  rescue StandardError=>e
    $stderr.puts "\tError: #{e}"
    # 2016-03-20 - adding exit since it will go to client that shelled this command.
    exit 1
  else
    doc  = Nokogiri::HTML(out)
    if @save_html
      subforum = @subforum || "unknown"
      outfile = @htmloutfile || "#{subforum}.html"
      #if !File.exists? url
      out.rewind
      File.open(outfile, 'w') {|f| f.write(out.read) }
      #end
    end
  end
  return doc
end
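A usage sketch (hn as in the intro example); with save_html set, the raw HTML is also written alongside the parsed result:

  hn.save_html = true
  doc = hn.get_doc_for_url "https://news.ycombinator.com/newest"
  puts doc.title if doc           # doc is a Nokogiri::HTML::Document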
get_first_page()
# File lib/hacker/curse/abstractsiteparser.rb, line 148
def get_first_page
  #@arr = to_hash @url
  # 2016-03-20 - 23:45 page can be nil if HTTPError
  page = _retrieve_page @url
end
get_next(opts={})
Alias for: get_next_page
get_next_page(opts={})
# File lib/hacker/curse/abstractsiteparser.rb, line 153
def get_next_page opts={}
  page = opts[:page]
  num_pages = opts[:num_pages] || @num_pages
  num_pages ||= 1
  u = @more_url || @url
  if page 
    u = page.next_url
  end
  pages = nil
  num_pages.times do |i|
    page = _retrieve_page u
    if pages.nil?
      pages = page
    else
      pages.merge_page page
    end
    u = page.next_url
    break unless u  # sometimes there is no next
    @more_url = u
  end
  return pages
end
Also aliased as: get_next
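A paging sketch tying get_first_page and get_next_page together; each additional page is merged into the returned result via merge_page:

  page = hn.get_first_page
  more = hn.get_next_page :page => page, :num_pages => 2
  more.each do |art|
    puts art.title
  end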
human_age_to_unix(age_text)
# File lib/hacker/curse/abstractsiteparser.rb, line 277
def human_age_to_unix age_text
  i = age_text.to_i
  ff=1
  if age_text.index("hour")
    i *= ff*60*60
  elsif age_text.index("second")
    i *= ff
  elsif age_text.index("minute")
    i *= ff*60
  elsif age_text.index("day")
    i *= ff*60*60*24
  elsif age_text.index("month")
    i *= ff*60*60*24*30
  elsif age_text.index("week")
    i *= ff*60*60*24*7
  elsif age_text.index("year")
    i *= ff*60*60*24*365
  else
    #raise "don't know how to convert #{age_text} "
    return 0
  end
  return (Time.now.to_i - i)
end
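Worked examples of the conversion; the method subtracts the parsed span from Time.now and returns 0 for units it does not recognize:

  hn.human_age_to_unix "3 hours ago"   # => Time.now.to_i - 3*60*60
  hn.human_age_to_unix "2 days ago"    # => Time.now.to_i - 2*60*60*24
  hn.human_age_to_unix "just now"      # => 0, unit not recognized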
load_from_yml(filename="hn.yml")

This is a test method, so we don't keep hitting HN while testing and getting our IP blocked.

# File lib/hacker/curse/abstractsiteparser.rb, line 242
def load_from_yml filename="hn.yml"
  @arr = YAML::load( File.open( filename ) )
  next_url = @arr.last[:article_url]
  unless next_url.index("http")
    next_url = @host + "/" + next_url
  end
  @more_url = next_url
end
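A test-time sketch: replay a YAML dump written earlier (for example by to_yml) instead of fetching from the live site:

  hn.load_from_yml "hn.yml"       # repopulates @arr and @more_url
  url = hn.get_comments_url 0     # lookups now run against the loaded data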
save_comments_as_yml(outputfile, url)

Retrieves the comments for a URL and stores them in outputfile in YAML format.

# File lib/hacker/curse/abstractsiteparser.rb, line 210
def save_comments_as_yml outputfile, url
  pages = _retrieve_comments url
  if pages 
    to_yml outputfile, pages.hash
  end
end
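A sketch combining this with get_comments_url to dump one article's comment tree:

  url = hn.get_comments_url 0
  hn.save_comments_as_yml "comments.yml", url if url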
save_page_as_yml(outputfile, page)

After calling get_next_page, one may pass its return value to this method to convert it into an array of hashes and store it as a YAML file. It's a bit silly: first we break the hash down into this structure, and then deconstruct the whole thing.
# File lib/hacker/curse/abstractsiteparser.rb, line 196
def save_page_as_yml outputfile, page
  h = {}
  h[:url] = page.url
  h[:next_url] = page.next_url
  h[:subforum] = page.subforum
  h[:create_date] = page.create_date
  articles = []
  page.each do |a| articles << a.hash; end

  h[:articles] = articles

  to_yml outputfile, h
end
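A sketch of the round trip described above:

  page = hn.get_next_page
  hn.save_page_as_yml "page.yml", page if page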
to_yml(outfile, arr = @arr)

Write as YAML. This doesn't work across multiple pages, since we call it once per page and each call overwrites the previous file. This should be called by the final (concrete) class.
# File lib/hacker/curse/abstractsiteparser.rb, line 182
def to_yml outfile, arr = @arr
  require 'yaml'
  # cannot just convert / to __ in filename since path gets converted too
  #if outfile.index("/")
    #outfile = outfile.gsub("/","__")
  #end
  File.open(outfile, 'w' ) do |f|
    f << YAML::dump(arr)
  end
end
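Because each call rewrites outfile, a sketch for multiple pages would use one file per page:

  hn.to_yml "page1.yml"              # dumps the internal article array
  hn.to_yml "page2.yml", other_arr   # other_arr: any array of article hashes (hypothetical)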