class BNextRobot

BNextRobot extracts the titles and links of the daily and weekly hot feeds on bnext.com.tw.
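
A minimal usage sketch, assuming the class is loaded via the require path implied by lib/ext_class/bnext_robot.rb (the actual path may differ) and that bnext.com.tw is reachable:

require 'ext_class/bnext_robot'  # assumed require path

robot = BNextRobot.new   # downloads the front page and builds the rank feeds
robot.show_day_rank      # prints "title: link" for each daily hot feed
robot.show_week_rank     # prints "title: link" for each weekly hot feed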

Constants

CONTENT_XPATH
FEED_XPATH
IMGS_XPATH
INFO_XPATH
TAG_XPATH
TITLE_XPATH

Attributes

day_rank_feeds[RW]
week_rank_feeds[RW]

Public Class Methods

new()
# File lib/ext_class/bnext_robot.rb, line 22
def initialize
  load_page('http://www.bnext.com.tw/') # fetch the front page markup
  analyze                               # collect category name => URL pairs
  init_rank_feeds                       # populate @day_rank_feeds / @week_rank_feeds
end

Public Instance Methods

_extract_feed(feed_id)
# File lib/ext_class/bnext_robot.rb, line 85
def _extract_feed(feed_id)
  # feed_id is expected to begin with '/', so drop the trailing '/' from @domain.
  query_url = @domain[0..-2] + feed_id.to_s
  document = Oga.parse_html(open(query_url))

  # Extract each field independently; if an XPath lookup fails the field stays nil.
  # Text is normalized to UTF-8 before the label prefixes are stripped.
  title   = document.xpath(TITLE_XPATH).text.force_encoding('utf-8') rescue nil
  author  = document.xpath(INFO_XPATH)[0].text.force_encoding('utf-8').gsub('撰文者:', '') rescue nil
  date    = document.xpath(INFO_XPATH)[1].text.force_encoding('utf-8').gsub('發表日期:', '') rescue nil
  content = document.xpath(CONTENT_XPATH).text.force_encoding('utf-8') rescue nil
  tags    = document.xpath(TAG_XPATH).map { |i| i.text.force_encoding('utf-8') } rescue nil
  imgs    = document.xpath(IMGS_XPATH).map { |i| i.text.force_encoding('utf-8') } rescue nil

  Feed.new(title, author, date, tags, query_url, content, imgs)
end
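
A hedged example of calling the extractor directly; the feed id string below is hypothetical, real ids are the FEED_XPATH texts that get_feeds passes in:

robot = BNextRobot.new
feed = robot._extract_feed('/article/view/12345')  # hypothetical feed id
puts feed.title
puts feed.author
puts feed.tags.join(', ') if feed.tags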
analyze()
# File lib/ext_class/bnext_robot.rb, line 28
def analyze
  # Scan the downloaded markup for <li> items and keep the first <a> tag in each.
  cat_tags = @web_data.scan(/<li>.*?<\/li>/)
  atags = cat_tags.map { |x| x.match(/<a.*?<\/a>/).to_s }
  hrefs = atags.map { |x| x.match(/href=".*?"/).to_s[7..-2] }  # drop the leading 'href="/' and the closing quote
  cat_names = atags.map { |x| x.match(/>.+?</).to_s[1..-2] }   # link text between '>' and '<'
  cats_pair = cat_names.zip(hrefs).select { |_name, ref| ref.start_with?('categories') }

  @cats = Hash.new(false)
  cats_pair.each { |name, ref| @cats[name] = @domain + ref }
  nil
end
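
A self-contained sketch of the same scan/match/zip steps, run against an inline snippet rather than the real page (the markup below is hypothetical):

html  = '<li><a href="/categories/tech">Tech</a></li><li><a href="/about">About</a></li>'
atags = html.scan(/<li>.*?<\/li>/).map { |x| x.match(/<a.*?<\/a>/).to_s }
hrefs = atags.map { |x| x.match(/href=".*?"/).to_s[7..-2] }  # => ["categories/tech", "about"]
names = atags.map { |x| x.match(/>.+?</).to_s[1..-2] }       # => ["Tech", "About"]
names.zip(hrefs).select { |_name, ref| ref.start_with?('categories') }
# => [["Tech", "categories/tech"]]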
get_feeds(cat, page_no)
# File lib/ext_class/bnext_robot.rb, line 70
def get_feeds(cat, page_no)
  # Parse every feed listed on page page_no of the given category.
  query_url = @domain + "categories/#{cat}/?p=#{page_no}"
  document = Oga.parse_html(open(query_url))
  feed_ids = document.xpath(FEED_XPATH).map(&:text)
  feed_ids.map { |feed_id| _extract_feed(feed_id) }
end
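
A hedged usage sketch; the category slug and page number below are hypothetical, valid categories being the ones analyze collects from the front page:

robot = BNextRobot.new
feeds = robot.get_feeds('tech', 1)  # hypothetical category slug
feeds.each { |feed| puts "#{feed.title} (#{feed.date})" }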
init_rank_feeds()
# File lib/ext_class/bnext_robot.rb, line 50
def init_rank_feeds
  # token_gen is joined around 'day' or 'week' to build the XPath for each rank box.
  token_gen = ["//div[@id = '", "_rank']//a[@class = 'content']"]
  document = Oga.parse_html(@web_data)

  day_rank_hrefs = document.xpath(token_gen.join('day') + '/@href').map(&:text)
  week_rank_hrefs = document.xpath(token_gen.join('week') + '/@href').map(&:text)

  day_rank_titles = document.xpath(token_gen.join('day')).map(&:text)
  week_rank_titles = document.xpath(token_gen.join('week')).map(&:text)

  # Keep only site-relative links and expand them into absolute URLs.
  day_rank = day_rank_titles.zip(day_rank_hrefs).select { |_title, href| href.start_with?('/') }
  day_rank = day_rank.map { |title, href| [title, @domain + href[1..-1]] }
  week_rank = week_rank_titles.zip(week_rank_hrefs).select { |_title, href| href.start_with?('/') }
  week_rank = week_rank.map { |title, href| [title, @domain + href[1..-1]] }

  @day_rank_feeds = day_rank.map { |title, href| Feed.new(title, '', '', [], href, '') }
  @week_rank_feeds = week_rank.map { |title, href| Feed.new(title, '', '', [], href, '') }
  nil
end
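
The two-element token_gen array is joined around 'day' or 'week' to produce the XPath for each rank box, for example:

["//div[@id = '", "_rank']//a[@class = 'content']"].join('day')
# => "//div[@id = 'day_rank']//a[@class = 'content']"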
show_day_rank()
# File lib/ext_class/bnext_robot.rb, line 40
def show_day_rank
  @day_rank_feeds.each { |feed| puts "#{feed.title}: #{feed.link}" }
  nil
end
show_week_rank()
# File lib/ext_class/bnext_robot.rb, line 45
def show_week_rank
  @week_rank_feeds.each { |feed| puts "#{feed.title}: #{feed.link}" }
  nil
end
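
The same data can be read without printing through the day_rank_feeds and week_rank_feeds attributes, for example:

robot = BNextRobot.new
robot.week_rank_feeds.map { |feed| [feed.title, feed.link] }  # => [[title, url], ...]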