class ChanCrawlerGem::Collector
Attributes
boards[R]
key_words[R]
relevant_links[R]
Public Class Methods
new(boards, key_words)
click to toggle source
# File lib/chanCrawlerGem.rb, line 19
# Builds a collector for the given boards and key words.
#
# Starts with an empty list of relevant links and reads the BASE_URL
# environment variable into the class variable @@base_url. Because it is a
# class variable, it is reassigned on every construction.
#
# @param boards    board identifiers, stored as-is in @boards
# @param key_words key words, stored as-is in @key_words
def initialize(boards, key_words)
  @boards = boards
  @key_words = key_words
  @relevant_links = []
  @@base_url = ENV['BASE_URL']
end
Public Instance Methods
analyze_threads(threads, board)
click to toggle source
# File lib/chanCrawlerGem.rb, line 45
# Records a link for every thread in +threads+ that thread_relevant? accepts.
#
# Each link is built as "<@@base_url><board>/thread/<thread['no']>" and
# appended to relevant_links in iteration order.
#
# @param threads collection of thread entries; each is read via thread['no']
# @param board   board identifier interpolated into the link
# @return the result of threads.each
def analyze_threads(threads, board)
  threads.each do |thread|
    next unless thread_relevant?(thread)

    relevant_links << "#{@@base_url}#{board}/thread/#{thread['no']}"
  end
end
board_catalog_urls()
click to toggle source
# File lib/chanCrawlerGem.rb, line 26
# Maps every configured board to the URL of its catalog JSON.
#
# +boards+ is coerced with Array(), so a nil value yields an empty hash and
# a single board name is treated as a one-element list instead of raising.
#
# @return [Hash] board => "http://a.4cdn.org/<board>/catalog.json"
def board_catalog_urls
  Array(boards).each_with_object({}) do |board, catalogs|
    catalogs[board] = "http://a.4cdn.org/#{board}/catalog.json"
  end
end
get_relevant_threads()
click to toggle source
# File lib/chanCrawlerGem.rb, line 54
# Downloads each board's catalog and passes its threads to analyze_threads.
#
# For every board/URL pair from board_catalog_urls, the response body is
# fetched with HTTParty and parsed as JSON. Empty catalogs and pages whose
# 'threads' list is empty are skipped.
#
# @return the result of iterating the board => URL hash with each
def get_relevant_threads
  board_catalog_urls.each do |board, url|
    pages = JSON.parse(HTTParty.get(url).body)
    next if pages.count < 1

    pages.each do |page|
      threads = page['threads']
      next unless threads.count.positive?

      analyze_threads(threads, board)
    end
  end
end
thread_relevant?(thread)
click to toggle source
# File lib/chanCrawlerGem.rb, line 33
# Decides whether a thread entry matches the configured key words.
#
# A thread is relevant when its comment ('com') contains every entry of
# @key_words, compared case-insensitively, and its 'images' value is
# positive. A thread without a comment is never relevant. When @key_words is
# empty, any thread that has a comment is relevant.
#
# @param thread [Hash] thread entry; reads the 'com' and 'images' keys
# @return [Boolean] true or false, never the key word list itself
def thread_relevant?(thread)
  comment = thread['com']
  return false if comment.nil?

  # Downcase once instead of once per key word.
  comment = comment.downcase
  # to_i turns a missing 'images' value into 0 instead of raising on nil.
  has_images = thread['images'].to_i.positive?

  @key_words.all? { |word| has_images && comment.include?(word.downcase) }
end