class ChanCrawlerGem::Collector

Attributes

boards[R]
key_words[R]

Public Class Methods

new(boards, key_words)
# File lib/chanCrawlerGem.rb, line 19
# Builds a collector for the given boards and key words.
#
# boards    - collection of board identifiers; stored in @boards.
# key_words - collection of key words; stored in @key_words.
#
# Also starts with an empty @relevant_links list and copies the BASE_URL
# environment variable into the class variable @@base_url.
# NOTE(review): @@base_url is shared class-wide and is re-assigned on every
# instantiation; it will be nil when BASE_URL is not set.
def initialize(boards, key_words)
  @@base_url = ENV['BASE_URL']
  @boards = boards
  @relevant_links = []
  @key_words = key_words
end

Public Instance Methods

analyze_threads(threads, board)
# File lib/chanCrawlerGem.rb, line 45
# Records a link for every thread in +threads+ that thread_relevant? accepts.
#
# threads - enumerable of thread entries; each entry is read via the 'no' key.
# board   - board identifier interpolated into the link.
#
# Each link has the form "<@@base_url><board>/thread/<no>" and is pushed onto
# relevant_links.
# NOTE(review): relevant_links is not defined in this view; presumably a
# reader for the @relevant_links list created in initialize - confirm.
#
# Returns +threads+ (the receiver of each).
def analyze_threads(threads, board)
  threads.each do |thread|
    next unless thread_relevant?(thread)

    relevant_links.push("#{@@base_url}#{board}/thread/#{thread['no']}")
  end
end
board_catalog_urls()
# File lib/chanCrawlerGem.rb, line 26
# Maps every entry of +boards+ to the URL of its catalog.json document.
#
# Returns a Hash of board => "http://a.4cdn.org/<board>/catalog.json".
# NOTE(review): the host and the plain-http scheme are hard-coded here and do
# not use the BASE_URL value read in initialize - confirm this is intended.
def board_catalog_urls
  boards.each_with_object({}) do |board, urls|
    urls[board] = "http://a.4cdn.org/#{board}/catalog.json"
  end
end
get_relevant_threads()
# File lib/chanCrawlerGem.rb, line 54
# Downloads the catalog of every board and hands each non-empty page of
# threads to analyze_threads.
#
# For each board => url pair from board_catalog_urls the URL is fetched with
# HTTParty.get and the response body is parsed as JSON. Every parsed page is
# read via its 'threads' key; pages whose thread list is empty are skipped.
#
# Returns the Hash produced by board_catalog_urls (the receiver of each), not
# the collected links. HTTP and JSON errors are not rescued here.
def get_relevant_threads
  board_catalog_urls.each do |board, catalog_url|
    pages = JSON.parse(HTTParty.get(catalog_url).body)
    next if pages.count < 1

    pages.each do |page|
      threads = page['threads']
      analyze_threads(threads, board) if threads.count.positive?
    end
  end
end
thread_relevant?(thread)
# File lib/chanCrawlerGem.rb, line 33
# Decides whether a thread entry matches the collector's key words.
#
# A thread is relevant when its 'com' text contains every entry of @key_words
# (compared case-insensitively) and its 'images' value is positive.
#
# thread - Hash-like thread entry, read via the 'com' and 'images' keys.
#
# Returns true or false. A thread without 'com' is never relevant. When
# @key_words is empty nothing is left to check, so any thread that has a
# 'com' value counts as relevant.
def thread_relevant?(thread)
  comment = thread['com']
  return false if comment.nil?

  # Neither value depends on the key word, so compute both once up front.
  comment = comment.downcase
  # A missing 'images' value counts as zero instead of raising on nil.
  has_images = thread['images'].to_i.positive?

  # all? yields a real boolean; the previous each-loop leaked @key_words as
  # the "true" result of this predicate.
  @key_words.all? { |word| comment.include?(word.downcase) && has_images }
end