class SpeakerdeckScraper

Constants

SD_DOMAIN
SD_QUERY_FIRST_PAGE

Attributes

display[RW]
end_time[RW]
opts[RW]
page_object[R]
presentations[R]
query[RW]
start_time[RW]
url[R]

Public Class Methods

new(query, range = 5, display = '-v') click to toggle source
# File lib/spdeck-scrape/spdeck-scraper-class.rb, line 13
def initialize(query, range = 5, display = '-v')
    @url = "https://speakerdeck.com/"
    @query = query
    @page_object = ''
    @presentations = {}
    @start_time = Time.now
    @range = range
    @display = display
end

Public Instance Methods

concise_display() click to toggle source
# File lib/spdeck-scrape/spdeck-scraper-class.rb, line 62
def concise_display
    print "#"
    sleep(0.02)
end
html_gen() click to toggle source
# File lib/spdeck-scrape/spdeck-scraper-class.rb, line 124
    def html_gen
        # take data and sort it by views descending
        sorted_array = self.presentations.values.sort_by do |pres_hash| 
            pres_hash[:views]
        end.reverse

        File.open("spd-#{query}.html", "w") do |file|
            file.write( <<-HTML
                <html>
                <head>
                </head>
                <body>
                <h1>speakerdeck presentations - #{query}</h1>
                <h4>this site was generated in #{self.end_time - self.start_time} seconds (last queried at #{self.start_time})
                    <table class="tablesorter" border="1">
                    <tr>
                        <th>title</th>
                        <th>date</th>
                        <th>category</th>
                        <th>author</th>
                        <th>views</th>
                    </tr>
            HTML
            )
            sorted_array.each do |content_hash|
                link = "#{SD_DOMAIN}#{content_hash[:link]}"
                author_link = "#{SD_DOMAIN}#{content_hash[:author_link]}"
                file.write ( <<-HTML
                    <tr>
                        <td><a href=#{link}>#{content_hash[:title]}</a></td>
                        <td>#{content_hash[:date]}</td>
                        <td><a href="https://speakerdeck.com/c/#{content_hash[:category].downcase}">#{content_hash[:category]}</a></td>
                        <td><a href=#{author_link}>#{content_hash[:author]}</a></td>
                        <td>#{content_hash[:views]}</td>
                    </tr>  
                    HTML
                )
            end
            file.write(<<-HTML
                </table>
                </body>
                </html>
                HTML
                )
        end
    end
pres_author(pres_page) click to toggle source
# File lib/spdeck-scrape/spdeck-scraper-class.rb, line 108
def pres_author(pres_page)
    pres_page.css('div#content header h2 a').text
end
pres_category(pres_page) click to toggle source
# File lib/spdeck-scrape/spdeck-scraper-class.rb, line 120
def pres_category(pres_page)
    pres_page.css('div#talk-details mark a').text
end
pres_date(pres_page) click to toggle source
# File lib/spdeck-scrape/spdeck-scraper-class.rb, line 116
def pres_date(pres_page)
    pres_page.css('div#talk-details mark').first.text.strip
end
pres_page_scrape(id, pres_link) click to toggle source

grab data from one page note: this is a time consuming process – have to open each page (but necessary because the views data isn't stored on the query pages)

# File lib/spdeck-scrape/spdeck-scraper-class.rb, line 79
def pres_page_scrape(id, pres_link)
    pres_page = Nokogiri::HTML(open("https://speakerdeck.com#{pres_link}"))
    
    presentations[id] = { 
        :title => pres_title(pres_page),
        :link => pres_link,
        :date => pres_date(pres_page),
        :author => pres_author(pres_page),
        :author_link => pres_author_link(pres_page),
        :category => pres_category(pres_page), 
        :views => pres_views(pres_page)
        }

    if self.display == '-c'
        concise_display
    else
    puts "#{presentations[id][:title]} has #{presentations[id][:views]} views!"
    end
end
pres_title(pres_page) click to toggle source
# File lib/spdeck-scrape/spdeck-scraper-class.rb, line 103
def pres_title(pres_page)
    pres_page.css('div#content header h1').text

end
pres_views(pres_page) click to toggle source
# File lib/spdeck-scrape/spdeck-scraper-class.rb, line 99
def pres_views(pres_page)
    pres_page.css('li.views').text.scan(/\d+/).join.to_i
end
query_results_scrape(range) click to toggle source
# File lib/spdeck-scrape/spdeck-scraper-class.rb, line 23
def query_results_scrape(range)
    puts "grabbing presentations"
    begin
    single_results_page_scrape(SD_QUERY_FIRST_PAGE)
    (2..range).collect do |i|
        single_results_page_scrape(i)
    end
    rescue
        puts "error! prob nothing to worry about"
    end
    puts "\ncool! we got #{presentations.length} presentations"
end
scrape_all() click to toggle source

wrapper to run the single page scraper for all links

# File lib/spdeck-scrape/spdeck-scraper-class.rb, line 69
def scrape_all
    puts "reading presentation data"
    self.presentations.each do |id, link|
        pres_page_scrape(id, link)
    end
    self.end_time = Time.now
end
single_results_page_scrape(i) click to toggle source

dumps the query results into a hash, presentations = { 'pres title' => 'pres_link.html' } not called explicitly, lives in query scrape wrapper

# File lib/spdeck-scrape/spdeck-scraper-class.rb, line 38
def single_results_page_scrape(i)
    doc = Nokogiri::HTML(open "#{self.url}search?page=#{i}&q=#{query}")
    doc.css('div.talk').each do |presentation|
        # ensures a unique key in the hash
        pres_id = presentation.attr('data-id')
        
        pres_link = presentation.css('h3.title a').attr('href').text

        pres_title = presentation.css('h3.title').text.strip
        author_name = presentation.parent.css('h3.title a').last.text
        verbose_display(pres_title, author_name) if self.display == "-v"
        concise_display if self.display == "-c"

        self.presentations[pres_id] = pres_link 
    end
end
verbose_display(pres_title, author_name) click to toggle source

display options ############

# File lib/spdeck-scrape/spdeck-scraper-class.rb, line 56
def verbose_display(pres_title, author_name)
    good_words = ["awesome", "great", "amazing", "really cool", "tops", "mind-blowing", "super", "glittering", "thought-provoking", "glorious", "sweet", "classy","really great", "fun", "strong", "robust", "healthy", "fine", "superior", "quality", "thoughful", "intelligent", "clever", "genius","incredible", "smart", "beautiful", "handsome", "pulchritudinous", "elegant", "bespoke", "crazy", "satisfying", "inspirational", "inspiring", "mind-exploding", "hot"]
    puts "grabbed a #{good_words[rand(good_words.length)]} presentation #{pres_title} by #{author_name}"
    sleep(0.02)
end