module GoldTweets::Client

Constants

AUTHORID_SELECTOR
CONTENT_SELECTOR
DEFAULT_HEADERS

Static list of headers to be sent with API requests

DEFAULT_PARAMETERS

Static list of parameters sent with a search

FAVORITES_SELECTOR
GEO_SELECTOR
REPLIES_SELECTOR
RETWEETS_SELECTOR
Response

Interim response structure useful for tweet fetch and processing logic

SEARCH_PREFIX

URLs for searching and generating permalinks back to tweets

TIMESTAMP_SELECTOR
TWEETS_SELECTOR

XPath selectors

USERNAMES_PER_BATCH

How many usernames to put in a single search

USERNAMES_SELECTOR
USER_AGENTS

User agents to present to Twitter search

Public Class Methods

get_tweets(criteria) click to toggle source

Fetch tweets based on a GoldTweets::Search object. This functionality presently lacks several features of the original Python library, among them proxy support, emoji handling, and running a caller-supplied block on tweets as they are processed.

# File lib/goldtweets/client.rb, line 64
# Fetch every tweet matching +criteria+, searching usernames in batches of
# USERNAMES_PER_BATCH and paging through each batch's results.
#
# criteria - search object responding to #usernames, #usernames= and
#            #maximum_tweets (plus whatever fetch_tweets reads from it).
#
# Returns a flat Array of the objects produced by parse_tweet.
#
# The search object's usernames are temporarily replaced with the batch
# being searched and are restored before this method returns.
def self.get_tweets(criteria)
  user_agent         = USER_AGENTS.sample
  cookie_jar         = ''
  original_usernames = criteria.usernames
  # nil or non-numeric limits become 0, which means "no limit".
  limit              = criteria.maximum_tweets.to_i

  # usernames_for signals "no usernames" with a nested empty array; flatten
  # it so a username-less search runs as exactly one empty batch.
  usernames = usernames_for(original_usernames).flatten
  batches   = usernames.each_slice(USERNAMES_PER_BATCH).to_a
  batches   = [[]] if batches.empty?

  begin
    batches.flat_map do |batch|
      refresh_cursor   = ''
      collected_tweets = []
      criteria.usernames = batch

      loop do
        response       = fetch_tweets(criteria, refresh_cursor, cookie_jar, user_agent)
        cookie_jar     = response.new_cookies if response.new_cookies
        refresh_cursor = response.new_cursor

        tweets = response.body.xpath(TWEETS_SELECTOR).reduce([], &method(:parse_tweet))
        collected_tweets.concat(tweets)

        # An empty page means there is nothing further to collect; without
        # this check a response that keeps claiming more items loops forever.
        break if tweets.empty? || !response.more_items
        break if limit > 0 && collected_tweets.length >= limit
      end

      # The last page may overshoot the limit, so trim each batch to it.
      limit > 0 ? collected_tweets.first(limit) : collected_tweets
    end
  ensure
    # Do not leave the caller's search object pointing at the last batch.
    criteria.usernames = original_usernames
  end
end

Private Class Methods

fetch_tweets(criteria, refresh_cursor, cookie_jar, user_agent) click to toggle source

Perform a search for tweets based on criteria specified

# File lib/goldtweets/client.rb, line 177
# Perform one search request for tweets matching +criteria+.
#
# criteria       - search object; the readers used below (query,
#                  exclude_words, username, since, upto, ...) are consulted
#                  to build the query string.
# refresh_cursor - String pagination cursor sent as `max_position`
#                  ('' for the first page).
# cookie_jar     - String of `name=value` pairs to send back to the server
#                  ('' or nil when there are none yet).
# user_agent     - String sent as the User-Agent header.
#
# Returns a Response built from the parsed `items_html`, the next cursor
# (`min_position`), the cookies set by this response (nil when none were
# set), and the `has_more_items` flag.
# Raises whatever Net::HTTP or JSON.parse raise on transport or parse errors.
def self.fetch_tweets(criteria, refresh_cursor, cookie_jar, user_agent)
  search = DEFAULT_PARAMETERS.dup
  search['f'] = 'tweets' unless criteria.top_tweets?
  search['l'] = criteria.language if criteria.language

  terms = []
  terms << criteria.query if criteria.query
  # Array() tolerates a nil exclusion list; each word becomes its own term.
  terms.concat(Array(criteria.exclude_words).map { |word| "-#{word}" })

  # NOTE(review): get_tweets assigns `criteria.usernames` while this method
  # reads `criteria.username` - confirm the search object exposes both.
  # Flattening drops empty/nested placeholders so no bogus `from:` is sent.
  authors = Array(criteria.username).flatten
  terms << authors.map { |u| "from:#{u}" }.join(' OR ') unless authors.empty?

  terms << "since:#{criteria.since}" if criteria.since
  terms << "until:#{criteria.upto}" if criteria.upto
  terms << "min_replies:#{criteria.minimum_replies}" if criteria.minimum_replies
  terms << "min_faves:#{criteria.minimum_faves}" if criteria.minimum_faves
  terms << "min_retweets:#{criteria.minimum_retweets}" if criteria.minimum_retweets

  # A location filter is only meaningful together with a distance.
  if criteria.maximum_distance
    if criteria.near
      terms << "near:#{criteria.near} within:#{criteria.maximum_distance}"
    elsif criteria.lat && criteria.lon
      terms << "geocode:#{criteria.lat},#{criteria.lon},#{criteria.maximum_distance}"
    end
  end

  search['q'] = terms.join(' ').strip
  search['max_position'] = refresh_cursor

  url = SEARCH_PREFIX + URI.encode_www_form(search)
  uri = URI(url)

  request = Net::HTTP::Get.new(uri)
  DEFAULT_HEADERS.each { |(k, v)| request[k] = v }
  request['User-Agent'] = user_agent
  request['Referer'] = url
  # Clients return cookies in the `Cookie` header; `Set-Cookie` is the
  # server-to-client direction and is ignored on requests.
  request['Cookie'] = cookie_jar unless cookie_jar.to_s.empty?

  response = Net::HTTP.start(uri.host, uri.port, use_ssl: true) do |http|
    http.request(request)
  end

  json = JSON.parse(response.body)

  # Keep only the `name=value` part of every Set-Cookie header so the result
  # can be sent back verbatim as a Cookie header on the next request.
  cookie_pairs = Array(response.get_fields('set-cookie'))
                 .map { |cookie| cookie[/\A[^;]*/].strip }
                 .reject(&:empty?)
  new_cookies = cookie_pairs.empty? ? nil : cookie_pairs.join('; ')

  Response.new(
    Nokogiri::HTML(json['items_html']),
    json['min_position'],
    new_cookies,
    json['has_more_items']
  )
end
parse_tweet(tweets, tweet) click to toggle source

Function for folding a list of Nokogiri objects fetched from Twitter into a list of GoldTweets::Tweet objects

# File lib/goldtweets/client.rb, line 110
# Reducer step: turn one tweet node into a GoldTweets::Tweet and append it
# to the list accumulated so far.
#
# tweets - Array of tweets parsed so far (never mutated).
# tweet  - node responding to #xpath and #attr.
#
# Returns a new Array with the parsed tweet appended, or +tweets+ itself
# when the node carries no usernames.
def self.parse_tweet(tweets, tweet)
  usernames = tweet.xpath(USERNAMES_SELECTOR).map(&:text)
  return tweets if usernames.empty?

  content_node = tweet.xpath(CONTENT_SELECTOR).first
  body_text    = content_node && sanitize_message(content_node)

  retweet_count, fave_count, reply_count = tweet_interactions(tweet)
  tweet_url = PERMALINK_PREFIX + tweet.attr('data-permalink-path')

  author_node = tweet.xpath(AUTHORID_SELECTOR).first
  author_id   = author_node && author_node.attr('data-user-id').to_i

  time_node = tweet.xpath(TIMESTAMP_SELECTOR).first
  posted_at = time_node && Time.at(time_node.attr('data-time').to_i)

  anchors            = tweet.xpath(LINK_SELECTOR)
  hashtags, mentions = tweet_hashtags_and_mentions(anchors)

  geo_node = tweet.xpath(GEO_SELECTOR).first
  location = (geo_node && geo_node.attr('title')).to_s

  # Only anchors that carry an expanded URL count as external links.
  expanded_urls = anchors.map { |anchor| anchor.attr('data-expanded-url') }
                         .select { |expanded| expanded }

  parsed = ::GoldTweets::Tweet.new(usernames.first).tap do |record|
    record.to        = usernames[1]
    record.text      = body_text
    record.retweets  = retweet_count
    record.faves     = fave_count
    record.replies   = reply_count
    record.id        = tweet.attr('data-tweet-id')
    record.permalink = tweet_url
    record.author_id = author_id
    record.timestamp = posted_at
    record.hashtags  = hashtags
    record.mentions  = mentions
    record.geo       = location
    record.links     = expanded_urls
  end

  tweets + [parsed]
end
sanitize_message(tweet) click to toggle source

Normalize spacing and remove errant spaces following pound signs, at signs, and dollar signs

# File lib/goldtweets/client.rb, line 144
# Collapse every run of whitespace in the node's text to a single space,
# then drop the space directly following a '#', '@' or '$'.
#
# tweet - object responding to #text.
#
# Returns the cleaned-up String.
def self.sanitize_message(tweet)
  single_spaced = tweet.text.gsub(/\s+/, ' ')
  single_spaced.gsub(/([#@\$]) /) { Regexp.last_match(1) }
end
tweet_hashtags_and_mentions(links) click to toggle source

Classify links belonging to hashtags and (outgoing) mentions within a tweet

# File lib/goldtweets/client.rb, line 162
# Split a tweet's links into hashtags and outgoing mentions.
#
# links - enumerable of nodes responding to #attr.
#
# Only site-relative links (href starting with '/') are considered: a link
# carrying `data-mentioned-user-id` is a mention ('@' + path), and a link
# under `/hashtag/` is a hashtag ('#' + tag, query string removed). Any
# other link is skipped without affecting the links that follow it.
#
# Returns a two-element Array: [hashtags, mentions].
def self.tweet_hashtags_and_mentions(links)
  links.each_with_object([[], []]) do |link, (hashtags, mentions)|
    href = link.attr('href').to_s
    # Skip just this link. A `return` here would leave the whole method and
    # discard every hashtag and mention found after the first external link.
    next unless href.start_with?('/')

    if link.attr('data-mentioned-user-id')
      mentions << "@#{href[1..-1]}"
    elsif href.start_with?('/hashtag/')
      hashtags << href.sub(%r{\A/hashtag/}, '#').sub(/\?.*$/, '')
    end
  end
end
tweet_interactions(tweet) click to toggle source

Classify interactions (retweets, faves, and replies) to a given tweet

# File lib/goldtweets/client.rb, line 151
# Read the interaction counters of a tweet node.
#
# tweet - node responding to #xpath.
#
# Returns [retweets, favorites, replies] as Integers; a counter whose node
# or `data-tweet-stat-count` attribute is missing comes back as 0.
def self.tweet_interactions(tweet)
  stat_selectors = [RETWEETS_SELECTOR, FAVORITES_SELECTOR, REPLIES_SELECTOR]

  stat_selectors.map do |stat_selector|
    # Only the first matching node is consulted.
    stat_node = tweet.xpath(stat_selector).first
    stat_node ? stat_node.attr('data-tweet-stat-count').to_i : 0
  end
end
usernames_for(users) click to toggle source

Coerce usernames into a suitable representation for batching

# File lib/goldtweets/client.rb, line 97
# Normalise the usernames given in a search into a list ready for batching.
#
# users - an Array of username Strings, a single username String, or
#         anything else (e.g. nil) when no usernames were given.
#
# Each username loses a leading '@' and is downcased. Returns an Array of
# Strings, or an Array holding one empty Array when +users+ is neither an
# Array nor a String.
def self.usernames_for(users)
  normalize = ->(name) { name.sub(/^@/, '').downcase }

  return users.map(&normalize) if users.is_a?(Array)
  return [normalize.call(users)] if users.is_a?(String)

  [[]]
end