module GoldTweets::Client
Constants
- AUTHORID_SELECTOR
- CONTENT_SELECTOR
- DEFAULT_HEADERS
Static list of headers to be sent with API requests
- DEFAULT_PARAMETERS
Static list of parameters sent with a search
- FAVORITES_SELECTOR
- GEO_SELECTOR
- LINK_SELECTOR
- PERMALINK_PREFIX
- REPLIES_SELECTOR
- RETWEETS_SELECTOR
- Response
Interim response structure useful for tweet fetch and processing logic
- SEARCH_PREFIX
URLs for searching and generating permalinks back to tweets
- TIMESTAMP_SELECTOR
- TWEETS_SELECTOR
XPath selectors
- USERNAMES_PER_BATCH
How many usernames to put in a single search
- USERNAMES_SELECTOR
- USER_AGENTS
User agents to present to Twitter search
Public Class Methods
Fetch tweets based on a GoldTweets::Search object. This functionality presently lacks several features of the original Python library, among them proxy support, emoji handling, and running a caller-provided block on tweets as they are processed.
# File lib/goldtweets/client.rb, line 64
# Fetch all tweets matching +criteria+, searching one batch of usernames at a
# time and following the result cursor until a batch is exhausted.
#
# criteria - object exposing +usernames+, +usernames=+ and +maximum_tweets+,
#            and whatever else fetch_tweets reads from it.
#
# Returns a flat Array of the objects produced by parse_tweet.
#
# Notes:
# * +maximum_tweets+ is applied per batch of usernames, and a batch may return
#   slightly more than the limit because whole result pages are collected.
# * +criteria.usernames+ is temporarily replaced with each batch while
#   searching and restored to its original value before returning, so the
#   same criteria object can be reused for another call.
def self.get_tweets(criteria)
  original_usernames = criteria.usernames
  user_agent = USER_AGENTS.sample
  cookie_jar = ''
  # Coerce once so the limit check never compares an Integer with nil/String.
  limit = criteria.maximum_tweets.to_i

  # usernames_for returns a nested placeholder ([[]]) when no usernames were
  # supplied; flatten it so no nested array is ever written into the criteria.
  usernames = usernames_for(original_usernames).flatten
  batches = usernames.each_slice(USERNAMES_PER_BATCH).to_a
  # A search without usernames must still run exactly once.
  batches = [[]] if batches.empty?

  batches.flat_map do |batch|
    refresh_cursor = ''
    collected = []
    criteria.usernames = batch

    loop do
      response = fetch_tweets(criteria, refresh_cursor, cookie_jar, user_agent)
      cookie_jar = response.new_cookies if response.new_cookies
      refresh_cursor = response.new_cursor

      nodes = response.body.xpath(TWEETS_SELECTOR)
      collected.concat(nodes.reduce([], &method(:parse_tweet)))

      # Stop on an empty page as well as on the server's own "no more items"
      # flag; otherwise a server that keeps reporting more items would be
      # polled forever.
      break if nodes.empty? || !response.more_items
      break if limit > 0 && collected.length >= limit
    end

    collected
  end
ensure
  # Undo the per-batch mutation so callers get their criteria back intact.
  criteria.usernames = original_usernames
end
Private Class Methods
Perform a search for tweets based on criteria specified
# File lib/goldtweets/client.rb, line 177
# Perform a single search request and wrap the result in a Response.
#
# criteria       - search criteria; read for top_tweets?, language, query,
#                  exclude_words, usernames, since, upto, minimum_replies,
#                  minimum_faves, minimum_retweets, maximum_distance, near,
#                  lat and lon.
# refresh_cursor - cursor sent as +max_position+ ('' for the first page).
# cookie_jar     - String of "name=value; name=value" cookies to send back,
#                  or '' / nil for none.
# user_agent     - value for the User-Agent header.
#
# Returns Response.new(html, new_cursor, new_cookies, more_items), where
# +new_cookies+ is a String ready to be passed back as +cookie_jar+, or nil
# when the server set no cookies.
# Raises JSON::ParserError when the response body is not JSON.
def self.fetch_tweets(criteria, refresh_cursor, cookie_jar, user_agent)
  search = DEFAULT_PARAMETERS.dup
  search['f'] = 'tweets' unless criteria.top_tweets?
  search['l'] = criteria.language if criteria.language

  terms = []
  terms << criteria.query if criteria.query
  # Array() tolerates a nil exclude list.
  terms.concat(Array(criteria.exclude_words).map { |word| "-#{word}" })
  # Read the same accessor get_tweets reads and writes; flatten/compact so an
  # empty or nested batch never produces a bogus "from:" clause.
  usernames = Array(criteria.usernames).flatten.compact
  terms << usernames.map { |u| "from:#{u}" }.join(' OR ') unless usernames.empty?
  terms << "since:#{criteria.since}" if criteria.since
  terms << "until:#{criteria.upto}" if criteria.upto
  terms << "min_replies:#{criteria.minimum_replies}" if criteria.minimum_replies
  terms << "min_faves:#{criteria.minimum_faves}" if criteria.minimum_faves
  terms << "min_retweets:#{criteria.minimum_retweets}" if criteria.minimum_retweets

  if criteria.maximum_distance
    if criteria.near
      terms << "near:#{criteria.near} within:#{criteria.maximum_distance}"
    elsif criteria.lat && criteria.lon
      terms << "geocode:#{criteria.lat},#{criteria.lon},#{criteria.maximum_distance}"
    end
  end

  search['q'] = terms.join(' ').strip
  search['max_position'] = refresh_cursor

  url = SEARCH_PREFIX + URI.encode_www_form(search)
  uri = URI(url)

  Net::HTTP.start(uri.host, uri.port, use_ssl: true) do |http|
    request = Net::HTTP::Get.new(uri)
    DEFAULT_HEADERS.each { |name, value| request[name] = value }
    request['User-Agent'] = user_agent
    request['Referer'] = url
    # Clients return cookies in the Cookie header; Set-Cookie is response-only.
    request['Cookie'] = cookie_jar unless cookie_jar.to_s.empty?

    response = http.request(request)
    json = JSON.parse(response.body)
    html = Nokogiri::HTML(json['items_html'])

    # Keep only the name=value part of every Set-Cookie header (dropping
    # Path, Expires, ... attributes) so the result is a valid Cookie value.
    pairs = Array(response.get_fields('set-cookie'))
            .map { |cookie| cookie.split(';', 2).first.to_s.strip }
            .reject(&:empty?)
    new_cookies = pairs.empty? ? nil : pairs.join('; ')

    Response.new(html, json['min_position'], new_cookies, json['has_more_items'])
  end
end
Function for folding a list of Nokogiri objects fetched from Twitter into a list of GoldTweets::Tweet objects
# File lib/goldtweets/client.rb, line 110
# Reducer step: convert one tweet node into a GoldTweets::Tweet and return the
# accumulator extended with it.
#
# tweets - Array accumulated so far (never mutated; a new Array is returned).
# tweet  - node responding to +xpath+ and +attr+.
#
# Returns +tweets+ unchanged when the node yields no usernames, otherwise a
# new Array ending with the freshly built tweet.
def self.parse_tweet(tweets, tweet)
  handles = tweet.xpath(USERNAMES_SELECTOR).map(&:text)
  return tweets if handles.empty?

  body_text = tweet.xpath(CONTENT_SELECTOR).map { |node| sanitize_message(node) }.first
  retweet_count, fave_count, reply_count = tweet_interactions(tweet)
  tweet_url = PERMALINK_PREFIX + tweet.attr('data-permalink-path')

  author_node = tweet.xpath(AUTHORID_SELECTOR).first
  time_node = tweet.xpath(TIMESTAMP_SELECTOR).first
  anchors = tweet.xpath(LINK_SELECTOR)
  hashtags, mentions = tweet_hashtags_and_mentions(anchors)
  geo_node = tweet.xpath(GEO_SELECTOR).first

  # Insertion order matters: setters are invoked in exactly this sequence.
  fields = {
    to: handles[1],
    text: body_text,
    retweets: retweet_count,
    faves: fave_count,
    replies: reply_count,
    id: tweet.attr('data-tweet-id'),
    permalink: tweet_url,
    author_id: (author_node.attr('data-user-id').to_i if author_node),
    timestamp: (Time.at(time_node.attr('data-time').to_i) if time_node),
    hashtags: hashtags,
    mentions: mentions,
    geo: (geo_node ? geo_node.attr('title') : nil).to_s,
    links: anchors.map { |anchor| anchor.attr('data-expanded-url') }.compact
  }

  record = ::GoldTweets::Tweet.new(handles.first)
  fields.each { |name, value| record.public_send("#{name}=", value) }
  [*tweets, record]
end
Normalize spacing and remove errant spaces following pound signs, at signs, and dollar signs
# File lib/goldtweets/client.rb, line 144
# Clean up the text of a tweet content node.
#
# tweet - object responding to +text+.
#
# Returns the text with every whitespace run collapsed to one space and the
# single space directly after '#', '@' or '$' removed. Leading and trailing
# spaces are not stripped.
def self.sanitize_message(tweet)
  single_spaced = tweet.text.gsub(/\s+/, ' ')
  single_spaced.gsub(/([#@\$]) /) { Regexp.last_match(1) }
end
Extract interaction counts (retweets, faves, and replies) for a given tweet
# File lib/goldtweets/client.rb, line 151 def self.tweet_interactions(tweet) [RETWEETS_SELECTOR, FAVORITES_SELECTOR, REPLIES_SELECTOR].map do |selector| tweet.xpath(selector) .map { |node| node.attr('data-tweet-stat-count') } .first .to_i end end
Coerce usernames into a suitable representation for batching
# File lib/goldtweets/client.rb, line 97
# Coerce the usernames given in a search into a list suitable for batching.
#
# users - Array of handles, a single String or Symbol handle, or anything
#         else (e.g. nil) meaning "no usernames".
#
# Each handle is stripped of surrounding whitespace and one leading '@', then
# downcased; blank entries in an Array are dropped.
#
# Returns an Array of normalized handles, or the placeholder [[]] when no
# usable username was given. The nested placeholder is kept for backward
# compatibility: get_tweets slices this result into batches, so it appears to
# exist to force a single batch -- TODO confirm the intent with the author.
def self.usernames_for(users)
  # \A anchors at the start of the string; ^ would also match after a newline.
  normalize = ->(name) { name.to_s.strip.sub(/\A@/, '').downcase }

  case users
  when Array
    users.map(&normalize).reject(&:empty?)
  when String, Symbol
    handle = normalize.call(users)
    handle.empty? ? [[]] : [handle]
  else
    [[]]
  end
end