class Ring::SQA::Alarm
Constants
- CFG
- Config
Public Class Methods
Source
# File lib/ring/sqa/alarm.rb, line 36
# Sets up the alarm with the node directory and the list of notification
# backends selected by configuration.
#
# Backends are appended in a fixed order: Email, UDP2IRC, Exec, Slack, and
# finally Collector, which is always registered. The alarm starts in the
# "not raised" state.
#
# nodes - object stored as @nodes and later handed to every backend.
def initialize(nodes)
  @nodes   = nodes
  @methods = []

  @methods.push(Email.new) if CFG.email.to?

  # IRC is enabled either by an Array-valued irc section or by a single
  # section that carries a password.
  irc_configured = Array === CFG.irc || CFG.irc.password?
  @methods.push(UDP2IRC.new) if irc_configured

  @methods.push(Exec.new)  if CFG.exec.command?
  @methods.push(Slack.new) if CFG.slack.url?
  @methods.push(Collector.new)

  @hostname = Ring::SQA::CFG.host.name
  @afi      = Ring::SQA::CFG.afi
  @alarm    = false
end
Public Instance Methods
Source
# File lib/ring/sqa/alarm.rb, line 24
# Clears a currently raised alarm.
#
# Does nothing unless the alarm is raised. Otherwise the state is reset, the
# clearing message is logged, and - only when CFG.recovery.notify? is set -
# every backend receives a 'clear' notification whose short and long texts
# are the same string.
#
# alarm_buffer - passed through unchanged to each backend.
def clear(alarm_buffer)
  return unless @alarm == true

  @alarm = false
  summary = "#{@hostname}: clearing #{@afi} alarm"
  msg = { short: summary, long: summary }
  Log.info(summary)

  return unless CFG.recovery.notify?

  @methods.each do |alarm_method|
    alarm_send(alarm_method, 'clear', msg, alarm_buffer)
  end
end
Source
# Builds the long-form alert text and returns it as a single interpolated
# string.
#
# nodes_list  - interpolated after the sentence announcing which nodes became
#               unreachable.
# mtr_list    - interpolated after the sentence about the traceroutes.
# buffer_list - interpolated after "The ring buffer's output is as following:".
# amount      - interpolated as the number of newly unreachable nodes.
#
# The text also reads Ring::SQA::CFG.host.name, Ring::SQA::CFG.afi,
# Ring::SQA::CFG.analyzer.tolerance.relative and
# Ring::SQA::CFG.analyzer.median_of, and embeds Time.now.utc taken when the
# method runs.
#
# NOTE(review): this listing shows the method collapsed onto one line, so the
# original line breaks inside the string literal are not visible here -
# confirm against lib/ring/sqa/alarm/message.rb before editing the text.
# File lib/ring/sqa/alarm/message.rb, line 5 def message nodes_list, mtr_list, buffer_list, amount "Regarding: #{Ring::SQA::CFG.host.name} #{Ring::SQA::CFG.afi} This is an automated alert from the distributed partial outage monitoring system 'RING SQA'. At #{Time.now.utc} the following measurements were analysed as indicating that there is a high probability your NLNOG RING node cannot reach the entire internet. This could be down to your RING node, its local network, or disruption of peering and/or upstream networks (for example instability at an IXP or one of your transit providers). The following #{amount} nodes previously were reachable, but became unreachable over the course of the last 3 minutes: #{nodes_list} As a debug starting point 3 traceroutes were launched right after detecting the event, they might assist in pinpointing what broke: #{mtr_list} An alarm is raised under the following conditions: every 30 seconds your node pings all other nodes. The amount of nodes that cannot be reached is stored in a circular buffer, with each element representing a minute of measurements. In the event that the last three minutes are #{Ring::SQA::CFG.analyzer.tolerance.relative} above the median of the previous #{Ring::SQA::CFG.analyzer.median_of} measurement slots, a partial outage is assumed. The ring buffer's output is as following: #{buffer_list} Kind regards, NLNOG RING " end
Source
# File lib/ring/sqa/alarm.rb, line 15
# Raises the alarm if it is not already raised.
#
# When @alarm is false the state flips to raised, the alert message is
# composed from the buffer, its short form is logged, and every backend
# receives a 'raise' notification. Any other state is a no-op.
#
# alarm_buffer - used to compose the message and passed to each backend.
def set(alarm_buffer)
  return unless @alarm == false

  @alarm = true
  msg = compose_message(alarm_buffer)
  Log.info(msg[:short])

  @methods.each do |alarm_method|
    alarm_send(alarm_method, 'raise', msg, alarm_buffer)
  end
end
Private Instance Methods
Source
# File lib/ring/sqa/alarm.rb, line 83
# Delivers one notification to a single backend by calling its `send` with
# keyword arguments.
#
# alarm_method - backend object; presumably defines its own `send` taking
#                these keywords (verify against the backend classes).
# status       - status label forwarded as-is ('raise' / 'clear' in callers).
# msg          - hash providing the :short and :long texts.
# alarm_buffer - forwarded unchanged.
#
# Returns whatever the backend's `send` returns.
def alarm_send(alarm_method, status, msg, alarm_buffer)
  alarm_method.send(
    short:        msg[:short],
    long:         msg[:long],
    status:       status,
    alarm_buffer: alarm_buffer,
    nodes:        @nodes,
    afi:          @afi
  )
end
Source
# File lib/ring/sqa/alarm.rb, line 49
# Composes the short and long texts for a newly raised alarm.
#
# The short text names the host and address family and either counts the
# newly unreachable nodes or, when there are none, reports general
# instability. The long text is produced by `message` from three rendered
# sections: the affected nodes sorted by :cc, MTR output for up to three
# randomly sampled affected nodes, and one line per buffer slot.
#
# alarm_buffer - must respond to `exceeding_nodes` and `array`.
#
# Returns a hash with :short and :long keys.
def compose_message(alarm_buffer)
  down_ids = alarm_buffer.exceeding_nodes

  reason =
    if down_ids.size > 0
      "#{down_ids.size} new nodes down"
    else
      'many nodes were unreachable, general instability'
    end
  msg = { short: "#{@hostname}: raising #{@afi} alarm - #{reason}" }

  # Resolve each identifier to its node record through the node directory.
  down_nodes = down_ids.map { |id| @nodes.get(id) }

  # The address column is wider for IPv6 addresses.
  addr_len   = @afi == 'ipv6' ? 40 : 15
  row_format = "- %-35s %#{addr_len}s AS%-6s %2s\n"
  nodes_list = down_nodes.sort_by { |node| node[:cc] }.each_with_object('') do |node, out|
    out << format(row_format, node[:name], node[:ip], node[:as], node[:cc])
  end

  # Trace towards a random sample of at most three affected nodes.
  mtr_list = down_nodes.sample(3).each_with_object('') do |node, out|
    out << format("%-35s AS%-6s (%2s)\n", node[:name], node[:as], node[:cc])
    out << MTR.run(node[:ip])
    out << "\n"
  end

  # The first slot is labelled with the largest "min ago" value and the last
  # slot with 0; slots labelled below 3 are marked as having raised the alarm.
  slots       = alarm_buffer.array
  oldest_age  = slots.size - 1
  buffer_list = slots.each_with_index.each_with_object('') do |(slot, index), out|
    minutes_ago = oldest_age - index
    out << format("%2s min ago %3s measurements failed", minutes_ago, slot.size / 2)
    out << (minutes_ago < 3 ? " (raised alarm)\n" : " (baseline)\n")
  end

  msg[:long] = message(nodes_list, mtr_list, buffer_list, down_nodes.size)
  msg
end