class Ring::SQA::Alarm

Constants

CFG
Config

Public Class Methods

new(nodes) click to toggle source
# File lib/ring/sqa/alarm.rb, line 36
# Sets up the alarm dispatcher: remembers the node database, builds the list
# of notification backends that are enabled in the configuration, and starts
# in the "no alarm raised" state.
#
# nodes - stored in @nodes; alarm_send forwards it to every backend.
#
# NOTE(review): the bare CFG and Ring::SQA::CFG are referenced separately
# here; they may resolve to different configuration objects - confirm before
# unifying them.
def initialize nodes
  @nodes   = nodes
  @methods = []

  # Optional backends, each gated by its own configuration predicate.
  @methods.push(Email.new)   if CFG.email.to?
  @methods.push(UDP2IRC.new) if Array === CFG.irc || CFG.irc.password?
  @methods.push(Exec.new)    if CFG.exec.command?
  @methods.push(Slack.new)   if CFG.slack.url?

  # The collector backend is registered unconditionally.
  @methods.push(Collector.new)

  @hostname = Ring::SQA::CFG.host.name
  @afi      = Ring::SQA::CFG.afi
  @alarm    = false
end

Public Instance Methods

clear(alarm_buffer) click to toggle source
# File lib/ring/sqa/alarm.rb, line 24
# Clears a previously raised alarm.
#
# Does nothing (and returns nil) unless an alarm is currently raised.
# Otherwise resets the alarm flag, logs a one-line "clearing" notice and,
# when CFG.recovery.notify? is truthy, hands the notice to every configured
# backend with status 'clear'.
#
# alarm_buffer - passed through untouched to each backend via alarm_send.
def clear alarm_buffer
  return unless @alarm == true

  @alarm = false
  notice = "#{@hostname}: clearing #{@afi} alarm"
  # Short and long text are the very same string for a clear event.
  msg = { short: notice, long: notice }
  Log.info(notice)

  if CFG.recovery.notify?
    @methods.each do |alarm_method|
      alarm_send(alarm_method, 'clear', msg, alarm_buffer)
    end
  end
end
message(nodes_list, mtr_list, buffer_list, amount) click to toggle source
# File lib/ring/sqa/alarm/message.rb, line 5
    # Renders the long, human-readable alert text.
    #
    # nodes_list  - text block listing the nodes that became unreachable
    # mtr_list    - text block with traceroute output
    # buffer_list - text block describing the measurement buffer
    # amount      - number of newly unreachable nodes quoted in the text
    #
    # Host name, address family, tolerance and median window are read from
    # Ring::SQA::CFG; the timestamp is the current time in UTC.
    # Returns the complete message as a String.
    def message nodes_list, mtr_list, buffer_list, amount
      cfg = Ring::SQA::CFG
      # The heredoc body is deliberately flush left: `<<-` only allows the
      # terminator to be indented, the text itself is emitted verbatim.
      <<-MESSAGE
Regarding: #{cfg.host.name} #{cfg.afi}

This is an automated alert from the distributed partial outage
monitoring system 'RING SQA'.

At #{Time.now.utc} the following measurements were analysed
as indicating that there is a high probability your NLNOG RING node cannot
reach the entire internet. This could be down to your RING node, its local
network, or disruption of peering and/or upstream networks (for example
instability at an IXP or one of your transit providers).

The following #{amount} nodes previously were reachable, but became unreachable
over the course of the last 3 minutes:

#{nodes_list}

As a debug starting point 3 traceroutes were launched right after
detecting the event, they might assist in pinpointing what broke:

#{mtr_list}

An alarm is raised under the following conditions: every 30 seconds
your node pings all other nodes. The amount of nodes that cannot be
reached is stored in a circular buffer, with each element representing
a minute of measurements. In the event that the last three minutes are
#{cfg.analyzer.tolerance.relative} above the median of the previous #{cfg.analyzer.median_of} measurement slots, a partial
outage is assumed. The ring buffer's output is as following:

#{buffer_list}

Kind regards,

NLNOG RING
      MESSAGE
    end
set(alarm_buffer) click to toggle source
# File lib/ring/sqa/alarm.rb, line 15
# Raises the alarm.
#
# Does nothing (and returns nil) unless the alarm flag is currently false.
# Otherwise marks the alarm as raised, composes the notification from the
# buffer, logs its short form and hands it to every configured backend with
# status 'raise'.
#
# alarm_buffer - source for compose_message, also forwarded to each backend.
def set alarm_buffer
  return unless @alarm == false

  @alarm = true
  msg = compose_message(alarm_buffer)
  Log.info(msg[:short])
  @methods.each do |alarm_method|
    alarm_send(alarm_method, 'raise', msg, alarm_buffer)
  end
end

Private Instance Methods

alarm_send(alarm_method, status, msg, alarm_buffer) click to toggle source
# File lib/ring/sqa/alarm.rb, line 83
# Delivers one notification through a single backend.
#
# alarm_method - backend object; its own #send is invoked with the payload
# status       - status value passed along as :status
# msg          - Hash providing :short and :long text
# alarm_buffer - forwarded unchanged as :alarm_buffer
#
# The node database (@nodes) and address family (@afi) are added to the
# payload. Returns whatever the backend's #send returns.
def alarm_send alarm_method, status, msg, alarm_buffer
  payload = {
    short:        msg[:short],
    long:         msg[:long],
    status:       status,
    alarm_buffer: alarm_buffer,
    nodes:        @nodes,
    afi:          @afi
  }
  alarm_method.send(**payload)
end
compose_message(alarm_buffer) click to toggle source
# File lib/ring/sqa/alarm.rb, line 49
# Builds the notification for a freshly raised alarm.
#
# alarm_buffer - must respond to #exceeding_nodes (identifiers that are
#                resolved through @nodes.get) and #array (one entry per
#                buffer slot).
#
# Returns a Hash with :short (one-line headline) and :long (the full text
# produced by #message).
def compose_message alarm_buffer
  down_ids = alarm_buffer.exceeding_nodes
  reason =
    if down_ids.size > 0
      "#{down_ids.size} new nodes down"
    else
      'many nodes were unreachable, general instability'
    end
  msg = { short: "#{@hostname}: raising #{@afi} alarm - #{reason}" }

  down_nodes = down_ids.map { |id| @nodes.get(id) }

  # The address column is wider for IPv6 than for anything else.
  addr_width = @afi == 'ipv6' ? 40 : 15
  node_row   = "- %-35s %#{addr_width}s  AS%-6s  %2s\n"
  by_country = down_nodes.sort_by { |node| node[:cc] }
  nodes_list = by_country.each_with_object('') do |node, text|
    text << format(node_row, node[:name], node[:ip], node[:as], node[:cc])
  end

  # Trace towards at most three randomly picked unreachable nodes.
  mtr_list = down_nodes.sample(3).each_with_object('') do |node, text|
    text << format("%-35s AS%-6s (%2s)\n", node[:name], node[:as], node[:cc])
    text << MTR.run(node[:ip])
    text << "\n"
  end

  # Slots are labelled by age, counting down to 0 for the last entry; slots
  # with an age below 3 are the ones reported as having raised the alarm.
  slots  = alarm_buffer.array
  newest = slots.size - 1
  buffer_rows = slots.each_with_index.map do |slot, index|
    age   = newest - index
    label = age < 3 ? " (raised alarm)\n" : " (baseline)\n"
    format("%2s min ago %3s measurements failed", age, slot.size / 2) << label
  end
  buffer_list = buffer_rows.join

  msg[:long] = message(nodes_list, mtr_list, buffer_list, down_nodes.size)
  msg
end