module Tlsh

The Tlsh module implements an interface for TLSH (Trend Micro Locality Sensitive Hash) computation. Because TLSH is locality sensitive, it is well suited to diff and similarity computations on binary data.

Constants

LOG1_1
LOG1_3
LOG1_5
VERSION

Public Class Methods

diff_files(filename, other_filename) click to toggle source

Computes TLSH based diff between two files.

The closer the result is to 0, the smaller the diff. If either file is not found, an error is raised.

# File lib/tlsh/tlsh.rb, line 18
# Computes the TLSH-based diff between the contents of two files.
#
# The closer the result is to 0, the smaller the diff.
#
# Both files are read in binary mode so the hash is computed over the exact
# bytes on disk. A text-mode read may translate line endings or stop early
# at an EOF marker on some platforms, which would skew the hash of binary
# data.
#
# filename       - path of the first file.
# other_filename - path of the second file.
#
# Returns the value produced by calling `diff` on the first file's hash with
# the second file's hash.
# Raises the error from File.binread (e.g. Errno::ENOENT) when a file cannot
# be read.
def diff_files(filename, other_filename)
  # Read both files before hashing so a missing file is reported first.
  bytes_a = File.binread(filename).bytes
  bytes_b = File.binread(other_filename).bytes

  tlsh_hash(bytes_a).diff(tlsh_hash(bytes_b))
end
hash_bytes(blob) click to toggle source

Computes the TLSH of the given bytes.

# File lib/tlsh/tlsh.rb, line 40
# Computes the TLSH of a blob of bytes.
#
# blob - the data to hash. A String is unpacked into its byte values first,
#        the same way the file-based helpers feed tlsh_hash; any other
#        object is handed to tlsh_hash unchanged.
#
# Returns whatever tlsh_hash returns for the bytes.
def hash_bytes(blob)
  bytes = blob.is_a?(String) ? blob.bytes : blob
  tlsh_hash(bytes)
end
hash_file(filename) click to toggle source

Computes the TLSH of a file's contents.

If the file is not found, an error is raised.

# File lib/tlsh/tlsh.rb, line 32
# Computes the TLSH of a file's contents.
#
# The file is read in binary mode so the hash is computed over the exact
# bytes on disk. A text-mode read may translate line endings or stop early
# at an EOF marker on some platforms, which would skew the hash of binary
# data.
#
# filename - path of the file to hash.
#
# Returns whatever tlsh_hash returns for the file's bytes.
# Raises the error from File.binread (e.g. Errno::ENOENT) when the file
# cannot be read.
def hash_file(filename)
  tlsh_hash(File.binread(filename).bytes)
end

Private Class Methods

l_value(length) click to toggle source
# File lib/tlsh/tlsh.rb, line 62
# Maps a length to the single-byte length value stored in the hash.
#
# The length picks one of three helpers: up to 656 uses l_value_small, up to
# 3199 uses l_value_medium, and anything larger uses l_value_large. The
# helper's result is then masked down to its low 8 bits.
#
# length - the length to encode.
#
# Returns an Integer in 0..255.
def l_value(length)
  raw =
    if length <= 656 then l_value_small(length)
    elsif length <= 3199 then l_value_medium(length)
    else l_value_large(length)
    end

  raw & 0xFF
end
l_value_large(length) click to toggle source
# File lib/tlsh/tlsh.rb, line 83
# Length value for the large band (l_value uses this above 3199).
#
# Computes floor(ln(length) / LOG1_1 - 62.5472).
#
# length - the length to encode.
#
# Returns an Integer; l_value applies the byte mask afterwards.
def l_value_large(length)
  scaled = Math.log(length) / LOG1_1
  (scaled - 62.5472).floor
end
l_value_medium(length) click to toggle source
# File lib/tlsh/tlsh.rb, line 79
# Length value for the medium band (l_value uses this from 657 to 3199).
#
# Computes floor(ln(length) / LOG1_3 - 8.72777).
#
# length - the length to encode.
#
# Returns an Integer; l_value applies the byte mask afterwards.
def l_value_medium(length)
  scaled = Math.log(length) / LOG1_3
  (scaled - 8.72777).floor
end
l_value_small(length) click to toggle source
# File lib/tlsh/tlsh.rb, line 75
# Length value for the small band (l_value uses this up to 656).
#
# Computes floor(ln(length) / LOG1_5).
#
# length - the length to encode.
#
# Returns an Integer; l_value applies the byte mask afterwards.
def l_value_small(length)
  scaled = Math.log(length) / LOG1_5
  scaled.floor
end
tlsh_hash(input) click to toggle source
# File lib/tlsh/tlsh.rb, line 46
# Builds the TlshInstance for an input; the file helpers pass String#bytes.
#
# Steps: fill the buckets, take the three quartile points, derive the two
# quartile ratios, build the binary body, then assemble the instance.
#
# input - the data to hash; must have a size of at least 256.
#
# Returns a TlshInstance.
# Raises Tlsh::InputTooSmallError when input.size is below 256.
def tlsh_hash(input)
  raise Tlsh::InputTooSmallError if input.size < 256

  buckets, checksum, filesize = Buckets.fill_buckets(input)
  q1, q2, q3 = Quartiles.quartile_points(buckets)

  # Each lower quartile is expressed as a percentage of q3, modulo 16.
  q1_ratio, q2_ratio = [q1, q2].map { |quartile| (quartile * 100 / q3) % 16 }

  # Pack both ratios into one byte: q1_ratio high nibble, q2_ratio low nibble.
  packed_ratio = ((q1_ratio & 0xF) << 4) | (q2_ratio & 0xF)

  # Binary representation of the buckets relative to the quartile points.
  body = Buckets.buckets_binary(buckets, q1, q2, q3)

  TlshInstance.new(
    checksum: checksum,
    l_value: l_value(filesize),
    q1_ratio: q1_ratio,
    q2_ratio: q2_ratio,
    q_ratio: packed_ratio,
    body: body
  )
end