class Datadog::Profiling::Collectors::Stack
Collects stack trace samples from Ruby threads for both CPU-time (if available) and wall-clock. Runs on its own background thread.
Constants
- DEFAULT_MAX_TIME_USAGE_PCT
- MIN_INTERVAL
- THREAD_LAST_CPU_TIME_KEY
Attributes
Public Class Methods
# File lib/ddtrace/profiling/collectors/stack.rb, line 30 def initialize( recorder, max_frames:, trace_identifiers_helper:, # Usually an instance of Datadog::Profiling::TraceIdentifiers::Helper ignore_thread: nil, max_time_usage_pct: DEFAULT_MAX_TIME_USAGE_PCT, thread_api: Thread, fork_policy: Workers::Async::Thread::FORK_POLICY_RESTART, # Restart in forks by default interval: MIN_INTERVAL, enabled: true ) @recorder = recorder @max_frames = max_frames @trace_identifiers_helper = trace_identifiers_helper @ignore_thread = ignore_thread @max_time_usage_pct = max_time_usage_pct @thread_api = thread_api # Workers::Async::Thread settings self.fork_policy = fork_policy # Workers::IntervalLoop settings self.loop_base_interval = interval # Workers::Polling settings self.enabled = enabled @warn_about_missing_cpu_time_instrumentation_only_once = Datadog::Utils::OnlyOnce.new # Cache this proc, since it's pretty expensive to keep recreating it @build_backtrace_location = method(:build_backtrace_location).to_proc # Cache this buffer, since it's pretty expensive to keep accessing it @stack_sample_event_recorder = recorder[Events::StackSample] end
Public Instance Methods
# File lib/ddtrace/profiling/collectors/stack.rb, line 200 def build_backtrace_location(_id, base_label, lineno, path) string_table = @stack_sample_event_recorder.string_table Profiling::BacktraceLocation.new( string_table.fetch_string(base_label), lineno, string_table.fetch_string(path) ) end
# File lib/ddtrace/profiling/collectors/stack.rb, line 79 def collect_and_wait run_time = Datadog::Utils::Time.measure do collect_events end # Update wait time to throttle profiling self.loop_wait_time = compute_wait_time(run_time) end
# File lib/ddtrace/profiling/collectors/stack.rb, line 88 def collect_events events = [] # Compute wall time interval current_wall_time = Datadog::Utils::Time.get_time last_wall_time = if instance_variable_defined?(:@last_wall_time) @last_wall_time else current_wall_time end wall_time_interval_ns = ((current_wall_time - last_wall_time).round(9) * 1e9).to_i @last_wall_time = current_wall_time # Collect backtraces from each thread thread_api.list.each do |thread| next unless thread.alive? next if ignore_thread.is_a?(Proc) && ignore_thread.call(thread) event = collect_thread_event(thread, wall_time_interval_ns) events << event unless event.nil? end # Send events to recorder recorder.push(events) unless events.empty? events end
# File lib/ddtrace/profiling/collectors/stack.rb, line 117 def collect_thread_event(thread, wall_time_interval_ns) locations = thread.backtrace_locations return if locations.nil? # Get actual stack size then trim the stack stack_size = locations.length locations = locations[0..(max_frames - 1)] # Convert backtrace locations into structs locations = convert_backtrace_locations(locations) thread_id = thread.respond_to?(:pthread_thread_id) ? thread.pthread_thread_id : thread.object_id trace_id, span_id, trace_resource_container = trace_identifiers_helper.trace_identifiers_for(thread) cpu_time = get_cpu_time_interval!(thread) Events::StackSample.new( nil, locations, stack_size, thread_id, trace_id, span_id, trace_resource_container, cpu_time, wall_time_interval_ns ) end
# File lib/ddtrace/profiling/collectors/stack.rb, line 169 def compute_wait_time(used_time) # We took used_time to get the last sample. # # What we're computing here is -- if used_time corresponds to max_time_usage_pct of the time we should # spend working, how much is (100% - max_time_usage_pct) of the time? # # For instance, if we took 10ms to sample, and max_time_usage_pct is 1%, then the other 99% is 990ms, which # means we need to sleep for 990ms to guarantee that we don't spend more than 1% of the time working. used_time_ns = used_time * 1e9 interval = (used_time_ns / (max_time_usage_pct / 100.0)) - used_time_ns [interval / 1e9, MIN_INTERVAL].max end
Convert backtrace locations into structs Re-use old backtrace location objects if they already exist in the buffer
# File lib/ddtrace/profiling/collectors/stack.rb, line 184 def convert_backtrace_locations(locations) locations.collect do |location| # Re-use existing BacktraceLocation if identical copy, otherwise build a new one. @stack_sample_event_recorder.cache(:backtrace_locations).fetch( # Function name location.base_label, # Line number location.lineno, # Filename location.path, # Build function &@build_backtrace_location ) end end
# File lib/ddtrace/profiling/collectors/stack.rb, line 145 def get_cpu_time_interval!(thread) # Return if we can't get the current CPU time unless thread.respond_to?(:cpu_time_instrumentation_installed?) && thread.cpu_time_instrumentation_installed? warn_about_missing_cpu_time_instrumentation(thread) return end current_cpu_time_ns = thread.cpu_time(:nanosecond) # NOTE: This can still be nil even when all of the checks above passed because of a race: there's a bit of # initialization that needs to be done by the thread itself, and it's possible for us to try to sample # *before* the thread had time to finish the initialization return unless current_cpu_time_ns last_cpu_time_ns = (thread.thread_variable_get(THREAD_LAST_CPU_TIME_KEY) || current_cpu_time_ns) interval = current_cpu_time_ns - last_cpu_time_ns # Update CPU time for thread thread.thread_variable_set(THREAD_LAST_CPU_TIME_KEY, current_cpu_time_ns) # Return interval interval end
# File lib/ddtrace/profiling/collectors/stack.rb, line 75 def loop_back_off? false end
# File lib/ddtrace/profiling/collectors/stack.rb, line 71 def perform collect_and_wait end
# File lib/ddtrace/profiling/collectors/stack.rb, line 65 def start @last_wall_time = Datadog::Utils::Time.get_time reset_cpu_time_tracking perform end
Private Instance Methods
If the profiler is started for a while, stopped and then restarted OR whenever the process forks, we need to clean up any leftover per-thread cpu time counters, so that the first sample after starting doesn't end up with:
a) negative time: At least on my test docker container, and on the reliability environment, after the process
forks, the clock reference changes and (old cpu time - new cpu time) can be < 0
b) large amount of time: if the profiler was started, then stopped for some amount of time, and then
restarted, we don't want the first sample to be "blamed" for multiple minutes of CPU time
By resetting the last cpu time seen, we start with a clean slate every time we start the stack collector.
# File lib/ddtrace/profiling/collectors/stack.rb, line 249 def reset_cpu_time_tracking thread_api.list.each do |thread| thread.thread_variable_set(THREAD_LAST_CPU_TIME_KEY, nil) end end
# File lib/ddtrace/profiling/collectors/stack.rb, line 212 def warn_about_missing_cpu_time_instrumentation(thread) @warn_about_missing_cpu_time_instrumentation_only_once.run do # Is the profiler thread instrumented? If it is, then we know instrumentation is available, but seems to be # missing on this thread we just found. # # As far as we know, it can be missing due to one the following: # # a) The thread was started before we installed our instrumentation. # In this case, the fix is to make sure ddtrace gets loaded before any other parts of the application. # # b) The thread was started using the Ruby native APIs (e.g. from a C extension such as ffi). # Known cases right now that trigger this are the ethon/typhoeus gems. # We currently have no solution for this case; these threads will always be missing our CPU instrumentation. # # c) The thread was started with `Thread.start`/`Thread.fork` and hasn't yet enabled the instrumentation. # When threads are started using these APIs, there's a small time window during which the thread has started # but our code to apply the instrumentation hasn't run yet; in these cases it's just a matter of allowing # it to run and our instrumentation to be applied. # if thread_api.current.respond_to?(:cpu_time) && thread_api.current.cpu_time Datadog.logger.debug( "Thread ('#{thread}') is missing profiling instrumentation; other threads should be unaffected" ) end end end