Commit bf00ab2b authored by danielsdeleo's avatar danielsdeleo
Browse files

Merge branch 'track-child-by-pgid'

* Closes https://tickets.opscode.com/browse/MIXLIB-24
* Expand timeout process cleanup to include grandchildren.
parents 834cfbdb a4c67900
......@@ -29,13 +29,20 @@ module Mixlib
# to +stdout+ and +stderr+, and saving its exit status object to +status+
# === Returns
# returns +self+; +stdout+, +stderr+, +status+, and +exitstatus+ will be
# populated with results of the command
# populated with results of the command.
# === Raises
# * Errno::EACCES when you are not privileged to execute the command
# * Errno::ENOENT when the command is not available on the system (or not
# in the current $PATH)
# * Chef::Exceptions::CommandTimeout when the command does not complete
# within +timeout+ seconds (default: 600s)
# within +timeout+ seconds (default: 600s). When this happens, ShellOut
# will send a TERM and then KILL to the entire process group to ensure
# that any grandchild processes are terminated. If the invocation of
# the child process spawned multiple child processes (which commonly
# happens if the command is passed as a single string to be interpreted
# by bin/sh, and bin/sh is not bash), the exit status object may not
# contain the correct exit code of the process (of course there is no
# exit code if the command is killed by SIGKILL, also).
def run_command
@child_pid = fork_subprocess
@reaped = false
......@@ -51,6 +58,7 @@ module Mixlib
# CHEF-3390: Marshall.load on Ruby < 1.8.7p369 also has a GC bug related
# to Marshall.load, so try disabling GC first.
propagate_pre_exec_failure
@child_pgid = -Process.getpgid(@child_pid)
@result = nil
@execution_time = 0
......@@ -122,6 +130,12 @@ module Mixlib
Dir.chdir(cwd) if cwd
end
# Process group id of the child. Returned as a negative value so you can
# put it directly in arguments to kill, wait, etc.
def child_pgid
@child_pgid
end
def initialize_ipc
@stdin_pipe, @stdout_pipe, @stderr_pipe, @process_status_pipe = IO.pipe, IO.pipe, IO.pipe, IO.pipe
@process_status_pipe.last.fcntl(Fcntl::F_SETFD, Fcntl::FD_CLOEXEC)
......@@ -263,6 +277,14 @@ module Mixlib
initialize_ipc
fork do
# Child processes may themselves fork off children. A common case
# is when the command is given as a single string (instead of
# command name plus Array of arguments) and /bin/sh does not
# support the "ONESHOT" optimization (where sh -c does exec without
# forking). To support cleaning up all the children, we need to
# ensure they're in a unique process group.
Process.setsid
configure_subprocess_file_descriptors
clean_parent_file_descriptors
......@@ -302,14 +324,13 @@ module Mixlib
def reap_errant_child
return if attempt_reap
@terminate_reason = "Command execeded allowed execution time, killed by TERM signal."
@terminate_reason = "Command execeded allowed execution time, process terminated"
logger.error("Command execeded allowed execution time, sending TERM") if logger
Process.kill(:TERM, @child_pid)
Process.kill(:TERM, child_pgid)
sleep 3
return if attempt_reap
@terminate_reason = "Command execeded allowed execution time, did not respond to TERM. Killed by KILL signal."
logger.error("Command did not exit from TERM, sending KILL") if logger
Process.kill(:KILL, @child_pid)
attempt_reap
logger.error("Command execeded allowed execution time, sending KILL") if logger
Process.kill(:KILL, child_pgid)
reap
# Should not hit this but it's possible if something is calling waitall
......@@ -323,12 +344,22 @@ module Mixlib
@child_pid && !@reaped
end
# Unconditionally reap the child process. This is used in scenarios where
# we can be confident the child will exit quickly, and has not spawned
# and grandchild processes.
def reap
results = Process.waitpid2(@child_pid)
@reaped = true
@status = results.last
rescue Errno::ECHILD
# When cleaning up timed-out processes, we might send SIGKILL to the
# whole process group after we've cleaned up the direct child. In that
# case the grandchildren will have been adopted by init so we can't
# reap them even if we wanted to (we don't).
nil
end
# Try to reap the child process but don't block if it isn't dead yet.
def attempt_reap
if results = Process.waitpid2(@child_pid, Process::WNOHANG)
@reaped = true
......
......@@ -826,8 +826,15 @@ describe Mixlib::ShellOut do
end
context 'with subprocess that takes longer than timeout' do
def ruby_wo_shell(code)
parts = %w[ruby]
parts << "--disable-gems" if ruby_19?
parts << "-e"
parts << code
end
let(:cmd) do
ruby_eval.call(<<-CODE)
ruby_wo_shell(<<-CODE)
STDOUT.sync = true
trap(:TERM) { puts "got term"; exit!(123) }
sleep 10
......@@ -849,7 +856,7 @@ describe Mixlib::ShellOut do
context "and the child is unresponsive" do
let(:cmd) do
ruby_eval.call(<<-CODE)
ruby_wo_shell(<<-CODE)
STDOUT.sync = true
trap(:TERM) { puts "nanana cant hear you" }
sleep 10
......@@ -877,11 +884,64 @@ describe Mixlib::ShellOut do
shell_cmd.status.termsig.should == 9
log_output.string.should include("Command execeded allowed execution time, sending TERM")
log_output.string.should include("Command did not exit from TERM, sending KILL")
log_output.string.should include("Command execeded allowed execution time, sending KILL")
end
end
end
context "and the child process forks grandchildren" do
let(:cmd) do
ruby_wo_shell(<<-CODE)
STDOUT.sync = true
trap(:TERM) { print "got term in child\n"; exit!(123) }
fork do
trap(:TERM) { print "got term in grandchild\n"; exit!(142) }
sleep 10
end
sleep 10
CODE
end
it "should TERM the wayward child and grandchild" do
# note: let blocks don't correctly memoize if an exception is raised,
# so can't use executed_cmd
lambda { shell_cmd.run_command}.should raise_error(Mixlib::ShellOut::CommandTimeout)
shell_cmd.stdout.should include("got term in child")
shell_cmd.stdout.should include("got term in grandchild")
end
end
context "and the child process forks grandchildren that don't respond to TERM" do
let(:cmd) do
ruby_wo_shell(<<-CODE)
STDOUT.sync = true
trap(:TERM) { print "got term in child\n"; exit!(123) }
fork do
trap(:TERM) { print "got term in grandchild\n" }
sleep 10
end
sleep 10
CODE
end
it "should TERM the wayward child and grandchild, then KILL whoever is left" do
# note: let blocks don't correctly memoize if an exception is raised,
# so can't use executed_cmd
lambda { shell_cmd.run_command}.should raise_error(Mixlib::ShellOut::CommandTimeout)
shell_cmd.stdout.should include("got term in child")
shell_cmd.stdout.should include("got term in grandchild")
# A little janky. We get the process group id out of the command
# object, then try to kill a process in it to make sure none
# exists. Trusting the system under test like this isn't great but
# it's difficult to test otherwise.
lambda { Process.kill(:INT, shell_cmd.send(:child_pgid)) }.should raise_error(Errno::ESRCH)
end
end
end
context 'with subprocess that exceeds buffersize' do
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment