2015-01-01 09:32:14 +00:00
|
|
|
# Copyright 1997-2015 Free Software Foundation, Inc.
|
1999-06-28 23:04:32 +00:00
|
|
|
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
2007-08-23 18:14:19 +00:00
|
|
|
# the Free Software Foundation; either version 3 of the License, or
|
1999-06-28 23:04:32 +00:00
|
|
|
# (at your option) any later version.
|
2007-08-23 18:14:19 +00:00
|
|
|
#
|
1999-06-28 23:04:32 +00:00
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
2007-08-23 18:14:19 +00:00
|
|
|
#
|
1999-06-28 23:04:32 +00:00
|
|
|
# You should have received a copy of the GNU General Public License
|
2007-08-23 18:14:19 +00:00
|
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>. */
|
1999-06-28 23:04:32 +00:00
|
|
|
|
|
|
|
|
2002-03-26 00:13:22 +00:00
|
|
|
# On HP-UX 11.0, this test is causing a process running the program
|
|
|
|
# "attach" to be left around spinning. Until we figure out why, I am
|
|
|
|
# commenting out the test to avoid polluting tiamat (our 11.0 nightly
|
|
|
|
# test machine) with these processes. RT
|
|
|
|
#
|
|
|
|
# Setting the magic bit in the target app should work. I added a
|
|
|
|
# "kill", and also a test for the R3 register warning. JB
|
|
|
|
if { [istarget "hppa*-*-hpux*"] } {
|
|
|
|
return 0
|
|
|
|
}
|
1999-06-28 23:04:32 +00:00
|
|
|
|
2015-01-09 11:04:19 +00:00
|
|
|
if {![can_spawn_for_attach]} {
|
1999-06-28 23:04:32 +00:00
|
|
|
return 0
|
|
|
|
}
|
|
|
|
|
test suite update - gdb.base/[ab]
Convert files gdb.base/[ab]*.exp to use standard_output_file et al.
* a2-run.exp, all-bin.exp, annota1.exp, annota3.exp, anon.exp,
args.exp, arithmet.exp, arrayidx.exp, assign.exp, async-shell.exp,
async.exp, attach-pie-misread.exp, attach-pie-noexec.exp,
attach-twice.exp, attach.exp, auxv.exp, bang.exp, bfp-test.exp,
bigcore.exp, bitfields.exp, bitfields2.exp, break-entry.exp,
break-interp.exp, break-on-linker-gcd-function.exp,
breakpoint-shadow.exp: Use standard_testfile,
standard_output_file, prepare_for_testing, clean_restart.
2013-06-27 18:47:53 +00:00
|
|
|
standard_testfile attach.c attach2.c
|
|
|
|
set binfile2 ${binfile}2
|
|
|
|
set escapedbinfile [string_to_regexp $binfile]
|
1999-06-28 23:04:32 +00:00
|
|
|
|
|
|
|
#execute_anywhere "rm -f ${binfile} ${binfile2}"
|
|
|
|
remote_exec build "rm -f ${binfile} ${binfile2}"
|
|
|
|
# For debugging this test
|
|
|
|
#
|
|
|
|
#log_user 1
|
|
|
|
|
|
|
|
# build the first test case
|
|
|
|
#
|
|
|
|
if { [gdb_compile "${srcdir}/${subdir}/${srcfile}" "${binfile}" executable {debug}] != "" } {
|
2006-08-10 05:27:22 +00:00
|
|
|
untested attach.exp
|
|
|
|
return -1
|
1999-06-28 23:04:32 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
# Build the in-system-call test
|
|
|
|
|
|
|
|
if { [gdb_compile "${srcdir}/${subdir}/${srcfile2}" "${binfile2}" executable {debug}] != "" } {
|
2006-08-10 05:27:22 +00:00
|
|
|
untested attach.exp
|
|
|
|
return -1
|
1999-06-28 23:04:32 +00:00
|
|
|
}
|
|
|
|
|
2012-06-21 20:46:25 +00:00
|
|
|
if [get_compiler_info] {
|
1999-06-28 23:04:32 +00:00
|
|
|
return -1
|
|
|
|
}
|
|
|
|
|
|
|
|
proc do_attach_tests {} {
|
2004-07-08 20:04:47 +00:00
|
|
|
global gdb_prompt
|
|
|
|
global binfile
|
|
|
|
global escapedbinfile
|
|
|
|
global srcfile
|
|
|
|
global testfile
|
|
|
|
global subdir
|
|
|
|
global timeout
|
|
|
|
|
2015-04-20 10:35:29 +00:00
|
|
|
# Figure out a regular expression that will match the sysroot,
|
|
|
|
# noting that the default sysroot is "target:", and also noting
|
|
|
|
# that GDB will strip "target:" from the start of filenames when
|
|
|
|
# operating on the local filesystem
|
|
|
|
set sysroot ""
|
|
|
|
set test "show sysroot"
|
|
|
|
gdb_test_multiple $test $test {
|
|
|
|
-re "The current system root is \"(.*)\"\..*${gdb_prompt} $" {
|
|
|
|
set sysroot $expect_out(1,string)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
regsub "^target:" "$sysroot" "(target:)?" sysroot
|
|
|
|
|
2004-07-08 20:04:47 +00:00
|
|
|
# Start the program running and then wait for a bit, to be sure
|
|
|
|
# that it can be attached to.
|
|
|
|
|
testsuite: tcl exec& -> 'kill -9 $pid' is racy (attach-many-short-lived-thread.exp races and others)
The buildbots show that attach-many-short-lived-thread.exp is racy.
But after staring at debug logs and playing with SystemTap scripts for
a (long) while, I figured out that neither GDB, nor the kernel nor the
test's program itself are at fault.
The problem is simply that the testsuite machinery is currently
subject to PID-reuse races. The attach-many-short-lived-threads.c
test program just happens to be much more susceptible to trigger this
race because threads and processes share the same number space on
Linux, and the test spawns many many short lived threads in
succession, thus enlarging the race window a lot.
Part of the problem is that several tests spawn processes with "exec&"
(in order to test the "attach" command) , and then at the end of the
test, to make sure things are cleaned up, issue a 'remote_spawn "kill
-p $testpid"'. Since with tcl's "exec&", tcl itself is responsible
for reaping the process's exit status, when we go kill the process,
testpid may have already exited _and_ its status may have (and often
has) been reaped already. Thus it can happen that another process
meanwhile reuses $testpid, and that "kill" command kills the wrong
process... Frequently, that happens to be
attach-many-short-lived-thread, but this explains other test's races
as well.
In the attach-many-short-lived-threads test, it sometimes manifests
like this:
(gdb) file /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads
Reading symbols from /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads...done.
(gdb) Loaded /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads into /home/pedro/gdb/mygit/build/gdb/testsuite/../../gdb/gdb
attach 5940
Attaching to program: /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads, process 5940
warning: process 5940 is a zombie - the process has already terminated
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ptrace: Operation not permitted.
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: attach
info threads
No threads.
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: no new threads
set breakpoint always-inserted on
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: set breakpoint always-inserted on
Other times the process dies while the test is ongoing (the process is
ptrace-stopped):
(gdb) print again = 1
Cannot access memory at address 0x6020cc
(gdb) FAIL: gdb.threads/attach-many-short-lived-threads.exp: iter 2: reset timer in the inferior
(Recall that on Linux, SIGKILL is not interceptable)
And other times it dies just while we're detaching:
$4 = 319
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 2: print seconds_left
detach
Can't detach Thread 0x7fb13b7de700 (LWP 1842): No such process
(gdb) FAIL: gdb.threads/attach-many-short-lived-threads.exp: iter 2: detach
GDB mishandles the latter (it should ignore ESRCH while detaching just
like when continuing), but that's another story.
The fix here is to change spawn_wait_for_attach to use Expect's
'spawn' command instead of Tcl's 'exec&' to spawn programs, because
with spawn we control when to wait for/reap the process. That allows
killing the process by PID without being subject to pid-reuse races,
because even if the process is already dead, the kernel won't reuse
the process's PID until the zombie is reaped.
The other part of the problem lies in DejaGnu itself, unfortunately.
I have occasionally seen tests (attach-many-short-lived-threads
included, but not only that one) die with a random inexplicable
SIGTERM too, and that too is caused by the same reason, except that in
that case, the rogue SIGTERM is sent from this bit in DejaGnu's remote.exp:
exec sh -c "exec > /dev/null 2>&1 && (kill -2 $pgid || kill -2 $pid) && sleep 5 && (kill $pgid || kill $pid) && sleep 5 && (kill -9 $pgid || kill -9 $pid) &"
...
catch "wait -i $shell_id"
Even if the program exits promptly, that whole cascade of kills
carries on in the background, thus potentially killing the poor
process that manages to reuse $pid...
I sent a fix for that to the DejaGnu list:
http://lists.gnu.org/archive/html/dejagnu/2015-07/msg00000.html
With both patches in place, I haven't seen
attach-many-short-lived-threads.exp fail again.
Tested on x86_64 Fedora 20, native, gdbserver and extended-gdbserver.
gdb/testsuite/ChangeLog:
2015-07-31 Pedro Alves <palves@redhat.com>
* gdb.base/attach-pie-misread.exp: Rename $res to $test_spawn_id.
Use spawn_id_get_pid. Wait for spawn id after eof. Use
kill_wait_spawned_process instead of explicit "kill -9".
* gdb.base/attach-pie-noexec.exp: Adjust to spawn_wait_for_attach
returning a spawn id instead of a pid. Use spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/attach-twice.exp: Likewise.
* gdb.base/attach.exp: Likewise.
(do_command_attach_tests): Use gdb_spawn_with_cmdline_opts and
gdb_test_multiple.
* gdb.base/solib-overlap.exp: Adjust to spawn_wait_for_attach
returning a spawn id instead of a pid. Use spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/valgrind-infcall.exp: Likewise.
* gdb.multi/multi-attach.exp: Likewise.
* gdb.python/py-prompt.exp: Likewise.
* gdb.python/py-sync-interp.exp: Likewise.
* gdb.server/ext-attach.exp: Likewise.
* gdb.threads/attach-into-signal.exp (corefunc): Use
spawn_wait_for_attach, spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.threads/attach-many-short-lived-threads.exp: Adjust to
spawn_wait_for_attach returning a spawn id instead of a pid. Use
spawn_id_get_pid and kill_wait_spawned_process.
* gdb.threads/attach-stopped.exp (corefunc): Use
spawn_wait_for_attach, spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/break-interp.exp: Rename $res to $test_spawn_id.
Use spawn_id_get_pid. Wait for spawn id after eof. Use
kill_wait_spawned_process instead of explicit "kill -9".
* lib/gdb.exp (can_spawn_for_attach): Adjust comment.
(kill_wait_spawned_process, spawn_id_get_pid): New procedures.
(spawn_wait_for_attach): Use spawn instead of exec to spawn
processes. Don't map cygwin/windows pids here. Now returns a
spawn id list.
2015-07-31 19:06:24 +00:00
|
|
|
set test_spawn_id [spawn_wait_for_attach $binfile]
|
|
|
|
set testpid [spawn_id_get_pid $test_spawn_id]
|
2004-07-08 20:04:47 +00:00
|
|
|
|
|
|
|
# Verify that we cannot attach to nonsense.
|
|
|
|
|
2004-07-12 14:25:48 +00:00
|
|
|
set test "attach to nonsense is prohibited"
|
|
|
|
gdb_test_multiple "attach abc" "$test" {
|
2010-02-15 17:35:50 +00:00
|
|
|
-re "Illegal process-id: abc\\.\r\n$gdb_prompt $" {
|
|
|
|
pass "$test"
|
|
|
|
}
|
|
|
|
-re "Attaching to.*, process .*couldn't open /proc file.*$gdb_prompt $" {
|
|
|
|
# Response expected from /proc-based systems.
|
|
|
|
pass "$test"
|
|
|
|
}
|
|
|
|
-re "Can't attach to process..*$gdb_prompt $" {
|
|
|
|
# Response expected on Cygwin
|
|
|
|
pass "$test"
|
|
|
|
}
|
|
|
|
-re "Attaching to.*$gdb_prompt $" {
|
|
|
|
fail "$test (bogus pid allowed)"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
# Verify that we cannot attach to nonsense even if its initial part is
|
|
|
|
# a valid PID.
|
|
|
|
|
|
|
|
set test "attach to digits-starting nonsense is prohibited"
|
|
|
|
gdb_test_multiple "attach ${testpid}x" "$test" {
|
|
|
|
-re "Illegal process-id: ${testpid}x\\.\r\n$gdb_prompt $" {
|
2004-07-12 14:25:48 +00:00
|
|
|
pass "$test"
|
2004-07-08 20:04:47 +00:00
|
|
|
}
|
|
|
|
-re "Attaching to.*, process .*couldn't open /proc file.*$gdb_prompt $" {
|
|
|
|
# Response expected from /proc-based systems.
|
2004-07-12 14:25:48 +00:00
|
|
|
pass "$test"
|
2004-07-08 20:04:47 +00:00
|
|
|
}
|
2004-07-12 14:25:48 +00:00
|
|
|
-re "Can't attach to process..*$gdb_prompt $" {
|
2004-07-08 20:04:47 +00:00
|
|
|
# Response expected on Cygwin
|
2004-07-12 14:25:48 +00:00
|
|
|
pass "$test"
|
2004-07-08 20:04:47 +00:00
|
|
|
}
|
|
|
|
-re "Attaching to.*$gdb_prompt $" {
|
2004-07-12 14:25:48 +00:00
|
|
|
fail "$test (bogus pid allowed)"
|
2004-07-08 20:04:47 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
# Verify that we cannot attach to what appears to be a valid
|
|
|
|
# process ID, but is a process that doesn't exist. Traditionally,
|
|
|
|
# most systems didn't have a process with ID 0, so we take that as
|
|
|
|
# the default. However, there are a few exceptions.
|
|
|
|
|
|
|
|
set boguspid 0
|
|
|
|
if { [istarget "*-*-*bsd*"] } {
|
|
|
|
# In FreeBSD 5.0, PID 0 is used for "swapper". Use -1 instead
|
|
|
|
# (which should have the desired effect on any version of
|
|
|
|
# FreeBSD, and probably other *BSD's too).
|
|
|
|
set boguspid -1
|
|
|
|
}
|
2004-07-12 14:25:48 +00:00
|
|
|
set test "attach to nonexistent process is prohibited"
|
|
|
|
gdb_test_multiple "attach $boguspid" "$test" {
|
2004-07-08 20:04:47 +00:00
|
|
|
-re "Attaching to.*, process $boguspid.*No such process.*$gdb_prompt $" {
|
|
|
|
# Response expected on ptrace-based systems (i.e. HP-UX 10.20).
|
2004-07-12 14:25:48 +00:00
|
|
|
pass "$test"
|
2004-07-08 20:04:47 +00:00
|
|
|
}
|
|
|
|
-re "Attaching to.*, process $boguspid failed.*Hint.*$gdb_prompt $" {
|
|
|
|
# Response expected on ttrace-based systems (i.e. HP-UX 11.0).
|
2004-07-12 14:25:48 +00:00
|
|
|
pass "$test"
|
2004-07-08 20:04:47 +00:00
|
|
|
}
|
|
|
|
-re "Attaching to.*, process $boguspid.*denied.*$gdb_prompt $" {
|
2004-07-12 14:25:48 +00:00
|
|
|
pass "$test"
|
2004-07-08 20:04:47 +00:00
|
|
|
}
|
|
|
|
-re "Attaching to.*, process $boguspid.*not permitted.*$gdb_prompt $" {
|
2004-07-12 14:25:48 +00:00
|
|
|
pass "$test"
|
2004-07-08 20:04:47 +00:00
|
|
|
}
|
|
|
|
-re "Attaching to.*, process .*couldn't open /proc file.*$gdb_prompt $" {
|
|
|
|
# Response expected from /proc-based systems.
|
2004-07-12 14:25:48 +00:00
|
|
|
pass "$test"
|
2004-07-08 20:04:47 +00:00
|
|
|
}
|
2004-07-12 14:25:48 +00:00
|
|
|
-re "Can't attach to process..*$gdb_prompt $" {
|
2004-07-08 20:04:47 +00:00
|
|
|
# Response expected on Cygwin
|
2004-07-12 14:25:48 +00:00
|
|
|
pass "$test"
|
2004-07-08 20:04:47 +00:00
|
|
|
}
|
2012-02-15 12:48:55 +00:00
|
|
|
-re "Attaching to.*, process $boguspid.*failed.*$gdb_prompt $" {
|
|
|
|
# Response expected on the extended-remote target.
|
|
|
|
pass "$test"
|
|
|
|
}
|
2004-07-08 20:04:47 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
# Verify that we can attach to the process by first giving its
|
|
|
|
# executable name via the file command, and using attach with the
|
|
|
|
# process ID.
|
|
|
|
|
|
|
|
# (Actually, the test system appears to do this automatically for
|
|
|
|
# us. So, we must also be prepared to be asked if we want to
|
|
|
|
# discard an existing set of symbols.)
|
|
|
|
|
2004-07-12 14:25:48 +00:00
|
|
|
set test "set file, before attach1"
|
|
|
|
gdb_test_multiple "file $binfile" "$test" {
|
|
|
|
-re "Load new symbol table from.*y or n. $" {
|
|
|
|
gdb_test "y" "Reading symbols from $escapedbinfile\.\.\.*done." \
|
|
|
|
"$test (re-read)"
|
2004-07-08 20:04:47 +00:00
|
|
|
}
|
|
|
|
-re "Reading symbols from $escapedbinfile\.\.\.*done.*$gdb_prompt $" {
|
2004-07-12 14:25:48 +00:00
|
|
|
pass "$test"
|
2004-07-08 20:04:47 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2004-07-12 14:25:48 +00:00
|
|
|
set test "attach1, after setting file"
|
|
|
|
gdb_test_multiple "attach $testpid" "$test" {
|
2004-07-08 20:04:47 +00:00
|
|
|
-re "Attaching to program.*`?$escapedbinfile'?, process $testpid.*main.*at .*$srcfile:.*$gdb_prompt $" {
|
2004-07-12 14:25:48 +00:00
|
|
|
pass "$test"
|
2004-07-08 20:04:47 +00:00
|
|
|
}
|
|
|
|
-re "Attaching to program.*`?$escapedbinfile\.exe'?, process $testpid.*\[Switching to thread $testpid\..*\].*$gdb_prompt $" {
|
|
|
|
# Response expected on Cygwin
|
2004-07-12 14:25:48 +00:00
|
|
|
pass "$test"
|
2004-07-08 20:04:47 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
# Verify that we can "see" the variable "should_exit" in the
|
|
|
|
# program, and that it is zero.
|
|
|
|
|
2004-07-12 14:25:48 +00:00
|
|
|
gdb_test "print should_exit" " = 0" "after attach1, print should_exit"
|
2004-07-08 20:04:47 +00:00
|
|
|
|
|
|
|
# Detach the process.
|
|
|
|
|
2004-07-12 14:25:48 +00:00
|
|
|
gdb_test "detach" \
|
|
|
|
"Detaching from program: .*$escapedbinfile, process $testpid" \
|
|
|
|
"attach1 detach"
|
2004-07-08 20:04:47 +00:00
|
|
|
|
|
|
|
# Wait a bit for gdb to finish detaching
|
|
|
|
|
|
|
|
exec sleep 5
|
|
|
|
|
|
|
|
# Purge the symbols from gdb's brain. (We want to be certain the
|
|
|
|
# next attach, which won't be preceded by a "file" command, is
|
|
|
|
# really getting the executable file without our help.)
|
|
|
|
|
|
|
|
set old_timeout $timeout
|
|
|
|
set timeout 15
|
2004-07-12 14:25:48 +00:00
|
|
|
set test "attach1, purging symbols after detach"
|
|
|
|
gdb_test_multiple "file" "$test" {
|
|
|
|
-re "No executable file now.*Discard symbol table.*y or n. $" {
|
|
|
|
gdb_test "y" "No symbol file now." "$test"
|
2004-07-08 20:04:47 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
set timeout $old_timeout
|
|
|
|
|
|
|
|
# Verify that we can attach to the process just by giving the
|
|
|
|
# process ID.
|
|
|
|
|
2012-02-15 12:48:55 +00:00
|
|
|
set test "attach2, with no file"
|
|
|
|
set found_exec_file 0
|
2004-07-12 14:25:48 +00:00
|
|
|
gdb_test_multiple "attach $testpid" "$test" {
|
2015-04-20 10:35:29 +00:00
|
|
|
-re "Attaching to process $testpid.*Load new symbol table from \"$sysroot$escapedbinfile\.exe\".*y or n. $" {
|
2004-07-08 20:04:47 +00:00
|
|
|
# On Cygwin, the DLL's symbol tables are loaded prior to the
|
|
|
|
# executable's symbol table. This in turn always results in
|
|
|
|
# asking the user for actually loading the symbol table of the
|
|
|
|
# executable.
|
2015-04-20 10:35:29 +00:00
|
|
|
gdb_test "y" "Reading symbols from $sysroot$escapedbinfile\.\.\.*done." \
|
2004-07-12 14:25:48 +00:00
|
|
|
"$test (reset file)"
|
2012-02-15 12:48:55 +00:00
|
|
|
|
|
|
|
set found_exec_file 1
|
2004-07-08 20:04:47 +00:00
|
|
|
}
|
2015-04-20 10:35:29 +00:00
|
|
|
-re "Attaching to process $testpid.*Reading symbols from $sysroot$escapedbinfile.*main.*at .*$gdb_prompt $" {
|
2004-07-12 14:25:48 +00:00
|
|
|
pass "$test"
|
2012-02-15 12:48:55 +00:00
|
|
|
set found_exec_file 1
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if {$found_exec_file == 0} {
|
|
|
|
set test "load file manually, after attach2"
|
|
|
|
gdb_test_multiple "file $binfile" "$test" {
|
|
|
|
-re "A program is being debugged already..*Are you sure you want to change the file.*y or n. $" {
|
|
|
|
gdb_test "y" "Reading symbols from $escapedbinfile\.\.\.*done." \
|
|
|
|
"$test (re-read)"
|
|
|
|
}
|
|
|
|
-re "Reading symbols from $escapedbinfile\.\.\.*done.*$gdb_prompt $" {
|
|
|
|
pass "$test"
|
|
|
|
}
|
2004-07-08 20:04:47 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
# Verify that we can modify the variable "should_exit" in the
|
|
|
|
# program.
|
|
|
|
|
2010-06-01 21:29:21 +00:00
|
|
|
gdb_test_no_output "set should_exit=1" "after attach2, set should_exit"
|
2004-07-08 20:04:47 +00:00
|
|
|
|
|
|
|
# Verify that the modification really happened.
|
|
|
|
|
2014-09-12 11:39:04 +00:00
|
|
|
gdb_breakpoint [gdb_get_line_number "postloop"] temporary
|
|
|
|
gdb_continue_to_breakpoint "postloop" ".* postloop .*"
|
2004-07-08 20:04:47 +00:00
|
|
|
|
|
|
|
# Allow the test process to exit, to cleanup after ourselves.
|
|
|
|
|
2011-03-07 16:03:04 +00:00
|
|
|
gdb_continue_to_end "after attach2, exit"
|
2004-07-08 20:04:47 +00:00
|
|
|
|
|
|
|
# Make sure we don't leave a process around to confuse
|
|
|
|
# the next test run (and prevent the compile by keeping
|
|
|
|
# the text file busy), in case the "set should_exit" didn't
|
|
|
|
# work.
|
2004-07-12 14:25:48 +00:00
|
|
|
|
testsuite: tcl exec& -> 'kill -9 $pid' is racy (attach-many-short-lived-thread.exp races and others)
The buildbots show that attach-many-short-lived-thread.exp is racy.
But after staring at debug logs and playing with SystemTap scripts for
a (long) while, I figured out that neither GDB, nor the kernel nor the
test's program itself are at fault.
The problem is simply that the testsuite machinery is currently
subject to PID-reuse races. The attach-many-short-lived-threads.c
test program just happens to be much more susceptible to trigger this
race because threads and processes share the same number space on
Linux, and the test spawns many many short lived threads in
succession, thus enlarging the race window a lot.
Part of the problem is that several tests spawn processes with "exec&"
(in order to test the "attach" command) , and then at the end of the
test, to make sure things are cleaned up, issue a 'remote_spawn "kill
-p $testpid"'. Since with tcl's "exec&", tcl itself is responsible
for reaping the process's exit status, when we go kill the process,
testpid may have already exited _and_ its status may have (and often
has) been reaped already. Thus it can happen that another process
meanwhile reuses $testpid, and that "kill" command kills the wrong
process... Frequently, that happens to be
attach-many-short-lived-thread, but this explains other test's races
as well.
In the attach-many-short-lived-threads test, it sometimes manifests
like this:
(gdb) file /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads
Reading symbols from /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads...done.
(gdb) Loaded /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads into /home/pedro/gdb/mygit/build/gdb/testsuite/../../gdb/gdb
attach 5940
Attaching to program: /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads, process 5940
warning: process 5940 is a zombie - the process has already terminated
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ptrace: Operation not permitted.
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: attach
info threads
No threads.
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: no new threads
set breakpoint always-inserted on
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: set breakpoint always-inserted on
Other times the process dies while the test is ongoing (the process is
ptrace-stopped):
(gdb) print again = 1
Cannot access memory at address 0x6020cc
(gdb) FAIL: gdb.threads/attach-many-short-lived-threads.exp: iter 2: reset timer in the inferior
(Recall that on Linux, SIGKILL is not interceptable)
And other times it dies just while we're detaching:
$4 = 319
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 2: print seconds_left
detach
Can't detach Thread 0x7fb13b7de700 (LWP 1842): No such process
(gdb) FAIL: gdb.threads/attach-many-short-lived-threads.exp: iter 2: detach
GDB mishandles the latter (it should ignore ESRCH while detaching just
like when continuing), but that's another story.
The fix here is to change spawn_wait_for_attach to use Expect's
'spawn' command instead of Tcl's 'exec&' to spawn programs, because
with spawn we control when to wait for/reap the process. That allows
killing the process by PID without being subject to pid-reuse races,
because even if the process is already dead, the kernel won't reuse
the process's PID until the zombie is reaped.
The other part of the problem lies in DejaGnu itself, unfortunately.
I have occasionally seen tests (attach-many-short-lived-threads
included, but not only that one) die with a random inexplicable
SIGTERM too, and that too is caused by the same reason, except that in
that case, the rogue SIGTERM is sent from this bit in DejaGnu's remote.exp:
exec sh -c "exec > /dev/null 2>&1 && (kill -2 $pgid || kill -2 $pid) && sleep 5 && (kill $pgid || kill $pid) && sleep 5 && (kill -9 $pgid || kill -9 $pid) &"
...
catch "wait -i $shell_id"
Even if the program exits promptly, that whole cascade of kills
carries on in the background, thus potentially killing the poor
process that manages to reuse $pid...
I sent a fix for that to the DejaGnu list:
http://lists.gnu.org/archive/html/dejagnu/2015-07/msg00000.html
With both patches in place, I haven't seen
attach-many-short-lived-threads.exp fail again.
Tested on x86_64 Fedora 20, native, gdbserver and extended-gdbserver.
gdb/testsuite/ChangeLog:
2015-07-31 Pedro Alves <palves@redhat.com>
* gdb.base/attach-pie-misread.exp: Rename $res to $test_spawn_id.
Use spawn_id_get_pid. Wait for spawn id after eof. Use
kill_wait_spawned_process instead of explicit "kill -9".
* gdb.base/attach-pie-noexec.exp: Adjust to spawn_wait_for_attach
returning a spawn id instead of a pid. Use spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/attach-twice.exp: Likewise.
* gdb.base/attach.exp: Likewise.
(do_command_attach_tests): Use gdb_spawn_with_cmdline_opts and
gdb_test_multiple.
* gdb.base/solib-overlap.exp: Adjust to spawn_wait_for_attach
returning a spawn id instead of a pid. Use spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/valgrind-infcall.exp: Likewise.
* gdb.multi/multi-attach.exp: Likewise.
* gdb.python/py-prompt.exp: Likewise.
* gdb.python/py-sync-interp.exp: Likewise.
* gdb.server/ext-attach.exp: Likewise.
* gdb.threads/attach-into-signal.exp (corefunc): Use
spawn_wait_for_attach, spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.threads/attach-many-short-lived-threads.exp: Adjust to
spawn_wait_for_attach returning a spawn id instead of a pid. Use
spawn_id_get_pid and kill_wait_spawned_process.
* gdb.threads/attach-stopped.exp (corefunc): Use
spawn_wait_for_attach, spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/break-interp.exp: Rename $res to $test_spawn_id.
Use spawn_id_get_pid. Wait for spawn id after eof. Use
kill_wait_spawned_process instead of explicit "kill -9".
* lib/gdb.exp (can_spawn_for_attach): Adjust comment.
(kill_wait_spawned_process, spawn_id_get_pid): New procedures.
(spawn_wait_for_attach): Use spawn instead of exec to spawn
processes. Don't map cygwin/windows pids here. Now returns a
spawn id list.
2015-07-31 19:06:24 +00:00
|
|
|
kill_wait_spawned_process $test_spawn_id
|
|
|
|
|
|
|
|
set test_spawn_id [spawn_wait_for_attach $binfile]
|
|
|
|
set testpid [spawn_id_get_pid $test_spawn_id]
|
2004-07-08 20:04:47 +00:00
|
|
|
|
|
|
|
# Verify that we can attach to the process, and find its a.out
|
|
|
|
# when we're cd'd to some directory that doesn't contain the
|
|
|
|
# a.out. (We use the source path set by the "dir" command.)
|
|
|
|
|
test suite update - gdb.base/[ab]
Convert files gdb.base/[ab]*.exp to use standard_output_file et al.
* a2-run.exp, all-bin.exp, annota1.exp, annota3.exp, anon.exp,
args.exp, arithmet.exp, arrayidx.exp, assign.exp, async-shell.exp,
async.exp, attach-pie-misread.exp, attach-pie-noexec.exp,
attach-twice.exp, attach.exp, auxv.exp, bang.exp, bfp-test.exp,
bigcore.exp, bitfields.exp, bitfields2.exp, break-entry.exp,
break-interp.exp, break-on-linker-gcd-function.exp,
breakpoint-shadow.exp: Use standard_testfile,
standard_output_file, prepare_for_testing, clean_restart.
2013-06-27 18:47:53 +00:00
|
|
|
gdb_test "dir [standard_output_file {}]" "Source directories searched: .*" \
|
2004-07-12 14:25:48 +00:00
|
|
|
"set source path"
|
2004-07-08 20:04:47 +00:00
|
|
|
|
2004-07-12 14:25:48 +00:00
|
|
|
gdb_test "cd /tmp" "Working directory /tmp." \
|
|
|
|
"cd away from process working directory"
|
2004-07-08 20:04:47 +00:00
|
|
|
|
|
|
|
# Explicitly flush out any knowledge of the previous attachment.
|
|
|
|
|
2004-07-12 14:25:48 +00:00
|
|
|
set test "before attach3, flush symbols"
|
2009-10-19 Pedro Alves <pedro@codesourcery.com>
Stan Shebs <stan@codesourcery.com>
Add base multi-executable/process support to GDB.
gdb/
* Makefile.in (SFILES): Add progspace.c.
(COMMON_OBS): Add progspace.o.
* progspace.h: New.
* progspace.c: New.
* breakpoint.h (struct bp_target_info) <placed_address_space>: New
field.
(struct bp_location) <pspace>: New field.
(struct breakpoint) <pspace>: New field.
(bpstat_stop_status, breakpoint_here_p)
(moribund_breakpoint_here_p, breakpoint_inserted_here_p)
(regular_breakpoint_inserted_here_p)
(software_breakpoint_inserted_here_p, breakpoint_thread_match)
(set_default_breakpoint): Adjust prototypes.
(remove_breakpoints_pid, breakpoint_program_space_exit): Declare.
(insert_single_step_breakpoint, deprecated_insert_raw_breakpoint):
Adjust prototypes.
* breakpoint.c (executing_startup): Delete.
(default_breakpoint_sspace): New.
(breakpoint_restore_shadows): Skip if the address space doesn't
match.
(update_watchpoint): Record the frame's program space in the
breakpoint location.
(insert_bp_location): Record the address space in target_info.
Adjust to pass the symbol space to solib_name_from_address.
(breakpoint_program_space_exit): New.
(insert_breakpoint_locations): Switch the symbol space and thread
when inserting breakpoints. Don't insert breakpoints in a vfork
parent waiting for vfork done if we're not attached to the vfork
child.
(remove_breakpoints_pid): New.
(reattach_breakpoints): Switch to a thread of PID. Ignore
breakpoints of other symbol spaces.
(create_internal_breakpoint): Store the symbol space in the sal.
(create_longjmp_master_breakpoint): Iterate over all symbol
spaces.
(update_breakpoints_after_exec): Ignore breakpoints for other
symbol spaces.
(remove_breakpoint): Rename to ...
(remove_breakpoint_1): ... this. Pass the breakpoints symbol
space to solib_name_from_address.
(remove_breakpoint): New.
(mark_breakpoints_out): Ignore breakpoints from other symbol
spaces.
(breakpoint_init_inferior): Ditto.
(breakpoint_here_p): Add an address space argument and adjust to
use breakpoint_address_match.
(moribund_breakpoint_here_p): Ditto.
(regular_breakpoint_inserted_here_p): Ditto.
(breakpoint_inserted_here_p): Ditto.
(software_breakpoint_inserted_here_p): Ditto.
(breakpoint_thread_match): Ditto.
(bpstat_check_location): Ditto.
(bpstat_stop_status): Ditto.
(print_breakpoint_location): If there's a location to print,
switch the current symbol space.
(print_one_breakpoint_location): Add `allflag' argument.
(print_one_breakpoint): Ditto. Adjust.
(do_captured_breakpoint_query): Adjust.
(breakpoint_1): Adjust.
(breakpoint_has_pc): Also match the symbol space.
(describe_other_breakpoints): Add a symbol space argument and
adjust.
(set_default_breakpoint): Add a symbol space argument. Set
default_breakpoint_sspace.
(breakpoint_address_match): New.
(check_duplicates_for): Add an address space argument, and adjust.
(set_raw_breakpoint): Record the symbol space in the location and
in the breakpoint.
(set_longjmp_breakpoint): Skip longjmp master breakpoints from
other symbol spaces.
(remove_thread_event_breakpoints, remove_solib_event_breakpoints)
(disable_breakpoints_in_shlibs): Skip breakpoints from other
symbol spaces.
(disable_breakpoints_in_unloaded_shlib): Match symbol spaces.
(create_catchpoint): Set the symbol space in the sal.
(disable_breakpoints_before_startup): Skip breakpoints from other
symbol spaces. Set executing_startup in the current symbol space.
(enable_breakpoints_after_startup): Clear executing_startup in the
current symbol space. Skip breakpoints from other symbol spaces.
(clone_momentary_breakpoint): Also copy the symbol space.
(add_location_to_breakpoint): Set the location's symbol space.
(bp_loc_is_permanent): Switch thread and symbol space.
(create_breakpoint): Adjust.
(expand_line_sal_maybe): Expand comment to mention symbol spaces.
Switch thread and symbol space when reading memory.
(parse_breakpoint_sals): Set the symbol space in the sal.
(break_command_really): Ditto.
(skip_prologue_sal): Switch and space.
(resolve_sal_pc): Ditto.
(watch_command_1): Record the symbol space in the sal.
(create_ada_exception_breakpoint): Adjust.
(clear_command): Adjust. Match symbol spaces.
(update_global_location_list): Use breakpoint_address_match.
(breakpoint_re_set_one): Switch thread and space.
(breakpoint_re_set): Save symbol space.
(breakpoint_re_set_thread): Also reset the symbol space.
(deprecated_insert_raw_breakpoint): Add an address space argument.
Adjust.
(insert_single_step_breakpoint): Ditto.
(single_step_breakpoint_inserted_here_p): Ditto.
(clear_syscall_counts): New.
(_initialize_breakpoint): Install it as inferior_exit observer.
* exec.h: Include "progspace.h".
(exec_bfd, exec_bfd_mtime): New defines.
(exec_close): Declare.
* exec.c: Include "gdbthread.h" and "progspace.h".
(exec_bfd, exec_bfd_mtime, current_target_sections_1): Delete.
(using_exec_ops): New.
(exec_close_1): Rename to exec_close, and make public.
(exec_close): Rename to exec_close_1, and adjust all callers. Add
description. Remove target sections and close executables from
all program spaces.
(exec_file_attach): Add comment.
(add_target_sections): Check on `using_exec_ops' to check if the
target should be pushed.
(remove_target_sections): Only unpush the target if there are no
more target sections in any symbol space.
* gdbcore.h: Include "exec.h".
(exec_bfd, exec_bfd_mtime): Remove declarations.
* frame.h (get_frame_program_space, get_frame_address_space)
(frame_unwind_program_space): Declare.
* frame.c (struct frame_info) <pspace, aspace>: New fields.
(create_sentinel_frame): Add program space argument. Set the
pspace and aspace fields of the frame object.
(get_current_frame, create_new_frame): Adjust.
(get_frame_program_space): New.
(frame_unwind_program_space): New.
(get_frame_address_space): New.
* stack.c (print_frame_info): Adjust.
(print_frame): Use the frame's program space.
* gdbthread.h (any_live_thread_of_process): Declare.
* thread.c (any_live_thread_of_process): New.
(switch_to_thread): Switch the program space as well.
(restore_selected_frame): Don't warn if trying to restore frame
level 0.
* inferior.h: Include "progspace.h".
(detach_fork): Declare.
(struct inferior) <removable, aspace, pspace>
<vfork_parent, vfork_child, pending_detach>
<waiting_for_vfork_done>: New fields.
<terminal_info>: Remove field.
<data, num_data>: New fields.
(register_inferior_data, register_inferior_data_with_cleanup)
(clear_inferior_data, set_inferior_data, inferior_data): Declare.
(exit_inferior, exit_inferior_silent, exit_inferior_num_silent)
(inferior_appeared): Declare.
(find_inferior_pid): Typo.
(find_inferior_id, find_inferior_for_program_space): Declare.
(set_current_inferior, save_current_inferior, prune_inferiors)
(number_of_inferiors): Declare.
(inferior_list): Declare.
* inferior.c: Include "gdbcore.h" and "symfile.h".
(inferior_list): Make public.
(delete_inferior_1): Always delete thread silently.
(find_inferior_id): Make public.
(current_inferior_): New.
(current_inferior): Use it.
(set_current_inferior): New.
(restore_inferior): New.
(save_current_inferior): New.
(free_inferior): Free the per-inferior data.
(add_inferior_silent): Allocate per-inferior data.
Call inferior_appeared.
(delete_threads_of_inferior): New.
(delete_inferior_1): Adjust interface to take an inferior pointer.
(delete_inferior): Adjust.
(delete_inferior_silent): Adjust.
(exit_inferior_1): New.
(exit_inferior): New.
(exit_inferior_silent): New.
(exit_inferior_num_silent): New.
(detach_inferior): Adjust.
(inferior_appeared): New.
(discard_all_inferiors): Adjust.
(find_inferior_id): Make public. Assert pid is not zero.
(find_inferior_for_program_space): New.
(have_inferiors): Check if we have any inferior with pid not zero.
(have_live_inferiors): Go over all pushed targets looking for
process_stratum.
(prune_inferiors): New.
(number_of_inferiors): New.
(print_inferior): Add executable column. Print vfork parent/child
relationships.
(inferior_command): Adjust to cope with not running inferiors.
(remove_inferior_command): New.
(add_inferior_command): New.
(clone_inferior_command): New.
(struct inferior_data): New.
(struct inferior_data_registration): New.
(struct inferior_data_registry): New.
(inferior_data_registry): New.
(register_inferior_data_with_cleanup): New.
(register_inferior_data): New.
(inferior_alloc_data): New.
(inferior_free_data): New.
(clear_inferior_data): New.
(set_inferior_data): New.
(inferior_data): New.
(initialize_inferiors): New.
(_initialize_inferiors): Register "add-inferior",
"remove-inferior" and "clone-inferior" commands.
* objfiles.h: Include "progspace.h".
(struct objfile) <pspace>: New field.
(symfile_objfile, object_files): Don't declare.
(ALL_PSPACE_OBJFILES): New.
(ALL_PSPACE_OBJFILES_SAFE): New.
(ALL_OBJFILES, ALL_OBJFILES_SAFE): Adjust.
(ALL_PSPACE_SYMTABS): New.
(ALL_PRIMARY_SYMTABS): Adjust.
(ALL_PSPACE_PRIMARY_SYMTABS): New.
(ALL_PSYMTABS): Adjust.
(ALL_PSPACE_PSYMTABS): New.
* objfiles.c (object_files, symfile_objfile): Delete.
(struct objfile_sspace_info): New.
(objfiles_pspace_data): New.
(objfiles_pspace_data_cleanup): New.
(get_objfile_pspace_data): New.
(objfiles_changed_p): Delete.
(allocate_objfile): Set the objfile's program space. Adjust to
reference objfiles_changed_p in pspace data.
(free_objfile): Adjust to reference objfiles_changed_p in pspace
data.
(objfile_relocate): Ditto.
(update_section_map): Add pspace argument. Adjust to iterate over
objfiles in the passed in pspace.
(find_pc_section): Delete sections and num_sections statics.
Adjust to refer to program space's objfiles_changed_p. Adjust to
refer to sections and num_sections store in the objfile's pspace
data.
(objfiles_changed): Adjust to reference objfiles_changed_p in
pspace data.
(_initialize_objfiles): New.
* linespec.c (decode_all_digits, decode_dollar): Set the sal's
program space.
* source.c (current_source_pspace): New.
(get_current_source_symtab_and_line): Set the sal's program space.
(set_current_source_symtab_and_line): Set current_source_pspace.
(select_source_symtab): Ditto. Use ALL_OBJFILES.
(forget_cached_source_info): Iterate over all program spaces.
* symfile.c (clear_symtab_users): Adjust.
* symmisc.c (print_symbol_bcache_statistics): Iterate over all
program spaces.
(print_objfile_statistics): Ditto.
(maintenance_print_msymbols): Ditto.
(maintenance_print_objfiles): Ditto.
(maintenance_info_symtabs): Ditto.
(maintenance_info_psymtabs): Ditto.
* symtab.h (SYMTAB_PSPACE): New.
(struct symtab_and_line) <pspace>: New field.
* symtab.c (init_sal): Clear the sal's program space.
(find_pc_sect_symtab): Set the sal's program space. Switch thread
and space.
(append_expanded_sal): Add program space argument. Iterate over
all program spaces.
(expand_line_sal): Iterate over all program spaces. Switch
program space.
* target.h (enum target_waitkind) <TARGET_WAITKIND_VFORK_DONE>: New.
(struct target_ops) <to_thread_address_space>: New field.
(target_thread_address_space): Define.
* target.c (target_detach): Only remove breakpoints from the
inferior we're detaching.
(target_thread_address_space): New.
* defs.h (initialize_progspace): Declare.
* top.c (gdb_init): Call it.
* solist.h (struct so_list) <sspace>: New field.
* solib.h (struct program_space): Forward declare.
(solib_name_from_address): Adjust prototype.
* solib.c (so_list_head): Replace with a macro referencing the
program space.
(update_solib_list): Set the so's program space.
(solib_name_from_address): Add a program space argument and adjust.
* solib-svr4.c (struct svr4_info) <pid>: Delete field.
<interp_text_sect_low, interp_text_sect_high, interp_plt_sect_low>
<interp_plt_sect_high>: New fields.
(svr4_info_p, svr4_info): Delete.
(solib_svr4_sspace_data): New.
(get_svr4_info): Rewrite.
(svr4_sspace_data_cleanup): New.
(open_symbol_file_object): Adjust.
(svr4_default_sos): Adjust.
(svr4_fetch_objfile_link_map): Adjust.
(interp_text_sect_low, interp_text_sect_high, interp_plt_sect_low)
(interp_plt_sect_high): Delete.
(svr4_in_dynsym_resolve_code): Adjust.
(enable_break): Adjust.
(svr4_clear_solib): Revert bit that removed the svr4_info here,
and reinstate clearing debug_base, debug_loader_offset_p,
debug_loader_offset and debug_loader_name.
(_initialize_svr4_solib): Register solib_svr4_pspace_data. Don't
install an inferior_exit observer anymore.
* printcmd.c (struct display) <pspace>: New field.
(display_command): Set the display's sspace.
(do_one_display): Match the display's sspace.
(display_uses_solib_p): Ditto.
* linux-fork.c (detach_fork): Moved to infrun.c.
(_initialize_linux_fork): Moved "detach-on-fork" command to
infrun.c.
* infrun.c (detach_fork): Moved from linux-fork.c.
(proceed_after_vfork_done): New.
(handle_vfork_child_exec_or_exit): New.
(follow_exec_mode_replace, follow_exec_mode_keep)
(follow_exec_mode_names, follow_exec_mode_string)
(show_follow_exec_mode_string): New.
(follow_exec): New. Reinstate the mark_breakpoints_out call.
Remove shared libraries before attaching new executable. If user
wants to keep the inferior, keep it.
(displaced_step_fixup): Adjust to pass an address space to the
breakpoints module.
(resume): Ditto.
(clear_proceed_status): In all-stop mode, always clear the proceed
status of all threads.
(prepare_to_proceed): Adjust to pass an address space to the
breakpoints module.
(proceed): Ditto.
(adjust_pc_after_break): Ditto.
(handle_inferior_event): When handling a process exit, switch the
program space to the inferior's that had exited. Call
handle_vfork_child_exec_or_exit. Adjust to pass an address space
to the breakpoints module. In non-stop mode, when following a
fork and detach-fork is off, also resume the other branch. Handle
TARGET_WAITKIND_VFORK_DONE. Set the program space in sals.
(normal_stop): Prune inferiors.
(_initialize_infrun): Install the new "follow-exec-mode" command.
"detach-on-fork" moved here.
* regcache.h (get_regcache_aspace): Declare.
* regcache.c (struct regcache) <aspace>: New field.
(regcache_xmalloc): Clear the aspace.
(get_regcache_aspace): New.
(regcache_cpy): Copy the aspace field.
(regcache_cpy_no_passthrough): Ditto.
(get_thread_regcache): Fetch the thread's address space from the
target, and store it in the regcache.
* infcall.c (call_function_by_hand): Set the sal's pspace.
* arch-utils.c (default_has_shared_address_space): New.
* arch-utils.h (default_has_shared_address_space): Declare.
* gdbarch.sh (has_shared_address_space): New.
* gdbarch.h, gdbarch.c: Regenerate.
* linux-tdep.c: Include auxv.h, target.h, elf/common.h.
(linux_has_shared_address_space): New.
(_initialize_linux_tdep): Declare.
* arm-tdep.c (arm_software_single_step): Pass the frame's address
space to insert_single_step_breakpoint.
* arm-linux-tdep.c (arm_linux_software_single_step): Pass the
frame's pspace to breakpoint functions.
* cris-tdep.c (crisv32_single_step_through_delay): Ditto.
(cris_software_single_step): Ditto.
* mips-tdep.c (deal_with_atomic_sequence): Add frame argument.
Pass the frame's pspace to breakpoint functions.
(mips_software_single_step): Adjust.
(mips_single_step_through_delay): Adjust.
* rs6000-aix-tdep.c (rs6000_software_single_step): Adjust.
* rs6000-tdep.c (ppc_deal_with_atomic_sequence): Adjust.
* solib-irix.c (enable_break): Adjust to pass the current frame's
address space to breakpoint functions.
* sparc-tdep.c (sparc_software_single_step): Ditto.
* spu-tdep.c (spu_software_single_step): Ditto.
* alpha-tdep.c (alpha_software_single_step): Ditto.
* record.c (record_wait): Adjust to pass an address space to the
breakpoints module.
* fork-child.c (fork_inferior): Set the new inferior's program and
address spaces.
* inf-ptrace.c (inf_ptrace_follow_fork): Copy the parent's program
and address spaces.
(inf_ptrace_attach): Set the inferior's program and address spaces.
* linux-nat.c: Include "solib.h".
(linux_child_follow_fork): Manage parent and child's program and
address spaces. Clone the parent's program space if necessary.
Don't wait for the vfork to be done here. Refuse to resume if
following the vfork parent while leaving the child stopped.
(resume_callback): Don't resume a vfork parent.
(linux_nat_resume): Also check for pending events in the
lp->waitstatus field.
(linux_handle_extended_wait): Report TARGET_WAITKIND_VFORK_DONE
events to the core.
(stop_wait_callback): Don't wait for SIGSTOP on vfork parents.
(cancel_breakpoint): Adjust.
* linux-thread-db.c (thread_db_wait): Don't remove thread event
breakpoints here.
(thread_db_mourn_inferior): Don't mark breakpoints out here.
Remove thread event breakpoints after mourning.
* corelow.c: Include progspace.h.
(core_open): Set the inferior's program and address spaces.
* remote.c (remote_add_inferior): Set the new inferior's program
and address spaces.
(remote_start_remote): Update address spaces.
(extended_remote_create_inferior_1): Don't init the thread list if
we already debugging other inferiors.
* darwin-nat.c (darwin_attach): Set the new inferior's program and
address spaces.
* gnu-nat.c (gnu_attach): Ditto.
* go32-nat.c (go32_create_inferior): Ditto.
* inf-ttrace.c (inf_ttrace_follow_fork, inf_ttrace_attach): Ditto.
* monitor.c (monitor_open): Ditto.
* nto-procfs.c (procfs_attach, procfs_create_inferior): Ditto.
* procfs.c (do_attach): Ditto.
* windows-nat.c (do_initial_windows_stuff): Ditto.
* inflow.c (inferior_process_group)
(terminal_init_inferior_with_pgrp, terminal_inferior,
(terminal_ours_1, inflow_inferior_exit, copy_terminal_info)
(child_terminal_info, new_tty_postfork, set_sigint_trap): Adjust
to use per-inferior data instead of inferior->terminal_info.
(inflow_inferior_data): New.
(inflow_new_inferior): Delete.
(inflow_inferior_data_cleanup): New.
(get_inflow_inferior_data): New.
* mi/mi-interp.c (mi_new_inferior): Rename to...
(mi_inferior_appeared): ... this.
(mi_interpreter_init): Adjust.
* tui/tui-disasm.c: Include "progspace.h".
(tui_set_disassem_content): Pass an address space to
breakpoint_here_p.
* NEWS: Mention multi-program debugging support. Mention new
commands "add-inferior", "clone-inferior", "remove-inferior",
"maint info program-spaces", and new option "set
follow-exec-mode".
2009-10-19 Pedro Alves <pedro@codesourcery.com>
Stan Shebs <stan@codesourcery.com>
gdb/doc/
* observer.texi (new_inferior): Rename to...
(inferior_appeared): ... this.
2009-10-19 Pedro Alves <pedro@codesourcery.com>
Stan Shebs <stan@codesourcery.com>
gdb/testsuite/
* gdb.base/foll-vfork.exp: Adjust to spell out "follow-fork".
* gdb.base/foll-exec.exp: Adjust to expect a process id before
"Executing new program".
* gdb.base/foll-fork.exp: Adjust to spell out "follow-fork".
* gdb.base/multi-forks.exp: Ditto. Adjust to the inferior being
left listed after having been killed.
* gdb.base/attach.exp: Adjust to spell out "symbol-file".
* gdb.base/maint.exp: Adjust test.
* Makefile.in (ALL_SUBDIRS): Add gdb.multi.
* gdb.multi/Makefile.in: New.
* gdb.multi/base.exp: New.
* gdb.multi/goodbye.c: New.
* gdb.multi/hangout.c: New.
* gdb.multi/hello.c: New.
* gdb.multi/bkpt-multi-exec.c: New.
* gdb.multi/bkpt-multi-exec.exp: New.
* gdb.multi/crashme.c: New.
2009-10-19 Pedro Alves <pedro@codesourcery.com>
Stan Shebs <stan@codesourcery.com>
gdb/doc/
* gdb.texinfo (Inferiors): Rename node to ...
(Inferiors and Programs): ... this. Mention running multiple
programs in the same debug session.
<info inferiors>: Mention the new 'Executable' column if "info
inferiors". Update examples. Document the "add-inferior",
"clone-inferior", "remove-inferior" and "maint info
program-spaces" commands.
(Process): Rename node to...
(Forks): ... this. Document "set|show follow-exec-mode".
2009-10-19 09:51:43 +00:00
|
|
|
gdb_test_multiple "symbol-file" "$test" {
|
2004-07-12 14:25:48 +00:00
|
|
|
-re "Discard symbol table from.*y or n. $" {
|
|
|
|
gdb_test "y" "No symbol file now." \
|
|
|
|
"$test"
|
2004-07-08 20:04:47 +00:00
|
|
|
}
|
2004-07-12 14:25:48 +00:00
|
|
|
-re "No symbol file now.*$gdb_prompt $" {
|
|
|
|
pass "$test"
|
2004-07-08 20:04:47 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2004-07-12 14:25:48 +00:00
|
|
|
gdb_test "exec" "No executable file now." \
|
|
|
|
"before attach3, flush exec"
|
|
|
|
|
|
|
|
gdb_test "attach $testpid" \
|
2015-04-20 10:35:29 +00:00
|
|
|
"Attaching to process $testpid.*Reading symbols from $sysroot$escapedbinfile.*main.*at .*" \
|
2004-07-12 14:25:48 +00:00
|
|
|
"attach when process' a.out not in cwd"
|
|
|
|
|
|
|
|
set test "after attach3, exit"
|
2010-06-04 23:09:15 +00:00
|
|
|
gdb_test "kill" \
|
|
|
|
"" \
|
|
|
|
"$test" \
|
|
|
|
"Kill the program being debugged.*y or n. $" \
|
|
|
|
"y"
|
2004-07-08 20:04:47 +00:00
|
|
|
|
|
|
|
# Another "don't leave a process around"
|
testsuite: tcl exec& -> 'kill -9 $pid' is racy (attach-many-short-lived-thread.exp races and others)
The buildbots show that attach-many-short-lived-thread.exp is racy.
But after staring at debug logs and playing with SystemTap scripts for
a (long) while, I figured out that neither GDB, nor the kernel nor the
test's program itself are at fault.
The problem is simply that the testsuite machinery is currently
subject to PID-reuse races. The attach-many-short-lived-threads.c
test program just happens to be much more susceptible to trigger this
race because threads and processes share the same number space on
Linux, and the test spawns many many short lived threads in
succession, thus enlarging the race window a lot.
Part of the problem is that several tests spawn processes with "exec&"
(in order to test the "attach" command) , and then at the end of the
test, to make sure things are cleaned up, issue a 'remote_spawn "kill
-p $testpid"'. Since with tcl's "exec&", tcl itself is responsible
for reaping the process's exit status, when we go kill the process,
testpid may have already exited _and_ its status may have (and often
has) been reaped already. Thus it can happen that another process
meanwhile reuses $testpid, and that "kill" command kills the wrong
process... Frequently, that happens to be
attach-many-short-lived-thread, but this explains other test's races
as well.
In the attach-many-short-lived-threads test, it sometimes manifests
like this:
(gdb) file /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads
Reading symbols from /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads...done.
(gdb) Loaded /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads into /home/pedro/gdb/mygit/build/gdb/testsuite/../../gdb/gdb
attach 5940
Attaching to program: /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads, process 5940
warning: process 5940 is a zombie - the process has already terminated
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ptrace: Operation not permitted.
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: attach
info threads
No threads.
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: no new threads
set breakpoint always-inserted on
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: set breakpoint always-inserted on
Other times the process dies while the test is ongoing (the process is
ptrace-stopped):
(gdb) print again = 1
Cannot access memory at address 0x6020cc
(gdb) FAIL: gdb.threads/attach-many-short-lived-threads.exp: iter 2: reset timer in the inferior
(Recall that on Linux, SIGKILL is not interceptable)
And other times it dies just while we're detaching:
$4 = 319
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 2: print seconds_left
detach
Can't detach Thread 0x7fb13b7de700 (LWP 1842): No such process
(gdb) FAIL: gdb.threads/attach-many-short-lived-threads.exp: iter 2: detach
GDB mishandles the latter (it should ignore ESRCH while detaching just
like when continuing), but that's another story.
The fix here is to change spawn_wait_for_attach to use Expect's
'spawn' command instead of Tcl's 'exec&' to spawn programs, because
with spawn we control when to wait for/reap the process. That allows
killing the process by PID without being subject to pid-reuse races,
because even if the process is already dead, the kernel won't reuse
the process's PID until the zombie is reaped.
The other part of the problem lies in DejaGnu itself, unfortunately.
I have occasionally seen tests (attach-many-short-lived-threads
included, but not only that one) die with a random inexplicable
SIGTERM too, and that too is caused by the same reason, except that in
that case, the rogue SIGTERM is sent from this bit in DejaGnu's remote.exp:
exec sh -c "exec > /dev/null 2>&1 && (kill -2 $pgid || kill -2 $pid) && sleep 5 && (kill $pgid || kill $pid) && sleep 5 && (kill -9 $pgid || kill -9 $pid) &"
...
catch "wait -i $shell_id"
Even if the program exits promptly, that whole cascade of kills
carries on in the background, thus potentially killing the poor
process that manages to reuse $pid...
I sent a fix for that to the DejaGnu list:
http://lists.gnu.org/archive/html/dejagnu/2015-07/msg00000.html
With both patches in place, I haven't seen
attach-many-short-lived-threads.exp fail again.
Tested on x86_64 Fedora 20, native, gdbserver and extended-gdbserver.
gdb/testsuite/ChangeLog:
2015-07-31 Pedro Alves <palves@redhat.com>
* gdb.base/attach-pie-misread.exp: Rename $res to $test_spawn_id.
Use spawn_id_get_pid. Wait for spawn id after eof. Use
kill_wait_spawned_process instead of explicit "kill -9".
* gdb.base/attach-pie-noexec.exp: Adjust to spawn_wait_for_attach
returning a spawn id instead of a pid. Use spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/attach-twice.exp: Likewise.
* gdb.base/attach.exp: Likewise.
(do_command_attach_tests): Use gdb_spawn_with_cmdline_opts and
gdb_test_multiple.
* gdb.base/solib-overlap.exp: Adjust to spawn_wait_for_attach
returning a spawn id instead of a pid. Use spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/valgrind-infcall.exp: Likewise.
* gdb.multi/multi-attach.exp: Likewise.
* gdb.python/py-prompt.exp: Likewise.
* gdb.python/py-sync-interp.exp: Likewise.
* gdb.server/ext-attach.exp: Likewise.
* gdb.threads/attach-into-signal.exp (corefunc): Use
spawn_wait_for_attach, spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.threads/attach-many-short-lived-threads.exp: Adjust to
spawn_wait_for_attach returning a spawn id instead of a pid. Use
spawn_id_get_pid and kill_wait_spawned_process.
* gdb.threads/attach-stopped.exp (corefunc): Use
spawn_wait_for_attach, spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/break-interp.exp: Rename $res to $test_spawn_id.
Use spawn_id_get_pid. Wait for spawn id after eof. Use
kill_wait_spawned_process instead of explicit "kill -9".
* lib/gdb.exp (can_spawn_for_attach): Adjust comment.
(kill_wait_spawned_process, spawn_id_get_pid): New procedures.
(spawn_wait_for_attach): Use spawn instead of exec to spawn
processes. Don't map cygwin/windows pids here. Now returns a
spawn id list.
2015-07-31 19:06:24 +00:00
|
|
|
kill_wait_spawned_process $test_spawn_id
|
1999-06-28 23:04:32 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
proc do_call_attach_tests {} {
|
2004-07-08 20:04:47 +00:00
|
|
|
global gdb_prompt
|
|
|
|
global binfile2
|
|
|
|
|
testsuite: tcl exec& -> 'kill -9 $pid' is racy (attach-many-short-lived-thread.exp races and others)
The buildbots show that attach-many-short-lived-thread.exp is racy.
But after staring at debug logs and playing with SystemTap scripts for
a (long) while, I figured out that neither GDB, nor the kernel nor the
test's program itself are at fault.
The problem is simply that the testsuite machinery is currently
subject to PID-reuse races. The attach-many-short-lived-threads.c
test program just happens to be much more susceptible to trigger this
race because threads and processes share the same number space on
Linux, and the test spawns many many short lived threads in
succession, thus enlarging the race window a lot.
Part of the problem is that several tests spawn processes with "exec&"
(in order to test the "attach" command) , and then at the end of the
test, to make sure things are cleaned up, issue a 'remote_spawn "kill
-p $testpid"'. Since with tcl's "exec&", tcl itself is responsible
for reaping the process's exit status, when we go kill the process,
testpid may have already exited _and_ its status may have (and often
has) been reaped already. Thus it can happen that another process
meanwhile reuses $testpid, and that "kill" command kills the wrong
process... Frequently, that happens to be
attach-many-short-lived-thread, but this explains other test's races
as well.
In the attach-many-short-lived-threads test, it sometimes manifests
like this:
(gdb) file /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads
Reading symbols from /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads...done.
(gdb) Loaded /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads into /home/pedro/gdb/mygit/build/gdb/testsuite/../../gdb/gdb
attach 5940
Attaching to program: /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads, process 5940
warning: process 5940 is a zombie - the process has already terminated
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ptrace: Operation not permitted.
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: attach
info threads
No threads.
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: no new threads
set breakpoint always-inserted on
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: set breakpoint always-inserted on
Other times the process dies while the test is ongoing (the process is
ptrace-stopped):
(gdb) print again = 1
Cannot access memory at address 0x6020cc
(gdb) FAIL: gdb.threads/attach-many-short-lived-threads.exp: iter 2: reset timer in the inferior
(Recall that on Linux, SIGKILL is not interceptable)
And other times it dies just while we're detaching:
$4 = 319
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 2: print seconds_left
detach
Can't detach Thread 0x7fb13b7de700 (LWP 1842): No such process
(gdb) FAIL: gdb.threads/attach-many-short-lived-threads.exp: iter 2: detach
GDB mishandles the latter (it should ignore ESRCH while detaching just
like when continuing), but that's another story.
The fix here is to change spawn_wait_for_attach to use Expect's
'spawn' command instead of Tcl's 'exec&' to spawn programs, because
with spawn we control when to wait for/reap the process. That allows
killing the process by PID without being subject to pid-reuse races,
because even if the process is already dead, the kernel won't reuse
the process's PID until the zombie is reaped.
The other part of the problem lies in DejaGnu itself, unfortunately.
I have occasionally seen tests (attach-many-short-lived-threads
included, but not only that one) die with a random inexplicable
SIGTERM too, and that too is caused by the same reason, except that in
that case, the rogue SIGTERM is sent from this bit in DejaGnu's remote.exp:
exec sh -c "exec > /dev/null 2>&1 && (kill -2 $pgid || kill -2 $pid) && sleep 5 && (kill $pgid || kill $pid) && sleep 5 && (kill -9 $pgid || kill -9 $pid) &"
...
catch "wait -i $shell_id"
Even if the program exits promptly, that whole cascade of kills
carries on in the background, thus potentially killing the poor
process that manages to reuse $pid...
I sent a fix for that to the DejaGnu list:
http://lists.gnu.org/archive/html/dejagnu/2015-07/msg00000.html
With both patches in place, I haven't seen
attach-many-short-lived-threads.exp fail again.
Tested on x86_64 Fedora 20, native, gdbserver and extended-gdbserver.
gdb/testsuite/ChangeLog:
2015-07-31 Pedro Alves <palves@redhat.com>
* gdb.base/attach-pie-misread.exp: Rename $res to $test_spawn_id.
Use spawn_id_get_pid. Wait for spawn id after eof. Use
kill_wait_spawned_process instead of explicit "kill -9".
* gdb.base/attach-pie-noexec.exp: Adjust to spawn_wait_for_attach
returning a spawn id instead of a pid. Use spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/attach-twice.exp: Likewise.
* gdb.base/attach.exp: Likewise.
(do_command_attach_tests): Use gdb_spawn_with_cmdline_opts and
gdb_test_multiple.
* gdb.base/solib-overlap.exp: Adjust to spawn_wait_for_attach
returning a spawn id instead of a pid. Use spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/valgrind-infcall.exp: Likewise.
* gdb.multi/multi-attach.exp: Likewise.
* gdb.python/py-prompt.exp: Likewise.
* gdb.python/py-sync-interp.exp: Likewise.
* gdb.server/ext-attach.exp: Likewise.
* gdb.threads/attach-into-signal.exp (corefunc): Use
spawn_wait_for_attach, spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.threads/attach-many-short-lived-threads.exp: Adjust to
spawn_wait_for_attach returning a spawn id instead of a pid. Use
spawn_id_get_pid and kill_wait_spawned_process.
* gdb.threads/attach-stopped.exp (corefunc): Use
spawn_wait_for_attach, spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/break-interp.exp: Rename $res to $test_spawn_id.
Use spawn_id_get_pid. Wait for spawn id after eof. Use
kill_wait_spawned_process instead of explicit "kill -9".
* lib/gdb.exp (can_spawn_for_attach): Adjust comment.
(kill_wait_spawned_process, spawn_id_get_pid): New procedures.
(spawn_wait_for_attach): Use spawn instead of exec to spawn
processes. Don't map cygwin/windows pids here. Now returns a
spawn id list.
2015-07-31 19:06:24 +00:00
|
|
|
set test_spawn_id [spawn_wait_for_attach $binfile2]
|
|
|
|
set testpid [spawn_id_get_pid $test_spawn_id]
|
2004-07-08 20:04:47 +00:00
|
|
|
|
|
|
|
# Attach
|
|
|
|
|
2010-06-10 19:48:20 +00:00
|
|
|
gdb_test "file $binfile2" ".*" "force switch to gdb64, if necessary"
|
2004-07-12 14:25:48 +00:00
|
|
|
set test "attach call"
|
|
|
|
gdb_test_multiple "attach $testpid" "$test" {
|
|
|
|
-re "warning: reading register.*I.*O error.*$gdb_prompt $" {
|
|
|
|
fail "$test (read register error)"
|
2004-07-08 20:04:47 +00:00
|
|
|
}
|
|
|
|
-re "Attaching to.*process $testpid.*libc.*$gdb_prompt $" {
|
2004-07-12 14:25:48 +00:00
|
|
|
pass "$test"
|
2004-07-08 20:04:47 +00:00
|
|
|
}
|
|
|
|
-re "Attaching to.*process $testpid.*\[Switching to thread $testpid\..*\].*$gdb_prompt $" {
|
2004-07-12 14:25:48 +00:00
|
|
|
pass "$test"
|
2004-07-08 20:04:47 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
# See if other registers are problems
|
|
|
|
|
2004-07-12 14:25:48 +00:00
|
|
|
set test "info other register"
|
|
|
|
gdb_test_multiple "i r r3" "$test" {
|
|
|
|
-re "warning: reading register.*$gdb_prompt $" {
|
|
|
|
fail "$test"
|
2004-07-08 20:04:47 +00:00
|
|
|
}
|
2004-07-12 14:25:48 +00:00
|
|
|
-re "r3.*$gdb_prompt $" {
|
|
|
|
pass "$test"
|
2004-07-08 20:04:47 +00:00
|
|
|
}
|
|
|
|
}
|
1999-06-28 23:04:32 +00:00
|
|
|
|
2004-07-08 20:04:47 +00:00
|
|
|
# Get rid of the process
|
|
|
|
|
2004-07-12 14:25:48 +00:00
|
|
|
gdb_test "p should_exit = 1"
|
2011-03-07 16:03:04 +00:00
|
|
|
gdb_continue_to_end
|
2004-07-08 20:04:47 +00:00
|
|
|
|
|
|
|
# Be paranoid
|
|
|
|
|
testsuite: tcl exec& -> 'kill -9 $pid' is racy (attach-many-short-lived-thread.exp races and others)
The buildbots show that attach-many-short-lived-thread.exp is racy.
But after staring at debug logs and playing with SystemTap scripts for
a (long) while, I figured out that neither GDB, nor the kernel nor the
test's program itself are at fault.
The problem is simply that the testsuite machinery is currently
subject to PID-reuse races. The attach-many-short-lived-threads.c
test program just happens to be much more susceptible to trigger this
race because threads and processes share the same number space on
Linux, and the test spawns many many short lived threads in
succession, thus enlarging the race window a lot.
Part of the problem is that several tests spawn processes with "exec&"
(in order to test the "attach" command) , and then at the end of the
test, to make sure things are cleaned up, issue a 'remote_spawn "kill
-p $testpid"'. Since with tcl's "exec&", tcl itself is responsible
for reaping the process's exit status, when we go kill the process,
testpid may have already exited _and_ its status may have (and often
has) been reaped already. Thus it can happen that another process
meanwhile reuses $testpid, and that "kill" command kills the wrong
process... Frequently, that happens to be
attach-many-short-lived-thread, but this explains other test's races
as well.
In the attach-many-short-lived-threads test, it sometimes manifests
like this:
(gdb) file /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads
Reading symbols from /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads...done.
(gdb) Loaded /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads into /home/pedro/gdb/mygit/build/gdb/testsuite/../../gdb/gdb
attach 5940
Attaching to program: /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads, process 5940
warning: process 5940 is a zombie - the process has already terminated
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ptrace: Operation not permitted.
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: attach
info threads
No threads.
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: no new threads
set breakpoint always-inserted on
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: set breakpoint always-inserted on
Other times the process dies while the test is ongoing (the process is
ptrace-stopped):
(gdb) print again = 1
Cannot access memory at address 0x6020cc
(gdb) FAIL: gdb.threads/attach-many-short-lived-threads.exp: iter 2: reset timer in the inferior
(Recall that on Linux, SIGKILL is not interceptable)
And other times it dies just while we're detaching:
$4 = 319
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 2: print seconds_left
detach
Can't detach Thread 0x7fb13b7de700 (LWP 1842): No such process
(gdb) FAIL: gdb.threads/attach-many-short-lived-threads.exp: iter 2: detach
GDB mishandles the latter (it should ignore ESRCH while detaching just
like when continuing), but that's another story.
The fix here is to change spawn_wait_for_attach to use Expect's
'spawn' command instead of Tcl's 'exec&' to spawn programs, because
with spawn we control when to wait for/reap the process. That allows
killing the process by PID without being subject to pid-reuse races,
because even if the process is already dead, the kernel won't reuse
the process's PID until the zombie is reaped.
The other part of the problem lies in DejaGnu itself, unfortunately.
I have occasionally seen tests (attach-many-short-lived-threads
included, but not only that one) die with a random inexplicable
SIGTERM too, and that too is caused by the same reason, except that in
that case, the rogue SIGTERM is sent from this bit in DejaGnu's remote.exp:
exec sh -c "exec > /dev/null 2>&1 && (kill -2 $pgid || kill -2 $pid) && sleep 5 && (kill $pgid || kill $pid) && sleep 5 && (kill -9 $pgid || kill -9 $pid) &"
...
catch "wait -i $shell_id"
Even if the program exits promptly, that whole cascade of kills
carries on in the background, thus potentially killing the poor
process that manages to reuse $pid...
I sent a fix for that to the DejaGnu list:
http://lists.gnu.org/archive/html/dejagnu/2015-07/msg00000.html
With both patches in place, I haven't seen
attach-many-short-lived-threads.exp fail again.
Tested on x86_64 Fedora 20, native, gdbserver and extended-gdbserver.
gdb/testsuite/ChangeLog:
2015-07-31 Pedro Alves <palves@redhat.com>
* gdb.base/attach-pie-misread.exp: Rename $res to $test_spawn_id.
Use spawn_id_get_pid. Wait for spawn id after eof. Use
kill_wait_spawned_process instead of explicit "kill -9".
* gdb.base/attach-pie-noexec.exp: Adjust to spawn_wait_for_attach
returning a spawn id instead of a pid. Use spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/attach-twice.exp: Likewise.
* gdb.base/attach.exp: Likewise.
(do_command_attach_tests): Use gdb_spawn_with_cmdline_opts and
gdb_test_multiple.
* gdb.base/solib-overlap.exp: Adjust to spawn_wait_for_attach
returning a spawn id instead of a pid. Use spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/valgrind-infcall.exp: Likewise.
* gdb.multi/multi-attach.exp: Likewise.
* gdb.python/py-prompt.exp: Likewise.
* gdb.python/py-sync-interp.exp: Likewise.
* gdb.server/ext-attach.exp: Likewise.
* gdb.threads/attach-into-signal.exp (corefunc): Use
spawn_wait_for_attach, spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.threads/attach-many-short-lived-threads.exp: Adjust to
spawn_wait_for_attach returning a spawn id instead of a pid. Use
spawn_id_get_pid and kill_wait_spawned_process.
* gdb.threads/attach-stopped.exp (corefunc): Use
spawn_wait_for_attach, spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/break-interp.exp: Rename $res to $test_spawn_id.
Use spawn_id_get_pid. Wait for spawn id after eof. Use
kill_wait_spawned_process instead of explicit "kill -9".
* lib/gdb.exp (can_spawn_for_attach): Adjust comment.
(kill_wait_spawned_process, spawn_id_get_pid): New procedures.
(spawn_wait_for_attach): Use spawn instead of exec to spawn
processes. Don't map cygwin/windows pids here. Now returns a
spawn id list.
2015-07-31 19:06:24 +00:00
|
|
|
kill_wait_spawned_process $test_spawn_id
|
1999-06-28 23:04:32 +00:00
|
|
|
}
|
|
|
|
|
2014-03-21 03:11:51 +00:00
|
|
|
proc do_command_attach_tests {} {
|
|
|
|
global gdb_prompt
|
|
|
|
global binfile
|
|
|
|
global verbose
|
|
|
|
global GDB
|
|
|
|
global INTERNAL_GDBFLAGS
|
|
|
|
global GDBFLAGS
|
|
|
|
|
|
|
|
if ![isnative] then {
|
|
|
|
unsupported "command attach test"
|
|
|
|
return 0
|
|
|
|
}
|
|
|
|
|
testsuite: tcl exec& -> 'kill -9 $pid' is racy (attach-many-short-lived-thread.exp races and others)
The buildbots show that attach-many-short-lived-thread.exp is racy.
But after staring at debug logs and playing with SystemTap scripts for
a (long) while, I figured out that neither GDB, nor the kernel nor the
test's program itself are at fault.
The problem is simply that the testsuite machinery is currently
subject to PID-reuse races. The attach-many-short-lived-threads.c
test program just happens to be much more susceptible to trigger this
race because threads and processes share the same number space on
Linux, and the test spawns many many short lived threads in
succession, thus enlarging the race window a lot.
Part of the problem is that several tests spawn processes with "exec&"
(in order to test the "attach" command) , and then at the end of the
test, to make sure things are cleaned up, issue a 'remote_spawn "kill
-p $testpid"'. Since with tcl's "exec&", tcl itself is responsible
for reaping the process's exit status, when we go kill the process,
testpid may have already exited _and_ its status may have (and often
has) been reaped already. Thus it can happen that another process
meanwhile reuses $testpid, and that "kill" command kills the wrong
process... Frequently, that happens to be
attach-many-short-lived-thread, but this explains other test's races
as well.
In the attach-many-short-lived-threads test, it sometimes manifests
like this:
(gdb) file /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads
Reading symbols from /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads...done.
(gdb) Loaded /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads into /home/pedro/gdb/mygit/build/gdb/testsuite/../../gdb/gdb
attach 5940
Attaching to program: /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads, process 5940
warning: process 5940 is a zombie - the process has already terminated
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ptrace: Operation not permitted.
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: attach
info threads
No threads.
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: no new threads
set breakpoint always-inserted on
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: set breakpoint always-inserted on
Other times the process dies while the test is ongoing (the process is
ptrace-stopped):
(gdb) print again = 1
Cannot access memory at address 0x6020cc
(gdb) FAIL: gdb.threads/attach-many-short-lived-threads.exp: iter 2: reset timer in the inferior
(Recall that on Linux, SIGKILL is not interceptable)
And other times it dies just while we're detaching:
$4 = 319
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 2: print seconds_left
detach
Can't detach Thread 0x7fb13b7de700 (LWP 1842): No such process
(gdb) FAIL: gdb.threads/attach-many-short-lived-threads.exp: iter 2: detach
GDB mishandles the latter (it should ignore ESRCH while detaching just
like when continuing), but that's another story.
The fix here is to change spawn_wait_for_attach to use Expect's
'spawn' command instead of Tcl's 'exec&' to spawn programs, because
with spawn we control when to wait for/reap the process. That allows
killing the process by PID without being subject to pid-reuse races,
because even if the process is already dead, the kernel won't reuse
the process's PID until the zombie is reaped.
The other part of the problem lies in DejaGnu itself, unfortunately.
I have occasionally seen tests (attach-many-short-lived-threads
included, but not only that one) die with a random inexplicable
SIGTERM too, and that too is caused by the same reason, except that in
that case, the rogue SIGTERM is sent from this bit in DejaGnu's remote.exp:
exec sh -c "exec > /dev/null 2>&1 && (kill -2 $pgid || kill -2 $pid) && sleep 5 && (kill $pgid || kill $pid) && sleep 5 && (kill -9 $pgid || kill -9 $pid) &"
...
catch "wait -i $shell_id"
Even if the program exits promptly, that whole cascade of kills
carries on in the background, thus potentially killing the poor
process that manages to reuse $pid...
I sent a fix for that to the DejaGnu list:
http://lists.gnu.org/archive/html/dejagnu/2015-07/msg00000.html
With both patches in place, I haven't seen
attach-many-short-lived-threads.exp fail again.
Tested on x86_64 Fedora 20, native, gdbserver and extended-gdbserver.
gdb/testsuite/ChangeLog:
2015-07-31 Pedro Alves <palves@redhat.com>
* gdb.base/attach-pie-misread.exp: Rename $res to $test_spawn_id.
Use spawn_id_get_pid. Wait for spawn id after eof. Use
kill_wait_spawned_process instead of explicit "kill -9".
* gdb.base/attach-pie-noexec.exp: Adjust to spawn_wait_for_attach
returning a spawn id instead of a pid. Use spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/attach-twice.exp: Likewise.
* gdb.base/attach.exp: Likewise.
(do_command_attach_tests): Use gdb_spawn_with_cmdline_opts and
gdb_test_multiple.
* gdb.base/solib-overlap.exp: Adjust to spawn_wait_for_attach
returning a spawn id instead of a pid. Use spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/valgrind-infcall.exp: Likewise.
* gdb.multi/multi-attach.exp: Likewise.
* gdb.python/py-prompt.exp: Likewise.
* gdb.python/py-sync-interp.exp: Likewise.
* gdb.server/ext-attach.exp: Likewise.
* gdb.threads/attach-into-signal.exp (corefunc): Use
spawn_wait_for_attach, spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.threads/attach-many-short-lived-threads.exp: Adjust to
spawn_wait_for_attach returning a spawn id instead of a pid. Use
spawn_id_get_pid and kill_wait_spawned_process.
* gdb.threads/attach-stopped.exp (corefunc): Use
spawn_wait_for_attach, spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/break-interp.exp: Rename $res to $test_spawn_id.
Use spawn_id_get_pid. Wait for spawn id after eof. Use
kill_wait_spawned_process instead of explicit "kill -9".
* lib/gdb.exp (can_spawn_for_attach): Adjust comment.
(kill_wait_spawned_process, spawn_id_get_pid): New procedures.
(spawn_wait_for_attach): Use spawn instead of exec to spawn
processes. Don't map cygwin/windows pids here. Now returns a
spawn id list.
2015-07-31 19:06:24 +00:00
|
|
|
set test_spawn_id [spawn_wait_for_attach $binfile]
|
|
|
|
set testpid [spawn_id_get_pid $test_spawn_id]
|
2014-03-21 03:11:51 +00:00
|
|
|
|
|
|
|
gdb_exit
|
|
|
|
|
testsuite: tcl exec& -> 'kill -9 $pid' is racy (attach-many-short-lived-thread.exp races and others)
The buildbots show that attach-many-short-lived-thread.exp is racy.
But after staring at debug logs and playing with SystemTap scripts for
a (long) while, I figured out that neither GDB, nor the kernel nor the
test's program itself are at fault.
The problem is simply that the testsuite machinery is currently
subject to PID-reuse races. The attach-many-short-lived-threads.c
test program just happens to be much more susceptible to trigger this
race because threads and processes share the same number space on
Linux, and the test spawns many many short lived threads in
succession, thus enlarging the race window a lot.
Part of the problem is that several tests spawn processes with "exec&"
(in order to test the "attach" command) , and then at the end of the
test, to make sure things are cleaned up, issue a 'remote_spawn "kill
-p $testpid"'. Since with tcl's "exec&", tcl itself is responsible
for reaping the process's exit status, when we go kill the process,
testpid may have already exited _and_ its status may have (and often
has) been reaped already. Thus it can happen that another process
meanwhile reuses $testpid, and that "kill" command kills the wrong
process... Frequently, that happens to be
attach-many-short-lived-thread, but this explains other test's races
as well.
In the attach-many-short-lived-threads test, it sometimes manifests
like this:
(gdb) file /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads
Reading symbols from /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads...done.
(gdb) Loaded /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads into /home/pedro/gdb/mygit/build/gdb/testsuite/../../gdb/gdb
attach 5940
Attaching to program: /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads, process 5940
warning: process 5940 is a zombie - the process has already terminated
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ptrace: Operation not permitted.
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: attach
info threads
No threads.
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: no new threads
set breakpoint always-inserted on
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: set breakpoint always-inserted on
Other times the process dies while the test is ongoing (the process is
ptrace-stopped):
(gdb) print again = 1
Cannot access memory at address 0x6020cc
(gdb) FAIL: gdb.threads/attach-many-short-lived-threads.exp: iter 2: reset timer in the inferior
(Recall that on Linux, SIGKILL is not interceptable)
And other times it dies just while we're detaching:
$4 = 319
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 2: print seconds_left
detach
Can't detach Thread 0x7fb13b7de700 (LWP 1842): No such process
(gdb) FAIL: gdb.threads/attach-many-short-lived-threads.exp: iter 2: detach
GDB mishandles the latter (it should ignore ESRCH while detaching just
like when continuing), but that's another story.
The fix here is to change spawn_wait_for_attach to use Expect's
'spawn' command instead of Tcl's 'exec&' to spawn programs, because
with spawn we control when to wait for/reap the process. That allows
killing the process by PID without being subject to pid-reuse races,
because even if the process is already dead, the kernel won't reuse
the process's PID until the zombie is reaped.
The other part of the problem lies in DejaGnu itself, unfortunately.
I have occasionally seen tests (attach-many-short-lived-threads
included, but not only that one) die with a random inexplicable
SIGTERM too, and that too is caused by the same reason, except that in
that case, the rogue SIGTERM is sent from this bit in DejaGnu's remote.exp:
exec sh -c "exec > /dev/null 2>&1 && (kill -2 $pgid || kill -2 $pid) && sleep 5 && (kill $pgid || kill $pid) && sleep 5 && (kill -9 $pgid || kill -9 $pid) &"
...
catch "wait -i $shell_id"
Even if the program exits promptly, that whole cascade of kills
carries on in the background, thus potentially killing the poor
process that manages to reuse $pid...
I sent a fix for that to the DejaGnu list:
http://lists.gnu.org/archive/html/dejagnu/2015-07/msg00000.html
With both patches in place, I haven't seen
attach-many-short-lived-threads.exp fail again.
Tested on x86_64 Fedora 20, native, gdbserver and extended-gdbserver.
gdb/testsuite/ChangeLog:
2015-07-31 Pedro Alves <palves@redhat.com>
* gdb.base/attach-pie-misread.exp: Rename $res to $test_spawn_id.
Use spawn_id_get_pid. Wait for spawn id after eof. Use
kill_wait_spawned_process instead of explicit "kill -9".
* gdb.base/attach-pie-noexec.exp: Adjust to spawn_wait_for_attach
returning a spawn id instead of a pid. Use spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/attach-twice.exp: Likewise.
* gdb.base/attach.exp: Likewise.
(do_command_attach_tests): Use gdb_spawn_with_cmdline_opts and
gdb_test_multiple.
* gdb.base/solib-overlap.exp: Adjust to spawn_wait_for_attach
returning a spawn id instead of a pid. Use spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/valgrind-infcall.exp: Likewise.
* gdb.multi/multi-attach.exp: Likewise.
* gdb.python/py-prompt.exp: Likewise.
* gdb.python/py-sync-interp.exp: Likewise.
* gdb.server/ext-attach.exp: Likewise.
* gdb.threads/attach-into-signal.exp (corefunc): Use
spawn_wait_for_attach, spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.threads/attach-many-short-lived-threads.exp: Adjust to
spawn_wait_for_attach returning a spawn id instead of a pid. Use
spawn_id_get_pid and kill_wait_spawned_process.
* gdb.threads/attach-stopped.exp (corefunc): Use
spawn_wait_for_attach, spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/break-interp.exp: Rename $res to $test_spawn_id.
Use spawn_id_get_pid. Wait for spawn id after eof. Use
kill_wait_spawned_process instead of explicit "kill -9".
* lib/gdb.exp (can_spawn_for_attach): Adjust comment.
(kill_wait_spawned_process, spawn_id_get_pid): New procedures.
(spawn_wait_for_attach): Use spawn instead of exec to spawn
processes. Don't map cygwin/windows pids here. Now returns a
spawn id list.
2015-07-31 19:06:24 +00:00
|
|
|
set res [gdb_spawn_with_cmdline_opts "--pid=$testpid"]
|
2014-03-21 03:11:51 +00:00
|
|
|
set test "starting with --pid"
|
testsuite: tcl exec& -> 'kill -9 $pid' is racy (attach-many-short-lived-thread.exp races and others)
The buildbots show that attach-many-short-lived-thread.exp is racy.
But after staring at debug logs and playing with SystemTap scripts for
a (long) while, I figured out that neither GDB, nor the kernel nor the
test's program itself are at fault.
The problem is simply that the testsuite machinery is currently
subject to PID-reuse races. The attach-many-short-lived-threads.c
test program just happens to be much more susceptible to trigger this
race because threads and processes share the same number space on
Linux, and the test spawns many many short lived threads in
succession, thus enlarging the race window a lot.
Part of the problem is that several tests spawn processes with "exec&"
(in order to test the "attach" command) , and then at the end of the
test, to make sure things are cleaned up, issue a 'remote_spawn "kill
-p $testpid"'. Since with tcl's "exec&", tcl itself is responsible
for reaping the process's exit status, when we go kill the process,
testpid may have already exited _and_ its status may have (and often
has) been reaped already. Thus it can happen that another process
meanwhile reuses $testpid, and that "kill" command kills the wrong
process... Frequently, that happens to be
attach-many-short-lived-thread, but this explains other test's races
as well.
In the attach-many-short-lived-threads test, it sometimes manifests
like this:
(gdb) file /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads
Reading symbols from /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads...done.
(gdb) Loaded /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads into /home/pedro/gdb/mygit/build/gdb/testsuite/../../gdb/gdb
attach 5940
Attaching to program: /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads, process 5940
warning: process 5940 is a zombie - the process has already terminated
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ptrace: Operation not permitted.
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: attach
info threads
No threads.
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: no new threads
set breakpoint always-inserted on
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: set breakpoint always-inserted on
Other times the process dies while the test is ongoing (the process is
ptrace-stopped):
(gdb) print again = 1
Cannot access memory at address 0x6020cc
(gdb) FAIL: gdb.threads/attach-many-short-lived-threads.exp: iter 2: reset timer in the inferior
(Recall that on Linux, SIGKILL is not interceptable)
And other times it dies just while we're detaching:
$4 = 319
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 2: print seconds_left
detach
Can't detach Thread 0x7fb13b7de700 (LWP 1842): No such process
(gdb) FAIL: gdb.threads/attach-many-short-lived-threads.exp: iter 2: detach
GDB mishandles the latter (it should ignore ESRCH while detaching just
like when continuing), but that's another story.
The fix here is to change spawn_wait_for_attach to use Expect's
'spawn' command instead of Tcl's 'exec&' to spawn programs, because
with spawn we control when to wait for/reap the process. That allows
killing the process by PID without being subject to pid-reuse races,
because even if the process is already dead, the kernel won't reuse
the process's PID until the zombie is reaped.
The other part of the problem lies in DejaGnu itself, unfortunately.
I have occasionally seen tests (attach-many-short-lived-threads
included, but not only that one) die with a random inexplicable
SIGTERM too, and that too is caused by the same reason, except that in
that case, the rogue SIGTERM is sent from this bit in DejaGnu's remote.exp:
exec sh -c "exec > /dev/null 2>&1 && (kill -2 $pgid || kill -2 $pid) && sleep 5 && (kill $pgid || kill $pid) && sleep 5 && (kill -9 $pgid || kill -9 $pid) &"
...
catch "wait -i $shell_id"
Even if the program exits promptly, that whole cascade of kills
carries on in the background, thus potentially killing the poor
process that manages to reuse $pid...
I sent a fix for that to the DejaGnu list:
http://lists.gnu.org/archive/html/dejagnu/2015-07/msg00000.html
With both patches in place, I haven't seen
attach-many-short-lived-threads.exp fail again.
Tested on x86_64 Fedora 20, native, gdbserver and extended-gdbserver.
gdb/testsuite/ChangeLog:
2015-07-31 Pedro Alves <palves@redhat.com>
* gdb.base/attach-pie-misread.exp: Rename $res to $test_spawn_id.
Use spawn_id_get_pid. Wait for spawn id after eof. Use
kill_wait_spawned_process instead of explicit "kill -9".
* gdb.base/attach-pie-noexec.exp: Adjust to spawn_wait_for_attach
returning a spawn id instead of a pid. Use spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/attach-twice.exp: Likewise.
* gdb.base/attach.exp: Likewise.
(do_command_attach_tests): Use gdb_spawn_with_cmdline_opts and
gdb_test_multiple.
* gdb.base/solib-overlap.exp: Adjust to spawn_wait_for_attach
returning a spawn id instead of a pid. Use spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/valgrind-infcall.exp: Likewise.
* gdb.multi/multi-attach.exp: Likewise.
* gdb.python/py-prompt.exp: Likewise.
* gdb.python/py-sync-interp.exp: Likewise.
* gdb.server/ext-attach.exp: Likewise.
* gdb.threads/attach-into-signal.exp (corefunc): Use
spawn_wait_for_attach, spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.threads/attach-many-short-lived-threads.exp: Adjust to
spawn_wait_for_attach returning a spawn id instead of a pid. Use
spawn_id_get_pid and kill_wait_spawned_process.
* gdb.threads/attach-stopped.exp (corefunc): Use
spawn_wait_for_attach, spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/break-interp.exp: Rename $res to $test_spawn_id.
Use spawn_id_get_pid. Wait for spawn id after eof. Use
kill_wait_spawned_process instead of explicit "kill -9".
* lib/gdb.exp (can_spawn_for_attach): Adjust comment.
(kill_wait_spawned_process, spawn_id_get_pid): New procedures.
(spawn_wait_for_attach): Use spawn instead of exec to spawn
processes. Don't map cygwin/windows pids here. Now returns a
spawn id list.
2015-07-31 19:06:24 +00:00
|
|
|
gdb_test_multiple "" $test {
|
2014-03-21 03:11:51 +00:00
|
|
|
-re "Reading symbols from.*$gdb_prompt $" {
|
|
|
|
pass "$test"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
# Get rid of the process
|
testsuite: tcl exec& -> 'kill -9 $pid' is racy (attach-many-short-lived-thread.exp races and others)
The buildbots show that attach-many-short-lived-thread.exp is racy.
But after staring at debug logs and playing with SystemTap scripts for
a (long) while, I figured out that neither GDB, nor the kernel nor the
test's program itself are at fault.
The problem is simply that the testsuite machinery is currently
subject to PID-reuse races. The attach-many-short-lived-threads.c
test program just happens to be much more susceptible to trigger this
race because threads and processes share the same number space on
Linux, and the test spawns many many short lived threads in
succession, thus enlarging the race window a lot.
Part of the problem is that several tests spawn processes with "exec&"
(in order to test the "attach" command) , and then at the end of the
test, to make sure things are cleaned up, issue a 'remote_spawn "kill
-p $testpid"'. Since with tcl's "exec&", tcl itself is responsible
for reaping the process's exit status, when we go kill the process,
testpid may have already exited _and_ its status may have (and often
has) been reaped already. Thus it can happen that another process
meanwhile reuses $testpid, and that "kill" command kills the wrong
process... Frequently, that happens to be
attach-many-short-lived-thread, but this explains other test's races
as well.
In the attach-many-short-lived-threads test, it sometimes manifests
like this:
(gdb) file /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads
Reading symbols from /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads...done.
(gdb) Loaded /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads into /home/pedro/gdb/mygit/build/gdb/testsuite/../../gdb/gdb
attach 5940
Attaching to program: /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads, process 5940
warning: process 5940 is a zombie - the process has already terminated
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ptrace: Operation not permitted.
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: attach
info threads
No threads.
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: no new threads
set breakpoint always-inserted on
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: set breakpoint always-inserted on
Other times the process dies while the test is ongoing (the process is
ptrace-stopped):
(gdb) print again = 1
Cannot access memory at address 0x6020cc
(gdb) FAIL: gdb.threads/attach-many-short-lived-threads.exp: iter 2: reset timer in the inferior
(Recall that on Linux, SIGKILL is not interceptable)
And other times it dies just while we're detaching:
$4 = 319
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 2: print seconds_left
detach
Can't detach Thread 0x7fb13b7de700 (LWP 1842): No such process
(gdb) FAIL: gdb.threads/attach-many-short-lived-threads.exp: iter 2: detach
GDB mishandles the latter (it should ignore ESRCH while detaching just
like when continuing), but that's another story.
The fix here is to change spawn_wait_for_attach to use Expect's
'spawn' command instead of Tcl's 'exec&' to spawn programs, because
with spawn we control when to wait for/reap the process. That allows
killing the process by PID without being subject to pid-reuse races,
because even if the process is already dead, the kernel won't reuse
the process's PID until the zombie is reaped.
The other part of the problem lies in DejaGnu itself, unfortunately.
I have occasionally seen tests (attach-many-short-lived-threads
included, but not only that one) die with a random inexplicable
SIGTERM too, and that too is caused by the same reason, except that in
that case, the rogue SIGTERM is sent from this bit in DejaGnu's remote.exp:
exec sh -c "exec > /dev/null 2>&1 && (kill -2 $pgid || kill -2 $pid) && sleep 5 && (kill $pgid || kill $pid) && sleep 5 && (kill -9 $pgid || kill -9 $pid) &"
...
catch "wait -i $shell_id"
Even if the program exits promptly, that whole cascade of kills
carries on in the background, thus potentially killing the poor
process that manages to reuse $pid...
I sent a fix for that to the DejaGnu list:
http://lists.gnu.org/archive/html/dejagnu/2015-07/msg00000.html
With both patches in place, I haven't seen
attach-many-short-lived-threads.exp fail again.
Tested on x86_64 Fedora 20, native, gdbserver and extended-gdbserver.
gdb/testsuite/ChangeLog:
2015-07-31 Pedro Alves <palves@redhat.com>
* gdb.base/attach-pie-misread.exp: Rename $res to $test_spawn_id.
Use spawn_id_get_pid. Wait for spawn id after eof. Use
kill_wait_spawned_process instead of explicit "kill -9".
* gdb.base/attach-pie-noexec.exp: Adjust to spawn_wait_for_attach
returning a spawn id instead of a pid. Use spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/attach-twice.exp: Likewise.
* gdb.base/attach.exp: Likewise.
(do_command_attach_tests): Use gdb_spawn_with_cmdline_opts and
gdb_test_multiple.
* gdb.base/solib-overlap.exp: Adjust to spawn_wait_for_attach
returning a spawn id instead of a pid. Use spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/valgrind-infcall.exp: Likewise.
* gdb.multi/multi-attach.exp: Likewise.
* gdb.python/py-prompt.exp: Likewise.
* gdb.python/py-sync-interp.exp: Likewise.
* gdb.server/ext-attach.exp: Likewise.
* gdb.threads/attach-into-signal.exp (corefunc): Use
spawn_wait_for_attach, spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.threads/attach-many-short-lived-threads.exp: Adjust to
spawn_wait_for_attach returning a spawn id instead of a pid. Use
spawn_id_get_pid and kill_wait_spawned_process.
* gdb.threads/attach-stopped.exp (corefunc): Use
spawn_wait_for_attach, spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/break-interp.exp: Rename $res to $test_spawn_id.
Use spawn_id_get_pid. Wait for spawn id after eof. Use
kill_wait_spawned_process instead of explicit "kill -9".
* lib/gdb.exp (can_spawn_for_attach): Adjust comment.
(kill_wait_spawned_process, spawn_id_get_pid): New procedures.
(spawn_wait_for_attach): Use spawn instead of exec to spawn
processes. Don't map cygwin/windows pids here. Now returns a
spawn id list.
2015-07-31 19:06:24 +00:00
|
|
|
kill_wait_spawned_process $test_spawn_id
|
2014-03-21 03:11:51 +00:00
|
|
|
}
|
|
|
|
|
gdb/17347 - Regression: GDB stopped on run with attached process
Doing:
gdb --pid=PID -ex run
Results in GDB getting a SIGTTIN, and thus ending stopped. That's
usually indicative of a missing target_terminal_ours call.
E.g., from the PR:
$ sleep 1h & p=$!; sleep 0.1; gdb -batch sleep $p -ex run
[1] 28263
[1] Killed sleep 1h
[2]+ Stopped gdb -batch sleep $p -ex run
The workaround is doing:
gdb -ex "attach $PID" -ex "run"
instead of
gdb [-p] $PID -ex "run"
With the former, gdb waits for the attach command to complete before
moving on to the "run" command, because the interpreter is in sync
mode at this point, within execute_command. But for the latter,
attach_command is called directly from captured_main, and thus misses
that waiting. IOW, "run" is running before the attach continuation
has run, before the program stops and attach completes. The broken
terminal settings are just one symptom of that. Any command that
queries or requires input results in the same.
The fix is to wait in catch_command_errors (which is specific to
main.c nowadays), just like we wait in execute_command.
gdb/ChangeLog:
2014-09-11 Pedro Alves <palves@redhat.com>
PR gdb/17347
* main.c: Include "infrun.h".
(catch_command_errors, catch_command_errors_const): Wait for the
foreground command to complete.
* top.c (maybe_wait_sync_command_done): New function, factored out
from ...
(maybe_wait_sync_command_done): ... here.
* top.h (maybe_wait_sync_command_done): New declaration.
gdb/testsuite/ChangeLog:
2014-09-11 Pedro Alves <palves@redhat.com>
PR gdb/17347
* lib/gdb.exp (gdb_spawn_with_cmdline_opts): New procedure.
* gdb.base/attach.exp (test_command_line_attach_run): New
procedure.
(top level): Call it.
2014-09-11 12:04:15 +00:00
|
|
|
# Test ' gdb --pid PID -ex "run" '. GDB used to have a bug where
|
|
|
|
# "run" would run before the attach finished - PR17347.
|
|
|
|
|
|
|
|
proc test_command_line_attach_run {} {
|
|
|
|
global gdb_prompt
|
|
|
|
global binfile
|
|
|
|
|
|
|
|
if ![isnative] then {
|
|
|
|
unsupported "commandline attach run test"
|
|
|
|
return 0
|
|
|
|
}
|
|
|
|
|
|
|
|
with_test_prefix "cmdline attach run" {
|
testsuite: tcl exec& -> 'kill -9 $pid' is racy (attach-many-short-lived-thread.exp races and others)
The buildbots show that attach-many-short-lived-thread.exp is racy.
But after staring at debug logs and playing with SystemTap scripts for
a (long) while, I figured out that neither GDB, nor the kernel nor the
test's program itself are at fault.
The problem is simply that the testsuite machinery is currently
subject to PID-reuse races. The attach-many-short-lived-threads.c
test program just happens to be much more susceptible to trigger this
race because threads and processes share the same number space on
Linux, and the test spawns many many short lived threads in
succession, thus enlarging the race window a lot.
Part of the problem is that several tests spawn processes with "exec&"
(in order to test the "attach" command) , and then at the end of the
test, to make sure things are cleaned up, issue a 'remote_spawn "kill
-p $testpid"'. Since with tcl's "exec&", tcl itself is responsible
for reaping the process's exit status, when we go kill the process,
testpid may have already exited _and_ its status may have (and often
has) been reaped already. Thus it can happen that another process
meanwhile reuses $testpid, and that "kill" command kills the wrong
process... Frequently, that happens to be
attach-many-short-lived-thread, but this explains other test's races
as well.
In the attach-many-short-lived-threads test, it sometimes manifests
like this:
(gdb) file /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads
Reading symbols from /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads...done.
(gdb) Loaded /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads into /home/pedro/gdb/mygit/build/gdb/testsuite/../../gdb/gdb
attach 5940
Attaching to program: /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads, process 5940
warning: process 5940 is a zombie - the process has already terminated
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ptrace: Operation not permitted.
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: attach
info threads
No threads.
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: no new threads
set breakpoint always-inserted on
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: set breakpoint always-inserted on
Other times the process dies while the test is ongoing (the process is
ptrace-stopped):
(gdb) print again = 1
Cannot access memory at address 0x6020cc
(gdb) FAIL: gdb.threads/attach-many-short-lived-threads.exp: iter 2: reset timer in the inferior
(Recall that on Linux, SIGKILL is not interceptable)
And other times it dies just while we're detaching:
$4 = 319
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 2: print seconds_left
detach
Can't detach Thread 0x7fb13b7de700 (LWP 1842): No such process
(gdb) FAIL: gdb.threads/attach-many-short-lived-threads.exp: iter 2: detach
GDB mishandles the latter (it should ignore ESRCH while detaching just
like when continuing), but that's another story.
The fix here is to change spawn_wait_for_attach to use Expect's
'spawn' command instead of Tcl's 'exec&' to spawn programs, because
with spawn we control when to wait for/reap the process. That allows
killing the process by PID without being subject to pid-reuse races,
because even if the process is already dead, the kernel won't reuse
the process's PID until the zombie is reaped.
The other part of the problem lies in DejaGnu itself, unfortunately.
I have occasionally seen tests (attach-many-short-lived-threads
included, but not only that one) die with a random inexplicable
SIGTERM too, and that too is caused by the same reason, except that in
that case, the rogue SIGTERM is sent from this bit in DejaGnu's remote.exp:
exec sh -c "exec > /dev/null 2>&1 && (kill -2 $pgid || kill -2 $pid) && sleep 5 && (kill $pgid || kill $pid) && sleep 5 && (kill -9 $pgid || kill -9 $pid) &"
...
catch "wait -i $shell_id"
Even if the program exits promptly, that whole cascade of kills
carries on in the background, thus potentially killing the poor
process that manages to reuse $pid...
I sent a fix for that to the DejaGnu list:
http://lists.gnu.org/archive/html/dejagnu/2015-07/msg00000.html
With both patches in place, I haven't seen
attach-many-short-lived-threads.exp fail again.
Tested on x86_64 Fedora 20, native, gdbserver and extended-gdbserver.
gdb/testsuite/ChangeLog:
2015-07-31 Pedro Alves <palves@redhat.com>
* gdb.base/attach-pie-misread.exp: Rename $res to $test_spawn_id.
Use spawn_id_get_pid. Wait for spawn id after eof. Use
kill_wait_spawned_process instead of explicit "kill -9".
* gdb.base/attach-pie-noexec.exp: Adjust to spawn_wait_for_attach
returning a spawn id instead of a pid. Use spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/attach-twice.exp: Likewise.
* gdb.base/attach.exp: Likewise.
(do_command_attach_tests): Use gdb_spawn_with_cmdline_opts and
gdb_test_multiple.
* gdb.base/solib-overlap.exp: Adjust to spawn_wait_for_attach
returning a spawn id instead of a pid. Use spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/valgrind-infcall.exp: Likewise.
* gdb.multi/multi-attach.exp: Likewise.
* gdb.python/py-prompt.exp: Likewise.
* gdb.python/py-sync-interp.exp: Likewise.
* gdb.server/ext-attach.exp: Likewise.
* gdb.threads/attach-into-signal.exp (corefunc): Use
spawn_wait_for_attach, spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.threads/attach-many-short-lived-threads.exp: Adjust to
spawn_wait_for_attach returning a spawn id instead of a pid. Use
spawn_id_get_pid and kill_wait_spawned_process.
* gdb.threads/attach-stopped.exp (corefunc): Use
spawn_wait_for_attach, spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/break-interp.exp: Rename $res to $test_spawn_id.
Use spawn_id_get_pid. Wait for spawn id after eof. Use
kill_wait_spawned_process instead of explicit "kill -9".
* lib/gdb.exp (can_spawn_for_attach): Adjust comment.
(kill_wait_spawned_process, spawn_id_get_pid): New procedures.
(spawn_wait_for_attach): Use spawn instead of exec to spawn
processes. Don't map cygwin/windows pids here. Now returns a
spawn id list.
2015-07-31 19:06:24 +00:00
|
|
|
set test_spawn_id [spawn_wait_for_attach $binfile]
|
|
|
|
set testpid [spawn_id_get_pid $test_spawn_id]
|
gdb/17347 - Regression: GDB stopped on run with attached process
Doing:
gdb --pid=PID -ex run
Results in GDB getting a SIGTTIN, and thus ending stopped. That's
usually indicative of a missing target_terminal_ours call.
E.g., from the PR:
$ sleep 1h & p=$!; sleep 0.1; gdb -batch sleep $p -ex run
[1] 28263
[1] Killed sleep 1h
[2]+ Stopped gdb -batch sleep $p -ex run
The workaround is doing:
gdb -ex "attach $PID" -ex "run"
instead of
gdb [-p] $PID -ex "run"
With the former, gdb waits for the attach command to complete before
moving on to the "run" command, because the interpreter is in sync
mode at this point, within execute_command. But for the latter,
attach_command is called directly from captured_main, and thus misses
that waiting. IOW, "run" is running before the attach continuation
has run, before the program stops and attach completes. The broken
terminal settings are just one symptom of that. Any command that
queries or requires input results in the same.
The fix is to wait in catch_command_errors (which is specific to
main.c nowadays), just like we wait in execute_command.
gdb/ChangeLog:
2014-09-11 Pedro Alves <palves@redhat.com>
PR gdb/17347
* main.c: Include "infrun.h".
(catch_command_errors, catch_command_errors_const): Wait for the
foreground command to complete.
* top.c (maybe_wait_sync_command_done): New function, factored out
from ...
(maybe_wait_sync_command_done): ... here.
* top.h (maybe_wait_sync_command_done): New declaration.
gdb/testsuite/ChangeLog:
2014-09-11 Pedro Alves <palves@redhat.com>
PR gdb/17347
* lib/gdb.exp (gdb_spawn_with_cmdline_opts): New procedure.
* gdb.base/attach.exp (test_command_line_attach_run): New
procedure.
(top level): Call it.
2014-09-11 12:04:15 +00:00
|
|
|
|
|
|
|
set test "run to prompt"
|
|
|
|
gdb_exit
|
|
|
|
|
|
|
|
set res [gdb_spawn_with_cmdline_opts \
|
|
|
|
"-iex \"set height 0\" -iex \"set width 0\" --pid=$testpid -ex \"start\""]
|
|
|
|
if { $res != 0} {
|
|
|
|
fail $test
|
testsuite: tcl exec& -> 'kill -9 $pid' is racy (attach-many-short-lived-thread.exp races and others)
The buildbots show that attach-many-short-lived-thread.exp is racy.
But after staring at debug logs and playing with SystemTap scripts for
a (long) while, I figured out that neither GDB, nor the kernel nor the
test's program itself are at fault.
The problem is simply that the testsuite machinery is currently
subject to PID-reuse races. The attach-many-short-lived-threads.c
test program just happens to be much more susceptible to trigger this
race because threads and processes share the same number space on
Linux, and the test spawns many many short lived threads in
succession, thus enlarging the race window a lot.
Part of the problem is that several tests spawn processes with "exec&"
(in order to test the "attach" command) , and then at the end of the
test, to make sure things are cleaned up, issue a 'remote_spawn "kill
-p $testpid"'. Since with tcl's "exec&", tcl itself is responsible
for reaping the process's exit status, when we go kill the process,
testpid may have already exited _and_ its status may have (and often
has) been reaped already. Thus it can happen that another process
meanwhile reuses $testpid, and that "kill" command kills the wrong
process... Frequently, that happens to be
attach-many-short-lived-thread, but this explains other test's races
as well.
In the attach-many-short-lived-threads test, it sometimes manifests
like this:
(gdb) file /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads
Reading symbols from /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads...done.
(gdb) Loaded /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads into /home/pedro/gdb/mygit/build/gdb/testsuite/../../gdb/gdb
attach 5940
Attaching to program: /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads, process 5940
warning: process 5940 is a zombie - the process has already terminated
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ptrace: Operation not permitted.
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: attach
info threads
No threads.
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: no new threads
set breakpoint always-inserted on
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: set breakpoint always-inserted on
Other times the process dies while the test is ongoing (the process is
ptrace-stopped):
(gdb) print again = 1
Cannot access memory at address 0x6020cc
(gdb) FAIL: gdb.threads/attach-many-short-lived-threads.exp: iter 2: reset timer in the inferior
(Recall that on Linux, SIGKILL is not interceptable)
And other times it dies just while we're detaching:
$4 = 319
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 2: print seconds_left
detach
Can't detach Thread 0x7fb13b7de700 (LWP 1842): No such process
(gdb) FAIL: gdb.threads/attach-many-short-lived-threads.exp: iter 2: detach
GDB mishandles the latter (it should ignore ESRCH while detaching just
like when continuing), but that's another story.
The fix here is to change spawn_wait_for_attach to use Expect's
'spawn' command instead of Tcl's 'exec&' to spawn programs, because
with spawn we control when to wait for/reap the process. That allows
killing the process by PID without being subject to pid-reuse races,
because even if the process is already dead, the kernel won't reuse
the process's PID until the zombie is reaped.
The other part of the problem lies in DejaGnu itself, unfortunately.
I have occasionally seen tests (attach-many-short-lived-threads
included, but not only that one) die with a random inexplicable
SIGTERM too, and that too is caused by the same reason, except that in
that case, the rogue SIGTERM is sent from this bit in DejaGnu's remote.exp:
exec sh -c "exec > /dev/null 2>&1 && (kill -2 $pgid || kill -2 $pid) && sleep 5 && (kill $pgid || kill $pid) && sleep 5 && (kill -9 $pgid || kill -9 $pid) &"
...
catch "wait -i $shell_id"
Even if the program exits promptly, that whole cascade of kills
carries on in the background, thus potentially killing the poor
process that manages to reuse $pid...
I sent a fix for that to the DejaGnu list:
http://lists.gnu.org/archive/html/dejagnu/2015-07/msg00000.html
With both patches in place, I haven't seen
attach-many-short-lived-threads.exp fail again.
Tested on x86_64 Fedora 20, native, gdbserver and extended-gdbserver.
gdb/testsuite/ChangeLog:
2015-07-31 Pedro Alves <palves@redhat.com>
* gdb.base/attach-pie-misread.exp: Rename $res to $test_spawn_id.
Use spawn_id_get_pid. Wait for spawn id after eof. Use
kill_wait_spawned_process instead of explicit "kill -9".
* gdb.base/attach-pie-noexec.exp: Adjust to spawn_wait_for_attach
returning a spawn id instead of a pid. Use spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/attach-twice.exp: Likewise.
* gdb.base/attach.exp: Likewise.
(do_command_attach_tests): Use gdb_spawn_with_cmdline_opts and
gdb_test_multiple.
* gdb.base/solib-overlap.exp: Adjust to spawn_wait_for_attach
returning a spawn id instead of a pid. Use spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/valgrind-infcall.exp: Likewise.
* gdb.multi/multi-attach.exp: Likewise.
* gdb.python/py-prompt.exp: Likewise.
* gdb.python/py-sync-interp.exp: Likewise.
* gdb.server/ext-attach.exp: Likewise.
* gdb.threads/attach-into-signal.exp (corefunc): Use
spawn_wait_for_attach, spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.threads/attach-many-short-lived-threads.exp: Adjust to
spawn_wait_for_attach returning a spawn id instead of a pid. Use
spawn_id_get_pid and kill_wait_spawned_process.
* gdb.threads/attach-stopped.exp (corefunc): Use
spawn_wait_for_attach, spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/break-interp.exp: Rename $res to $test_spawn_id.
Use spawn_id_get_pid. Wait for spawn id after eof. Use
kill_wait_spawned_process instead of explicit "kill -9".
* lib/gdb.exp (can_spawn_for_attach): Adjust comment.
(kill_wait_spawned_process, spawn_id_get_pid): New procedures.
(spawn_wait_for_attach): Use spawn instead of exec to spawn
processes. Don't map cygwin/windows pids here. Now returns a
spawn id list.
2015-07-31 19:06:24 +00:00
|
|
|
kill_wait_spawned_process $test_spawn_id
|
gdb/17347 - Regression: GDB stopped on run with attached process
Doing:
gdb --pid=PID -ex run
Results in GDB getting a SIGTTIN, and thus ending stopped. That's
usually indicative of a missing target_terminal_ours call.
E.g., from the PR:
$ sleep 1h & p=$!; sleep 0.1; gdb -batch sleep $p -ex run
[1] 28263
[1] Killed sleep 1h
[2]+ Stopped gdb -batch sleep $p -ex run
The workaround is doing:
gdb -ex "attach $PID" -ex "run"
instead of
gdb [-p] $PID -ex "run"
With the former, gdb waits for the attach command to complete before
moving on to the "run" command, because the interpreter is in sync
mode at this point, within execute_command. But for the latter,
attach_command is called directly from captured_main, and thus misses
that waiting. IOW, "run" is running before the attach continuation
has run, before the program stops and attach completes. The broken
terminal settings are just one symptom of that. Any command that
queries or requires input results in the same.
The fix is to wait in catch_command_errors (which is specific to
main.c nowadays), just like we wait in execute_command.
gdb/ChangeLog:
2014-09-11 Pedro Alves <palves@redhat.com>
PR gdb/17347
* main.c: Include "infrun.h".
(catch_command_errors, catch_command_errors_const): Wait for the
foreground command to complete.
* top.c (maybe_wait_sync_command_done): New function, factored out
from ...
(maybe_wait_sync_command_done): ... here.
* top.h (maybe_wait_sync_command_done): New declaration.
gdb/testsuite/ChangeLog:
2014-09-11 Pedro Alves <palves@redhat.com>
PR gdb/17347
* lib/gdb.exp (gdb_spawn_with_cmdline_opts): New procedure.
* gdb.base/attach.exp (test_command_line_attach_run): New
procedure.
(top level): Call it.
2014-09-11 12:04:15 +00:00
|
|
|
return $res
|
|
|
|
}
|
|
|
|
gdb_test_multiple "" $test {
|
|
|
|
-re {Attaching to.*Start it from the beginning\? \(y or n\) } {
|
|
|
|
pass $test
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
send_gdb "y\n"
|
|
|
|
|
|
|
|
set test "run to main"
|
|
|
|
gdb_test_multiple "" $test {
|
|
|
|
-re "Temporary breakpoint .* main .*$gdb_prompt $" {
|
|
|
|
pass $test
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
# Get rid of the process
|
testsuite: tcl exec& -> 'kill -9 $pid' is racy (attach-many-short-lived-thread.exp races and others)
The buildbots show that attach-many-short-lived-thread.exp is racy.
But after staring at debug logs and playing with SystemTap scripts for
a (long) while, I figured out that neither GDB, nor the kernel nor the
test's program itself are at fault.
The problem is simply that the testsuite machinery is currently
subject to PID-reuse races. The attach-many-short-lived-threads.c
test program just happens to be much more susceptible to trigger this
race because threads and processes share the same number space on
Linux, and the test spawns many many short lived threads in
succession, thus enlarging the race window a lot.
Part of the problem is that several tests spawn processes with "exec&"
(in order to test the "attach" command) , and then at the end of the
test, to make sure things are cleaned up, issue a 'remote_spawn "kill
-p $testpid"'. Since with tcl's "exec&", tcl itself is responsible
for reaping the process's exit status, when we go kill the process,
testpid may have already exited _and_ its status may have (and often
has) been reaped already. Thus it can happen that another process
meanwhile reuses $testpid, and that "kill" command kills the wrong
process... Frequently, that happens to be
attach-many-short-lived-thread, but this explains other test's races
as well.
In the attach-many-short-lived-threads test, it sometimes manifests
like this:
(gdb) file /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads
Reading symbols from /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads...done.
(gdb) Loaded /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads into /home/pedro/gdb/mygit/build/gdb/testsuite/../../gdb/gdb
attach 5940
Attaching to program: /home/pedro/gdb/mygit/build/gdb/testsuite/gdb.threads/attach-many-short-lived-threads, process 5940
warning: process 5940 is a zombie - the process has already terminated
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ptrace: Operation not permitted.
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: attach
info threads
No threads.
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: no new threads
set breakpoint always-inserted on
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 1: set breakpoint always-inserted on
Other times the process dies while the test is ongoing (the process is
ptrace-stopped):
(gdb) print again = 1
Cannot access memory at address 0x6020cc
(gdb) FAIL: gdb.threads/attach-many-short-lived-threads.exp: iter 2: reset timer in the inferior
(Recall that on Linux, SIGKILL is not interceptable)
And other times it dies just while we're detaching:
$4 = 319
(gdb) PASS: gdb.threads/attach-many-short-lived-threads.exp: iter 2: print seconds_left
detach
Can't detach Thread 0x7fb13b7de700 (LWP 1842): No such process
(gdb) FAIL: gdb.threads/attach-many-short-lived-threads.exp: iter 2: detach
GDB mishandles the latter (it should ignore ESRCH while detaching just
like when continuing), but that's another story.
The fix here is to change spawn_wait_for_attach to use Expect's
'spawn' command instead of Tcl's 'exec&' to spawn programs, because
with spawn we control when to wait for/reap the process. That allows
killing the process by PID without being subject to pid-reuse races,
because even if the process is already dead, the kernel won't reuse
the process's PID until the zombie is reaped.
The other part of the problem lies in DejaGnu itself, unfortunately.
I have occasionally seen tests (attach-many-short-lived-threads
included, but not only that one) die with a random inexplicable
SIGTERM too, and that too is caused by the same reason, except that in
that case, the rogue SIGTERM is sent from this bit in DejaGnu's remote.exp:
exec sh -c "exec > /dev/null 2>&1 && (kill -2 $pgid || kill -2 $pid) && sleep 5 && (kill $pgid || kill $pid) && sleep 5 && (kill -9 $pgid || kill -9 $pid) &"
...
catch "wait -i $shell_id"
Even if the program exits promptly, that whole cascade of kills
carries on in the background, thus potentially killing the poor
process that manages to reuse $pid...
I sent a fix for that to the DejaGnu list:
http://lists.gnu.org/archive/html/dejagnu/2015-07/msg00000.html
With both patches in place, I haven't seen
attach-many-short-lived-threads.exp fail again.
Tested on x86_64 Fedora 20, native, gdbserver and extended-gdbserver.
gdb/testsuite/ChangeLog:
2015-07-31 Pedro Alves <palves@redhat.com>
* gdb.base/attach-pie-misread.exp: Rename $res to $test_spawn_id.
Use spawn_id_get_pid. Wait for spawn id after eof. Use
kill_wait_spawned_process instead of explicit "kill -9".
* gdb.base/attach-pie-noexec.exp: Adjust to spawn_wait_for_attach
returning a spawn id instead of a pid. Use spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/attach-twice.exp: Likewise.
* gdb.base/attach.exp: Likewise.
(do_command_attach_tests): Use gdb_spawn_with_cmdline_opts and
gdb_test_multiple.
* gdb.base/solib-overlap.exp: Adjust to spawn_wait_for_attach
returning a spawn id instead of a pid. Use spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/valgrind-infcall.exp: Likewise.
* gdb.multi/multi-attach.exp: Likewise.
* gdb.python/py-prompt.exp: Likewise.
* gdb.python/py-sync-interp.exp: Likewise.
* gdb.server/ext-attach.exp: Likewise.
* gdb.threads/attach-into-signal.exp (corefunc): Use
spawn_wait_for_attach, spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.threads/attach-many-short-lived-threads.exp: Adjust to
spawn_wait_for_attach returning a spawn id instead of a pid. Use
spawn_id_get_pid and kill_wait_spawned_process.
* gdb.threads/attach-stopped.exp (corefunc): Use
spawn_wait_for_attach, spawn_id_get_pid and
kill_wait_spawned_process.
* gdb.base/break-interp.exp: Rename $res to $test_spawn_id.
Use spawn_id_get_pid. Wait for spawn id after eof. Use
kill_wait_spawned_process instead of explicit "kill -9".
* lib/gdb.exp (can_spawn_for_attach): Adjust comment.
(kill_wait_spawned_process, spawn_id_get_pid): New procedures.
(spawn_wait_for_attach): Use spawn instead of exec to spawn
processes. Don't map cygwin/windows pids here. Now returns a
spawn id list.
2015-07-31 19:06:24 +00:00
|
|
|
kill_wait_spawned_process $test_spawn_id
|
gdb/17347 - Regression: GDB stopped on run with attached process
Doing:
gdb --pid=PID -ex run
Results in GDB getting a SIGTTIN, and thus ending stopped. That's
usually indicative of a missing target_terminal_ours call.
E.g., from the PR:
$ sleep 1h & p=$!; sleep 0.1; gdb -batch sleep $p -ex run
[1] 28263
[1] Killed sleep 1h
[2]+ Stopped gdb -batch sleep $p -ex run
The workaround is doing:
gdb -ex "attach $PID" -ex "run"
instead of
gdb [-p] $PID -ex "run"
With the former, gdb waits for the attach command to complete before
moving on to the "run" command, because the interpreter is in sync
mode at this point, within execute_command. But for the latter,
attach_command is called directly from captured_main, and thus misses
that waiting. IOW, "run" is running before the attach continuation
has run, before the program stops and attach completes. The broken
terminal settings are just one symptom of that. Any command that
queries or requires input results in the same.
The fix is to wait in catch_command_errors (which is specific to
main.c nowadays), just like we wait in execute_command.
gdb/ChangeLog:
2014-09-11 Pedro Alves <palves@redhat.com>
PR gdb/17347
* main.c: Include "infrun.h".
(catch_command_errors, catch_command_errors_const): Wait for the
foreground command to complete.
* top.c (maybe_wait_sync_command_done): New function, factored out
from ...
(maybe_wait_sync_command_done): ... here.
* top.h (maybe_wait_sync_command_done): New declaration.
gdb/testsuite/ChangeLog:
2014-09-11 Pedro Alves <palves@redhat.com>
PR gdb/17347
* lib/gdb.exp (gdb_spawn_with_cmdline_opts): New procedure.
* gdb.base/attach.exp (test_command_line_attach_run): New
procedure.
(top level): Call it.
2014-09-11 12:04:15 +00:00
|
|
|
}
|
|
|
|
}
|
1999-06-28 23:04:32 +00:00
|
|
|
|
|
|
|
# Start with a fresh gdb
|
2004-07-08 20:04:47 +00:00
|
|
|
|
1999-06-28 23:04:32 +00:00
|
|
|
gdb_exit
|
|
|
|
gdb_start
|
|
|
|
gdb_reinitialize_dir $srcdir/$subdir
|
|
|
|
gdb_load ${binfile}
|
|
|
|
|
|
|
|
# This is a test of gdb's ability to attach to a running process.
|
2004-07-08 20:04:47 +00:00
|
|
|
|
1999-06-28 23:04:32 +00:00
|
|
|
do_attach_tests
|
|
|
|
|
|
|
|
# Test attaching when the target is inside a system call
|
2004-07-08 20:04:47 +00:00
|
|
|
|
1999-06-28 23:04:32 +00:00
|
|
|
gdb_exit
|
|
|
|
gdb_start
|
|
|
|
|
|
|
|
gdb_reinitialize_dir $srcdir/$subdir
|
|
|
|
do_call_attach_tests
|
|
|
|
|
2014-03-21 03:11:51 +00:00
|
|
|
# Test "gdb --pid"
|
|
|
|
|
|
|
|
do_command_attach_tests
|
|
|
|
|
gdb/17347 - Regression: GDB stopped on run with attached process
Doing:
gdb --pid=PID -ex run
Results in GDB getting a SIGTTIN, and thus ending stopped. That's
usually indicative of a missing target_terminal_ours call.
E.g., from the PR:
$ sleep 1h & p=$!; sleep 0.1; gdb -batch sleep $p -ex run
[1] 28263
[1] Killed sleep 1h
[2]+ Stopped gdb -batch sleep $p -ex run
The workaround is doing:
gdb -ex "attach $PID" -ex "run"
instead of
gdb [-p] $PID -ex "run"
With the former, gdb waits for the attach command to complete before
moving on to the "run" command, because the interpreter is in sync
mode at this point, within execute_command. But for the latter,
attach_command is called directly from captured_main, and thus misses
that waiting. IOW, "run" is running before the attach continuation
has run, before the program stops and attach completes. The broken
terminal settings are just one symptom of that. Any command that
queries or requires input results in the same.
The fix is to wait in catch_command_errors (which is specific to
main.c nowadays), just like we wait in execute_command.
gdb/ChangeLog:
2014-09-11 Pedro Alves <palves@redhat.com>
PR gdb/17347
* main.c: Include "infrun.h".
(catch_command_errors, catch_command_errors_const): Wait for the
foreground command to complete.
* top.c (maybe_wait_sync_command_done): New function, factored out
from ...
(maybe_wait_sync_command_done): ... here.
* top.h (maybe_wait_sync_command_done): New declaration.
gdb/testsuite/ChangeLog:
2014-09-11 Pedro Alves <palves@redhat.com>
PR gdb/17347
* lib/gdb.exp (gdb_spawn_with_cmdline_opts): New procedure.
* gdb.base/attach.exp (test_command_line_attach_run): New
procedure.
(top level): Call it.
2014-09-11 12:04:15 +00:00
|
|
|
test_command_line_attach_run
|
|
|
|
|
1999-06-28 23:04:32 +00:00
|
|
|
return 0
|