Richard Hirst
2007-01-31 14:44:24 UTC
Hi,
We have experienced hanging threads using omniORB 4.0.5 when the
remote system crashes. The stack backtrace looks like this:
#0 0xffffe410 in __kernel_vsyscall ()
#1 0xb7760550 in poll () from /lib/libc.so.6
#2 0xb7bbde19 in omni::sslAddress::Connect () from /opt/levanta/lib/libomnisslTP.so.4
#3 0xb7c9ffd6 in omni::giopStream::sendChunk () from /opt/levanta/lib/libomniORB4.so.0
#4 0xb7cb439b in omni::giopImpl12::outputMessageEnd () from /opt/levanta/lib/libomniORB4.so.0
#5 0xb7ca5bd3 in omni::GIOP_C::InitialiseRequest () from /opt/levanta/lib/libomniORB4.so.0
...
...
and it turns out poll() is sometimes called with an infinite
timeout when it should not be.
The problem appears to exist in 4.1.0 too, from looking at the
source, and is due to code like this:
static inline int waitRead(SocketHandle_t sock, struct timeval& t)
{
int rc;
#if defined(USE_POLL)
struct pollfd fds;
fds.fd = sock;
fds.events = POLLIN;
int timeout = t.tv_sec*1000+(t.tv_usec/1000);
if (timeout == 0) timeout = -1;
rc = poll(&fds,1,timeout);
if (rc > 0 && fds.revents & POLLERR) {
rc = 0;
}
#else
which, if called with a timeout of less than 1ms, will round down
to timeout=0. The next line treats timeout=0 as "no timeout" and
changes it to -1, so poll() blocks indefinitely. The calling code
starts with a timeout much larger than 1ms, but it can loop round
and call waitRead() again with whatever time remains, so waitRead()
does occasionally get called with values of less than 1ms.
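I found three places with similar code, and an untested patch against
the 4.1.0 release is attached. The patch rounds any fractional
millisecond up to the next whole millisecond, rather than rounding
down, so only a timeout of exactly zero is still treated as "no
timeout".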
Richard
-------------- next part --------------
diff -ur omniORB-4.1.0.ori/src/lib/omniORB/orbcore/ssl/sslAddress.cc omniORB-4.1.0/src/lib/omniORB/orbcore/ssl/sslAddress.cc
--- omniORB-4.1.0.ori/src/lib/omniORB/orbcore/ssl/sslAddress.cc 2006-10-09 14:08:58.000000000 +0100
+++ omniORB-4.1.0/src/lib/omniORB/orbcore/ssl/sslAddress.cc 2007-01-31 08:22:09.000000000 +0000
@@ -181,7 +181,7 @@
struct pollfd fds;
fds.fd = sock;
fds.events = POLLOUT;
- int timeout = t.tv_sec*1000+(t.tv_usec/1000);
+ int timeout = t.tv_sec*1000+((t.tv_usec+999)/1000);
if (timeout == 0) timeout = -1;
rc = poll(&fds,1,timeout);
if (rc > 0 && fds.revents & POLLERR) {
@@ -208,7 +208,7 @@
struct pollfd fds;
fds.fd = sock;
fds.events = POLLIN;
- int timeout = t.tv_sec*1000+(t.tv_usec/1000);
+ int timeout = t.tv_sec*1000+((t.tv_usec+999)/1000);
if (timeout == 0) timeout = -1;
rc = poll(&fds,1,timeout);
if (rc > 0 && fds.revents & POLLERR) {
Only in omniORB-4.1.0/src/lib/omniORB/orbcore/ssl: .sslAddress.cc.swp
diff -ur omniORB-4.1.0.ori/src/lib/omniORB/orbcore/tcp/tcpAddress.cc omniORB-4.1.0/src/lib/omniORB/orbcore/tcp/tcpAddress.cc
--- omniORB-4.1.0.ori/src/lib/omniORB/orbcore/tcp/tcpAddress.cc 2006-10-09 14:08:58.000000000 +0100
+++ omniORB-4.1.0/src/lib/omniORB/orbcore/tcp/tcpAddress.cc 2007-01-31 08:23:35.000000000 +0000
@@ -226,7 +226,7 @@
struct pollfd fds;
fds.fd = sock;
fds.events = POLLOUT;
- int timeout = t.tv_sec*1000+(t.tv_usec/1000);
+ int timeout = t.tv_sec*1000+((t.tv_usec+999)/1000);
if (timeout == 0) timeout = -1;
int rc = poll(&fds,1,timeout);
if (rc > 0 && fds.revents & POLLERR) {
We have experienced hanging threads using omniORB 4.0.5 when the
remote system crashes. The stack backtrace looks like this:
#0 0xffffe410 in __kernel_vsyscall ()
#1 0xb7760550 in poll () from /lib/libc.so.6
#2 0xb7bbde19 in omni::sslAddress::Connect () from /opt/levanta/lib/libomnisslTP.so.4
#3 0xb7c9ffd6 in omni::giopStream::sendChunk () from /opt/levanta/lib/libomniORB4.so.0
#4 0xb7cb439b in omni::giopImpl12::outputMessageEnd () from /opt/levanta/lib/libomniORB4.so.0
#5 0xb7ca5bd3 in omni::GIOP_C::InitialiseRequest () from /opt/levanta/lib/libomniORB4.so.0
...
...
and it turns out poll() is sometimes called with an infinite
timeout when it should not be.
The problem appears to exist in 4.1.0 too, from looking at the
source, and is due to code like this:
static inline int waitRead(SocketHandle_t sock, struct timeval& t)
{
int rc;
#if defined(USE_POLL)
struct pollfd fds;
fds.fd = sock;
fds.events = POLLIN;
int timeout = t.tv_sec*1000+(t.tv_usec/1000);
if (timeout == 0) timeout = -1;
rc = poll(&fds,1,timeout);
if (rc > 0 && fds.revents & POLLERR) {
rc = 0;
}
#else
which, if called with a timeout of less than 1ms, will round down
to timeout=0. The next line treats timeout=0 as "no timeout" and
changes it to -1, so poll() blocks indefinitely. The calling code
starts with a timeout much larger than 1ms, but it can loop round
and call waitRead() again with whatever time remains, so waitRead()
does occasionally get called with values of less than 1ms.
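I found three places with similar code, and an untested patch against
the 4.1.0 release is attached. The patch rounds any fractional
millisecond up to the next whole millisecond, rather than rounding
down, so only a timeout of exactly zero is still treated as "no
timeout".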
Richard
-------------- next part --------------
diff -ur omniORB-4.1.0.ori/src/lib/omniORB/orbcore/ssl/sslAddress.cc omniORB-4.1.0/src/lib/omniORB/orbcore/ssl/sslAddress.cc
--- omniORB-4.1.0.ori/src/lib/omniORB/orbcore/ssl/sslAddress.cc 2006-10-09 14:08:58.000000000 +0100
+++ omniORB-4.1.0/src/lib/omniORB/orbcore/ssl/sslAddress.cc 2007-01-31 08:22:09.000000000 +0000
@@ -181,7 +181,7 @@
struct pollfd fds;
fds.fd = sock;
fds.events = POLLOUT;
- int timeout = t.tv_sec*1000+(t.tv_usec/1000);
+ int timeout = t.tv_sec*1000+((t.tv_usec+999)/1000);
if (timeout == 0) timeout = -1;
rc = poll(&fds,1,timeout);
if (rc > 0 && fds.revents & POLLERR) {
@@ -208,7 +208,7 @@
struct pollfd fds;
fds.fd = sock;
fds.events = POLLIN;
- int timeout = t.tv_sec*1000+(t.tv_usec/1000);
+ int timeout = t.tv_sec*1000+((t.tv_usec+999)/1000);
if (timeout == 0) timeout = -1;
rc = poll(&fds,1,timeout);
if (rc > 0 && fds.revents & POLLERR) {
Only in omniORB-4.1.0/src/lib/omniORB/orbcore/ssl: .sslAddress.cc.swp
diff -ur omniORB-4.1.0.ori/src/lib/omniORB/orbcore/tcp/tcpAddress.cc omniORB-4.1.0/src/lib/omniORB/orbcore/tcp/tcpAddress.cc
--- omniORB-4.1.0.ori/src/lib/omniORB/orbcore/tcp/tcpAddress.cc 2006-10-09 14:08:58.000000000 +0100
+++ omniORB-4.1.0/src/lib/omniORB/orbcore/tcp/tcpAddress.cc 2007-01-31 08:23:35.000000000 +0000
@@ -226,7 +226,7 @@
struct pollfd fds;
fds.fd = sock;
fds.events = POLLOUT;
- int timeout = t.tv_sec*1000+(t.tv_usec/1000);
+ int timeout = t.tv_sec*1000+((t.tv_usec+999)/1000);
if (timeout == 0) timeout = -1;
int rc = poll(&fds,1,timeout);
if (rc > 0 && fds.revents & POLLERR) {