Implement Configurable TCP Keepalive Settings in PJSIP Transports

This commit introduces configurable TCP keepalive settings for both TCP and TLS transports. The changes allow finer control over TCP connection keepalives, improving stability and reliability in environments prone to connection timeouts or where intermediate devices may prematurely close idle connections. The change has already been tested in production in several specialized environments where the underlying transport is unreliable in ways that the operating system cannot detect directly, which is why these keepalive and timeout mechanisms are needed.

Fixes #657
This commit is contained in:
Joshua Elson
2024-03-18 15:14:36 -04:00
parent f3de77f91f
commit 3d40d34271
6 changed files with 184 additions and 10 deletions

View File

@@ -828,17 +828,55 @@ static int transport_apply(const struct ast_sorcery *sorcery, void *obj)
} else if (transport->type == AST_TRANSPORT_TCP) {
pjsip_tcp_transport_cfg cfg;
static int option = 1;
int sockopt_count = 0;
pjsip_tcp_transport_cfg_default(&cfg, temp_state->state->host.addr.sa_family);
cfg.bind_addr = temp_state->state->host;
cfg.async_cnt = transport->async_operations;
set_qos(transport, &cfg.qos_params);
/* sockopt_params.options is copied to each newly connected socket */
cfg.sockopt_params.options[0].level = pj_SOL_TCP();
cfg.sockopt_params.options[0].optname = pj_TCP_NODELAY();
cfg.sockopt_params.options[0].optval = &option;
cfg.sockopt_params.options[0].optlen = sizeof(option);
cfg.sockopt_params.cnt = 1;
cfg.sockopt_params.options[sockopt_count].level = pj_SOL_TCP();
cfg.sockopt_params.options[sockopt_count].optname = pj_TCP_NODELAY();
cfg.sockopt_params.options[sockopt_count].optval = &option;
cfg.sockopt_params.options[sockopt_count].optlen = sizeof(option);
sockopt_count++;
if (transport->tcp_keepalive_enable) {
#if defined(PJ_MAX_SOCKOPT_PARAMS) && PJ_MAX_SOCKOPT_PARAMS >= 5
ast_log(LOG_DEBUG, "TCP Keepalive enabled for transport '%s'. Idle Time: %d, Interval: %d, Count: %d\n",
ast_sorcery_object_get_id(obj), transport->tcp_keepalive_idle_time, transport->tcp_keepalive_interval_time, transport->tcp_keepalive_probe_count);
cfg.sockopt_params.options[sockopt_count].level = pj_SOL_SOCKET();
cfg.sockopt_params.options[sockopt_count].optname = SO_KEEPALIVE;
cfg.sockopt_params.options[sockopt_count].optval = &option;
cfg.sockopt_params.options[sockopt_count].optlen = sizeof(option);
sockopt_count++;
cfg.sockopt_params.options[sockopt_count].level = pj_SOL_TCP();
cfg.sockopt_params.options[sockopt_count].optname = TCP_KEEPIDLE;
cfg.sockopt_params.options[sockopt_count].optval = &transport->tcp_keepalive_idle_time;
cfg.sockopt_params.options[sockopt_count].optlen = sizeof(transport->tcp_keepalive_idle_time);
sockopt_count++;
cfg.sockopt_params.options[sockopt_count].level = pj_SOL_TCP();
cfg.sockopt_params.options[sockopt_count].optname = TCP_KEEPINTVL;
cfg.sockopt_params.options[sockopt_count].optval = &transport->tcp_keepalive_interval_time;
cfg.sockopt_params.options[sockopt_count].optlen = sizeof(transport->tcp_keepalive_interval_time);
sockopt_count++;
cfg.sockopt_params.options[sockopt_count].level = pj_SOL_TCP();
cfg.sockopt_params.options[sockopt_count].optname = TCP_KEEPCNT;
cfg.sockopt_params.options[sockopt_count].optval = &transport->tcp_keepalive_probe_count;
cfg.sockopt_params.options[sockopt_count].optlen = sizeof(transport->tcp_keepalive_probe_count);
sockopt_count++;
#else
ast_log(LOG_WARNING, "TCP keepalive settings for '%s' not set due to PJSIP built without support for setting all options. Consider using bundled PJSIP.\n",
ast_sorcery_object_get_id(obj));
#endif
}
cfg.sockopt_params.cnt = sockopt_count;
for (i = 0; i < BIND_TRIES && res != PJ_SUCCESS; i++) {
if (perm_state && perm_state->state && perm_state->state->factory
@@ -853,6 +891,7 @@ static int transport_apply(const struct ast_sorcery *sorcery, void *obj)
} else if (transport->type == AST_TRANSPORT_TLS) {
#if defined(PJ_HAS_SSL_SOCK) && PJ_HAS_SSL_SOCK != 0
static int option = 1;
int sockopt_count = 0;
if (transport->async_operations > 1 && ast_compare_versions(pj_get_version(), "2.5.0") < 0) {
ast_log(LOG_ERROR, "Transport: %s: When protocol=tls and pjproject version < 2.5.0, async_operations can't be > 1\n",
@@ -864,11 +903,47 @@ static int transport_apply(const struct ast_sorcery *sorcery, void *obj)
set_qos(transport, &temp_state->state->tls.qos_params);
/* sockopt_params.options is copied to each newly connected socket */
temp_state->state->tls.sockopt_params.options[0].level = pj_SOL_TCP();
temp_state->state->tls.sockopt_params.options[0].optname = pj_TCP_NODELAY();
temp_state->state->tls.sockopt_params.options[0].optval = &option;
temp_state->state->tls.sockopt_params.options[0].optlen = sizeof(option);
temp_state->state->tls.sockopt_params.cnt = 1;
temp_state->state->tls.sockopt_params.options[sockopt_count].level = pj_SOL_TCP();
temp_state->state->tls.sockopt_params.options[sockopt_count].optname = pj_TCP_NODELAY();
temp_state->state->tls.sockopt_params.options[sockopt_count].optval = &option;
temp_state->state->tls.sockopt_params.options[sockopt_count].optlen = sizeof(option);
sockopt_count++;
if (transport->tcp_keepalive_enable) {
#if defined(PJ_MAX_SOCKOPT_PARAMS) && PJ_MAX_SOCKOPT_PARAMS >= 5
ast_log(LOG_DEBUG, "TCP Keepalive enabled for transport '%s'. Idle Time: %d, Interval: %d, Count: %d\n",
ast_sorcery_object_get_id(obj), transport->tcp_keepalive_idle_time, transport->tcp_keepalive_interval_time, transport->tcp_keepalive_probe_count);
temp_state->state->tls.sockopt_params.options[sockopt_count].level = pj_SOL_SOCKET();
temp_state->state->tls.sockopt_params.options[sockopt_count].optname = SO_KEEPALIVE;
temp_state->state->tls.sockopt_params.options[sockopt_count].optval = &option;
temp_state->state->tls.sockopt_params.options[sockopt_count].optlen = sizeof(option);
sockopt_count++;
temp_state->state->tls.sockopt_params.options[sockopt_count].level = pj_SOL_TCP();
temp_state->state->tls.sockopt_params.options[sockopt_count].optname = TCP_KEEPIDLE;
temp_state->state->tls.sockopt_params.options[sockopt_count].optval = &transport->tcp_keepalive_idle_time;
temp_state->state->tls.sockopt_params.options[sockopt_count].optlen = sizeof(transport->tcp_keepalive_idle_time);
sockopt_count++;
temp_state->state->tls.sockopt_params.options[sockopt_count].level = pj_SOL_TCP();
temp_state->state->tls.sockopt_params.options[sockopt_count].optname = TCP_KEEPINTVL;
temp_state->state->tls.sockopt_params.options[sockopt_count].optval = &transport->tcp_keepalive_interval_time;
temp_state->state->tls.sockopt_params.options[sockopt_count].optlen = sizeof(transport->tcp_keepalive_interval_time);
sockopt_count++;
temp_state->state->tls.sockopt_params.options[sockopt_count].level = pj_SOL_TCP();
temp_state->state->tls.sockopt_params.options[sockopt_count].optname = TCP_KEEPCNT;
temp_state->state->tls.sockopt_params.options[sockopt_count].optval = &transport->tcp_keepalive_probe_count;
temp_state->state->tls.sockopt_params.options[sockopt_count].optlen = sizeof(transport->tcp_keepalive_probe_count);
sockopt_count++;
#else
ast_log(LOG_WARNING, "TCP keepalive settings for '%s' not set due to PJSIP built without support for setting all options. Consider using bundled PJSIP.\n",
ast_sorcery_object_get_id(obj));
#endif
}
temp_state->state->tls.sockopt_params.cnt = sockopt_count;
for (i = 0; i < BIND_TRIES && res != PJ_SUCCESS; i++) {
if (perm_state && perm_state->state && perm_state->state->factory
@@ -1760,6 +1835,10 @@ int ast_sip_initialize_sorcery_transport(void)
ast_sorcery_object_field_register_custom(sorcery, "transport", "require_client_cert", "", transport_tls_bool_handler, require_client_cert_to_str, NULL, 0, 0);
ast_sorcery_object_field_register_custom(sorcery, "transport", "allow_wildcard_certs", "", transport_tls_bool_handler, allow_wildcard_certs_to_str, NULL, 0, 0);
ast_sorcery_object_field_register_custom(sorcery, "transport", "method", "", transport_tls_method_handler, tls_method_to_str, NULL, 0, 0);
ast_sorcery_object_field_register(sorcery, "transport", "tcp_keepalive_enable", "no", OPT_BOOL_T, 0, FLDSET(struct ast_sip_transport, tcp_keepalive_enable));
ast_sorcery_object_field_register(sorcery, "transport", "tcp_keepalive_idle_time", "30", OPT_INT_T, 0, FLDSET(struct ast_sip_transport, tcp_keepalive_idle_time));
ast_sorcery_object_field_register(sorcery, "transport", "tcp_keepalive_interval_time", "1", OPT_INT_T, 0, FLDSET(struct ast_sip_transport, tcp_keepalive_interval_time));
ast_sorcery_object_field_register(sorcery, "transport", "tcp_keepalive_probe_count", "5", OPT_INT_T, 0, FLDSET(struct ast_sip_transport, tcp_keepalive_probe_count));
#if defined(PJ_HAS_SSL_SOCK) && PJ_HAS_SSL_SOCK != 0
ast_sorcery_object_field_register_custom(sorcery, "transport", "cipher", "", transport_tls_cipher_handler, transport_tls_cipher_to_str, NULL, 0, 0);
#endif

View File

@@ -1800,6 +1800,30 @@
<configOption name="require_client_cert" default="false">
<synopsis>Require client certificate (TLS ONLY, not WSS)</synopsis>
</configOption>
<configOption name="tcp_keepalive_enable" default="no">
<synopsis>Enable TCP keepalive</synopsis>
<description><para>
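When set to 'yes', TCP keepalive messages are sent to verify that the endpoint is still reachable. This can help detect dead TCP connections in environments where connections may be silently dropped (e.g., NAT timeouts). The tcp_keepalive_idle_time, tcp_keepalive_interval_time and tcp_keepalive_probe_count options only take effect when this option is enabled. The PJSIP library must also have been built with PJ_MAX_SOCKOPT_PARAMS set to at least 5; otherwise a warning is logged and no keepalive settings are applied.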
</para></description>
</configOption>
<configOption name="tcp_keepalive_idle_time" default="30">
<synopsis>Idle time before the first TCP keepalive probe is sent</synopsis>
<description><para>
Specifies the amount of time in seconds that the connection must be idle before the first TCP keepalive probe is sent. An idle connection is defined as a connection in which no data has been sent or received by the application.
</para></description>
</configOption>
<configOption name="tcp_keepalive_interval_time" default="1">
<synopsis>Interval between TCP keepalive probes</synopsis>
<description><para>
Specifies the interval in seconds between individual TCP keepalive probes, once the first probe is sent. This interval is used for subsequent probes if the peer does not respond to the previous probe.
</para></description>
</configOption>
<configOption name="tcp_keepalive_probe_count" default="5">
<synopsis>Maximum number of TCP keepalive probes</synopsis>
<description><para>
Specifies the maximum number of TCP keepalive probes to send before considering the connection dead and notifying the application. If the peer does not respond after this many probes, the connection is considered broken.
</para></description>
</configOption>
<configOption name="type">
<synopsis>Must be of type 'transport'.</synopsis>
</configOption>