taskprocessor: Enable subsystems and overload by subsystem

To prevent one subsystem's taskprocessors from causing others
to stall, new capabilities have been added to taskprocessors.

* Any taskprocessor name that has a '/' will have the part
  before the '/' saved as its "subsystem".
  Examples:
  "sorcery/acl-0000006a" and "sorcery/aor-00000019"
  will be grouped to subsystem "sorcery".
  "pjsip/distributor-00000025" and "pjsip/distributor-00000026"
  will bn grouped to subsystem "pjsip".
  Taskprocessors with no '/' have an empty subsystem.

* When a taskprocessor enters high-water alert status and it
  has a non-empty subsystem, the subsystem alert count will
  be incremented.

* When a taskprocessor leaves high-water alert status and it
  has a non-empty subsystem, the subsystem alert count will be
  decremented.

* A new api ast_taskprocessor_get_subsystem_alert() has been
  added that returns the number of taskprocessors in alert for
  the subsystem.

* A new CLI command "core show taskprocessor alerted subsystems"
  has been added.

* A new unit test was addded.

REMINDER: The taskprocessor code itself doesn't take any action
based on high-water alerts or overloading.  It's up to taskprocessor
users to check and take action themselves.  Currently only the pjsip
distributor does this.

* A new pjsip/global option "taskprocessor_overload_trigger"
  has been added that allows the user to select the trigger
  mechanism the distributor uses to pause accepting new requests.
  "none": Don't pause on any overload condition.
  "global": Pause on ANY taskprocessor overload (the default and
  current behavior)
  "pjsip_only": Pause only on pjsip taskprocessor overloads.

* The core pjsip pool was renamed from "SIP" to "pjsip" so it can
  be properly grouped into the "pjsip" subsystem.

* stasis taskprocessor names were changed to "stasis" as the
  subsystem.

* Sorcery core taskprocessor names were changed to "sorcery" to
  match the object taskprocessors.

Change-Id: I8c19068bb2fc26610a9f0b8624bdf577a04fcd56
This commit is contained in:
George Joseph
2019-02-15 11:53:50 -07:00
parent 1c5def4b18
commit 2f8def1453
13 changed files with 523 additions and 10 deletions

View File

@@ -89,7 +89,11 @@ struct ast_taskprocessor {
unsigned int high_water_alert:1;
/*! Indicates if the taskprocessor is currently suspended */
unsigned int suspended:1;
/*! \brief Friendly name of the taskprocessor */
/*! \brief Anything before the first '/' in the name (if there is one) */
char *subsystem;
/*! \brief Friendly name of the taskprocessor.
* Subsystem is appended after the name's NULL terminator.
*/
char name[0];
};
@@ -112,6 +116,16 @@ struct ast_taskprocessor_listener {
void *user_data;
};
/*!
* Keep track of which subsystems are in alert
* and how many of their taskprocessors are overloaded.
*/
struct subsystem_alert {
unsigned int alert_count;
char subsystem[0];
};
static AST_VECTOR_RW(subsystem_alert_vector, struct subsystem_alert *) overloaded_subsystems;
#ifdef LOW_MEMORY
#define TPS_MAX_BUCKETS 61
#else
@@ -138,10 +152,12 @@ static int tps_ping_handler(void *datap);
static char *cli_tps_ping(struct ast_cli_entry *e, int cmd, struct ast_cli_args *a);
static char *cli_tps_report(struct ast_cli_entry *e, int cmd, struct ast_cli_args *a);
static char *cli_subsystem_alert_report(struct ast_cli_entry *e, int cmd, struct ast_cli_args *a);
static struct ast_cli_entry taskprocessor_clis[] = {
AST_CLI_DEFINE(cli_tps_ping, "Ping a named task processor"),
AST_CLI_DEFINE(cli_tps_report, "List instantiated task processors and statistics"),
AST_CLI_DEFINE(cli_subsystem_alert_report, "List task processor subsystems in alert"),
};
struct default_taskprocessor_listener_pvt {
@@ -271,6 +287,8 @@ static const struct ast_taskprocessor_listener_callbacks default_listener_callba
static void tps_shutdown(void)
{
ast_cli_unregister_multiple(taskprocessor_clis, ARRAY_LEN(taskprocessor_clis));
AST_VECTOR_CALLBACK_VOID(&overloaded_subsystems, ast_free);
AST_VECTOR_RW_FREE(&overloaded_subsystems);
ao2_t_ref(tps_singletons, -1, "Unref tps_singletons in shutdown");
tps_singletons = NULL;
}
@@ -285,6 +303,12 @@ int ast_tps_init(void)
return -1;
}
if (AST_VECTOR_RW_INIT(&overloaded_subsystems, 10)) {
ao2_ref(tps_singletons, -1);
ast_log(LOG_ERROR, "taskprocessor subsystems vector failed to initialize!\n");
return -1;
}
ast_cond_init(&cli_ping_cond, NULL);
ast_cli_register_multiple(taskprocessor_clis, ARRAY_LEN(taskprocessor_clis));
@@ -548,6 +572,157 @@ static int tps_cmp_cb(void *obj, void *arg, int flags)
return !strcasecmp(lhs->name, rhsname) ? CMP_MATCH | CMP_STOP : 0;
}
static int subsystem_match(struct subsystem_alert *alert, const char *subsystem)
{
return !strcmp(alert->subsystem, subsystem);
}
static int subsystem_cmp(struct subsystem_alert *a, struct subsystem_alert *b)
{
return strcmp(a->subsystem, b->subsystem);
}
unsigned int ast_taskprocessor_get_subsystem_alert(const char *subsystem)
{
struct subsystem_alert *alert;
unsigned int count = 0;
int idx;
AST_VECTOR_RW_RDLOCK(&overloaded_subsystems);
idx = AST_VECTOR_GET_INDEX(&overloaded_subsystems, subsystem, subsystem_match);
if (idx >= 0) {
alert = AST_VECTOR_GET(&overloaded_subsystems, idx);
count = alert->alert_count;
}
AST_VECTOR_RW_UNLOCK(&overloaded_subsystems);
return count;
}
static void subsystem_alert_increment(const char *subsystem)
{
struct subsystem_alert *alert;
int idx;
if (ast_strlen_zero(subsystem)) {
return;
}
AST_VECTOR_RW_WRLOCK(&overloaded_subsystems);
idx = AST_VECTOR_GET_INDEX(&overloaded_subsystems, subsystem, subsystem_match);
if (idx >= 0) {
alert = AST_VECTOR_GET(&overloaded_subsystems, idx);
alert->alert_count++;
AST_VECTOR_RW_UNLOCK(&overloaded_subsystems);
return;
}
alert = ast_malloc(sizeof(*alert) + strlen(subsystem) + 1);
if (!alert) {
AST_VECTOR_RW_UNLOCK(&overloaded_subsystems);
return;
}
alert->alert_count = 1;
strcpy(alert->subsystem, subsystem); /* Safe */
if (AST_VECTOR_APPEND(&overloaded_subsystems, alert)) {
ast_free(alert);
}
AST_VECTOR_RW_UNLOCK(&overloaded_subsystems);
}
static void subsystem_alert_decrement(const char *subsystem)
{
struct subsystem_alert *alert;
int idx;
if (ast_strlen_zero(subsystem)) {
return;
}
AST_VECTOR_RW_WRLOCK(&overloaded_subsystems);
idx = AST_VECTOR_GET_INDEX(&overloaded_subsystems, subsystem, subsystem_match);
if (idx < 0) {
ast_log(LOG_ERROR,
"Can't decrement alert count for subsystem '%s' as it wasn't in alert\n", subsystem);
AST_VECTOR_RW_UNLOCK(&overloaded_subsystems);
return;
}
alert = AST_VECTOR_GET(&overloaded_subsystems, idx);
alert->alert_count--;
if (alert->alert_count <= 0) {
AST_VECTOR_REMOVE(&overloaded_subsystems, idx, 0);
ast_free(alert);
}
AST_VECTOR_RW_UNLOCK(&overloaded_subsystems);
}
static void subsystem_copy(struct subsystem_alert *alert,
struct subsystem_alert_vector *vector)
{
struct subsystem_alert *alert_copy;
alert_copy = ast_malloc(sizeof(*alert_copy) + strlen(alert->subsystem) + 1);
if (!alert_copy) {
return;
}
alert_copy->alert_count = alert->alert_count;
strcpy(alert_copy->subsystem, alert->subsystem); /* Safe */
if (AST_VECTOR_ADD_SORTED(vector, alert_copy, subsystem_cmp)) {
ast_free(alert_copy);
}
}
static char *cli_subsystem_alert_report(struct ast_cli_entry *e, int cmd, struct ast_cli_args *a)
{
struct subsystem_alert_vector sorted_subsystems;
int i;
#define FMT_HEADERS_SUBSYSTEM "%-32s %12s\n"
#define FMT_FIELDS_SUBSYSTEM "%-32s %12u\n"
switch (cmd) {
case CLI_INIT:
e->command = "core show taskprocessor alerted subsystems";
e->usage =
"Usage: core show taskprocessor alerted subsystems\n"
" Shows a list of task processor subsystems that are currently alerted\n";
return NULL;
case CLI_GENERATE:
return NULL;
}
if (a->argc != e->args) {
return CLI_SHOWUSAGE;
}
if (AST_VECTOR_INIT(&sorted_subsystems, AST_VECTOR_SIZE(&overloaded_subsystems))) {
return CLI_FAILURE;
}
AST_VECTOR_RW_RDLOCK(&overloaded_subsystems);
for (i = 0; i < AST_VECTOR_SIZE(&overloaded_subsystems); i++) {
subsystem_copy(AST_VECTOR_GET(&overloaded_subsystems, i), &sorted_subsystems);
}
AST_VECTOR_RW_UNLOCK(&overloaded_subsystems);
ast_cli(a->fd, "\n" FMT_HEADERS_SUBSYSTEM, "Subsystem", "Alert Count");
for (i = 0; i < AST_VECTOR_SIZE(&sorted_subsystems); i++) {
struct subsystem_alert *alert = AST_VECTOR_GET(&sorted_subsystems, i);
ast_cli(a->fd, FMT_FIELDS_SUBSYSTEM, alert->subsystem, alert->alert_count);
}
ast_cli(a->fd, "\n%lu subsystems\n\n", AST_VECTOR_SIZE(&sorted_subsystems));
AST_VECTOR_CALLBACK_VOID(&sorted_subsystems, ast_free);
AST_VECTOR_FREE(&sorted_subsystems);
return CLI_SUCCESS;
}
/*! Count of the number of taskprocessors in high water alert. */
static unsigned int tps_alert_count;
@@ -577,6 +752,15 @@ static void tps_alert_add(struct ast_taskprocessor *tps, int delta)
ast_log(LOG_DEBUG, "Taskprocessor '%s' %s the high water alert.\n",
tps->name, tps_alert_count ? "triggered" : "cleared");
}
if (tps->subsystem[0] != '\0') {
if (delta > 0) {
subsystem_alert_increment(tps->subsystem);
} else {
subsystem_alert_decrement(tps->subsystem);
}
}
ast_rwlock_unlock(&tps_alert_lock);
}
@@ -747,8 +931,17 @@ static void *default_listener_pvt_alloc(void)
static struct ast_taskprocessor *__allocate_taskprocessor(const char *name, struct ast_taskprocessor_listener *listener)
{
struct ast_taskprocessor *p;
char *subsystem_separator;
size_t subsystem_length = 0;
size_t name_length;
p = ao2_alloc(sizeof(*p) + strlen(name) + 1, tps_taskprocessor_dtor);
name_length = strlen(name);
subsystem_separator = strchr(name, '/');
if (subsystem_separator) {
subsystem_length = subsystem_separator - name;
}
p = ao2_alloc(sizeof(*p) + name_length + subsystem_length + 2, tps_taskprocessor_dtor);
if (!p) {
ast_log(LOG_WARNING, "failed to create taskprocessor '%s'\n", name);
return NULL;
@@ -758,7 +951,9 @@ static struct ast_taskprocessor *__allocate_taskprocessor(const char *name, stru
p->tps_queue_low = (AST_TASKPROCESSOR_HIGH_WATER_LEVEL * 9) / 10;
p->tps_queue_high = AST_TASKPROCESSOR_HIGH_WATER_LEVEL;
strcpy(p->name, name); /*SAFE*/
strcpy(p->name, name); /* Safe */
p->subsystem = p->name + name_length + 1;
ast_copy_string(p->subsystem, name, subsystem_length + 1);
ao2_ref(listener, +1);
p->listener = listener;