/*
* priod.c: process reprioritizing daemon
*
* Thanks to http://netsplit.com/the-proc-connector-and-socket-filters
* for showing the way around the proc connector and BPF.
*
* Copyright (c) 2017, Přemysl Eric Janouch
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
*/
#define _GNU_SOURCE
#define LIBERTY_WANT_POLLER
#include "config.h"
#undef PROGRAM_NAME
#define PROGRAM_NAME "priod"
#include "liberty/liberty.c"
#include <dirent.h>
#include <sys/resource.h>
#include <sys/syscall.h>
#include <linux/cn_proc.h>
#include <linux/connector.h>
#include <linux/filter.h>
#include <linux/netlink.h>
// --- Main program ------------------------------------------------------------
/// Sentinel stored in a numeric rule field when the configuration omits it
#define RULE_UNSET INT_MIN
/// One reprioritizing rule as loaded from a configuration object;
/// numeric fields hold RULE_UNSET when the corresponding key was not given
struct rule
{
char *program_name; ///< Program name to match against (owned by the rule)
int oom_score_adj; ///< For /proc/%/oom_score_adj, -1000..1000
int prio; ///< For setpriority(), -20..19
int ioprio; ///< For SYS_ioprio_set, 0..7
};
/// State of the daemon, handed to event callbacks through their user data
struct app_context
{
struct poller poller; ///< Poller
bool polling; ///< The event loop is running
int proc_fd; ///< Proc connector FD
struct poller_fd proc_event; ///< Proc connector read event
struct rule *rules; ///< Rules (array of rules_len items)
size_t rules_len; ///< Number of rules
};
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
/// Logging callback that main() installs as g_log_message_real: prints the
/// `quote' string, then the formatted message, then a newline, all to stdout
static void
log_message_custom (void *user_data, const char *quote, const char *fmt,
	va_list ap)
{
	// TODO: sd-daemon.h log level prefixes?
	(void) user_data;

	fputs (quote, stdout);
	vfprintf (stdout, fmt, ap);
	fputc ('\n', stdout);
}
// --- Configuration -----------------------------------------------------------
/// Reads the optional integer item `key' of the configuration object `root'.
/// A missing key is fine and leaves `*value' at RULE_UNSET.  An item that is
/// not an integer within `min'..`max' sets `e' and makes the function fail,
/// also leaving `*value' at RULE_UNSET.
static bool
load_integer (struct str_map *root, const char *key, int min, int max,
	int *value, struct error **e)
{
	*value = RULE_UNSET;

	struct config_item *item = str_map_find (root, key);
	if (item == NULL)
		return true;

	bool acceptable = item->type == CONFIG_ITEM_INTEGER
		&& item->value.integer >= min
		&& item->value.integer <= max;
	if (!acceptable)
		return error_set (e, "%s: must be an integer (%d..%d)", key, min, max);

	*value = item->value.integer;
	return true;
}
/// Fills in the rule `r' for program `name' from the configuration object `m'.
/// The name is always duplicated into the rule; loading stops at the first
/// setting that fails validation, with the reason stored in `e'.
static bool
load_rule (const char *name, struct str_map *m, struct rule *r,
	struct error **e)
{
	r->program_name = xstrdup (name);

	return load_integer (m, "oom_score_adj", -1000, 1000,
			&r->oom_score_adj, e)
		&& load_integer (m, "prio", -20, 19, &r->prio, e)
		&& load_integer (m, "ioprio", 0, 7, &r->ioprio, e);
}
static struct rule *
find_rule (struct app_context *ctx, const char *program_name)
{
for (size_t i = 0; i < ctx->rules_len; i++)
if (!strcmp (ctx->rules[i].program_name, program_name))
return ctx->rules + i;
return NULL;
}
/// Reads the configuration file at `config_path' and converts each item of
/// its root object into one entry of ctx->rules, named after the item's key.
/// Any problem with the file or with a rule terminates the program.
static void
load_configuration (struct app_context *ctx, const char *config_path)
{
	struct error *e = NULL;
	struct config_item *root = config_read_from_file (config_path, &e);
	if (e)
	{
		print_error ("error loading configuration: %s", e->message);
		error_free (e);
		exit (EXIT_FAILURE);
	}

	// There is one rule per item, so the map's size tells how many to allocate
	struct str_map_iter iter = str_map_iter_make (&root->value.object);
	ctx->rules_len = 0;
	ctx->rules = xcalloc (iter.map->len, sizeof *ctx->rules);

	for (struct config_item *subtree; (subtree = str_map_iter_next (&iter)); )
	{
		const char *name = iter.link->key;
		if (subtree->type != CONFIG_ITEM_OBJECT)
			exit_fatal ("rule `%s' in configuration is not an object", name);

		struct rule *slot = &ctx->rules[ctx->rules_len++];
		if (!load_rule (name, &subtree->value.object, slot, &e))
			exit_fatal ("rule `%s': %s", name, e->message);
	}
}
// --- Signals -----------------------------------------------------------------
static int g_signal_pipe[2];            ///< A pipe used to signal... signals

/// Handler for termination signals: writes one byte to the signal pipe,
/// leaving errno as it was for the code that has been interrupted
static void
sigterm_handler (int signum)
{
	(void) signum;

	const int saved_errno = errno;
	ssize_t written = write (g_signal_pipe[1], "", 1);
	// A full pipe is the only failure that is expected here
	if (written == -1)
		soft_assert (errno == EAGAIN);
	errno = saved_errno;
}
/// Creates the signal pipe, ignores SIGPIPE, and routes SIGINT and SIGTERM
/// to sigterm_handler().  Failures terminate the program.
static void
setup_signal_handlers (void)
{
	if (pipe (g_signal_pipe) == -1)
		exit_fatal ("%s: %s", "pipe", strerror (errno));

	set_cloexec (g_signal_pipe[0]);
	set_cloexec (g_signal_pipe[1]);

	// So that the pipe cannot overflow; it would make write() block within
	// the signal handler, which is something we really don't want to happen.
	// The same holds true for read().
	set_blocking (g_signal_pipe[0], false);
	set_blocking (g_signal_pipe[1], false);

	(void) signal (SIGPIPE, SIG_IGN);

	struct sigaction sa;
	sigemptyset (&sa.sa_mask);
	sa.sa_handler = sigterm_handler;
	sa.sa_flags = SA_RESTART;

	static const int handled[] = { SIGINT, SIGTERM };
	for (size_t i = 0; i < sizeof handled / sizeof *handled; i++)
	{
		if (sigaction (handled[i], &sa, NULL) == -1)
			exit_fatal ("sigaction: %s", strerror (errno));
	}
}
// --- Main program ------------------------------------------------------------
// IO priorities are a sort-of-private kernel API with no proper headers
/// I/O scheduling classes; reprioritize() shifts IOPRIO_CLASS_BE left
/// by IOPRIO_CLASS_SHIFT and ORs in the configured priority level
enum
{
IOPRIO_CLASS_NONE,
IOPRIO_CLASS_RT,
IOPRIO_CLASS_BE,
IOPRIO_CLASS_IDLE,
};
/// Values for the "which" argument of the ioprio_set system call;
/// reprioritize() passes IOPRIO_WHO_PROCESS together with a task ID
enum
{
IOPRIO_WHO_PROCESS = 1,
IOPRIO_WHO_PGRP,
IOPRIO_WHO_USER,
};
/// Number of bits the class is shifted left by within the priority value
#define IOPRIO_CLASS_SHIFT 13
/// Writes `score' to /proc/PID/oom_score_adj of the given process.
/// A failure is only reported, mentioning `pid' and `program_name'.
static void
adj_oom_score (int pid, const char *program_name, int score)
{
	char *path = xstrdup_printf ("/proc/%d/oom_score_adj", pid);

	char value[16];
	snprintf (value, sizeof value, "%d\n", score);

	struct error *e = NULL;
	bool written = write_file (path, value, strlen (value), &e);
	free (path);
	if (written)
		return;

	print_error ("%d (%s): %s", pid, program_name, e->message);
	error_free (e);
}
/// Makes one pass over `dir', an open stream for /proc/PID/task, and applies
/// the scheduling and I/O priorities of `rule' to each task whose name isn't
/// present in `set' yet, adding it there.  Failures are only reported.
/// Returns true when the pass hasn't come across any new task.
static bool
reprioritize (int pid, const char *program_name, DIR *dir, struct rule *rule,
	struct str_map *set)
{
	size_t not_previously_visited = 0;

	// errno is the only means of telling a readdir() failure from the end
	struct dirent *iter;
	while ((errno = 0, iter = readdir (dir)))
	{
		// Entries that don't parse as a non-zero number aren't tasks
		int tid = atoi (iter->d_name);
		if (!tid || str_map_find (set, iter->d_name))
			continue;

		print_debug (" - thread %d", tid);
		// The map only serves as a set, any non-NULL value marks presence
		str_map_set (set, iter->d_name, (void *) ++not_previously_visited);

		// Both calls need to address the task being visited; passing the PID
		// would keep changing the thread group leader over and over again
		if (RULE_UNSET != rule->prio
		 && setpriority (PRIO_PROCESS, tid, rule->prio))
			print_error ("%d (%s): thread %d: setpriority: %s",
				pid, program_name, tid, strerror (errno));
		if (RULE_UNSET != rule->ioprio
		 && syscall (SYS_ioprio_set, IOPRIO_WHO_PROCESS, tid,
				IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT | rule->ioprio))
			print_error ("%d (%s): thread %d: ioprio_set: %s",
				pid, program_name, tid, strerror (errno));
	}
	if (errno)
	{
		print_error ("%d (%s): readdir: %s",
			pid, program_name, strerror (errno));
	}
	return not_previously_visited == 0;
}
/// Looks for a rule matching `program_name', first as given and then by the
/// part following its last slash, and applies the rule to process `pid':
/// the OOM score adjustment once, priorities to each task of the process.
static void
on_exec_name (struct app_context *ctx, int pid, const char *program_name)
{
	// TODO: we might want to at least provide more criteria to match on,
	//   so as to not blindly trust everything, despite these priorities being
	//   relatively harmless if you overlook possible "denial of service"
	struct rule *rule = find_rule (ctx, program_name);
	const char *slash = strrchr (program_name, '/');
	if (!rule && (!slash || !(rule = find_rule (ctx, slash + 1))))
		return;

	print_debug ("%d (%s) matched", pid, program_name);
	if (RULE_UNSET != rule->oom_score_adj)
		adj_oom_score (pid, program_name, rule->oom_score_adj);

	// Priority APIs are strictly per-thread (i.e. Linux "task"), so we must
	// iterate through all tasks within a thread group
	char *path = xstrdup_printf ("/proc/%d/task", pid);
	DIR *dir = opendir (path);
	free (path);
	if (!dir)
	{
		print_error ("%d (%s): opendir: %s",
			pid, program_name, strerror (errno));
		return;
	}

	// This has an inherent race condition, but let's give it a try:
	// keep scanning until a pass finds no task that we haven't seen before
	struct str_map set = str_map_make (NULL);
	for (size_t retries = 3; retries--; )
	{
		if (reprioritize (pid, program_name, dir, rule, &set))
			break;

		// The stream has been read to its end, and only rewinding guarantees
		// that the next pass reflects the current contents of the directory
		rewinddir (dir);
	}
	str_map_free (&set);
	closedir (dir);
}
/// Reads /proc/PID/cmdline of the given process and passes its contents,
/// taken as a C string, to on_exec_name().  A file that cannot be read
/// is only mentioned in debug output.
static void
on_exec (struct app_context *ctx, int pid)
{
	// This is inherently racy but there seems to be no better way to do it
	char *path = xstrdup_printf ("/proc/%d/cmdline", pid);
	struct str cmdline = str_make ();

	struct error *e = NULL;
	bool have_cmdline = read_file (path, &cmdline, &e);
	free (path);

	if (!have_cmdline)
	{
		print_debug ("%s", e->message);
		error_free (e);
	}
	else
		on_exec_name (ctx, pid, cmdline.str);

	str_free (&cmdline);
}
/// Walks /proc and runs on_exec() for each entry that has a non-zero numeric
/// name and is a directory (or of unknown type), so that rules also get
/// applied to processes that were started before the daemon.
/// Failures to open or read /proc are only reported.
static void
preapply_rules (struct app_context *ctx)
{
	DIR *dir = opendir ("/proc");
	if (!dir)
	{
		print_error ("opendir: %s: %s", "/proc", strerror (errno));
		return;
	}

	// We don't care about processes deleted or created during this loop
	struct dirent *iter;
	while ((errno = 0, iter = readdir (dir)))
	{
		int pid = atoi (iter->d_name);
		if (pid && (iter->d_type == DT_UNKNOWN || iter->d_type == DT_DIR))
			on_exec (ctx, pid);
	}

	// errno has been reset right before the last readdir(), thus it can only
	// be set when the loop was ended by an error rather than the end of /proc
	if (errno)
		print_error ("readdir: %s: %s", "/proc", strerror (errno));
	closedir (dir);
}
/// Processes one netlink message read from the proc connector socket:
/// messages other than proc connector exec() events are ignored, for the
/// rest the thread group ID of the process is passed to on_exec().
static void
on_netlink_message (struct app_context *ctx, struct nlmsghdr *mh)
{
	// In practice the kernel connector never sends multipart messages
	if (!soft_assert (mh->nlmsg_type != 0)
	 || !soft_assert (mh->nlmsg_flags == 0)
	 || mh->nlmsg_type != NLMSG_DONE)
		return;

	struct cn_msg *m = NLMSG_DATA (mh);
	if (m->id.idx != CN_IDX_PROC
	 || m->id.val != CN_VAL_PROC)
		return;

	// XXX: potential alignment issues
	struct proc_event *e = (struct proc_event *) m->data;
	if (e->what != PROC_EVENT_EXEC)
		return;

	// The union member read must be the one that belongs to the event type;
	// going through "exit" would depend on both having the same layout
	on_exec (ctx, e->event_data.exec.process_tgid);
}
/// Poller callback for the proc connector socket: receives datagrams until
/// recvfrom() fails with EAGAIN or ENOBUFS, and passes each netlink message
/// within those with a zero sender nl_pid to on_netlink_message().
/// A closed socket or any other receive error terminates the program.
static void
on_event (const struct pollfd *pfd, struct app_context *ctx)
{
	char buf[sysconf (_SC_PAGESIZE)];
	struct sockaddr_nl addr;

	for (;;)
	{
		socklen_t addr_len = sizeof addr;
		ssize_t len = recvfrom (pfd->fd, buf, sizeof buf, 0,
			(struct sockaddr *) &addr, &addr_len);
		if (len < 0)
		{
			if (errno == EAGAIN || errno == ENOBUFS)
				return;
			exit_fatal ("recvfrom: %s", strerror (errno));
		}
		if (len == 0)
			exit_fatal ("socket closed");

		// Make sure it comes from the kernel
		if (addr.nl_pid != 0)
			continue;

		// In practice the kernel connector always sends one per dgram
		struct nlmsghdr *mh = (struct nlmsghdr *) buf;
		while (NLMSG_OK (mh, len))
		{
			on_netlink_message (ctx, mh);
			mh = NLMSG_NEXT (mh, len);
		}
	}
}
/// Poller callback for the read end of the signal pipe: makes the event loop
/// in main() stop, and consumes one byte of what the handler has written
static void
on_signal_pipe_readable (const struct pollfd *fd, struct app_context *ctx)
{
	ctx->polling = false;

	// The value is of no interest, and neither is a failure to read it
	char discarded = 0;
	(void) read (fd->fd, &discarded, 1);
}
/// Processes command line options and returns the only positional argument,
/// i.e., the path to the configuration file.  The help and version options
/// exit successfully once handled; unknown options or a wrong number
/// of positional arguments print the usage and exit with a failure.
static const char *
parse_program_arguments (int argc, char **argv)
{
	static const struct opt opts[] =
	{
		{ 'd', "debug", NULL, 0, "run in debug mode" },
		{ 'h', "help", NULL, 0, "display this help and exit" },
		{ 'V', "version", NULL, 0, "output version information and exit" },
		{ 0, NULL, NULL, 0, NULL }
	};

	struct opt_handler oh = opt_handler_make (argc, argv, opts, "CONFIG",
		"Process reprioritizing daemon.");

	for (int c; (c = opt_handler_get (&oh)) != -1; )
	{
		if (c == 'd')
			g_debug_mode = true;
		else if (c == 'h')
		{
			opt_handler_usage (&oh, stdout);
			exit (EXIT_SUCCESS);
		}
		else if (c == 'V')
		{
			printf (PROGRAM_NAME " " PROGRAM_VERSION "\n");
			exit (EXIT_SUCCESS);
		}
		else
		{
			print_error ("wrong options");
			opt_handler_usage (&oh, stderr);
			exit (EXIT_FAILURE);
		}
	}

	// What follows the options has to be exactly one argument
	if (argc - optind != 1)
	{
		opt_handler_usage (&oh, stderr);
		exit (EXIT_FAILURE);
	}

	opt_handler_free (&oh);
	return argv[optind];
}
/// Sets up a filter so that we're only woken up by the kernel on exec() events
///
/// Also enables NETLINK_NO_ENOBUFS on the socket where that option is defined.
/// A failure to set either socket option is reported but otherwise ignored.
static void
setup_exec_filter (int fd)
{
// Layout of a datagram that carries exactly one proc connector event,
// used for the expected length and for field offsets within the filter
struct incoming
{
union { struct nlmsghdr netlink; char align[NLMSG_HDRLEN]; };
struct cn_msg connector;
struct proc_event event;
}
__attribute__ ((packed));
// Byteswapping is needed because the netlink protocol is host-endian
//
// Jump offsets are relative to the following instruction: a failure of any
// of the first four comparisons lands on the last instruction, which passes
// the datagram on.  The last comparison also goes there for exec() events,
// and falls through to the "return 0" for all other proc connector events.
struct sock_filter filter[] =
{
// Only continue filtering dgrams with one "proc_event" message in them
BPF_STMT (BPF_LD | BPF_W | BPF_LEN, 0),
BPF_JUMP (BPF_JMP | BPF_JEQ | BPF_K, sizeof (struct incoming), 0, 9),
// The netlink message type must be NLMSG_DONE
BPF_STMT (BPF_LD | BPF_H | BPF_ABS,
offsetof (struct incoming, netlink.nlmsg_type)),
BPF_JUMP (BPF_JMP | BPF_JEQ | BPF_K, htons (NLMSG_DONE), 0, 7),
// The connector message must have the proc connector's index...
BPF_STMT (BPF_LD | BPF_W | BPF_ABS,
offsetof (struct incoming, connector.id.idx)),
BPF_JUMP (BPF_JMP | BPF_JEQ | BPF_K, htonl (CN_IDX_PROC), 0, 5),
// ...as well as its value
BPF_STMT (BPF_LD | BPF_W | BPF_ABS,
offsetof (struct incoming, connector.id.val)),
BPF_JUMP (BPF_JMP | BPF_JEQ | BPF_K, htonl (CN_VAL_PROC), 0, 3),
// At this point it is a process event, skip the next return for exec()
BPF_STMT (BPF_LD | BPF_W | BPF_ABS,
offsetof (struct incoming, event.what)),
BPF_JUMP (BPF_JMP | BPF_JEQ | BPF_K, htonl (PROC_EVENT_EXEC), 1, 0),
// A return value of zero stands for zero bytes of the packet to keep
BPF_STMT (BPF_RET | BPF_K, 0),
BPF_STMT (BPF_RET | BPF_K, 0xffffffff),
};
struct sock_fprog fprog = { .filter = filter, .len = N_ELEMENTS (filter) };
// Option value for NETLINK_NO_ENOBUFS, not used when that isn't available
const int yes = 1;
if (setsockopt (fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof fprog) < 0)
print_error ("setsockopt: %s", strerror (errno));
#if defined SOL_NETLINK && defined NETLINK_NO_ENOBUFS
if (setsockopt (fd, SOL_NETLINK, NETLINK_NO_ENOBUFS, &yes, sizeof yes) < 0)
print_error ("setsockopt: %s", strerror (errno));
#endif
}
/// Program entry point: parses arguments, loads rules from the configuration
/// file, subscribes to process events on the proc connector, applies rules
/// to processes that already exist, and then runs the event loop until
/// the signal pipe callback clears ctx.polling.
int
main (int argc, char *argv[])
{
// Route log messages through our own function, which writes to stdout
g_log_message_real = log_message_custom;
const char *config_path = parse_program_arguments (argc, argv);
struct app_context ctx;
memset (&ctx, 0, sizeof ctx);
poller_init (&ctx.poller);
// Watch the read end of the signal pipe, which the handler for SIGINT
// and SIGTERM writes to, so that those signals can end the event loop
setup_signal_handlers ();
struct poller_fd signal_event =
poller_fd_make (&ctx.poller, g_signal_pipe[0]);
signal_event.dispatcher = (poller_fd_fn) on_signal_pipe_readable;
signal_event.user_data = &ctx;
poller_fd_set (&signal_event, POLLIN);
// This exits the program when the configuration cannot be loaded
load_configuration (&ctx, config_path);
// Non-blocking, close-on-exec netlink socket for the kernel connector
ctx.proc_fd = socket (PF_NETLINK,
SOCK_DGRAM | SOCK_NONBLOCK | SOCK_CLOEXEC, NETLINK_CONNECTOR);
if (ctx.proc_fd < 0)
exit_fatal ("cannot make a proc connector: %s", strerror (errno));
// The filter is attached before the socket is bound and subscribed
setup_exec_filter (ctx.proc_fd);
struct sockaddr_nl addr = { .nl_family = AF_NETLINK, .nl_pid = getpid (),
.nl_groups = CN_IDX_PROC };
if (bind (ctx.proc_fd, (struct sockaddr *) &addr, sizeof addr) < 0)
exit_fatal ("cannot make a proc connector: %s", strerror (errno));
// Connector message carrying the PROC_CN_MCAST_LISTEN operation,
// laid out as a netlink header, a connector header, and the operation
struct
{
union { struct nlmsghdr netlink; char align[NLMSG_HDRLEN]; };
struct cn_msg connector;
enum proc_cn_mcast_op op;
}
__attribute__ ((packed)) subscription =
{
.netlink.nlmsg_len = sizeof subscription,
.netlink.nlmsg_type = NLMSG_DONE,
.netlink.nlmsg_pid = getpid (),
.connector.id.idx = CN_IDX_PROC,
.connector.id.val = CN_VAL_PROC,
.connector.len = sizeof subscription.op,
.op = PROC_CN_MCAST_LISTEN,
};
if (write (ctx.proc_fd, &subscription, sizeof subscription) < 0)
exit_fatal ("failed to subscribe for events: %s", strerror (errno));
// Have the poller call on_event() whenever the socket becomes readable
ctx.proc_event = poller_fd_make (&ctx.poller, ctx.proc_fd);
ctx.proc_event.dispatcher = (poller_fd_fn) on_event;
ctx.proc_event.user_data = &ctx;
poller_fd_set (&ctx.proc_event, POLLIN);
// While new events are being queued, we can apply rules to already
// existing processes, so that we don't miss anything except for obvious
// cases when a process re-execs to something else after a match.
// It would inherit the same values anyway, so it seems to be mostly okay.
preapply_rules (&ctx);
// on_signal_pipe_readable() resets the flag to make the loop finish
ctx.polling = true;
while (ctx.polling)
poller_run (&ctx.poller);
// Release the poller, the socket, and the rules with their name strings
poller_free (&ctx.poller);
xclose (ctx.proc_fd);
for (size_t i = 0; i < ctx.rules_len; i++)
free (ctx.rules[i].program_name);
free (ctx.rules);
return EXIT_SUCCESS;
}