/*-*- Mode: C; c-basic-offset: 8 -*-*/
/***
This file is part of RealtimeKit.
Copyright 2009 Lennart Poettering
Copyright 2010 Maarten Lankhorst
RealtimeKit is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
RealtimeKit is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with RealtimeKit. If not, see <http://www.gnu.org/licenses/>.
***/
#ifdef HAVE_CONFIG_H
#include
#endif
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "rtkit.h"
#include "sd-daemon.h"
/* Refuse to build on anything but Linux. */
#ifndef __linux__
#error "This stuff only works on Linux!"
#endif
#ifndef SCHED_RESET_ON_FORK
/* Your libc lacks the definition of SCHED_RESET_ON_FORK. We define it ourselves here; however, make sure your kernel is new enough! */
#define SCHED_RESET_ON_FORK 0x40000000
#endif
#ifndef RLIMIT_RTTIME
/* Fallback for libcs that lack RLIMIT_RTTIME. NOTE(review): 15 is presumably the Linux resource number -- confirm against the kernel headers. */
#define RLIMIT_RTTIME 15
#endif
/* D-Bus introspection document: the standard doctype declaration followed by
 * a sequence of string literals.
 * NOTE(review): every literal below contains only whitespace, so the element
 * markup appears to have been lost from this copy of the file -- restore it
 * from a pristine source before relying on this macro. */
#define INTROSPECT_XML \
DBUS_INTROSPECT_1_0_XML_DOCTYPE_DECL_NODE \
"\n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n"\
" " \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
" \n" \
"\n"
/* Similar to assert(), but the expression is always evaluated (it may have
 * side effects) and the check is never compiled out, regardless of NDEBUG.
 * On failure a diagnostic is written to stderr and the process aborts. */
#define assert_se(expr)                                                 \
        do {                                                            \
                if (__builtin_expect(!(expr), 0)) {                     \
                        fprintf(stderr, "Assertion %s failed at %s:%d, function %s(). Aborting.\n", #expr, __FILE__, __LINE__, __PRETTY_FUNCTION__); \
                        abort();                                        \
                }                                                       \
        } while(0)
/* Number of elements of a true array (must not be used on a pointer) */
#define ELEMENTSOF(x) (sizeof(x)/sizeof((x)[0]))
/* Converts a struct timespec into whole milliseconds, computed as int64_t */
#define TIMESPEC_MSEC(ts) (((int64_t) (ts).tv_sec * 1000LL) + ((int64_t) (ts).tv_nsec / 1000000LL))
/* Tunable defaults of the daemon. */
/* While we actually execute a request we temporarily raise our own realtime priority to this level */
static unsigned our_realtime_priority = 21;
/* Normally we run at this nice level */
static int our_nice_level = 1;
/* The maximum realtime priority to hand out */
static unsigned max_realtime_priority = 20;
/* The minimum (i.e. most favourable) nice level to hand out */
static int min_nice_level = -15;
/* Username we shall run under */
static const char *username = "rtkit";
/* Enforce that clients have RLIMIT_RTTIME set to a value <= this */
static unsigned long long rttime_nsec_max = 200000000ULL; /* 200 ms */
/* How many users do we supervise at max? */
static unsigned users_max = 2048;
/* How many processes of a single user do we supervise at max? */
static unsigned processes_per_user_max = 15;
/* How many threads of a single user do we supervise at max? */
static unsigned threads_per_user_max = 25;
/* Length in seconds of one burst window; see actions_per_burst_max */
static unsigned actions_burst_sec = 20;
/* Refuse further requests once one user has issued this many within a window of actions_burst_sec seconds */
static unsigned actions_per_burst_max = 25;
/* Drop privileges */
static bool do_drop_privileges = TRUE;
/* Change root directory to /proc */
static bool do_chroot = TRUE;
/* Limit resources */
static bool do_limit_resources = TRUE;
/* Run a canary watchdog */
static bool do_canary = TRUE;
/* Canary cheep interval */
static unsigned canary_cheep_msec = 5000; /* 5s */
/* Canary watchdog interval */
static unsigned canary_watchdog_msec = 10000; /* 10s */
/* Watchdog realtime priority */
static unsigned canary_watchdog_realtime_priority = 99;
/* How long after the canary died shall we refuse further RT requests? */
static unsigned canary_refusal_sec = 5*60;
/* Demote root processes? (consulted by reset_all(), which skips root-owned processes when this is off) */
static bool canary_demote_root = FALSE;
/* Demote unknown processes? */
static bool canary_demote_unknown = FALSE;
/* Log to stderr? */
static bool log_stderr = FALSE;
/* Scheduling policy to use when making ourselves or a client thread realtime */
static int sched_policy = SCHED_RR;
/* One supervised thread; records are chained into a singly linked list owned by their process */
struct thread {
/* We use the thread id plus its starttime as a unique identifier for threads */
pid_t pid;
unsigned long long starttime;
/* Next thread of the same process, or NULL at the end of the list */
struct thread *next;
};
/* One supervised process; records are chained into a singly linked list owned by their user */
struct process {
/* We use the process id plus its starttime as a unique identifier for processes */
pid_t pid;
unsigned long long starttime;
/* Head of the list of supervised threads of this process */
struct thread *threads;
/* Next process of the same user, or NULL at the end of the list */
struct process *next;
};
/* Per-user bookkeeping: rate limiting state plus the processes/threads we supervise for that user */
struct rtkit_user {
uid_t uid;
/* Start of the current burst window, and the number of requests counted in it (see verify_burst()) */
time_t timestamp;
unsigned n_actions;
/* Head of the list of supervised processes, and its length */
struct process *processes;
unsigned n_processes;
/* Number of supervised threads summed over all processes of this user */
unsigned n_threads;
/* Next user in the global list, or NULL at the end of the list */
struct rtkit_user *next;
};
/* Head of the global list of known users, and its length */
static struct rtkit_user *users = NULL;
static unsigned n_users = 0;
/* Totals over all users; incremented in find_process()/find_thread(), decremented by the garbage collectors */
static unsigned n_total_processes = 0;
static unsigned n_total_threads = 0;
/* Optional override for the procfs path; when NULL get_proc_path() falls back to "/proc" */
static const char *proc = NULL;
/* NOTE(review): presumably descriptors used for shutdown and canary signalling -- their users are outside this chunk */
static int quit_fd = -1, canary_fd = -1;
/* NOTE(review): presumably the ids of the canary and watchdog threads -- confirm where they are started */
static pthread_t canary_thread_id = 0, watchdog_thread_id = 0;
/* CLOCK_MONOTONIC second before which verify_canary_refusal() denies requests */
static volatile uint32_t refuse_until = 0;
/* Returns the procfs path to use: the configured override (useful for
 * chroot environments) when one is set, "/proc" otherwise. */
static const char *get_proc_path(void) {
        return proc ? proc : "/proc";
}
/* Writes a printable name for uid into user (a buffer of len bytes, len >= 1)
 * and returns user. Uses the passwd database name if there is one, otherwise
 * the numeric uid. The result is always NUL terminated, possibly truncated. */
static char* get_user_name(uid_t uid, char *user, size_t len) {
        const struct passwd *entry = getpwuid(uid);

        if (entry)
                strncpy(user, entry->pw_name, len-1);
        else
                snprintf(user, len-1, "%llu", (unsigned long long) uid);

        /* Neither branch is guaranteed to terminate at the last byte */
        user[len-1] = 0;
        return user;
}
/* Reads the start time of a process, or of one of its threads, from procfs.
 *
 * pid: process id
 * tid: thread id, or 0 to read the process' own stat file
 * st:  receives the starttime field on success
 *
 * Returns 0 on success, a negative errno value if the stat file cannot be
 * opened or read, and -EIO if its contents cannot be parsed. */
static int read_starttime(pid_t pid, pid_t tid, unsigned long long *st) {
        char fn[128], line[256], *p;
        int r;
        FILE *f;

        if (tid != 0)
                assert_se(snprintf(fn, sizeof(fn)-1, "%s/%llu/task/%llu/stat", get_proc_path(), (unsigned long long) pid, (unsigned long long) tid) < (int) (sizeof(fn)-1));
        else
                assert_se(snprintf(fn, sizeof(fn)-1, "%s/%llu/stat", get_proc_path(), (unsigned long long) pid) < (int) (sizeof(fn)-1));
        fn[sizeof(fn)-1] = 0;

        if (!(f = fopen(fn, "r")))
                return -errno;

        errno = 0;
        if (!(fgets(line, sizeof(line), f))) {
                /* fgets() also fails on a plain end-of-file, in which case
                 * errno is not set. Never report that as success, since
                 * *st has not been filled in. */
                r = errno != 0 ? -errno : -EIO;
                fclose(f);
                return r;
        }

        fclose(f);

        /* Let's skip the pid and comm fields. The latter is enclosed
         * in () but does not escape any () in its value, so let's
         * skip over it manually */
        if (!(p = strrchr(line, ')')))
                return -EIO;

        p++;

        if (sscanf(p, " "
                   "%*c "  /* state */
                   "%*d "  /* ppid */
                   "%*d "  /* pgrp */
                   "%*d "  /* session */
                   "%*d "  /* tty_nr */
                   "%*d "  /* tpgid */
                   "%*u "  /* flags */
                   "%*u "  /* minflt */
                   "%*u "  /* cminflt */
                   "%*u "  /* majflt */
                   "%*u "  /* cmajflt */
                   "%*u "  /* utime */
                   "%*u "  /* stime */
                   "%*d "  /* cutime */
                   "%*d "  /* cstime */
                   "%*d "  /* priority */
                   "%*d "  /* nice */
                   "%*d "  /* num_threads */
                   "%*d "  /* itrealvalue */
                   "%llu "  /* starttime */,
                   st) != 1)
                return -EIO;

        return 0;
}
/* Releases one thread record. The caller must already have unlinked it,
 * or be about to discard the list it is on. */
static void free_thread(struct thread *t) {
        free(t);
}
/* Releases a process record together with all thread records it owns */
static void free_process(struct process *p) {
        struct thread *cur = p->threads;

        while (cur) {
                struct thread *following = cur->next;

                free_thread(cur);
                cur = following;
        }

        free(p);
}
/* Releases a user record together with all process records it owns */
static void free_user(struct rtkit_user *u) {
        struct process *cur = u->processes;

        while (cur) {
                struct process *following = cur->next;

                free_process(cur);
                cur = following;
        }

        free(u);
}
/* Tells whether the burst window that started at u->timestamp is still open */
static bool user_in_burst(struct rtkit_user *u) {
        time_t burst_end = u->timestamp + (time_t) actions_burst_sec;

        return time(NULL) < burst_end;
}
/* Rate limiter: accounts one request for the user and returns true if it
 * may proceed, or logs a warning and returns false once the user has used
 * up actions_per_burst_max requests within the current burst window. */
static bool verify_burst(struct rtkit_user *u) {
        char user[64];

        if (!user_in_burst(u)) {
                /* The previous window has expired: open a new one */
                u->timestamp = time(NULL);
                u->n_actions = 0;
                return true;
        }

        if (u->n_actions < actions_per_burst_max) {
                u->n_actions++;
                return true;
        }

        syslog(LOG_WARNING, "Warning: Reached burst limit for user '%s', denying request.\n", get_user_name(u->uid, user, sizeof(user)));
        return false;
}
/* Looks up the record for uid, creating it (at the head of the global list)
 * if there is none yet. Stores the record in *_u and returns 0; returns
 * -EBUSY when users_max is reached, -ENOMEM when allocation fails. */
static int find_user(struct rtkit_user **_u, uid_t uid) {
        struct rtkit_user *entry = users;

        while (entry && entry->uid != uid)
                entry = entry->next;

        if (!entry) {
                if (n_users >= users_max) {
                        syslog(LOG_WARNING, "Warning: Reached maximum concurrent user limit, denying request.\n");
                        return -EBUSY;
                }

                entry = malloc(sizeof *entry);
                if (!entry)
                        return -ENOMEM;

                entry->uid = uid;
                entry->timestamp = time(NULL);
                entry->n_actions = 0;
                entry->processes = NULL;
                entry->n_processes = 0;
                entry->n_threads = 0;

                entry->next = users;
                users = entry;
                n_users++;
        }

        *_u = entry;
        return 0;
}
/* Looks up the record for (pid, starttime) among the user's processes,
 * creating it at the head of the list if there is none yet. Stores the
 * record in *_p and returns 0; returns -EBUSY when the user already has
 * processes_per_user_max processes, -ENOMEM when allocation fails. */
static int find_process(struct process** _p, struct rtkit_user *u, pid_t pid, unsigned long long starttime) {
        struct process *entry = u->processes;

        while (entry && !(entry->pid == pid && entry->starttime == starttime))
                entry = entry->next;

        if (!entry) {
                if (u->n_processes >= processes_per_user_max) {
                        char user[64];

                        syslog(LOG_WARNING, "Warning: Reached maximum concurrent process limit for user '%s', denying request.\n", get_user_name(u->uid, user, sizeof(user)));
                        return -EBUSY;
                }

                entry = malloc(sizeof *entry);
                if (!entry)
                        return -ENOMEM;

                entry->pid = pid;
                entry->starttime = starttime;
                entry->threads = NULL;

                entry->next = u->processes;
                u->processes = entry;
                u->n_processes++;
                n_total_processes++;
        }

        *_p = entry;
        return 0;
}
/* Looks up the record for (pid, starttime) among the process' threads,
 * creating it at the head of the list if there is none yet. Stores the
 * record in *_t and returns 0; returns -EBUSY when the user already has
 * threads_per_user_max threads, -ENOMEM when allocation fails. */
static int find_thread(struct thread** _t, struct rtkit_user *u, struct process *p, pid_t pid, unsigned long long starttime) {
        struct thread *entry = p->threads;

        while (entry && !(entry->pid == pid && entry->starttime == starttime))
                entry = entry->next;

        if (!entry) {
                if (u->n_threads >= threads_per_user_max) {
                        char user[64];

                        syslog(LOG_WARNING, "Warning: Reached maximum concurrent threads limit for user '%s', denying request.\n", get_user_name(u->uid, user, sizeof(user)));
                        return -EBUSY;
                }

                entry = malloc(sizeof *entry);
                if (!entry)
                        return -ENOMEM;

                entry->pid = pid;
                entry->starttime = starttime;

                entry->next = p->threads;
                p->threads = entry;
                u->n_threads++;
                n_total_threads++;
        }

        *_t = entry;
        return 0;
}
/* Checks whether a thread still matters to us, i.e. whether its id still
 * refers to the same thread and whether it is still realtime or running
 * at a negative nice level. Any lookup failure counts as "not relevant";
 * failures other than the thread having vanished are logged. */
static bool thread_relevant(struct process *p, struct thread *t) {
        unsigned long long current_start;
        int err, policy, nice_level;

        err = read_starttime(p->pid, t->pid, &current_start);
        if (err < 0) {
                /* -ENOENT simply means the thread died */
                if (err != -ENOENT)
                        syslog(LOG_WARNING, "Warning: failed to read start time: %s\n", strerror(-err));
                return false;
        }

        /* Did the thread get replaced by another thread? */
        if (current_start != t->starttime)
                return false;

        policy = sched_getscheduler(t->pid);
        if (policy < 0) {
                /* ESRCH: maybe it died right now */
                if (errno != ESRCH)
                        syslog(LOG_WARNING, "Warning: failed to read scheduler policy: %s\n", strerror(errno));
                return false;
        }

        /* Is this a realtime thread? */
        policy &= ~SCHED_RESET_ON_FORK;
        if (policy == SCHED_RR || policy == SCHED_FIFO)
                return true;

        /* getpriority() may legitimately return -1, so errno is the only
         * reliable error indicator */
        errno = 0;
        nice_level = getpriority(PRIO_PROCESS, t->pid);
        if (errno != 0) {
                if (errno != ESRCH)
                        syslog(LOG_WARNING, "Warning: failed to read nice level: %s\n", strerror(errno));
                return false;
        }

        /* Is this a high priority thread? */
        return nice_level < 0;
}
/* Drops the records of all threads of process p that are no longer
 * relevant (see thread_relevant()), keeping the per-user and global
 * thread counters in sync. */
static void thread_gc(struct rtkit_user *u, struct process *p) {
        /* Points at the link that leads to the record under inspection */
        struct thread **link = &p->threads;

        while (*link) {
                struct thread *t = *link;

                if (thread_relevant(p, t)) {
                        link = &t->next;
                        continue;
                }

                *link = t->next;
                free_thread(t);

                assert(u->n_threads >= 1);
                u->n_threads--;
                assert(n_total_threads >= 1);
                n_total_threads--;
        }

        assert(!p->threads || u->n_threads);
}
/* Garbage collects the threads of every process of user u, then drops the
 * records of processes that are left without any thread, keeping the
 * per-user and global process counters in sync. */
static void process_gc(struct rtkit_user *u) {
        /* Points at the link that leads to the record under inspection */
        struct process **link = &u->processes;

        while (*link) {
                struct process *p = *link;

                thread_gc(u, p);

                if (p->threads) {
                        link = &p->next;
                        continue;
                }

                *link = p->next;
                free_process(p);

                assert(u->n_processes >= 1);
                u->n_processes--;
                assert(n_total_processes >= 1);
                n_total_processes--;
        }

        assert(!u->processes == !u->n_processes);
}
/* Garbage collects the processes of every known user, then drops the
 * records of users that have no process left and whose burst window has
 * expired (the window must be kept alive for rate limiting). */
static void user_gc(void) {
        /* Points at the link that leads to the record under inspection */
        struct rtkit_user **link = &users;

        while (*link) {
                struct rtkit_user *u = *link;

                process_gc(u);

                if (u->processes || user_in_burst(u)) {
                        link = &u->next;
                        continue;
                }

                *link = u->next;
                free_user(u);

                assert(n_users >= 1);
                n_users--;
        }

        assert(!users == !n_users);
}
/* Returns true if the string s begins with prefix (an empty prefix always matches) */
static bool startswith(const char *s, const char *prefix) {
        size_t prefix_len = strlen(prefix);

        return !strncmp(s, prefix, prefix_len);
}
/* Switches the calling thread to the configured realtime policy
 * (sched_policy, with SCHED_RESET_ON_FORK) at the given priority.
 * Returns 0 on success, or a negative errno value after logging the
 * failure. */
static int self_set_realtime(unsigned priority) {
        struct sched_param param;

        memset(&param, 0, sizeof(param));
        param.sched_priority = (int) priority;

        if (sched_setscheduler(0, sched_policy|SCHED_RESET_ON_FORK, &param) < 0) {
                /* Save errno before logging can clobber it */
                int r = -errno;

                syslog(LOG_ERR, "Failed to make ourselves RT: %s\n", strerror(-r));
                return r;
        }

        return 0;
}
/* Returns the calling thread to SCHED_OTHER and sets its nice level to
 * nice_level. Both steps are best effort: failures are logged as warnings
 * and otherwise ignored. */
static void self_drop_realtime(int nice_level) {
        struct sched_param param;

        memset(&param, 0, sizeof(param));

        if (sched_setscheduler(0, SCHED_OTHER, &param) < 0)
                syslog(LOG_WARNING, "Warning: Failed to reset scheduling to SCHED_OTHER: %s\n", strerror(errno));

        /* Report the level we actually tried to set, as a signed value */
        if (setpriority(PRIO_PROCESS, 0, nice_level) < 0)
                syslog(LOG_WARNING, "Warning: Failed to reset nice level to %i: %s\n", nice_level, strerror(errno));
}
/* Verifies that RLIMIT_RTTIME is set for the specified process.
 *
 * Parses the "Max realtime timeout" line of the process' limits file in
 * procfs and accepts the process only if the hard limit is a number that
 * does not exceed rttime_nsec_max.
 *
 * Returns 0 if the limit is acceptable, a negative errno value if the
 * limits file cannot be opened, and -EPERM in every other case (limit
 * missing, not numeric, e.g. "unlimited", or too large). */
static int verify_process_rttime(struct process *p) {
        static const char key[] = "Max realtime timeout";
        char fn[128], line[128];
        FILE *f;
        int r;
        bool good = false;

        assert_se(snprintf(fn, sizeof(fn)-1, "%s/%llu/limits", get_proc_path(), (unsigned long long) p->pid) < (int) (sizeof(fn)-1));
        fn[sizeof(fn)-1] = 0;

        if (!(f = fopen(fn, "r"))) {
                r = -errno;
                syslog(LOG_ERR, "Failed to open '%s': %s\n", fn, strerror(errno));
                return r;
        }

        while (fgets(line, sizeof(line), f)) {
                char soft[32], hard[32];
                unsigned long long rttime;
                char *e = NULL;

                if (!startswith(line, key))
                        continue;

                /* The field widths keep the tokens within soft[] and hard[] */
                if (sscanf(line + sizeof(key) - 1, "%31s %31s", soft, hard) != 2) {
                        syslog(LOG_WARNING, "Warning: parse failure in %s.\n", fn);
                        break;
                }

                /* The limit is an unsigned quantity, so parse it as one.
                 * Trailing garbage (including "unlimited") is rejected. */
                errno = 0;
                rttime = strtoull(hard, &e, 10);
                if (errno != 0 || !e || *e != 0)
                        break;

                if (rttime <= rttime_nsec_max)
                        good = true;

                /* Only the first matching line is considered */
                break;
        }

        fclose(f);

        return good ? 0 : -EPERM;
}
/* Checks that the procfs directory of process p is owned by user u.
 * Returns 0 if it is, -EPERM if it is owned by somebody else, or a
 * negative errno value if stat() fails (logged unless the process is
 * simply gone). */
static int verify_process_user(struct rtkit_user *u, struct process *p) {
        char fn[128];
        struct stat st;

        assert_se(snprintf(fn, sizeof(fn)-1, "%s/%llu", get_proc_path(), (unsigned long long) p->pid) < (int) (sizeof(fn)-1));
        fn[sizeof(fn)-1] = 0;

        memset(&st, 0, sizeof(st));

        if (stat(fn, &st) < 0) {
                int r = -errno;

                if (r != -ENOENT)
                        syslog(LOG_WARNING, "Warning: Failed to stat() file '%s': %s\n", fn, strerror(-r));
                return r;
        }

        if (st.st_uid != u->uid)
                return -EPERM;

        return 0;
}
/* Checks that the pid of p still refers to the process we recorded.
 * Returns 0 if the start time matches, -EPERM if it differs, or the
 * negative error from read_starttime() (logged unless it is -ENOENT). */
static int verify_process_starttime(struct process *p) {
        unsigned long long current_start;
        int r = read_starttime(p->pid, 0, &current_start);

        if (r < 0) {
                if (r != -ENOENT)
                        syslog(LOG_WARNING, "Warning: Failed to read start time of process %llu: %s\n", (unsigned long long) p->pid, strerror(-r));
                return r;
        }

        if (current_start != p->starttime)
                return -EPERM;

        return 0;
}
/* Checks that the id of t still refers to the thread we recorded.
 * Returns 0 if the start time matches, -EPERM if it differs, or the
 * negative error from read_starttime() (logged unless it is -ENOENT). */
static int verify_thread_starttime(struct process *p, struct thread *t) {
        unsigned long long current_start;
        int r = read_starttime(p->pid, t->pid, &current_start);

        if (r < 0) {
                if (r != -ENOENT)
                        syslog(LOG_WARNING, "Warning: Failed to read start time of thread %llu: %s\n", (unsigned long long) t->pid, strerror(-r));
                return r;
        }

        if (current_start != t->starttime)
                return -EPERM;

        return 0;
}
/* Demotes thread tid to SCHED_OTHER at nice level 0.
 *
 * Both steps are always attempted. Returns 0 if both succeeded and -1 if
 * either failed; failures are logged unless the thread no longer exists
 * (ESRCH). */
static int thread_reset(pid_t tid) {
        struct sched_param param;
        int r = 0;

        memset(&param, 0, sizeof(param));
        param.sched_priority = 0;

        if (sched_setscheduler(tid, SCHED_OTHER, &param) < 0) {
                if (errno != ESRCH)
                        syslog(LOG_WARNING, "Warning: Failed to reset scheduling to SCHED_OTHER for thread %llu: %s\n", (unsigned long long) tid, strerror(errno));
                r = -1;
        }

        if (setpriority(PRIO_PROCESS, tid, 0) < 0) {
                if (errno != ESRCH)
                        syslog(LOG_WARNING, "Warning: Failed to reset nice level to 0 for thread %llu: %s\n", (unsigned long long) tid, strerror(errno));
                r = -1;
        }

        return r;
}
/* Resolves the executable path of process pid through its "exe" link in
 * procfs. Writes the NUL terminated result (or "n/a" if the link cannot be
 * read) into exe, a buffer of len bytes, and returns exe. */
static char* get_exe_name(pid_t pid, char *exe, size_t len) {
        char fn[128];
        ssize_t n;

        assert_se(snprintf(fn, sizeof(fn)-1, "%s/%llu/exe", get_proc_path(), (unsigned long long) pid) < (int) (sizeof(fn)-1));
        fn[sizeof(fn)-1] = 0;

        n = readlink(fn, exe, len-1);
        if (n >= 0) {
                /* readlink() does not terminate the string */
                exe[n] = 0;
                return exe;
        }

        snprintf(exe, len-1, "n/a");
        exe[len-1] = 0;
        return exe;
}
/* Makes thread t of process p (owned by user u) realtime at the given
 * priority, using the configured policy plus SCHED_RESET_ON_FORK.
 *
 * Returns 0 on success; -EINVAL if the priority is outside the policy's
 * range; -EPERM if it is not below our own priority or exceeds
 * max_realtime_priority; -EBUSY if the user hit the burst limit; otherwise
 * the negative error of whichever verification or system call failed. */
static int process_set_realtime(struct rtkit_user *u, struct process *p, struct thread *t, unsigned priority) {
        int r;
        struct sched_param param;
        char user[64], exe[128];

        if ((int) priority < sched_get_priority_min(sched_policy) ||
            (int) priority > sched_get_priority_max(sched_policy))
                return -EINVAL;

        /* We always want to be able to get a higher RT priority than
         * the client */
        if (priority >= our_realtime_priority ||
            priority > max_realtime_priority)
                return -EPERM;

        /* Make sure users don't flood us with requests */
        if (!verify_burst(u))
                return -EBUSY;

        /* Temporarily become a realtime process. We do this to make
         * sure that our verification code is not preempted by an evil
         * client's code which might have gotten RT through
         * us. */
        if ((r = self_set_realtime(our_realtime_priority)) < 0)
                return r;

        /* Let's make sure that everything is alright before we make
         * the process realtime */
        if ((r = verify_process_user(u, p)) < 0 ||
            (r = verify_process_starttime(p)) < 0 ||
            (r = verify_process_rttime(p)) < 0 ||
            (r = verify_thread_starttime(p, t)) < 0)
                goto finish;

        /* Ok, everything seems to be in order, now, let's do it */
        memset(&param, 0, sizeof(param));
        param.sched_priority = (int) priority;
        if (sched_setscheduler(t->pid, sched_policy|SCHED_RESET_ON_FORK, &param) < 0) {
                r = -errno;
                syslog(LOG_ERR, "Failed to make thread %llu RT: %s\n", (unsigned long long) t->pid, strerror(errno));
                goto finish;
        }

        /* We do some sanity checks afterwards, to verify that the
         * caller didn't play games with us and replaced the process
         * behind the PID */
        if ((r = verify_thread_starttime(p, t)) < 0 ||
            (r = verify_process_rttime(p)) < 0 ||
            (r = verify_process_starttime(p)) < 0 ||
            (r = verify_process_user(u, p)) < 0) {
                /* Undo the promotion, it may have hit the wrong thread */
                thread_reset(t->pid);
                goto finish;
        }

        syslog(LOG_INFO, "Successfully made thread %llu of process %llu (%s) owned by '%s' RT at priority %u.\n",
               (unsigned long long) t->pid,
               (unsigned long long) p->pid,
               get_exe_name(p->pid, exe, sizeof(exe)),
               get_user_name(u->uid, user, sizeof(user)),
               priority);

        r = 0;

finish:
        /* Whatever happened, give up our own temporary RT priority */
        self_drop_realtime(our_nice_level);

        return r;
}
/* Puts thread t of process p (owned by user u) on SCHED_OTHER with
 * SCHED_RESET_ON_FORK and sets its nice level to priority.
 *
 * Returns 0 on success; -EINVAL if priority is outside
 * [PRIO_MIN, PRIO_MAX); -EPERM if it is below min_nice_level; -EBUSY if
 * the user hit the burst limit; otherwise the negative error of whichever
 * verification or system call failed. */
static int process_set_high_priority(struct rtkit_user *u, struct process *p, struct thread *t, int priority) {
        int r;
        struct sched_param param;
        char user[64], exe[128];

        if (priority < PRIO_MIN || priority >= PRIO_MAX)
                return -EINVAL;

        if (priority < min_nice_level)
                return -EPERM;

        /* Make sure users don't flood us with requests */
        if (!verify_burst(u))
                return -EBUSY;

        /* Temporarily become a realtime process */
        if ((r = self_set_realtime(our_realtime_priority)) < 0)
                return r;

        /* Let's make sure that everything is alright before we make
         * the process high priority */
        if ((r = verify_process_user(u, p)) < 0 ||
            (r = verify_process_starttime(p)) < 0 ||
            (r = verify_thread_starttime(p, t)) < 0)
                goto finish;

        /* Ok, everything seems to be in order, now, let's do it */
        memset(&param, 0, sizeof(param));
        param.sched_priority = 0;
        if (sched_setscheduler(t->pid, SCHED_OTHER|SCHED_RESET_ON_FORK, &param) < 0) {
                r = -errno;
                syslog(LOG_ERR, "Failed to make process %llu SCHED_NORMAL: %s\n", (unsigned long long) t->pid, strerror(errno));
                goto finish;
        }

        if (setpriority(PRIO_PROCESS, t->pid, priority) < 0) {
                r = -errno;
                syslog(LOG_ERR, "Failed to set nice level of process %llu to %i: %s\n", (unsigned long long) t->pid, priority, strerror(errno));
                goto finish;
        }

        /* Verify again afterwards, in case the thread or process behind
         * the ids was replaced while we were working */
        if ((r = verify_thread_starttime(p, t)) < 0 ||
            (r = verify_process_starttime(p)) < 0 ||
            (r = verify_process_user(u, p)) < 0) {
                thread_reset(t->pid);
                goto finish;
        }

        syslog(LOG_INFO, "Successfully made thread %llu of process %llu (%s) owned by '%s' high priority at nice level %i.\n",
               (unsigned long long) t->pid,
               (unsigned long long) p->pid,
               get_exe_name(p->pid, exe, sizeof(exe)),
               get_user_name(u->uid, user, sizeof(user)),
               priority);

        r = 0;

finish:
        /* Whatever happened, give up our own temporary RT priority */
        self_drop_realtime(our_nice_level);

        return r;
}
/* Demotes every thread we keep a record of, provided the record still
 * describes the same process and thread, and logs how many were demoted. */
static void reset_known(void) {
        struct rtkit_user *u;
        struct process *p;
        struct thread *t;
        unsigned n_demoted = 0;

        syslog(LOG_INFO, "Demoting known real-time threads.\n");

        for (u = users; u; u = u->next) {
                for (p = u->processes; p; p = p->next) {
                        for (t = p->threads; t; t = t->next) {
                                char exe[64];

                                /* Skip records whose ids have been recycled */
                                if (verify_process_user(u, p) < 0 ||
                                    verify_process_starttime(p) < 0 ||
                                    verify_thread_starttime(p, t) < 0)
                                        continue;

                                if (thread_reset(t->pid) < 0)
                                        continue;

                                syslog(LOG_NOTICE, "Successfully demoted thread %llu of process %llu (%s).\n",
                                       (unsigned long long) t->pid,
                                       (unsigned long long) p->pid,
                                       get_exe_name(p->pid, exe, sizeof(exe)));
                                n_demoted++;
                        }
                }
        }

        syslog(LOG_NOTICE, "Demoted %u threads.\n", n_demoted);
}
/* Goes through procfs and demotes *all* realtime threads (SCHED_FIFO or
 * SCHED_RR, with or without SCHED_RESET_ON_FORK) to SCHED_OTHER.
 *
 * Our own process and pid 1 are skipped, and so are processes owned by
 * root unless canary_demote_root is set.
 *
 * Returns 0 once the scan is done, or a negative errno value if the
 * procfs directory cannot be opened. */
static int reset_all(void) {
        DIR *pd;
        int r;
        unsigned n_demoted = 0;
        const pid_t self = getpid();

        syslog(LOG_INFO, "Demoting known and unknown real-time threads.\n");

        if (!(pd = opendir(get_proc_path()))) {
                r = -errno;
                syslog(LOG_ERR, "opendir(%s) failed: %s\n", get_proc_path(), strerror(errno));
                return r;
        }

        for (;;) {
                const struct dirent *pde;
                char *e = NULL;
                unsigned long long pid;
                char fn[128];
                DIR *td;
                struct stat st;

                if (!(pde = readdir(pd)))
                        break;

                /* d_type is an enumeration, not a bit mask */
                if (pde->d_type != DT_DIR)
                        continue;

                /* Only purely numeric names are process directories */
                errno = 0;
                pid = strtoull(pde->d_name, &e, 10);
                if (errno != 0 || !e || *e != 0)
                        continue;

                if ((pid_t) pid == self ||
                    pid == 1)
                        continue;

                assert_se(snprintf(fn, sizeof(fn)-1, "%s/%llu", get_proc_path(), pid) < (int) (sizeof(fn)-1));
                fn[sizeof(fn)-1] = 0;

                if (stat(fn, &st) < 0) {
                        if (errno != ENOENT)
                                syslog(LOG_WARNING, "Warning: stat(%s) failed: %s\n", fn, strerror(errno));
                        continue;
                }

                if (!S_ISDIR(st.st_mode))
                        continue;

                if (!canary_demote_root && st.st_uid == 0)
                        continue;

                assert_se(snprintf(fn, sizeof(fn)-1, "%s/%llu/task", get_proc_path(), pid) < (int) (sizeof(fn)-1));
                fn[sizeof(fn)-1] = 0;

                /* The process may have exited in the meantime */
                if (!(td = opendir(fn)))
                        continue;

                for (;;) {
                        const struct dirent *tde;
                        unsigned long long tid;

                        if (!(tde = readdir(td)))
                                break;

                        if (tde->d_type != DT_DIR)
                                continue;

                        e = NULL;
                        errno = 0;
                        tid = strtoull(tde->d_name, &e, 10);
                        if (errno != 0 || !e || *e != 0)
                                continue;

                        if ((r = sched_getscheduler((pid_t) tid)) < 0) {
                                if (errno != ESRCH)
                                        syslog(LOG_WARNING, "Warning: sched_getscheduler() failed: %s\n", strerror(errno));
                                continue;
                        }

                        if (r == SCHED_FIFO || r == SCHED_RR ||
                            r == (SCHED_FIFO|SCHED_RESET_ON_FORK) || r == (SCHED_RR|SCHED_RESET_ON_FORK)) {
                                if (thread_reset((pid_t) tid) >= 0) {
                                        char exe[64];

                                        syslog(LOG_NOTICE, "Successfully demoted thread %llu of process %llu (%s).\n",
                                               (unsigned long long) tid,
                                               (unsigned long long) pid,
                                               get_exe_name((pid_t) pid, exe, sizeof(exe)));
                                        n_demoted++;
                                }
                        }
                }

                closedir(td);
        }

        closedir(pd);

        syslog(LOG_NOTICE, "Demoted %u threads.\n", n_demoted);

        return 0;
}
/* This mimics dbus_bus_get_unix_user(): asks the bus daemon for the process
 * id behind the connection called name. Returns the pid, or
 * (uint32_t) -1 widened to unsigned long on failure, in which case the
 * failing D-Bus call has been handed error to fill in. */
static unsigned long get_unix_process_id(
                DBusConnection *connection,
                const char *name,
                DBusError *error) {

        DBusMessage *m, *r;
        uint32_t pid = (uint32_t) -1;

        assert_se(m = dbus_message_new_method_call(
                                  DBUS_SERVICE_DBUS,
                                  DBUS_PATH_DBUS,
                                  DBUS_INTERFACE_DBUS,
                                  "GetConnectionUnixProcessID"));

        assert_se(dbus_message_append_args(
                                  m,
                                  DBUS_TYPE_STRING, &name,
                                  DBUS_TYPE_INVALID));

        r = dbus_connection_send_with_reply_and_block(connection, m, -1, error);
        dbus_message_unref (m);

        if (!r)
                return (unsigned long) pid;

        /* An error reply leaves pid at its failure value; a reply that
         * cannot be decoded resets it explicitly */
        if (!dbus_set_error_from_message(error, r) &&
            !dbus_message_get_args(
                            r, error,
                            DBUS_TYPE_UINT32, &pid,
                            DBUS_TYPE_INVALID))
                pid = (uint32_t) -1;

        dbus_message_unref(r);

        return (unsigned long) pid;
}
/* Resolves the caller of D-Bus message m into our bookkeeping records,
 * creating the user, process and thread records as needed.
 *
 * pid: process to act on, or (pid_t) -1 to use the sender's own process
 * tid: thread to act on, or 0 to use the process' main thread
 *
 * On success stores the records in *_u, *_p and *_t and returns 0. On
 * failure returns a negative errno-style value and leaves the output
 * parameters untouched. */
static int lookup_client(
                struct rtkit_user **_u,
                struct process **_p,
                struct thread **_t,
                DBusConnection *c,
                DBusMessage *m,
                pid_t pid,
                pid_t tid) {

        DBusError error;
        int r;
        unsigned long uid;
        unsigned long long starttime;
        struct rtkit_user *u;
        struct process *p;
        struct thread *t;

        dbus_error_init(&error);

        /* Determine caller credentials */
        if ((uid = dbus_bus_get_unix_user(c, dbus_message_get_sender(m), &error)) == (unsigned long) -1) {
                syslog(LOG_ERR, "dbus_bus_get_unix_user() failed: %s\n", error.message);
                r = -EIO;
                goto fail;
        }

        if (pid == (pid_t) -1 &&
            (pid = get_unix_process_id(c, dbus_message_get_sender(m), &error)) == (pid_t) -1) {
                syslog(LOG_ERR, "get_unix_process_id() failed: %s\n", error.message);
                r = -EIO;
                goto fail;
        }

        /* Find or create user structure. The comparison must stay outside
         * the assignment: otherwise r would hold the boolean result (1)
         * and the caller's "< 0" check would take the failure for success. */
        if ((r = find_user(&u, (uid_t) uid)) < 0)
                goto fail;

        /* Find or create process structure */
        if ((r = read_starttime((pid_t) pid, 0, &starttime)) < 0)
                goto fail;

        if ((r = find_process(&p, u, (pid_t) pid, starttime)) < 0)
                goto fail;

        /* Find or create thread structure */
        if (tid == 0)
                tid = p->pid;

        if ((r = read_starttime(p->pid, tid, &starttime)) < 0)
                goto fail;

        if ((r = find_thread(&t, u, p, (pid_t) tid, starttime)) < 0)
                goto fail;

        *_u = u;
        *_p = p;
        *_t = t;

        return 0;

fail:
        dbus_error_free(&error);

        return r;
}
/* Maps one of our negative errno-style results to the D-Bus error name
 * reported to the client; anything unrecognised becomes DBUS_ERROR_FAILED */
static const char *translate_error_forward(int error) {
        if (error == -EPERM || error == -EACCES || error == -EBUSY)
                return DBUS_ERROR_ACCESS_DENIED;

        if (error == -ENOMEM)
                return DBUS_ERROR_NO_MEMORY;

        return DBUS_ERROR_FAILED;
}
/* Maps a D-Bus error name to a negative errno-style value; names without
 * a specific mapping become -EIO */
static int translate_error_backwards(const char *name) {
        const bool is_oom = !strcmp(name, DBUS_ERROR_NO_MEMORY);

        if (is_oom)
                return -ENOMEM;

        if (!strcmp(name, DBUS_ERROR_SERVICE_UNKNOWN) ||
            !strcmp(name, DBUS_ERROR_NAME_HAS_NO_OWNER))
                return -ENOENT;

        if (!strcmp(name, DBUS_ERROR_ACCESS_DENIED) ||
            !strcmp(name, DBUS_ERROR_AUTH_FAILED))
                return -EACCES;

        return -EIO;
}
/* Asks PolicyKit, via a blocking CheckAuthorization call on connection c,
 * whether process p may perform the given action.
 *
 * Returns 0 if authorized and -EPERM if not; a value from
 * translate_error_backwards() if the call fails or yields an error reply;
 * -EIO if the reply does not have the expected shape.
 * NOTE(review): parameter u is not used in this function. */
static int verify_polkit(DBusConnection *c, struct rtkit_user *u, struct process *p, const char *action) {
DBusError error;
DBusMessage *m = NULL, *r = NULL;
const char *unix_process = "unix-process";
const char *pid = "pid";
const char *start_time = "start-time";
const char *cancel_id = "";
uint32_t flags = 0;
uint32_t pid_u32 = p->pid;
uint64_t start_time_u64 = p->starttime;
DBusMessageIter iter_msg, iter_struct, iter_array, iter_dict, iter_variant;
int ret;
dbus_bool_t authorized = FALSE;
dbus_error_init(&error);
assert_se(m = dbus_message_new_method_call(
"org.freedesktop.PolicyKit1",
"/org/freedesktop/PolicyKit1/Authority",
"org.freedesktop.PolicyKit1.Authority",
"CheckAuthorization"));
dbus_message_iter_init_append(m, &iter_msg);
/* Argument 1: the subject, a struct of the string "unix-process" and an a{sv} dictionary */
assert_se(dbus_message_iter_open_container(&iter_msg, DBUS_TYPE_STRUCT, NULL, &iter_struct));
assert_se(dbus_message_iter_append_basic(&iter_struct, DBUS_TYPE_STRING, &unix_process));
assert_se(dbus_message_iter_open_container(&iter_struct, DBUS_TYPE_ARRAY, "{sv}", &iter_array));
/* Dictionary entry "pid" -> uint32 process id */
assert_se(dbus_message_iter_open_container(&iter_array, DBUS_TYPE_DICT_ENTRY, NULL, &iter_dict));
assert_se(dbus_message_iter_append_basic(&iter_dict, DBUS_TYPE_STRING, &pid));
assert_se(dbus_message_iter_open_container(&iter_dict, DBUS_TYPE_VARIANT, "u", &iter_variant));
assert_se(dbus_message_iter_append_basic(&iter_variant, DBUS_TYPE_UINT32, &pid_u32));
assert_se(dbus_message_iter_close_container(&iter_dict, &iter_variant));
assert_se(dbus_message_iter_close_container(&iter_array, &iter_dict));
/* Dictionary entry "start-time" -> uint64 process start time */
assert_se(dbus_message_iter_open_container(&iter_array, DBUS_TYPE_DICT_ENTRY, NULL, &iter_dict));
assert_se(dbus_message_iter_append_basic(&iter_dict, DBUS_TYPE_STRING, &start_time));
assert_se(dbus_message_iter_open_container(&iter_dict, DBUS_TYPE_VARIANT, "t", &iter_variant));
assert_se(dbus_message_iter_append_basic(&iter_variant, DBUS_TYPE_UINT64, &start_time_u64));
assert_se(dbus_message_iter_close_container(&iter_dict, &iter_variant));
assert_se(dbus_message_iter_close_container(&iter_array, &iter_dict));
assert_se(dbus_message_iter_close_container(&iter_struct, &iter_array));
assert_se(dbus_message_iter_close_container(&iter_msg, &iter_struct));
/* Argument 2: the action id */
assert_se(dbus_message_iter_append_basic(&iter_msg, DBUS_TYPE_STRING, &action));
/* Argument 3: an empty a{ss} array */
assert_se(dbus_message_iter_open_container(&iter_msg, DBUS_TYPE_ARRAY, "{ss}", &iter_array));
assert_se(dbus_message_iter_close_container(&iter_msg, &iter_array));
/* Arguments 4 and 5: flags (0) and an empty cancellation id */
assert_se(dbus_message_iter_append_basic(&iter_msg, DBUS_TYPE_UINT32, &flags));
assert_se(dbus_message_iter_append_basic(&iter_msg, DBUS_TYPE_STRING, &cancel_id));
if (!(r = dbus_connection_send_with_reply_and_block(c, m, -1, &error))) {
ret = translate_error_backwards(error.name);
goto finish;
}
if (dbus_set_error_from_message(&error, r)) {
ret = translate_error_backwards(error.name);
goto finish;
}
/* The reply must be a struct whose first member is the boolean verdict */
if (!dbus_message_iter_init(r, &iter_msg) ||
dbus_message_iter_get_arg_type(&iter_msg) != DBUS_TYPE_STRUCT) {
ret = -EIO;
goto finish;
}
dbus_message_iter_recurse(&iter_msg, &iter_struct);
if (dbus_message_iter_get_arg_type(&iter_struct) != DBUS_TYPE_BOOLEAN) {
ret = -EIO;
goto finish;
}
dbus_message_iter_get_basic(&iter_struct, &authorized);
ret = authorized ? 0 : -EPERM;
finish:
if (m)
dbus_message_unref(m);
if (r)
dbus_message_unref(r);
/* error.message is only set if one of the D-Bus calls above failed */
if (error.message)
syslog(LOG_WARNING, "Warning: PolicyKit call failed: %s\n", error.message);
dbus_error_free(&error);
return ret;
}
/* Returns -EPERM (after logging a warning) while the monotonic clock is
 * still before refuse_until, and 0 otherwise */
static int verify_canary_refusal(void) {
        struct timespec now;

        assert_se(clock_gettime(CLOCK_MONOTONIC, &now) == 0);

        if (now.tv_sec >= (time_t) refuse_until)
                return 0;

        syslog(LOG_WARNING, "Recovering from system lockup, not allowing further RT threads.\n");
        return -EPERM;
}
/* Appends one basic value to message m, wrapped in a D-Bus variant.
 * type is the D-Bus type code of the value and data points to it. */
static void add_variant(
                DBusMessage *m,
                int type,
                const void *data) {

        DBusMessageIter iter, sub;
        /* The variant signature is the single type code as a string */
        char t[2] = { (char) type, 0 };

        dbus_message_iter_init_append(m, &iter);

        assert_se(dbus_message_iter_open_container(&iter, DBUS_TYPE_VARIANT, t, &sub));
        assert_se(dbus_message_iter_append_basic(&sub, type, data));
        assert_se(dbus_message_iter_close_container(&iter, &sub));
}
/* Appends the value of the named property to reply r as a variant.
 * Returns 0 if the property is known, -1 otherwise (nothing is appended). */
static int handle_dbus_prop_get(const char* property, DBusMessage *r) {
        if (!strcmp(property, "RTTimeNSecMax")) {
                add_variant(r, DBUS_TYPE_INT64, &rttime_nsec_max);
                return 0;
        }

        if (!strcmp(property, "MaxRealtimePriority")) {
                add_variant(r, DBUS_TYPE_INT32, &max_realtime_priority);
                return 0;
        }

        if (!strcmp(property, "MinNiceLevel")) {
                add_variant(r, DBUS_TYPE_INT32, &min_nice_level);
                return 0;
        }

        return -1;
}
static DBusHandlerResult dbus_handler(DBusConnection *c, DBusMessage *m, void *userdata) {
DBusError error;
DBusMessage *r = NULL;
int is2 = 0;
dbus_error_init(&error);
/* We garbage collect on every user call */
user_gc();
if (dbus_message_is_method_call(m, "org.freedesktop.RealtimeKit1", "MakeThreadRealtime") ||
(is2 = dbus_message_is_method_call(m, "org.freedesktop.RealtimeKit1", "MakeThreadRealtimeWithPID"))) {
uint64_t thread, process = (uint64_t) -1;
uint32_t priority;
struct rtkit_user *u;
struct process *p;
struct thread *t;
int ret;
if ((ret = verify_canary_refusal()) < 0) {
assert_se(r = dbus_message_new_error_printf(m, translate_error_forward(ret), strerror(-ret)));
goto finish;
}
if (is2)
ret = dbus_message_get_args(m, &error,
DBUS_TYPE_UINT64, &process,
DBUS_TYPE_UINT64, &thread,
DBUS_TYPE_UINT32, &priority,
DBUS_TYPE_INVALID);
else
ret = dbus_message_get_args(m, &error,
DBUS_TYPE_UINT64, &thread,
DBUS_TYPE_UINT32, &priority,
DBUS_TYPE_INVALID);
if (!ret) {
syslog(LOG_DEBUG, "Failed to parse MakeThreadRealtime() method call: %s\n", error.message);
assert_se(r = dbus_message_new_error(m, error.name, error.message));
goto finish;
}
if ((ret = lookup_client(&u, &p, &t, c, m, (pid_t)process, (pid_t) thread)) < 0) {
syslog(LOG_DEBUG, "Failed to look up client: %s\n", strerror(-ret));
assert_se(r = dbus_message_new_error_printf(m, translate_error_forward(ret), strerror(-ret)));
goto finish;
}
if ((ret = verify_polkit(c, u, p, "org.freedesktop.RealtimeKit1.acquire-real-time")) < 0) {
assert_se(r = dbus_message_new_error_printf(m, translate_error_forward(ret), strerror(-ret)));
goto finish;
}
if ((ret = process_set_realtime(u, p, t, priority))) {
assert_se(r = dbus_message_new_error_printf(m, translate_error_forward(ret), strerror(-ret)));
goto finish;
}
assert_se(r = dbus_message_new_method_return(m));
} else if (dbus_message_is_method_call(m, "org.freedesktop.RealtimeKit1", "MakeThreadHighPriority")
|| (is2 = dbus_message_is_method_call(m, "org.freedesktop.RealtimeKit1", "MakeThreadHighPriorityWithPID"))) {
uint64_t thread, process = (uint64_t) -1;
int32_t priority;
struct rtkit_user *u;
struct process *p;
struct thread *t;
int ret;
if ((ret = verify_canary_refusal()) < 0) {
assert_se(r = dbus_message_new_error_printf(m, translate_error_forward(ret), strerror(-ret)));
goto finish;
}
if (is2)
ret = dbus_message_get_args(m, &error,
DBUS_TYPE_UINT64, &process,
DBUS_TYPE_UINT64, &thread,
DBUS_TYPE_INT32, &priority,
DBUS_TYPE_INVALID);
else
ret = dbus_message_get_args(m, &error,
DBUS_TYPE_UINT64, &thread,
DBUS_TYPE_INT32, &priority,
DBUS_TYPE_INVALID);
if (!ret) {
syslog(LOG_DEBUG, "Failed to parse MakeThreadHighPriority() method call: %s\n", error.message);
assert_se(r = dbus_message_new_error(m, error.name, error.message));
goto finish;
}
if ((ret = lookup_client(&u, &p, &t, c, m, (pid_t)process, (pid_t) thread)) < 0) {
syslog(LOG_DEBUG, "Failed to look up client: %s\n", strerror(-ret));
assert_se(r = dbus_message_new_error_printf(m, translate_error_forward(ret), strerror(-ret)));
goto finish;
}
if ((ret = verify_polkit(c, u, p, "org.freedesktop.RealtimeKit1.acquire-high-priority")) < 0) {
assert_se(r = dbus_message_new_error_printf(m, translate_error_forward(ret), strerror(-ret)));
goto finish;
}
if ((ret = process_set_high_priority(u, p, t, priority))) {
assert_se(r = dbus_message_new_error_printf(m, translate_error_forward(ret), strerror(-ret)));
goto finish;
}
assert_se(r = dbus_message_new_method_return(m));
} else if (dbus_message_is_method_call(m, "org.freedesktop.RealtimeKit1", "ResetAll")) {
reset_all();
user_gc();
assert_se(r = dbus_message_new_method_return(m));
} else if (dbus_message_is_method_call(m, "org.freedesktop.RealtimeKit1", "ResetKnown")) {
reset_known();
user_gc();
assert_se(r = dbus_message_new_method_return(m));
} else if (dbus_message_is_method_call(m, "org.freedesktop.RealtimeKit1", "Exit")) {
assert_se(r = dbus_message_new_method_return(m));
assert_se(dbus_connection_send(c, r, NULL));
dbus_message_unref(r);
r = NULL;
dbus_connection_close(c);
} else if (dbus_message_is_method_call(m, "org.freedesktop.DBus.Properties", "Get")) {
const char *interface, *property;
if (!dbus_message_get_args(m, &error,
DBUS_TYPE_STRING, &interface,
DBUS_TYPE_STRING, &property,
DBUS_TYPE_INVALID)) {
syslog(LOG_DEBUG, "Failed to parse property get call: %s\n", error.message);
assert_se(r = dbus_message_new_error(m, error.name, error.message));
goto finish;
}
if (strcmp(interface, "org.freedesktop.RealtimeKit1") == 0) {
assert_se(r = dbus_message_new_method_return(m));
if (!handle_dbus_prop_get(property, r) < 0) {
dbus_message_unref(r);
assert_se(r = dbus_message_new_error_printf(
m,
DBUS_ERROR_UNKNOWN_METHOD,
"Unknown property %s",
property));
}
} else
return DBUS_HANDLER_RESULT_NOT_YET_HANDLED;
} else if (dbus_message_is_method_call(m, "org.freedesktop.DBus.Introspectable", "Introspect")) {
const char *xml = INTROSPECT_XML;
assert_se(r = dbus_message_new_method_return(m));
assert_se(dbus_message_append_args(
r,
DBUS_TYPE_STRING, &xml,
DBUS_TYPE_INVALID));
} else
return DBUS_HANDLER_RESULT_NOT_YET_HANDLED;
syslog(LOG_DEBUG, "Supervising %u threads of %u processes of %u users.\n",
n_total_threads,
n_total_processes,
n_users);
sd_notifyf(0,
"STATUS=Supervising %u threads of %u processes of %u users.",
n_total_threads,
n_total_processes,
n_users);
finish:
if (r) {
assert_se(dbus_connection_send(c, r, NULL));
dbus_message_unref(r);
}
dbus_error_free(&error);
return DBUS_HANDLER_RESULT_HANDLED;
}
/* Connects to the system bus, claims RTKIT_SERVICE_NAME and registers
 * dbus_handler() as the message handler for RTKIT_OBJECT_PATH.
 *
 * Returns 0 on success and -EIO on failure. If the connection was
 * established but the name could not be acquired, *c still points to the
 * open connection, i.e. the caller stays responsible for closing and
 * unreferencing it. */
static int setup_dbus(DBusConnection **c) {
        static const DBusObjectPathVTable vtable = {
                .message_function = dbus_handler,
        };

        DBusError error;
        int reply;

        dbus_error_init(&error);

        if (!(*c = dbus_bus_get_private(DBUS_BUS_SYSTEM, &error))) {
                syslog(LOG_ERR, "Failed to connect to system bus: %s\n", error.message);
                goto fail;
        }

        reply = dbus_bus_request_name(*c, RTKIT_SERVICE_NAME, DBUS_NAME_FLAG_DO_NOT_QUEUE, &error);
        if (reply < 0) {
                syslog(LOG_ERR, "Failed to register name on bus: %s\n", error.message);
                goto fail;
        }

        /* A non-negative reply only tells us that the bus processed the
         * request. Since we ask not to be queued, a name that is held by
         * somebody else is reported through the reply code, with error left
         * unset, so it has to be checked explicitly. */
        if (reply != DBUS_REQUEST_NAME_REPLY_PRIMARY_OWNER &&
            reply != DBUS_REQUEST_NAME_REPLY_ALREADY_OWNER) {
                syslog(LOG_ERR, "Failed to register name on bus: name is owned by another connection.\n");
                goto fail;
        }

        assert_se(dbus_connection_register_object_path(*c, RTKIT_OBJECT_PATH, &vtable, NULL));

        return 0;

fail:
        dbus_error_free(&error);
        return -EIO;
}
/* Blocks every signal for the calling thread. Aborts (via assert_se) if the
 * signal set cannot be built or the mask cannot be installed. */
static void block_all_signals(void) {
        sigset_t set;

        /* Start from the full set ... */
        assert_se(sigfillset(&set) == 0);

        /* ... and add all of it to this thread's blocked mask */
        assert_se(pthread_sigmask(SIG_BLOCK, &set, NULL) == 0);
}
/* Thread body of the canary: writes to canary_fd ("cheeps") every
 * canary_cheep_msec milliseconds until quit_fd becomes readable or an
 * unrecoverable error occurs. The data argument is unused. Always returns
 * NULL. */
static void* canary_thread(void *data) {
        struct timespec last_cheep, now;
        struct pollfd quit_poll;

        assert(canary_fd >= 0);
        assert(quit_fd >= 0);

        /* Make sure we are not disturbed by any signal */
        block_all_signals();

        self_drop_realtime(0);

        /* The only thing we wait for is the request to quit */
        quit_poll = (struct pollfd) { .fd = quit_fd, .events = POLLIN };

        assert_se(clock_gettime(CLOCK_MONOTONIC, &now) == 0);
        last_cheep = now;

        syslog(LOG_DEBUG, "Canary thread running.\n");

        for (;;) {
                int64_t remaining;
                int n_ready;

                /* Sleep until the next cheep is due, or right away if it is overdue */
                remaining = TIMESPEC_MSEC(last_cheep) + canary_cheep_msec - TIMESPEC_MSEC(now);
                n_ready = poll(&quit_poll, 1, (int) (remaining < 0 ? 0 : remaining));

                assert_se(clock_gettime(CLOCK_MONOTONIC, &now) == 0);

                if (n_ready < 0) {
                        if (errno != EINTR && errno != EAGAIN) {
                                syslog(LOG_ERR, "poll() failed: %s\n", strerror(errno));
                                break;
                        }

                        continue;
                }

                if (quit_poll.revents) {
                        syslog(LOG_DEBUG, "Exiting canary thread.\n");
                        break;
                }

                /* Woken up early? Then go back to sleep. */
                if (TIMESPEC_MSEC(last_cheep) + canary_cheep_msec > TIMESPEC_MSEC(now))
                        continue;

                if (eventfd_write(canary_fd, 1) < 0) {
                        syslog(LOG_ERR, "eventfd_write() failed: %s\n", strerror(errno));
                        break;
                }

                last_cheep = now;
        }

        return NULL;
}
/* Thread body of the watchdog: waits for cheeps on canary_fd. If none arrives
 * within canary_watchdog_msec milliseconds, new requests are refused for
 * canary_refusal_sec seconds (through refuse_until) and reset_all() or
 * reset_known() is invoked, depending on canary_demote_unknown. Terminates
 * when quit_fd becomes readable or on an unrecoverable error. The data
 * argument is unused. Always returns NULL. */
static void* watchdog_thread(void *data) {
        enum {
                POLLFD_CANARY,
                POLLFD_QUIT,
                _POLLFD_MAX
        };

        struct timespec last_cheep, now;
        struct pollfd fds[_POLLFD_MAX];

        assert(canary_fd >= 0);
        assert(quit_fd >= 0);

        /* Make sure we are not disturbed by any signal */
        block_all_signals();

        self_set_realtime(canary_watchdog_realtime_priority);

        fds[POLLFD_CANARY] = (struct pollfd) { .fd = canary_fd, .events = POLLIN };
        fds[POLLFD_QUIT] = (struct pollfd) { .fd = quit_fd, .events = POLLIN };

        assert_se(clock_gettime(CLOCK_MONOTONIC, &now) == 0);
        last_cheep = now;

        syslog(LOG_DEBUG, "Watchdog thread running.\n");

        for (;;) {
                int64_t remaining;
                int n_ready;

                /* Wait at most until the canary would be considered overdue */
                remaining = TIMESPEC_MSEC(last_cheep) + canary_watchdog_msec - TIMESPEC_MSEC(now);
                n_ready = poll(fds, _POLLFD_MAX, (int) (remaining < 0 ? 0 : remaining));

                assert_se(clock_gettime(CLOCK_MONOTONIC, &now) == 0);

                if (n_ready < 0) {
                        if (errno != EINTR && errno != EAGAIN) {
                                syslog(LOG_ERR, "poll() failed: %s\n", strerror(errno));
                                break;
                        }

                        continue;
                }

                if (fds[POLLFD_QUIT].revents) {
                        syslog(LOG_DEBUG, "Exiting watchdog thread.\n");
                        break;
                }

                if (fds[POLLFD_CANARY].revents) {
                        eventfd_t counter;

                        /* The canary is alive: consume the cheep and restart the timeout */
                        if (eventfd_read(canary_fd, &counter) < 0) {
                                syslog(LOG_ERR, "eventfd_read() failed: %s\n", strerror(errno));
                                break;
                        }

                        last_cheep = now;
                        continue;
                }

                /* Neither fd fired; only act if the deadline has really passed */
                if (TIMESPEC_MSEC(last_cheep) + canary_watchdog_msec > TIMESPEC_MSEC(now))
                        continue;

                last_cheep = now;
                syslog(LOG_WARNING, "The canary thread is apparently starving. Taking action.\n");

                /* Publish the refusal deadline before resetting anything */
                refuse_until = (uint32_t) now.tv_sec + canary_refusal_sec;
                __sync_synchronize();

                if (canary_demote_unknown)
                        reset_all();
                else
                        reset_known();
        }

        return NULL;
}
/* Shuts down the canary machinery: signals quit_fd, joins the canary and
 * watchdog threads if they were started, and closes both eventfds. Failures
 * are only logged. Safe to call when nothing (or only part of it) was set
 * up, since every step is guarded by the corresponding fd/thread id. */
static void stop_canary(void) {
        int r;

        /* Tell both threads to leave their poll() loops */
        if (quit_fd >= 0 && eventfd_write(quit_fd, 1) < 0)
                syslog(LOG_WARNING, "Warning: eventfd_write() failed: %s\n", strerror(errno));

        if (canary_thread_id != 0) {
                r = pthread_join(canary_thread_id, NULL);
                if (r != 0)
                        syslog(LOG_WARNING, "Warning: pthread_join() failed: %s\n", strerror(r));

                canary_thread_id = 0;
        }

        if (watchdog_thread_id != 0) {
                r = pthread_join(watchdog_thread_id, NULL);
                if (r != 0)
                        syslog(LOG_WARNING, "Warning: pthread_join() failed: %s\n", strerror(r));

                watchdog_thread_id = 0;
        }

        /* Only now that nobody polls them anymore release the fds */
        if (canary_fd >= 0) {
                close(canary_fd);
                canary_fd = -1;
        }

        if (quit_fd >= 0) {
                close(quit_fd);
                quit_fd = -1;
        }
}
/* Creates the two eventfds and starts the canary and watchdog threads.
 * Does nothing and returns 0 if do_canary is unset. Returns 0 on success or
 * a negative errno-style value on failure, in which case everything that was
 * already set up is torn down again through stop_canary(). */
static int start_canary(void) {
        int r;

        if (!do_canary)
                return 0;

        /* The second eventfd is only attempted if the first one worked */
        canary_fd = eventfd(0, EFD_NONBLOCK|EFD_CLOEXEC);
        if (canary_fd >= 0)
                quit_fd = eventfd(0, EFD_NONBLOCK|EFD_CLOEXEC);

        if (canary_fd < 0 || quit_fd < 0) {
                r = -errno;
                syslog(LOG_ERR, "eventfd() failed: %s\n", strerror(errno));
                goto fail;
        }

        /* pthread_create() returns a positive error code, hence the negation */
        r = -pthread_create(&canary_thread_id, NULL, canary_thread, NULL);
        if (r >= 0)
                r = -pthread_create(&watchdog_thread_id, NULL, watchdog_thread, NULL);

        if (r < 0) {
                syslog(LOG_ERR, "pthread_create failed: %s\n", strerror(-r));
                goto fail;
        }

        return 0;

fail:
        stop_canary();
        return r;
}
/* Reduces the privileges of the daemon, as far as enabled by do_chroot and
 * do_drop_privileges:
 *
 *   - chroot()s into /proc (and points the global proc path to "/"),
 *   - switches real/effective/saved GID and UID to those of username,
 *   - restricts the permitted and effective capability sets to
 *     CAP_SYS_NICE, CAP_DAC_READ_SEARCH and CAP_SYS_PTRACE,
 *   - updates USER, USERNAME, LOGNAME and HOME in the environment.
 *
 * Returns 0 on success and a negative errno-style value on failure. */
static int drop_privileges(void) {
        struct passwd *pw = NULL;
        int r;

        if (do_drop_privileges) {

                /* First, get user data (looked up before the chroot() below) */
                if (!(pw = getpwnam(username))) {
                        syslog(LOG_ERR, "Failed to find user '%s'.\n", username);
                        return -ENOENT;
                }
        }

        if (do_chroot) {

                /* Second, chroot() */
                if (chroot("/proc") < 0 ||
                    chdir("/") < 0) {
                        r = -errno;
                        syslog(LOG_ERR, "Failed to chroot() to /proc: %s\n", strerror(errno));
                        return r;
                }

                /* What used to be /proc is the root directory from now on */
                proc = "/";

                syslog(LOG_DEBUG, "Successfully called chroot.\n");
        }

        if (do_drop_privileges) {
                cap_t caps;
                const cap_value_t cap_values[] = {
                        CAP_SYS_NICE,             /* Needed for obvious reasons */
                        CAP_DAC_READ_SEARCH,      /* Needed so that we can verify resource limits */
                        CAP_SYS_PTRACE            /* Needed so that we can read /proc/$$/exe. Linux is weird. */
                };

                /* Third, say that we want to keep caps */
                if (prctl(PR_SET_KEEPCAPS, 1) < 0) {
                        r = -errno;
                        syslog(LOG_ERR, "PR_SET_KEEPCAPS failed: %s\n", strerror(errno));
                        return r;
                }

                /* Fourth, drop privs. The group has to go first, as we could
                 * not change it anymore once the UID is gone.
                 *
                 * NOTE(review): supplementary groups are not touched here (no
                 * setgroups() call), so the daemon keeps whatever groups it
                 * was started with -- confirm whether that is intended. */
                if (setresgid(pw->pw_gid, pw->pw_gid, pw->pw_gid) < 0 ||
                    setresuid(pw->pw_uid, pw->pw_uid, pw->pw_uid) < 0) {
                        r = -errno;
                        syslog(LOG_ERR, "Failed to become %s: %s\n", username, strerror(errno));
                        return r;
                }

                /* Fifth, reset caps flag */
                if (prctl(PR_SET_KEEPCAPS, 0) < 0) {
                        r = -errno;
                        syslog(LOG_ERR, "PR_SET_KEEPCAPS failed: %s\n", strerror(errno));
                        return r;
                }

                /* Sixth, reduce caps */
                assert_se(caps = cap_init());
                assert_se(cap_clear(caps) == 0);
                assert_se(cap_set_flag(caps, CAP_EFFECTIVE, ELEMENTSOF(cap_values), cap_values, CAP_SET) == 0);
                assert_se(cap_set_flag(caps, CAP_PERMITTED, ELEMENTSOF(cap_values), cap_values, CAP_SET) == 0);

                if (cap_set_proc(caps) < 0) {
                        r = -errno;
                        syslog(LOG_ERR, "cap_set_proc() failed: %s\n", strerror(errno));

                        /* Don't leak the capability state on the error path */
                        cap_free(caps);
                        return r;
                }

                assert_se(cap_free(caps) == 0);

                /* Seventh, update environment */
                setenv("USER", username, 1);
                setenv("USERNAME", username, 1);
                setenv("LOGNAME", username, 1);
                setenv("HOME", get_proc_path(), 1);

                syslog(LOG_DEBUG, "Successfully dropped privileges.\n");
        }

        return 0;
}
/* Lowers the soft and hard resource limits of the daemon to the values in
 * the table below. The RLIMIT_RTTIME entry is always updated from
 * rttime_nsec_max, even when do_limit_resources is unset and no limit is
 * actually applied. A limit whose hard value is already below the wanted
 * value is left alone. Returns 0 on success or a negative errno-style value
 * if getrlimit()/setrlimit() fails. */
static int set_resource_limits(void) {

        static struct {
                int id;
                const char *name;
                rlim_t value;
        } table[] = {
                { RLIMIT_FSIZE,    "RLIMIT_FSIZE",    0  },
                { RLIMIT_MEMLOCK,  "RLIMIT_MEMLOCK",  0  },
                { RLIMIT_MSGQUEUE, "RLIMIT_MSGQUEUE", 0  },
                { RLIMIT_NICE,     "RLIMIT_NICE",     20 },
                { RLIMIT_NOFILE,   "RLIMIT_NOFILE",   50 },
                { RLIMIT_NPROC,    "RLIMIT_NPROC",    3  },
                { RLIMIT_RTPRIO,   "RLIMIT_RTPRIO",   0  }, /* Since we have CAP_SYS_NICE we don't need this */
                { RLIMIT_RTTIME,   "RLIMIT_RTTIME",   0  }  /* Filled in at runtime, see below */
        };

        unsigned n;

        /* Do as I say AND do as I do */
        assert(table[7].id == RLIMIT_RTTIME);
        table[7].value = rttime_nsec_max;

        if (!do_limit_resources)
                return 0;

        for (n = 0; n < ELEMENTSOF(table); n++) {
                struct rlimit rlim;
                int saved_errno;

                if (getrlimit(table[n].id, &rlim) < 0) {
                        saved_errno = errno;
                        syslog(LOG_ERR, "Failed to get %s: %s\n", table[n].name, strerror(saved_errno));
                        return -saved_errno;
                }

                /* Only ever tighten a limit, never try to raise it */
                if (rlim.rlim_max >= table[n].value) {
                        rlim.rlim_cur = rlim.rlim_max = table[n].value;

                        if (setrlimit(table[n].id, &rlim) < 0) {
                                saved_errno = errno;
                                syslog(LOG_ERR, "Failed to set %s: %s\n", table[n].name, strerror(saved_errno));
                                return -saved_errno;
                        }
                }
        }

        syslog(LOG_DEBUG, "Successfully limited resources.\n");

        return 0;
}
/* Identifiers getopt_long() returns for the long-only options. They start at
 * 256 so that they cannot collide with the character codes used for short
 * options such as 'h'. */
enum {
        ARG_HELP = 256,
        ARG_VERSION,
        ARG_SCHEDULING_POLICY,
        ARG_OUR_REALTIME_PRIORITY,
        ARG_OUR_NICE_LEVEL,
        ARG_MAX_REALTIME_PRIORITY,
        ARG_MIN_NICE_LEVEL,
        ARG_USER_NAME,
        ARG_RTTIME_NSEC_MAX,
        ARG_USERS_MAX,
        ARG_PROCESSES_PER_USER_MAX,
        ARG_THREADS_PER_USER_MAX,
        ARG_ACTIONS_BURST_SEC,
        ARG_ACTIONS_PER_BURST_MAX,
        ARG_NO_DROP_PRIVILEGES,
        ARG_NO_CHROOT,
        ARG_NO_LIMIT_RESOURCES,
        ARG_NO_CANARY,
        ARG_CANARY_CHEEP_MSEC,
        ARG_CANARY_WATCHDOG_MSEC,
        ARG_CANARY_DEMOTE_ROOT,
        ARG_CANARY_DEMOTE_UNKNOWN,
        ARG_CANARY_REFUSE_SEC,
        ARG_STDERR,
        ARG_INTROSPECT
};

/* Table for getopt_long(). No entry uses a flag pointer, hence
 * getopt_long() hands back the val field for every option. */
static const struct option long_options[] = {
        { .name = "help",                   .has_arg = no_argument,       .flag = NULL, .val = ARG_HELP },
        { .name = "version",                .has_arg = no_argument,       .flag = NULL, .val = ARG_VERSION },
        { .name = "scheduling-policy",      .has_arg = required_argument, .flag = NULL, .val = ARG_SCHEDULING_POLICY },
        { .name = "our-realtime-priority",  .has_arg = required_argument, .flag = NULL, .val = ARG_OUR_REALTIME_PRIORITY },
        { .name = "our-nice-level",         .has_arg = required_argument, .flag = NULL, .val = ARG_OUR_NICE_LEVEL },
        { .name = "max-realtime-priority",  .has_arg = required_argument, .flag = NULL, .val = ARG_MAX_REALTIME_PRIORITY },
        { .name = "min-nice-level",         .has_arg = required_argument, .flag = NULL, .val = ARG_MIN_NICE_LEVEL },
        { .name = "user-name",              .has_arg = required_argument, .flag = NULL, .val = ARG_USER_NAME },
        { .name = "rttime-nsec-max",        .has_arg = required_argument, .flag = NULL, .val = ARG_RTTIME_NSEC_MAX },
        { .name = "users-max",              .has_arg = required_argument, .flag = NULL, .val = ARG_USERS_MAX },
        { .name = "processes-per-user-max", .has_arg = required_argument, .flag = NULL, .val = ARG_PROCESSES_PER_USER_MAX },
        { .name = "threads-per-user-max",   .has_arg = required_argument, .flag = NULL, .val = ARG_THREADS_PER_USER_MAX },
        { .name = "actions-burst-sec",      .has_arg = required_argument, .flag = NULL, .val = ARG_ACTIONS_BURST_SEC },
        { .name = "actions-per-burst-max",  .has_arg = required_argument, .flag = NULL, .val = ARG_ACTIONS_PER_BURST_MAX },
        { .name = "no-drop-privileges",     .has_arg = no_argument,       .flag = NULL, .val = ARG_NO_DROP_PRIVILEGES },
        { .name = "no-chroot",              .has_arg = no_argument,       .flag = NULL, .val = ARG_NO_CHROOT },
        { .name = "no-limit-resources",     .has_arg = no_argument,       .flag = NULL, .val = ARG_NO_LIMIT_RESOURCES },
        { .name = "no-canary",              .has_arg = no_argument,       .flag = NULL, .val = ARG_NO_CANARY },
        { .name = "canary-cheep-msec",      .has_arg = required_argument, .flag = NULL, .val = ARG_CANARY_CHEEP_MSEC },
        { .name = "canary-watchdog-msec",   .has_arg = required_argument, .flag = NULL, .val = ARG_CANARY_WATCHDOG_MSEC },
        { .name = "canary-demote-root",     .has_arg = no_argument,       .flag = NULL, .val = ARG_CANARY_DEMOTE_ROOT },
        { .name = "canary-demote-unknown",  .has_arg = no_argument,       .flag = NULL, .val = ARG_CANARY_DEMOTE_UNKNOWN },
        { .name = "canary-refuse-sec",      .has_arg = required_argument, .flag = NULL, .val = ARG_CANARY_REFUSE_SEC },
        { .name = "stderr",                 .has_arg = no_argument,       .flag = NULL, .val = ARG_STDERR },
        { .name = "introspect",             .has_arg = no_argument,       .flag = NULL, .val = ARG_INTROSPECT },
        { .name = NULL,                     .has_arg = 0,                 .flag = NULL, .val = 0 } /* End marker */
};
/* Returns the part of the path p that follows the last '/', or p itself if
 * it contains no slash. The result points into p, nothing is allocated. */
static char* get_file_name(const char *p) {
        const char *slash = strrchr(p, '/');

        return (char*) (slash ? slash + 1 : p);
}
/* Prints the usage text to stdout. exe is the program name shown in the
 * first line; the values in parentheses are the current settings of the
 * corresponding global variables. */
static void show_help(const char *exe) {

        static const char * const sp_names[] = {
                [SCHED_OTHER] = "OTHER",
                [SCHED_BATCH] = "BATCH",
                [SCHED_FIFO] = "FIFO",
                [SCHED_RR] = "RR"
        };

        /* Valid realtime priority range of the selected scheduling policy */
        const int rt_min = sched_get_priority_min(sched_policy);
        const int rt_max = sched_get_priority_max(sched_policy);

        printf("%s [options]\n\n"
               "COMMANDS:\n"
               " -h, --help Show this help\n"
               " --version Show version\n\n",
               exe);

        printf("OPTIONS:\n"
               " --stderr Log to STDERR in addition to syslog\n"
               " --user-name=USER Run daemon as user (%s)\n\n",
               username);

        printf(" --scheduling-policy=(RR|FIFO) Choose scheduling policy (%s)\n"
               " --our-realtime-priority=[%i..%i] Realtime priority for the daemon (%u)\n"
               " --our-nice-level=[%i..%i] Nice level for the daemon (%i)\n"
               " --max-realtime-priority=[%i..%i] Max realtime priority for clients (%u)\n"
               " --min-nice-level=[%i..%i] Min nice level for clients (%i)\n\n",
               sp_names[sched_policy],
               rt_min, rt_max, our_realtime_priority,
               PRIO_MIN, PRIO_MAX-1, our_nice_level,
               rt_min, rt_max, max_realtime_priority,
               PRIO_MIN, PRIO_MAX-1, min_nice_level);

        printf(" --rttime-nsec-max=NSEC Require clients to have set RLIMIT_RTTIME\n"
               " not greater than this (%llu)\n\n",
               rttime_nsec_max);

        printf(" --users-max=INT How many users this daemon will serve at\n"
               " max at the same time (%u)\n"
               " --processes-per-user-max=INT How many processes this daemon will serve\n"
               " at max per user at the same time (%u)\n"
               " --threads-per-user-max=INT How many threads this daemon will serve\n"
               " at max per user at the same time (%u)\n\n",
               users_max,
               processes_per_user_max,
               threads_per_user_max);

        printf(" --actions-burst-sec=SEC Enforce requests limits in this time (%u)\n"
               " --actions-per-burst-max=INT Allow this many requests per burst (%u)\n\n",
               actions_burst_sec,
               actions_per_burst_max);

        printf(" --canary-cheep-msec=MSEC Canary cheep interval (%u)\n"
               " --canary-watchdog-msec=MSEC Watchdog action delay (%u)\n"
               " --canary-demote-unknown When the canary dies demote unknown\n"
               " processes too?\n"
               " --canary-demote-root When the canary dies demote root\n"
               " processes too?\n"
               " --canary-refuse-sec=SEC How long to refuse further requests\n"
               " after the canary died (%u)\n\n",
               canary_cheep_msec,
               canary_watchdog_msec,
               canary_refusal_sec);

        printf(" --no-canary Don't run a canary-based RT watchdog\n\n"
               " --no-drop-privileges Don't drop privileges\n"
               " --no-chroot Don't chroot\n"
               " --no-limit-resources Don't limit daemon's resources\n");
}
static int parse_command_line(int argc, char *argv[], int *ret) {
int c;
while ((c = getopt_long(argc, argv, "h", long_options, NULL)) >= 0) {
switch (c) {
case 'h':
case ARG_HELP:
show_help(get_file_name(argv[0]));
*ret = 0;
return 0;
case ARG_VERSION:
printf("%s " PACKAGE_VERSION "\n", get_file_name(argv[0]));
*ret = 0;
return 0;
case ARG_USER_NAME:
username = optarg;
break;
case ARG_SCHEDULING_POLICY: {
if (strcasecmp(optarg, "rr") == 0)
sched_policy = SCHED_RR;
else if (strcasecmp(optarg, "fifo") == 0)
sched_policy = SCHED_FIFO;
else {
fprintf(stderr, "--scheduling-policy parameter invalid.\n");
return -1;
}
break;
}
case ARG_OUR_REALTIME_PRIORITY: {
char *e = NULL;
unsigned long u;
errno = 0;
u = strtoul(optarg, &e, 0);
if (errno != 0 || !e || *e || u < (unsigned) sched_get_priority_min(sched_policy) || u > (unsigned) sched_get_priority_max(sched_policy)) {
fprintf(stderr, "--our-realtime-priority parameter invalid.\n");
return -1;
}
our_realtime_priority = u;
break;
}
case ARG_OUR_NICE_LEVEL: {
char *e = NULL;
long i;
errno = 0;
i = strtol(optarg, &e, 0);
if (errno != 0 || !e || *e || i < PRIO_MIN || i > PRIO_MAX*2) {
fprintf(stderr, "--our-nice-level parameter invalid.\n");
return -1;
}
our_nice_level = i;
break;
}
case ARG_MAX_REALTIME_PRIORITY: {
char *e = NULL;
unsigned long u;
errno = 0;
u = strtoul(optarg, &e, 0);
if (errno != 0 || !e || *e || u < (unsigned) sched_get_priority_min(sched_policy) || u > (unsigned) sched_get_priority_max(sched_policy)) {
fprintf(stderr, "--max-realtime-priority parameter invalid.\n");
return -1;
}
max_realtime_priority = u;
break;
}
case ARG_MIN_NICE_LEVEL: {
char *e = NULL;
long i;
errno = 0;
i = strtol(optarg, &e, 0);
if (errno != 0 || !e || *e || i < PRIO_MIN || i >= PRIO_MAX) {
fprintf(stderr, "--min-nice-level parameter invalid.\n");
return -1;
}
min_nice_level = i;
break;
}
case ARG_RTTIME_NSEC_MAX: {
char *e = NULL;
errno = 0;
rttime_nsec_max = strtoull(optarg, &e, 0);
if (errno != 0 || !e || *e || rttime_nsec_max <= 0) {
fprintf(stderr, "--rttime-nsec-max parameter invalid.\n");
return -1;
}
break;
}
case ARG_USERS_MAX: {
char *e = NULL;
unsigned long u;
errno = 0;
u = strtoul(optarg, &e, 0);
if (errno != 0 || !e || *e || u <= 0) {
fprintf(stderr, "--users-max parameter invalid.\n");
return -1;
}
users_max = u;
break;
}
case ARG_PROCESSES_PER_USER_MAX: {
char *e = NULL;
unsigned long u;
errno = 0;
u = strtoul(optarg, &e, 0);
if (errno != 0 || !e || *e || u <= 0) {
fprintf(stderr, "--processes-per-user-max parameter invalid.\n");
return -1;
}
processes_per_user_max = u;
break;
}
case ARG_THREADS_PER_USER_MAX: {
char *e = NULL;
unsigned long u;
errno = 0;
u = strtoul(optarg, &e, 0);
if (errno != 0 || !e || *e || u <= 0) {
fprintf(stderr, "--threads-per-user-max parameter invalid.\n");
return -1;
}
threads_per_user_max = u;
break;
}
case ARG_ACTIONS_BURST_SEC: {
char *e = NULL;
unsigned long u;
errno = 0;
u = strtoul(optarg, &e, 0);
if (errno != 0 || !e || *e || u <= 0) {
fprintf(stderr, "--actions-burst-sec parameter invalid.\n");
return -1;
}
actions_burst_sec = u;
break;
}
case ARG_ACTIONS_PER_BURST_MAX: {
char *e = NULL;
unsigned long u;
errno = 0;
u = strtoul(optarg, &e, 0);
if (errno != 0 || !e || *e || u <= 0) {
fprintf(stderr, "--actions-per-burst-max parameter invalid.\n");
return -1;
}
actions_per_burst_max = u;
break;
}
case ARG_NO_DROP_PRIVILEGES:
do_drop_privileges = FALSE;
break;
case ARG_NO_CHROOT:
do_chroot = FALSE;
break;
case ARG_NO_LIMIT_RESOURCES:
do_limit_resources = FALSE;
break;
case ARG_NO_CANARY:
do_canary = FALSE;
break;
case ARG_CANARY_WATCHDOG_MSEC: {
char *e = NULL;
unsigned long u;
errno = 0;
u = strtoul(optarg, &e, 0);
if (errno != 0 || !e || *e || u <= 0) {
fprintf(stderr, "--canary-watchdog-msec parameter invalid.\n");
return -1;
}
canary_watchdog_msec = u;
break;
}
case ARG_CANARY_CHEEP_MSEC: {
char *e = NULL;
unsigned long u;
errno = 0;
u = strtoul(optarg, &e, 0);
if (errno != 0 || !e || *e || u <= 0) {
fprintf(stderr, "--canary-cheep-msec parameter invalid.\n");
return -1;
}
canary_cheep_msec = u;
break;
}
case ARG_CANARY_DEMOTE_ROOT:
canary_demote_root = TRUE;
break;
case ARG_CANARY_DEMOTE_UNKNOWN:
canary_demote_unknown = TRUE;
break;
case ARG_CANARY_REFUSE_SEC: {
char *e = NULL;
unsigned long u;
errno = 0;
u = strtoul(optarg, &e, 0);
if (errno != 0 || !e || *e) {
fprintf(stderr, "--canary-refuse-sec parameter invalid.\n");
return -1;
}
canary_refusal_sec = (uint32_t) u;
break;
}
case ARG_STDERR:
log_stderr = TRUE;
break;
case ARG_INTROSPECT:
fputs(INTROSPECT_XML, stdout);
*ret = 0;
return 0;
case '?':
default:
fprintf(stderr, "Unknown command.\n");
return -1;
}
}
if (optind < argc) {
fprintf(stderr, "Too many arguments.\n");
return -1;
}
if (max_realtime_priority >= our_realtime_priority) {
fprintf(stderr, "The maximum realtime priority (%u) handed out to clients cannot be higher then our own (%u).\n",
max_realtime_priority,
our_realtime_priority);
return -1;
}
if (canary_cheep_msec >= canary_watchdog_msec) {
fprintf(stderr, "The canary watchdog interval must be larger than the cheep interval.\n");
return -1;
}
assert(our_realtime_priority >= (unsigned) sched_get_priority_min(sched_policy));
assert(our_realtime_priority <= (unsigned) sched_get_priority_max(sched_policy));
assert(max_realtime_priority >= (unsigned) sched_get_priority_min(sched_policy));
assert(max_realtime_priority <= (unsigned) sched_get_priority_max(sched_policy));
assert(canary_watchdog_realtime_priority >= (unsigned) sched_get_priority_min(sched_policy));
assert(canary_watchdog_realtime_priority <= (unsigned) sched_get_priority_max(sched_policy));
assert(our_nice_level >= PRIO_MIN);
assert(our_nice_level < PRIO_MAX);
assert(min_nice_level >= PRIO_MIN);
assert(min_nice_level < PRIO_MAX);
return 1;
}
/* Entry point of the daemon: parses the command line, connects to the bus,
 * drops privileges, limits resources, starts the canary and then dispatches
 * D-Bus messages until the connection goes away. Returns 0 after a clean
 * run or after an informational command, 1 on failure. */
int main(int argc, char *argv[]) {
        DBusConnection *bus = NULL;
        unsigned long slack_ns;
        int ret = 1;

        /* Both errors and informational commands end here; ret tells them apart */
        if (parse_command_line(argc, argv, &ret) <= 0)
                goto finish;

        if (getuid() != 0) {
                fprintf(stderr, "Need to be run as root.\n");
                goto finish;
        }

        openlog(get_file_name(argv[0]),
                LOG_NDELAY|LOG_PID|(log_stderr ? LOG_PERROR : 0),
                LOG_DAEMON);

        /* Raise our timer slack, we don't really need to be woken up
         * on time. */
        slack_ns = (((unsigned long) canary_watchdog_msec - (unsigned long) canary_cheep_msec) / 2UL) * 1000000UL;
        if (prctl(PR_SET_TIMERSLACK, slack_ns) < 0)
                syslog(LOG_WARNING, "PRT_SET_TIMERSLACK failed: %s\n", strerror(errno));

        self_drop_realtime(our_nice_level);

        /* The order matters: the bus connection is set up while we still
         * have full privileges, the canary threads are started last. */
        if (setup_dbus(&bus) < 0 ||
            drop_privileges() < 0 ||
            set_resource_limits() < 0 ||
            start_canary() < 0)
                goto finish;

        umask(0777);

        syslog(LOG_DEBUG, "Running.\n");
        sd_notify(0, "STATUS=Running.");

        /* We want to clean up ourselves when the bus goes away */
        dbus_connection_set_exit_on_disconnect(bus, FALSE);

        while (dbus_connection_read_write_dispatch(bus, -1))
                continue;

        ret = 0;

        syslog(LOG_DEBUG, "Exiting cleanly.\n");

finish:

        if (bus) {
                if (dbus_connection_get_is_connected(bus))
                        dbus_connection_close(bus);

                dbus_connection_unref(bus);
        }

        reset_known();

        /* Release all user records still on the list */
        while (users) {
                struct rtkit_user *head = users;

                users = head->next;
                free_user(head);
        }

        stop_canary();

        dbus_shutdown();

        return ret;
}