/*-*- Mode: C; c-basic-offset: 8 -*-*/

/* This is Linux-only. Rock'n'roll! */

#define _GNU_SOURCE

#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

/* Directory below which each instance creates its private mount point */
#define TOP_PATH "/var/run/rtwatch"

/* File the realtime scheduling period (in usec) is read from */
#define PERIOD_USEC_FILE "/proc/sys/kernel/sched_rt_period_us"

#define CPU_LOAD_MAX 50 /* 50% CPU load at max */

/* Value RLIMIT_RTPRIO is set to for the child */
#define RTPRIO_MAX 18

/* Writes u as decimal number followed by a newline to the already
 * existing file fn. Symlinks are not followed. Returns 0 on success,
 * -1 on failure (after printing a diagnostic to stderr). */
static int write_long(const char *fn, unsigned long u) {
        int fd;
        ssize_t k;
        size_t l;
        char t[32];

        snprintf(t, sizeof(t), "%lu\n", u);
        l = strlen(t);

        if ((fd = open(fn, O_WRONLY|O_NOFOLLOW|O_NOCTTY)) < 0) {
                fprintf(stderr, "open(\"%s\", O_WRONLY): %s\n", fn, strerror(errno));
                return -1;
        }

        if ((k = write(fd, t, l)) != (ssize_t) l) {
                fprintf(stderr, "write(): %s\n", k < 0 ? strerror(errno) : "short write");
                close(fd);
                return -1;
        }

        if (close(fd) < 0) {
                fprintf(stderr, "close(): %s\n", strerror(errno));
                return -1;
        }

        return 0;
}

/* Reads a single newline-terminated unsigned decimal number from the
 * file fn and stores it in *u. Symlinks are not followed. Returns 0
 * on success, -1 on failure (after printing a diagnostic to
 * stderr). */
static int read_long(const char *fn, unsigned long *u) {
        int fd;
        ssize_t k;
        char t[32];

        if ((fd = open(fn, O_RDONLY|O_NOFOLLOW|O_NOCTTY)) < 0) {
                fprintf(stderr, "open(\"%s\", O_RDONLY): %s\n", fn, strerror(errno));
                return -1;
        }

        if ((k = read(fd, t, sizeof(t)-1)) < 0) {
                fprintf(stderr, "read(): %s\n", strerror(errno));
                close(fd);
                return -1;
        }

        if (close(fd) < 0) {
                fprintf(stderr, "close(): %s\n", strerror(errno));
                return -1;
        }

        t[k] = 0;

        /* An empty read (EOF) or data not terminated by a newline is
         * treated as a parse failure, too */
        if (k <= 0 || t[k-1] != '\n' || sscanf(t, "%lu", u) != 1) {
                fprintf(stderr, "Parse failure in %s\n", fn);
                return -1;
        }

        return 0;
}

/* Runs in the forked child: unblocks all signals, configures the
 * cgroup at cgroup_path with runtime_usec of realtime runtime, moves
 * itself into it, sets RLIMIT_RTPRIO to RTPRIO_MAX, switches all user
 * IDs to the real user ID and finally executes argv. Only returns on
 * failure, in which case the return value is the exit code to use. */
static int child(unsigned long runtime_usec, const char *cgroup_path, char *argv[]) {
        sigset_t ss;
        int ret = 1;
        char path[PATH_MAX];
        struct rlimit rl;
        uid_t uid;

        /* The parent blocked a couple of signals, undo that for the
         * program we are going to execute */
        sigemptyset(&ss);
        if (sigprocmask(SIG_SETMASK, &ss, NULL) < 0) {
                fprintf(stderr, "sigprocmask(): %s\n", strerror(errno));
                goto finish;
        }

        snprintf(path, sizeof(path), "%s/cpu.rt_runtime_us", cgroup_path);
        if (write_long(path, runtime_usec) < 0)
                goto finish;

        snprintf(path, sizeof(path), "%s/tasks", cgroup_path);
        if (write_long(path, (unsigned long) getpid()) < 0)
                goto finish;

        memset(&rl, 0, sizeof(rl));
        rl.rlim_cur = rl.rlim_max = RTPRIO_MAX;
        if (setrlimit(RLIMIT_RTPRIO, &rl) < 0) {
                fprintf(stderr, "RLIMIT_RTPRIO: %s\n", strerror(errno));
                goto finish;
        }

        /* Drop privileges: real, effective and saved UID all become
         * the real UID */
        uid = getuid();
        if (setresuid(uid, uid, uid) < 0) {
                fprintf(stderr, "setresuid(): %s\n", strerror(errno));
                goto finish;
        }

        execvp(argv[0], argv);
        fprintf(stderr, "execvp(): %s\n", strerror(errno));

finish:
        return ret;
}

/* Verifies that path is a real directory (not a symlink to one) with
 * access mode 0700. Returns 0 if so, -1 otherwise (after printing a
 * diagnostic to stderr). */
static int check_dir(const char *path) {
        struct stat st;

        if (lstat(path, &st) < 0) {
                fprintf(stderr, "lstat(\"%s\", ...): %s\n", path, strerror(errno));
                return -1;
        }

        if (!S_ISDIR(st.st_mode) || (st.st_mode & 0777) != 0700) {
                fprintf(stderr, "%s is not a directory or has improper access modes.\n", path);
                return -1;
        }

        return 0;
}

/* Waits for the signals in ss (which the caller must have blocked)
 * until the child pid terminates. If timeout is non-zero each wait is
 * limited to 2s, otherwise we wait indefinitely.
 *
 * Returns 0 if the child terminated (*ret is set to its exit status
 * if it exited, *ret_signal to the signal number if it was killed by
 * a signal), 1 if the wait timed out, and -1 on error or if a signal
 * other than SIGCHLD was caught. */
static int wait_for_kid(pid_t pid, int timeout, sigset_t *ss, int *ret, int *ret_signal) {
        static const struct timespec ts = {
                .tv_sec = 2,
                .tv_nsec = 0,
        };

        siginfo_t info;

        for (;;) {
                pid_t cpid;
                int status;

                memset(&info, 0, sizeof(info));

                if (sigtimedwait(ss, &info, timeout ? &ts : NULL) < 0) {

                        /* On Linux the call may fail with EINTR, for
                         * example after we were stopped and resumed
                         * again. That's not an error, just retry. */
                        if (errno == EINTR)
                                continue;

                        if (errno == EAGAIN)
                                return 1;

                        fprintf(stderr, "sigtimedwait(): %s\n", strerror(errno));
                        return -1;
                }

                if (info.si_signo != SIGCHLD) {
                        fprintf(stderr, "Caught signal %i.\n", info.si_signo);
                        return -1;
                }

                if ((cpid = waitpid(-1, &status, WNOHANG)) < 0) {
                        fprintf(stderr, "waitpid(%lu, ...): %s\n", (unsigned long) pid, strerror(errno));
                        return -1;
                }

                /* Not our child, or nothing to reap yet: go on waiting */
                if (cpid != pid)
                        continue;

                if (WIFEXITED(status)) {
                        *ret = WEXITSTATUS(status);
                        return 0;
                }

                if (WIFSIGNALED(status)) {
                        fprintf(stderr, "Warning: child process terminated with signal %i.\n", WTERMSIG(status));
                        *ret_signal = WTERMSIG(status);
                        return 0;
                }

                fprintf(stderr, "Hmm, waitpid() returned and we don't know why.\n");
                return -1;
        }
}

/* Terminates the child pid: sends SIGTERM first and, if the child did
 * not go away within the wait_for_kid() timeout, SIGKILL. ss, ret and
 * ret_signal are passed on to wait_for_kid(). Returns 0 if the child
 * is gone, -1 on failure or if it did not react to SIGKILL either. */
static int safe_kill(pid_t pid, sigset_t *ss, int *ret, int *ret_signal) {
        int r;

        fprintf(stderr, "Killing child %lu with signal %i\n", (unsigned long) pid, SIGTERM);

        if (kill(pid, SIGTERM) < 0) {
                if (errno == ESRCH)
                        return 0;

                fprintf(stderr, "kill(): %s\n", strerror(errno));
                return -1;
        }

        /* r > 0 means timeout, in which case we escalate */
        if ((r = wait_for_kid(pid, 1, ss, ret, ret_signal)) <= 0)
                return r;

        fprintf(stderr, "Killing child %lu with signal %i\n", (unsigned long) pid, SIGKILL);

        if (kill(pid, SIGKILL) < 0) {
                if (errno == ESRCH)
                        return 0;

                fprintf(stderr, "kill(): %s\n", strerror(errno));
                return -1;
        }

        if ((r = wait_for_kid(pid, 1, ss, ret, ret_signal)) <= 0)
                return r;

        fprintf(stderr, "Client didn't react.\n");
        return -1;
}

/* Prints the usage text to f */
static void help(const char *argv0, FILE *f) {
        fprintf(f,
                "%s <runtime> <program> [arguments...]\n\n"
                "\truntime: available runtime in percent.\n"
                "\tprogram: the program to run\n\n"
                "Example:\n"
                "\t%s 1.5 foobar\n",
                argv0, argv0);
}

/* Usage: rtwatch <runtime> <program> [arguments...]
 *
 * Mounts a private "cpu" cgroup hierarchy, moves <runtime> percent of
 * the realtime period from the root group into a new cgroup, runs
 * <program> inside of it with dropped privileges, waits for it and
 * then undoes everything again. Returns the exit status of the
 * program, or 1 on failure. If the program was terminated by a signal
 * the same signal is raised for ourselves. */
int main(int argc, char *argv[]) {
        char id[64], mnt_path[PATH_MAX], cgroup_path[PATH_MAX], root_runtime_us_path[PATH_MAX];
        const char *argv0;
        char *end;
        int ret = 1;
        pid_t pid;
        sigset_t ss;
        double cpu_percentage;
        unsigned long runtime_usec, period_usec, u;
        int ret_signal = -1;

        /* argv[0] may legally be missing, don't dereference it then */
        if (argc < 1)
                argv0 = "rtwatch";
        else if ((argv0 = strrchr(argv[0], '/')))
                argv0++;
        else
                argv0 = argv[0];

        if (argc == 1) {
                help(argv0, stdout);
                ret = 0;
                goto finish;
        }

        if (argc < 3) {
                help(argv0, stderr);
                goto finish;
        }

        /* Reject trailing garbage and out-of-range values. The range
         * check is phrased so that NaN fails it, too. */
        errno = 0;
        cpu_percentage = strtod(argv[1], &end);
        if (end == argv[1] || *end != 0 || errno == ERANGE ||
            !(cpu_percentage > 0 && cpu_percentage <= 100)) {
                fprintf(stderr, "Failed to parse runtime argument: %s\n", argv[1]);
                goto finish;
        }

        if (cpu_percentage > CPU_LOAD_MAX) {
                fprintf(stderr, "More than the limit of %i%% CPU requested.\n", CPU_LOAD_MAX);
                goto finish;
        }

        if (read_long(PERIOD_USEC_FILE, &period_usec) < 0)
                goto finish;

        runtime_usec = (unsigned long) ((double) period_usec * cpu_percentage / 100);

        /* This also catches a period of 0, hence the division below
         * is safe */
        if (runtime_usec <= 0) {
                fprintf(stderr, "Percentage too small.\n");
                goto finish;
        }

        fprintf(stderr, "period=%0.2fms runtime=%0.2fms max_cpu_load=%0.1f%%\n",
                (double) period_usec / 1000,
                (double) runtime_usec / 1000,
                (double) runtime_usec * 100 / (double) period_usec);

        if (geteuid() != 0) {
                fprintf(stderr, "%s needs to be run as root.\n", argv0);
                goto finish;
        }

        /* Block the signals we want to wait for with sigwait() */
        sigemptyset(&ss);
        sigaddset(&ss, SIGTERM);
        sigaddset(&ss, SIGINT);
        sigaddset(&ss, SIGQUIT);
        sigaddset(&ss, SIGCHLD);

        if (sigprocmask(SIG_SETMASK, &ss, NULL) < 0) {
                fprintf(stderr, "sigprocmask(): %s\n", strerror(errno));
                goto finish;
        }

        /* Make sure our signals are not set to SIG_IGN */
        signal(SIGTERM, SIG_DFL);
        signal(SIGINT, SIG_DFL);
        signal(SIGQUIT, SIG_DFL);
        signal(SIGCHLD, SIG_DFL);

        /* Create top directory for the hierarchies */
        if (mkdir(TOP_PATH, 0700) < 0 && errno != EEXIST) {
                fprintf(stderr, "mkdir(\"%s\"): %s\n", TOP_PATH, strerror(errno));
                goto finish;
        }

        /* The result of chmod() is not checked here since check_dir()
         * verifies the resulting mode right afterwards */
        chmod(TOP_PATH, 0700);
        if (check_dir(TOP_PATH) < 0)
                goto finish_remove_top_path;

        /* Create mount directory for the hierarchy */
        snprintf(mnt_path, sizeof(mnt_path), TOP_PATH "/%lu", (unsigned long) getpid());
        if (mkdir(mnt_path, 0700) < 0) {
                fprintf(stderr, "mkdir(\"%s\"): %s\n", mnt_path, strerror(errno));
                goto finish_remove_top_path;
        }

        chmod(mnt_path, 0700);
        if (check_dir(mnt_path) < 0)
                goto finish_remove_mnt_path;

        /* Create hierarchy */
        snprintf(id, sizeof(id), "rtwatch-%lu", (unsigned long) getpid());
        if (mount(id, mnt_path, "cgroup", 0, "cpu") < 0) {
                fprintf(stderr, "mount(\"%s\", \"%s\", \"cgroup\", ...): %s\n", id, mnt_path, strerror(errno));
                goto finish_remove_mnt_path;
        }

        /* Decrease root runtime */
        snprintf(root_runtime_us_path, sizeof(root_runtime_us_path), "%s/cpu.rt_runtime_us", mnt_path);
        if (read_long(root_runtime_us_path, &u) < 0)
                goto finish_umount;

        if (u < (period_usec/20 + runtime_usec)) {
                fprintf(stderr, "Refusing to limit root runtime to less than 5%%\n");
                goto finish_umount;
        }

        u -= runtime_usec;
        if (write_long(root_runtime_us_path, u) < 0)
                goto finish_umount;

        /* Create cgroup */
        snprintf(cgroup_path, sizeof(cgroup_path), "%s/rtwatch-%lu", mnt_path, (unsigned long) getpid());
        if (mkdir(cgroup_path, 0700) < 0) {
                fprintf(stderr, "mkdir(\"%s\"): %s\n", cgroup_path, strerror(errno));
                goto finish_increment_root_runtime;
        }

        if ((pid = fork()) < 0) {
                fprintf(stderr, "fork(): %s\n", strerror(errno));
                goto finish_remove_cgroup_path;
        } else if (!pid)
                _exit(child(runtime_usec, cgroup_path, argv + 2));

        if (wait_for_kid(pid, 0, &ss, &ret, &ret_signal) < 0)
                goto finish_kill;

        goto finish_remove_cgroup_path;

finish_kill:

        /* We'll die shortly. Then, our child will be reparented to
         * init which then will care for reaping it. */
        if (safe_kill(pid, &ss, &ret, &ret_signal) < 0)
                ret = 1;

finish_remove_cgroup_path:

        if (rmdir(cgroup_path) < 0) {
                fprintf(stderr, "Warning: rmdir(\"%s\"): %s\n", cgroup_path, strerror(errno));
                ret = 1;
        }

finish_increment_root_runtime:

        /* Hand the runtime back to the root group, on a best-effort
         * basis, and never more than one full period */
        if (read_long(root_runtime_us_path, &u) >= 0) {
                u += runtime_usec;
                if (u > period_usec)
                        u = period_usec;

                write_long(root_runtime_us_path, u);
        }

finish_umount:

        if (umount(mnt_path) < 0) {
                fprintf(stderr, "Warning: umount(\"%s\"): %s\n", mnt_path, strerror(errno));
                ret = 1;
        }

finish_remove_mnt_path:

        if (rmdir(mnt_path) < 0) {
                fprintf(stderr, "Warning: rmdir(\"%s\"): %s\n", mnt_path, strerror(errno));
                ret = 1;
        }

finish_remove_top_path:

        /* If some other instance of rtwatch is still running the
         * following call will fail, but that's fine. */
        if (rmdir(TOP_PATH) < 0 && errno != EBUSY && errno != ENOTEMPTY) {
                fprintf(stderr, "Warning: rmdir(\"%s\"): %s\n", TOP_PATH, strerror(errno));
                ret = 1;
        }

finish:

        /* If the child died from a signal, die the same way */
        if (ret_signal > 0) {
                sigemptyset(&ss);
                if (sigprocmask(SIG_SETMASK, &ss, NULL) < 0)
                        fprintf(stderr, "Warning: sigprocmask(): %s\n", strerror(errno));

                raise(ret_signal);
        }

        return ret;
}