一. 前言

众所周知,System V IPC进程间通信机制体系中有着多种多样的进程间通信方式,如管道和有名管道,消息队列,信号,共享内存和信号量,套接字。从本文开始我们就逐个剖析进程间通信的机制和底层原理,就从信号开始讲起吧。

二. 信号基本知识

信号是进程处理紧急情况所用的一种方式,它没有特别复杂的数据结构,就是用一个代号一样的数字。Linux 提供了几十种信号,分别代表不同的意义。我们可以通过kill -l命令查看信号。

# kill -l 1) SIGHUP 2) SIGINT 3) SIGQUIT 4) SIGILL 5) SIGTRAP 6) SIGABRT 7) SIGBUS 8) SIGFPE 9) SIGKILL 10) SIGUSR1 11) SIGSEGV 12) SIGUSR2 13) SIGPIPE 14) SIGALRM 15) SIGTERM 16) SIGSTKFLT 17) SIGCHLD 18) SIGCONT 19) SIGSTOP 20) SIGTSTP 21) SIGTTIN 22) SIGTTOU 23) SIGURG 24) SIGXCPU 25) SIGXFSZ 26) SIGVTALRM 27) SIGPROF 28) SIGWINCH 29) SIGIO 30) SIGPWR 31) SIGSYS 34) SIGRTMIN 35) SIGRTMIN 1 36) SIGRTMIN 2 37) SIGRTMIN 3 38) SIGRTMIN 4 39) SIGRTMIN 5 40) SIGRTMIN 6 41) SIGRTMIN 7 42) SIGRTMIN 8 43) SIGRTMIN 9 44) SIGRTMIN 10 45) SIGRTMIN 11 46) SIGRTMIN 12 47) SIGRTMIN 13 48) SIGRTMIN 14 49) SIGRTMIN 15 50) SIGRTMAX-14 51) SIGRTMAX-13 52) SIGRTMAX-12 53) SIGRTMAX-11 54) SIGRTMAX-10 55) SIGRTMAX-9 56) SIGRTMAX-8 57) SIGRTMAX-7 58) SIGRTMAX-6 59) SIGRTMAX-5 60) SIGRTMAX-4 61) SIGRTMAX-3 62) SIGRTMAX-2 63) SIGRTMAX-1 64) SIGRTMAX

信号可以在任何时候发送给某一进程,进程需要为这个信号配置信号处理函数。当某个信号发生的时候,就默认执行这个函数就可以了。通过man 7 signal可以查看各个信号的具体含义和对应的处理方法

Signal Value Action Comment ────────────────────────────────────────────────────────────────────── SIGHUP 1 Term Hangup detected on controlling terminal or death of controlling process SIGINT 2 Term Interrupt from keyboard SIGQUIT 3 Core Quit from keyboard SIGILL 4 Core Illegal Instruction SIGABRT 6 Core Abort signal from abort(3) SIGFPE 8 Core Floating point exception SIGKILL 9 Term Kill signal SIGSEGV 11 Core Invalid memory reference SIGPIPE 13 Term Broken pipe: write to pipe with no readers SIGALRM 14 Term Timer signal from alarm(2) SIGTERM 15 Term Termination signal SIGUSR1 30,10,16 Term User-defined signal 1 SIGUSR2 31,12,17 Term User-defined signal 2 ……

由上表可见,信号的处理通常分为三种:

三. 信号和中断

信号和中断有着诸多相似之处:

但是二者实际上是有很多不同的,其不同的用途导致了运行逻辑的不同,最终在代码实现上体现出了不同的设计特点。其主要区别有:

更多Linux内核视频教程文本资料免费领取后台私信【内核大礼包】自行获取。

进程通信ipc(Systemvipc进程间通信机制和底层原理详解)(1)

四. 注册信号处理函数

有些时候我们希望能够让信号运行一些特殊功能,所以有了自定义的信号处理函数。注册API主要有signal()和sigaction()两个,其中sigaction()比较推荐使用。

typedef void (*sighandler_t)(int); sighandler_t signal(int signum, sighandler_t handler); int sigaction(int signum, const struct sigaction *act, struct sigaction *oldact);

其主要区别在于sigaction()对于信号signum会绑定对应的结构体sigaction而不仅仅是一个处理函数sighandler_t。这样做的好处是可以更精细的控制信号处理,通过不同参数实现不同的效果。例如sa_flags可以设置如

sa_restorer保存的是sa_handler 执行完毕之后,马上要执行的函数,即下一个函数地址的位置。

struct sigaction { __sighandler_t sa_handler; unsigned long sa_flags; __sigrestore_t sa_restorer; sigset_t sa_mask; /* mask last for extensibility */ };

sigaction()也是glibc封装的函数,最终系统调用为rt_sigaction()。该函数首先将用户态的 struct sigaction 结构拷贝为内核态的 k_sigaction,然后调用 do_sigaction()设置对应的信号处理动作。

SYSCALL_DEFINE4(rt_sigaction, int, sig, const struct sigaction __user *, act, struct sigaction __user *, oact, size_t, sigsetsize) { struct k_sigaction new_sa, old_sa; int ret = -EINVAL; ...... if (act) { if (copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa))) return -EFAULT; } ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL); if (!ret && oact) { if (copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa))) return -EFAULT; } out: return ret; }

do_sigaction()会将用户层传来的信号处理函数赋值给当前任务task_struct currrent对应的sighand->action[]数组中sig信号对应的位置,以用于之后调用。

int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { struct task_struct *p = current, *t; struct k_sigaction *k; sigset_t mask; ...... k = &p->sighand->action[sig-1]; spin_lock_irq(&p->sighand->siglock); if (oact) *oact = *k; if (act) { sigdelsetmask(&act->sa.sa_mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); *k = *act; ...... } spin_unlock_irq(&p->sighand->siglock); return 0; }

五. 发送信号

信号发送来源广泛,有可能来自于用户态,有可能来自于硬件,也有可能来自于内核。

不论通过kill或者sigqueue系统调用还是通过tkill或者tgkill发送指定线程的信号,其最终调用的均是do_send_sig_info()函数,其调用链如下所示

kill()->kill_something_info()->kill_pid_info()->group_send_sig_info()->do_send_sig_info() tkill()->do_tkill()->do_send_specific()->do_send_sig_info() tgkill()->do_tkill()->do_send_specific()->do_send_sig_info() rt_sigqueueinfo()->do_rt_sigqueueinfo()->kill_proc_info()->kill_pid_info()->group_send_sig_info()->do_send_sig_info()

do_send_sig_info() 会调用 send_signal(),进而调用 __send_signal()。这里代码比较复杂,主要逻辑如下

int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type) { ...... ret = send_signal(sig, info, p, type); ...... } static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, enum pid_type type) { ...... return __send_signal(sig, info, t, type, from_ancestor_ns); } static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, enum pid_type type, int from_ancestor_ns) { struct sigpending *pending; struct sigqueue *q; ...... pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; ...... if (legacy_queue(pending, sig)) goto ret; ...... /* * Real-time signals must be queued if sent by sigqueue, or * some other real-time mechanism. It is implementation * defined whether kill() does so. We attempt to do so, on * the principle of least surprise, but since kill is not * allowed to fail with EAGAIN when low on memory we just * make sure at least one signal gets delivered and don't * pass on the info struct. */ if (sig < SIGRTMIN) override_rlimit = (is_si_special(info) || info->si_code >= 0); else override_rlimit = 0; q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit); if (q) { list_add_tail(&q->list, &pending->list); switch ((unsigned long) info) { case (unsigned long) SEND_SIG_NOINFO: clear_siginfo(&q->info); q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_USER; q->info.si_pid = task_tgid_nr_ns(current, task_active_pid_ns(t)); q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); break; case (unsigned long) SEND_SIG_PRIV: clear_siginfo(&q->info); q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_KERNEL; q->info.si_pid = 0; q->info.si_uid = 0; break; default: copy_siginfo(&q->info, info); if (from_ancestor_ns) q->info.si_pid = 0; break; } userns_fixup_signal_uid(&q->info, t); } ...... out_set: signalfd_notify(t, sig); sigaddset(&pending->signal, sig); ...... complete_signal(sig, t, type); ret: trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result); return ret; }

legacy_queue()中主要是判断是否为可靠信号,判断的依据是当信号小于 SIGRTMIN也即 32 的时候,如果我们发现这个信号已经在集合里面了,就直接退出。这里之所以前32位信号称之为不可靠信号其实是历史遗留问题,早期UNIX系统只定义了32种信号,而这些经过检验被定义为不可靠信号,主要指的是进程可能对信号做出错误的反应以及信号可能丢失:UNIX系统每次信号处理完需要重新安装信号,因此容易出现各种错误。linux也支持不可靠信号,但是对不可靠信号机制做出了改进:在调用完信号处理函数后,不必重新调用该信号的安装函数(信号安装函数是在可靠机制上是实现的)。因此,linux下的不可靠信号问题主要指的是信号可能丢失。

这里之所以会出现信号丢失,是因为这些信号可能会频繁快速出现。这样信号能够处理多少,和信号处理函数什么时候被调用,信号多大频率被发送,都有关系,而信号处理函数的调用时间也是不确定的,因此这种信号称之为不可靠信号。与之相对的,其他信号称之为可靠信号,支持排队执行。

static inline int legacy_queue(struct sigpending *signals, int sig) { return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } #define SIGRTMIN 32 #define SIGRTMAX _NSIG #define _NSIG 64

对于可靠信号我们通过__sigqueue_alloc()分配sigqueue对象,并挂载在sigpending中的链表上,最终调用complete_signal()找一个线程处理。其主要逻辑为:

static void complete_signal(int sig, struct task_struct *p, enum pid_type type) { struct signal_struct *signal = p->signal; struct task_struct *t; /* * Now find a thread we can wake up to take the signal off the queue. * * If the main thread wants the signal, it gets first crack. * Probably the least surprising to the average bear. */ if (wants_signal(sig, p)) t = p; else if ((type == PIDTYPE_PID) || thread_group_empty(p)) /* * There is just one thread and it does not need to be woken. * It will dequeue unblocked signals before it runs again. */ return; else { /* * Otherwise try to find a suitable thread. */ t = signal->curr_target; while (!wants_signal(sig, t)) { t = next_thread(t); if (t == signal->curr_target) /* * No thread needs to be woken. * Any eligible threads will see * the signal in the queue soon. */ return; } signal->curr_target = t; } /* * Found a killable thread. If the signal will be fatal, * then start taking the whole group down immediately. */ if (sig_fatal(p, sig) && !(signal->flags & SIGNAL_GROUP_EXIT) && !sigismember(&t->real_blocked, sig) && (sig == SIGKILL || !p->ptrace)) { /* * This signal will be fatal to the whole group. */ if (!sig_kernel_coredump(sig)) { /* * Start a group exit and wake everybody up. * This way we don't have other threads * running and doing things after a slower * thread has the fatal signal pending. */ signal->flags = SIGNAL_GROUP_EXIT; signal->group_exit_code = sig; signal->group_stop_count = 0; t = p; do { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); } while_each_thread(p, t); return; } } /* * The signal is already in the shared-pending queue. * Tell the chosen thread to wake up and dequeue it. */ signal_wake_up(t, sig == SIGKILL); return; }

signal_wake_up()函数主要逻辑为:

信号处理的调度和任务调度类似,均是采用标记位的方式进行。当信号来的时候,内核并不直接处理这个信号,而是设置一个标识位 TIF_SIGPENDING来表示已经有信号等待处理。同样等待系统调用结束,或者中断处理结束,从内核态返回用户态的时候再进行信号的处理。

进程/线程的唤醒和任务调度一样最终会调用 try_to_wake_up() ,具体逻辑就不重复分析了。如果 wake_up_state 返回 0,说明进程或者线程已经是 TASK_RUNNING 状态了,如果它在另外一个 CPU 上运行,则调用 kick_process 发送一个处理器间中断,强制那个进程或者线程重新调度,重新调度完毕后,会返回用户态运行。

static inline void signal_wake_up(struct task_struct *t, bool resume) { signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0); } void signal_wake_up_state(struct task_struct *t, unsigned int state) { set_tsk_thread_flag(t, TIF_SIGPENDING); /* * TASK_WAKEKILL also means wake it up in the stopped/traced/killable * case. We don't check t->state here because there is a race with it * executing another processor and just now entering stopped state. * By using wake_up_state, we ensure the process will wake up and * handle its death signal. */ if (!wake_up_state(t, state | TASK_INTERRUPTIBLE)) kick_process(t); }

六. 信号的处理

这里我们以一个从tap 网卡中读取数据的例子来分析信号的处理逻辑。这部分涉及到了系统调用、任务调度、中断等知识,对前面的文章也算是一个回顾。从网卡读取数据会通过系统调用进入内核,之后通过函数调用表找到对应的函数执行。在读的过程中,如果没有数据处理则会调用schedule()函数主动让出CPU进入休眠状态并等待再次唤醒。

tap_do_read()主要逻辑为:

static ssize_t tap_do_read(struct tap_queue *q, struct iov_iter *to, int noblock, struct sk_buff *skb) { ...... while (1) { if (!noblock) prepare_to_wait(sk_sleep(&q->sk), &wait, TASK_INTERRUPTIBLE); /* Read frames from the queue */ skb = skb_array_consume(&q->skb_array); if (skb) break; if (noblock) { ret = -EAGAIN; break; } if (signal_pending(current)) { ret = -ERESTARTSYS; break; } /* Nothing to read, let's sleep */ schedule(); } ...... }

schedule()会在系统调用返回或者中断返回的时刻调用exit_to_usermode_loop(),在任务调度中标记位为_TIF_NEED_RESCHED,而对于信号来说是_TIF_SIGPENDING。

static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) { while (true) { ...... if (cached_flags & _TIF_NEED_RESCHED) schedule(); ...... /* deal with pending signal delivery */ if (cached_flags & _TIF_SIGPENDING) do_signal(regs); ...... if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) break; } }

do_signal()函数会调用 handle_signal(),这里主要存在一个问题使得逻辑变得较为复杂:信号处理函数定义于用户态,而调度过程位于内核态。

/* * Note that 'init' is a special process: it doesn't get signals it doesn't * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. */ void do_signal(struct pt_regs *regs) { struct ksignal ksig; if (get_signal(&ksig)) { /* Whee! Actually deliver the signal. */ handle_signal(&ksig, regs); return; } /* Did we come from a system call? */ if (syscall_get_nr(current, regs) >= 0) { /* Restart the system call - no handlers present */ switch (syscall_get_error(current, regs)) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: regs->ax = regs->orig_ax; regs->ip -= 2; break; case -ERESTART_RESTARTBLOCK: regs->ax = get_nr_restart_syscall(regs); regs->ip -= 2; break; } } /* * If there's no signal to deliver, we just put the saved sigmask * back. */ restore_saved_sigmask(); }

handle_signal()会判断当前是否从系统调用调度而来,当发现错误码为ERESTARTSYS的时候就知道这是从一个没有调用完的系统调用返回的,设置系统错误码为EINTR。由于此处不会直接返回任务调度前记录的用户态状态,而是进入注册好的信号处理函数,因此需要调用setup_rt_frame()构建新的寄存器结构体pt_regs。

static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { bool stepping, failed; ...... /* Are we from a system call? */ if (syscall_get_nr(current, regs) >= 0) { /* If so, check system call restarting.. */ switch (syscall_get_error(current, regs)) { case -ERESTART_RESTARTBLOCK: case -ERESTARTNOHAND: regs->ax = -EINTR; break; case -ERESTARTSYS: if (!(ksig->ka.sa.sa_flags & SA_RESTART)) { regs->ax = -EINTR; break; } /* fallthrough */ case -ERESTARTNOINTR: regs->ax = regs->orig_ax; regs->ip -= 2; break; } } ...... failed = (setup_rt_frame(ksig, regs) < 0); ...... signal

setup_rt_frame()主要调用__setup_rt_frame(),主要逻辑为:

static int setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) { ...... return __setup_rt_frame(ksig->sig, ksig, set, regs); ...... } static int __setup_rt_frame(int sig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; void __user *fp = NULL; int err = 0; frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp); ...... put_user_try { ...... /* Set up to return from userspace. If provided, use a stub already in userspace. */ /* x86-64 should always use SA_RESTORER. */ if (ksig->ka.sa.sa_flags & SA_RESTORER) { put_user_ex(ksig->ka.sa.sa_restorer, &frame->pretcode); } else { /* could use a vstub here */ err |= -EFAULT; } } put_user_catch(err); err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (err) return -EFAULT; /* Set up registers for signal handler */ regs->di = sig; /* In case the signal handler was declared without prototypes */ regs->ax = 0; /* This also works for non SA_SIGINFO handlers because they expect the next argument after the signal number on the stack. */ regs->si = (unsigned long)&frame->info; regs->dx = (unsigned long)&frame->uc; regs->ip = (unsigned long) ksig->ka.sa.sa_handler; regs->sp = (unsigned long)frame; /* * Set up the CS and SS registers to run signal handlers in * 64-bit mode, even if the handler happens to be interrupting * 32-bit or 16-bit code. * * SS is subtle. In 64-bit mode, we don't need any particular * SS descriptor, but we do need SS to be valid. It's possible * that the old SS is entirely bogus -- this can happen if the * signal we're trying to deliver is #GP or #SS caused by a bad * SS value. We also have a compatbility issue here: DOSEMU * relies on the contents of the SS register indicating the * SS value at the time of the signal, even though that code in * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU * avoids relying on sigreturn to restore SS; instead it uses * a trampoline.) So we do our best: if the old SS was valid, * we keep it. Otherwise we replace it. */ regs->cs = __USER_CS; if (unlikely(regs->ss != __USER_DS)) force_valid_ss(regs); return 0; }

sa_restorer在__libc_sigaction()函数中被赋值为restore_rt,实际上调用函数调用__NR_rt_sigreturn()

RESTORE (restore_rt, __NR_rt_sigreturn) #define RESTORE(name, syscall) RESTORE2 (name, syscall) # define RESTORE2(name, syscall) \ asm \ ( \ ".LSTART_" #name ":\n" \ " .type __" #name ",@function\n" \ "__" #name ":\n" \ " movq $" #syscall ", %rax\n" \ " syscall\n" \ ......

__NR_rt_sigreturn()对应的内核函数为sys_rt_sigreturn(),这里会调用restore_sigframe()将pt_regs恢复成原进程的栈帧状态,从而继续执行函数调用后续的内容。

asmlinkage int sys_rt_sigreturn(struct pt_regs *regs) { struct rt_sigframe __user *frame; /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a 64-bit boundary, * then 'sp' should be word aligned here. If it's * not, then the user is trying to mess with us. */ if (regs->ARM_sp & 7) goto badframe; frame = (struct rt_sigframe __user *)regs->ARM_sp; if (!access_ok(frame, sizeof (*frame))) goto badframe; if (restore_sigframe(regs, &frame->sig)) goto badframe; if (restore_altstack(&frame->sig.uc.uc_stack)) goto badframe; return regs->ARM_r0; badframe: force_sig(SIGSEGV, current); return 0; }

总结

信号的发送与处理是一个复杂的过程,这里来总结一下。

进程通信ipc(Systemvipc进程间通信机制和底层原理详解)(2)

,