Friday, September 19, 2008

@Kirk McKusick's FreeBSD Internals course

I had the pleasure of attending Kirk McKusick's FreeBSD internals course, kindly $upported by Google Switzerland, my employer =).


May the source be with you ... `:-)

One interesting discussion I had with Kirk was about the behavior of zombie processes with respect to opened sockets and TCP data lingering: he stated that a process could be in zombie state while pushing the data ashes thru an already opened TCP socket ... something that I couldn't agree with, mainly because this would imply a high wait() "latency" under a packet loss/congestion scenario.

Linux even has a tcp_max_orphans sysctl explicitly there for this (man 7 tcp), anyhow I wanted to be sure about the behavior of Linux and xBSD, so I coded a quick&dirty test, see zocket.c and Makefile below.

Result: as expected, the sockets were orphaned after the process became a zombie, and remained as is (FIN_WAIT1) regardless of the process's path to death =) (tested on Linux, FreeBSD and OpenBSD).

As a side note, it was interesting to find that in Linux you can push a BPF down to filter the very data stream, but in FreeBSD you can only hook at the accept() syscall with another mechanism (man 9 accept_filter). This is somewhat two-sided for Linux: cool that you can arbitrarily filter the data stream, not-so-cool that you can easily create a local DoS with this power :)

make && make test ## should do the magic:

  • zocket.c
    * zocket.c: connect a socket, block its data stream, write data @children:
    * show zombie and socket lifetime afterwards
    * Author: JuanJo: jjo () google com
    * License: GPLv2+

    #include <stdio.h>
    #include <unistd.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <sys/wait.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>
    #include <assert.h>
    #if __linux__
    #include <linux/filter.h>

    /*
     * ERR_IF(cond): terse error check for syscalls. When cond is true,
     * print the stringified condition together with the errno text via
     * perror() and terminate with abort(); otherwise do nothing.
     * Wrapped in do/while(0) so it behaves as a single statement.
     */
    #define ERR_IF(cond) \
        do { \
            if (!(cond)) \
                break; \
            perror(#cond); \
            abort(); \
        } while (0)

    /*
     * block_fromto: stop outgoing data for the connection sp -> dp.
     * With __linux__ defined it attaches a socket filter to sockfd through
     * SO_ATTACH_FILTER; otherwise it formats a pf rule for pfctl and prints
     * it. The visible code always returns 0; a setsockopt failure aborts
     * through ERR_IF.
     *
     * NOTE(review): this listing appears to have lost lines (braces,
     * comment delimiters, #endif) and does not compile as shown; restore
     * them from the original zocket.c before building.
     */
    static int block_fromto(int sockfd, const struct sockaddr_in *sp,
    const struct sockaddr_in *dp)
    #ifdef __linux__
    * Push a BPF into _this_ socket only, interesting enough
    * this seems to be a linux-only feature, BSD has this available
    * only at accept().

    /* one-instruction filter program: unconditionally return the constant 0 */
    struct sock_filter bpf_blockme[]= {
    BPF_STMT(BPF_RET+BPF_K, 0), /* just accept 0 bytes ;) */
    /* first member is the instruction count, second the program itself */
    struct sock_fprog filter = {
    sizeof(bpf_blockme)/sizeof(*bpf_blockme), bpf_blockme,

    ERR_IF(setsockopt(sockfd, SOL_SOCKET, SO_ATTACH_FILTER,
    &filter, sizeof(filter))<0);
    #else /* assuming BSD: block at PF level */
    char cmd[1024];
    /* pre-terminate: snprintf below is given one byte less than the buffer */
    cmd[sizeof cmd-1]=0;
    /*
     * NOTE(review): the ports are converted with htons(); ntohs() looks
     * intended, since main() prints the same sin_port fields with ntohs().
     * Confirm before changing.
     */
    snprintf(cmd, sizeof cmd-1,
    "echo block drop out inet proto tcp "
    "from %s port %d to %s port %d| pfctl -f-",
    /* leaking but effective ... */
    strdup(inet_ntoa(sp->sin_addr)), htons(sp->sin_port),
    strdup(inet_ntoa(dp->sin_addr)), htons(dp->sin_port));
    /*
     * NOTE(review): cmd is only printed here; no call that executes it is
     * visible in this listing. Check the original source for a dropped
     * system(cmd) or similar.
     */
    printf("====== configuring pf:\n+ %s\n", cmd);
    return 0;
    /*
     * main: connect a TCP socket to <ip> <port>, block its outgoing data
     * with block_fromto(), fork children that each write 10 bytes to the
     * socket, then run the same ps/netstat pipeline twice, under the
     * "BEFORE wait()" and "AFTER wait()" banners.
     *
     * NOTE(review): this listing appears to have lost lines (braces,
     * #endif, the usage-error exit, the dest.sin_port assignment, pipe(),
     * the child exit and the wait() calls); it does not compile as shown.
     */
    int main(int argc, const char *argv[]) {
    int sock;
    int pid;
    int syncpipe[2];
    unsigned n;
    char cmd[2048];
    struct sockaddr_in dest,me;

    #ifndef __linux__
    /* need to use /sbin/pfctl for blocking pkt stream if !linux */
    /* expects exactly two arguments: <ip> and <port> */
    if (argc != 3) {
    fprintf(stderr, "ERROR: Usage: %s <ip> <port>\n", argv[0]);
    /* NOTE(review): no assignment to dest.sin_port (from argv[2]) is visible here */
    dest.sin_family = AF_INET;
    ERR_IF( inet_aton(argv[1], &dest.sin_addr) == 0);

    sock = socket(AF_INET, SOCK_STREAM, 0);
    ERR_IF(sock <0);
    ERR_IF(connect (sock, (struct sockaddr *)&dest, sizeof dest) < 0);
    /*
     * Fetch the socket's own address into me; n is set to the buffer size
     * before the call. The return value is not checked.
     * NOTE(review): n is declared unsigned while getsockname() takes a
     * socklen_t pointer - confirm the types match on every target.
     */
    n=sizeof me;
    getsockname(sock, (struct sockaddr *)&me, &n);

    /* unbuffer stdout */
    setbuf(stdout, NULL);
    /* block (output) data stream */
    block_fromto(sock, &me, &dest);
    /* three iterations: n counts down from 3, one fork() per iteration */
    for(n=3;n;n--) {
    switch(pid=fork()) {
    case 0:
    /* child: write 10 bytes into the blocked socket (result unchecked) */
    write(sock, "1234567890", 10);
    /* fork() failed: ERR_IF(1) always reports and aborts */
    case -1: ERR_IF(1);
    /* sync to children death */
    /* NOTE(review): no pipe(syncpipe) call is visible; this is a zero-length read */
    read(syncpipe[0], NULL, 0);
    /* pre-terminate: snprintf below is given one byte less than the buffer */
    cmd[sizeof cmd -1]=0;
    snprintf(cmd, sizeof cmd -1,
    "ps -o pid,ppid,stat,command|egrep [z]ocket;"
    "netstat -tn|egrep '[.:]%d .*[.:]%d'",
    ntohs(me.sin_port), ntohs(dest.sin_port));
    /* this will show the zombies and the socket send-q (as netstat -tn) */
    printf("====== BEFORE wait() ======\n+ %s\n", cmd); system(cmd);
    /* NOTE(review): the wait() call(s) expected between the two reports are not visible */
    /* obviously the zombies are gone, what about the (orphaned) socket ?*/
    printf("====== AFTER wait() ======\n+ %s\n", cmd); system(cmd);
    return 0;

  • Makefile
    ## Makefile for zocket.c
    ## Tested on: Linux 2.6, FreeBSD 6.x(dragonfly), OpenBSD 4.3
    ## NOTE(review): this listing appears to have lost lines (several target
    ## headers and the definition of T) and recipe tabs; restore them from
    ## the original Makefile before use.
    CFLAGS=-Wall -g
    ## Arguments handed to the test binary by test_os.
    ## NOTE(review): zocket expects <ip> <port>; only one value is visible here.
    DEST= 111

    ## No portable (GNU,BSD) way of doing VAR=<output_from_shellcmd>, wrap
    ## them by invoking make again
    ## NOTE(review): the target lines owning the next two recipes are not visible.
    make OS=`uname -s` all_os
    make OS=`uname -s` test_os

    all_os: $(T)

    ## NOTE(review): test_os depends on setup_pf, but no setup_pf target line is
    ## visible; the PF recipes below presumably belong to it - confirm.
    ## FreeBSD: dynload PF module
    -@test -x /sbin/kldload && \
    { /sbin/kldstat | egrep pf.ko || { kldload pf; sleep 1;};}
    ## Enable PF
    -@test -x /sbin/pfctl && \
    sudo /sbin/pfctl -e -f- </dev/null 2>/dev/null;exit 0

    ## Run the binary with core dumps disabled; SUDO is set to sudo only when
    ## OS is not Linux and /usr/bin/sudo is executable. Output is copied to
    ## test.out.$(OS).
    test_os: all_os setup_pf
    @ulimit -c 0; [ $(OS) != Linux -a -x /usr/bin/sudo ] && SUDO=sudo;\
    exec $$SUDO ./$(T) $(DEST) | tee test.out.$(OS)

    ## Build the binary named by T from zocket.c.
    $(T): zocket.c
    $(CC) $(CFLAGS) -o $(@) $(?)

    ## NOTE(review): the target line owning this cleanup recipe is not visible.
    rm -fv zocket.bin.*

    ## Render any file to HTML with c2html.
    %.html: %
    c2html -s $(^)