Switch-Router

Open-vSwitch Inside(1)--vswitchd

Published at 2018-10-23 | Last Update 2018-10-23

Open-vSwitch Inside(1)–vswitchd

vswitchd 守护进程是 OVS 中最重要的进程,它管理本机上的所有虚拟交换机,维护它们的运行状态。它与框架中的其他组件都有独立的通信通道。Figure.1 展示了 OVS 重要组件之间的联系

Figure.1 Open vSwitch Architecture

  • vswitchd 通过 OpenFlow 协议与控制器通信(每个虚拟交换机可能与不同的控制器建立连接)
  • vswitchd 通过 OVSDB 协议与 ovsdb-server 通信,从后者获得实时的配置信息
  • vswitchd 通过 Netlink 通道与 OVS 内核模块进行通信

数据结构

数据结构是程序的基础,下面列举了 vswitchd 中涉及的最重要数据结构。

ofproto

OVS 中用 struct ofproto 表示一个 OpenFlow 交换机基类。之所以说是基类,是因为 OVS 使用包含该结构的其他结构表示一个具体的交换机实例。

/* Base representation of one OpenFlow switch.  Concrete switch types embed
 * this structure (for example as the 'up' member of struct ofproto_dpif).
 * Abridged excerpt: only the fields discussed in this article are shown. */
struct ofproto {
    const struct ofproto_class *ofproto_class; /* Provider this switch belongs to. */
    char *type;                     /* Datapath type. */
    char *name;                     /* Datapath name. */
  
    /* Datapath. */
    struct hmap ports;              /* Contains "struct ofport"s. */
    
    /* Flow tables. */
    struct oftable *tables;         /* Array of user-space flow tables. */
    int n_tables;                   /* Presumably the number of entries in 'tables'. */
    
    /* OpenFlow connections. */
    struct connmgr *connmgr;        /* Controller connections of this switch. */

    /* Other fields are omitted. */
};
ofproto_class

OpenFlow 交换机所属的分类。OVS 被设计为支持多种 OpenFlow 交换机的实现,只要它们都满足 struct ofproto_class 这个抽象接口。这些实现(称为 provider )通过 ofproto_class_register() 进行注册。

当前版本(2.10.0)只注册了 ofproto_dpif_class 这一种 provider ,它创建的交换机实例的结构为 struct ofproto_dpif ,这个结构除了包含 struct ofproto 这个核心元素之外,还包含这种OpenFlow交换机的特有字段。

                      struct ofproto_class(Abstract Interface)
                    +----------------------------------------------+
                    | struct ofproto *(*alloc)(void);              |
                    | int (*construct)(struct ofproto *ofproto)    |
                    | int (*run)(struct ofproto *ofproto)          |
                    | struct ofport *(*port_alloc)(void)           |
                    | int (*port_construct)(struct ofport *ofport) |
                    |                  ......                      |
                    +-----------------------+----------------------+
                                            |
                                            |
                     provider:	+-----------+------------+   
                                |   ofproto_dpif_class   |  
                                +------------------------+ 
                                |         alloc          | 
                                |        construct       | 
                                |          run           |  
                                |        port_alloc      |
                                |      port_construct    |
                                +-----------+------------+ 
                                            |
                                +------------------------+   
                                |  struct ofproto up     | 
                                |        ......          | 
                                +------------------------+ 
                                    struct ofproto_dpif
type

OpenFlow 交换机实例底层 datapath 的类型的名字。当前版本支持 system (默认)和 netdev 两种类型,后者通常用于使用 DPDK ,在用户空间完成 datapath 。

系统支持的 datapath 类型( struct dpif_class )通过 dp_register_provider() 接口进行注册。vswitchd 在首次创建 OpenFlow 交换机时,会进行 datapath 类型的注册。在 Linux 平台上,OVS 默认注册如下两种 datapath(其中 system 由内核模块实现,netdev 在用户空间实现):

/* Built-in datapath interface classes.  dp_initialize() passes each entry
 * to dp_register_provider().  The trailing comments give the datapath type
 * name that each class corresponds to. */
static const struct dpif_class *base_dpif_classes[] = {
    &dpif_netlink_class,       /* "system" */
    &dpif_netdev_class,        /* "netdev" */
};

/* Registers every entry of base_dpif_classes[] through
 * dp_register_provider().
 *
 * The whole body is wrapped in an ovsthread_once start/done pair, which is
 * presumably what limits the registration to a single execution no matter
 * how many times this function is called.  Part of the original body is
 * elided in this excerpt. */
static void
dp_initialize(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    if (ovsthread_once_start(&once)) {
        int i;

        /* Code omitted in this excerpt. */
        for (i = 0; i < ARRAY_SIZE(base_dpif_classes); i++) {
            /* Register one built-in datapath class. */
            dp_register_provider(base_dpif_classes[i]);
        }
        ovsthread_once_done(&once);
    }
}
name

OpenFlow 交换机实例底层 datapath 的名字。在创建的时候指定。

ports

OpenFlow 交换机实例包含的端口集合。vswitchd 使用 struct ofport 表示交换机端口的必要元素, 而端口的数据结构根据OpenFlow交换机的类别而定。当前OVS唯一支持的 ofproto_dpif 交换机对应的端口结构为 struct ofport_dpif

                        struct ofproto_dpif                      struct ofport_dpif
                    +----------------------------+           +-------------------------+ 
                    |   struct ofproto up        |           |  struct ofport up       |
                    +----------------------------+  =====>   +-------------------------+
                    | struct dpif_backer *backer |           |        ......           |
                    | struct sset ports          |           +-------------------------+
                    |          ......            |   
                    +----------------------------+  
tables & n_tables

OpenFlow交换机保存在用户空间的流表数组,OVS 唯一支持的 ofproto_dpif_class 型交换机支持 255 张流表,这些表在创建交换机时被创建和初始化(ofproto_init_tables()).

connmgr

OpenFlow 交换机保存与控制器连接的结构。该交换机建立连接的所有控制器都可以用这个字段找到。

/* Connection manager of one OpenFlow switch: the place where the switch
 * keeps its controller connections.  Abridged excerpt. */
struct connmgr {
    struct ofproto *ofproto;     /* Presumably the switch that owns this connmgr. */
    
    /* OpenFlow connections. */
    struct hmap controllers;     /* All OFCONN_PRIMARY controllers. */
    struct ovs_list all_conns;   /* All controllers.  All modifications are
                                    protected by ofproto_mutex, so that any
                                    traversals from other threads can be made
                                    safe by holding the ofproto_mutex. */
};

该字段在交换机创建时创建,在为交换机设置控制器时(connmgr_set_controllers())被使用。

ofproto_dpif

ofproto_dpif_class 是当前版本 OVS 中唯一的 ofproto provider,它创建的交换机实例结构是 ofproto_dpif (后缀 dpif 的意思我猜是带 datapath interface 的交换机)

/* Switch instance created by the ofproto_dpif_class provider.  It embeds the
 * base struct ofproto and adds the fields specific to this kind of switch.
 * Abridged excerpt. */
struct ofproto_dpif {
   
    struct ofproto up;          /* Embedded base struct ofproto. */
    struct dpif_backer *backer; /* All datapaths of a given type share a single dpif backer instance. */

    struct sset ports;           /* Set of standard port names. */
    /* Other fields are omitted. */
};

ofproto_dpif 结构中有一项 backer 指针,vswitchd 用它表示交换机与 datapath 的接口。由注释可知,所有相同 type 的 datapath 共享一个 dpif backer 。而我们知道,Linux 平台上支持两种 type 的 datapath :“system” 和 “netdev”,所以就有两种类型的 datapath 接口:默认的 dpif_netlink ,以及 DPDK 下使用的 dpif_netdev

                                                  dpif_netlink 
+--------------+                                   +--------+     
|              |                                   |  dpif  |
| ofproto_dpif +----+       +--------------+   +-->+--------+             
| (ofproto)    |    |       |              |   |   |        |
+--------------+    |       |              +---+   +--------+      
                    +------>+  dpif_backer |                  
+--------------+    |       |              |       +--------+
|              |    |       |              +------>+        |
| ofproto_dpif +----+       +--------------+       +--------+
|  (ofproto)   |                                     udpif
+--------------+          
                    "system" datapath
					
                                                  dpif_netdev 
+--------------+                                   +--------+     
|              |                                   |  dpif  |
| ofproto_dpif +----+       +--------------+   +-->+--------+             
| (ofproto)    |    |       |              |   |   |        |
+--------------+    |       |              +---+   +--------+      
                    +------>+  dpif_backer |                  
+--------------+    |       |              |       +--------+
|              |    |       |              +------>+        |
| ofproto_dpif +----+       +--------------+       +--------+
|  (ofproto)   |                                     udpif
+--------------+          
                    "netdev" datapath
dpif_class

dpif_class 是一个抽象接口,它包含一组函数指针的定义。当前版本 OVS 有两个结构实现了这个接口:dpif_netlink_class 和 dpif_netdev_class

bridge

struct ofproto 表示 OpenFlow 交换机,而 struct bridge 其实描述的是一回事,在 vswitchd 中,它们是一一对应的。稍微的不同是,struct ofproto 更贴近于 OVS 内部实现,而 struct bridge 更贴近于用户侧,比如创建/删除 OpenFlow 交换机的命令分别是:

$ ovs-vsctl add-br BRIDGE
$ ovs-vsctl del-br BRIDGE
/* User-facing description of one OpenFlow switch, as created and deleted by
 * "ovs-vsctl add-br" / "ovs-vsctl del-br".  Each bridge corresponds to
 * exactly one struct ofproto.  Abridged excerpt. */
struct bridge {
    char *name;                 /* User-specified arbitrary name. */
    char *type;                 /* Datapath type. */
    const struct ovsrec_bridge *cfg; /* Presumably the bridge's database configuration record. */

    /* OpenFlow switch processing. */
    struct ofproto *ofproto;    /* OpenFlow switch. */

    /* Bridge ports.  One port may aggregate (bond) several ifaces. */
    struct hmap ports;          /* "struct port"s indexed by name. */
    struct hmap ifaces;         /* "struct iface"s indexed by ofp_port. */
    struct hmap iface_by_name;  /* "struct iface"s indexed by name. */
    
    /* Other fields are omitted. */
};

struct bridge 用 ports 表示包含的端口集合,它和 struct ofproto 中 ofport 是对应的,需要注意的是它还有一个 ifaces 集合,一般情况下 struct port 和 struct iface 是一一对应的,但我们可以将多个 struct iface 聚合(bond)为一个 struct port ,此时就是一对多的关系了。

                    +-------------------------------+
                    |            bridge             |
                    +-----+--------+---------+------+
                          |        |         |
                        port 1   port 2    port 3
                       iface 1  iface 2      |
                                          +--+--+
                                          |     |
                                       iface 3 iface 4

netdev

vswitchd 使用 struct ofport 表示交换机端口的核心元素, 所有不同的端口都需要包含它:

/* Core state of one switch port.  Every concrete port structure embeds it
 * (for example struct ofport_dpif).  Abridged excerpt: the "......" line
 * stands for fields left out of this listing. */
struct ofport {
    struct hmap_node hmap_node; /* In struct ofproto's "ports" hmap. */
    struct ofproto *ofproto;    /* The ofproto that contains this port. */
    struct netdev *netdev;      /* Network device that corresponds to this port. */
    ......
    int mtu;
};

ofproto 为该端口所属的交换机,而 netdev 为该端口对应的”网络设备”。在OVS中,每一个 OpenFlow 交换机上的端口都有对应的 struct netdev。

struct netdev 有一套与 struct ofproto 类似的层次结构:抽象接口 struct netdev_class 在最高层,它规定了网络设备必须实现的接口,而 OVS 提供了多种接口的生产者(provider),每种生产者可以创建出不同的 netdev ,这些 netdev 都包含 struct netdev

                                                   struct netdev_class(Abstract Interface)
                                                     +------------------------------+   
                                                     | char* type                   |    
                                                     | void (*run)()                |
                                                     | struct netdev *(*alloc)(void)|
                                                     |          ......              |
                                                     +-----+---------------+--------+
                                                           |               |
<------------------<---------+-<---------------------<---+-+               +-->-+----------------------->+-----------------------+
|                            |                          |                       |                        |                       |
+-----------+------------+   +-----------+----------+  +----------------------+  +----------+---------+  +-----------+---------+ +-----------+---------+  
|   netdev_linux_class   |   |   netdev_tap_class   |  | netdev_internal_class|  |                    |  |                     | |                     | 
+------------------------+   +----------------------+  +----------------------+  +--------------------+  +---------------------+ +---------------------+ 
|      "system"          |   |        "tap"         |  |      "internal"      |  |      "geneve"      |  |      "vxlan"        | |                     |
| netdev_linux_run       |   | netdev_linux_run     |  | netdev_linux_run     |  | netdev_vport_run   |  |  netdev_vport_run   | |                     |
| netdev_linux_alloc     |   | netdev_linux_alloc   |  | netdev_linux_alloc   |  | netdev_vport_alloc |  |  netdev_vport_alloc | |      .........      |
|       ......           |   |        ......        |  |       ......         |  |      ......        |  |       ......        | |                     |
+-----------+------------+   +-----------+----------+  +-----------+----------+  +----------+---------+  +-----------+---------+ +-----------+---------+
            |                            |                         |                        |                        |                       |
            +----------------------------+-------------------------+                        +------------------------+-----------------------+             
                            +------------+-----------+                                                    +----------+---------+
                            |   struct netdev up     |                                                    | struct netdev up   |
                            |   /* other filed */    |                                                    |  /* other filed */ |  
                            +------------------------+                                                    +--------------------+ 
                              struct netdev_linux                                                          struct netdev_vport		

过程

数据结构是程序的静态组成元素,而真正让程序具有生命的是它的执行过程,下面将分析 vswitchd 的部分程序执行过程。

创建OpenFlow交换机

用户可以在终端输入 ovs-vsctl add-br br1 来创建 OpenFlow 交换机。ovsdb-server 进程会收到这条配置,随后 vswitchd 进程会从 ovsdb-server 进程拿到这个配置,最终会调用到 ofproto_create() 接口创建交换机实例。

/* Call Flow */
main()
 |
 |- bridge_run()
    |
    |- bridge_reconfigure()
       |
       |- ofproto_create()

int
ofproto_create(const char *datapath_name, const char *datapath_type)
{
     class = ofproto_class_find__(datapath_type);  // datapath_type = "system"
     
     /* Step1 alloc memory for 'ofproto' */
     ofproto = class->alloc(); 
        |
        |- ofproto_dpif_class->alloc()
                      
     ofproto->ofproto_class = class;
     ofproto->name = xstrdup(datapath_name);  //  "br1"
     ofproto->type = xstrdup(datapath_type); //  "system"
     
     /* Step2 construct the datapath */
     ofproto->ofproto_class->construct(ofproto);
        |
        |- ofproto_dpif_class->construct(ofproto)
            |
            |- struct ofproto_dpif *ofproto = ofproto_dpif_cast(_ofproto)
            |
            |   /* Step2.1 open datapath interface */
            |- open_dpif_backer(ofproto->up.type, &ofproto->backer)   //  初始化 
                |
                |- dpif_create_and_open(backer_name, type, &backer->dpif) // backer_name = "ovs-system" 
                    |
                    |- dpif_create(name, type, dpifp)
                       |
                       |- do_open(name, type, true, dpifp)
                          |
                          |- dpif_netlink_class->open(dpif_netlink_class, name, create, &dpif)
                              |
                              |- dpif_netlink_init()  // vswitchd just call once
                              |- dpif_netlink_dp_transact()
             |
             |  /* Step2.2 Initialize flow table*/
             |- ofproto_init_tables(ofproto_, N_TABLES)
             |- add_internal_flows(ofproto)
}

创建交换机实例时首先进行实例内存的分配(分配 struct ofproto_dpif 大小的内存,但只返回其中内嵌的 struct ofproto);这之后进行 datapath interface 的初始化,对于默认的 “system” 类型的交换机,其实就是初始化与内核模块通信的 Netlink 通道。最后初始化流表。

为OpenFlow交换机添加端口

假设我已经创建了 br1 交换机,现在通过 ovs-vsctl add-port br1 veth1_br 为其添加一个端口 veth1_br (事先通过 ip link add veth1_br type veth peer name veth1 添加)。我们从 vswitchd 的 bridge_add_ports__() 开始分析

/* Call Flow */
main()
 |
 |- bridge_reconfigure()
   |
   |- bridge_add_ports()
   
void
bridge_add_ports__(struct bridge *br, const struct shash *wanted_ports, bool with_requested_port)
   |
   |- iface_create(br, iface_cfg, port_cfg)
      |
      |  /* Opens a network device for 'if_cfg' and configure it */
      |- iface_do_create(br, iface_cfg, &ofp_port, &netdev, &errp)
         |
         |  /* Return the type to pass to netdev_open() when 'ofproto' has a port of type 'port_type' */
         |- type = ofproto_port_open_type(br->ofproto, iface_get_type(iface_cfg, br->cfg))  // return "system"
             |
             |- ofproto_dpif_class->port_open_type(datapath_type, port_type) // datapath_type = port_type = "system"
         |
         |  /* Alloc a "struct netdev_linux" as netdev  */
         |- netdev_open(iface_cfg->name, type, &netdev)    // iface_cfg->name = "veth1_br"    type = "system" 
         |
         |  /* Attempt to add 'netdev' as a port on 'ofproto' */
         |- ofproto_port_add(br->ofproto, netdev, ofp_portp)
            |
            |- ofproto_dpif_class->port_add(ofproto,netdev)
               |
               |   /* Attempts to add 'netdev' as a port on 'dpif' */
               |- dpif_port_add(ofproto_backer->dpif, netdev, &port_no)
                   |
                   |- dpif_netlink_class->port_add(dpif, netdev, &port_no)
                      |
                      |- dpif_netlink_rtnl_port_create_and_add(dpif, netdev, port_nop)
                         |
                         |-  NETLINK msg => Kernel
               |
               |- update_port(ofproto, netdev_name)
                  |
                  |  /* Adds an ofport to ofproto */
                  |- ofport_install(ofproto, netdev, &pp)
                     |
                     |  /* Alloc ofport */
                     |- ofproto_dpif_class->port_alloc()
                     |- hmap_insert(&p->ports, &ofport->hmap_node, hash_ofp_port(ofport->port))
          |
          |- sset_add(&ofproto->ports, devname)

我们可以看到,在此过程中,vswitchd 会先创建一个 netdev 作为端口对应的网络设备,然后通过 NETLINK 通道通知内核模块添加该端口,最后创建 struct ofport (实际会申请 struct ofport_dpif 大小的内存),将其加入 ofproto 中。

主流程
int main()
{
    /* 创建unix socket server,用于ovs-appctl */
    retval = unixctl_server_create(unixctl_path, &unixctl); 
    
    /* 初始化bridge模块, 从ovsdb进程获取交换机配置,初始化其他二层协议 */
    bridge_init();

    /* 守护进程循环 */
    while (!exiting) {
        bridge_run()
          |
          |--bridge_init_ofproto() //  将bridge配置转换为ofproto (运行一次)
              |
              |-- ofproto_init(&iface_hints);
          |--bridge_run__()
              | 
              |--foreach type in datapath types:  // Let each datapath type do the work that it needs to do.
                     ofproto_type_run(type)       // call ofproto_dpif_class->type_run(type)  
              |--for br in all_bridges:           // Let each bridge do the work that it needs to do.
                     ofproto_run(br->ofproto)     // call ofproto_dpif_class->run(p)
          |--  bridge_reconfigure(cfg ? cfg : &null_cfg); //  ovsdb变化时调用,重新进行配置交换机
        
        /* 运行unix socket server,用于ovs-appctl */
        unixctl_server_run(unixctl); 
        
        netdev_run(); 
          |  /* 注册 netdev (运行一次) */
          |-- netdev_initialize();
             /* Performs periodic work needed by all the various kinds of netdevs */
          |-- foreach rc in netdev_classes: 
                 rc->class->run()
                      
        bridge_wait();
          |
          |-- foreach type in datapath types:  // type = "system" or "netdev"
                 ofproto_type_wait(type);    // call ofproto_dpif_class->type_wait(type)
          |-- foreach br in all_bridges
                 ofproto_wait(br->ofproto) // call ofproto_dpif_class->wait(p)
        unixctl_server_wait(unixctl);
        netdev_wait();
          |
          |-- foreach rc in netdev_classes: 
                 rc->class->wait(rc->class)
        /*  
           block until one or more of the events registered with poll_fd_wait() occurs
                 until the minimum duration registered with poll_timer_wait() elapses
                 not at all if poll_immediate_wake() has been called
         */
        poll_block();
    }
}