Build own Kubernetes - Pods network in code

jonatan5524

Jonatan Ezron

Posted on November 5, 2022

Build own Kubernetes - Pods network in code

In the previous articles, we created the pod's network using network devices and iptables rules. In this article and the next, we are going to establish this in code.
Keep in mind that we are going to add the network functionality to the kube-agent for an easier approach. In Kubernetes, pod networking is handled by the CNI plugin, while kube-proxy is responsible for Service routing.


First, we want to create a docker network when creating a node, so in pkg/node/node.go we create a new function:

// createNetwork makes sure the Docker network that nodes attach to exists.
//
// It first inspects NODE_DOCKER_NETWORK_NAME; when the inspect succeeds the
// network is already present and nothing else is done. On any inspect error it
// tries to create the network with the "default" IPAM driver and NODE_CIDR as
// its subnet, and returns the creation error, if any.
func createNetwork(cli *client.Client, ctx context.Context) error {
    if _, err := cli.NetworkInspect(ctx, NODE_DOCKER_NETWORK_NAME, types.NetworkInspectOptions{}); err == nil {
        return nil
    }

    newNetwork := types.NetworkCreate{IPAM: &network.IPAM{
        Driver: "default",
        Config: []network.IPAMConfig{{
            Subnet: NODE_CIDR,
        }},
    }}

    // Use the caller's ctx rather than context.Background() so that
    // cancellation and deadlines also apply to the create call.
    _, err := cli.NetworkCreate(ctx, NODE_DOCKER_NETWORK_NAME, newNetwork)

    return err
}
Enter fullscreen mode Exit fullscreen mode

The above function creates a network using the provided name and node CIDR, both of which are constants defined in pkg/node/constraints.go:

package node

import "github.com/jonatan5524/own-kubernetes/pkg/agent/api"

// Settings used when creating a node container and the Docker network it
// is attached to.
const (
    // NOTE(review): presumably the name (or name prefix) given to node
    // containers; its use is not visible here.
    NODE_NAME                = "node"
    // NOTE(review): presumably the Docker image nodes are created from;
    // its use is not visible here.
    NODE_IMAGE               = "own-kube-node"
    // The agent API port with a "/tcp" suffix.
    NODE_PORT                = api.PORT + "/tcp"
    // Host IP used in the node's port binding (see NewNode).
    NODE_PORT_HOST_IP        = "0.0.0.0"
    // Passed as the container's Resources.Memory in NewNode.
    MEMORY_LIMIT             = 2.9e+9 // 2900MB
    // Passed as the container's Resources.CPUShares in NewNode.
    // NOTE(review): CPUShares is a relative weight in Docker, not a CPU
    // count; confirm this is the intended limit.
    CPU_LIMIT                = 2
    // NOTE(review): presumably the host port range reserved for nodes; not
    // referenced by the code shown alongside these constants.
    NODE_HOST_MIN_PORT       = 10250
    NODE_HOST_MAX_PORT       = 10300
    // Name of the Docker network created by createNetwork and used as the
    // node's NetworkMode.
    NODE_DOCKER_NETWORK_NAME = "kube-net"
    // Subnet assigned to that Docker network.
    NODE_CIDR                = "172.18.0.0/24"
)
Enter fullscreen mode Exit fullscreen mode

Now we call the network creation at the start of the NewNode function, and reference the network in hostConfig, which is passed to ContainerCreate:

// NewNode makes sure the node Docker network exists and then creates (but
// does not start) a node container attached to it. It returns a Node holding
// the container id, or the first error encountered.
//
// The middle of the function is elided in this listing; config and id are
// defined in the omitted part.
func NewNode(cli *client.Client, ctx context.Context) (*Node, error) {
    // The network must exist before a container can reference it in NetworkMode.
    err := createNetwork(cli, ctx)
    if err != nil {
        return nil, err
    }

.
.
.

    hostConfig := &container.HostConfig{
        PortBindings: nat.PortMap{
            nat.Port(fmt.Sprintf("%s/tcp", api.PORT)): []nat.PortBinding{
                {
                    HostIP:   NODE_PORT_HOST_IP,
                    // NOTE(review): "0" presumably lets Docker pick a free host port; confirm.
                    HostPort: "0",
                },
            },
        },
        // Attach the node to the network created by createNetwork.
        NetworkMode: NODE_DOCKER_NETWORK_NAME,
        Resources: container.Resources{
            Memory:    MEMORY_LIMIT,
            CPUShares: CPU_LIMIT,
        },
        // NOTE(review): presumably required so the agent can manage network
        // devices inside the node; confirm.
        Privileged: true,
    }

    _, err = cli.ContainerCreate(ctx, config, hostConfig, &network.NetworkingConfig{}, nil, id)
    if err != nil {
        return nil, err
    }

    return &Node{Id: id}, nil
}
Enter fullscreen mode Exit fullscreen mode

Now we create in pkg/util.go an ExecuteCommand function for all the commands we are going to execute in the node:

// ExecuteCommand launches command through "bash -c" and returns as soon as
// the process has been started; it does not wait for the command to finish,
// so it can also be used to launch long-running processes.
//
// The child's stdout and stderr are forwarded to this process's stdout and
// stderr. An error is returned only when the process could not be started.
// A background goroutine reaps the child and logs a failing exit, so finished
// commands do not linger as zombies and failures are not silent.
//
// NOTE(review): because the call does not block, callers that issue dependent
// commands back-to-back get no ordering guarantee from this function.
func ExecuteCommand(command string) error {
    cmd := exec.Command("bash", "-c", command)
    cmd.Stdout = os.Stdout
    cmd.Stderr = os.Stderr

    if err := cmd.Start(); err != nil {
        return err
    }

    // Log the command and its pid. cmd.Stdout is a writer, not text, so
    // formatting it with %s only prints the *os.File internals.
    log.Printf("started %q (pid %d)", command, cmd.Process.Pid)

    go func() {
        if err := cmd.Wait(); err != nil {
            log.Printf("%q exited with error: %v", command, err)
        }
    }()

    return nil
}
Enter fullscreen mode Exit fullscreen mode

Now for the pod's network, we add in pkg/pod/service.go in NewPodAndRun a call for a new function to connect to the network:

// NewPodAndRun creates a pod from imageRegistry with the given name, starts
// it, and connects the running pod to the pod network. It returns the pod id,
// or an empty string and the first error encountered.
func NewPodAndRun(imageRegistry string, name string) (string, error) {
    created, err := NewPod(imageRegistry, name)
    if err != nil {
        return "", err
    }

    log.Printf("pod created: %s\n", created.Id)
    log.Printf("starting pod\n")

    running, err := created.Run()
    if err != nil {
        return "", err
    }

    log.Printf("setting up pod network\n")

    // The network namespace to join is identified by the pid of the pod's task.
    id := created.Id
    taskPid := (*running.Task).Pid()
    err = connectToNetwork(id, taskPid)
    if err != nil {
        return "", err
    }

    return created.Id, nil
}
Enter fullscreen mode Exit fullscreen mode

The connectToNetwork function creates a bridge and a VXLAN if necessary, finds a new available IP address, and creates the veth pair for the connection:

// connectToNetwork wires the pod whose task runs as pid into the node's pod
// network.
//
// It derives the node's pod CIDR, creates the bridge and the VXLAN device if
// they do not exist yet, picks a free IP address in the pod CIDR, and creates
// a veth pair whose container end is moved into the pod's network namespace.
// The first error from any step is returned.
func connectToNetwork(podId string, pid uint32) error {
    // Linux interface names hold at most 15 characters, so after the
    // "veth-"/"ceth-" prefix there is room for 10 characters of the pod id.
    // Only truncate when the id is actually longer, so short ids do not panic.
    const maxInterfaceNameLen = 15

    netId := podId
    if maxIdLen := maxInterfaceNameLen - len("veth-"); len(netId) > maxIdLen {
        netId = netId[:maxIdLen]
    }

    podCIDR, err := generateNewNodePodCIDR()
    if err != nil {
        return err
    }

    // podCIDR: 10.0.2.0/24 -> bridgeIP: 10.0.2.1/24
    bridgeIP := pkg.ReplaceAtIndex(podCIDR, '1', len(podCIDR)-4)

    if !net.IsDeviceExists(BRIDGE_NAME) {
        if err := net.CreateBridge(BRIDGE_NAME, bridgeIP); err != nil {
            return err
        }
    }

    if !net.IsDeviceExists(VXLAN_NAME) {
        if err := net.CreateVXLAN(VXLAN_NAME, NODE_LOCAL_NETWORK_INTERFACE, BRIDGE_NAME); err != nil {
            return err
        }
    }

    podIP, err := net.GetNextAvailableIPAddr(podCIDR)
    if err != nil {
        return err
    }

    // Re-attach the CIDR's "/NN" suffix to the bare pod IP.
    podAddr := podIP + podCIDR[len(podCIDR)-3:]

    return net.CreateVethPairNamespaces(
        fmt.Sprintf("veth-%s", netId),
        fmt.Sprintf("ceth-%s", netId),
        BRIDGE_NAME,
        int(pid),
        podAddr,
        bridgeIP,
    )
}
Enter fullscreen mode Exit fullscreen mode

We generate the pod CIDR for the current node with the generateNewNodePodCIDR function: it gets the node's IP address and replaces the x in the pod CIDR template 10.0.x.0/24 with the last byte of that address. For example, if our node IP address is 172.18.0.2, the pod CIDR in this node will be 10.0.2.0/24.

// generateNewNodePodCIDR builds this node's pod CIDR by substituting the last
// octet of the node's own IP address for the "x" placeholder in POD_CIDR.
//
// The address is read from NODE_LOCAL_NETWORK_INTERFACE. The whole last octet
// is used, so multi-digit octets (e.g. 172.18.0.12) and addresses with or
// without a "/prefix" suffix are handled. An error is returned when the
// address cannot be obtained or has no dotted last octet.
func generateNewNodePodCIDR() (string, error) {
    localIPAddr, err := net.GetLocalIPAddr(NODE_LOCAL_NETWORK_INTERFACE)
    if err != nil {
        return "", err
    }

    // Drop a trailing "/prefix" if the address is in CIDR notation.
    addr := localIPAddr
    if slash := strings.IndexByte(addr, '/'); slash >= 0 {
        addr = addr[:slash]
    }

    lastDot := strings.LastIndexByte(addr, '.')
    if lastDot < 0 || lastDot == len(addr)-1 {
        return "", fmt.Errorf("unexpected node address format: %s", localIPAddr)
    }

    // localIPAddr: 172.18.0.2/24 -> podCIDR: 10.0.2.0/24
    return strings.ReplaceAll(POD_CIDR, "x", addr[lastDot+1:]), nil
}
Enter fullscreen mode Exit fullscreen mode

The constants used above (BRIDGE_NAME, VXLAN_NAME, NODE_LOCAL_NETWORK_INTERFACE and POD_CIDR) are defined in pkg/pod/constraints.go:

package node

import "github.com/jonatan5524/own-kubernetes/pkg/agent/api"

// NOTE(review): this listing is declared as package node and repeats the node
// constants; the pod constants referenced by the pod code (BRIDGE_NAME,
// VXLAN_NAME, NODE_LOCAL_NETWORK_INTERFACE, POD_CIDR) do not appear in it.
// Confirm against the actual pkg/pod/constraints.go.
const (
    NODE_NAME                = "node"
    NODE_IMAGE               = "own-kube-node"
    // The agent API port with a "/tcp" suffix.
    NODE_PORT                = api.PORT + "/tcp"
    NODE_PORT_HOST_IP        = "0.0.0.0"
    MEMORY_LIMIT             = 2.9e+9 // 2900MB
    CPU_LIMIT                = 2
    NODE_HOST_MIN_PORT       = 10250
    NODE_HOST_MAX_PORT       = 10300
    NODE_DOCKER_NETWORK_NAME = "kube-net"
    NODE_CIDR                = "172.18.0.0/24"
)
Enter fullscreen mode Exit fullscreen mode

All of our network-related functions are in pkg/net/net.go; let's go through those functions.
First, our create functions for veth, bridge, and vxlan are fairly simple: we just execute the commands from the previous articles:

// CreateBridge adds a bridge device called name, assigns ipAddr to it and
// brings it up. The commands run in that order and the first error is returned.
func CreateBridge(name string, ipAddr string) error {
    steps := []string{
        fmt.Sprintf("ip link add %s type bridge", name),
        fmt.Sprintf("ip addr add %s dev %s", ipAddr, name),
        fmt.Sprintf("ip link set %s up", name),
    }

    for _, step := range steps {
        if err := pkg.ExecuteCommand(step); err != nil {
            return err
        }
    }

    return nil
}

// CreateVethPairNamespaces creates the veth pair name/pair, moves pair into
// the network namespace of process namespacePID, and connects name to bridge.
//
// Inside the namespace, pair is brought up, given ipAddr, and a default route
// via the bridge address is added. bridgeIpAddr may be a bare address or CIDR
// notation ("10.0.2.1/24"); only the address part is used for the route,
// because "ip route ... via" expects an address without a prefix length.
// Commands run in order and the first error is returned.
func CreateVethPairNamespaces(name string, pair string, bridge string, namespacePID int, ipAddr string, bridgeIpAddr string) error {
    gateway := strings.Split(bridgeIpAddr, "/")[0]
    netns := fmt.Sprintf("/proc/%d/ns/net", namespacePID)

    commands := []string{
        fmt.Sprintf("ip link add %s type veth peer name %s", name, pair),
        fmt.Sprintf("ip link set %s up", name),
        fmt.Sprintf("ip link set %s netns %s", pair, netns),
        fmt.Sprintf("nsenter --net=%s ip link set %s up", netns, pair),
        fmt.Sprintf("nsenter --net=%s ip addr add %s dev %s", netns, ipAddr, pair),
        fmt.Sprintf("ip link set %s master %s", name, bridge),
        fmt.Sprintf("nsenter --net=%s /usr/sbin/ip route add default via %s", netns, gateway),
    }

    for _, command := range commands {
        if err := pkg.ExecuteCommand(command); err != nil {
            return err
        }
    }

    return nil
}

// CreateVXLAN adds a VXLAN device called name on top of nodeInterface (VXLAN
// id 10, multicast group 239.1.1.1), enslaves it to bridgeName and brings it
// up. The commands run in that order and the first error is returned.
func CreateVXLAN(name string, nodeInterface string, bridgeName string) error {
    const (
        vxlanID        = "10"
        multicastGroup = "239.1.1.1"
    )

    steps := [...]string{
        fmt.Sprintf("ip link add %s type vxlan id %s group %s dstport 0 dev %s", name, vxlanID, multicastGroup, nodeInterface),
        fmt.Sprintf("ip link set %s master %s", name, bridgeName),
        fmt.Sprintf("ip link set %s up", name),
    }

    for i := range steps {
        if err := pkg.ExecuteCommand(steps[i]); err != nil {
            return err
        }
    }

    return nil
}
Enter fullscreen mode Exit fullscreen mode

Next, we have a function to check if a device exists, to check that, we can look for a file in /sys/class/net/ for example if a br0 device exists the /sys/class/net/br0 exists as well:

// IsDeviceExists reports whether a network device called name is present, by
// checking for its entry under /sys/class/net. It returns false only when the
// stat call reports that the entry does not exist.
func IsDeviceExists(name string) bool {
    devicePath := fmt.Sprintf("/sys/class/net/%s", name)

    if _, statErr := os.Stat(devicePath); os.IsNotExist(statErr) {
        return false
    }

    return true
}
Enter fullscreen mode Exit fullscreen mode

Next, for getting the local IP address (used to discover the node IP address inside the node):

// GetLocalIPAddr returns the first address configured on the network
// interface called interfaceName, formatted with the address's String method.
//
// NOTE(review): on Linux the entries are typically in CIDR form (for example
// "172.18.0.2/24"); confirm before relying on the exact format.
//
// An error is returned when the interface cannot be found, its addresses
// cannot be listed, or it has no addresses at all.
func GetLocalIPAddr(interfaceName string) (addr string, err error) {
    ief, err := net.InterfaceByName(interfaceName)
    if err != nil {
        return "", err
    }

    addrs, err := ief.Addrs()
    if err != nil {
        return "", err
    }

    // Guard the index below: an interface may have no address assigned.
    if len(addrs) == 0 {
        return "", fmt.Errorf("interface %s has no addresses", interfaceName)
    }

    return addrs[0].String(), nil
}
Enter fullscreen mode Exit fullscreen mode

To get a new available IP address within a given CIDR, we list all the possible hosts and ping each of them until one does not respond (meaning it is not allocated). This code was taken from the ipcalc.go gist:

// GetNextAvailableIPAddr returns the first host address in cidr for which a
// single ping prints "Destination Host Unreachable". It returns an error when
// cidr cannot be expanded into hosts or when no such address is found.
//
// NOTE(review): with Linux iputils ping, -t sets the TTL rather than a
// timeout; confirm which was intended.
func GetNextAvailableIPAddr(cidr string) (string, error) {
    candidates, err := hosts(cidr)
    if err != nil {
        return "", err
    }

    const unreachable = "Destination Host Unreachable"

    for i := range candidates {
        candidate := candidates[i]

        // The command's error is ignored; only the output text is inspected.
        reply, _ := exec.Command("ping", "-c1", "-t1", candidate).Output()
        if !strings.Contains(string(reply), unreachable) {
            continue
        }

        return candidate, nil
    }

    return "", fmt.Errorf("no available ip have found in cidr: %s", cidr)
}

// hosts lists the usable host addresses of cidr as strings, in ascending
// order, leaving out the first (network) and last (broadcast) address.
//
// A range with fewer than three addresses (for example a /31 or /32) has no
// usable hosts, so an empty result is returned for it. An error is returned
// when cidr cannot be parsed.
func hosts(cidr string) ([]string, error) {
    ip, ipnet, err := net.ParseCIDR(cidr)
    if err != nil {
        return nil, err
    }

    // inc advances ip by one in place, carrying into the higher bytes.
    inc := func(ip net.IP) {
        for j := len(ip) - 1; j >= 0; j-- {
            ip[j]++
            if ip[j] > 0 {
                break
            }
        }
    }

    var ips []string
    for ip := ip.Mask(ipnet.Mask); ipnet.Contains(ip); inc(ip) {
        ips = append(ips, ip.String())
    }

    // Slicing off both ends needs at least two entries; without this guard a
    // single-address range would panic.
    if len(ips) < 2 {
        return nil, nil
    }

    // remove network address and broadcast address
    return ips[1 : len(ips)-1], nil
}
Enter fullscreen mode Exit fullscreen mode

Let's test those changes!

❯ make build
# create node
❯ sudo ./bin/main node create
2022/11/05 13:05:26 node created: node-9613851a-2fcb-440f-949d-77eade2c3b0b
2022/11/05 13:05:26 starting node
2022/11/05 13:05:26 node assign port: 49232
# create pod
❯ curl -X POST localhost:49232/pods -H 'Content-Type: application/json' -d '{"name": "pod1", "image registry": "docker.io/cloudnativelabs/whats-my-ip:latest"}'
{"image registry":"docker.io/cloudnativelabs/whats-my-ip:latest","name":"pod1-49fa480b-803c-4655-87a2-a93f643e0c61"}
# the node logs - we can see all the commands that were executed
❯ sudo docker logs 892

   ____    __
  / __/___/ /  ___
 / _// __/ _ \/ _ \
/___/\__/_//_/\___/ v4.9.0
High performance, minimalist Go web framework
https://echo.labstack.com
____________________________________O/_______
                                    O\
2022/11/05 11:05:26 containerd logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:26 containerd running
⇨ http server started on [::]:10250
2022/11/05 11:05:37 pod created: pod1-49fa480b-803c-4655-87a2-a93f643e0c61
2022/11/05 11:05:37 starting pod
2022/11/05 11:05:37 setting up pod network
2022/11/05 11:05:37 ip link add br0 type bridge logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:37 ip addr add 10.0.3.1/24 dev br0 logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:37 ip link set br0 up logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:37 ip link add vxlan10 type vxlan id 10 group 239.1.1.1 dstport 0 dev eth0 logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:37 ip link set vxlan10 master br0 logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:37 ip link set vxlan10 up logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:50 ip link add veth-pod1-49fa4 type veth peer name ceth-pod1-49fa4 logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:50 ip link set veth-pod1-49fa4 up logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:50 ip link set ceth-pod1-49fa4 netns /proc/51/ns/net logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:50 nsenter --net=/proc/51/ns/net ip link set ceth-pod1-49fa4 up logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:50 nsenter --net=/proc/51/ns/net ip addr add 10.0.3.3/24 dev ceth-pod1-49fa4 logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:50 ip link set veth-pod1-49fa4 master br0 logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:50 nsenter --net=/proc/51/ns/net /usr/sbin/ip route add default via 10.0.3.1/24 logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
method=POST, uri=/pods, status=201
# and inside the node we can see that the bridge and vxlan are created, and we can ping the pod:
❯ sudo docker exec -it 892 /bin/bash
root@8923c146abc3:/agent# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
2: br0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1450 qdisc noqueue state UP group default qlen 1000
    link/ether 42:89:5f:a7:4f:78 brd ff:ff:ff:ff:ff:ff
    inet 10.0.3.1/24 scope global br0
       valid_lft forever preferred_lft forever
3: vxlan10: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1450 qdisc noqueue master br0 state UNKNOWN group default qlen 1000
    link/ether 42:89:5f:a7:4f:78 brd ff:ff:ff:ff:ff:ff
5: veth-pod1-49fa4@if4: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue master br0 state UP group default qlen 1000
    link/ether b2:aa:10:8b:8a:25 brd ff:ff:ff:ff:ff:ff link-netnsid 1
267: eth0@if268: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default
    link/ether 02:42:ac:12:00:03 brd ff:ff:ff:ff:ff:ff link-netnsid 0
    inet 172.18.0.3/24 brd 172.18.0.255 scope global eth0
       valid_lft forever preferred_lft forever
root@8923c146abc3:/agent# ping 10.0.3.3
PING 10.0.3.3 (10.0.3.3) 56(84) bytes of data.
64 bytes from 10.0.3.3: icmp_seq=1 ttl=64 time=0.142 ms
64 bytes from 10.0.3.3: icmp_seq=2 ttl=64 time=0.069 ms
^C
--- 10.0.3.3 ping statistics ---
Enter fullscreen mode Exit fullscreen mode

In the next article, we will continue implementing the network in code, adding ClusterIP and NodePort with iptables.

As always, the source code can be found here, the changes can be seen in this commit.

💖 💪 🙅 🚩
jonatan5524
Jonatan Ezron

Posted on November 5, 2022

Join Our Newsletter. No Spam, Only the good stuff.

Sign up to receive the latest update from our blog.

Related