Build own Kubernetes - Pods network in code
Jonatan Ezron
Posted on November 5, 2022
In the previous articles, we created the pod's network using network devices and iptables rules. In this article and the next, we are going to implement the same setup in code.
Keep in mind that, to keep things simple, we are going to add the network functionality to the kube-agent; in Kubernetes the component responsible for this part is the kube-proxy.
First, we want to create a docker network when creating a node, so in pkg/node/node.go
we create a new function:
// createNetwork makes sure the docker network used by the nodes exists.
//
// If a network named NODE_DOCKER_NETWORK_NAME can already be inspected the
// function returns nil without doing anything. Otherwise it creates the
// network with the "default" IPAM driver and NODE_CIDR as its subnet and
// returns the error (if any) of the creation call.
func createNetwork(cli *client.Client, ctx context.Context) error {
	if _, err := cli.NetworkInspect(ctx, NODE_DOCKER_NETWORK_NAME, types.NetworkInspectOptions{}); err == nil {
		// The network is already present; nothing to create.
		return nil
	}
	// NOTE(review): every inspect failure, not only "network not found",
	// falls through to creation; a real API failure will then most likely
	// surface again from NetworkCreate below.

	newNetwork := types.NetworkCreate{IPAM: &network.IPAM{
		Driver: "default",
		Config: []network.IPAMConfig{{
			Subnet: NODE_CIDR,
		}},
	}}

	// Use the caller's ctx (the original passed context.Background() here) so
	// that cancellation and deadlines also apply to the creation request.
	_, err := cli.NetworkCreate(ctx, NODE_DOCKER_NETWORK_NAME, newNetwork)
	return err
}
The above function creates a network using the provided name and node CIDR; both constants are defined in pkg/node/constraints.go
:
package node
import "github.com/jonatan5524/own-kubernetes/pkg/agent/api"
// Constants used when creating and running node containers.
const (
// NODE_NAME is presumably the name prefix of node containers (the sample
// output shows nodes named "node-<uuid>") - TODO confirm against NewNode.
NODE_NAME = "node"
// NODE_IMAGE is presumably the image node containers are created from.
NODE_IMAGE = "own-kube-node"
// NODE_PORT is the agent API port in "<port>/tcp" form.
NODE_PORT = api.PORT + "/tcp"
// NODE_PORT_HOST_IP is the host IP the node port binding uses.
NODE_PORT_HOST_IP = "0.0.0.0"
// MEMORY_LIMIT is the memory limit of a node container, in bytes.
MEMORY_LIMIT = 2.9e+9 // 2900MB
// CPU_LIMIT is passed as the node container's CPUShares value.
CPU_LIMIT = 2
// NODE_HOST_MIN_PORT and NODE_HOST_MAX_PORT are not referenced by the code
// shown here; presumably the host port range reserved for nodes.
NODE_HOST_MIN_PORT = 10250
NODE_HOST_MAX_PORT = 10300
// NODE_DOCKER_NETWORK_NAME is the docker network every node is attached to.
NODE_DOCKER_NETWORK_NAME = "kube-net"
// NODE_CIDR is the subnet configured on that docker network.
NODE_CIDR = "172.18.0.0/24"
)
Now we call the network creation at the start of the NewNode
function, and attach the node to the network through hostConfig
when calling ContainerCreate:
// NewNode creates (but does not start) a node container and returns a Node
// holding its id.
//
// The docker network is ensured first via createNetwork; the container is then
// created with a host configuration that attaches it to that network. Any
// error from createNetwork or ContainerCreate is returned with a nil Node.
func NewNode(cli *client.Client, ctx context.Context) (*Node, error) {
// Make sure the node network exists before a container is attached to it.
err := createNetwork(cli, ctx)
if err != nil {
return nil, err
}
// The lines elided below presumably define `id` and `config`, which are used
// further down but not declared in the visible part of this listing.
.
.
.
hostConfig := &container.HostConfig{
// Publish the agent API port on the host.
PortBindings: nat.PortMap{
nat.Port(fmt.Sprintf("%s/tcp", api.PORT)): []nat.PortBinding{
{
HostIP: NODE_PORT_HOST_IP,
// NOTE(review): "0" presumably lets docker pick a free host port
// (the sample output logs an assigned port) - confirm.
HostPort: "0",
},
},
},
// Attach the node to the network ensured by createNetwork.
NetworkMode: NODE_DOCKER_NETWORK_NAME,
Resources: container.Resources{
Memory: MEMORY_LIMIT,
// NOTE(review): CPUShares is a relative scheduling weight, not a CPU
// count - confirm that CPU_LIMIT is meant to be used this way.
CPUShares: CPU_LIMIT,
},
// NOTE(review): presumably required so the agent can create network
// devices inside the node - confirm.
Privileged: true,
}
_, err = cli.ContainerCreate(ctx, config, hostConfig, &network.NetworkingConfig{}, nil, id)
if err != nil {
return nil, err
}
return &Node{Id: id}, nil
}
Now we create in pkg/util.go
an ExecuteCommand
function for all the commands we are going to execute in the node:
// ExecuteCommand launches command through `bash -c` and returns as soon as the
// process has been started; it does NOT wait for the command to finish.
//
// The child's stdout and stderr are forwarded to the agent's own stdout and
// stderr. The returned error only reports a failure to start the process. A
// non-zero exit status is logged from a background goroutine, which also reaps
// the child so it does not linger as a zombie.
//
// NOTE(review): because the call is asynchronous, callers that issue several
// dependent commands in a row get no ordering guarantee from this function.
func ExecuteCommand(command string) error {
	cmd := exec.Command("bash", "-c", command)
	cmd.Stdout = os.Stdout
	// Forward stderr as well; otherwise the command's error output is discarded.
	cmd.Stderr = os.Stderr

	if err := cmd.Start(); err != nil {
		return err
	}
	// Log the command itself. The previous message formatted cmd.Stdout (an
	// *os.File) with %s, which printed the struct's internals, not any output.
	log.Printf("started command: %s", command)

	// Wait in the background: keeps the call non-blocking (long-running
	// commands are started through this function) while still releasing the
	// process resources and surfacing failures in the log.
	go func() {
		if err := cmd.Wait(); err != nil {
			log.Printf("command %q failed: %v", command, err)
		}
	}()

	return nil
}
Now for the pod's network, we add in pkg/pod/service.go
in NewPodAndRun
a call for a new function to connect to the network:
// NewPodAndRun creates a pod from imageRegistry with the given name, starts
// it, and wires the running task into the pod network.
//
// It returns the id of the created pod, or an empty string and the first
// error hit while creating, running, or connecting the pod.
func NewPodAndRun(imageRegistry string, name string) (string, error) {
	created, err := NewPod(imageRegistry, name)
	if err != nil {
		return "", err
	}
	log.Printf("pod created: %s\n", created.Id)

	log.Printf("starting pod\n")
	running, err := created.Run()
	if err != nil {
		return "", err
	}

	log.Printf("setting up pod network\n")
	// The pid of the running task identifies the network namespace to join.
	taskPid := (*running.Task).Pid()
	err = connectToNetwork(created.Id, taskPid)
	if err != nil {
		return "", err
	}

	return created.Id, nil
}
The connectToNetwork
function creates a bridge and a VXLAN if necessary, finds a new available IP address, and creates the veth
pair for connection:
// connectToNetwork attaches the pod whose task runs as process pid to the
// node's pod network.
//
// It derives the node's pod CIDR, creates the bridge and the VXLAN device when
// they do not exist yet, picks a free IP address in the pod CIDR, and creates
// a veth pair whose peer end is moved into the pod's network namespace.
// The first error encountered is returned.
func connectToNetwork(podId string, pid uint32) error {
	// Device names are "veth-<netId>" / "ceth-<netId>"; keep them within a
	// 15-character budget (the interface-name length limit on Linux).
	const maxNetIdLen = 15 - len("veth") - 1
	netId := podId
	if len(netId) > maxNetIdLen {
		// Only truncate when needed: slicing a shorter id would panic.
		netId = netId[:maxNetIdLen]
	}

	podCIDR, err := generateNewNodePodCIDR()
	if err != nil {
		return err
	}

	// podCIDR: 10.0.2.0/24 -> bridgeIP: 10.0.2.1/24
	bridgeIP := pkg.ReplaceAtIndex(podCIDR, '1', len(podCIDR)-4)

	if !net.IsDeviceExists(BRIDGE_NAME) {
		if err := net.CreateBridge(BRIDGE_NAME, bridgeIP); err != nil {
			return err
		}
	}

	if !net.IsDeviceExists(VXLAN_NAME) {
		if err := net.CreateVXLAN(VXLAN_NAME, NODE_LOCAL_NETWORK_INTERFACE, BRIDGE_NAME); err != nil {
			return err
		}
	}

	podIP, err := net.GetNextAvailableIPAddr(podCIDR)
	if err != nil {
		return err
	}

	// The pod address reuses the last three characters of the CIDR as its
	// prefix length (e.g. "/24"); this assumes a two-digit prefix.
	podAddr := podIP + podCIDR[len(podCIDR)-3:]

	return net.CreateVethPairNamespaces(
		fmt.Sprintf("veth-%s", netId),
		fmt.Sprintf("ceth-%s", netId),
		BRIDGE_NAME,
		int(pid),
		podAddr,
		bridgeIP,
	)
}
We generate a pod CIDR for the current pod using the generateNewNodePodCIDR
function, which gets the IP address of the node and replaces the x in the pod CIDR template 10.0.x.0/24
with the last octet of that IP address. For example, if our node IP address is 172.18.0.2, the pod CIDR on this node will be 10.0.2.0/24.
// generateNewNodePodCIDR builds this node's pod CIDR by substituting the "x"
// in the POD_CIDR template with the last octet of the node's own address on
// NODE_LOCAL_NETWORK_INTERFACE.
//
// It returns the error from the address lookup, if any.
func generateNewNodePodCIDR() (string, error) {
	localIPAddr, err := net.GetLocalIPAddr(NODE_LOCAL_NETWORK_INTERFACE)
	if err != nil {
		return "", err
	}

	// The address may carry a prefix length (e.g. "172.18.0.2/24"); drop it.
	host := localIPAddr
	if slash := strings.IndexByte(host, '/'); slash >= 0 {
		host = host[:slash]
	}

	// Take everything after the last dot so that multi-digit octets such as
	// "172.18.0.12" map to "12" instead of a single character picked by a
	// fixed index.
	lastOctet := host[strings.LastIndexByte(host, '.')+1:]

	// localIPAddr: 172.18.0.2/24 -> podCIDR: 10.0.2.0/24
	return strings.ReplaceAll(POD_CIDR, "x", lastOctet), nil
}
All the constants are taken from pkg/pod/constraints.go
:
package node
import "github.com/jonatan5524/own-kubernetes/pkg/agent/api"
// NOTE(review): this listing is identical to the node constants shown for
// pkg/node/constraints.go (same package clause and values). The constants the
// pod code references - POD_CIDR, BRIDGE_NAME, VXLAN_NAME and
// NODE_LOCAL_NETWORK_INTERFACE - are not part of it.
const (
// NODE_NAME is presumably the name prefix of node containers - TODO confirm.
NODE_NAME = "node"
// NODE_IMAGE is presumably the image node containers are created from.
NODE_IMAGE = "own-kube-node"
// NODE_PORT is the agent API port in "<port>/tcp" form.
NODE_PORT = api.PORT + "/tcp"
// NODE_PORT_HOST_IP is the host IP the node port binding uses.
NODE_PORT_HOST_IP = "0.0.0.0"
// MEMORY_LIMIT is the memory limit of a node container, in bytes.
MEMORY_LIMIT = 2.9e+9 // 2900MB
// CPU_LIMIT is passed as the node container's CPUShares value.
CPU_LIMIT = 2
// NODE_HOST_MIN_PORT and NODE_HOST_MAX_PORT are not referenced by the code
// shown here; presumably the host port range reserved for nodes.
NODE_HOST_MIN_PORT = 10250
NODE_HOST_MAX_PORT = 10300
// NODE_DOCKER_NETWORK_NAME is the docker network every node is attached to.
NODE_DOCKER_NETWORK_NAME = "kube-net"
// NODE_CIDR is the subnet configured on that docker network.
NODE_CIDR = "172.18.0.0/24"
)
All of our network-related functions are from pkg/net/net.go
, so let's go through those functions.
First, our create functions for veth, bridge, and vxlan are fairly simple; we just execute the commands from the previous articles:
// CreateBridge adds a bridge device called name, assigns ipAddr to it and
// brings it up, issuing the three `ip` commands in that order through
// pkg.ExecuteCommand. It stops at, and returns, the first error.
func CreateBridge(name string, ipAddr string) error {
	steps := []string{
		fmt.Sprintf("ip link add %s type bridge", name),
		fmt.Sprintf("ip addr add %s dev %s", ipAddr, name),
		fmt.Sprintf("ip link set %s up", name),
	}

	for _, step := range steps {
		if err := pkg.ExecuteCommand(step); err != nil {
			return err
		}
	}
	return nil
}
// CreateVethPairNamespaces creates the veth pair name/pair and connects a
// network namespace to a bridge with it.
//
// name stays in the current namespace, is brought up and enslaved to bridge.
// pair is moved into the network namespace of process namespacePID, brought
// up there and given ipAddr. Finally a default route via the bridge address is
// added inside that namespace.
//
// bridgeIpAddr may be given with or without a prefix length ("10.0.2.1/24" or
// "10.0.2.1"). The commands are issued in order through pkg.ExecuteCommand and
// the first error is returned.
func CreateVethPairNamespaces(name string, pair string, bridge string, namespacePID int, ipAddr string, bridgeIpAddr string) error {
	// `ip route ... via` expects a bare address, so drop any "/<prefix>"
	// suffix the caller passed along with the bridge address.
	gateway := bridgeIpAddr
	if slash := strings.IndexByte(gateway, '/'); slash >= 0 {
		gateway = gateway[:slash]
	}

	commands := []string{
		fmt.Sprintf("ip link add %s type veth peer name %s", name, pair),
		fmt.Sprintf("ip link set %s up", name),
		// Move the peer end into the target network namespace.
		fmt.Sprintf("ip link set %s netns /proc/%d/ns/net", pair, namespacePID),
		fmt.Sprintf("nsenter --net=/proc/%d/ns/net ip link set %s up", namespacePID, pair),
		fmt.Sprintf("nsenter --net=/proc/%d/ns/net ip addr add %s dev %s", namespacePID, ipAddr, pair),
		fmt.Sprintf("ip link set %s master %s", name, bridge),
		fmt.Sprintf("nsenter --net=/proc/%d/ns/net /usr/sbin/ip route add default via %s", namespacePID, gateway),
	}

	for _, command := range commands {
		if err := pkg.ExecuteCommand(command); err != nil {
			return err
		}
	}
	return nil
}
// CreateVXLAN adds a VXLAN device called name on top of nodeInterface,
// enslaves it to bridgeName and brings it up. The VXLAN id ("10") and the
// multicast group ("239.1.1.1") are fixed. Commands run in order through
// pkg.ExecuteCommand; the first error is returned.
func CreateVXLAN(name string, nodeInterface string, bridgeName string) error {
	const (
		vxlanID        = "10"
		multicastGroup = "239.1.1.1"
	)

	steps := [...]string{
		fmt.Sprintf("ip link add %s type vxlan id %s group %s dstport 0 dev %s", name, vxlanID, multicastGroup, nodeInterface),
		fmt.Sprintf("ip link set %s master %s", name, bridgeName),
		fmt.Sprintf("ip link set %s up", name),
	}

	for i := range steps {
		if err := pkg.ExecuteCommand(steps[i]); err != nil {
			return err
		}
	}
	return nil
}
Next, we have a function to check if a device exists. To check that, we can look for an entry in /sys/class/net/
for example if a br0
device exists the /sys/class/net/br0
exists as well:
// IsDeviceExists reports whether a network device called name exists, by
// checking for its entry under /sys/class/net.
//
// It returns true only when the entry can actually be stat'ed. Any stat
// failure - not just "does not exist", but also e.g. an invalid name - is
// reported as false instead of being mistaken for an existing device.
func IsDeviceExists(name string) bool {
	_, err := os.Stat("/sys/class/net/" + name)
	return err == nil
}
Next, for getting the local IP address (used to discover the node IP address inside the node):
// GetLocalIPAddr returns the first address assigned to the network interface
// called interfaceName, in the string form produced by the address's String
// method (for interface addresses this is typically CIDR notation such as
// "172.18.0.2/24" - verify against the caller's expectations).
//
// An error is returned when the interface cannot be found, its addresses
// cannot be listed, or it has no address at all.
func GetLocalIPAddr(interfaceName string) (addr string, err error) {
	ief, err := net.InterfaceByName(interfaceName)
	if err != nil {
		return "", err
	}

	addrs, err := ief.Addrs()
	if err != nil {
		return "", err
	}

	// Guard the index below: an interface without addresses would panic.
	if len(addrs) == 0 {
		return "", fmt.Errorf("no addresses assigned to interface %s", interfaceName)
	}

	return addrs[0].String(), nil
}
To get a new available IP address within a given CIDR, we list all the possible hosts and ping each of them until one does not return a response (meaning it is not allocated). This code was adapted from the ipcalc.go gist:
// GetNextAvailableIPAddr returns the first host address in cidr that looks
// unused.
//
// Each candidate produced by hosts(cidr) is pinged once, in order; a
// candidate counts as free when the ping output contains
// "Destination Host Unreachable". An error is returned when cidr cannot be
// expanded or when no candidate qualifies.
func GetNextAvailableIPAddr(cidr string) (string, error) {
	candidates, err := hosts(cidr)
	if err != nil {
		return "", err
	}

	const unreachableMarker = "Destination Host Unreachable"

	for i := range candidates {
		candidate := candidates[i]

		// The error from ping is deliberately ignored: only the captured
		// output decides whether the address is taken (presumably because
		// ping exits non-zero for exactly the hosts we are looking for).
		reply, _ := exec.Command("ping", "-c1", "-t1", candidate).Output()
		if !strings.Contains(string(reply), unreachableMarker) {
			continue
		}
		return candidate, nil
	}

	return "", fmt.Errorf("no available ip have found in cidr: %s", cidr)
}
// hosts expands cidr into the list of its usable host addresses, i.e. every
// address of the network except the first (network address) and the last
// (broadcast address).
//
// It returns an error when cidr cannot be parsed. Networks with fewer than
// three addresses (such as a /32 or /31) have no usable host under this rule
// and yield an empty list instead of panicking.
func hosts(cidr string) ([]string, error) {
	ip, ipnet, err := net.ParseCIDR(cidr)
	if err != nil {
		return nil, err
	}

	// next increments addr in place, carrying into the higher bytes on overflow.
	next := func(addr net.IP) {
		for j := len(addr) - 1; j >= 0; j-- {
			addr[j]++
			if addr[j] > 0 {
				break
			}
		}
	}

	// Walk from the network address until the address leaves the network.
	// Mask returns a fresh slice, so incrementing it does not touch ipnet.
	var ips []string
	for cur := ip.Mask(ipnet.Mask); ipnet.Contains(cur); next(cur) {
		ips = append(ips, cur.String())
	}

	// Nothing remains once network and broadcast addresses are removed;
	// slicing below would be out of range for a single-address network.
	if len(ips) < 3 {
		return nil, nil
	}

	// remove network address and broadcast address
	return ips[1 : len(ips)-1], nil
}
Let's test those changes!
❯ make build
# create node
❯ sudo ./bin/main node create
2022/11/05 13:05:26 node created: node-9613851a-2fcb-440f-949d-77eade2c3b0b
2022/11/05 13:05:26 starting node
2022/11/05 13:05:26 node assign port: 49232
# create pod
❯ curl -X POST localhost:49232/pods -H 'Content-Type: application/json' -d '{"name": "pod1", "image registry": "docker.io/cloudnativelabs/whats-my-ip:latest"}'
{"image registry":"docker.io/cloudnativelabs/whats-my-ip:latest","name":"pod1-49fa480b-803c-4655-87a2-a93f643e0c61"}
# the node logs - we can see all the commands that were executed
❯ sudo docker logs 892
____ __
/ __/___/ / ___
/ _// __/ _ \/ _ \
/___/\__/_//_/\___/ v4.9.0
High performance, minimalist Go web framework
https://echo.labstack.com
____________________________________O/_______
O\
2022/11/05 11:05:26 containerd logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:26 containerd running
⇨ http server started on [::]:10250
2022/11/05 11:05:37 pod created: pod1-49fa480b-803c-4655-87a2-a93f643e0c61
2022/11/05 11:05:37 starting pod
2022/11/05 11:05:37 setting up pod network
2022/11/05 11:05:37 ip link add br0 type bridge logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:37 ip addr add 10.0.3.1/24 dev br0 logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:37 ip link set br0 up logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:37 ip link add vxlan10 type vxlan id 10 group 239.1.1.1 dstport 0 dev eth0 logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:37 ip link set vxlan10 master br0 logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:37 ip link set vxlan10 up logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:50 ip link add veth-pod1-49fa4 type veth peer name ceth-pod1-49fa4 logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:50 ip link set veth-pod1-49fa4 up logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:50 ip link set ceth-pod1-49fa4 netns /proc/51/ns/net logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:50 nsenter --net=/proc/51/ns/net ip link set ceth-pod1-49fa4 up logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:50 nsenter --net=/proc/51/ns/net ip addr add 10.0.3.3/24 dev ceth-pod1-49fa4 logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:50 ip link set veth-pod1-49fa4 master br0 logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
2022/11/05 11:05:50 nsenter --net=/proc/51/ns/net /usr/sbin/ip route add default via 10.0.3.1/24 logs: &{%!s(*os.file=&{{{0 0 0} 1 {0} <nil> 0 1 true true true} /dev/stdout <nil> false true false})}
method=POST, uri=/pods, status=201
# and inside the node we can see that the bridge and vxlan were created, and we can ping the pod:
❯ sudo docker exec -it 892 /bin/bash
root@8923c146abc3:/agent# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
2: br0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1450 qdisc noqueue state UP group default qlen 1000
link/ether 42:89:5f:a7:4f:78 brd ff:ff:ff:ff:ff:ff
inet 10.0.3.1/24 scope global br0
valid_lft forever preferred_lft forever
3: vxlan10: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1450 qdisc noqueue master br0 state UNKNOWN group default qlen 1000
link/ether 42:89:5f:a7:4f:78 brd ff:ff:ff:ff:ff:ff
5: veth-pod1-49fa4@if4: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue master br0 state UP group default qlen 1000
link/ether b2:aa:10:8b:8a:25 brd ff:ff:ff:ff:ff:ff link-netnsid 1
267: eth0@if268: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default
link/ether 02:42:ac:12:00:03 brd ff:ff:ff:ff:ff:ff link-netnsid 0
inet 172.18.0.3/24 brd 172.18.0.255 scope global eth0
valid_lft forever preferred_lft forever
root@8923c146abc3:/agent# ping 10.0.3.3
PING 10.0.3.3 (10.0.3.3) 56(84) bytes of data.
64 bytes from 10.0.3.3: icmp_seq=1 ttl=64 time=0.142 ms
64 bytes from 10.0.3.3: icmp_seq=2 ttl=64 time=0.069 ms
^C
--- 10.0.3.3 ping statistics ---
In the next article, we will continue implementing our network in code, adding ClusterIP and NodePort with iptables.
As always, the source code can be found here, the changes can be seen in this commit.
Posted on November 5, 2022
Join Our Newsletter. No Spam, Only the good stuff.
Sign up to receive the latest update from our blog.