公司的开发测试环境想部署一个 docker 集群,k8s 不会,k3s 更不会。
目前的组合是:以 ESXi 作为底层系统,虚拟出多台 CentOS 8 虚拟机,并在这些 CentOS 8 上搭建 docker swarm 集群。现在出现了一个问题:docker swarm 部署服务完成后,跨主机的容器之间都能正常 ping 通,但是从宿主机访问 docker 开放的端口时,每访问三次只有一次成功。具体如下:
公司路由器网关 10.0.0.1
1. server-01 10.0.0.21 (manage)
2. server-02 10.0.0.22
3. server-03 10.0.0.23
### 防火墙全部关闭,只有 iptables
server-01 $ docker swarm init --default-addr-pool 192.0.0.0/24
server-02 $ docker swarm join
server-03 $ docker swarm join
###
server-01 $ docker node ls
ID HOSTNAME STATUS AVAILABILITY MANAGER STATUS ENGINE VERSION
km7dmxn402qt0s473kpqb47ac * Server-01 Ready Active Leader 20.10.9
k5vq74oh1njscvv4mf9gpyogh Server-02 Ready Active 20.10.9
rxzmo276saehmh1rc118fdxxe Server-03 Ready Active 20.10.9
### 网络状态如下
server-01 $ docker network inspect ingress
[
{
"Name": "ingress",
"Id": "m7ia7lmmlu1zm0zchr13ohk4q",
"Created": "2021-10-14T15:08:48.036907446+08:00",
"Scope": "swarm",
"Driver": "overlay",
"EnableIPv6": false,
"IPAM": {
"Driver": "default",
"Options": null,
"Config": [
{
"Subnet": "192.0.0.0/24",
"Gateway": "192.0.0.1"
}
]
},
"Internal": false,
"Attachable": false,
"Ingress": true,
"ConfigFrom": {
"Network": ""
},
"ConfigOnly": false,
"Containers": {
"ingress-sbox": {
"Name": "ingress-endpoint",
"EndpointID": "4b5146ca8e180dd88a5271b7d29b439f6d5995801a47d8c648379d9b51ab0b77",
"MacAddress": "02:42:c0:00:00:02",
"IPv4Address": "192.0.0.2/24",
"IPv6Address": ""
}
},
"Options": {
"com.docker.network.driver.overlay.vxlanid_list": "4096"
},
"Labels": {},
"Peers": [
{
"Name": "6ebb8868ac00",
"IP": "10.0.0.21"
},
{
"Name": "7982d5a14bf2",
"IP": "10.0.0.22"
},
{
"Name": "b25e17d118a4",
"IP": "10.0.0.23"
}
]
}
]
server-01 $ docker network inspect docker_gwbridge
[
{
"Name": "docker_gwbridge",
"Id": "6f2d03207e884bfec1918d4e8fc1a1f5f14ec9e5bcd71fd409a26630ab73d413",
"Created": "2021-10-14T15:08:48.422229208+08:00",
"Scope": "local",
"Driver": "bridge",
"EnableIPv6": false,
"IPAM": {
"Driver": "default",
"Options": null,
"Config": [
{
"Subnet": "172.18.0.0/16",
"Gateway": "172.18.0.1"
}
]
},
"Internal": false,
"Attachable": false,
"Ingress": false,
"ConfigFrom": {
"Network": ""
},
"ConfigOnly": false,
"Containers": {
"ingress-sbox": {
"Name": "gateway_ingress-sbox",
"EndpointID": "1c4c1b5ba462d87832710029171c3911df457c950055a369670f59cef374247b",
"MacAddress": "02:42:ac:12:00:02",
"IPv4Address": "172.18.0.2/16",
"IPv6Address": ""
}
},
"Options": {
"com.docker.network.bridge.enable_icc": "false",
"com.docker.network.bridge.enable_ip_masquerade": "true",
"com.docker.network.bridge.name": "docker_gwbridge"
},
"Labels": {}
}
]
### 创建 nginx 服务
server-01 $ docker service create --replicas 3 -p 80:80 --name nginx nginx
server-01 $ docker service ps nginx
ID NAME IMAGE NODE DESIRED STATE CURRENT STATE ERROR PORTS
xsomsqqtkr62 nginx.1 nginx:latest Server-02 Running Running 2 minutes ago
selbdoapjek0 nginx.2 nginx:latest Server-03 Running Running 2 minutes ago
w5bigfn8xtz4 nginx.3 nginx:latest Server-01 Running Running 2 minutes ago
server-01 $ docker service ls
ID NAME MODE REPLICAS IMAGE PORTS
ro33x7v9ceri nginx replicated 3/3 nginx:latest *:80->80/tcp
server-01 $ docker ps -a
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
37de9b7759c9 nginx:latest "/docker-entrypoint.…" 5 minutes ago Up 5 minutes 80/tcp nginx.3.w5bigfn8xtz4pi10hoe62gi4b
···························································
## 重点来了!!!
[root@Server-01 ~]# curl 10.0.0.21 --卡住
^C
[root@Server-01 ~]# curl 10.0.0.21 --卡住
^C
[root@Server-01 ~]# curl 10.0.0.21 --三次成功一次
<!DOCTYPE html>
<html>
<head>
<title>Welcome to nginx!</title>
<style>
html { color-scheme: light dark; }
body { width: 35em; margin: 0 auto;
font-family: Tahoma, Verdana, Arial, sans-serif; }
</style>
</head>
<body>
<h1>Welcome to nginx!</h1>
<p>If you see this page, the nginx web server is successfully installed and
working. Further configuration is required.</p>
<p>For online documentation and support please refer to
<a href="http://nginx.org/">nginx.org</a>.<br/>
Commercial support is available at
<a href="http://nginx.com/">nginx.com</a>.</p>
<p><em>Thank you for using nginx.</em></p>
</body>
</html>
[root@Server-01 ~]# netstat -tunlp
Active Internet connections (only servers)
Proto Recv-Q Send-Q Local Address Foreign Address State PID/Program name
tcp 0 0 192.168.122.1:53 0.0.0.0:* LISTEN 1740/dnsmasq
tcp 0 0 0.0.0.0:22 0.0.0.0:* LISTEN 1068/sshd
tcp 0 0 0.0.0.0:111 0.0.0.0:* LISTEN 1/systemd
tcp6 0 0 :::22 :::* LISTEN 1068/sshd
tcp6 0 0 :::2377 :::* LISTEN 1222/dockerd
tcp6 0 0 :::7946 :::* LISTEN 1222/dockerd
tcp6 0 0 :::111 :::* LISTEN 1/systemd
tcp6 0 0 :::80 :::* LISTEN 1222/dockerd
udp 0 0 192.168.122.1:53 0.0.0.0:* 1740/dnsmasq
udp 0 0 0.0.0.0:67 0.0.0.0:* 1740/dnsmasq
udp 0 0 0.0.0.0:111 0.0.0.0:* 1/systemd
udp 0 0 0.0.0.0:4789 0.0.0.0:* -
udp6 0 0 :::7946 :::* 1222/dockerd
udp6 0 0 :::111 :::* 1/systemd
[root@Server-01 ~]# iptables -nL --line-number
Chain INPUT (policy ACCEPT)
num target prot opt source destination
1 LIBVIRT_INP all -- 0.0.0.0/0 0.0.0.0/0
Chain FORWARD (policy DROP)
num target prot opt source destination
1 DOCKER-USER all -- 0.0.0.0/0 0.0.0.0/0
2 DOCKER-INGRESS all -- 0.0.0.0/0 0.0.0.0/0
3 DOCKER-ISOLATION-STAGE-1 all -- 0.0.0.0/0 0.0.0.0/0
4 ACCEPT all -- 0.0.0.0/0 0.0.0.0/0 ctstate RELATED,ESTABLISHED
5 DOCKER all -- 0.0.0.0/0 0.0.0.0/0
6 ACCEPT all -- 0.0.0.0/0 0.0.0.0/0
7 ACCEPT all -- 0.0.0.0/0 0.0.0.0/0 ctstate RELATED,ESTABLISHED
8 DOCKER all -- 0.0.0.0/0 0.0.0.0/0
9 ACCEPT all -- 0.0.0.0/0 0.0.0.0/0
10 ACCEPT all -- 0.0.0.0/0 0.0.0.0/0
11 LIBVIRT_FWX all -- 0.0.0.0/0 0.0.0.0/0
12 LIBVIRT_FWI all -- 0.0.0.0/0 0.0.0.0/0
13 LIBVIRT_FWO all -- 0.0.0.0/0 0.0.0.0/0
14 DROP all -- 0.0.0.0/0 0.0.0.0/0
Chain OUTPUT (policy ACCEPT)
num target prot opt source destination
1 LIBVIRT_OUT all -- 0.0.0.0/0 0.0.0.0/0
Chain LIBVIRT_INP (1 references)
num target prot opt source destination
1 ACCEPT udp -- 0.0.0.0/0 0.0.0.0/0 udp dpt:53
2 ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 tcp dpt:53
3 ACCEPT udp -- 0.0.0.0/0 0.0.0.0/0 udp dpt:67
4 ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 tcp dpt:67
Chain LIBVIRT_OUT (1 references)
num target prot opt source destination
1 ACCEPT udp -- 0.0.0.0/0 0.0.0.0/0 udp dpt:53
2 ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 tcp dpt:53
3 ACCEPT udp -- 0.0.0.0/0 0.0.0.0/0 udp dpt:68
4 ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 tcp dpt:68
Chain LIBVIRT_FWO (1 references)
num target prot opt source destination
1 ACCEPT all -- 192.168.122.0/24 0.0.0.0/0
2 REJECT all -- 0.0.0.0/0 0.0.0.0/0 reject-with icmp-port-unreachable
Chain LIBVIRT_FWI (1 references)
num target prot opt source destination
1 ACCEPT all -- 0.0.0.0/0 192.168.122.0/24 ctstate RELATED,ESTABLISHED
2 REJECT all -- 0.0.0.0/0 0.0.0.0/0 reject-with icmp-port-unreachable
Chain LIBVIRT_FWX (1 references)
num target prot opt source destination
1 ACCEPT all -- 0.0.0.0/0 0.0.0.0/0
Chain DOCKER (2 references)
num target prot opt source destination
Chain DOCKER-ISOLATION-STAGE-1 (1 references)
num target prot opt source destination
1 DOCKER-ISOLATION-STAGE-2 all -- 0.0.0.0/0 0.0.0.0/0
2 DOCKER-ISOLATION-STAGE-2 all -- 0.0.0.0/0 0.0.0.0/0
3 RETURN all -- 0.0.0.0/0 0.0.0.0/0
Chain DOCKER-ISOLATION-STAGE-2 (2 references)
num target prot opt source destination
1 DROP all -- 0.0.0.0/0 0.0.0.0/0
2 DROP all -- 0.0.0.0/0 0.0.0.0/0
3 RETURN all -- 0.0.0.0/0 0.0.0.0/0
Chain DOCKER-USER (1 references)
num target prot opt source destination
1 RETURN all -- 0.0.0.0/0 0.0.0.0/0
Chain DOCKER-INGRESS (1 references)
num target prot opt source destination
1 ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 tcp dpt:80
2 ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 state RELATED,ESTABLISHED tcp spt:80
3 RETURN all -- 0.0.0.0/0 0.0.0.0/0
1
saytesnake 2021-10-14 17:36:45 +08:00
在 esxi 网卡打开允许混合。
|
2
defunct9 2021-10-14 19:59:36 +08:00
哦。推倒重来。你用的 swarm 过时了。直接用 docker-compose
|
3
vinle 2021-10-14 22:36:52 +08:00
首先对楼主的测试方法有点好奇的是:你已经在 01 节点测试服务可用性的话,为什么不 curl localhost/127.0.0.1 ?你 curl 了 10 段的话,其中的测试结果应该是包含两个东西:节点服务可用性+节点本机 10 段网络的配置。
然后,作为两年前实践过用 swarm 来尝试搭过小集群的过来人( 3 台机子,每台约 10 个 service,每个 service 有 5~20 个 replicas 不等),只想说,这货就是个完全的社区项目,这并不是说 swarm 不能用,只是想要达到企业级的稳定性 /安全性 /灵活性是不可能的。为什么?其实了解下 swarm 这项目出来的目的,便会发现这东西是很难达到“好用”级别的(但是不可否认还是挺好玩)。而要想做到前面说的这些,唯有 Kubernetes,国内也有一些服务商有提供 out-of-box 的云原生基建平台,都非常不错。但是如果你要手把手地用 swarm 来搞,那只能祝君好运,并且玩得愉快🌹 |
5
liuxu 2021-10-14 23:14:01 +08:00
你要是 debian/ubuntu 的话我可以帮你详细分析下,其他的系统我就只能大致说下怎么查
首先,你的 server-01 的 ip 似乎有一个 192.168.122.0/24,先确认下 server-0{1,2,3} 和你本地机器的 ip 是不是在一个网段,互相 ping 一下;然后把 server-0{1,2,3} 的 iptables 、netstat 和 ifconfig 都看看;最后互相 curl,在双方机器上用 tcpdump 抓包看看。 |
6
ik 2021-10-14 23:18:54 +08:00 via iPhone
iptables 规则问题? 三个 docker 服务都重启一下呢?
|
7
ziwen1943 2021-10-15 08:57:32 +08:00
看看防火墙和 iptables 是不是有奇奇怪怪的规则
|
8
zxkxhnqwe123 OP @vinle 三台服务器上面都是一样的 调用 curl 127.0.0.1 都是一样的效果. 并且所有系统都是干净重装好的
|
9
zxkxhnqwe123 OP @saytesnake 试过了 好像也不行 ! 叫混杂模式
|
10
zxkxhnqwe123 OP @saytesnake 主要是 我是开发人员,公司也没有专业运维,现在想解决 devops 自动化运维 测试环境,所以只能从简单的折腾
|
11
juzisang 2021-10-15 09:30:24 +08:00
看一下这几个端口有没有开放
https://docs.docker.com/engine/swarm/swarm-tutorial/#open-protocols-and-ports-between-the-hosts 前几个月也搭了一个 swarm 集群 https://www.v2ex.com/t/772731 |
12
byzf 2021-10-15 10:53:59 +08:00
以前碰到过几次请求三次只成功一次的情况,有 dns 配置的问题,有负载均衡的问题。
|
13
defunct9 2021-10-15 10:55:29 +08:00
开 ssh,让我上去看看
|
17
mepwang 2021-10-15 16:16:22 +08:00
curl -v 看看卡到哪一步了
|
18
jackleeforce3615 2021-10-15 16:53:29 +08:00
一直以为没多少人用 docker swarm 了
|
19
mkdir 2021-10-15 17:14:21 +08:00
@jackleeforce3615 一直用一直爽
|
20
zxkxhnqwe123 OP @mepwang
[root@Server-01 ~]# curl 127.0.0.1 -v
* Rebuilt URL to: 127.0.0.1/
* Trying 127.0.0.1...
* TCP_NODELAY set
^C
[root@Server-01 ~]# curl 127.0.0.1 -v
* Rebuilt URL to: 127.0.0.1/
* Trying 127.0.0.1...
* TCP_NODELAY set
* Connected to 127.0.0.1 (127.0.0.1) port 80 (#0)
> GET / HTTP/1.1
> Host: 127.0.0.1
> User-Agent: curl/7.61.1
> Accept: */*
>
< HTTP/1.1 200 OK
< Server: nginx/1.21.3
< Date: Fri, 15 Oct 2021 09:56:24 GMT
< Content-Type: text/html
< Content-Length: 615
< Last-Modified: Tue, 07 Sep 2021 15:21:03 GMT
< Connection: keep-alive
< ETag: "6137835f-267"
< Accept-Ranges: bytes
<
<!DOCTYPE html>
<html>
<head>
<title>Welcome to nginx!</title>
<style>
html { color-scheme: light dark; }
body { width: 35em; margin: 0 auto;
font-family: Tahoma, Verdana, Arial, sans-serif; }
</style>
</head>
<body>
<h1>Welcome to nginx!</h1>
<p>If you see this page, the nginx web server is successfully installed and
working. Further configuration is required.</p>
<p>For online documentation and support please refer to
<a href="http://nginx.org/">nginx.org</a>.<br/>
Commercial support is available at
<a href="http://nginx.com/">nginx.com</a>.</p>
<p><em>Thank you for using nginx.</em></p>
</body>
</html>
* Connection #0 to host 127.0.0.1 left intact
[root@Server-01 ~]# ^C
[root@Server-01 ~]# curl 127.0.0.1 -v
* Rebuilt URL to: 127.0.0.1/
* Trying 127.0.0.1...
* TCP_NODELAY set |
21
mepwang 2021-10-18 11:25:35 +08:00
看不出来什么问题,curl 调用三次成功一次,会不会和你的副本数量有关系?
能给的建议不多, 你把 replica 的数目改成 4 个或 2 个,看看 curl 调用成功的几率是不是变成 4 次或者 2 次成功一次。 感觉是你的 swarm 集群有点问题,直觉上是网络转发这块。 你给你的应用添加一个 overlay network 试试看? |
22
zxkxhnqwe123 OP 终于解决了 !!!! 放假花了两天时间解决了,也当学习了 . 这两天重装了 不下 20 次 ,3 台虚拟机 不停重启,重装.
解决办法就是开启 ESXi 网卡的混杂模式,网卡用 E1000e(这个其实不太确定,不想验证了)。然后确认下 swarm 网关和局域网的网关是否冲突了。这些做完就是圆满结束,感谢以上的朋友帮忙!!! 判断依据: https://stackoverflow.com/questions/59007780/container-running-on-docker-swarm-not-accessible-from-outside |
23
isnullstring 129 天前
@zxkxhnqwe123 #22 回来留个脚印
我的情况跟楼主一样。先是确认 swarm 网关,默认是 10.0.0.0/8,跟现有网段一致的话肯定是不行的。
环境:ESXi 6.7 + ubuntu 22.04 + 10.0.0.0
完整解决办法:
1 、虚拟交换机开混杂模式
2 、必须修改虚拟机网卡类型,E1000
3 、初始化集群时指定 IP 段,注意 stackoverflow 中的回答(下面命令中 --default-addr-pool 是 swarm 网段,--advertise-addr 是通讯 IP):
docker swarm init --default-addr-pool 11.0.0.0/8 --advertise-addr 10.0.1.137 |
24
isnullstring 129 天前
@isnullstring #23 还有个奇怪现象,只有 1 个管理节点和 1 个工作节点时候就没毛病,但是通过管理节点无法访问工作节点的端口,第二个节点一加进来就凉
|