Scenario:
Deploy a comprehensive monitoring stack in an AWS cloud environment, consisting of EKS (Elastic Kubernetes Service), Prometheus, Grafana, and Alertmanager. The stack provides real-time monitoring and management of application performance and resource usage across the Kubernetes cluster, with persistent storage for all monitoring data.
Solution Overview:
To meet these requirements, we will perform the following steps:
- Deploy AWS EKS
- Install kube-prometheus-stack with Helm
- Persist data on EFS
- Install the AWS Load Balancer Controller (ALB) for traffic forwarding
Detailed Steps:
Prometheus Operator documentation
Prometheus Operator website
https://prometheus-operator.dev/docs/getting-started/introduction/
Prometheus Operator on GitHub
https://github.com/prometheus-operator/prometheus-operator#getting-started
Deploy EKS
Configure the access key and secret key (AK/SK)
aws configure
# Verify
aws sts get-caller-identity
{
"UserId": "AIDA5ABAEOQ7GW6FZOZ3A",
"Account": "893420598334",
"Arn": "arn:aws:iam::893420598334:user/zhangyitian@bosicloud.com"
}
Install kubectl, eksctl, helm, and the AWS CLI
Install kubectl
# Download kubectl for Kubernetes 1.32
curl -O https://s3.us-west-2.amazonaws.com/amazon-eks/1.32.3/2025-04-17/bin/linux/amd64/kubectl
# Move it to an executable directory
chmod +x kubectl && mv kubectl /usr/bin/
# Verify (the connection error is expected at this point: no cluster kubeconfig exists yet)
[root@ip-10-5-15-225 ~]# kubectl version
Client Version: v1.32.3-eks-473151a
Kustomize Version: v5.5.0
The connection to the server localhost:8080 was refused - did you specify the right host or port?
Install eksctl
ARCH=amd64
PLATFORM=$(uname -s)_$ARCH
# Download
curl -sLO "https://github.com/eksctl-io/eksctl/releases/latest/download/eksctl_$PLATFORM.tar.gz"
# Extract
tar -xzf eksctl_$PLATFORM.tar.gz -C /tmp && rm -rf eksctl_$PLATFORM.tar.gz
# Move it to an executable directory
mv /tmp/eksctl /usr/local/bin
# Verify
[root@ip-10-5-15-225 ~]# eksctl version
0.211.0
Install helm
# Download and install
curl -O https://image-auto-scaleing.s3.cn-north-1.amazonaws.com.cn/helm-v3.8.1-linux-amd64.tar.gz && tar -zxvf helm-v3.8.1-linux-amd64.tar.gz && cp linux-amd64/helm /usr/bin/
# Verify
[root@ip-10-5-15-225 ~]# helm version
version.BuildInfo{Version:"v3.8.1", GitCommit:"5cb9af4b1b271d11d7a97a71df3ac337dd94ad37", GitTreeState:"clean", GoVersion:"go1.17.5"}
Update the AWS CLI
bash -c "$(curl -fsSL https://image-auto-scaleing.s3.cn-north-1.amazonaws.com.cn/awscli-update.sh)"
Create the EKS node group IAM role
[root@ip-10-0-174-176 iam]# cat << EOF > trust-policy.json
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "Service": "ec2.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
  ]
}
EOF
[root@ip-10-0-174-176 iam]# aws iam create-role --role-name zyt-eks-nodegroup --assume-role-policy-document file://trust-policy.json
{
    "Role": {
        "Path": "/",
        "RoleName": "zyt-eks-nodegroup",
        "RoleId": "AROA5ABAEOQ7DUBLZPFGO",
        "Arn": "arn:aws:iam::893420598334:role/zyt-eks-nodegroup",
        "CreateDate": "2025-08-07T02:11:55+00:00",
        "AssumeRolePolicyDocument": {
            "Version": "2012-10-17",
            "Statement": [
                {
                    "Effect": "Allow",
                    "Principal": {
                        "Service": "ec2.amazonaws.com"
                    },
                    "Action": "sts:AssumeRole"
                }
            ]
        }
    }
}
# Attach permission policies
aws iam attach-role-policy --role-name zyt-eks-nodegroup --policy-arn arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly
aws iam attach-role-policy --role-name zyt-eks-nodegroup --policy-arn arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy
aws iam attach-role-policy --role-name zyt-eks-nodegroup --policy-arn arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy
aws iam attach-role-policy --role-name zyt-eks-nodegroup --policy-arn arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
aws iam attach-role-policy --role-name zyt-eks-nodegroup --policy-arn arn:aws:iam::aws:policy/AmazonEC2FullAccess
# Get the zyt-eks-nodegroup ARN
aws iam get-role --role-name zyt-eks-nodegroup --query 'Role.Arn' --output text
arn:aws:iam::893420598334:role/zyt-eks-nodegroup
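As an optional sanity check, confirm that all five managed policies are attached to the role:
# List the managed policies attached to the node group role
aws iam list-attached-role-policies --role-name zyt-eks-nodegroup --query 'AttachedPolicies[].PolicyName' --output text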
Create the EKS cluster
Run the command below and wait 10-15 minutes; when it completes, eksctl automatically writes the kubeconfig file onto the server.
cat << EOF > cluster.yaml
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
  name: zyt-eks-250808          # Cluster name
  region: us-east-1             # Cluster region
vpc:
  id: "vpc-071fa2c6b1e205798"   # VPC the cluster is deployed into
  cidr: "10.0.0.0/16"
  subnets:
    private:
      us-east-1a: {id: subnet-0a5dda57a8bf1aeef}   # Subnets the services deploy into (public or private)
      us-east-1b: {id: subnet-043312904e68f23da}
iam:
  withOIDC: true                # Enable the OIDC identity provider
addons:                         # Add-ons to install; the three below are required
  - name: vpc-cni
  - name: coredns
  - name: kube-proxy
managedNodeGroups:              # Managed node groups are operated by the AWS platform; nodeGroups are self-managed
  - name: zyt-eks-monitoring    # Node group name
    amiFamily: AmazonLinux2     # AMI family
    labels: {role: worker}      # Node labels
    instanceType: t3.large      # Instance type
    desiredCapacity: 3          # Desired node count
    minSize: 3                  # Minimum node count
    maxSize: 10                 # Maximum node count
    volumeSize: 50              # Disk size, in GiB
    maxPodsPerNode: 110         # Maximum pods per node
    privateNetworking: true
    subnets:
      - subnet-0a5dda57a8bf1aeef
      - subnet-043312904e68f23da
    ssh:
      allow: true               # Allow SSH access
      publicKeyName: zyt-pem
    iam:
      instanceRoleARN: "arn:aws:iam::893420598334:role/zyt-eks-nodegroup"
    tags:
      Sale: "zhangyitian@bosicloud.com"
      Owner: "zhangyitian@bosicloud.com"
EOF
eksctl create cluster -f cluster.yaml
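Once creation finishes, a minimal sanity check (assuming the cluster name above; eksctl normally writes the kubeconfig itself, so the first command is only needed if it did not):
# Regenerate the kubeconfig if necessary
aws eks update-kubeconfig --name zyt-eks-250808 --region us-east-1
# Expect 3 Ready nodes, with the vpc-cni, coredns, and kube-proxy pods Running
kubectl get nodes -o wide
kubectl get pods -n kube-system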
Configure EFS
Create an EFS file system
Create a security group
Allow inbound traffic from the VPC CIDR range
Edit the EFS network settings
Attach the security group created above
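The console steps above can also be scripted. A hedged CLI sketch (the group name zyt-efs-sg is illustrative, and sg-XXXX is a placeholder for the GroupId returned by the first call):
# Create a security group for EFS in the cluster VPC
aws ec2 create-security-group --group-name zyt-efs-sg --description "EFS NFS access" --vpc-id vpc-071fa2c6b1e205798
# NFS traffic uses TCP 2049; allow it from the VPC CIDR
aws ec2 authorize-security-group-ingress --group-id sg-XXXX --protocol tcp --port 2049 --cidr 10.0.0.0/16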
Mount EFS on the local machine
mkdir /efs
mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport fs-01d3d2773c8b0b57e.efs.us-east-1.amazonaws.com:/ /efs
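Confirm the mount:
# The EFS DNS name should appear as the mounted filesystem
df -h /efs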
Deploy Prometheus Operator
Pull kube-prometheus-stack with Helm
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
wget https://github.com/prometheus-community/helm-charts/releases/download/kube-prometheus-stack-75.10.0/kube-prometheus-stack-75.10.0.tgz
tar xf kube-prometheus-stack-75.10.0.tgz
Deploy the release with Helm and check startup status
kubectl create ns monitoring
[root@ip-10-0-174-176 kube-prometheus-stack]# pwd
/root/eks/monitoring/kube-prometheus-stack
# Install the release
helm install prom-stack . -n monitoring
[root@ip-10-0-174-176 kube-prometheus-stack]# kubectl get pod -n monitoring
NAME READY STATUS RESTARTS AGE
alertmanager-prom-stack-kube-prometheus-alertmanager-0 2/2 Running 0 51s
prom-stack-grafana-549f5d664-h8k47 3/3 Running 0 55s
prom-stack-kube-prometheus-operator-7748c66d94-p4nv8 1/1 Running 0 55s
prom-stack-kube-state-metrics-59b85d4cbd-xvc9v 1/1 Running 0 55s
prom-stack-prometheus-node-exporter-g985q 1/1 Running 0 55s
prom-stack-prometheus-node-exporter-hgdzm 1/1 Running 0 55s
prom-stack-prometheus-node-exporter-rcjv2 1/1 Running 0 55s
prometheus-prom-stack-kube-prometheus-prometheus-0 2/2 Running 0 51s
# Retrieve the default Grafana admin password
kubectl --namespace monitoring get secrets prom-stack-grafana -o jsonpath="{.data.admin-password}" | base64 -d ; echo
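Before the ALB is set up, Grafana can be reached through a quick port-forward (a temporary check, not part of the final setup):
# Forward the Grafana service to localhost:3000, then log in as admin with the password above
kubectl port-forward -n monitoring svc/prom-stack-grafana 3000:80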
Install the EFS CSI driver
Configure the role and policy with eksctl
export cluster_name=zyt-eks-250808
export role_name=AmazonEKS_EFS_CSI_DriverRole_zyt_0808
eksctl create iamserviceaccount \
--name efs-csi-controller-sa \
--namespace kube-system \
--cluster $cluster_name \
--role-name $role_name \
--role-only \
--attach-policy-arn arn:aws:iam::aws:policy/service-role/AmazonEFSCSIDriverPolicy \
--approve
# Broaden the role's trust scope
TRUST_POLICY=$(aws iam get-role --role-name $role_name --query 'Role.AssumeRolePolicyDocument' | \
sed -e 's/efs-csi-controller-sa/efs-csi-*/' -e 's/StringEquals/StringLike/')
# Review the modified trust policy before applying it
echo "$TRUST_POLICY"
# Apply the updated trust policy
aws iam update-assume-role-policy --role-name $role_name --policy-document "$TRUST_POLICY"
Install the EFS CSI add-on
eksctl create addon --name aws-efs-csi-driver --cluster zyt-eks-250808 --service-account-role-arn arn:aws:iam::893420598334:role/AmazonEKS_EFS_CSI_DriverRole_zyt_0808 --force
[root@ip-10-0-174-176 kube-prometheus-stack]# kubectl get pods -n kube-system
NAME                                  READY   STATUS    RESTARTS   AGE
aws-node-6cwwh                        2/2     Running   0          29m
aws-node-sfpsd                        2/2     Running   0          29m
aws-node-thpsg                        2/2     Running   0          29m
coredns-6b9575c64c-4qb5v              1/1     Running   0          33m
coredns-6b9575c64c-66gjd              1/1     Running   0          33m
efs-csi-controller-765df4ffd9-nbb2t   3/3     Running   0          2m51s
efs-csi-controller-765df4ffd9-z9jtf   3/3     Running   0          2m51s
efs-csi-node-477z9                    3/3     Running   0          2m51s
efs-csi-node-6k5tw                    3/3     Running   0          2m51s
efs-csi-node-pzxl5                    3/3     Running   0          2m51s
kube-proxy-28n4p                      1/1     Running   0          29m
kube-proxy-2p75d                      1/1     Running   0          29m
kube-proxy-m4r94                      1/1     Running   0          29m
metrics-server-6d499768b4-prfhc       1/1     Running   0          33m
metrics-server-6d499768b4-r6frm       1/1     Running   0          33m
Create an EFS StorageClass
cat << EOF > efs_sc.yaml
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: efs-sc
provisioner: efs.csi.aws.com
parameters:
  provisioningMode: efs-ap
  fileSystemId: fs-040e588a290e9b177
  directoryPerms: "700"
  gidRangeStart: "1000"
  gidRangeEnd: "2000"
  basePath: "/monitoring"
reclaimPolicy: Retain
volumeBindingMode: Immediate
EOF
kubectl apply -f efs_sc.yaml
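A one-line check that the StorageClass registered:
# PROVISIONER should read efs.csi.aws.com
kubectl get sc efs-sc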
Persist the services' data
Edit the Helm values file
cat << EOF > prometheus-efs-values.yaml
prometheus:
  prometheusSpec:
    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: efs-sc
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 50Gi
alertmanager:
  alertmanagerSpec:
    storage:
      volumeClaimTemplate:
        spec:
          storageClassName: efs-sc
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 10Gi
grafana:
  persistence:
    enabled: true
    storageClassName: efs-sc
    size: 10Gi
  # Key setting: disable the chown initContainer
  initChownData:
    enabled: false    # Do not force ownership changes on the EFS volume
  # Make sure the securityContext matches the EFS access point
  securityContext:
    fsGroup: 1002     # Must equal the GID of the EFS access point
    runAsUser: 1002
EOF
helm upgrade prom-stack prometheus-community/kube-prometheus-stack -n monitoring --version 75.10.0 -f prometheus-efs-values.yaml --reuse-values
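After the upgrade, the new claims should bind against efs-sc (names follow the prom-stack release above):
# All PVCs should report STATUS Bound with STORAGECLASS efs-sc
kubectl get pvc -n monitoring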
Verify persistence
A file created inside the Grafana pod shows up in the EFS-backed PV directory on the host, confirming the data is persisted:
[root@ip-10-0-1-70 pvc-3124b07a-05f2-4a8c-b456-0bbde47da0ee]# ls
csv  grafana.db  pdf  plugins  png
[root@ip-10-0-1-70 pvc-3124b07a-05f2-4a8c-b456-0bbde47da0ee]# kubectl exec -it -n monitoring prom-stack-grafana-967ccfb8c-7z4z9 -- bash
prom-stack-grafana-967ccfb8c-7z4z9:/usr/share/grafana$ cd /var/lib/grafana/
prom-stack-grafana-967ccfb8c-7z4z9:/var/lib/grafana$ ls
csv  grafana.db  pdf  plugins  png
prom-stack-grafana-967ccfb8c-7z4z9:/var/lib/grafana$ touch zyt
prom-stack-grafana-967ccfb8c-7z4z9:/var/lib/grafana$ exit
exit
[root@ip-10-0-1-70 pvc-3124b07a-05f2-4a8c-b456-0bbde47da0ee]# ls
csv  grafana.db  pdf  plugins  png  zyt
Install the AWS Load Balancer Controller (ALB)
Configure the controller IAM policy
curl -o iam_policy.json https://raw.githubusercontent.com/kubernetes-sigs/aws-load-balancer-controller/main/docs/install/iam_policy.json
aws iam create-policy --policy-name AWSLoadBalancerControllerIAMPolicy-zyt_0808 --policy-document file://iam_policy.json
Configure the service account
eksctl create iamserviceaccount --cluster=zyt-eks-250808 --namespace=kube-system --name=aws-load-balancer-controller --attach-policy-arn=arn:aws:iam::893420598334:policy/AWSLoadBalancerControllerIAMPolicy-zyt_0808 --override-existing-serviceaccounts --approve
Install the controller with Helm
helm repo add eks https://aws.github.io/eks-charts
helm repo update
helm install aws-load-balancer-controller eks/aws-load-balancer-controller \
  -n kube-system \
  --set clusterName=zyt-eks-250808 \
  --set serviceAccount.create=false \
  --set serviceAccount.name=aws-load-balancer-controller \
  --set region=us-east-1 \
  --set vpcId=vpc-071fa2c6b1e205798
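Confirm the controller is up before creating any Ingress:
# Expect the deployment to report its replicas as available
kubectl get deployment -n kube-system aws-load-balancer-controller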
Configure ALB forwarding rules
Prometheus
cat << EOF > prometheus_alb.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: prometheus-ingress
  namespace: monitoring
  annotations:
    kubernetes.io/ingress.class: alb
    alb.ingress.kubernetes.io/scheme: internet-facing
    alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80}]'
    alb.ingress.kubernetes.io/target-type: ip
    alb.ingress.kubernetes.io/subnets: "subnet-09ca72eec6dbf351a,subnet-0558f4b0ec4424206" # Use public subnets: the ALB must be reachable from the internet
    alb.ingress.kubernetes.io/healthcheck-path: /-/healthy # Custom health check
spec:
  ingressClassName: alb
  rules:
    - http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: prom-stack-kube-prometheus-prometheus
                port:
                  number: 9090
EOF
kubectl apply -f prometheus_alb.yaml
Alertmanager
cat << EOF > alertmanager_alb.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: alertmanager-ingress
  namespace: monitoring
  annotations:
    kubernetes.io/ingress.class: alb
    alb.ingress.kubernetes.io/scheme: internet-facing
    alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80}]'
    alb.ingress.kubernetes.io/target-type: ip
    alb.ingress.kubernetes.io/subnets: "subnet-09ca72eec6dbf351a,subnet-0558f4b0ec4424206" # Use public subnets: the ALB must be reachable from the internet
    alb.ingress.kubernetes.io/healthcheck-path: /-/healthy # Custom health check
spec:
  ingressClassName: alb
  rules:
    - http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: prom-stack-kube-prometheus-alertmanager
                port:
                  number: 9093
EOF
kubectl apply -f alertmanager_alb.yaml
Grafana
cat << EOF > grafana_alb.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: grafana-ingress
  namespace: monitoring
  annotations:
    kubernetes.io/ingress.class: alb
    alb.ingress.kubernetes.io/scheme: internet-facing
    alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80}]'
    alb.ingress.kubernetes.io/target-type: ip
    alb.ingress.kubernetes.io/subnets: "subnet-09ca72eec6dbf351a,subnet-0558f4b0ec4424206" # Use public subnets: the ALB must be reachable from the internet
    alb.ingress.kubernetes.io/success-codes: 200-302 # Grafana responds with a 302 redirect, so the accepted health-check codes must include it
spec:
  ingressClassName: alb
  rules:
    - http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: prom-stack-grafana
                port:
                  number: 80
EOF
kubectl apply -f grafana_alb.yaml
Check Ingress status
[root@ip-10-0-1-70 alb]# kubectl get ingress -n monitoring
NAME                   CLASS   HOSTS   ADDRESS                                                                    PORTS   AGE
alertmanager-ingress   alb     *       k8s-monitori-alertman-5490852c5c-1026823537.us-east-1.elb.amazonaws.com   80      42m
grafana-ingress        alb     *       k8s-monitori-grafanai-f110992746-1761918337.us-east-1.elb.amazonaws.com   80      10m
prometheus-ingress     alb     *       k8s-monitori-promethe-753a6ad123-489061696.us-east-1.elb.amazonaws.com    80      41m
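As a final smoke test, the ALB DNS names from the output above can be probed directly; Prometheus exposes /-/healthy (the same path the health-check annotation uses), while Grafana answers with a 302 redirect:
# Should print a healthy message from Prometheus
curl -s http://k8s-monitori-promethe-753a6ad123-489061696.us-east-1.elb.amazonaws.com/-/healthy
# Should print 302 (Grafana redirects to /login)
curl -s -o /dev/null -w "%{http_code}\n" http://k8s-monitori-grafanai-f110992746-1761918337.us-east-1.elb.amazonaws.com/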
Summary
At this point, EKS, Prometheus, Grafana, and Alertmanager on AWS form a closed-loop monitoring system: metrics are collected in real time, dashboards render on demand, and alert data is durably stored. Going forward, only light tuning of rules and dashboards as the business evolves is needed to keep guarding the cluster's performance and stability.