场景描述:

AWS云环境中部署一套全面的监控体系,包括EKS(Elastic Kubernetes Service)、Prometheus、Grafana和Alertmanager。该监控体系旨在实现对Kubernetes集群中应用性能和资源使用情况的实时监控和管理,并确保数据的持久化存储。

方案概述:

为满足用户需求,我们将执行以下步骤:

  1. 部署aws eks
  2. 使用helm安装kube-prometheus-stack
  3. 使用efs对数据进行持久化
  4. 安装alb进行转发

详细步骤:

Prometheus Operator 相关文档

Prometheus-operator 官网

 https://prometheus-operator.dev/docs/getting-started/introduction/

Prometheus-operator Git

https://github.com/prometheus-operator/prometheus-operator#getting-started

部署eks

配置 AK/SK(Access Key / Secret Key)

aws configure

# 检查
aws sts get-caller-identity
{
    "UserId": "AIDA5ABAEOQ7GW6FZOZ3A",
    "Account": "893420598334",
    "Arn": "arn:aws:iam::893420598334:user/zhangyitian@bosicloud.com"
}

安装kubectl、eksctl、helm、aws

安装kubectl
# Download the kubectl 1.32.3 binary published for EKS (Linux amd64)
curl -O https://s3.us-west-2.amazonaws.com/amazon-eks/1.32.3/2025-04-17/bin/linux/amd64/kubectl

# Make it executable and move it onto PATH
chmod +x kubectl && mv kubectl /usr/bin/

# 验证
[root@ip-10-5-15-225 ~]# kubectl version
Client Version: v1.32.3-eks-473151a
Kustomize Version: v5.5.0
The connection to the server localhost:8080 was refused - did you specify the right host or port?
安装eksctl
# Install eksctl: download the latest release for this platform, extract it,
# and place the binary on PATH.
ARCH=amd64
PLATFORM="$(uname -s)_${ARCH}"

# Download the release tarball (URL quoted; expansions quoted defensively)
curl -sLO "https://github.com/eksctl-io/eksctl/releases/latest/download/eksctl_${PLATFORM}.tar.gz"

# Extract to /tmp and delete the tarball (-f is sufficient for a regular
# file; the original -rf recursion flag was unnecessary)
tar -xzf "eksctl_${PLATFORM}.tar.gz" -C /tmp && rm -f "eksctl_${PLATFORM}.tar.gz"

# Move the binary onto PATH
mv /tmp/eksctl /usr/local/bin

# 验证
[root@ip-10-5-15-225 ~]# eksctl version
0.211.0
安装helm
# 下载安装
curl -O https://image-auto-scaleing.s3.cn-north-1.amazonaws.com.cn/helm-v3.8.1-linux-amd64.tar.gz && tar -zxvf helm-v3.8.1-linux-amd64.tar.gz && cp linux-amd64/helm /usr/bin/

# 验证
[root@ip-10-5-15-225 ~]# helm version
version.BuildInfo{Version:"v3.8.1", GitCommit:"5cb9af4b1b271d11d7a97a71df3ac337dd94ad37", GitTreeState:"clean", GoVersion:"go1.17.5"}
更新aws命令
bash -c "$(curl -fsSL https://image-auto-scaleing.s3.cn-north-1.amazonaws.com.cn/awscli-update.sh)"

创建EKS NodeGroup IAM

[root@ip-10-0-174-176 iam]# cat << EOF > trust-policy.json 
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "Service": "ec2.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
  ]
}
EOF
[root@ip-10-0-174-176 iam]# aws iam create-role --role-name zyt-eks-nodegroup --assume-role-policy-document file://trust-policy.json 
{
    "Role": {
        "Path": "/",
        "RoleName": "zyt-eks-nodegroup",
        "RoleId": "AROA5ABAEOQ7DUBLZPFGO",
        "Arn": "arn:aws:iam::893420598334:role/zyt-eks-nodegroup",
        "CreateDate": "2025-08-07T02:11:55+00:00",
        "AssumeRolePolicyDocument": {
            "Version": "2012-10-17",
            "Statement": [
                {
                    "Effect": "Allow",
                    "Principal": {
                        "Service": "ec2.amazonaws.com"
                    },
                    "Action": "sts:AssumeRole"
                }
            ]
        }
    }
}

# Attach the managed policies the EKS worker-node role needs.
# NOTE(review): AmazonEC2FullAccess is far broader than worker nodes normally
# require (the first three policies are the standard EKS node set) — confirm
# it is intentional before using this in production.
aws iam attach-role-policy --role-name zyt-eks-nodegroup --policy-arn arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly
aws iam attach-role-policy --role-name zyt-eks-nodegroup --policy-arn arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy
aws iam attach-role-policy --role-name zyt-eks-nodegroup --policy-arn arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy
aws iam attach-role-policy --role-name zyt-eks-nodegroup --policy-arn arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
aws iam attach-role-policy --role-name zyt-eks-nodegroup --policy-arn arn:aws:iam::aws:policy/AmazonEC2FullAccess

# Print the role ARN (referenced later by the node-group config)
aws iam get-role --role-name zyt-eks-nodegroup --query 'Role.Arn' --output text
arn:aws:iam::893420598334:role/zyt-eks-nodegroup

安装EKS

执行命令后等待10-15分钟,完成后eksctl会自动将kubeconfig文件添加至服务器

# Write the eksctl cluster spec, then create the cluster (takes ~10-15 min;
# eksctl writes the kubeconfig to the host automatically on completion).
cat << EOF > cluster.yaml
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
  name: zyt-eks-250808   # cluster name
  region: us-east-1   # cluster region
vpc:
  id: "vpc-071fa2c6b1e205798"    # VPC the cluster is deployed into
  cidr: "10.0.0.0/16"
  subnets:
    private:
      us-east-1a: {id: subnet-0a5dda57a8bf1aeef}  # subnets where workloads run (public or private)
      us-east-1b: {id: subnet-043312904e68f23da}

iam:
  withOIDC: true   # enable the OIDC provider (required later for IRSA)

addons:   # add-ons to install; these three are mandatory
 - name: vpc-cni
 - name: coredns
 - name: kube-proxy

managedNodeGroups:   # managed node groups: AWS manages node lifecycle (nodeGroups = self-managed)
  - name: zyt-eks-monitoring   # node group name
    amiFamily: AmazonLinux2   # AMI family
    labels: {role: worker}	  # node labels
    instanceType: t3.large   # instance type
    desiredCapacity: 3   # desired node count
    minSize: 3   # minimum node count
    maxSize: 10   # maximum node count
    volumeSize: 50   # root volume size (GiB by default)
    maxPodsPerNode: 110   # maximum pods schedulable per node
    privateNetworking: true
    subnets:
      - subnet-0a5dda57a8bf1aeef
      - subnet-043312904e68f23da
    ssh:
      allow: true   # allow SSH access to the nodes
      publicKeyName: zyt-pem
    iam: 
      instanceRoleARN: "arn:aws:iam::893420598334:role/zyt-eks-nodegroup"
    tags:
      Sale: "zhangyitian@bosicloud.com"
      Owner: "zhangyitian@bosicloud.com"
EOF

eksctl create cluster -f cluster.yaml

配置EFS

创建efs

创建安全组

允许IP选择VPC网段

编辑EFS网络

选择刚才创建的安全组

将EFS挂载到本地
mkdir /efs
mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport fs-01d3d2773c8b0b57e.efs.us-east-1.amazonaws.com:/ /efs

部署Prometheus-Operator

helm拉取kube-prometheus-stack

helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
wget https://github.com/prometheus-community/helm-charts/releases/download/kube-prometheus-stack-75.10.0/kube-prometheus-stack-75.10.0.tgz
tar xf kube-prometheus-stack-75.10.0.tgz

helm部署服务查看启动状态

kubectl create ns monitoring
[root@ip-10-0-174-176 kube-prometheus-stack]# pwd
/root/eks/monitoring/kube-prometheus-stack

# 启动服务
helm install prom-stack . -n monitoring
[root@ip-10-0-174-176 kube-prometheus-stack]# kubectl get pod -n monitoring
NAME                                                     READY   STATUS    RESTARTS   AGE
alertmanager-prom-stack-kube-prometheus-alertmanager-0   2/2     Running   0          51s
prom-stack-grafana-549f5d664-h8k47                       3/3     Running   0          55s
prom-stack-kube-prometheus-operator-7748c66d94-p4nv8     1/1     Running   0          55s
prom-stack-kube-state-metrics-59b85d4cbd-xvc9v           1/1     Running   0          55s
prom-stack-prometheus-node-exporter-g985q                1/1     Running   0          55s
prom-stack-prometheus-node-exporter-hgdzm                1/1     Running   0          55s
prom-stack-prometheus-node-exporter-rcjv2                1/1     Running   0          55s
prometheus-prom-stack-kube-prometheus-prometheus-0       2/2     Running   0          51s

# 查看grafana默认密码
kubectl --namespace monitoring get secrets prom-stack-grafana -o jsonpath="{.data.admin-password}" | base64 -d ; echo

安装EFS插件

eksctl配置role与policy

# Create the IAM role for the EFS CSI controller service account (IRSA),
# then widen its trust policy so any efs-csi-* service account can assume it.
export cluster_name=zyt-eks-250808
export role_name=AmazonEKS_EFS_CSI_DriverRole_zyt_0808

eksctl create iamserviceaccount \
    --name efs-csi-controller-sa \
    --namespace kube-system \
    --cluster "$cluster_name" \
    --role-name "$role_name" \
    --role-only \
    --attach-policy-arn arn:aws:iam::aws:policy/service-role/AmazonEFSCSIDriverPolicy \
    --approve

# Broaden the trust policy: match any efs-csi-* SA and switch the condition
# operator to StringLike so the wildcard is honored.
# NOTE(review): editing JSON with sed is fragile — it works only because each
# literal appears exactly once in the document.
TRUST_POLICY=$(aws iam get-role --role-name "$role_name" --query 'Role.AssumeRolePolicyDocument' | \
    sed -e 's/efs-csi-controller-sa/efs-csi-*/' -e 's/StringEquals/StringLike/')

# Inspect the current trust policy (original hard-coded a misspelled role
# name "AmaZonEks_EFS_CSI_DriverRole_zyt_0808"; reuse $role_name instead so
# all commands stay consistent)
aws iam get-role --role-name "$role_name" --query 'Role.AssumeRolePolicyDocument'

# Apply the updated trust policy
aws iam update-assume-role-policy --role-name "$role_name" --policy-document "$TRUST_POLICY"

安装EFS插件

eksctl create addon --name aws-efs-csi-driver --cluster zyt-eks-250808 --service-account-role-arn arn:aws:iam::893420598334:role/AmazonEKS_EFS_CSI_DriverRole_zyt_0808 --force

[root@ip-10-0-174-176 kube-prometheus-stack]# kubectl get pods -n kube-system
NAME                                  READY   STATUS    RESTARTS   AGE
aws-node-6cwwh                        2/2     Running   0          29m
aws-node-sfpsd                        2/2     Running   0          29m
aws-node-thpsg                        2/2     Running   0          29m
coredns-6b9575c64c-4qb5v              1/1     Running   0          33m
coredns-6b9575c64c-66gjd              1/1     Running   0          33m
efs-csi-controller-765df4ffd9-nbb2t   3/3     Running   0          2m51s
efs-csi-controller-765df4ffd9-z9jtf   3/3     Running   0          2m51s
efs-csi-node-477z9                    3/3     Running   0          2m51s
efs-csi-node-6k5tw                    3/3     Running   0          2m51s
efs-csi-node-pzxl5                    3/3     Running   0          2m51s
kube-proxy-28n4p                      1/1     Running   0          29m
kube-proxy-2p75d                      1/1     Running   0          29m
kube-proxy-m4r94                      1/1     Running   0          29m
metrics-server-6d499768b4-prfhc       1/1     Running   0          33m
metrics-server-6d499768b4-r6frm       1/1     Running   0          33m

创建EFS存储类

# Create a StorageClass that provisions EFS access points dynamically.
cat << EOF > efs_sc.yaml
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: efs-sc
provisioner: efs.csi.aws.com
parameters:
  provisioningMode: efs-ap   # one EFS access point per PV
  fileSystemId: fs-040e588a290e9b177  # NOTE(review): differs from the fs-01d3d2773c8b0b57e mounted earlier — confirm which EFS is intended
  directoryPerms: "700"
  gidRangeStart: "1000"   # POSIX GID range assigned to access points
  gidRangeEnd: "2000"
  basePath: "/monitoring"   # all provisioned volumes live under this directory
reclaimPolicy: Retain   # keep data on EFS even if the PVC is deleted
volumeBindingMode: Immediate
EOF

kubectl apply -f efs_sc.yaml

对服务进行持久化存储

编辑helm文件

# Persist Prometheus / Alertmanager / Grafana data on the EFS StorageClass
# via a Helm values override.
# NOTE(review): Prometheus upstream advises against running its TSDB on
# NFS-style filesystems (EFS included) — confirm this trade-off is accepted.
cat << EOF > prometheus-efs-values.yaml
prometheus:
  prometheusSpec:
    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: efs-sc
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 50Gi

alertmanager:
  alertmanagerSpec:
    storage:
      volumeClaimTemplate:
        spec:
          storageClassName: efs-sc
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 10Gi
grafana:
  persistence:
    enabled: true
    storageClassName: efs-sc
    size: 10Gi
  # Key setting: disable the chown initContainer
  initChownData:
    enabled: false  # do not force-change ownership (EFS access points manage it)
  # securityContext must match the EFS access point identity
  securityContext:
    fsGroup: 1002  # must equal the GID of the EFS access point
    runAsUser: 1002
EOF

helm upgrade prom-stack prometheus-community/kube-prometheus-stack   -n monitoring   --version 75.10.0   -f prometheus-efs-values.yaml   --reuse-values

验证持久化状态

[root@ip-10-0-1-70 pvc-3124b07a-05f2-4a8c-b456-0bbde47da0ee]# ls
csv  grafana.db  pdf  plugins  png
[root@ip-10-0-1-70 pvc-3124b07a-05f2-4a8c-b456-0bbde47da0ee]# kubectl exec -it -n monitoring prom-stack-grafana-967ccfb8c-7z4z9 -- bash
prom-stack-grafana-967ccfb8c-7z4z9:/usr/share/grafana$ cd /var/lib/grafana/
prom-stack-grafana-967ccfb8c-7z4z9:/var/lib/grafana$ ls
csv         grafana.db  pdf         plugins     png
prom-stack-grafana-967ccfb8c-7z4z9:/var/lib/grafana$ touch zyt
prom-stack-grafana-967ccfb8c-7z4z9:/var/lib/grafana$ exit
exit
[root@ip-10-0-1-70 pvc-3124b07a-05f2-4a8c-b456-0bbde47da0ee]# ls
csv  grafana.db  pdf  plugins  png  zyt

安装ALB

配置ALB策略

curl -o iam_policy.json https://raw.githubusercontent.com/kubernetes-sigs/aws-load-balancer-controller/main/docs/install/iam_policy.json
aws iam create-policy     --policy-name AWSLoadBalancerControllerIAMPolicy-zyt_0808     --policy-document file://iam_policy.json

配置serviceaccount

eksctl create iamserviceaccount --cluster=zyt-eks-250808 --namespace=kube-system --name=aws-load-balancer-controller --attach-policy-arn=arn:aws:iam::893420598334:policy/AWSLoadBalancerControllerIAMPolicy-zyt_0808 --override-existing-serviceaccounts --approve

安装ALB

helm repo add eks https://aws.github.io/eks-charts
helm repo update
helm install aws-load-balancer-controller eks/aws-load-balancer-controller \
  -n kube-system \
  --set clusterName=zyt-eks-250808 \
  --set serviceAccount.create=false \
  --set serviceAccount.name=aws-load-balancer-controller \
  --set region=us-east-1 \
  --set vpcId=vpc-071fa2c6b1e205798

配置ALB转发规则

Prometheus

# Expose Prometheus through an internet-facing ALB.
# The deprecated kubernetes.io/ingress.class annotation was removed; the
# controller is selected by spec.ingressClassName below.
cat << EOF > prometheus_alb.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: prometheus-ingress
  namespace: monitoring
  annotations:
    alb.ingress.kubernetes.io/scheme: internet-facing
    alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80}]'
    alb.ingress.kubernetes.io/target-type: ip
    alb.ingress.kubernetes.io/subnets: "subnet-09ca72eec6dbf351a,subnet-0558f4b0ec4424206" # public subnets: the ALB must be reachable from the internet
    alb.ingress.kubernetes.io/healthcheck-path: /-/healthy  # Prometheus health endpoint
spec:
  ingressClassName: alb
  rules:
  - http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: prom-stack-kube-prometheus-prometheus
            port:
              number: 9090
EOF
kubectl apply -f prometheus_alb.yaml

Alertmanager

# Expose Alertmanager through an internet-facing ALB.
# The deprecated kubernetes.io/ingress.class annotation was removed; the
# controller is selected by spec.ingressClassName below.
cat << EOF > altermanager_alb.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: alertmanager-ingress
  namespace: monitoring
  annotations:
    alb.ingress.kubernetes.io/scheme: internet-facing
    alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80}]'
    alb.ingress.kubernetes.io/target-type: ip
    alb.ingress.kubernetes.io/subnets: "subnet-09ca72eec6dbf351a,subnet-0558f4b0ec4424206" # public subnets: the ALB must be reachable from the internet
    alb.ingress.kubernetes.io/healthcheck-path: /-/healthy  # Alertmanager health endpoint
spec:
  ingressClassName: alb
  rules:
  - http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: prom-stack-kube-prometheus-alertmanager
            port:
              number: 9093
EOF

kubectl apply -f altermanager_alb.yaml

Grafana

# Expose Grafana through an internet-facing ALB.
# The deprecated kubernetes.io/ingress.class annotation was removed; the
# controller is selected by spec.ingressClassName below.
cat << EOF > grafana_alb.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: grafana-ingress
  namespace: monitoring
  annotations:
    alb.ingress.kubernetes.io/scheme: internet-facing
    alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80}]'
    alb.ingress.kubernetes.io/target-type: ip
    alb.ingress.kubernetes.io/subnets: "subnet-09ca72eec6dbf351a,subnet-0558f4b0ec4424206" # public subnets: the ALB must be reachable from the internet
    alb.ingress.kubernetes.io/success-codes: 200-302 # Grafana answers / with a 302 redirect, so widen the healthy range
spec:
  ingressClassName: alb
  rules:
  - http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: prom-stack-grafana  
            port:
              number: 80
EOF

kubectl apply -f grafana_alb.yaml

查看ingress状态

[root@ip-10-0-1-70 alb]# kubectl get ingress -n monitoring
NAME                   CLASS   HOSTS   ADDRESS                                                                   PORTS   AGE
alertmanager-ingress   alb     *       k8s-monitori-alertman-5490852c5c-1026823537.us-east-1.elb.amazonaws.com   80      42m
grafana-ingress        alb     *       k8s-monitori-grafanai-f110992746-1761918337.us-east-1.elb.amazonaws.com   80      10m
prometheus-ingress     alb     *       k8s-monitori-promethe-753a6ad123-489061696.us-east-1.elb.amazonaws.com    80      41m

总结

至此,AWS 云上的 EKS、Prometheus、Grafana 与 Alertmanager 已构成闭环监控体系:指标实时采集、图表即点即现、告警持久化留存。后续只需随业务演进微调规则与面板,即可持续守护集群性能与稳定性。