一、创建 SNS 主题

1、创建一个名为 slq-test-alarm 的 SNS 主题

2、添加邮件订阅者(如需邮件通知)

二、创建一个lambda函数

1、创建一个lambda执行角色

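策略如下(注意:Resource 中的主题名必须替换为第一步创建的 SNS 主题名,例如 slq-test-alarm,而不是示例中的 CloudWatch-Alerts;此外还需为该角色附加 AWSLambdaBasicExecutionRole 托管策略,否则函数无法将日志写入 CloudWatch Logs):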

{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "sns:Subscribe",
                "sns:GetTopicAttributes",
                "sns:ListSubscriptionsByTopic"
            ],
            "Resource": "arn:aws:sns:*:*:CloudWatch-Alerts"
        },
        {
            "Effect": "Allow",
            "Action": [
                "cloudwatch:DescribeAlarms",
                "cloudwatch:GetMetricData"
            ],
            "Resource": "*"
        }
    ]
}

2、创建一个lambda函数

3、部署lambda

a、函数代码如下:
import json
import logging
import os
import urllib.request
import datetime
import uuid
import re

# Configure logging: use the root logger and emit records at INFO level and above.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Read the Feishu webhook URL from the environment at import time.
# If FEISHU_WEBHOOK_URL is not set, this lookup raises KeyError while the
# module is being loaded, i.e. before any handler invocation runs.
FEISHU_WEBHOOK_URL = os.environ['FEISHU_WEBHOOK_URL']

def lambda_handler(event, context):
    """Forward a CloudWatch alarm notification received via SNS to Feishu.

    The alarm JSON is read from the ``Sns.Message`` field of the first record
    in ``event``, rendered as a Feishu "post" message and POSTed to
    ``FEISHU_WEBHOOK_URL``.

    Args:
        event: Lambda event; must contain ``Records[0]['Sns']['Message']``
            holding a JSON-encoded alarm object.
        context: Lambda context object (unused).

    Returns:
        dict: ``{'statusCode': 200, 'body': ...}`` when Feishu accepted the
        message, or ``{'statusCode': 500, 'body': ...}`` describing the
        failure. Errors raised while parsing, building or sending the message
        are logged with a traceback and reported through the return value
        rather than re-raised.
    """
    logger.info("收到的事件: %s", json.dumps(event))

    try:
        # Only the first record is read; any further records are ignored.
        message = json.loads(event['Records'][0]['Sns']['Message'])

        # Basic alarm attributes ('N/A' when absent or JSON null).
        alarm_name = _field_or_na(message, 'AlarmName')
        alarm_description = _field_or_na(message, 'AlarmDescription')
        new_state = _field_or_na(message, 'NewStateValue')
        reason = _field_or_na(message, 'NewStateReason')
        timestamp = _field_or_na(message, 'StateChangeTime')
        region = _field_or_na(message, 'Region')
        account_id = _field_or_na(message, 'AWSAccountId')

        service = _service_from_message(message)

        is_alarm = new_state == "ALARM"
        event_type = (
            "AWS_SERVICE_OPERATIONAL_ISSUE" if is_alarm
            else "AWS_SERVICE_OPERATIONAL_NORMAL"
        )

        start_time = format_time(timestamp)
        if new_state == 'OK':
            # The alarm has recovered: use "now" (timezone-aware UTC) as the end.
            end_time = format_time(
                datetime.datetime.now(datetime.timezone.utc).isoformat()
            )
        else:
            end_time = '持续中'

        # One (label, value) pair per paragraph of the Feishu post, in
        # display order.
        rows = [
            ("关联账号", account_id),
            ("服务", service),
            ("区域", region),
            ("事件代码", event_type),
            ("事件类型", 'issue' if is_alarm else 'normal'),
            ("受影响资源", extract_resources(message)),
            ("开始时间", start_time),
            ("结束时间", end_time),
            ("事件说明", reason),
            ("当前状态", get_status_description(new_state)),
            ("详细信息", alarm_description),
        ]

        feishu_message = {
            "msg_type": "post",
            "content": {
                "post": {
                    "zh_cn": {
                        "title": f"AWS告警: {alarm_name}",
                        "content": [
                            [{"tag": "text", "text": f"{label}: {value}\n"}]
                            for label, value in rows
                        ],
                    }
                }
            }
        }

        # Send to Feishu. Supplying ``data`` makes urllib issue a POST.
        req = urllib.request.Request(
            url=FEISHU_WEBHOOK_URL,
            data=json.dumps(feishu_message).encode('utf-8'),
            headers={'Content-Type': 'application/json'}
        )

        # Bound the request so a stalled webhook cannot consume the whole
        # Lambda timeout, and close the connection deterministically.
        with urllib.request.urlopen(req, timeout=10) as response:
            response_body = response.read().decode('utf-8')
        logger.info(f"飞书响应: {response_body}")

        # NOTE(review): Feishu is expected to answer HTTP 200 with a JSON body
        # whose "code" (legacy: "StatusCode") is non-zero when it rejects the
        # message -- confirm against the custom-bot documentation.
        try:
            result = json.loads(response_body)
        except ValueError:
            result = None
        if isinstance(result, dict):
            code = result.get('code', result.get('StatusCode', 0))
            if code not in (0, None):
                raise RuntimeError(f"飞书返回错误: {response_body}")

        return {
            'statusCode': 200,
            'body': json.dumps('消息已成功发送到飞书!')
        }

    except Exception as e:
        # Top-level boundary: log with traceback and report the failure.
        logger.exception("处理失败: %s", e)
        return {
            'statusCode': 500,
            'body': json.dumps(f'处理失败: {str(e)}')
        }


def _field_or_na(message, key):
    """Return ``message[key]``, or 'N/A' when the key is absent or null."""
    value = message.get(key)
    return 'N/A' if value is None else value


def _service_from_message(message):
    """Derive an upper-case service name from the alarm's metric namespace.

    Reads ``Trigger.Namespace`` (for example ``AWS/EC2`` -> ``EC2``). The
    alarm ARN is not used: its last segment is the alarm name, not a service.
    Returns ``UNKNOWN_SERVICE`` when no namespace string is present.
    """
    trigger = message.get('Trigger')
    namespace = trigger.get('Namespace') if isinstance(trigger, dict) else None
    if not isinstance(namespace, str) or not namespace.strip():
        return "UNKNOWN_SERVICE"
    namespace = namespace.strip()
    if namespace.startswith('AWS/'):
        namespace = namespace[len('AWS/'):]
    return namespace.upper() or "UNKNOWN_SERVICE"

def format_time(iso_time):
    """Format an ISO-8601 timestamp as ``Day, DD Mon YYYY HH:MM:SS GMT``.

    Accepts a trailing ``Z`` as well as numeric UTC offsets written with or
    without a colon (``+00:00`` / ``+0000``). Timezone-aware values are
    converted to UTC before formatting so the ``GMT`` suffix is accurate;
    naive values are formatted as given.

    Args:
        iso_time: Timestamp string to format.

    Returns:
        The formatted string, or ``iso_time`` unchanged when it is not a
        string or cannot be parsed.
    """
    if not isinstance(iso_time, str):
        return iso_time

    text = iso_time.strip()
    if text.endswith(('Z', 'z')):
        text = text[:-1] + '+00:00'

    try:
        dt = datetime.datetime.fromisoformat(text)
    except ValueError:
        # Older interpreters reject offsets without a colon ("+0000");
        # strptime's %z accepts both spellings.
        dt = None
        for fmt in ("%Y-%m-%dT%H:%M:%S.%f%z", "%Y-%m-%dT%H:%M:%S%z"):
            try:
                dt = datetime.datetime.strptime(text, fmt)
                break
            except ValueError:
                continue
        if dt is None:
            return iso_time

    if dt.tzinfo is not None:
        dt = dt.astimezone(datetime.timezone.utc)
    return dt.strftime("%a, %d %b %Y %H:%M:%S GMT")

def extract_resources(message):
    """Extract the affected resources from an alarm message.

    Looks for a dimension list first under ``message['Trigger']['Dimensions']``
    and then under ``message['Dimensions']``. Each dimension mapping is
    rendered as ``"<name>: <value>"``; entries that are not mappings are
    skipped.

    Args:
        message: Parsed alarm message (a dict).

    Returns:
        A list of ``"name: value"`` strings from the first location that
        yields at least one entry, or the string ``"[]"`` when nothing is
        found or ``message`` is not a dict. The mixed return type is kept for
        compatibility with existing callers.
    """
    if not isinstance(message, dict):
        return "[]"

    # A JSON null "Trigger" must not prevent the top-level fallback.
    trigger = message.get('Trigger')
    candidates = (
        trigger.get('Dimensions') if isinstance(trigger, dict) else None,
        message.get('Dimensions'),
    )

    for dimensions in candidates:
        if not isinstance(dimensions, (list, tuple)):
            continue
        resources = [
            f"{dim.get('name', '')}: {dim.get('value', '')}"
            for dim in dimensions
            if isinstance(dim, dict)
        ]
        if resources:
            return resources

    return "[]"

def get_status_description(state):
    """Map an alarm state to its human-readable severity line.

    ``"ALARM"`` and ``"OK"`` have dedicated descriptions; every other value
    falls back to the "Insufficient data" description.
    """
    known_descriptions = (
        ("ALARM", "Current severity level: Critical - Service impacted"),
        ("OK", "Current severity level: Operating normally"),
    )
    for known_state, description in known_descriptions:
        if state == known_state:
            return description
    return "Current severity level: Insufficient data"

b、把以上代码进行部署

c、修改常规配置

超时:30s 内存:128M

d、修改环境变量
key=FEISHU_WEBHOOK_URL
value=飞书自定义机器人的 Webhook URL

4、添加触发器(触发器类型选择 SNS,主题选择第一步创建的 SNS 主题)

三、创建告警

1、控制台

2、通过aws cli设置告警

aws cloudwatch put-metric-alarm \
  --alarm-name <告警名称> \
  --alarm-description <告警描述> \
  --metric-name <指标名称> \
  --namespace <命名空间> \
  --statistic <统计方法> \
  --period <周期秒数> \
  --threshold <阈值> \
  --comparison-operator <比较运算符> \
  --dimensions <维度> \
  --evaluation-periods <评估周期数> \
  --alarm-actions <告警动作ARN> \
  --ok-actions <恢复动作ARN>

示例:监控ec2实例的cpu利用率

四、效果展示

1、邮件

2、飞书群

自我判定

| # | 判定描述 | 自我判定(是/否) |
| --- | --- | --- |
| 1 | 在各搜索引擎中是否能找到知识信息(包括但不限于 Google、百度、Bing) | |
| 2 | 是否需要代码集成开发 | |