#!/bin/bash
# opc-memory-health-report.sh — 由「小维(运维监控)」每天跑记忆系统端到端自检并发报告给 Bo
# 调度：/etc/cron.d/opc-memory-health
# Abort on unset variables and make a pipeline fail when any stage fails.
# `-e` is not enabled, so a failing probe does not stop the script.
set -uo pipefail
export PATH="/root/.local/share/pnpm:/root/.nvm/versions/node/v22.22.1/bin:/usr/local/bin:/usr/bin:/bin:$PATH"

# Telegram destination.
# NOTE(security): the bot token is hard-coded in this file. Prefer supplying
# OPC_MEMORY_BOT_TOKEN / OPC_MEMORY_CHAT_ID from the environment (for example
# from the cron.d entry or a root-only env file) and rotating the token below;
# the literals are kept only as fallbacks so existing deployments keep working.
BOT_TOKEN="${OPC_MEMORY_BOT_TOKEN:-8668963466:AAHHCVjtfKcRiuRQm9nv0tBsXEHpsoDMrNI}"
CHAT_ID="${OPC_MEMORY_CHAT_ID:-8694870488}"

# Local service endpoints probed by the health checks.
MEMAPI="http://127.0.0.1:17760"
QDRANT="http://127.0.0.1:6333"

# End-to-end self-test script and the agent id it is run against
# (first CLI argument, defaulting to xiaozhen).
E2E=/root/bin/opc-memory-e2e.sh
TEST_AGENT="${1:-xiaozhen}"
# Print the current time in the Asia/Shanghai zone as "YYYY-MM-DD HH:MM:SS".
TS() {
  TZ="Asia/Shanghai" date '+%F %T'
}

#######################################
# Send one message to the configured Telegram chat through the Bot API.
# Globals:   BOT_TOKEN, CHAT_ID (read)
# Arguments: $1 - message text; sent with parse_mode=HTML
# Outputs:   a diagnostic on stderr when the message could not be delivered
# Returns:   0 when the API answered with "ok": true, 1 otherwise
#######################################
send_telegram() {
  local msg="$1"
  local payload resp rc

  # Build the whole JSON body in Python so every field is escaped correctly
  # and the text is passed through exactly (no trailing newline added).
  payload=$(python3 -c '
import json, sys
print(json.dumps({"chat_id": sys.argv[1], "text": sys.argv[2], "parse_mode": "HTML"}))
' "$CHAT_ID" "$msg") || {
    echo "send_telegram: could not build the JSON payload" >&2
    return 1
  }

  # -m 30 bounds the request so an unreachable API cannot hang the cron run.
  # curl's own stderr is dropped on purpose: the request URL contains the token.
  resp=$(curl -s -m 30 -X POST "https://api.telegram.org/bot${BOT_TOKEN}/sendMessage" \
    -H "Content-Type: application/json" \
    -d "$payload" 2>/dev/null)
  rc=$?
  if [ "$rc" -ne 0 ]; then
    echo "send_telegram: curl failed with exit code ${rc}" >&2
    return 1
  fi

  # A transport-level success can still be an API-level rejection, so inspect
  # the "ok" flag of the JSON reply.
  if ! printf '%s' "$resp" \
    | python3 -c 'import json,sys; sys.exit(0 if json.load(sys.stdin).get("ok") is True else 1)' \
      2>/dev/null; then
    echo "send_telegram: Telegram API rejected the message: ${resp}" >&2
    return 1
  fi
  return 0
}

# ---- 1) Infrastructure health ----
# systemd state of the memory service, as printed by `systemctl is-active`.
SVC="$(systemctl is-active opc-memory 2>/dev/null)"

# Non-empty only when the health endpoint reports "status": "ok".
HEALTH="$(
  curl -s -m 8 "${MEMAPI}/health" 2>/dev/null \
    | grep -o '"status": *"ok"' \
    | head -n 1
)"

# ps column 6 divided by 1024, for the first api_server.py process found.
MEM_RSS="$(
  ps aux \
    | awk '/api_server.py/ && !/awk/ { print int($6 / 1024) "MB" }' \
    | head -n 1
)"

# points_count of the opc_memories collection as reported by Qdrant.
points_field="$(
  curl -s -m 10 "${QDRANT}/collections/opc_memories" 2>/dev/null \
    | grep -oE '"points_count":[0-9]+' \
    | head -n 1
)"
MEM_CNT="$(printf '%s\n' "$points_field" | cut -d: -f2)"

# Column 7 of the second line of `free -m`.
SYS_AVAIL="$(free -m | sed -n '2p' | awk '{ print $7 "MB" }')"

# ---- 2) End-to-end memory self-test (write -> read back in a new session -> check the store) ----

# Print one field of the JSON object held in $E2E_SUMMARY.
#   $1 - key to look up
#   $2 - value printed when the key is absent
# Returns python3's exit status (non-zero when the summary is not valid JSON).
e2e_field() {
  python3 -c 'import sys,json;print(json.load(sys.stdin).get(sys.argv[1],sys.argv[2]))' \
    "$1" "$2" <<<"$E2E_SUMMARY" 2>/dev/null
}

E2E_OUT="$(timeout 560 "$E2E" "$TEST_AGENT" 2>&1)"

# The self-test reports its verdict on a line of the form SUMMARY=<json>.
E2E_SUMMARY="$(grep '^SUMMARY=' <<<"$E2E_OUT" | cut -d= -f2-)"
if [ -z "$E2E_SUMMARY" ]; then
  E2E_SUMMARY='{"result":"FAIL","note":"e2e无输出/超时"}'
fi

# Overall verdict falls back to FAIL when the summary cannot be parsed.
RESULT="$(e2e_field result FAIL || echo FAIL)"
W="$(e2e_field write '?')"
R="$(e2e_field read '?')"
D="$(e2e_field db '?')"

# ---- 2b) Fixed retrieval cases (Xiaojin / Huihui + idle-spin guard) ----
CASES_OUT="$(timeout 120 /root/bin/opc-memory-cases.sh 2>&1)"

# Verdict comes from a CASES_RESULT=<value> line; no such line counts as FAIL.
CASES_RESULT="$(grep '^CASES_RESULT=' <<<"$CASES_OUT" | cut -d= -f2)"
: "${CASES_RESULT:=FAIL}"

# Per-case lines are the ones carrying a pass/fail mark; drop their two-space indent.
CASES_DETAIL="$(grep -e '✅' -e '❌' <<<"$CASES_OUT" | sed 's/^  //')"

# ---- 3) Assemble the report ----

# Map a check status to its report mark: "PASS" -> ✅, anything else -> ❌.
icon() {
  if [ "$1" = PASS ]; then
    echo "✅"
  else
    echo "❌"
  fi
}

# Escape &, < and > in $1 so arbitrary text can be embedded in a message
# that is sent with parse_mode=HTML.
html_escape() {
  printf '%s' "$1" | sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g'
}

#######################################
# Build the report text.
# Globals read:    RESULT CASES_RESULT W R D TEST_AGENT CASES_DETAIL SVC
#                  HEALTH MEM_CNT MEM_RSS SYS_AVAIL E2E_SUMMARY
# Globals written: HEAD (headline), REPORT (full HTML message)
#######################################
build_report() {
  local agent_label health_label

  if [ "$RESULT" = PASS ] && [ "$CASES_RESULT" = PASS ]; then
    HEAD="✅ 记忆系统正常"
  else
    HEAD="❌ 记忆系统异常，请关注"
  fi

  # The label used to append "/小真" unconditionally, which is wrong when a
  # different agent id is passed as $1. Presumably 小真 is the display name
  # of the default agent xiaozhen - TODO confirm.
  agent_label=$(html_escape "$TEST_AGENT")
  if [ "$TEST_AGENT" = xiaozhen ]; then
    agent_label="${agent_label}/小真"
  fi

  if [ -n "$HEALTH" ]; then
    health_label=OK
  else
    health_label=异常
  fi

  # Every value that comes from a command or an external script is escaped;
  # only the <b> tags written here are meant to be interpreted as markup.
  REPORT="<b>${HEAD}</b>
🧠 记忆系统每日健康报告 · 小维(运维监控)
🕐 $(TS)

<b>端到端自检</b>（对象：${agent_label}）
 $(icon "$W") 写入记忆
 $(icon "$R") 新会话读回
 $(icon "$D") 向量库核查

<b>固定检索用例</b>（小金/慧慧/防空转）
$(html_escape "$CASES_DETAIL")

<b>基础设施</b>
 • 记忆服务: $(html_escape "${SVC:-未知}") / health: ${health_label}
 • 记忆条数: $(html_escape "${MEM_CNT:-?}") 条
 • 服务内存: $(html_escape "${MEM_RSS:-?}")
 • 系统可用内存: $(html_escape "${SYS_AVAIL:-?}")

明细: $(html_escape "$E2E_SUMMARY")"
}

build_report

# Deliver the report and log what actually happened: success goes to stdout,
# a failed delivery goes to stderr instead of being logged as "sent".
if send_telegram "$REPORT"; then
  echo "[$(TS)] health-report sent: result=$RESULT"
else
  send_rc=$?
  echo "[$(TS)] health-report NOT sent (send_telegram exit ${send_rc}): result=$RESULT" >&2
fi
