Prometheus Monitoring

Quickly deploying Prometheus on Kubernetes

Create a new namespace

[root@prometheus]# cat monitor_namespace.yaml 
apiVersion: v1
kind: Namespace
metadata:
  name: monitor
  labels:
    name: monitor
[root@prometheus]# kubectl create -f monitor_namespace.yaml

The RBAC file

[root@prometheus]# cat rbac-setup.yaml 
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
- apiGroups: [""]
  resources:
  - nodes
  - nodes/proxy
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
- apiGroups:
  - extensions
  resources:
  - ingresses
  verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
  verbs: ["get"]
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitor
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: monitor


[root@prometheus]# kubectl create -f rbac-setup.yaml
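
To verify the binding works, one quick sanity check is to impersonate the service account with kubectl (the expected answer is "yes"):

kubectl auth can-i list pods --as=system:serviceaccount:monitor:prometheus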

The Prometheus deployment files

[root@prometheus]# cat configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitor
data:
  # Mounted into the pod as /etc/prometheus/prometheus.yml
  prometheus.yml: |
    global:
      # Scrape metrics every 15s and evaluate alerting rules every 15s
      scrape_interval: 15s
      evaluation_interval: 15s
    # Alerting rule files to load
    rule_files:
    - /etc/prometheus/rules.yml
    # Where alerts are sent (the Alertmanager address)
    alerting:
      alertmanagers:
      - static_configs:
        - targets: ["192.168.50.60:9093"]
    # The targets Prometheus scrapes
    scrape_configs:
    - job_name: 'k8s-node'
      scrape_interval: 10s
      static_configs:
      - targets:
        - '192.168.50.61:31672'

    # Custom metrics collection; each job_name is independent
    # (an example Service with the required annotations follows this ConfigMap)
    - job_name: 'tomcat-pods'
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape, __meta_kubernetes_service_annotation_prometheus_io_jvm_scrape]
        regex: true;true
        action: keep
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_app_metrics_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__meta_kubernetes_pod_ip, __meta_kubernetes_service_annotation_prometheus_io_app_metrics_port]
        action: replace
        target_label: __address__
        regex: (.+);(.+)
        replacement: $1:$2
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: kubernetes_pod_name
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - source_labels: [__meta_kubernetes_pod_host_ip]
        action: replace
        target_label: kubernetes_host_ip

    - job_name: 'kubernetes-apiservers'
      kubernetes_sd_configs:
      - role: endpoints
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https

    - job_name: 'kubernetes-nodes'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics

    - job_name: 'kubernetes-cadvisor'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor

    - job_name: 'kubernetes-service-endpoints'
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
        action: replace
        target_label: __scheme__
        regex: (https?)
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        action: replace
        target_label: kubernetes_name

    - job_name: 'kubernetes-services'
      kubernetes_sd_configs:
      - role: service
      metrics_path: /probe
      params:
        module: [http_2xx]
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
        action: keep
        regex: true
      - source_labels: [__address__]
        target_label: __param_target
      - target_label: __address__
        replacement: blackbox-exporter.example.com:9115
      - source_labels: [__param_target]
        target_label: instance
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        target_label: kubernetes_name

    - job_name: 'kubernetes-ingresses'
      kubernetes_sd_configs:
      - role: ingress
      relabel_configs:
      - source_labels: [__meta_kubernetes_ingress_annotation_prometheus_io_probe]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_ingress_scheme,__address__,__meta_kubernetes_ingress_path]
        regex: (.+);(.+);(.+)
        replacement: ${1}://${2}${3}
        target_label: __param_target
      - target_label: __address__
        replacement: blackbox-exporter.example.com:9115
      - source_labels: [__param_target]
        target_label: instance
      - action: labelmap
        regex: __meta_kubernetes_ingress_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_ingress_name]
        target_label: kubernetes_name

    - job_name: 'kubernetes-pods'
      kubernetes_sd_configs:
      - role: pod
      relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: kubernetes_pod_name
  # Alerting rules, mounted into the pod as /etc/prometheus/rules.yml
  # (alert names must be valid metric names, so they are ASCII identifiers)
  rules.yml: |
    groups:
    - name: test-rule
      rules:

      ############ Node alerts ############
      - alert: K8sNodeDown
        expr: up{job="k8s-node"} != 1
        for: 3m
        labels:
          team: k8s-node
        annotations:
          summary: "{{$labels.instance}}: node state is abnormal"
          description: "The node may have been restarted"
      - alert: K8sNodeHighCpuUsage
        expr: (1 - avg(irate(node_cpu_seconds_total{job="k8s-node",mode="idle"}[1m])) by (instance)) * 100 > 95
        for: 1m
        labels:
          team: k8s-node
        annotations:
          summary: "{{$labels.instance}}: node CPU usage is above 95%"
          description: "{{$labels.instance}}: current node CPU usage is {{ $value }}"
      - alert: K8sNodeHighDiskUsage
        expr: (node_filesystem_size_bytes{mountpoint="/",job="k8s-node"} - node_filesystem_avail_bytes{mountpoint="/",job="k8s-node"}) / node_filesystem_size_bytes{mountpoint="/",job="k8s-node"} * 100 > 85
        for: 1m
        labels:
          team: k8s-node
        annotations:
          description: "Disk usage of {{$labels.mountpoint}} on node [[ {{$labels.instance}} ]] is above 85%"
          summary: "Disk {{$labels.device}} current usage is {{ $value }}"
      - alert: K8sNodeHighMemoryUsage
        expr: (node_memory_MemTotal_bytes{job="k8s-node"} - (node_memory_Buffers_bytes{job="k8s-node"} + node_memory_Cached_bytes{job="k8s-node"} + node_memory_MemFree_bytes{job="k8s-node"})) / node_memory_MemTotal_bytes{job="k8s-node"} * 100 > 95
        for: 1m
        labels:
          team: k8s-node
        annotations:
          description: "Memory usage on node [[ {{$labels.instance}} ]] is above 95%"
          summary: "{{$labels.instance}} current memory usage is {{ $value }}"

      ############ Pod alerts ############
      - alert: MonitorPodStateChanged
        expr: up{kubernetes_namespace="monitor"} != 1
        for: 3m
        labels:
          team: "kube-state-metrics"
        annotations:
          description: "A pod in namespace {{$labels.kubernetes_namespace}} changed state"
          summary: "This pod provides the k8s monitoring data and is pinned to a single node"
      - alert: ProductPodStateChanged
        expr: kube_pod_container_status_ready{namespace="product"} != 1
        for: 3m
        labels:
          status: "pod {{$labels.pod}} in the product namespace changed state"
        annotations:
          description: "A pod of deployment {{$labels.container}} changed state"
          summary: "Probably a restart or a version upgrade; if it restarts frequently, investigate"
      - alert: ProductPodTooManyRestarts
        expr: kube_pod_container_status_restarts_total{namespace="product"} > 15
        for: 3m
        labels:
          status: "pod {{$labels.pod}} in the product namespace restarted more than 15 times"
        annotations:
          description: "A pod of deployment {{$labels.container}} restarts too often"
          summary: "Too many restarts, likely a problem in the application inside the pod"
      ############ Java alerts ############
      - alert: JvmThreadCountTooHigh
        expr: jvm_threads_current{job="tomcat-pods"} > 2000
        for: 1m
        labels:
          status: "JVM changes in the namespace"
        annotations:
          description: "{{$labels.kubernetes_pod_name}}: JVM thread count too high"
          summary: "{{ $labels.kubernetes_pod_name }}: current thread count is {{ $value }}"
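
For the tomcat-pods job above to discover a service, the Service object needs matching annotations. A hypothetical example (name, namespace, port, and path are placeholders; the annotation keys are what the relabel rules map from, since Prometheus normalizes dots and dashes in annotation names to underscores):

apiVersion: v1
kind: Service
metadata:
  name: tomcat-demo              # hypothetical name
  namespace: product             # hypothetical namespace
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/jvm-scrape: "true"
    prometheus.io/app-metrics-path: "/metrics"   # hypothetical path
    prometheus.io/app-metrics-port: "8080"       # hypothetical port
spec:
  selector:
    app: tomcat-demo
  ports:
  - port: 8080
    targetPort: 8080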






[root@prometheus]# cat prometheus.deploy.yml
---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    name: prometheus-deployment
  name: prometheus
  namespace: monitor
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      containers:
      - image: prom/prometheus:v2.6.0
        name: prometheus
        command:
        - "/bin/prometheus"
        args:
        - "--config.file=/etc/prometheus/prometheus.yml"
        - "--storage.tsdb.path=/home/prometheus"
        - "--storage.tsdb.retention=168h"
        - "--web.enable-lifecycle"
        ports:
        - containerPort: 9090
          protocol: TCP
        volumeMounts:
        - mountPath: "/home/prometheus"
          name: data
        - mountPath: "/etc/prometheus"
          name: config-volume
        - mountPath: "/etc/localtime"
          readOnly: false
          name: localtime
        resources:
          requests:
            cpu: 100m
            memory: 2048Mi
          limits:
            cpu: 500m
            memory: 3180Mi
      serviceAccountName: prometheus
      nodeSelector:
        nodetype: prometheus
      volumes:
      - name: data
        hostPath:
          path: "/opt/prometheus/data"
      - name: config-volume
        configMap:
          name: prometheus-config
      - name: localtime
        hostPath:
          path: "/etc/localtime"
          type: File




[root@prometheus]# cat prometheus.svc.yml
---
kind: Service
apiVersion: v1
metadata:
  labels:
    app: prometheus
  name: prometheus
  namespace: monitor
spec:
  type: NodePort
  ports:
  - port: 9090
    targetPort: 9090
    nodePort: 30003
  selector:
    app: prometheus



[root@prometheus]# kubectl create -f configmap.yaml
[root@prometheus]# kubectl create -f prometheus.deploy.yml
[root@prometheus]# kubectl create -f prometheus.svc.yml
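
Once all three objects are created (and the data directory from the note below exists on the node), a quick sanity check is to confirm the pod is Running and that the NodePort answers on Prometheus's health endpoint:

kubectl -n monitor get pods,svc
curl http://192.168.50.60:30003/-/healthy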

Note: /opt/prometheus/data must be created locally (on the node selected by nodetype: prometheus) as the Prometheus data path, and the data directory must be given 777 permissions.
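
For example, on the node labeled nodetype=prometheus:

mkdir -p /opt/prometheus/data
chmod 777 /opt/prometheus/data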

Hot-reloading the configuration

Because the configuration lives in a ConfigMap, it can be hot-reloaded: after editing the config you no longer need to restart the Prometheus pod for it to re-read the configuration.

- "--web.enable-lifecycle"在prometheus.deploy.yml的配置文件里面加上这段话就可以了

[root@prometheus]# cat reload-prometheus.sh
#!/bin/bash
kubectl apply -f configmap.yaml
sleep 60
curl -XPOST http://192.168.50.60:30003/-/reload

With a script like this, after every configuration change you just run it and the new configuration takes effect!
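
To confirm a reload actually took effect, the running configuration can be read back through the status API (available in Prometheus 2.x):

curl -s http://192.168.50.60:30003/api/v1/status/config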

Installing kube-state-metrics

[root@prometheus]# git clone https://github.com/kubernetes/kube-state-metrics.git
Then change the default namespace in the manifests to monitor (see the sed sketch below) and enter the kube-state-metrics directory
[root@prometheus]# kubectl create -f ./
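
One quick way to do the namespace switch is a sed over the manifests; this is only a sketch that assumes the shipped YAML defaults to kube-system and sits in the current directory, so verify both before running:

sed -i 's/namespace: kube-system/namespace: monitor/' *.yaml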

Installing Grafana

Create the Grafana data directory
mkdir /opt/grafana/data
Startup script
[root@grafana]# cat start_grafana.sh
#!/bin/bash

docker stop `docker ps -a |awk '/grafana/{print $1}'`
docker rm `docker ps -a |awk '/grafana/{print $1}'`
docker run -d \
--name=grafana \
--restart=always \
-p 3000:3000 \
-m 4096m \
-v /opt/grafana/data:/var/lib/grafana \
-v /opt/grafana/log:/var/log/grafana \
grafana/grafana:5.4.3

1. After installation, add a data source: choose Prometheus, and use the URL http://192.168.50.60:30003 (the Prometheus UI created earlier).
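
As an alternative to clicking through the UI, Grafana 5.x can provision the data source from a file under /etc/grafana/provisioning/datasources/; a minimal sketch (the host path is hypothetical, and the start script above would need an extra -v mount for it):

# /opt/grafana/provisioning/datasources/prometheus.yaml (hypothetical host path)
apiVersion: 1
datasources:
- name: Prometheus
  type: prometheus
  access: proxy
  url: http://192.168.50.60:30003
  isDefault: true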

2. Add dashboards (a few common ones are listed below)

Click Import; there are two ways: enter the dashboard ID from grafana.com directly, or import JSON.

https://grafana.com/dashboards/9276 node CPU, memory, etc.

https://grafana.com/dashboards/3146 pod

https://grafana.com/dashboards/8588 deployment

Installing Alertmanager

Create the config file and directory

Create the Alertmanager data directory
mkdir /opt/alert/data

Note: alertmanager.yml must be configured; with this config, DingTalk and email alerts can be sent at the same time
[root@docker60 alert]# cat alertmanager.yml
global:
  resolve_timeout: 5m

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 6m
  receiver: default
receivers:
- name: 'default'
  email_configs:
  - to: ""
    send_resolved: true
    from: ""
    smarthost: "smtp.xxx.com:25"
    auth_username: ""
    auth_password: ""
  webhook_configs:
  - url: 'http://192.168.50.60:8060/dingtalk/ops_dingding/send'
    send_resolved: true
inhibit_rules:
- source_match:
    severity: 'critical'
  target_match:
    severity: 'warning'
  equal: ['alertname']
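
Before starting the container, the file can be validated with amtool, which ships with Alertmanager:

amtool check-config /opt/alert/alertmanager.yml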

Startup script

[root@alert]# cat start_alert.sh
#!/bin/bash

docker stop `docker ps -a |awk '/alertmanager/{print $1}'`
docker rm `docker ps -a |awk '/alertmanager/{print $1}'`

docker run -d \
--name alertmanager \
--restart=always \
-p 9093:9093 \
-v /etc/localtime:/etc/localtime:ro \
-v /opt/alert/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
-v /opt/alert/data:/alertmanager \
prom/alertmanager:v0.15.3
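
To exercise the whole alerting pipeline without waiting for a real alert, a hand-made alert can be pushed into Alertmanager's v1 API (the alertname below is made up):

curl -XPOST http://192.168.50.60:9093/api/v1/alerts \
  -H "Content-Type: application/json" \
  -d '[{"labels": {"alertname": "TestAlert", "severity": "warning"}}]'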

Installing the DingTalk plugin

1. Install Go (not covered here)
2. Assume Go lives at /usr/local/go
mkdir -pv /usr/local/go/src/github.com/timonwong
3. Download the DingTalk plugin (clone it into the directory created above, then build it, e.g. with go build, so the ./prometheus-webhook-dingtalk binary exists)
cd /usr/local/go/src/github.com/timonwong
git clone https://github.com/timonwong/prometheus-webhook-dingtalk.git
4. Add a DingTalk robot
Just add one in the DingTalk group
5. Start the plugin
[root@alert]# cat start_dingding.sh
#!/bin/bash
cd /usr/local/go/src/github.com/timonwong/prometheus-webhook-dingtalk
kill -9 `ps -ef | grep prometheus-webhook-dingtalk | grep -v grep | awk '{print $2}'`
nohup ./prometheus-webhook-dingtalk --ding.profile="ops_dingding=https://oapi.dingtalk.com/robot/send?access_token=xxxx" >dingding.log 2>&1 &
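
To check the plugin itself, an Alertmanager-style webhook payload can be posted to the profile URL; this is a minimal hand-written sample (the real payload sent by Alertmanager carries more fields):

curl -XPOST http://192.168.50.60:8060/dingtalk/ops_dingding/send \
  -H "Content-Type: application/json" \
  -d '{"alerts": [{"status": "firing", "labels": {"alertname": "TestAlert"}, "annotations": {"summary": "test"}}]}'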

WeChat Work (企业微信) alerting

Register a WeChat Work (企业微信) account and add an application;
[root@node107 alert]# cat alertmanager.yml
global:
  resolve_timeout: 5m

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 6m
  receiver: default
receivers:
- name: 'default'
  email_configs:
  - to: "xxx"
    send_resolved: true
    from: "xxx"
    smarthost: "xxxx:25"
    auth_username: "xxxxx"
    auth_password: "xxxx"

  wechat_configs:
  - corp_id: 'xxxxx'      # the enterprise ID
    to_party: '2'         # the department number from the contacts page
    agent_id: '1000003'
    api_secret: 'xxxxxxx'
    send_resolved: true
inhibit_rules:
- source_match:
    severity: 'critical'
  target_match:
    severity: 'warning'
  equal: ['alertname']