---
# ConfigMap holding the Prometheus server configuration for release
# "prometheus" in namespace "monitor". Each key under `data` is one file:
#   alerts          - empty rule file ({}).
#   prometheus.yml  - global settings, scrape jobs and Alertmanager discovery.
#   rules           - alerting rule group "example".
# prometheus.yml loads /etc/config/rules and /etc/config/alerts, so this
# ConfigMap is presumably mounted at /etc/config - TODO confirm against the
# server Deployment.
apiVersion: v1
data:
  alerts: |
    {}
  prometheus.yml: |
    global:
      scrape_interval: 30s
      scrape_timeout: 30s
    rule_files:
      - /etc/config/rules
      - /etc/config/alerts
    scrape_configs:
      # Prometheus scraping itself.
      - job_name: prometheus
        static_configs:
          - targets:
              - localhost:9090

      # API servers: keep only the default/kubernetes service's https port.
      # NOTE(review): insecure_skip_verify disables certificate verification
      # even though a CA file is supplied (here and in the two node jobs).
      - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        job_name: kubernetes-apiservers
        kubernetes_sd_configs:
          - role: endpoints
        relabel_configs:
          - action: keep
            regex: default;kubernetes;https
            source_labels:
              - __meta_kubernetes_namespace
              - __meta_kubernetes_service_name
              - __meta_kubernetes_endpoint_port_name
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          insecure_skip_verify: true

      # Node metrics, scraped through the API server proxy.
      - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        job_name: kubernetes-nodes
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          - replacement: kubernetes.default.svc:443
            target_label: __address__
          # ${1} is the node name captured by (.+); without it every node
          # would be scraped at the same invalid path.
          - regex: (.+)
            replacement: /api/v1/nodes/${1}/proxy/metrics
            source_labels:
              - __meta_kubernetes_node_name
            target_label: __metrics_path__
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          insecure_skip_verify: true

      # cAdvisor metrics of each node, also through the API server proxy.
      - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        job_name: kubernetes-nodes-cadvisor
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          - replacement: kubernetes.default.svc:443
            target_label: __address__
          # ${1} is the node name captured by (.+).
          - regex: (.+)
            replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
            source_labels:
              - __meta_kubernetes_node_name
            target_label: __metrics_path__
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          insecure_skip_verify: true

      # Service endpoints that opt in via the prometheus.io/scrape annotation;
      # scheme, path and port can be overridden by further annotations.
      - job_name: kubernetes-service-endpoints
        kubernetes_sd_configs:
          - role: endpoints
        relabel_configs:
          - action: keep
            regex: true
            source_labels:
              - __meta_kubernetes_service_annotation_prometheus_io_scrape
          - action: replace
            regex: (https?)
            source_labels:
              - __meta_kubernetes_service_annotation_prometheus_io_scheme
            target_label: __scheme__
          - action: replace
            regex: (.+)
            source_labels:
              - __meta_kubernetes_service_annotation_prometheus_io_path
            target_label: __metrics_path__
          - action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            source_labels:
              - __address__
              - __meta_kubernetes_service_annotation_prometheus_io_port
            target_label: __address__
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - action: replace
            source_labels:
              - __meta_kubernetes_namespace
            target_label: kubernetes_namespace
          - action: replace
            source_labels:
              - __meta_kubernetes_service_name
            target_label: kubernetes_name

      # Services annotated prometheus.io/probe=pushgateway; labels pushed by
      # clients are kept as-is (honor_labels).
      - honor_labels: true
        job_name: prometheus-pushgateway
        kubernetes_sd_configs:
          - role: service
        relabel_configs:
          - action: keep
            regex: pushgateway
            source_labels:
              - __meta_kubernetes_service_annotation_prometheus_io_probe

      # Services annotated prometheus.io/probe=true: the service address is
      # passed as the `target` parameter and the scrape itself is sent to the
      # address "blackbox" with module http_2xx.
      - job_name: kubernetes-services
        kubernetes_sd_configs:
          - role: service
        metrics_path: /probe
        params:
          module:
            - http_2xx
        relabel_configs:
          - action: keep
            regex: true
            source_labels:
              - __meta_kubernetes_service_annotation_prometheus_io_probe
          - source_labels:
              - __address__
            target_label: __param_target
          - replacement: blackbox
            target_label: __address__
          - source_labels:
              - __param_target
            target_label: instance
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels:
              - __meta_kubernetes_namespace
            target_label: kubernetes_namespace
          - source_labels:
              - __meta_kubernetes_service_name
            target_label: kubernetes_name

      # Pods that opt in via the prometheus.io/scrape annotation.
      - job_name: kubernetes-pods
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          - action: keep
            regex: true
            source_labels:
              - __meta_kubernetes_pod_annotation_prometheus_io_scrape
          - action: replace
            regex: (.+)
            source_labels:
              - __meta_kubernetes_pod_annotation_prometheus_io_path
            target_label: __metrics_path__
          - action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            source_labels:
              - __address__
              - __meta_kubernetes_pod_annotation_prometheus_io_port
            target_label: __address__
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - action: replace
            source_labels:
              - __meta_kubernetes_namespace
            target_label: kubernetes_namespace
          - action: replace
            source_labels:
              - __meta_kubernetes_pod_name
            target_label: kubernetes_pod_name

    # Alertmanager discovery: pods in namespace "monitor" labelled
    # app=prometheus, component=alertmanager.
    alerting:
      alertmanagers:
        - kubernetes_sd_configs:
            - role: pod
          tls_config:
            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
          relabel_configs:
            - source_labels: [__meta_kubernetes_namespace]
              regex: monitor
              action: keep
            - source_labels: [__meta_kubernetes_pod_label_app]
              regex: prometheus
              action: keep
            - source_labels: [__meta_kubernetes_pod_label_component]
              regex: alertmanager
              action: keep
            # Explicit empty regex (instead of a bare `regex:`): drops targets
            # whose container port number label is empty.
            - source_labels: [__meta_kubernetes_pod_container_port_number]
              regex: ''
              action: drop
  rules: |
    groups:
      - name: example
        rules:
          # Threshold set to 75 to agree with the description text below.
          # NOTE(review): it was previously `>1`, which looks like a leftover
          # test value - confirm the intended threshold.
          - alert: NodeCPUUsage
            expr: 100 - (avg by (instance) (irate(node_cpu{component="node-exporter",mode="idle"}[5m])) * 100) > 75
            for: 2m
            labels:
              severity: critical
              service: cpus
            annotations:
              summary: "{{$labels.instance}}: High CPU usage detected"
              description: "{{$labels.instance}}: CPU usage is above 75% (current value is: {{ $value }})"

          - alert: InstanceDown
            expr: up == 0
            for: 5m
            labels:
              severity: critical
              service: node
            annotations:
              summary: "Instance {{ $labels.instance }} down"
              description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."

          - alert: APIHighRequestLatency
            expr: api_http_request_latencies_second{quantile="0.5"} > 1
            for: 10m
            labels:
              severity: critical
              service: node
            annotations:
              summary: "High request latency on {{ $labels.instance }}"
              description: "{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)"
kind: ConfigMap
metadata:
  labels:
    app: prometheus
    chart: prometheus-6.2.1
    component: server
    heritage: Tiller
    release: prometheus
  name: prometheus-server
  namespace: monitor
---
# ConfigMap holding the Alertmanager configuration for release "prometheus"
# in namespace "monitor". Each key under `data` is one file:
#   alertmanager.yml - SMTP settings, receivers, inhibit rules, routing tree.
#   hxp.tmpl         - notification templates "email.hxp.html" (e-mail body,
#                      used by both receivers) and "email.hxp.title.html"
#                      (not referenced by alertmanager.yml).
# alertmanager.yml loads templates from /etc/config/*.tmpl, so this ConfigMap
# is presumably mounted at /etc/config - TODO confirm against the Deployment.
# The leading `---` keeps this resource a separate YAML document when it
# shares a file with other manifests.
apiVersion: v1
data:
  alertmanager.yml: |
    global:
      resolve_timeout: 5m
      # NOTE(review): the smarthost has no host name and the addresses below
      # have no domain part; they look redacted or incomplete - fill in real
      # values before deploying.
      smtp_smarthost: ':25'
      smtp_from: 'hxp195446040@'
      smtp_auth_username: 'hxp195446040@'
      # NOTE(review): a ConfigMap is not a safe place for a password;
      # consider supplying it from a Secret.
      smtp_auth_password: 'xxxxxx'
      smtp_require_tls: false

    receivers:
      - name: default-receiver
        email_configs:
          - to: 'hxp195446040@'
            html: '{{ template "email.hxp.html" .}}'
            headers:
              Subject: "[WARN] 报警邮件test"
      - name: admin-receiver
        email_configs:
          - to: 'hxp195446040@'
            html: '{{ template "email.hxp.html" .}}'
            headers:
              Subject: "[WARN] 报警邮件test"

    templates:
      - '/etc/config/*.tmpl'

    # A firing critical alert mutes warning alerts that share the same
    # alertname, cluster and service.
    inhibit_rules:
      - source_match:
          severity: 'critical'
        target_match:
          severity: 'warning'
        equal: ['alertname', 'cluster', 'service']

    route:
      group_interval: 5m
      group_wait: 30s
      receiver: default-receiver
      repeat_interval: 3h
      group_by: ['alertname', 'cluster', 'service']
      routes:
        - match_re:
            service: ^(foo1|foo2|baz)$
          receiver: default-receiver
          routes:
            - match:
                severity: critical
              receiver: admin-receiver
        - match:
            service: cpus
          receiver: default-receiver
          routes:
            - match:
                severity: critical
              receiver: admin-receiver
        - match:
            service: node
          receiver: default-receiver
          group_by: [alertname, cluster]
          routes:
            - match:
                owner: team-X
              receiver: default-receiver
            - match:
                owner: team-Y
              receiver: admin-receiver
  hxp.tmpl: |
    {{ define "email.hxp.html" }}
    <table border="1">
      <tr>
        <th>报警名</th>
        <th>实例</th>
        <th>开始时间</th>
        <th>摘要</th>
        <th>详情</th>
      </tr>
      {{ range $i, $alert := .Alerts }}
      <tr>
        <td>{{ index $alert.Labels "alertname" }}</td>
        <td>{{ index $alert.Labels "instance" }}</td>
        <td>{{ $alert.StartsAt }}</td>
        <td>{{ index $alert.Annotations "summary" }}</td>
        <td>{{ index $alert.Annotations "description" }}</td>
      </tr>
      {{ end }}
    </table>
    {{ end }}
    {{ define "email.hxp.title.html" }}
    {{ range $i, $alert := .Alerts }}
    {{index $alert.Annotations "summary"}}|
    {{ end }}
    {{ end }}
kind: ConfigMap
metadata:
  labels:
    app: prometheus
    chart: prometheus-6.2.1
    component: alertmanager
    heritage: Tiller
    release: prometheus
  name: prometheus-alertmanager
  namespace: monitor