I. Implementation Approach

1. Each microservice exposes its own metrics on a `/metrics` endpoint, acting as an exporter.
2. Deploy Prometheus and configure a ServiceMonitor to scrape each microservice's exporter; the exporter is reached through its Kubernetes Service address.

The ServiceMonitor looks like this:
```yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: test-monitor
  namespace: monitoring
spec:
  endpoints:
    - honorLabels: true
      interval: 10s
      port: http
      scheme: http
  namespaceSelector:
    matchNames:
      - test
  selector:
    matchLabels:
      app.kubernetes.io/group: test
```
3. On the monitoring platform, build Grafana dashboards for these metrics and configure alerting rules (a sketch of one such rule follows this list).
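As a minimal sketch of an alerting rule for the Prometheus Operator (the rule name, threshold, `for` duration, and labels are illustrative assumptions, not part of the design above):

```yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: test-alert-rules        # illustrative name
  namespace: monitoring
spec:
  groups:
    - name: http.rules
      rules:
        - alert: HighHTTPErrorRate          # illustrative alert
          # rate of non-200 HTTP responses per service over the last 5 minutes
          expr: sum(rate(http_request_duration_seconds_count{code!="200"}[5m])) by (service) > 1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "High HTTP error rate on {{ $labels.service }}"
```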
II. Microservice Metrics
| Metric group | Metric name | Type | Description |
|---|---|---|---|
| HTTP metrics | http_request_duration_seconds_count | Histogram | Request latency of HTTP endpoints |
| | http_request_total | Counter | Total number of HTTP requests |
| | http_response_size_bytes | Histogram | Size of HTTP responses |
| | http_requests_in_flight | Gauge | Number of concurrent (in-flight) HTTP requests |
| Go metrics | go_gc_duration_seconds_sum | Summary | GC pause time of the Go process |
| | go_goroutines | Gauge | Number of goroutines in the Go process |
| | go_memstats_alloc_bytes | Gauge | Heap bytes allocated and still in use by the Go process |
| gRPC metrics | grpc_server_handled_total | Counter | Number of completed RPCs per gRPC method |
| | grpc_server_handling_seconds_bucket | Histogram | Response latency per gRPC method |
| | grpc_server_started_total | Counter | Total number of RPCs started per gRPC method |
A complete sample of the exposed metrics:
```
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 2.5934e-05
go_gc_duration_seconds{quantile="0.25"} 3.062e-05
go_gc_duration_seconds{quantile="0.5"} 4.3478e-05
go_gc_duration_seconds{quantile="0.75"} 5.73e-05
go_gc_duration_seconds{quantile="1"} 0.000188399
go_gc_duration_seconds_sum 0.000660942
go_gc_duration_seconds_count 12
# HELP go_goroutines Number of goroutines that currently exist.
# TYPE go_goroutines gauge
go_goroutines 529
# HELP go_info Information about the Go environment.
# TYPE go_info gauge
go_info{version="go1.13.12"} 1
# HELP go_memstats_alloc_bytes Number of bytes allocated and still in use.
# TYPE go_memstats_alloc_bytes gauge
go_memstats_alloc_bytes 2.364408e+07
# HELP http_request_duration_seconds The latency of the HTTP requests.
# TYPE http_request_duration_seconds histogram
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="1"} 5
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="5"} 5
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="10"} 5
http_request_duration_seconds_sum{code="200",api="/v1/common/user/filter",method="GET",service="common"} 1.534460968
http_request_duration_seconds_count{code="200",api="/v1/common/user/filter",method="GET",service="common"} 5
# HELP http_requests_in_flight The number of inflight requests being handled at the same time.
# TYPE http_requests_in_flight gauge
http_requests_in_flight{api="/v1/common/user/filter",service="common"} 2
# HELP http_response_size_bytes The size of the HTTP responses.
# TYPE http_response_size_bytes histogram
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="0.005"} 0
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="0.01"} 0
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="0.025"} 0
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="0.05"} 0
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="0.1"} 0
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="0.25"} 6
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="0.5"} 6
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="1"} 6
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="2.5"} 6
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="5"} 6
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="10"} 6
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="+Inf"} 6
http_response_size_bytes_sum{code="200",api="/v1/common/user/filter",method="GET",service="common"} 44565
http_response_size_bytes_count{code="200",api="/v1/common/user/filter",method="GET",service="common"} 5
# HELP grpc_server_handled_total Total number of RPCs completed on the server, regardless of success or failure.
# TYPE grpc_server_handled_total counter
grpc_server_handled_total{grpc_code="OK",grpc_method="AddQuotaUsers",grpc_service="ecf.common.quota.Quota",grpc_type="unary"} 0
grpc_server_handled_total{grpc_code="OK",grpc_method="Bound",grpc_service="ecf.common.ssh.SSH",grpc_type="unary"} 0
grpc_server_handled_total{grpc_code="OK",grpc_method="Check",grpc_service="ecf.common.arrears.Arrears",grpc_type="unary"} 0
grpc_server_handled_total{grpc_code="OK",grpc_method="CheckEmailVerifyCode",grpc_service="ecf.common.message.Message",grpc_type="unary"} 0
grpc_server_handled_total{grpc_code="OK",grpc_method="CheckOfferLoad",grpc_service="ecf.common.offer.CommonOffer",grpc_type="unary"} 0
# HELP grpc_server_handling_seconds Histogram of response latency (seconds) of gRPC that had been application-level handled by the server.
# TYPE grpc_server_handling_seconds histogram
grpc_server_handling_seconds_bucket{grpc_method="AddQuotaUsers",grpc_service="ecf.common.quota.Quota",grpc_type="unary",le="0.005"} 0
grpc_server_handling_seconds_bucket{grpc_method="AddQuotaUsers",grpc_service="ecf.common.quota.Quota",grpc_type="unary",le="0.01"} 0
grpc_server_handling_seconds_bucket{grpc_method="AddQuotaUsers",grpc_service="ecf.common.quota.Quota",grpc_type="unary",le="0.025"} 0
# HELP grpc_server_started_total Total number of RPCs started on the server.
# TYPE grpc_server_started_total counter
grpc_server_started_total{grpc_method="AddQuotaUsers",grpc_service="ecf.common.quota.Quota",grpc_type="unary"} 0
grpc_server_started_total{grpc_method="Bound",grpc_service="ecf.common.ssh.SSH",grpc_type="unary"} 0
grpc_server_started_total{grpc_method="Check",grpc_service="ecf.common.arrears.Arrears",grpc_type="unary"} 0
grpc_server_started_total{grpc_method="CheckEmailVerifyCode",grpc_service="ecf.common.message.Message",grpc_type="unary"} 0
grpc_server_started_total{grpc_method="CheckOfferLoad",grpc_service="ecf.common.offer.CommonOffer",grpc_type="unary"} 0
grpc_server_started_total{grpc_method="Count",grpc_service="ecf.common.cluster.Cluster",grpc_type="unary"} 0
```
III. Metrics Configured on the Monitoring Platform
| Metric group | Metric / PromQL expression | Description |
|---|---|---|
| HTTP metrics | rate(http_request_duration_seconds_count{code!="200"}[30s]) | Error (non-200) request rate of HTTP endpoints |
| | sum(rate(http_request_total{}[5m])) by (service) | Per-second request rate over the last 5 minutes |
| | | Average endpoint response time above 200 ms (see the sketch after this table) |
| | sum(http_requests_in_flight) by (service) | Number of concurrent (in-flight) HTTP requests |
| Go metrics | go_gc_duration_seconds_sum | GC pause time of the Go process |
| | go_goroutines | Number of goroutines in the Go process |
| | go_memstats_alloc_bytes | Heap bytes allocated and still in use by the Go process |
| gRPC metrics | sum(rate(grpc_server_handled_total{grpc_type="unary",grpc_code!="OK"}[1m])) by (grpc_service) | gRPC error (non-OK) request rate per service |
| | histogram_quantile(0.9, sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[5m])) by (grpc_service,le)) | Estimated 90th-percentile RPC handling time per service |
| | | Average RPC response time over the last minute above 200 ms (see the sketch after this table) |
| | grpc_server_started_total | Total number of RPCs started per gRPC method |
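The two average-latency rows above are listed without an expression. As a minimal sketch of how they could be written (the 1-minute window and the grouping labels are assumptions), the average can be derived by dividing the rate of each histogram's `_sum` series by its `_count` series:

```promql
# HTTP: average request latency per service over 1m exceeds 200 ms
  sum(rate(http_request_duration_seconds_sum[1m])) by (service)
/ sum(rate(http_request_duration_seconds_count[1m])) by (service)
> 0.2

# gRPC: average unary handling time per service over 1m exceeds 200 ms
  sum(rate(grpc_server_handling_seconds_sum{grpc_type="unary"}[1m])) by (grpc_service)
/ sum(rate(grpc_server_handling_seconds_count{grpc_type="unary"}[1m])) by (grpc_service)
> 0.2
```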
IV. Metric Collection Code
gRPC metrics, implemented in the gRPC server:
```go
// grpc/server.go
import (
	"context"
	"net"

	grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus"
	"google.golang.org/grpc"
)

// RunServer runs the gRPC service.
func RunServer(ctx context.Context, port string, init InitServers) error {
	// add gRPC middleware (logging + Prometheus interceptors)
	var opts []grpc.ServerOption
	opts = grpc_interceptor.AddLogging(log.WithField("server", "grpc"), opts)

	// create the server with the interceptor chain
	server := grpc.NewServer(opts...)

	// register the application services (API implementations)
	init(server)

	// initialize the grpc_server_* collectors for this server
	grpc_prometheus.Register(server)

	lis, err := net.Listen("tcp", ":"+port)
	if err != nil {
		return err
	}
	return server.Serve(lis)
}

// grpc-interceptor/logger.go
import (
	grpc_middleware "github.com/grpc-ecosystem/go-grpc-middleware"
	grpc_logrus "github.com/grpc-ecosystem/go-grpc-middleware/logging/logrus"
	grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus"
	"github.com/sirupsen/logrus"
	"google.golang.org/grpc"
)

// AddLogging returns grpc.Server config options that turn on logging
// and Prometheus instrumentation.
func AddLogging(logger *logrus.Entry, opts []grpc.ServerOption) []grpc.ServerOption {
	// Shared options for the logger, with a custom gRPC code to log level function.
	o := []grpc_logrus.Option{
		grpc_logrus.WithLevels(codeToLevel),
	}

	// Enable the grpc_server_handling_seconds histogram.
	grpc_prometheus.EnableHandlingTimeHistogram()

	// Add unary interceptors.
	opts = append(opts, grpc_middleware.WithUnaryServerChain(
		grpc_logrus.UnaryServerInterceptor(logger, o...),
		grpc_prometheus.UnaryServerInterceptor,
	))

	// Add stream interceptors (included here as an example).
	opts = append(opts, grpc_middleware.WithStreamServerChain(
		grpc_logrus.StreamServerInterceptor(logger, o...),
		grpc_prometheus.StreamServerInterceptor,
	))
	return opts
}
```
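`grpc_prometheus.Register` only registers the collectors with the default Prometheus registry; the `/metrics` endpoint itself is served by the gateway handler in the next snippet. As a minimal standalone sketch for a service that does not run the HTTP gateway (the `serveMetrics` helper and its address parameter are illustrative, not part of the project code), the default registry can also be exposed directly:

```go
package main

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// serveMetrics exposes the default Prometheus registry (where
// grpc_prometheus registers its collectors) on its own HTTP listener.
func serveMetrics(addr string) error {
	mux := http.NewServeMux()
	mux.Handle("/metrics", promhttp.Handler())
	return http.ListenAndServe(addr, mux)
}
```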
HTTP metrics, implemented in the grpc-gateway:
```go
// rest/server.go
func RunServer(ctx context.Context, httpPort string, init InitMux, interceptors ...rest_interceptor.Interceptor) error {
	// more code...

	// wrap the gateway mux with request-ID, logging and /metrics handlers
	h := prometheus_interceptor.AddPrometheus(rest_interceptor.AddRequestID(
		rest_interceptor.AddLogger(log.WithField("gateway", "rest"), api)))

	// Create our middleware.
	recorder := metrics.NewRecorder(metrics.Config{
		DurationBuckets: []float64{1, 2.5, 5, 10, 20, 40, 80, 160, 320, 640},
	})
	mdlw := middleware.New(middleware.Config{
		Recorder: recorder,
		Service:  "common",
	})
	h = std.Handler("", mdlw, h)

	srv := &http.Server{
		Addr:         httpPort,
		ReadTimeout:  60 * time.Second,
		WriteTimeout: 60 * time.Second,
		// handler wrapped with the metrics middleware
		Handler: h,
	}
	return srv.ListenAndServe()
}

// prometheus_interceptor/prometheus.go
func AddPrometheus(h http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// serve the Prometheus exposition endpoint directly
		if strings.HasPrefix(r.URL.Path, "/metrics") {
			promhttp.Handler().ServeHTTP(w, r)
			return
		}
		h.ServeHTTP(w, r)
	})
}

// Reference: "github.com/slok/go-http-metrics/middleware/std"
type recorder struct {
	httpRequestDurHistogram   *prometheus.HistogramVec
	httpResponseSizeHistogram *prometheus.HistogramVec
	httpRequestsInflight      *prometheus.GaugeVec
}

// NewRecorder returns a new metrics recorder that implements the recorder
// using Prometheus as the backend.
func NewRecorder(cfg Config) metrics.Recorder {
	cfg.defaults()

	r := &recorder{
		httpRequestDurHistogram: prometheus.NewHistogramVec(prometheus.HistogramOpts{
			Namespace: cfg.Prefix,
			Subsystem: "http",
			Name:      "request_duration_seconds",
			Help:      "The latency of the HTTP requests.",
			Buckets:   cfg.DurationBuckets,
		}, []string{cfg.ServiceLabel, cfg.HandlerIDLabel, cfg.MethodLabel, cfg.StatusCodeLabel}),

		httpResponseSizeHistogram: prometheus.NewHistogramVec(prometheus.HistogramOpts{
			Namespace: cfg.Prefix,
			Subsystem: "http",
			Name:      "response_size_bytes",
			Help:      "The size of the HTTP responses.",
			Buckets:   cfg.SizeBuckets,
		}, []string{cfg.ServiceLabel, cfg.HandlerIDLabel, cfg.MethodLabel, cfg.StatusCodeLabel}),

		httpRequestsInflight: prometheus.NewGaugeVec(prometheus.GaugeOpts{
			Namespace: cfg.Prefix,
			Subsystem: "http",
			Name:      "requests_in_flight",
			Help:      "The number of inflight requests being handled at the same time.",
		}, []string{cfg.ServiceLabel, cfg.HandlerIDLabel}),
	}

	cfg.Registry.MustRegister(
		r.httpRequestDurHistogram,
		r.httpResponseSizeHistogram,
		r.httpRequestsInflight,
	)

	return r
}

func (r recorder) ObserveHTTPRequestDuration(_ context.Context, p metrics.HTTPReqProperties, duration time.Duration) {
	r.httpRequestDurHistogram.WithLabelValues(p.Service, p.ID, p.Method, p.Code).Observe(duration.Seconds())
}

func (r recorder) ObserveHTTPResponseSize(_ context.Context, p metrics.HTTPReqProperties, sizeBytes int64) {
	r.httpResponseSizeHistogram.WithLabelValues(p.Service, p.ID, p.Method, p.Code).Observe(float64(sizeBytes))
}

func (r recorder) AddInflightRequests(_ context.Context, p metrics.HTTPProperties, quantity int) {
	r.httpRequestsInflight.WithLabelValues(p.Service, p.ID).Add(float64(quantity))
}
```
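The recorder above covers the duration, response-size, and in-flight metrics; the `http_request_total` counter listed in section II is not part of this snippet. A minimal sketch of how such a counter could be registered and incremented (the `httpRequestsTotal` variable, the `observeRequest` helper, and the label set are illustrative assumptions, not taken from the project code):

```go
// A counter equivalent to http_request_total, keyed by the same labels
// the other HTTP metrics use.
var httpRequestsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
	Subsystem: "http",
	Name:      "request_total",
	Help:      "Total number of HTTP requests.",
}, []string{"service", "api", "method", "code"})

func init() {
	// Register with the default registry served by promhttp.Handler().
	prometheus.MustRegister(httpRequestsTotal)
}

// observeRequest is called once per completed request.
func observeRequest(service, api, method, code string) {
	httpRequestsTotal.WithLabelValues(service, api, method, code).Inc()
}
```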