I. Implementation Approach

1. Each microservice exposes its own metrics on a `/metrics` endpoint, acting as an exporter.
2. Deploy Prometheus and configure a ServiceMonitor to scrape each microservice's exporter; the exporter is reached through its Kubernetes Service address.

The ServiceMonitor looks like this:
```yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: test-monitor
  namespace: monitoring
spec:
  endpoints:
    - honorLabels: true
      interval: 10s
      port: http
      scheme: http
  namespaceSelector:
    matchNames:
      - test
  selector:
    matchLabels:
      app.kubernetes.io/group: test
```
3. On the monitoring platform, build Grafana dashboards for these metrics and configure alerting rules (a sketch of one such rule follows this list).
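As a minimal sketch of an alerting rule for the Prometheus Operator (the rule name, threshold, `for` duration, and labels are illustrative assumptions, not part of the design above):

```yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: test-alert-rules        # illustrative name
  namespace: monitoring
spec:
  groups:
    - name: http.rules
      rules:
        - alert: HighHTTPErrorRate          # illustrative alert
          # rate of non-200 HTTP responses per service over the last 5 minutes
          expr: sum(rate(http_request_duration_seconds_count{code!="200"}[5m])) by (service) > 1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "High HTTP error rate on {{ $labels.service }}"
```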
II. Microservice Metrics
| Metric group | Metric name | Type | Description |
|---|---|---|---|
| HTTP metrics | http_request_duration_seconds_count | Histogram | Request latency of HTTP endpoints |
| | http_request_total | Counter | Total number of HTTP requests |
| | http_response_size_bytes | Histogram | Size of HTTP responses |
| | http_requests_in_flight | Gauge | Number of concurrent (in-flight) HTTP requests |
| Go metrics | go_gc_duration_seconds_sum | Summary | GC pause time of the Go process |
| | go_goroutines | Gauge | Number of goroutines in the Go process |
| | go_memstats_alloc_bytes | Gauge | Heap bytes allocated and still in use by the Go process |
| gRPC metrics | grpc_server_handled_total | Counter | Number of completed RPCs per gRPC method |
| | grpc_server_handling_seconds_bucket | Histogram | Response latency per gRPC method |
| | grpc_server_started_total | Counter | Total number of RPCs started per gRPC method |
A complete sample of the exposed metrics:
```
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 2.5934e-05
go_gc_duration_seconds{quantile="0.25"} 3.062e-05
go_gc_duration_seconds{quantile="0.5"} 4.3478e-05
go_gc_duration_seconds{quantile="0.75"} 5.73e-05
go_gc_duration_seconds{quantile="1"} 0.000188399
go_gc_duration_seconds_sum 0.000660942
go_gc_duration_seconds_count 12
# HELP go_goroutines Number of goroutines that currently exist.
# TYPE go_goroutines gauge
go_goroutines 529
# HELP go_info Information about the Go environment.
# TYPE go_info gauge
go_info{version="go1.13.12"} 1
# HELP go_memstats_alloc_bytes Number of bytes allocated and still in use.
# TYPE go_memstats_alloc_bytes gauge
go_memstats_alloc_bytes 2.364408e+07
# HELP http_request_duration_seconds The latency of the HTTP requests.
# TYPE http_request_duration_seconds histogram
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="1"} 5
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="5"} 5
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="10"} 5
http_request_duration_seconds_sum{code="200",api="/v1/common/user/filter",method="GET",service="common"} 1.534460968
http_request_duration_seconds_count{code="200",api="/v1/common/user/filter",method="GET",service="common"} 5
# HELP http_requests_in_flight The number of inflight requests being handled at the same time.
# TYPE http_requests_in_flight gauge
http_requests_in_flight{api="/v1/common/user/filter",service="common"} 2
# HELP http_response_size_bytes The size of the HTTP responses.
# TYPE http_response_size_bytes histogram
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="0.005"} 0
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="0.01"} 0
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="0.025"} 0
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="0.05"} 0
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="0.1"} 0
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="0.25"} 6
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="0.5"} 6
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="1"} 6
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="2.5"} 6
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="5"} 6
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="10"} 6
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="+Inf"} 6
http_response_size_bytes_sum{code="200",api="/v1/common/user/filter",method="GET",service="common"} 44565
http_response_size_bytes_count{code="200",api="/v1/common/user/filter",method="GET",service="common"} 5
# HELP grpc_server_handled_total Total number of RPCs completed on the server, regardless of success or failure.
# TYPE grpc_server_handled_total counter
grpc_server_handled_total{grpc_code="OK",grpc_method="AddQuotaUsers",grpc_service="ecf.common.quota.Quota",grpc_type="unary"} 0
grpc_server_handled_total{grpc_code="OK",grpc_method="Bound",grpc_service="ecf.common.ssh.SSH",grpc_type="unary"} 0
grpc_server_handled_total{grpc_code="OK",grpc_method="Check",grpc_service="ecf.common.arrears.Arrears",grpc_type="unary"} 0
grpc_server_handled_total{grpc_code="OK",grpc_method="CheckEmailVerifyCode",grpc_service="ecf.common.message.Message",grpc_type="unary"} 0
grpc_server_handled_total{grpc_code="OK",grpc_method="CheckOfferLoad",grpc_service="ecf.common.offer.CommonOffer",grpc_type="unary"} 0
# HELP grpc_server_handling_seconds Histogram of response latency (seconds) of gRPC that had been application-level handled by the server.
# TYPE grpc_server_handling_seconds histogram
grpc_server_handling_seconds_bucket{grpc_method="AddQuotaUsers",grpc_service="ecf.common.quota.Quota",grpc_type="unary",le="0.005"} 0
grpc_server_handling_seconds_bucket{grpc_method="AddQuotaUsers",grpc_service="ecf.common.quota.Quota",grpc_type="unary",le="0.01"} 0
grpc_server_handling_seconds_bucket{grpc_method="AddQuotaUsers",grpc_service="ecf.common.quota.Quota",grpc_type="unary",le="0.025"} 0
# HELP grpc_server_started_total Total number of RPCs started on the server.
# TYPE grpc_server_started_total counter
grpc_server_started_total{grpc_method="AddQuotaUsers",grpc_service="ecf.common.quota.Quota",grpc_type="unary"} 0
grpc_server_started_total{grpc_method="Bound",grpc_service="ecf.common.ssh.SSH",grpc_type="unary"} 0
grpc_server_started_total{grpc_method="Check",grpc_service="ecf.common.arrears.Arrears",grpc_type="unary"} 0
grpc_server_started_total{grpc_method="CheckEmailVerifyCode",grpc_service="ecf.common.message.Message",grpc_type="unary"} 0
grpc_server_started_total{grpc_method="CheckOfferLoad",grpc_service="ecf.common.offer.CommonOffer",grpc_type="unary"} 0
grpc_server_started_total{grpc_method="Count",grpc_service="ecf.common.cluster.Cluster",grpc_type="unary"} 0
```
III. Metrics Configured on the Monitoring Platform
| Metric group | Metric / PromQL expression | Description |
|---|---|---|
| HTTP metrics | rate(http_request_duration_seconds_count{code!="200"}[30s]) | Error (non-200) request rate of HTTP endpoints |
| | sum(rate(http_request_total{}[5m])) by (service) | Per-second request rate over the last 5 minutes |
| | | Average endpoint response time above 200 ms (see the sketch after this table) |
| | sum(http_requests_in_flight) by (service) | Number of concurrent (in-flight) HTTP requests |
| Go metrics | go_gc_duration_seconds_sum | GC pause time of the Go process |
| | go_goroutines | Number of goroutines in the Go process |
| | go_memstats_alloc_bytes | Heap bytes allocated and still in use by the Go process |
| gRPC metrics | sum(rate(grpc_server_handled_total{grpc_type="unary",grpc_code!="OK"}[1m])) by (grpc_service) | gRPC error (non-OK) request rate per service |
| | histogram_quantile(0.9, sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[5m])) by (grpc_service,le)) | Estimated 90th-percentile RPC handling time per service |
| | | Average RPC response time over the last minute above 200 ms (see the sketch after this table) |
| | grpc_server_started_total | Total number of RPCs started per gRPC method |
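The two average-latency rows above are listed without an expression. As a minimal sketch of how they could be written (the 1-minute window and the grouping labels are assumptions), the average can be derived by dividing the rate of each histogram's `_sum` series by its `_count` series:

```promql
# HTTP: average request latency per service over 1m exceeds 200 ms
  sum(rate(http_request_duration_seconds_sum[1m])) by (service)
/ sum(rate(http_request_duration_seconds_count[1m])) by (service)
> 0.2

# gRPC: average unary handling time per service over 1m exceeds 200 ms
  sum(rate(grpc_server_handling_seconds_sum{grpc_type="unary"}[1m])) by (grpc_service)
/ sum(rate(grpc_server_handling_seconds_count{grpc_type="unary"}[1m])) by (grpc_service)
> 0.2
```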
IV. Metric Collection Code
gRPC metrics, implemented in the gRPC server:
```go
// grpc/server.go
import (
	"context"
	"net"

	grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus"
	"google.golang.org/grpc"
)

// RunServer runs the gRPC service.
func RunServer(ctx context.Context, port string, init InitServers) error {
	// add gRPC middleware (logging + Prometheus interceptors)
	var opts []grpc.ServerOption
	opts = grpc_interceptor.AddLogging(log.WithField("server", "grpc"), opts)

	// create the server with the interceptor chain
	server := grpc.NewServer(opts...)

	// register the application services (API implementations)
	init(server)

	// initialize the grpc_server_* collectors for this server
	grpc_prometheus.Register(server)

	lis, err := net.Listen("tcp", ":"+port)
	if err != nil {
		return err
	}
	return server.Serve(lis)
}

// grpc-interceptor/logger.go
import (
	grpc_middleware "github.com/grpc-ecosystem/go-grpc-middleware"
	grpc_logrus "github.com/grpc-ecosystem/go-grpc-middleware/logging/logrus"
	grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus"
	"github.com/sirupsen/logrus"
	"google.golang.org/grpc"
)

// AddLogging returns grpc.Server config options that turn on logging
// and Prometheus instrumentation.
func AddLogging(logger *logrus.Entry, opts []grpc.ServerOption) []grpc.ServerOption {
	// Shared options for the logger, with a custom gRPC code to log level function.
	o := []grpc_logrus.Option{
		grpc_logrus.WithLevels(codeToLevel),
	}

	// Enable the grpc_server_handling_seconds histogram.
	grpc_prometheus.EnableHandlingTimeHistogram()

	// Add unary interceptors.
	opts = append(opts, grpc_middleware.WithUnaryServerChain(
		grpc_logrus.UnaryServerInterceptor(logger, o...),
		grpc_prometheus.UnaryServerInterceptor,
	))

	// Add stream interceptors (included here as an example).
	opts = append(opts, grpc_middleware.WithStreamServerChain(
		grpc_logrus.StreamServerInterceptor(logger, o...),
		grpc_prometheus.StreamServerInterceptor,
	))
	return opts
}
```
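`grpc_prometheus.Register` only registers the collectors with the default Prometheus registry; the `/metrics` endpoint itself is served by the gateway handler in the next snippet. As a minimal standalone sketch for a service that does not run the HTTP gateway (the `serveMetrics` helper and its address parameter are illustrative, not part of the project code), the default registry can also be exposed directly:

```go
package main

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// serveMetrics exposes the default Prometheus registry (where
// grpc_prometheus registers its collectors) on its own HTTP listener.
func serveMetrics(addr string) error {
	mux := http.NewServeMux()
	mux.Handle("/metrics", promhttp.Handler())
	return http.ListenAndServe(addr, mux)
}
```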
HTTP metrics, implemented in the grpc-gateway:
```go
// rest/server.go
func RunServer(ctx context.Context, httpPort string, init InitMux, interceptors ...rest_interceptor.Interceptor) error {
	// more code...

	// wrap the gateway mux with request-ID, logging and /metrics handlers
	h := prometheus_interceptor.AddPrometheus(rest_interceptor.AddRequestID(
		rest_interceptor.AddLogger(log.WithField("gateway", "rest"), api)))

	// Create our middleware.
	recorder := metrics.NewRecorder(metrics.Config{
		DurationBuckets: []float64{1, 2.5, 5, 10, 20, 40, 80, 160, 320, 640},
	})
	mdlw := middleware.New(middleware.Config{
		Recorder: recorder,
		Service:  "common",
	})
	h = std.Handler("", mdlw, h)

	srv := &http.Server{
		Addr:         httpPort,
		ReadTimeout:  60 * time.Second,
		WriteTimeout: 60 * time.Second,
		// handler wrapped with the metrics middleware
		Handler: h,
	}
	return srv.ListenAndServe()
}

// prometheus_interceptor/prometheus.go
func AddPrometheus(h http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// serve the Prometheus exposition endpoint directly
		if strings.HasPrefix(r.URL.Path, "/metrics") {
			promhttp.Handler().ServeHTTP(w, r)
			return
		}
		h.ServeHTTP(w, r)
	})
}

// Reference: "github.com/slok/go-http-metrics/middleware/std"
type recorder struct {
	httpRequestDurHistogram   *prometheus.HistogramVec
	httpResponseSizeHistogram *prometheus.HistogramVec
	httpRequestsInflight      *prometheus.GaugeVec
}

// NewRecorder returns a new metrics recorder that implements the recorder
// using Prometheus as the backend.
func NewRecorder(cfg Config) metrics.Recorder {
	cfg.defaults()

	r := &recorder{
		httpRequestDurHistogram: prometheus.NewHistogramVec(prometheus.HistogramOpts{
			Namespace: cfg.Prefix,
			Subsystem: "http",
			Name:      "request_duration_seconds",
			Help:      "The latency of the HTTP requests.",
			Buckets:   cfg.DurationBuckets,
		}, []string{cfg.ServiceLabel, cfg.HandlerIDLabel, cfg.MethodLabel, cfg.StatusCodeLabel}),

		httpResponseSizeHistogram: prometheus.NewHistogramVec(prometheus.HistogramOpts{
			Namespace: cfg.Prefix,
			Subsystem: "http",
			Name:      "response_size_bytes",
			Help:      "The size of the HTTP responses.",
			Buckets:   cfg.SizeBuckets,
		}, []string{cfg.ServiceLabel, cfg.HandlerIDLabel, cfg.MethodLabel, cfg.StatusCodeLabel}),

		httpRequestsInflight: prometheus.NewGaugeVec(prometheus.GaugeOpts{
			Namespace: cfg.Prefix,
			Subsystem: "http",
			Name:      "requests_in_flight",
			Help:      "The number of inflight requests being handled at the same time.",
		}, []string{cfg.ServiceLabel, cfg.HandlerIDLabel}),
	}

	cfg.Registry.MustRegister(
		r.httpRequestDurHistogram,
		r.httpResponseSizeHistogram,
		r.httpRequestsInflight,
	)

	return r
}

func (r recorder) ObserveHTTPRequestDuration(_ context.Context, p metrics.HTTPReqProperties, duration time.Duration) {
	r.httpRequestDurHistogram.WithLabelValues(p.Service, p.ID, p.Method, p.Code).Observe(duration.Seconds())
}

func (r recorder) ObserveHTTPResponseSize(_ context.Context, p metrics.HTTPReqProperties, sizeBytes int64) {
	r.httpResponseSizeHistogram.WithLabelValues(p.Service, p.ID, p.Method, p.Code).Observe(float64(sizeBytes))
}

func (r recorder) AddInflightRequests(_ context.Context, p metrics.HTTPProperties, quantity int) {
	r.httpRequestsInflight.WithLabelValues(p.Service, p.ID).Add(float64(quantity))
}
```
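The recorder above covers the duration, response-size, and in-flight metrics; the `http_request_total` counter listed in section II is not part of this snippet. A minimal sketch of how such a counter could be registered and incremented (the `httpRequestsTotal` variable, the `observeRequest` helper, and the label set are illustrative assumptions, not taken from the project code):

```go
// A counter equivalent to http_request_total, keyed by the same labels
// the other HTTP metrics use.
var httpRequestsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
	Subsystem: "http",
	Name:      "request_total",
	Help:      "Total number of HTTP requests.",
}, []string{"service", "api", "method", "code"})

func init() {
	// Register with the default registry served by promhttp.Handler().
	prometheus.MustRegister(httpRequestsTotal)
}

// observeRequest is called once per completed request.
func observeRequest(service, api, method, code string) {
	httpRequestsTotal.WithLabelValues(service, api, method, code).Inc()
}
```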