Adding application-level metrics

Ray provides a convenient API in ray.util.metrics for defining and exporting custom metrics that give visibility into your applications. Three metric types are supported: Counter, Gauge, and Histogram. These map to the Prometheus metric types of the same names. Below is a simple example of an actor that exports metrics using these APIs:

import time

import ray
from ray.util.metrics import Counter, Gauge, Histogram

# Export metrics on port 8080 so they can be viewed or scraped at localhost:8080.
ray.init(_metrics_export_port=8080)


@ray.remote
class MyActor:
    def __init__(self, name):
        self._curr_count = 0

        # A Counter can only be incremented; use it for monotonically increasing values.
        self.counter = Counter(
            "num_requests",
            description="Number of requests processed by the actor.",
            tag_keys=("actor_name",),
        )
        self.counter.set_default_tags({"actor_name": name})

        # A Gauge tracks a value that can go both up and down.
        self.gauge = Gauge(
            "curr_count",
            description="Current count held by the actor. Goes up and down.",
            tag_keys=("actor_name",),
        )
        self.gauge.set_default_tags({"actor_name": name})

        # A Histogram records observations into the buckets defined by `boundaries`.
        self.histogram = Histogram(
            "request_latency",
            description="Latencies of requests in ms.",
            boundaries=[0.1, 1],
            tag_keys=("actor_name",),
        )
        self.histogram.set_default_tags({"actor_name": name})

    def process_request(self, num):
        start = time.time()
        self._curr_count += num

        # Increment the total request count.
        self.counter.inc()
        # Update the gauge to the new value.
        self.gauge.set(self._curr_count)
        # Record the latency for this request in ms.
        self.histogram.observe(1000 * (time.time() - start))

        return self._curr_count


print("Starting actor.")
my_actor = MyActor.remote("my_actor")
print("Calling actor.")
my_actor.process_request.remote(-10)
print("Calling actor.")
my_actor.process_request.remote(5)
print("Metrics should be exported.")
print("See http://localhost:8080 (this may take a few seconds to load).")

# Sleep so we can look at the metrics before exiting.
time.sleep(30)
print("Exiting!")

While the script is running, the metrics are exported to ``localhost:8080`` (this is the endpoint that Prometheus is configured to scrape). Open this address in your browser. You should see output like the following:

# HELP ray_request_latency Latencies of requests in ms.
# TYPE ray_request_latency histogram
ray_request_latency_bucket{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor",le="0.1"} 2.0
ray_request_latency_bucket{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor",le="1.0"} 2.0
ray_request_latency_bucket{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor",le="+Inf"} 2.0
ray_request_latency_count{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor"} 2.0
ray_request_latency_sum{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor"} 0.11992454528808594
# HELP ray_curr_count Current count held by the actor. Goes up and down.
# TYPE ray_curr_count gauge
ray_curr_count{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor"} -15.0
# HELP ray_num_requests_total Number of requests processed by the actor.
# TYPE ray_num_requests_total counter
ray_num_requests_total{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor"} 2.0
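Instead of opening the endpoint in a browser, you can also read it programmatically. The following is a minimal sketch (not part of the example above) that fetches the exported Prometheus text format with the standard library and prints only the custom series defined by the actor; it assumes the script above is still running and exporting on port 8080:

import urllib.request

# Fetch the metrics from the port passed to ray.init() via _metrics_export_port.
with urllib.request.urlopen("http://localhost:8080") as resp:
    body = resp.read().decode("utf-8")

# Application-level metrics are exported with a "ray_" prefix; print only ours.
for line in body.splitlines():
    if line.startswith(("ray_num_requests", "ray_curr_count", "ray_request_latency")):
        print(line)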

For more details, see ray.util.metrics.
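The example above attaches a single default tag with set_default_tags. The recording calls (``inc``, ``set``, ``observe``) also accept an explicit tags dict, which is useful when a tag value changes from call to call. Below is a minimal sketch, assuming ray.init() has already been called; the ``method`` tag key and metric name are illustrative, not part of the example above:

from ray.util.metrics import Counter

# Declare every tag key up front; values can then be supplied per call.
requests_by_method = Counter(
    "num_requests_by_method",
    description="Requests processed, partitioned by method name.",
    tag_keys=("actor_name", "method"),
)
requests_by_method.set_default_tags({"actor_name": "my_actor"})

# The "method" tag varies per call, while "actor_name" falls back to the default.
requests_by_method.inc(tags={"method": "process_request"})
requests_by_method.inc(tags={"method": "reset"})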