添加应用程序级别指标
Ray 在 ray.util.metrics 中提供了一组便捷的 API，用于定义和导出自定义指标，帮助您观测自己的应用程序。目前支持三种指标：Counter、Gauge 和 Histogram，它们分别对应 Prometheus 中的同名指标类型。以下是一个使用这些 API 导出指标的 Actor 的简单示例：
import time
import ray
from ray.util.metrics import Counter, Gauge, Histogram
# Start Ray with metrics exported on port 8080, so they can be viewed at
# http://localhost:8080 while this script is running.
ray.init(_metrics_export_port=8080)
@ray.remote
class MyActor:
    """Example actor that exports a counter, a gauge and a histogram.

    Every metric declares an ``actor_name`` tag key, and its default tag
    value is the name passed to the constructor.
    """

    def __init__(self, name):
        """Create the three metrics and tag them with this actor's name.

        Args:
            name: Value used for the ``actor_name`` tag on every metric.
        """
        self._curr_count = 0

        self.counter = Counter(
            "num_requests",
            description="Number of requests processed by the actor.",
            tag_keys=("actor_name",),
        )
        self.counter.set_default_tags({"actor_name": name})

        self.gauge = Gauge(
            "curr_count",
            description="Current count held by the actor. Goes up and down.",
            tag_keys=("actor_name",),
        )
        self.gauge.set_default_tags({"actor_name": name})

        self.histogram = Histogram(
            "request_latency",
            description="Latencies of requests in ms.",
            boundaries=[0.1, 1],
            tag_keys=("actor_name",),
        )
        self.histogram.set_default_tags({"actor_name": name})

    def process_request(self, num):
        """Add ``num`` to the running count and record metrics for the call.

        Args:
            num: Amount to add to the current count; may be negative.

        Returns:
            The updated count.
        """
        # Use a monotonic, high-resolution clock for the elapsed time:
        # time.time() follows the wall clock and can jump (even backwards)
        # when the system clock is adjusted, which would corrupt the latency.
        start = time.perf_counter()
        self._curr_count += num
        # Increment the total request count.
        self.counter.inc()
        # Update the gauge to the new value.
        self.gauge.set(self._curr_count)
        # Record the latency for this request in ms (seconds * 1000).
        self.histogram.observe(1000 * (time.perf_counter() - start))
        return self._curr_count
print("Starting actor.")
my_actor = MyActor.remote("my_actor")

# Issue two requests in order: first -10, then 5.
for delta in (-10, 5):
    print("Calling actor.")
    my_actor.process_request.remote(delta)

print("Metrics should be exported.")
print("See http://localhost:8080 (this may take a few seconds to load).")

# Pause for 30 seconds so the metrics can be inspected before the
# script exits.
time.sleep(30)
print("Exiting!")
脚本运行期间，指标会被导出到 ``localhost:8080``（这是 Prometheus 被配置为抓取的端点）。在浏览器中打开此地址，您应该会看到类似以下的输出（延迟等具体数值每次运行会有所不同）：
# HELP ray_request_latency Latencies of requests in ms.
# TYPE ray_request_latency histogram
ray_request_latency_bucket{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor",le="0.1"} 2.0
ray_request_latency_bucket{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor",le="1.0"} 2.0
ray_request_latency_bucket{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor",le="+Inf"} 2.0
ray_request_latency_count{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor"} 2.0
ray_request_latency_sum{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor"} 0.11992454528808594
# HELP ray_curr_count Current count held by the actor. Goes up and down.
# TYPE ray_curr_count gauge
ray_curr_count{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor"} -5.0
# HELP ray_num_requests_total Number of requests processed by the actor.
# TYPE ray_num_requests_total counter
ray_num_requests_total{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor"} 2.0
更多详情请参见 ray.util.metrics。