Commit c259566 · "works ish"
Parent(s): f460af4

Files changed:
- app.py +57 -44
- leaderboard/md.py +1 -1
- leaderboard/utils.py +6 -3
app.py
CHANGED
@@ -12,6 +12,14 @@ from leaderboard.css import custom_css
 from leaderboard.md import *
 from leaderboard.utils import load_all_data
 
+# get v1 data
+rb_orig_snapshot = pd.read_csv("leaderboard/final-rbv1-data.csv")
+# rename column "Unnamed: 0" to ""
+rb_orig_snapshot = rb_orig_snapshot.rename(columns={"Unnamed: 0": ""})
+# rb_orig_snapshot = rb_orig_snapshot.drop(columns=["Unnamed: 0", ''])
+rb_orig_snapshot.reset_index(drop=True, inplace=True)
+
+# import ipdb; ipdb.set_trace()
 #######################################################
 # Setup                                               #
 #######################################################
@@ -152,11 +160,10 @@ rewardbench_data_avg = avg_over_rewardbench_v2(rewardbench_data).sort_values(by=
 rewardbench_data = prep_df(rewardbench_data)
 rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})
 
-col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data.columns) - 1)
-col_types_rewardbench_avg = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
 
-
-
+
+col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
+col_types_rewardbench_v1 = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rb_orig_snapshot.columns) - 1)
 
 ###########################################
 # Leaderboard Helpers & Setting           #
@@ -297,6 +304,11 @@ total_models = len(
         rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"], style=False
     ).values
 )
+total_models_v1 = len(
+    regex_table(
+        rb_orig_snapshot.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"], style=False
+    ).values
+)
 assets = Path("leaderboard").resolve()  # absolute dir with the image
 
 # Using a string for a predefined color
@@ -352,7 +364,7 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
                 # reference data
                 rewardbench_table_hidden = gr.Dataframe(
                     rewardbench_data_avg.values,
-                    datatype=col_types_rewardbench_avg,
+                    datatype=col_types_rewardbench_v1,
                     headers=rewardbench_data_avg.columns.tolist(),
                     visible=False,
                 )
@@ -362,7 +374,7 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
                         "",
                         ["Seq. Classifiers", "Custom Classifiers", "Generative"],
                     ),
-                    datatype=col_types_rewardbench_avg,
+                    datatype=col_types_rewardbench_v1,
                     headers=rewardbench_data_avg.columns.tolist(),
                     elem_id="rewardbench_dataframe_avg",
                     max_height=800,  # 800 px → ~25 rows on default row-height
@@ -385,42 +397,42 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
             button_data.click(fn=random_sample, inputs=[subset_selector], outputs=[sample_display])
     with gr.TabItem("RewardBench", scale=1.5):
         with gr.Row():
-            gr.Markdown(CAPTION_V1.format(str(total_models)))
+            gr.Markdown(CAPTION_V1.format(str(total_models_v1)))
         with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
             with gr.TabItem("Leaderboard"):
                 pass
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                with gr.Row():
+                    search_1_v1 = gr.Textbox(
+                        label="Model Search (delimit with , )",
+                        placeholder="Model Search (delimit with , )",
+                        show_label=False,
+                    )
+                    model_types_1_v1 = gr.CheckboxGroup(
+                        ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
+                        value=["Seq. Classifiers", "Custom Classifiers", "Generative"],
+                        label="Model Types",
+                        show_label=False,
+                        # info="Which model types to include.",
+                    )
+                with gr.Row():
+                    # reference data
+                    rewardbench_table_hidden_v1 = gr.Dataframe(
+                        rb_orig_snapshot.values,
+                        datatype=col_types_rewardbench,
+                        headers=rb_orig_snapshot.columns.tolist(),
+                        visible=False,
+                    )
+                    rewardbench_table_v1 = gr.Dataframe(
+                        regex_table(
+                            rb_orig_snapshot.copy(),
+                            "",
+                            ["Seq. Classifiers", "Custom Classifiers", "Generative"],
+                        ),
+                        datatype=col_types_rewardbench,
+                        headers=rb_orig_snapshot.columns.tolist(),
+                        elem_id="rewardbench_dataframe_avg_v1",
+                        max_height=800,  # 800 px → ~25 rows on default row-height
+                    )
             with gr.TabItem("About"):
                 with gr.Row():
                     gr.Markdown(ABOUT_TEXT_V1)
@@ -433,19 +445,20 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
                     button_data_v1 = gr.Button("Show Random Sample")
 
                 with gr.Row():
-
-
-                    button_data_v1.click(fn=random_sample_v1, inputs=[subset_selector_v1], outputs=[sample_display])
+                    sample_display_v1 = gr.Markdown("{sampled data loads here}")
 
+                button_data_v1.click(fn=random_sample_v1, inputs=[subset_selector_v1], outputs=[sample_display_v1])
 
 
     search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
-
+    search_1_v1.change(regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1)
 
     model_types_1.change(
        regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table
    )
-
+    model_types_1_v1.change(
+        regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1
+    )
 
     with gr.Row():
         with gr.Accordion("📚 Citation", open=False):
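Note on the pattern above: the new v1 tab reuses the app's existing filter wiring, in which the full, unfiltered table sits in a hidden gr.Dataframe and every change to the search Textbox or the model-type CheckboxGroup re-runs the filter against that hidden copy and writes the result into the visible table. Below is a minimal, self-contained sketch of that pattern; toy_df and filter_table are invented stand-ins for the app's real data and its regex_table helper, so the details differ from app.py.

import gradio as gr
import pandas as pd

# Toy data standing in for the leaderboard snapshot (illustrative only).
toy_df = pd.DataFrame(
    {
        "Model": ["model-a", "model-b", "model-c"],
        "Model Type": ["Seq. Classifiers", "Generative", "DPO"],
        "Score": [81.2, 77.5, 69.9],
    }
)

def filter_table(df, query, model_types):
    # Stand-in for regex_table(): comma-delimited regex search over model
    # names plus a model-type filter.
    out = df[df["Model Type"].isin(model_types)]
    if query:
        patterns = [p.strip() for p in query.split(",") if p.strip()]
        if patterns:
            out = out[out["Model"].str.contains("|".join(patterns), regex=True)]
    return out

default_types = ["Seq. Classifiers", "Custom Classifiers", "Generative"]

with gr.Blocks() as demo:
    search = gr.Textbox(placeholder="Model Search (delimit with , )", show_label=False)
    types = gr.CheckboxGroup(
        ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
        value=default_types,
        show_label=False,
    )
    hidden = gr.Dataframe(toy_df, visible=False)  # unfiltered reference copy
    table = gr.Dataframe(filter_table(toy_df, "", default_types))

    # Same wiring as the *_v1 widgets in this commit: either input re-runs the filter.
    search.change(filter_table, inputs=[hidden, search, types], outputs=table)
    types.change(filter_table, inputs=[hidden, search, types], outputs=table)

if __name__ == "__main__":
    demo.launch()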
leaderboard/md.py
CHANGED
@@ -112,7 +112,7 @@ CAPTION_V2 = f"""The *new version* of RewardBench that is based on unseen human
 
 CAPTION_V1 = """The original RewardBench -- the first reward model evaluation.
 
-**Note**: This leaderboard is frozen and will not be updated. The final version of the evaluation results are available
+**Note**: This leaderboard is frozen and will not be updated. The final version of the evaluation results are available in the source for this application.
 
 ⚠️ Many of the top models were trained on unintentionally contaminated, AI-generated data, for more information, see this [gist](https://gist.github.com/natolambert/1aed306000c13e0e8c5bc17c1a5dd300).
 """
leaderboard/utils.py
CHANGED
@@ -6,7 +6,10 @@ import numpy as np
 import pandas as pd
 from datasets import load_dataset
 
-UNVERIFIED_MODELS = [
+UNVERIFIED_MODELS = []
+CONTAMINATED_MODELS = []
+
+UNVERIFIED_MODELS_V1 = [
     "nvidia/Nemotron-4-340B-Reward",
     "nvidia/Llama3-70B-SteerLM-RM",
     "Cohere May 2024",
@@ -24,7 +27,8 @@ UNVERIFIED_MODELS = [
     "nvidia/Llama-3.1-Nemotron-70B-Reward",
 ]
 
-CONTAMINATED_MODELS = [
+# No longer used
+CONTAMINATED_MODELS_V1 = [
     "Skywork/Skywork-Reward-Gemma-2-27B",
     "Skywork/Skywork-Critic-Llama-3.1-70B",
     "LxzGordon/URM-LLaMa-3.1-8B",
@@ -39,7 +43,6 @@ CONTAMINATED_MODELS = [
     "Ray2333/GRM-Gemma-2B-rewardmodel-ft",
 ]
 
-
 # From Open LLM Leaderboard
 def model_hyperlink(link, model_name):
     # if model_name is above 50 characters, return first 47 characters and "..."