Add parsed training metrics and plots
Browse files- .gitattributes +1 -0
- training_logs/20260428_203121_metrics_job_389754.csv +15 -0
- training_logs/20260428_203121_metrics_report.md +223 -0
- training_logs/20260428_203121_metrics_table.csv +15 -0
- training_logs/20260428_203121_reward_vs_steps.png +3 -0
- training_logs/20260428_203121_trial_results.csv +0 -0
- training_logs/20260428_203121_turn_count_distribution.png +0 -0
- training_logs/20260428_203121_vllm_metrics_job_389754.csv +0 -0
- training_logs/20260428_203121_vllm_metrics_table.csv +0 -0
.gitattributes
CHANGED
|
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
training_logs/20260428_203121_reward_vs_steps.png filter=lfs diff=lfs merge=lfs -text
|
training_logs/20260428_203121_metrics_job_389754.csv
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
async/discard_rate,async/discarded_count,async/effective_batch_groups,async/effective_batch_samples,async/staleness_max,async/staleness_mean,async/staleness_min,async/staleness_ratio,generate/avg_num_tokens,generate/avg_tokens_non_zero_rewards,generate/avg_tokens_zero_rewards,generate/max_num_tokens,generate/min_num_tokens,generate/std_num_tokens,loss/avg_final_rewards,loss/avg_raw_advantages,loss/avg_raw_advantages_abs,policy/final_loss,policy/policy_entropy,policy/policy_loss,policy/policy_lr,policy/policy_update_steps,policy/ppo_clip_ratio,policy/raw_grad_norm,reward/avg_pass_at_8,reward/avg_raw_reward,system/process_rss_gb,system/process_vms_gb,system/ram_available_gb,system/ram_percent,system/ram_total_gb,system/ram_used_gb,timing/cleanup_old_checkpoints,timing/compute_advantages_and_returns,timing/convert_to_training_input,timing/fwd_logprobs_values_reward,timing/policy_train,timing/run_training,timing/save_checkpoints,timing/step,timing/sync_weights,timing/train_critic_and_policy,timing/wait_for_generation_buffer,trainer/epoch,trainer/global_step,batch_errors/total_batches,batch_errors/total_instances,batch_errors/total_successful,batch_errors/total_failed,batch_errors/total_masked,timing/save_hf_model
|
| 2 |
+
0.0,0,64,512,0,0.0,0,0.0,3837.043,1844.4,3856.6943,11791,1000,1728.238,0.0098,-0.003,0.0108,-0.0,0.2114,-0.0,0.0,1.0,0.0,0.0146,0.0469,0.0098,11.0404,73.0374,334.9649,61.0,857.9687,523.0038,10.3772,0.0525,2.3302,54.8335,599.7188,654.8685,44.3151,2252.5445,55.9998,599.982,1539.3458,0,1,128.0,1024.0,1024.0,0.0,0.0,
|
| 3 |
+
0.0,0,64,512,1,1.0,1,1.0,3734.7441,4189.0,3722.9098,14143,1008,1771.7086,0.0254,-0.003,0.0135,0.0,0.2119,0.0,0.0,1.0,0.0,0.0147,0.0781,0.0254,12.3731,73.238,331.4794,61.4,857.9687,526.4893,0.0156,0.0625,2.608,51.6906,593.7021,645.7138,33.1731,927.602,58.3788,593.9603,220.8803,0,2,64.0,512.0,512.0,0.0,0.0,
|
| 4 |
+
0.0,0,64,512,2,2.0,2,1.0,3680.1211,3429.5,3685.1135,17798,960,1903.0569,0.0195,-0.0057,0.0404,0.0,0.2131,0.0,0.0,1.0,0.0,0.017,0.125,0.0195,15.006,73.3821,331.1569,61.4,857.9687,526.8118,0.0079,0.0712,3.338,54.9564,607.7339,663.0308,33.7102,1161.2,54.4484,608.0028,440.3575,0,3,64.0,512.0,512.0,0.0,0.0,
|
| 5 |
+
0.0,0,64,512,3,3.0,3,1.0,4038.8242,3594.9412,4054.0687,19942,1070,2204.7849,0.0332,-0.0083,0.0398,0.0,0.1988,0.0,0.0,1.0,0.0,0.0159,0.1094,0.0332,18.2571,73.6598,332.0689,61.3,857.9687,525.8998,0.0123,0.0638,3.988,64.8817,648.7096,713.9267,33.5855,774.7256,56.7814,648.9808,0.0053,0,4,64.0,512.0,512.0,0.0,0.0,
|
| 6 |
+
0.0,0,64,512,4,4.0,4,1.0,3734.0176,3721.08,3734.6817,13083,1049,1658.2439,0.0488,-0.0016,0.0331,0.0,0.2072,0.0,0.0,1.0,0.0,0.0298,0.125,0.0488,18.5352,73.6546,329.1295,61.6,857.9687,528.8392,0.0099,0.0428,2.9611,54.8169,588.7437,643.8552,33.7826,946.1626,58.0555,588.9951,241.2625,0,5,64.0,512.0,512.0,0.0,0.0,38.1036
|
| 7 |
+
0.0,0,64,512,5,5.0,5,1.0,3678.4492,2779.8125,3707.4375,16575,982,1778.7688,0.0312,-0.0069,0.0393,0.0,0.2064,0.0,0.0,1.0,0.0,0.0194,0.1406,0.0312,18.5881,73.6669,327.9209,61.8,857.9687,530.0478,0.0087,0.0617,3.0487,49.8621,590.8918,641.066,34.0988,888.5801,61.7658,591.1417,182.6699,0,6,64.0,512.0,512.0,0.0,0.0,
|
| 8 |
+
0.0,0,64,512,6,6.0,6,1.0,4026.377,2695.15,4080.4919,30105,860,2435.5002,0.0391,-0.0117,0.0579,0.0,0.2004,0.0,0.0,1.0,0.0,0.0242,0.2031,0.0391,24.2751,73.9734,398.3101,53.6,857.9687,459.6586,0.0095,0.0949,4.4248,61.0183,660.677,722.0682,35.0715,2094.5908,55.4597,660.9545,1312.6129,0,7,64.0,512.0,512.0,0.0,0.0,
|
| 9 |
+
0.0,0,64,512,0,0.0,0,0.0,3169.8066,2304.1277,3257.3054,9841,950,1527.965,0.0918,-0.023,0.1174,0.0,0.2084,0.0,0.0,1.0,0.0,0.0364,0.3281,0.0918,24.1753,73.8303,401.2653,53.2,857.9687,456.7034,0.0112,0.0425,2.0132,44.7602,566.5425,611.7071,33.0222,1954.7684,56.9235,566.9038,1284.0936,1,8,64.0,512.0,512.0,0.0,0.0,
|
| 10 |
+
0.0,0,64,512,1,1.0,1,1.0,3208.3555,2564.1489,3273.4688,19175,927,1685.1496,0.0918,-0.0201,0.1149,0.0,0.2081,0.0,0.0,1.0,0.0,0.0337,0.3281,0.0918,24.6965,74.2126,396.9511,53.7,857.9687,461.0176,0.0079,0.0446,3.5998,47.727,576.6595,624.6952,33.46,810.2478,57.0584,576.9233,124.8704,1,9,64.0,512.0,512.0,0.0,0.0,
|
| 11 |
+
0.0,0,64,512,2,2.0,2,1.0,3237.8379,2406.9649,3341.9253,12501,890,1700.2375,0.1113,-0.0333,0.1439,0.0,0.2076,0.0,0.0,1.0,0.0,0.0374,0.3906,0.1113,24.8782,74.2612,391.3597,54.4,857.9687,466.609,0.0092,0.0533,2.4828,50.2219,593.8531,644.3998,33.4188,912.5225,56.7812,594.124,208.8362,1,10,64.0,512.0,512.0,0.0,0.0,37.5099
|
| 12 |
+
0.0,0,64,512,3,3.0,3,1.0,3001.6465,2650.619,3050.8998,13406,941,1483.113,0.123,-0.0266,0.1658,0.0,0.199,0.0,0.0,1.0,0.0,0.0435,0.4219,0.123,25.0717,74.3157,387.2821,54.9,857.9687,470.6866,0.0168,0.0371,2.6113,41.7835,561.0683,603.1351,33.5831,731.0664,56.667,561.3141,68.6264,1,11,64.0,512.0,512.0,0.0,0.0,
|
| 13 |
+
0.0,0,64,512,4,4.0,4,1.0,2883.377,2714.9545,2918.3325,14702,798,1601.3015,0.1719,-0.0285,0.1735,-0.0,0.1995,-0.0,0.0,1.0,0.0,0.0405,0.4219,0.1719,25.1736,74.6695,384.2253,55.2,857.9687,473.7434,0.011,0.0379,2.9524,47.2009,561.4127,608.9848,33.1065,931.2563,58.3024,561.7455,260.9926,1,12,64.0,512.0,512.0,0.0,0.0,
|
| 14 |
+
0.0,0,64,512,5,5.0,5,1.0,2857.6934,2411.3333,2960.6995,13216,868,1685.3561,0.1875,-0.0066,0.1342,0.0,0.1896,0.0,0.0,1.0,0.0,0.0396,0.3906,0.1875,25.3254,74.5136,380.6907,55.6,857.9687,477.278,0.0093,0.0471,3.5522,43.8172,560.515,604.6342,33.2148,682.4333,57.7062,560.7695,16.5161,1,13,64.0,512.0,512.0,0.0,0.0,
|
| 15 |
+
0.0,0,64,512,6,6.0,6,1.0,3206.502,2278.8974,3373.2143,15130,906,2031.0791,0.1523,-0.0278,0.1606,-0.0,0.1855,-0.0,0.0,1.0,0.0,0.0446,0.4688,0.1523,25.6977,74.7123,378.3962,55.9,857.9687,479.5725,0.0096,0.0577,2.5454,52.8053,611.3389,664.4901,33.0486,4631.2906,13.904,611.6265,3950.3283,1,14,,,,,,
|
training_logs/20260428_203121_metrics_report.md
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SkyRL Training Metrics Analysis
|
| 2 |
+
|
| 3 |
+
Generated from 1 log file
|
| 4 |
+
|
| 5 |
+
## Overview
|
| 6 |
+
|
| 7 |
+
| Log File | Total Steps | Metric Blocks | Final Reward (mean) | Final Reward (max) | Total Time (s) |
|
| 8 |
+
|----------|-------------|---------------|---------------------|-------------------|----------------|
|
| 9 |
+
| job_389754 | 14 | 14 | 0.0812 | 0.1875 | 19699.0 |
|
| 10 |
+
|
| 11 |
+
## Async Metrics
|
| 12 |
+
|
| 13 |
+
| | Mean | Std | Min | Max | Count |
|
| 14 |
+
|:------------------------------|-----------:|---------:|------:|------:|--------:|
|
| 15 |
+
| async/discard_rate | 0 | 0 | 0 | 0 | 14 |
|
| 16 |
+
| async/discarded_count | 0 | 0 | 0 | 0 | 14 |
|
| 17 |
+
| async/effective_batch_groups | 64 | 0 | 64 | 64 | 14 |
|
| 18 |
+
| async/effective_batch_samples | 512 | 0 | 512 | 512 | 14 |
|
| 19 |
+
| async/staleness_max | 3 | 2.0755 | 0 | 6 | 14 |
|
| 20 |
+
| async/staleness_mean | 3 | 2.0755 | 0 | 6 | 14 |
|
| 21 |
+
| async/staleness_min | 3 | 2.0755 | 0 | 6 | 14 |
|
| 22 |
+
| async/staleness_ratio | 0.857143 | 0.363137 | 0 | 1 | 14 |
|
| 23 |
+
|
| 24 |
+
## Generate Metrics
|
| 25 |
+
|
| 26 |
+
| | Mean | Std | Min | Max | Count |
|
| 27 |
+
|:-------------------------------------|---------:|----------:|--------:|---------:|--------:|
|
| 28 |
+
| generate/avg_num_tokens | 3449.63 | 412.285 | 2857.69 | 4038.82 | 14 |
|
| 29 |
+
| generate/avg_tokens_non_zero_rewards | 2827.49 | 657.972 | 1844.4 | 4189 | 14 |
|
| 30 |
+
| generate/avg_tokens_zero_rewards | 3501.23 | 385.844 | 2918.33 | 4080.49 | 14 |
|
| 31 |
+
| generate/max_num_tokens | 15814.9 | 4995.58 | 9841 | 30105 | 14 |
|
| 32 |
+
| generate/min_num_tokens | 943.5 | 75.7889 | 798 | 1070 | 14 |
|
| 33 |
+
| generate/std_num_tokens | 1799.61 | 264.344 | 1483.11 | 2435.5 | 14 |
|
| 34 |
+
|
| 35 |
+
## Loss Metrics
|
| 36 |
+
|
| 37 |
+
| | Mean | Std | Min | Max | Count |
|
| 38 |
+
|:----------------------------|-----------:|----------:|--------:|--------:|--------:|
|
| 39 |
+
| loss/avg_final_rewards | 0.0811857 | 0.0601946 | 0.0098 | 0.1875 | 14 |
|
| 40 |
+
| loss/avg_raw_advantages | -0.0147214 | 0.0112676 | -0.0333 | -0.0016 | 14 |
|
| 41 |
+
| loss/avg_raw_advantages_abs | 0.0889357 | 0.0606754 | 0.0108 | 0.1735 | 14 |
|
| 42 |
+
|
| 43 |
+
## Policy Metrics
|
| 44 |
+
|
| 45 |
+
| | Mean | Std | Min | Max | Count |
|
| 46 |
+
|:---------------------------|----------:|-----------:|--------:|-------:|--------:|
|
| 47 |
+
| policy/final_loss | 0 | 0 | -0 | 0 | 14 |
|
| 48 |
+
| policy/policy_entropy | 0.20335 | 0.00829298 | 0.1855 | 0.2131 | 14 |
|
| 49 |
+
| policy/policy_loss | 0 | 0 | -0 | 0 | 14 |
|
| 50 |
+
| policy/policy_lr | 0 | 0 | 0 | 0 | 14 |
|
| 51 |
+
| policy/policy_update_steps | 1 | 0 | 1 | 1 | 14 |
|
| 52 |
+
| policy/ppo_clip_ratio | 0 | 0 | 0 | 0 | 14 |
|
| 53 |
+
| policy/raw_grad_norm | 0.0293786 | 0.0113914 | 0.0146 | 0.0446 | 14 |
|
| 54 |
+
|
| 55 |
+
## Reward Metrics
|
| 56 |
+
|
| 57 |
+
| | Mean | Std | Min | Max | Count |
|
| 58 |
+
|:----------------------|----------:|----------:|-------:|-------:|--------:|
|
| 59 |
+
| reward/avg_pass_at_8 | 0.255579 | 0.15045 | 0.0469 | 0.4688 | 14 |
|
| 60 |
+
| reward/avg_raw_reward | 0.0811857 | 0.0601946 | 0.0098 | 0.1875 | 14 |
|
| 61 |
+
|
| 62 |
+
## System Metrics
|
| 63 |
+
|
| 64 |
+
| | Mean | Std | Min | Max | Count |
|
| 65 |
+
|:------------------------|---------:|-------------:|---------:|---------:|--------:|
|
| 66 |
+
| system/process_rss_gb | 20.9352 | 5.21234 | 11.0404 | 25.6977 | 14 |
|
| 67 |
+
| system/process_vms_gb | 73.9377 | 0.529341 | 73.0374 | 74.7123 | 14 |
|
| 68 |
+
| system/ram_available_gb | 364.657 | 30.8191 | 327.921 | 401.265 | 14 |
|
| 69 |
+
| system/ram_percent | 57.5 | 3.59893 | 53.2 | 61.8 | 14 |
|
| 70 |
+
| system/ram_total_gb | 857.969 | 2.35957e-13 | 857.969 | 857.969 | 14 |
|
| 71 |
+
| system/ram_used_gb | 493.311 | 30.8191 | 456.703 | 530.048 | 14 |
|
| 72 |
+
|
| 73 |
+
## Timing Metrics
|
| 74 |
+
|
| 75 |
+
| | Mean | Std | Min | Max | Count |
|
| 76 |
+
|:--------------------------------------|-------------:|-------------:|---------:|----------:|--------:|
|
| 77 |
+
| timing/cleanup_old_checkpoints | 0.75115 | 2.77057 | 0.0079 | 10.3772 | 14 |
|
| 78 |
+
| timing/compute_advantages_and_returns | 0.0549714 | 0.0155364 | 0.0371 | 0.0949 | 14 |
|
| 79 |
+
| timing/convert_to_training_input | 3.03256 | 0.678105 | 2.0132 | 4.4248 | 14 |
|
| 80 |
+
| timing/fwd_logprobs_values_reward | 51.4554 | 6.453 | 41.7835 | 64.8817 | 14 |
|
| 81 |
+
| timing/policy_train | 594.398 | 30.7826 | 560.515 | 660.677 | 14 |
|
| 82 |
+
| timing/run_training | 646.184 | 36.8893 | 603.135 | 722.068 | 14 |
|
| 83 |
+
| timing/save_checkpoints | 34.3279 | 2.92364 | 33.0222 | 44.3151 | 14 |
|
| 84 |
+
| timing/step | 1407.07 | 1069.28 | 682.433 | 4631.29 | 14 |
|
| 85 |
+
| timing/sync_weights | 54.1594 | 11.7091 | 13.904 | 61.7658 | 14 |
|
| 86 |
+
| timing/train_critic_and_policy | 594.673 | 30.7772 | 560.769 | 660.955 | 14 |
|
| 87 |
+
| timing/wait_for_generation_buffer | 703.671 | 1069.9 | 0.0053 | 3950.33 | 14 |
|
| 88 |
+
| timing/save_hf_model | 37.8068 | 0.419809 | 37.5099 | 38.1036 | 2 |
|
| 89 |
+
|
| 90 |
+
## Trainer Metrics
|
| 91 |
+
|
| 92 |
+
| | Mean | Std | Min | Max | Count |
|
| 93 |
+
|:--------------------|-------:|---------:|------:|------:|--------:|
|
| 94 |
+
| trainer/epoch | 0.5 | 0.518875 | 0 | 1 | 14 |
|
| 95 |
+
| trainer/global_step | 7.5 | 4.1833 | 1 | 14 | 14 |
|
| 96 |
+
|
| 97 |
+
## Batch Errors Metrics
|
| 98 |
+
|
| 99 |
+
| | Mean | Std | Min | Max | Count |
|
| 100 |
+
|:------------------------------|---------:|---------:|------:|------:|--------:|
|
| 101 |
+
| batch_errors/total_batches | 68.9231 | 17.7504 | 64 | 128 | 13 |
|
| 102 |
+
| batch_errors/total_instances | 551.385 | 142.003 | 512 | 1024 | 13 |
|
| 103 |
+
| batch_errors/total_successful | 551.385 | 142.003 | 512 | 1024 | 13 |
|
| 104 |
+
| batch_errors/total_failed | 0 | 0 | 0 | 0 | 13 |
|
| 105 |
+
| batch_errors/total_masked | 0 | 0 | 0 | 0 | 13 |
|
| 106 |
+
|
| 107 |
+
## Training Progression by Log
|
| 108 |
+
|
| 109 |
+
### job_389754
|
| 110 |
+
|
| 111 |
+
| Step | Reward | Pass@8 | KL | Loss | Step Time (s) | Gen Wait (s) |
|
| 112 |
+
|------|--------|--------|-----|------|---------------|-------------|
|
| 113 |
+
| 1 | 0.0098 | 0.0469 | 0.000000 | -0.0000 | 2252.5 | 1539.3 |
|
| 114 |
+
| 2 | 0.0254 | 0.0781 | 0.000000 | 0.0000 | 927.6 | 220.9 |
|
| 115 |
+
| 3 | 0.0195 | 0.1250 | 0.000000 | 0.0000 | 1161.2 | 440.4 |
|
| 116 |
+
| 4 | 0.0332 | 0.1094 | 0.000000 | 0.0000 | 774.7 | 0.0 |
|
| 117 |
+
| 5 | 0.0488 | 0.1250 | 0.000000 | 0.0000 | 946.2 | 241.3 |
|
| 118 |
+
| 6 | 0.0312 | 0.1406 | 0.000000 | 0.0000 | 888.6 | 182.7 |
|
| 119 |
+
| 7 | 0.0391 | 0.2031 | 0.000000 | 0.0000 | 2094.6 | 1312.6 |
|
| 120 |
+
| 8 | 0.0918 | 0.3281 | 0.000000 | 0.0000 | 1954.8 | 1284.1 |
|
| 121 |
+
| 9 | 0.0918 | 0.3281 | 0.000000 | 0.0000 | 810.2 | 124.9 |
|
| 122 |
+
| 10 | 0.1113 | 0.3906 | 0.000000 | 0.0000 | 912.5 | 208.8 |
|
| 123 |
+
| 11 | 0.1230 | 0.4219 | 0.000000 | 0.0000 | 731.1 | 68.6 |
|
| 124 |
+
| 12 | 0.1719 | 0.4219 | 0.000000 | -0.0000 | 931.3 | 261.0 |
|
| 125 |
+
| 13 | 0.1875 | 0.3906 | 0.000000 | 0.0000 | 682.4 | 16.5 |
|
| 126 |
+
| 14 | 0.1523 | 0.4688 | 0.000000 | -0.0000 | 4631.3 | 3950.3 |
|
| 127 |
+
|
| 128 |
+
## Timing Analysis
|
| 129 |
+
|
| 130 |
+
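### Average Time Breakdown (% of step time)

> **Note**: Several timers appear to be nested (e.g. `run_training` ≈ `fwd_logprobs_values_reward` + `train_critic_and_policy`, and `train_critic_and_policy` ≈ `policy_train`), so the percentages below overlap and are not expected to sum to 100%. `save_hf_model` was recorded on only 2 of the 14 steps.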
|
| 131 |
+
|
| 132 |
+
| Component | Avg % of Step Time |
|
| 133 |
+
|-----------|-------------------|
|
| 134 |
+
| run_training | 60.9% |
|
| 135 |
+
| train_critic_and_policy | 56.1% |
|
| 136 |
+
| policy_train | 56.0% |
|
| 137 |
+
| wait_for_generation_buffer | 33.4% |
|
| 138 |
+
| sync_weights | 5.4% |
|
| 139 |
+
| fwd_logprobs_values_reward | 4.8% |
|
| 140 |
+
| save_hf_model | 4.1% |
|
| 141 |
+
| save_checkpoints | 3.2% |
|
| 142 |
+
| convert_to_training_input | 0.3% |
|
| 143 |
+
| cleanup_old_checkpoints | 0.0% |
|
| 144 |
+
| compute_advantages_and_returns | 0.0% |
|
| 145 |
+
|
| 146 |
+
## vLLM Inference Engine Analysis
|
| 147 |
+
|
| 148 |
+
Metrics from vLLM stat loggers (V1LoggingStatLoggerFixed).
|
| 149 |
+
|
| 150 |
+
> **Note**: Ray deduplicates similar log messages with `[repeated Nx across cluster]`,
|
| 151 |
+
> so we typically capture stats from one engine per timestamp. The stats shown are
|
| 152 |
+
> **per-engine** values. Multiply by num_inference_engines for cluster-wide estimates.
|
| 153 |
+
|
| 154 |
+
### Summary by Log (Per-Engine Stats)
|
| 155 |
+
|
| 156 |
+
| Log | Avg Running/Engine | Avg Waiting/Engine | Avg Gen Throughput/Engine | Avg KV Cache % | Avg Prefix Hit % |
|
| 157 |
+
|-----|-------------------|-------------------|--------------------------|----------------|------------------|
|
| 158 |
+
| job_389754 | 3.5 | 0.0 | 66.8 tok/s | 6.4% | 83.3% |
|
| 159 |
+
|
| 160 |
+
### Utilization Analysis (Per-Engine)
|
| 161 |
+
|
| 162 |
+
Key indicators of inference engine utilization:
|
| 163 |
+
|
| 164 |
+
- **Running requests/engine**: Concurrent requests being processed by each engine
|
| 165 |
+
- **Waiting requests**: Requests queued (0 = engine not saturated, has spare capacity)
|
| 166 |
+
- **Generation throughput**: Decode tokens/sec per engine
|
| 167 |
+
- An 8B model on an H100 can reach **1000+ tok/s** when saturated
|
| 168 |
+
- If throughput is <300 tok/s with 0 waiting requests, the engine is **starved for requests**
|
| 169 |
+
|
| 170 |
+
#### job_389754
|
| 171 |
+
|
| 172 |
+
- **Running requests/engine**: avg=3.5, max=13
|
| 173 |
+
- **Waiting requests**: avg=0.0, max=0
|
| 174 |
+
- **Generation throughput/engine**: avg=66.8 tok/s, max=298.0 tok/s
|
| 175 |
+
- **KV cache usage**: avg=6.4%
|
| 176 |
+
- **Prefix cache hit rate**: avg=83.3%
|
| 177 |
+
- ⚠️ **Underutilized**: Engines starved for requests (0 waiting, avg 3.5 running)
|
| 178 |
+
- Bottleneck is likely upstream (environment execution, not inference)
|
| 179 |
+
|
| 180 |
+
## Trial-Level Analysis (from result.json)
|
| 181 |
+
|
| 182 |
+
Total trials parsed: 7168
|
| 183 |
+
|
| 184 |
+
### Turn Count Statistics
|
| 185 |
+
|
| 186 |
+
| Metric | Value |
|
| 187 |
+
|--------|-------|
|
| 188 |
+
| Mean | 2.4 |
|
| 189 |
+
| Median | 2.0 |
|
| 190 |
+
| Std | 0.7 |
|
| 191 |
+
| Min | 2 |
|
| 192 |
+
| Max | 20 |
|
| 193 |
+
| Count | 7168 |
|
| 194 |
+
|
| 195 |
+
### Exception Distribution
|
| 196 |
+
|
| 197 |
+
| Exception Type | Count | % |
|
| 198 |
+
|---------------|-------|---|
|
| 199 |
+
| No exception | 7162 | 99.9% |
|
| 200 |
+
| AgentTimeoutError | 5 | 0.1% |
|
| 201 |
+
| ContextLengthExceededError | 1 | 0.0% |
|
| 202 |
+
|
| 203 |
+
### Turn Count by Exception Type
|
| 204 |
+
|
| 205 |
+
| Exception Type | Mean Turns | Median Turns | Count |
|
| 206 |
+
|---------------|-----------|-------------|-------|
|
| 207 |
+
| ContextLengthExceededError | 20.0 | 20.0 | 1 |
|
| 208 |
+
| AgentTimeoutError | 10.2 | 10.0 | 5 |
|
| 209 |
+
| No exception | 2.4 | 2.0 | 7162 |
|
| 210 |
+
|
| 211 |
+
### Turn Count by Outcome
|
| 212 |
+
|
| 213 |
+
| Outcome | Mean Turns | Median Turns | Count |
|
| 214 |
+
|---------|-----------|-------------|-------|
|
| 215 |
+
| Success | 2.3 | 2.0 | 582 |
|
| 216 |
+
| Failure | 2.5 | 2.0 | 6586 |
|
| 217 |
+
|
| 218 |
+
### Reward Summary
|
| 219 |
+
|
| 220 |
+
- Mean reward: 0.0812
|
| 221 |
+
- Success rate: 8.1%
|
| 222 |
+
- Trials with reward data: 7168
|
| 223 |
+
|
training_logs/20260428_203121_metrics_table.csv
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
log_file,async/discard_rate,async/discarded_count,async/effective_batch_groups,async/effective_batch_samples,async/staleness_max,async/staleness_mean,async/staleness_min,async/staleness_ratio,generate/avg_num_tokens,generate/avg_tokens_non_zero_rewards,generate/avg_tokens_zero_rewards,generate/max_num_tokens,generate/min_num_tokens,generate/std_num_tokens,loss/avg_final_rewards,loss/avg_raw_advantages,loss/avg_raw_advantages_abs,policy/final_loss,policy/policy_entropy,policy/policy_loss,policy/policy_lr,policy/policy_update_steps,policy/ppo_clip_ratio,policy/raw_grad_norm,reward/avg_pass_at_8,reward/avg_raw_reward,system/process_rss_gb,system/process_vms_gb,system/ram_available_gb,system/ram_percent,system/ram_total_gb,system/ram_used_gb,timing/cleanup_old_checkpoints,timing/compute_advantages_and_returns,timing/convert_to_training_input,timing/fwd_logprobs_values_reward,timing/policy_train,timing/run_training,timing/save_checkpoints,timing/step,timing/sync_weights,timing/train_critic_and_policy,timing/wait_for_generation_buffer,trainer/epoch,trainer/global_step,batch_errors/total_batches,batch_errors/total_instances,batch_errors/total_successful,batch_errors/total_failed,batch_errors/total_masked,timing/save_hf_model,global_step
|
| 2 |
+
job_389754,0.0,0,64,512,0,0.0,0,0.0,3837.043,1844.4,3856.6943,11791,1000,1728.238,0.0098,-0.003,0.0108,-0.0,0.2114,-0.0,0.0,1.0,0.0,0.0146,0.0469,0.0098,11.0404,73.0374,334.9649,61.0,857.9687,523.0038,10.3772,0.0525,2.3302,54.8335,599.7188,654.8685,44.3151,2252.5445,55.9998,599.982,1539.3458,0,1,128.0,1024.0,1024.0,0.0,0.0,,1
|
| 3 |
+
job_389754,0.0,0,64,512,1,1.0,1,1.0,3734.7441,4189.0,3722.9098,14143,1008,1771.7086,0.0254,-0.003,0.0135,0.0,0.2119,0.0,0.0,1.0,0.0,0.0147,0.0781,0.0254,12.3731,73.238,331.4794,61.4,857.9687,526.4893,0.0156,0.0625,2.608,51.6906,593.7021,645.7138,33.1731,927.602,58.3788,593.9603,220.8803,0,2,64.0,512.0,512.0,0.0,0.0,,2
|
| 4 |
+
job_389754,0.0,0,64,512,2,2.0,2,1.0,3680.1211,3429.5,3685.1135,17798,960,1903.0569,0.0195,-0.0057,0.0404,0.0,0.2131,0.0,0.0,1.0,0.0,0.017,0.125,0.0195,15.006,73.3821,331.1569,61.4,857.9687,526.8118,0.0079,0.0712,3.338,54.9564,607.7339,663.0308,33.7102,1161.2,54.4484,608.0028,440.3575,0,3,64.0,512.0,512.0,0.0,0.0,,3
|
| 5 |
+
job_389754,0.0,0,64,512,3,3.0,3,1.0,4038.8242,3594.9412,4054.0687,19942,1070,2204.7849,0.0332,-0.0083,0.0398,0.0,0.1988,0.0,0.0,1.0,0.0,0.0159,0.1094,0.0332,18.2571,73.6598,332.0689,61.3,857.9687,525.8998,0.0123,0.0638,3.988,64.8817,648.7096,713.9267,33.5855,774.7256,56.7814,648.9808,0.0053,0,4,64.0,512.0,512.0,0.0,0.0,,4
|
| 6 |
+
job_389754,0.0,0,64,512,4,4.0,4,1.0,3734.0176,3721.08,3734.6817,13083,1049,1658.2439,0.0488,-0.0016,0.0331,0.0,0.2072,0.0,0.0,1.0,0.0,0.0298,0.125,0.0488,18.5352,73.6546,329.1295,61.6,857.9687,528.8392,0.0099,0.0428,2.9611,54.8169,588.7437,643.8552,33.7826,946.1626,58.0555,588.9951,241.2625,0,5,64.0,512.0,512.0,0.0,0.0,38.1036,5
|
| 7 |
+
job_389754,0.0,0,64,512,5,5.0,5,1.0,3678.4492,2779.8125,3707.4375,16575,982,1778.7688,0.0312,-0.0069,0.0393,0.0,0.2064,0.0,0.0,1.0,0.0,0.0194,0.1406,0.0312,18.5881,73.6669,327.9209,61.8,857.9687,530.0478,0.0087,0.0617,3.0487,49.8621,590.8918,641.066,34.0988,888.5801,61.7658,591.1417,182.6699,0,6,64.0,512.0,512.0,0.0,0.0,,6
|
| 8 |
+
job_389754,0.0,0,64,512,6,6.0,6,1.0,4026.377,2695.15,4080.4919,30105,860,2435.5002,0.0391,-0.0117,0.0579,0.0,0.2004,0.0,0.0,1.0,0.0,0.0242,0.2031,0.0391,24.2751,73.9734,398.3101,53.6,857.9687,459.6586,0.0095,0.0949,4.4248,61.0183,660.677,722.0682,35.0715,2094.5908,55.4597,660.9545,1312.6129,0,7,64.0,512.0,512.0,0.0,0.0,,7
|
| 9 |
+
job_389754,0.0,0,64,512,0,0.0,0,0.0,3169.8066,2304.1277,3257.3054,9841,950,1527.965,0.0918,-0.023,0.1174,0.0,0.2084,0.0,0.0,1.0,0.0,0.0364,0.3281,0.0918,24.1753,73.8303,401.2653,53.2,857.9687,456.7034,0.0112,0.0425,2.0132,44.7602,566.5425,611.7071,33.0222,1954.7684,56.9235,566.9038,1284.0936,1,8,64.0,512.0,512.0,0.0,0.0,,8
|
| 10 |
+
job_389754,0.0,0,64,512,1,1.0,1,1.0,3208.3555,2564.1489,3273.4688,19175,927,1685.1496,0.0918,-0.0201,0.1149,0.0,0.2081,0.0,0.0,1.0,0.0,0.0337,0.3281,0.0918,24.6965,74.2126,396.9511,53.7,857.9687,461.0176,0.0079,0.0446,3.5998,47.727,576.6595,624.6952,33.46,810.2478,57.0584,576.9233,124.8704,1,9,64.0,512.0,512.0,0.0,0.0,,9
|
| 11 |
+
job_389754,0.0,0,64,512,2,2.0,2,1.0,3237.8379,2406.9649,3341.9253,12501,890,1700.2375,0.1113,-0.0333,0.1439,0.0,0.2076,0.0,0.0,1.0,0.0,0.0374,0.3906,0.1113,24.8782,74.2612,391.3597,54.4,857.9687,466.609,0.0092,0.0533,2.4828,50.2219,593.8531,644.3998,33.4188,912.5225,56.7812,594.124,208.8362,1,10,64.0,512.0,512.0,0.0,0.0,37.5099,10
|
| 12 |
+
job_389754,0.0,0,64,512,3,3.0,3,1.0,3001.6465,2650.619,3050.8998,13406,941,1483.113,0.123,-0.0266,0.1658,0.0,0.199,0.0,0.0,1.0,0.0,0.0435,0.4219,0.123,25.0717,74.3157,387.2821,54.9,857.9687,470.6866,0.0168,0.0371,2.6113,41.7835,561.0683,603.1351,33.5831,731.0664,56.667,561.3141,68.6264,1,11,64.0,512.0,512.0,0.0,0.0,,11
|
| 13 |
+
job_389754,0.0,0,64,512,4,4.0,4,1.0,2883.377,2714.9545,2918.3325,14702,798,1601.3015,0.1719,-0.0285,0.1735,-0.0,0.1995,-0.0,0.0,1.0,0.0,0.0405,0.4219,0.1719,25.1736,74.6695,384.2253,55.2,857.9687,473.7434,0.011,0.0379,2.9524,47.2009,561.4127,608.9848,33.1065,931.2563,58.3024,561.7455,260.9926,1,12,64.0,512.0,512.0,0.0,0.0,,12
|
| 14 |
+
job_389754,0.0,0,64,512,5,5.0,5,1.0,2857.6934,2411.3333,2960.6995,13216,868,1685.3561,0.1875,-0.0066,0.1342,0.0,0.1896,0.0,0.0,1.0,0.0,0.0396,0.3906,0.1875,25.3254,74.5136,380.6907,55.6,857.9687,477.278,0.0093,0.0471,3.5522,43.8172,560.515,604.6342,33.2148,682.4333,57.7062,560.7695,16.5161,1,13,64.0,512.0,512.0,0.0,0.0,,13
|
| 15 |
+
job_389754,0.0,0,64,512,6,6.0,6,1.0,3206.502,2278.8974,3373.2143,15130,906,2031.0791,0.1523,-0.0278,0.1606,-0.0,0.1855,-0.0,0.0,1.0,0.0,0.0446,0.4688,0.1523,25.6977,74.7123,378.3962,55.9,857.9687,479.5725,0.0096,0.0577,2.5454,52.8053,611.3389,664.4901,33.0486,4631.2906,13.904,611.6265,3950.3283,1,14,,,,,,,14
|
training_logs/20260428_203121_reward_vs_steps.png
ADDED
|
Git LFS Details
|
training_logs/20260428_203121_trial_results.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
training_logs/20260428_203121_turn_count_distribution.png
ADDED
|
training_logs/20260428_203121_vllm_metrics_job_389754.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
training_logs/20260428_203121_vllm_metrics_table.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|