{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.235294117647058, "eval_steps": 500, "global_step": 120, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0784313725490196, "grad_norm": 5.935066953271119, "learning_rate": 4.1666666666666667e-07, "loss": 1.1559, "step": 1 }, { "epoch": 0.1568627450980392, "grad_norm": 5.931820940431562, "learning_rate": 8.333333333333333e-07, "loss": 1.1107, "step": 2 }, { "epoch": 0.23529411764705882, "grad_norm": 6.143905320771078, "learning_rate": 1.25e-06, "loss": 1.1384, "step": 3 }, { "epoch": 0.3137254901960784, "grad_norm": 6.065274861977263, "learning_rate": 1.6666666666666667e-06, "loss": 1.1421, "step": 4 }, { "epoch": 0.39215686274509803, "grad_norm": 5.8625107951536535, "learning_rate": 2.0833333333333334e-06, "loss": 1.1439, "step": 5 }, { "epoch": 0.47058823529411764, "grad_norm": 5.067374092992189, "learning_rate": 2.5e-06, "loss": 1.0618, "step": 6 }, { "epoch": 0.5490196078431373, "grad_norm": 4.755801552471063, "learning_rate": 2.916666666666667e-06, "loss": 1.0401, "step": 7 }, { "epoch": 0.6274509803921569, "grad_norm": 3.3129565953149323, "learning_rate": 3.3333333333333333e-06, "loss": 0.9898, "step": 8 }, { "epoch": 0.7058823529411765, "grad_norm": 3.3912326629630547, "learning_rate": 3.7500000000000005e-06, "loss": 1.011, "step": 9 }, { "epoch": 0.7843137254901961, "grad_norm": 3.2048784731280806, "learning_rate": 4.166666666666667e-06, "loss": 0.9935, "step": 10 }, { "epoch": 0.8627450980392157, "grad_norm": 3.752806588180809, "learning_rate": 4.583333333333333e-06, "loss": 0.9037, "step": 11 }, { "epoch": 0.9411764705882353, "grad_norm": 3.7034973959501185, "learning_rate": 5e-06, "loss": 0.9095, "step": 12 }, { "epoch": 1.0, "grad_norm": 3.7034973959501185, "learning_rate": 4.998942375205502e-06, "loss": 0.6715, "step": 13 }, { "epoch": 1.0784313725490196, "grad_norm": 3.3436999577667104, "learning_rate": 4.995770395678171e-06, "loss": 0.8186, "step": 14 }, { "epoch": 1.156862745098039, "grad_norm": 2.662791286273683, "learning_rate": 4.990486745229364e-06, "loss": 0.7876, "step": 15 }, { "epoch": 1.2352941176470589, "grad_norm": 3.1462660546154915, "learning_rate": 4.983095894354858e-06, "loss": 0.7498, "step": 16 }, { "epoch": 1.3137254901960784, "grad_norm": 2.806483473961631, "learning_rate": 4.973604096452361e-06, "loss": 0.7377, "step": 17 }, { "epoch": 1.392156862745098, "grad_norm": 2.4168449532239933, "learning_rate": 4.962019382530521e-06, "loss": 0.6722, "step": 18 }, { "epoch": 1.4705882352941178, "grad_norm": 2.1659663165693823, "learning_rate": 4.948351554413879e-06, "loss": 0.6785, "step": 19 }, { "epoch": 1.5490196078431373, "grad_norm": 1.9715057945884606, "learning_rate": 4.93261217644956e-06, "loss": 0.6873, "step": 20 }, { "epoch": 1.6274509803921569, "grad_norm": 2.03020256390926, "learning_rate": 4.914814565722671e-06, "loss": 0.659, "step": 21 }, { "epoch": 1.7058823529411766, "grad_norm": 1.9753159977423889, "learning_rate": 4.894973780788722e-06, "loss": 0.6438, "step": 22 }, { "epoch": 1.784313725490196, "grad_norm": 1.7231552446064127, "learning_rate": 4.873106608932585e-06, "loss": 0.6467, "step": 23 }, { "epoch": 1.8627450980392157, "grad_norm": 1.7271946867600179, "learning_rate": 4.849231551964771e-06, "loss": 0.6004, "step": 24 }, { "epoch": 1.9411764705882353, "grad_norm": 1.908074354983798, "learning_rate": 4.823368810567056e-06, "loss": 0.6608, "step": 25 }, { "epoch": 2.0, "grad_norm": 1.8986968468078707, "learning_rate": 4.7955402672006855e-06, "loss": 0.4767, "step": 26 }, { "epoch": 2.0784313725490198, "grad_norm": 1.7821876091389497, "learning_rate": 4.765769467591626e-06, "loss": 0.5677, "step": 27 }, { "epoch": 2.156862745098039, "grad_norm": 1.5729961003705275, "learning_rate": 4.734081600808531e-06, "loss": 0.5875, "step": 28 }, { "epoch": 2.235294117647059, "grad_norm": 1.6161908564695435, "learning_rate": 4.700503477950278e-06, "loss": 0.5528, "step": 29 }, { "epoch": 2.313725490196078, "grad_norm": 1.453253235686431, "learning_rate": 4.665063509461098e-06, "loss": 0.5471, "step": 30 }, { "epoch": 2.392156862745098, "grad_norm": 1.5747903710062445, "learning_rate": 4.627791681092499e-06, "loss": 0.5058, "step": 31 }, { "epoch": 2.4705882352941178, "grad_norm": 1.4435044859613202, "learning_rate": 4.588719528532342e-06, "loss": 0.5226, "step": 32 }, { "epoch": 2.549019607843137, "grad_norm": 1.39605748394361, "learning_rate": 4.54788011072248e-06, "loss": 0.5071, "step": 33 }, { "epoch": 2.627450980392157, "grad_norm": 1.5254258052298646, "learning_rate": 4.50530798188761e-06, "loss": 0.4998, "step": 34 }, { "epoch": 2.7058823529411766, "grad_norm": 1.4261873477979348, "learning_rate": 4.46103916229894e-06, "loss": 0.5009, "step": 35 }, { "epoch": 2.784313725490196, "grad_norm": 1.4137129832097197, "learning_rate": 4.415111107797445e-06, "loss": 0.479, "step": 36 }, { "epoch": 2.8627450980392157, "grad_norm": 1.520647591268587, "learning_rate": 4.367562678102491e-06, "loss": 0.5122, "step": 37 }, { "epoch": 2.9411764705882355, "grad_norm": 1.437130314805203, "learning_rate": 4.318434103932622e-06, "loss": 0.4938, "step": 38 }, { "epoch": 3.0, "grad_norm": 1.381198514753958, "learning_rate": 4.267766952966369e-06, "loss": 0.3629, "step": 39 }, { "epoch": 3.0784313725490198, "grad_norm": 1.4414609712821322, "learning_rate": 4.215604094671835e-06, "loss": 0.4262, "step": 40 }, { "epoch": 3.156862745098039, "grad_norm": 1.3346547206899726, "learning_rate": 4.161989664034844e-06, "loss": 0.4259, "step": 41 }, { "epoch": 3.235294117647059, "grad_norm": 1.2955870784243064, "learning_rate": 4.106969024216348e-06, "loss": 0.4009, "step": 42 }, { "epoch": 3.313725490196078, "grad_norm": 1.381727916852796, "learning_rate": 4.0505887281706505e-06, "loss": 0.4082, "step": 43 }, { "epoch": 3.392156862745098, "grad_norm": 1.2215807897629705, "learning_rate": 3.992896479256966e-06, "loss": 0.4012, "step": 44 }, { "epoch": 3.4705882352941178, "grad_norm": 1.341754935028127, "learning_rate": 3.933941090877615e-06, "loss": 0.4003, "step": 45 }, { "epoch": 3.549019607843137, "grad_norm": 1.3219873249674265, "learning_rate": 3.8737724451770155e-06, "loss": 0.3906, "step": 46 }, { "epoch": 3.627450980392157, "grad_norm": 1.2735261344863207, "learning_rate": 3.8124414508364005e-06, "loss": 0.3956, "step": 47 }, { "epoch": 3.7058823529411766, "grad_norm": 1.408667555717726, "learning_rate": 3.7500000000000005e-06, "loss": 0.3841, "step": 48 }, { "epoch": 3.784313725490196, "grad_norm": 1.3920186495457132, "learning_rate": 3.6865009243691015e-06, "loss": 0.3998, "step": 49 }, { "epoch": 3.8627450980392157, "grad_norm": 1.3668192562682828, "learning_rate": 3.621997950501156e-06, "loss": 0.3947, "step": 50 }, { "epoch": 3.9411764705882355, "grad_norm": 1.3220345352979812, "learning_rate": 3.556545654351749e-06, "loss": 0.3898, "step": 51 }, { "epoch": 4.0, "grad_norm": 1.273119605790472, "learning_rate": 3.4901994150978926e-06, "loss": 0.2635, "step": 52 }, { "epoch": 4.078431372549019, "grad_norm": 1.5507480548662818, "learning_rate": 3.4230153682817112e-06, "loss": 0.3232, "step": 53 }, { "epoch": 4.1568627450980395, "grad_norm": 1.2464576840916806, "learning_rate": 3.3550503583141726e-06, "loss": 0.2997, "step": 54 }, { "epoch": 4.235294117647059, "grad_norm": 1.2140787104099877, "learning_rate": 3.2863618903790346e-06, "loss": 0.3061, "step": 55 }, { "epoch": 4.313725490196078, "grad_norm": 1.2569518947420275, "learning_rate": 3.217008081777726e-06, "loss": 0.2995, "step": 56 }, { "epoch": 4.392156862745098, "grad_norm": 1.2248006221848717, "learning_rate": 3.147047612756302e-06, "loss": 0.2973, "step": 57 }, { "epoch": 4.470588235294118, "grad_norm": 1.2928146585389553, "learning_rate": 3.0765396768561005e-06, "loss": 0.2987, "step": 58 }, { "epoch": 4.549019607843137, "grad_norm": 1.3574488686929125, "learning_rate": 3.0055439308300954e-06, "loss": 0.3132, "step": 59 }, { "epoch": 4.627450980392156, "grad_norm": 1.2894955154605787, "learning_rate": 2.9341204441673267e-06, "loss": 0.2929, "step": 60 }, { "epoch": 4.705882352941177, "grad_norm": 1.2326192638645608, "learning_rate": 2.862329648268117e-06, "loss": 0.2794, "step": 61 }, { "epoch": 4.784313725490196, "grad_norm": 1.3599625008219607, "learning_rate": 2.7902322853130758e-06, "loss": 0.2911, "step": 62 }, { "epoch": 4.862745098039216, "grad_norm": 1.2507485894142631, "learning_rate": 2.717889356869146e-06, "loss": 0.2945, "step": 63 }, { "epoch": 4.9411764705882355, "grad_norm": 1.3120759092640928, "learning_rate": 2.6453620722761897e-06, "loss": 0.3093, "step": 64 }, { "epoch": 5.0, "grad_norm": 1.3120759092640928, "learning_rate": 2.572711796857779e-06, "loss": 0.2038, "step": 65 }, { "epoch": 5.078431372549019, "grad_norm": 1.2133928459250458, "learning_rate": 2.5e-06, "loss": 0.2457, "step": 66 }, { "epoch": 5.1568627450980395, "grad_norm": 1.1164096271756436, "learning_rate": 2.4272882031422216e-06, "loss": 0.2378, "step": 67 }, { "epoch": 5.235294117647059, "grad_norm": 1.1865964714246466, "learning_rate": 2.3546379277238107e-06, "loss": 0.2365, "step": 68 }, { "epoch": 5.313725490196078, "grad_norm": 1.1133005502471431, "learning_rate": 2.2821106431308546e-06, "loss": 0.2273, "step": 69 }, { "epoch": 5.392156862745098, "grad_norm": 1.0726780522048756, "learning_rate": 2.2097677146869242e-06, "loss": 0.2251, "step": 70 }, { "epoch": 5.470588235294118, "grad_norm": 1.1507923857528541, "learning_rate": 2.1376703517318835e-06, "loss": 0.2199, "step": 71 }, { "epoch": 5.549019607843137, "grad_norm": 1.1965895392459576, "learning_rate": 2.0658795558326745e-06, "loss": 0.2275, "step": 72 }, { "epoch": 5.627450980392156, "grad_norm": 1.2081935303041835, "learning_rate": 1.994456069169906e-06, "loss": 0.2219, "step": 73 }, { "epoch": 5.705882352941177, "grad_norm": 1.2037511840449118, "learning_rate": 1.9234603231439e-06, "loss": 0.2204, "step": 74 }, { "epoch": 5.784313725490196, "grad_norm": 1.1530325179032028, "learning_rate": 1.852952387243698e-06, "loss": 0.22, "step": 75 }, { "epoch": 5.862745098039216, "grad_norm": 1.1780261182347391, "learning_rate": 1.7829919182222752e-06, "loss": 0.2199, "step": 76 }, { "epoch": 5.9411764705882355, "grad_norm": 1.1895169708488194, "learning_rate": 1.7136381096209665e-06, "loss": 0.2037, "step": 77 }, { "epoch": 6.0, "grad_norm": 1.0854093120483197, "learning_rate": 1.6449496416858285e-06, "loss": 0.1583, "step": 78 }, { "epoch": 6.078431372549019, "grad_norm": 1.0801354684612592, "learning_rate": 1.5769846317182894e-06, "loss": 0.1934, "step": 79 }, { "epoch": 6.1568627450980395, "grad_norm": 1.0439246440370717, "learning_rate": 1.509800584902108e-06, "loss": 0.1705, "step": 80 }, { "epoch": 6.235294117647059, "grad_norm": 1.0284457474816655, "learning_rate": 1.443454345648252e-06, "loss": 0.1723, "step": 81 }, { "epoch": 6.313725490196078, "grad_norm": 0.9926653408766625, "learning_rate": 1.3780020494988447e-06, "loss": 0.1792, "step": 82 }, { "epoch": 6.392156862745098, "grad_norm": 1.0138502371712474, "learning_rate": 1.313499075630899e-06, "loss": 0.1679, "step": 83 }, { "epoch": 6.470588235294118, "grad_norm": 1.0083965688320564, "learning_rate": 1.2500000000000007e-06, "loss": 0.169, "step": 84 }, { "epoch": 6.549019607843137, "grad_norm": 1.0514389443911982, "learning_rate": 1.1875585491636e-06, "loss": 0.1807, "step": 85 }, { "epoch": 6.627450980392156, "grad_norm": 1.0545495890932062, "learning_rate": 1.1262275548229852e-06, "loss": 0.1735, "step": 86 }, { "epoch": 6.705882352941177, "grad_norm": 1.1020211072090924, "learning_rate": 1.0660589091223854e-06, "loss": 0.1728, "step": 87 }, { "epoch": 6.784313725490196, "grad_norm": 1.0983738677411488, "learning_rate": 1.0071035207430352e-06, "loss": 0.171, "step": 88 }, { "epoch": 6.862745098039216, "grad_norm": 1.0848006154577043, "learning_rate": 9.494112718293503e-07, "loss": 0.1761, "step": 89 }, { "epoch": 6.9411764705882355, "grad_norm": 1.0504620709430657, "learning_rate": 8.930309757836517e-07, "loss": 0.1689, "step": 90 }, { "epoch": 7.0, "grad_norm": 1.0335128775784548, "learning_rate": 8.380103359651554e-07, "loss": 0.1193, "step": 91 }, { "epoch": 7.078431372549019, "grad_norm": 0.9668349947212325, "learning_rate": 7.843959053281663e-07, "loss": 0.1531, "step": 92 }, { "epoch": 7.1568627450980395, "grad_norm": 0.9049193018844318, "learning_rate": 7.322330470336314e-07, "loss": 0.1449, "step": 93 }, { "epoch": 7.235294117647059, "grad_norm": 0.8855250299857752, "learning_rate": 6.815658960673782e-07, "loss": 0.1398, "step": 94 }, { "epoch": 7.313725490196078, "grad_norm": 0.9608687848192914, "learning_rate": 6.324373218975105e-07, "loss": 0.1487, "step": 95 }, { "epoch": 7.392156862745098, "grad_norm": 0.9692880229899594, "learning_rate": 5.848888922025553e-07, "loss": 0.1403, "step": 96 }, { "epoch": 7.470588235294118, "grad_norm": 0.9035708627250062, "learning_rate": 5.389608377010608e-07, "loss": 0.1449, "step": 97 }, { "epoch": 7.549019607843137, "grad_norm": 0.8748928881891229, "learning_rate": 4.946920181123904e-07, "loss": 0.1451, "step": 98 }, { "epoch": 7.627450980392156, "grad_norm": 0.8912371091414796, "learning_rate": 4.5211988927752026e-07, "loss": 0.134, "step": 99 }, { "epoch": 7.705882352941177, "grad_norm": 0.9454342145405766, "learning_rate": 4.1128047146765936e-07, "loss": 0.1497, "step": 100 }, { "epoch": 7.784313725490196, "grad_norm": 0.9466608405460601, "learning_rate": 3.722083189075007e-07, "loss": 0.1523, "step": 101 }, { "epoch": 7.862745098039216, "grad_norm": 0.9716836299571869, "learning_rate": 3.3493649053890325e-07, "loss": 0.1406, "step": 102 }, { "epoch": 7.9411764705882355, "grad_norm": 0.9517618408879835, "learning_rate": 2.9949652204972257e-07, "loss": 0.1414, "step": 103 }, { "epoch": 8.0, "grad_norm": 0.8961669761492526, "learning_rate": 2.6591839919146963e-07, "loss": 0.0988, "step": 104 }, { "epoch": 8.07843137254902, "grad_norm": 0.8800809597012162, "learning_rate": 2.3423053240837518e-07, "loss": 0.1326, "step": 105 }, { "epoch": 8.156862745098039, "grad_norm": 0.8678268581817502, "learning_rate": 2.044597327993153e-07, "loss": 0.1382, "step": 106 }, { "epoch": 8.235294117647058, "grad_norm": 0.8749341283858125, "learning_rate": 1.7663118943294367e-07, "loss": 0.1394, "step": 107 }, { "epoch": 8.313725490196079, "grad_norm": 0.8593112391578863, "learning_rate": 1.507684480352292e-07, "loss": 0.1278, "step": 108 }, { "epoch": 8.392156862745098, "grad_norm": 0.9129847126820709, "learning_rate": 1.2689339106741529e-07, "loss": 0.1391, "step": 109 }, { "epoch": 8.470588235294118, "grad_norm": 0.8656126791419273, "learning_rate": 1.0502621921127776e-07, "loss": 0.1311, "step": 110 }, { "epoch": 8.549019607843137, "grad_norm": 0.8593751089585534, "learning_rate": 8.518543427732951e-08, "loss": 0.1296, "step": 111 }, { "epoch": 8.627450980392156, "grad_norm": 0.8879560191886899, "learning_rate": 6.738782355044048e-08, "loss": 0.1263, "step": 112 }, { "epoch": 8.705882352941176, "grad_norm": 0.9206743688905853, "learning_rate": 5.164844558612131e-08, "loss": 0.1381, "step": 113 }, { "epoch": 8.784313725490197, "grad_norm": 0.8795400657114122, "learning_rate": 3.798061746947995e-08, "loss": 0.1282, "step": 114 }, { "epoch": 8.862745098039216, "grad_norm": 0.8568365367984947, "learning_rate": 2.6395903547638825e-08, "loss": 0.1331, "step": 115 }, { "epoch": 8.941176470588236, "grad_norm": 0.847262314471311, "learning_rate": 1.6904105645142443e-08, "loss": 0.1264, "step": 116 }, { "epoch": 9.0, "grad_norm": 0.847262314471311, "learning_rate": 9.513254770636138e-09, "loss": 0.0991, "step": 117 }, { "epoch": 9.07843137254902, "grad_norm": 0.8727190548675097, "learning_rate": 4.229604321829561e-09, "loss": 0.1293, "step": 118 }, { "epoch": 9.156862745098039, "grad_norm": 0.816661788373, "learning_rate": 1.0576247944985018e-09, "loss": 0.1269, "step": 119 }, { "epoch": 9.235294117647058, "grad_norm": 0.885147564294731, "learning_rate": 0.0, "loss": 0.1341, "step": 120 }, { "epoch": 9.235294117647058, "step": 120, "total_flos": 27074593947648.0, "train_loss": 0.388430199213326, "train_runtime": 2487.6221, "train_samples_per_second": 1.624, "train_steps_per_second": 0.048 } ], "logging_steps": 1.0, "max_steps": 120, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 27074593947648.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }