dzungpham commited on
Commit
b00f41c
·
verified ·
1 Parent(s): caed328

upload best checkpoints 200 with f1 score 0.68

Browse files
graphcodebert-robust/checkpoint-200/config.json CHANGED
@@ -2,14 +2,14 @@
2
  "architectures": [
3
  "RobertaForSequenceClassification"
4
  ],
5
- "attention_probs_dropout_prob": 0.2,
6
  "bos_token_id": 0,
7
  "classifier_dropout": null,
8
  "dtype": "float32",
9
  "eos_token_id": 2,
10
  "gradient_checkpointing": false,
11
  "hidden_act": "gelu",
12
- "hidden_dropout_prob": 0.2,
13
  "hidden_size": 768,
14
  "initializer_range": 0.02,
15
  "intermediate_size": 3072,
@@ -21,6 +21,7 @@
21
  "output_past": true,
22
  "pad_token_id": 1,
23
  "position_embedding_type": "absolute",
 
24
  "transformers_version": "4.56.0",
25
  "type_vocab_size": 1,
26
  "use_cache": true,
 
2
  "architectures": [
3
  "RobertaForSequenceClassification"
4
  ],
5
+ "attention_probs_dropout_prob": 0.1,
6
  "bos_token_id": 0,
7
  "classifier_dropout": null,
8
  "dtype": "float32",
9
  "eos_token_id": 2,
10
  "gradient_checkpointing": false,
11
  "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
  "hidden_size": 768,
14
  "initializer_range": 0.02,
15
  "intermediate_size": 3072,
 
21
  "output_past": true,
22
  "pad_token_id": 1,
23
  "position_embedding_type": "absolute",
24
+ "problem_type": "single_label_classification",
25
  "transformers_version": "4.56.0",
26
  "type_vocab_size": 1,
27
  "use_cache": true,
graphcodebert-robust/checkpoint-200/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:984b47ee0cbc9e8aff5459859ab8785583eda66a482745e97fa137aac9d69a20
3
  size 498612824
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34f62f2e2935abbdd0f8d5567e447c234e77e119d414ca9ce31e3a1ce06552e2
3
  size 498612824
graphcodebert-robust/checkpoint-200/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8cc66376057788af21d1adb5a92f0a63c44af3eccb38c6f45cfafc48c80f02d4
3
- size 4741923
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94ced15c772e225b8afaaa561ce73077f5f491b910b543982886ee79b2be71c0
3
+ size 4741859
graphcodebert-robust/checkpoint-200/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e231312cfb6dd836b89c3a8dd38d52af114294447c5e2294714ea9206abde6af
3
- size 14581
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a69a2dd012809f4c1402b56a463f5f04ca5d8c3ea0ff42d1da133d0f80b1c5b9
3
+ size 14645
graphcodebert-robust/checkpoint-200/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:124625e167eb28acbfc793cfcb3e8a08b32e7fea06501462bc9e420a5e1beb2a
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b21c5349d5e7d02de630ebc1cb53ade1d9c6079eeb8594d223bb786011a0428b
3
  size 1383
graphcodebert-robust/checkpoint-200/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:003a1651b59b96bfbd66a9a12f6e0705e877f877138a8695267f15672bef92e3
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1620ef2f1785b97a0cabdbea3b6cfd78a32feee0218de95157fc0dbbc14db4ba
3
  size 1465
graphcodebert-robust/checkpoint-200/tokenizer.json CHANGED
@@ -1,11 +1,6 @@
1
  {
2
  "version": "1.0",
3
- "truncation": {
4
- "direction": "Right",
5
- "max_length": 512,
6
- "strategy": "LongestFirst",
7
- "stride": 0
8
- },
9
  "padding": null,
10
  "added_tokens": [
11
  {
 
1
  {
2
  "version": "1.0",
3
+ "truncation": null,
 
 
 
 
 
4
  "padding": null,
5
  "added_tokens": [
6
  {
graphcodebert-robust/checkpoint-200/trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.0128,
6
  "eval_steps": 1000,
7
  "global_step": 200,
8
  "is_hyper_param_search": false,
@@ -10,150 +10,150 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.00064,
14
- "grad_norm": 1.6144306659698486,
15
- "learning_rate": 1.1520000000000002e-08,
16
- "loss": 0.729,
17
  "step": 10
18
  },
19
  {
20
- "epoch": 0.00128,
21
- "grad_norm": 2.0952296257019043,
22
- "learning_rate": 2.4320000000000002e-08,
23
- "loss": 0.7295,
24
  "step": 20
25
  },
26
  {
27
- "epoch": 0.00192,
28
- "grad_norm": 1.3587689399719238,
29
- "learning_rate": 3.7120000000000004e-08,
30
- "loss": 0.73,
31
  "step": 30
32
  },
33
  {
34
- "epoch": 0.00256,
35
- "grad_norm": 1.2531732320785522,
36
- "learning_rate": 4.9920000000000006e-08,
37
- "loss": 0.7221,
38
  "step": 40
39
  },
40
  {
41
- "epoch": 0.0032,
42
- "grad_norm": 1.437932014465332,
43
- "learning_rate": 6.272000000000001e-08,
44
- "loss": 0.7209,
45
  "step": 50
46
  },
47
  {
48
- "epoch": 0.00384,
49
- "grad_norm": 1.418426752090454,
50
- "learning_rate": 7.552e-08,
51
- "loss": 0.729,
52
  "step": 60
53
  },
54
  {
55
- "epoch": 0.00448,
56
- "grad_norm": 1.9476298093795776,
57
- "learning_rate": 8.832e-08,
58
- "loss": 0.7242,
59
  "step": 70
60
  },
61
  {
62
- "epoch": 0.00512,
63
- "grad_norm": 1.7948051691055298,
64
- "learning_rate": 1.0112000000000001e-07,
65
- "loss": 0.7227,
66
  "step": 80
67
  },
68
  {
69
- "epoch": 0.00576,
70
- "grad_norm": 1.6534360647201538,
71
- "learning_rate": 1.1392e-07,
72
- "loss": 0.7234,
73
  "step": 90
74
  },
75
  {
76
- "epoch": 0.0064,
77
- "grad_norm": 1.0920158624649048,
78
- "learning_rate": 1.2672e-07,
79
- "loss": 0.7328,
80
  "step": 100
81
  },
82
  {
83
- "epoch": 0.00704,
84
- "grad_norm": 1.977837085723877,
85
- "learning_rate": 1.3952000000000002e-07,
86
- "loss": 0.7263,
87
  "step": 110
88
  },
89
  {
90
- "epoch": 0.00768,
91
- "grad_norm": 1.388983130455017,
92
- "learning_rate": 1.5232000000000003e-07,
93
- "loss": 0.7286,
94
  "step": 120
95
  },
96
  {
97
- "epoch": 0.00832,
98
- "grad_norm": 1.2956682443618774,
99
- "learning_rate": 1.6512e-07,
100
- "loss": 0.7251,
101
  "step": 130
102
  },
103
  {
104
- "epoch": 0.00896,
105
- "grad_norm": 1.8125052452087402,
106
- "learning_rate": 1.7792e-07,
107
- "loss": 0.7251,
108
  "step": 140
109
  },
110
  {
111
- "epoch": 0.0096,
112
- "grad_norm": 1.626846194267273,
113
- "learning_rate": 1.9072e-07,
114
- "loss": 0.727,
115
  "step": 150
116
  },
117
  {
118
- "epoch": 0.01024,
119
- "grad_norm": 2.3243086338043213,
120
- "learning_rate": 2.0352e-07,
121
- "loss": 0.726,
122
  "step": 160
123
  },
124
  {
125
- "epoch": 0.01088,
126
- "grad_norm": 1.4734737873077393,
127
- "learning_rate": 2.1632e-07,
128
- "loss": 0.7252,
129
  "step": 170
130
  },
131
  {
132
- "epoch": 0.01152,
133
- "grad_norm": 2.090498685836792,
134
- "learning_rate": 2.2912e-07,
135
- "loss": 0.7273,
136
  "step": 180
137
  },
138
  {
139
- "epoch": 0.01216,
140
- "grad_norm": 1.7563093900680542,
141
- "learning_rate": 2.4192000000000004e-07,
142
- "loss": 0.719,
143
  "step": 190
144
  },
145
  {
146
- "epoch": 0.0128,
147
- "grad_norm": 1.449843168258667,
148
- "learning_rate": 2.5472000000000005e-07,
149
- "loss": 0.7237,
150
  "step": 200
151
  }
152
  ],
153
  "logging_steps": 10,
154
- "max_steps": 156250,
155
  "num_input_tokens_seen": 0,
156
- "num_train_epochs": 10,
157
  "save_steps": 200,
158
  "stateful_callbacks": {
159
  "EarlyStoppingCallback": {
@@ -176,8 +176,8 @@
176
  "attributes": {}
177
  }
178
  },
179
- "total_flos": 1683910754304000.0,
180
- "train_batch_size": 32,
181
  "trial_name": null,
182
  "trial_params": null
183
  }
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.02559836170485089,
6
  "eval_steps": 1000,
7
  "global_step": 200,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.0012799180852425445,
14
+ "grad_norm": 89788.1796875,
15
+ "learning_rate": 2.304147465437788e-08,
16
+ "loss": 0.7088,
17
  "step": 10
18
  },
19
  {
20
+ "epoch": 0.002559836170485089,
21
+ "grad_norm": 39479.36328125,
22
+ "learning_rate": 4.86431131592422e-08,
23
+ "loss": 0.7087,
24
  "step": 20
25
  },
26
  {
27
+ "epoch": 0.0038397542557276334,
28
+ "grad_norm": 82478.765625,
29
+ "learning_rate": 7.424475166410652e-08,
30
+ "loss": 0.7074,
31
  "step": 30
32
  },
33
  {
34
+ "epoch": 0.005119672340970178,
35
+ "grad_norm": 58003.75390625,
36
+ "learning_rate": 9.984639016897082e-08,
37
+ "loss": 0.703,
38
  "step": 40
39
  },
40
  {
41
+ "epoch": 0.006399590426212722,
42
+ "grad_norm": 95491.0859375,
43
+ "learning_rate": 1.2544802867383514e-07,
44
+ "loss": 0.7073,
45
  "step": 50
46
  },
47
  {
48
+ "epoch": 0.007679508511455267,
49
+ "grad_norm": 44903.296875,
50
+ "learning_rate": 1.5104966717869944e-07,
51
+ "loss": 0.7061,
52
  "step": 60
53
  },
54
  {
55
+ "epoch": 0.008959426596697812,
56
+ "grad_norm": 142410.484375,
57
+ "learning_rate": 1.7665130568356375e-07,
58
+ "loss": 0.7082,
59
  "step": 70
60
  },
61
  {
62
+ "epoch": 0.010239344681940356,
63
+ "grad_norm": 148763.109375,
64
+ "learning_rate": 2.0225294418842808e-07,
65
+ "loss": 0.707,
66
  "step": 80
67
  },
68
  {
69
+ "epoch": 0.011519262767182901,
70
+ "grad_norm": 62031.30859375,
71
+ "learning_rate": 2.2785458269329238e-07,
72
+ "loss": 0.7036,
73
  "step": 90
74
  },
75
  {
76
+ "epoch": 0.012799180852425445,
77
+ "grad_norm": 135708.875,
78
+ "learning_rate": 2.5345622119815674e-07,
79
+ "loss": 0.7078,
80
  "step": 100
81
  },
82
  {
83
+ "epoch": 0.01407909893766799,
84
+ "grad_norm": 91129.421875,
85
+ "learning_rate": 2.79057859703021e-07,
86
+ "loss": 0.7035,
87
  "step": 110
88
  },
89
  {
90
+ "epoch": 0.015359017022910534,
91
+ "grad_norm": 39290.72265625,
92
+ "learning_rate": 3.0465949820788535e-07,
93
+ "loss": 0.7083,
94
  "step": 120
95
  },
96
  {
97
+ "epoch": 0.016638935108153077,
98
+ "grad_norm": 49473.61328125,
99
+ "learning_rate": 3.302611367127496e-07,
100
+ "loss": 0.7023,
101
  "step": 130
102
  },
103
  {
104
+ "epoch": 0.017918853193395624,
105
+ "grad_norm": 61292.984375,
106
+ "learning_rate": 3.5586277521761395e-07,
107
+ "loss": 0.7014,
108
  "step": 140
109
  },
110
  {
111
+ "epoch": 0.019198771278638168,
112
+ "grad_norm": 79102.0390625,
113
+ "learning_rate": 3.814644137224783e-07,
114
+ "loss": 0.7041,
115
  "step": 150
116
  },
117
  {
118
+ "epoch": 0.02047868936388071,
119
+ "grad_norm": 61779.62890625,
120
+ "learning_rate": 4.0706605222734256e-07,
121
+ "loss": 0.7039,
122
  "step": 160
123
  },
124
  {
125
+ "epoch": 0.021758607449123255,
126
+ "grad_norm": 63492.18359375,
127
+ "learning_rate": 4.326676907322069e-07,
128
+ "loss": 0.7035,
129
  "step": 170
130
  },
131
  {
132
+ "epoch": 0.023038525534365802,
133
+ "grad_norm": 44190.3203125,
134
+ "learning_rate": 4.582693292370712e-07,
135
+ "loss": 0.7019,
136
  "step": 180
137
  },
138
  {
139
+ "epoch": 0.024318443619608346,
140
+ "grad_norm": 67509.15625,
141
+ "learning_rate": 4.838709677419355e-07,
142
+ "loss": 0.6991,
143
  "step": 190
144
  },
145
  {
146
+ "epoch": 0.02559836170485089,
147
+ "grad_norm": 94820.5078125,
148
+ "learning_rate": 5.094726062467999e-07,
149
+ "loss": 0.7011,
150
  "step": 200
151
  }
152
  ],
153
  "logging_steps": 10,
154
+ "max_steps": 39065,
155
  "num_input_tokens_seen": 0,
156
+ "num_train_epochs": 5,
157
  "save_steps": 200,
158
  "stateful_callbacks": {
159
  "EarlyStoppingCallback": {
 
176
  "attributes": {}
177
  }
178
  },
179
+ "total_flos": 3367821508608000.0,
180
+ "train_batch_size": 64,
181
  "trial_name": null,
182
  "trial_params": null
183
  }
graphcodebert-robust/checkpoint-200/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ff659b85d84ec0bae53596bc271ba773db9c463626db0f13fd8e747f433dad4
3
  size 5841
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ec2974753acccea9af7a8eb9c2abfaaba85cdcf89c926488b103f5662876bb0
3
  size 5841
graphcodebert-robust/checkpoint-400/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a01766ea37053c4e1086db23a592ccd390b6f66d530273ae2dae69fbf9aa39e
3
  size 498612824
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92bce3c4e38ffa8155e9197c360622fa05c939bec62afcbfa3bf8fd778f88527
3
  size 498612824
graphcodebert-robust/checkpoint-400/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3993e14f8e5395da15ce3350b7a6c24a8b0c21921fd8cce7a29d5175f071b2fc
3
  size 4741923
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a83a52f1a15705e175493b2425539a92f6edb4c30253eadc01cb8a3f3c98b492
3
  size 4741923
graphcodebert-robust/checkpoint-400/trainer_state.json CHANGED
@@ -151,142 +151,142 @@
151
  },
152
  {
153
  "epoch": 0.01344,
154
- "grad_norm": 141396.296875,
155
  "learning_rate": 5.350742447516642e-07,
156
- "loss": 0.7217,
157
  "step": 210
158
  },
159
  {
160
  "epoch": 0.01408,
161
- "grad_norm": 102339.1640625,
162
  "learning_rate": 5.606758832565284e-07,
163
- "loss": 0.7215,
164
  "step": 220
165
  },
166
  {
167
  "epoch": 0.01472,
168
- "grad_norm": 134052.9375,
169
  "learning_rate": 5.862775217613928e-07,
170
- "loss": 0.7115,
171
  "step": 230
172
  },
173
  {
174
  "epoch": 0.01536,
175
- "grad_norm": 87181.984375,
176
  "learning_rate": 6.118791602662571e-07,
177
- "loss": 0.7241,
178
  "step": 240
179
  },
180
  {
181
  "epoch": 0.016,
182
- "grad_norm": 100231.328125,
183
  "learning_rate": 6.374807987711214e-07,
184
- "loss": 0.71,
185
  "step": 250
186
  },
187
  {
188
  "epoch": 0.01664,
189
- "grad_norm": 136721.484375,
190
  "learning_rate": 6.630824372759858e-07,
191
- "loss": 0.7188,
192
  "step": 260
193
  },
194
  {
195
  "epoch": 0.01728,
196
- "grad_norm": 115868.8125,
197
  "learning_rate": 6.8868407578085e-07,
198
- "loss": 0.7199,
199
  "step": 270
200
  },
201
  {
202
  "epoch": 0.01792,
203
- "grad_norm": 70205.1484375,
204
  "learning_rate": 7.142857142857143e-07,
205
- "loss": 0.7299,
206
  "step": 280
207
  },
208
  {
209
  "epoch": 0.01856,
210
- "grad_norm": 98926.4453125,
211
  "learning_rate": 7.398873527905787e-07,
212
- "loss": 0.7159,
213
  "step": 290
214
  },
215
  {
216
  "epoch": 0.0192,
217
- "grad_norm": 134108.140625,
218
  "learning_rate": 7.65488991295443e-07,
219
  "loss": 0.7122,
220
  "step": 300
221
  },
222
  {
223
  "epoch": 0.01984,
224
- "grad_norm": 103719.140625,
225
  "learning_rate": 7.910906298003073e-07,
226
- "loss": 0.7185,
227
  "step": 310
228
  },
229
  {
230
  "epoch": 0.02048,
231
- "grad_norm": 85624.953125,
232
  "learning_rate": 8.166922683051716e-07,
233
- "loss": 0.718,
234
  "step": 320
235
  },
236
  {
237
  "epoch": 0.02112,
238
- "grad_norm": 138824.15625,
239
  "learning_rate": 8.422939068100359e-07,
240
- "loss": 0.713,
241
  "step": 330
242
  },
243
  {
244
  "epoch": 0.02176,
245
- "grad_norm": 73629.0859375,
246
  "learning_rate": 8.678955453149002e-07,
247
- "loss": 0.7186,
248
  "step": 340
249
  },
250
  {
251
  "epoch": 0.0224,
252
- "grad_norm": 132493.0,
253
  "learning_rate": 8.934971838197646e-07,
254
- "loss": 0.7133,
255
  "step": 350
256
  },
257
  {
258
  "epoch": 0.02304,
259
- "grad_norm": 85223.625,
260
  "learning_rate": 9.190988223246289e-07,
261
- "loss": 0.7124,
262
  "step": 360
263
  },
264
  {
265
  "epoch": 0.02368,
266
- "grad_norm": 77868.78125,
267
  "learning_rate": 9.447004608294931e-07,
268
- "loss": 0.7058,
269
  "step": 370
270
  },
271
  {
272
  "epoch": 0.02432,
273
- "grad_norm": 75874.3046875,
274
  "learning_rate": 9.703020993343575e-07,
275
- "loss": 0.7139,
276
  "step": 380
277
  },
278
  {
279
  "epoch": 0.02496,
280
- "grad_norm": 151937.703125,
281
  "learning_rate": 9.959037378392218e-07,
282
- "loss": 0.713,
283
  "step": 390
284
  },
285
  {
286
  "epoch": 0.0256,
287
- "grad_norm": 161711.671875,
288
  "learning_rate": 1.021505376344086e-06,
289
- "loss": 0.7137,
290
  "step": 400
291
  }
292
  ],
 
151
  },
152
  {
153
  "epoch": 0.01344,
154
+ "grad_norm": 144219.625,
155
  "learning_rate": 5.350742447516642e-07,
156
+ "loss": 0.7218,
157
  "step": 210
158
  },
159
  {
160
  "epoch": 0.01408,
161
+ "grad_norm": 105046.0234375,
162
  "learning_rate": 5.606758832565284e-07,
163
+ "loss": 0.718,
164
  "step": 220
165
  },
166
  {
167
  "epoch": 0.01472,
168
+ "grad_norm": 126142.4296875,
169
  "learning_rate": 5.862775217613928e-07,
170
+ "loss": 0.7107,
171
  "step": 230
172
  },
173
  {
174
  "epoch": 0.01536,
175
+ "grad_norm": 92423.2265625,
176
  "learning_rate": 6.118791602662571e-07,
177
+ "loss": 0.7271,
178
  "step": 240
179
  },
180
  {
181
  "epoch": 0.016,
182
+ "grad_norm": 98091.828125,
183
  "learning_rate": 6.374807987711214e-07,
184
+ "loss": 0.7123,
185
  "step": 250
186
  },
187
  {
188
  "epoch": 0.01664,
189
+ "grad_norm": 131949.578125,
190
  "learning_rate": 6.630824372759858e-07,
191
+ "loss": 0.7204,
192
  "step": 260
193
  },
194
  {
195
  "epoch": 0.01728,
196
+ "grad_norm": 112228.5625,
197
  "learning_rate": 6.8868407578085e-07,
198
+ "loss": 0.722,
199
  "step": 270
200
  },
201
  {
202
  "epoch": 0.01792,
203
+ "grad_norm": 64587.734375,
204
  "learning_rate": 7.142857142857143e-07,
205
+ "loss": 0.7263,
206
  "step": 280
207
  },
208
  {
209
  "epoch": 0.01856,
210
+ "grad_norm": 99893.203125,
211
  "learning_rate": 7.398873527905787e-07,
212
+ "loss": 0.7169,
213
  "step": 290
214
  },
215
  {
216
  "epoch": 0.0192,
217
+ "grad_norm": 135749.875,
218
  "learning_rate": 7.65488991295443e-07,
219
  "loss": 0.7122,
220
  "step": 300
221
  },
222
  {
223
  "epoch": 0.01984,
224
+ "grad_norm": 103292.5703125,
225
  "learning_rate": 7.910906298003073e-07,
226
+ "loss": 0.7183,
227
  "step": 310
228
  },
229
  {
230
  "epoch": 0.02048,
231
+ "grad_norm": 86927.28125,
232
  "learning_rate": 8.166922683051716e-07,
233
+ "loss": 0.7192,
234
  "step": 320
235
  },
236
  {
237
  "epoch": 0.02112,
238
+ "grad_norm": 153738.390625,
239
  "learning_rate": 8.422939068100359e-07,
240
+ "loss": 0.711,
241
  "step": 330
242
  },
243
  {
244
  "epoch": 0.02176,
245
+ "grad_norm": 69994.7734375,
246
  "learning_rate": 8.678955453149002e-07,
247
+ "loss": 0.7176,
248
  "step": 340
249
  },
250
  {
251
  "epoch": 0.0224,
252
+ "grad_norm": 141370.6875,
253
  "learning_rate": 8.934971838197646e-07,
254
+ "loss": 0.7105,
255
  "step": 350
256
  },
257
  {
258
  "epoch": 0.02304,
259
+ "grad_norm": 71139.453125,
260
  "learning_rate": 9.190988223246289e-07,
261
+ "loss": 0.7126,
262
  "step": 360
263
  },
264
  {
265
  "epoch": 0.02368,
266
+ "grad_norm": 82039.1953125,
267
  "learning_rate": 9.447004608294931e-07,
268
+ "loss": 0.7078,
269
  "step": 370
270
  },
271
  {
272
  "epoch": 0.02432,
273
+ "grad_norm": 71275.7890625,
274
  "learning_rate": 9.703020993343575e-07,
275
+ "loss": 0.7145,
276
  "step": 380
277
  },
278
  {
279
  "epoch": 0.02496,
280
+ "grad_norm": 145801.21875,
281
  "learning_rate": 9.959037378392218e-07,
282
+ "loss": 0.7102,
283
  "step": 390
284
  },
285
  {
286
  "epoch": 0.0256,
287
+ "grad_norm": 171507.0,
288
  "learning_rate": 1.021505376344086e-06,
289
+ "loss": 0.7123,
290
  "step": 400
291
  }
292
  ],
graphcodebert-robust/checkpoint-400/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b5fc4023adc281644437a690ea6a6001846d7762699cd428d4ad38e1888076db
3
  size 5841
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82e524f8f7de87947806acfb17c136195f8d3668b26513da260f1a2f14442156
3
  size 5841
graphcodebert-robust/checkpoint-600/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:75ec427b92df30abfd117ca61bf8855a95bff5b8e2f300c83f23131aa83f89a3
3
  size 498612824
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:320da2fc28dfd7f2b08f5a311e169db9c3172c660ca5f1f28958df59ff94a372
3
  size 498612824
graphcodebert-robust/checkpoint-600/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e6eaf9c7a3d50e76cca47c4da094a2db7ca99a2b289f3509dc98882e9debad13
3
  size 4741923
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41ff1d1389d831b2bc7715b986dcf40f64372807ce80b3368515da1fcaa1cb7a
3
  size 4741923
graphcodebert-robust/checkpoint-600/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:820bebfae8bbd9452955c53efeeb042e6227f4bb5c733fac637c835bd717c752
3
  size 14581
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d667b0153bf32427b60333b1fe4a206d72e36eefc1792fdf3d499d50e466bd30
3
  size 14581
graphcodebert-robust/checkpoint-600/trainer_state.json CHANGED
@@ -151,282 +151,282 @@
151
  },
152
  {
153
  "epoch": 0.01344,
154
- "grad_norm": 141396.296875,
155
  "learning_rate": 5.350742447516642e-07,
156
- "loss": 0.7217,
157
  "step": 210
158
  },
159
  {
160
  "epoch": 0.01408,
161
- "grad_norm": 102339.1640625,
162
  "learning_rate": 5.606758832565284e-07,
163
- "loss": 0.7215,
164
  "step": 220
165
  },
166
  {
167
  "epoch": 0.01472,
168
- "grad_norm": 134052.9375,
169
  "learning_rate": 5.862775217613928e-07,
170
- "loss": 0.7115,
171
  "step": 230
172
  },
173
  {
174
  "epoch": 0.01536,
175
- "grad_norm": 87181.984375,
176
  "learning_rate": 6.118791602662571e-07,
177
- "loss": 0.7241,
178
  "step": 240
179
  },
180
  {
181
  "epoch": 0.016,
182
- "grad_norm": 100231.328125,
183
  "learning_rate": 6.374807987711214e-07,
184
- "loss": 0.71,
185
  "step": 250
186
  },
187
  {
188
  "epoch": 0.01664,
189
- "grad_norm": 136721.484375,
190
  "learning_rate": 6.630824372759858e-07,
191
- "loss": 0.7188,
192
  "step": 260
193
  },
194
  {
195
  "epoch": 0.01728,
196
- "grad_norm": 115868.8125,
197
  "learning_rate": 6.8868407578085e-07,
198
- "loss": 0.7199,
199
  "step": 270
200
  },
201
  {
202
  "epoch": 0.01792,
203
- "grad_norm": 70205.1484375,
204
  "learning_rate": 7.142857142857143e-07,
205
- "loss": 0.7299,
206
  "step": 280
207
  },
208
  {
209
  "epoch": 0.01856,
210
- "grad_norm": 98926.4453125,
211
  "learning_rate": 7.398873527905787e-07,
212
- "loss": 0.7159,
213
  "step": 290
214
  },
215
  {
216
  "epoch": 0.0192,
217
- "grad_norm": 134108.140625,
218
  "learning_rate": 7.65488991295443e-07,
219
  "loss": 0.7122,
220
  "step": 300
221
  },
222
  {
223
  "epoch": 0.01984,
224
- "grad_norm": 103719.140625,
225
  "learning_rate": 7.910906298003073e-07,
226
- "loss": 0.7185,
227
  "step": 310
228
  },
229
  {
230
  "epoch": 0.02048,
231
- "grad_norm": 85624.953125,
232
  "learning_rate": 8.166922683051716e-07,
233
- "loss": 0.718,
234
  "step": 320
235
  },
236
  {
237
  "epoch": 0.02112,
238
- "grad_norm": 138824.15625,
239
  "learning_rate": 8.422939068100359e-07,
240
- "loss": 0.713,
241
  "step": 330
242
  },
243
  {
244
  "epoch": 0.02176,
245
- "grad_norm": 73629.0859375,
246
  "learning_rate": 8.678955453149002e-07,
247
- "loss": 0.7186,
248
  "step": 340
249
  },
250
  {
251
  "epoch": 0.0224,
252
- "grad_norm": 132493.0,
253
  "learning_rate": 8.934971838197646e-07,
254
- "loss": 0.7133,
255
  "step": 350
256
  },
257
  {
258
  "epoch": 0.02304,
259
- "grad_norm": 85223.625,
260
  "learning_rate": 9.190988223246289e-07,
261
- "loss": 0.7124,
262
  "step": 360
263
  },
264
  {
265
  "epoch": 0.02368,
266
- "grad_norm": 77868.78125,
267
  "learning_rate": 9.447004608294931e-07,
268
- "loss": 0.7058,
269
  "step": 370
270
  },
271
  {
272
  "epoch": 0.02432,
273
- "grad_norm": 75874.3046875,
274
  "learning_rate": 9.703020993343575e-07,
275
- "loss": 0.7139,
276
  "step": 380
277
  },
278
  {
279
  "epoch": 0.02496,
280
- "grad_norm": 151937.703125,
281
  "learning_rate": 9.959037378392218e-07,
282
- "loss": 0.713,
283
  "step": 390
284
  },
285
  {
286
  "epoch": 0.0256,
287
- "grad_norm": 161711.671875,
288
  "learning_rate": 1.021505376344086e-06,
289
- "loss": 0.7137,
290
  "step": 400
291
  },
292
  {
293
  "epoch": 0.02624,
294
- "grad_norm": 90800.234375,
295
  "learning_rate": 1.0471070148489503e-06,
296
- "loss": 0.7091,
297
  "step": 410
298
  },
299
  {
300
  "epoch": 0.02688,
301
- "grad_norm": 82131.34375,
302
  "learning_rate": 1.0727086533538148e-06,
303
- "loss": 0.7098,
304
  "step": 420
305
  },
306
  {
307
  "epoch": 0.02752,
308
- "grad_norm": 92818.9140625,
309
  "learning_rate": 1.0983102918586791e-06,
310
- "loss": 0.7099,
311
  "step": 430
312
  },
313
  {
314
  "epoch": 0.02816,
315
- "grad_norm": 88555.5078125,
316
  "learning_rate": 1.1239119303635434e-06,
317
- "loss": 0.7086,
318
  "step": 440
319
  },
320
  {
321
  "epoch": 0.0288,
322
- "grad_norm": 73428.6015625,
323
  "learning_rate": 1.1495135688684077e-06,
324
- "loss": 0.7117,
325
  "step": 450
326
  },
327
  {
328
  "epoch": 0.02944,
329
- "grad_norm": 128938.7421875,
330
  "learning_rate": 1.175115207373272e-06,
331
- "loss": 0.7182,
332
  "step": 460
333
  },
334
  {
335
  "epoch": 0.03008,
336
- "grad_norm": 102742.3359375,
337
  "learning_rate": 1.2007168458781362e-06,
338
- "loss": 0.7108,
339
  "step": 470
340
  },
341
  {
342
  "epoch": 0.03072,
343
- "grad_norm": 73825.8125,
344
  "learning_rate": 1.2263184843830007e-06,
345
- "loss": 0.7087,
346
  "step": 480
347
  },
348
  {
349
  "epoch": 0.03136,
350
- "grad_norm": 110930.75,
351
  "learning_rate": 1.251920122887865e-06,
352
- "loss": 0.7232,
353
  "step": 490
354
  },
355
  {
356
  "epoch": 0.032,
357
- "grad_norm": 95068.84375,
358
  "learning_rate": 1.2775217613927293e-06,
359
- "loss": 0.703,
360
  "step": 500
361
  },
362
  {
363
  "epoch": 0.03264,
364
- "grad_norm": 118731.9296875,
365
  "learning_rate": 1.3031233998975938e-06,
366
- "loss": 0.7063,
367
  "step": 510
368
  },
369
  {
370
  "epoch": 0.03328,
371
- "grad_norm": 80511.828125,
372
  "learning_rate": 1.3287250384024578e-06,
373
- "loss": 0.7143,
374
  "step": 520
375
  },
376
  {
377
  "epoch": 0.03392,
378
- "grad_norm": 84864.484375,
379
  "learning_rate": 1.354326676907322e-06,
380
- "loss": 0.7055,
381
  "step": 530
382
  },
383
  {
384
  "epoch": 0.03456,
385
- "grad_norm": 107800.109375,
386
  "learning_rate": 1.3799283154121864e-06,
387
- "loss": 0.7119,
388
  "step": 540
389
  },
390
  {
391
  "epoch": 0.0352,
392
- "grad_norm": 83667.671875,
393
  "learning_rate": 1.4055299539170509e-06,
394
- "loss": 0.7082,
395
  "step": 550
396
  },
397
  {
398
  "epoch": 0.03584,
399
- "grad_norm": 75656.4140625,
400
  "learning_rate": 1.4311315924219151e-06,
401
- "loss": 0.7062,
402
  "step": 560
403
  },
404
  {
405
  "epoch": 0.03648,
406
- "grad_norm": 79985.875,
407
  "learning_rate": 1.4567332309267796e-06,
408
- "loss": 0.7155,
409
  "step": 570
410
  },
411
  {
412
  "epoch": 0.03712,
413
- "grad_norm": 76334.078125,
414
  "learning_rate": 1.4823348694316437e-06,
415
- "loss": 0.7075,
416
  "step": 580
417
  },
418
  {
419
  "epoch": 0.03776,
420
- "grad_norm": 140764.03125,
421
  "learning_rate": 1.507936507936508e-06,
422
- "loss": 0.7065,
423
  "step": 590
424
  },
425
  {
426
  "epoch": 0.0384,
427
- "grad_norm": 100877.296875,
428
  "learning_rate": 1.5335381464413722e-06,
429
- "loss": 0.7096,
430
  "step": 600
431
  }
432
  ],
@@ -456,7 +456,7 @@
456
  "attributes": {}
457
  }
458
  },
459
- "total_flos": 5049397152295680.0,
460
  "train_batch_size": 32,
461
  "trial_name": null,
462
  "trial_params": null
 
151
  },
152
  {
153
  "epoch": 0.01344,
154
+ "grad_norm": 144219.625,
155
  "learning_rate": 5.350742447516642e-07,
156
+ "loss": 0.7218,
157
  "step": 210
158
  },
159
  {
160
  "epoch": 0.01408,
161
+ "grad_norm": 105046.0234375,
162
  "learning_rate": 5.606758832565284e-07,
163
+ "loss": 0.718,
164
  "step": 220
165
  },
166
  {
167
  "epoch": 0.01472,
168
+ "grad_norm": 126142.4296875,
169
  "learning_rate": 5.862775217613928e-07,
170
+ "loss": 0.7107,
171
  "step": 230
172
  },
173
  {
174
  "epoch": 0.01536,
175
+ "grad_norm": 92423.2265625,
176
  "learning_rate": 6.118791602662571e-07,
177
+ "loss": 0.7271,
178
  "step": 240
179
  },
180
  {
181
  "epoch": 0.016,
182
+ "grad_norm": 98091.828125,
183
  "learning_rate": 6.374807987711214e-07,
184
+ "loss": 0.7123,
185
  "step": 250
186
  },
187
  {
188
  "epoch": 0.01664,
189
+ "grad_norm": 131949.578125,
190
  "learning_rate": 6.630824372759858e-07,
191
+ "loss": 0.7204,
192
  "step": 260
193
  },
194
  {
195
  "epoch": 0.01728,
196
+ "grad_norm": 112228.5625,
197
  "learning_rate": 6.8868407578085e-07,
198
+ "loss": 0.722,
199
  "step": 270
200
  },
201
  {
202
  "epoch": 0.01792,
203
+ "grad_norm": 64587.734375,
204
  "learning_rate": 7.142857142857143e-07,
205
+ "loss": 0.7263,
206
  "step": 280
207
  },
208
  {
209
  "epoch": 0.01856,
210
+ "grad_norm": 99893.203125,
211
  "learning_rate": 7.398873527905787e-07,
212
+ "loss": 0.7169,
213
  "step": 290
214
  },
215
  {
216
  "epoch": 0.0192,
217
+ "grad_norm": 135749.875,
218
  "learning_rate": 7.65488991295443e-07,
219
  "loss": 0.7122,
220
  "step": 300
221
  },
222
  {
223
  "epoch": 0.01984,
224
+ "grad_norm": 103292.5703125,
225
  "learning_rate": 7.910906298003073e-07,
226
+ "loss": 0.7183,
227
  "step": 310
228
  },
229
  {
230
  "epoch": 0.02048,
231
+ "grad_norm": 86927.28125,
232
  "learning_rate": 8.166922683051716e-07,
233
+ "loss": 0.7192,
234
  "step": 320
235
  },
236
  {
237
  "epoch": 0.02112,
238
+ "grad_norm": 153738.390625,
239
  "learning_rate": 8.422939068100359e-07,
240
+ "loss": 0.711,
241
  "step": 330
242
  },
243
  {
244
  "epoch": 0.02176,
245
+ "grad_norm": 69994.7734375,
246
  "learning_rate": 8.678955453149002e-07,
247
+ "loss": 0.7176,
248
  "step": 340
249
  },
250
  {
251
  "epoch": 0.0224,
252
+ "grad_norm": 141370.6875,
253
  "learning_rate": 8.934971838197646e-07,
254
+ "loss": 0.7105,
255
  "step": 350
256
  },
257
  {
258
  "epoch": 0.02304,
259
+ "grad_norm": 71139.453125,
260
  "learning_rate": 9.190988223246289e-07,
261
+ "loss": 0.7126,
262
  "step": 360
263
  },
264
  {
265
  "epoch": 0.02368,
266
+ "grad_norm": 82039.1953125,
267
  "learning_rate": 9.447004608294931e-07,
268
+ "loss": 0.7078,
269
  "step": 370
270
  },
271
  {
272
  "epoch": 0.02432,
273
+ "grad_norm": 71275.7890625,
274
  "learning_rate": 9.703020993343575e-07,
275
+ "loss": 0.7145,
276
  "step": 380
277
  },
278
  {
279
  "epoch": 0.02496,
280
+ "grad_norm": 145801.21875,
281
  "learning_rate": 9.959037378392218e-07,
282
+ "loss": 0.7102,
283
  "step": 390
284
  },
285
  {
286
  "epoch": 0.0256,
287
+ "grad_norm": 171507.0,
288
  "learning_rate": 1.021505376344086e-06,
289
+ "loss": 0.7123,
290
  "step": 400
291
  },
292
  {
293
  "epoch": 0.02624,
294
+ "grad_norm": 79134.203125,
295
  "learning_rate": 1.0471070148489503e-06,
296
+ "loss": 0.7083,
297
  "step": 410
298
  },
299
  {
300
  "epoch": 0.02688,
301
+ "grad_norm": 69231.640625,
302
  "learning_rate": 1.0727086533538148e-06,
303
+ "loss": 0.7105,
304
  "step": 420
305
  },
306
  {
307
  "epoch": 0.02752,
308
+ "grad_norm": 113099.3984375,
309
  "learning_rate": 1.0983102918586791e-06,
310
+ "loss": 0.7141,
311
  "step": 430
312
  },
313
  {
314
  "epoch": 0.02816,
315
+ "grad_norm": 121013.734375,
316
  "learning_rate": 1.1239119303635434e-06,
317
+ "loss": 0.7146,
318
  "step": 440
319
  },
320
  {
321
  "epoch": 0.0288,
322
+ "grad_norm": 89184.609375,
323
  "learning_rate": 1.1495135688684077e-06,
324
+ "loss": 0.7133,
325
  "step": 450
326
  },
327
  {
328
  "epoch": 0.02944,
329
+ "grad_norm": 176246.890625,
330
  "learning_rate": 1.175115207373272e-06,
331
+ "loss": 0.7086,
332
  "step": 460
333
  },
334
  {
335
  "epoch": 0.03008,
336
+ "grad_norm": 88161.2265625,
337
  "learning_rate": 1.2007168458781362e-06,
338
+ "loss": 0.709,
339
  "step": 470
340
  },
341
  {
342
  "epoch": 0.03072,
343
+ "grad_norm": 74441.015625,
344
  "learning_rate": 1.2263184843830007e-06,
345
+ "loss": 0.7023,
346
  "step": 480
347
  },
348
  {
349
  "epoch": 0.03136,
350
+ "grad_norm": 96409.40625,
351
  "learning_rate": 1.251920122887865e-06,
352
+ "loss": 0.715,
353
  "step": 490
354
  },
355
  {
356
  "epoch": 0.032,
357
+ "grad_norm": 81090.6484375,
358
  "learning_rate": 1.2775217613927293e-06,
359
+ "loss": 0.7109,
360
  "step": 500
361
  },
362
  {
363
  "epoch": 0.03264,
364
+ "grad_norm": 98153.8828125,
365
  "learning_rate": 1.3031233998975938e-06,
366
+ "loss": 0.7092,
367
  "step": 510
368
  },
369
  {
370
  "epoch": 0.03328,
371
+ "grad_norm": 78782.546875,
372
  "learning_rate": 1.3287250384024578e-06,
373
+ "loss": 0.7048,
374
  "step": 520
375
  },
376
  {
377
  "epoch": 0.03392,
378
+ "grad_norm": 110360.5,
379
  "learning_rate": 1.354326676907322e-06,
380
+ "loss": 0.7108,
381
  "step": 530
382
  },
383
  {
384
  "epoch": 0.03456,
385
+ "grad_norm": 88462.0703125,
386
  "learning_rate": 1.3799283154121864e-06,
387
+ "loss": 0.7041,
388
  "step": 540
389
  },
390
  {
391
  "epoch": 0.0352,
392
+ "grad_norm": 97624.7421875,
393
  "learning_rate": 1.4055299539170509e-06,
394
+ "loss": 0.7114,
395
  "step": 550
396
  },
397
  {
398
  "epoch": 0.03584,
399
+ "grad_norm": 99471.4375,
400
  "learning_rate": 1.4311315924219151e-06,
401
+ "loss": 0.7191,
402
  "step": 560
403
  },
404
  {
405
  "epoch": 0.03648,
406
+ "grad_norm": 79087.90625,
407
  "learning_rate": 1.4567332309267796e-06,
408
+ "loss": 0.7022,
409
  "step": 570
410
  },
411
  {
412
  "epoch": 0.03712,
413
+ "grad_norm": 65275.0,
414
  "learning_rate": 1.4823348694316437e-06,
415
+ "loss": 0.7088,
416
  "step": 580
417
  },
418
  {
419
  "epoch": 0.03776,
420
+ "grad_norm": 153826.28125,
421
  "learning_rate": 1.507936507936508e-06,
422
+ "loss": 0.7079,
423
  "step": 590
424
  },
425
  {
426
  "epoch": 0.0384,
427
+ "grad_norm": 64280.38671875,
428
  "learning_rate": 1.5335381464413722e-06,
429
+ "loss": 0.7018,
430
  "step": 600
431
  }
432
  ],
 
456
  "attributes": {}
457
  }
458
  },
459
+ "total_flos": 5049545152264320.0,
460
  "train_batch_size": 32,
461
  "trial_name": null,
462
  "trial_params": null
graphcodebert-robust/checkpoint-600/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b5fc4023adc281644437a690ea6a6001846d7762699cd428d4ad38e1888076db
3
  size 5841
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82e524f8f7de87947806acfb17c136195f8d3668b26513da260f1a2f14442156
3
  size 5841
graphcodebert-robust/checkpoint-800/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d19fdc7a5fa21c91052f15414ec14e1da4bbc85f75aa66510c1c463b2f14e2f6
3
  size 498612824
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c038fee615aa3289704b6c8446543a8902b07b09cc79c21ef54c5fe8590f914e
3
  size 498612824
graphcodebert-robust/checkpoint-800/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ddec1d294331a984f4091595913e06b171ba550334d359ca9c07a294409ad9c1
3
  size 4741923
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:554c3a189d14a7538050afbd400501c37378790e4b17a4a388758bad08d098a0
3
  size 4741923
graphcodebert-robust/checkpoint-800/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:36c90ae3575630687b6a7d64bf93dded50adb1dbab4b74db0c9cdd2945f93577
3
  size 14581
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc41893e18124a5b4346e5ad2eec904a9b13636e7df7f9d4e28520206d9aac00
3
  size 14581
graphcodebert-robust/checkpoint-800/trainer_state.json CHANGED
@@ -151,422 +151,422 @@
151
  },
152
  {
153
  "epoch": 0.01344,
154
- "grad_norm": 141396.296875,
155
  "learning_rate": 5.350742447516642e-07,
156
- "loss": 0.7217,
157
  "step": 210
158
  },
159
  {
160
  "epoch": 0.01408,
161
- "grad_norm": 102339.1640625,
162
  "learning_rate": 5.606758832565284e-07,
163
- "loss": 0.7215,
164
  "step": 220
165
  },
166
  {
167
  "epoch": 0.01472,
168
- "grad_norm": 134052.9375,
169
  "learning_rate": 5.862775217613928e-07,
170
- "loss": 0.7115,
171
  "step": 230
172
  },
173
  {
174
  "epoch": 0.01536,
175
- "grad_norm": 87181.984375,
176
  "learning_rate": 6.118791602662571e-07,
177
- "loss": 0.7241,
178
  "step": 240
179
  },
180
  {
181
  "epoch": 0.016,
182
- "grad_norm": 100231.328125,
183
  "learning_rate": 6.374807987711214e-07,
184
- "loss": 0.71,
185
  "step": 250
186
  },
187
  {
188
  "epoch": 0.01664,
189
- "grad_norm": 136721.484375,
190
  "learning_rate": 6.630824372759858e-07,
191
- "loss": 0.7188,
192
  "step": 260
193
  },
194
  {
195
  "epoch": 0.01728,
196
- "grad_norm": 115868.8125,
197
  "learning_rate": 6.8868407578085e-07,
198
- "loss": 0.7199,
199
  "step": 270
200
  },
201
  {
202
  "epoch": 0.01792,
203
- "grad_norm": 70205.1484375,
204
  "learning_rate": 7.142857142857143e-07,
205
- "loss": 0.7299,
206
  "step": 280
207
  },
208
  {
209
  "epoch": 0.01856,
210
- "grad_norm": 98926.4453125,
211
  "learning_rate": 7.398873527905787e-07,
212
- "loss": 0.7159,
213
  "step": 290
214
  },
215
  {
216
  "epoch": 0.0192,
217
- "grad_norm": 134108.140625,
218
  "learning_rate": 7.65488991295443e-07,
219
  "loss": 0.7122,
220
  "step": 300
221
  },
222
  {
223
  "epoch": 0.01984,
224
- "grad_norm": 103719.140625,
225
  "learning_rate": 7.910906298003073e-07,
226
- "loss": 0.7185,
227
  "step": 310
228
  },
229
  {
230
  "epoch": 0.02048,
231
- "grad_norm": 85624.953125,
232
  "learning_rate": 8.166922683051716e-07,
233
- "loss": 0.718,
234
  "step": 320
235
  },
236
  {
237
  "epoch": 0.02112,
238
- "grad_norm": 138824.15625,
239
  "learning_rate": 8.422939068100359e-07,
240
- "loss": 0.713,
241
  "step": 330
242
  },
243
  {
244
  "epoch": 0.02176,
245
- "grad_norm": 73629.0859375,
246
  "learning_rate": 8.678955453149002e-07,
247
- "loss": 0.7186,
248
  "step": 340
249
  },
250
  {
251
  "epoch": 0.0224,
252
- "grad_norm": 132493.0,
253
  "learning_rate": 8.934971838197646e-07,
254
- "loss": 0.7133,
255
  "step": 350
256
  },
257
  {
258
  "epoch": 0.02304,
259
- "grad_norm": 85223.625,
260
  "learning_rate": 9.190988223246289e-07,
261
- "loss": 0.7124,
262
  "step": 360
263
  },
264
  {
265
  "epoch": 0.02368,
266
- "grad_norm": 77868.78125,
267
  "learning_rate": 9.447004608294931e-07,
268
- "loss": 0.7058,
269
  "step": 370
270
  },
271
  {
272
  "epoch": 0.02432,
273
- "grad_norm": 75874.3046875,
274
  "learning_rate": 9.703020993343575e-07,
275
- "loss": 0.7139,
276
  "step": 380
277
  },
278
  {
279
  "epoch": 0.02496,
280
- "grad_norm": 151937.703125,
281
  "learning_rate": 9.959037378392218e-07,
282
- "loss": 0.713,
283
  "step": 390
284
  },
285
  {
286
  "epoch": 0.0256,
287
- "grad_norm": 161711.671875,
288
  "learning_rate": 1.021505376344086e-06,
289
- "loss": 0.7137,
290
  "step": 400
291
  },
292
  {
293
  "epoch": 0.02624,
294
- "grad_norm": 90800.234375,
295
  "learning_rate": 1.0471070148489503e-06,
296
- "loss": 0.7091,
297
  "step": 410
298
  },
299
  {
300
  "epoch": 0.02688,
301
- "grad_norm": 82131.34375,
302
  "learning_rate": 1.0727086533538148e-06,
303
- "loss": 0.7098,
304
  "step": 420
305
  },
306
  {
307
  "epoch": 0.02752,
308
- "grad_norm": 92818.9140625,
309
  "learning_rate": 1.0983102918586791e-06,
310
- "loss": 0.7099,
311
  "step": 430
312
  },
313
  {
314
  "epoch": 0.02816,
315
- "grad_norm": 88555.5078125,
316
  "learning_rate": 1.1239119303635434e-06,
317
- "loss": 0.7086,
318
  "step": 440
319
  },
320
  {
321
  "epoch": 0.0288,
322
- "grad_norm": 73428.6015625,
323
  "learning_rate": 1.1495135688684077e-06,
324
- "loss": 0.7117,
325
  "step": 450
326
  },
327
  {
328
  "epoch": 0.02944,
329
- "grad_norm": 128938.7421875,
330
  "learning_rate": 1.175115207373272e-06,
331
- "loss": 0.7182,
332
  "step": 460
333
  },
334
  {
335
  "epoch": 0.03008,
336
- "grad_norm": 102742.3359375,
337
  "learning_rate": 1.2007168458781362e-06,
338
- "loss": 0.7108,
339
  "step": 470
340
  },
341
  {
342
  "epoch": 0.03072,
343
- "grad_norm": 73825.8125,
344
  "learning_rate": 1.2263184843830007e-06,
345
- "loss": 0.7087,
346
  "step": 480
347
  },
348
  {
349
  "epoch": 0.03136,
350
- "grad_norm": 110930.75,
351
  "learning_rate": 1.251920122887865e-06,
352
- "loss": 0.7232,
353
  "step": 490
354
  },
355
  {
356
  "epoch": 0.032,
357
- "grad_norm": 95068.84375,
358
  "learning_rate": 1.2775217613927293e-06,
359
- "loss": 0.703,
360
  "step": 500
361
  },
362
  {
363
  "epoch": 0.03264,
364
- "grad_norm": 118731.9296875,
365
  "learning_rate": 1.3031233998975938e-06,
366
- "loss": 0.7063,
367
  "step": 510
368
  },
369
  {
370
  "epoch": 0.03328,
371
- "grad_norm": 80511.828125,
372
  "learning_rate": 1.3287250384024578e-06,
373
- "loss": 0.7143,
374
  "step": 520
375
  },
376
  {
377
  "epoch": 0.03392,
378
- "grad_norm": 84864.484375,
379
  "learning_rate": 1.354326676907322e-06,
380
- "loss": 0.7055,
381
  "step": 530
382
  },
383
  {
384
  "epoch": 0.03456,
385
- "grad_norm": 107800.109375,
386
  "learning_rate": 1.3799283154121864e-06,
387
- "loss": 0.7119,
388
  "step": 540
389
  },
390
  {
391
  "epoch": 0.0352,
392
- "grad_norm": 83667.671875,
393
  "learning_rate": 1.4055299539170509e-06,
394
- "loss": 0.7082,
395
  "step": 550
396
  },
397
  {
398
  "epoch": 0.03584,
399
- "grad_norm": 75656.4140625,
400
  "learning_rate": 1.4311315924219151e-06,
401
- "loss": 0.7062,
402
  "step": 560
403
  },
404
  {
405
  "epoch": 0.03648,
406
- "grad_norm": 79985.875,
407
  "learning_rate": 1.4567332309267796e-06,
408
- "loss": 0.7155,
409
  "step": 570
410
  },
411
  {
412
  "epoch": 0.03712,
413
- "grad_norm": 76334.078125,
414
  "learning_rate": 1.4823348694316437e-06,
415
- "loss": 0.7075,
416
  "step": 580
417
  },
418
  {
419
  "epoch": 0.03776,
420
- "grad_norm": 140764.03125,
421
  "learning_rate": 1.507936507936508e-06,
422
- "loss": 0.7065,
423
  "step": 590
424
  },
425
  {
426
  "epoch": 0.0384,
427
- "grad_norm": 100877.296875,
428
  "learning_rate": 1.5335381464413722e-06,
429
- "loss": 0.7096,
430
  "step": 600
431
  },
432
  {
433
  "epoch": 0.03904,
434
- "grad_norm": 104088.1171875,
435
  "learning_rate": 1.5591397849462367e-06,
436
- "loss": 0.6987,
437
  "step": 610
438
  },
439
  {
440
  "epoch": 0.03968,
441
- "grad_norm": 80806.2265625,
442
  "learning_rate": 1.584741423451101e-06,
443
- "loss": 0.707,
444
  "step": 620
445
  },
446
  {
447
  "epoch": 0.04032,
448
- "grad_norm": 109884.765625,
449
  "learning_rate": 1.6103430619559655e-06,
450
- "loss": 0.6991,
451
  "step": 630
452
  },
453
  {
454
  "epoch": 0.04096,
455
- "grad_norm": 79944.890625,
456
  "learning_rate": 1.6359447004608298e-06,
457
- "loss": 0.7047,
458
  "step": 640
459
  },
460
  {
461
  "epoch": 0.0416,
462
- "grad_norm": 93673.3828125,
463
  "learning_rate": 1.6615463389656938e-06,
464
- "loss": 0.6971,
465
  "step": 650
466
  },
467
  {
468
  "epoch": 0.04224,
469
- "grad_norm": 76641.265625,
470
  "learning_rate": 1.6871479774705581e-06,
471
- "loss": 0.6957,
472
  "step": 660
473
  },
474
  {
475
  "epoch": 0.04288,
476
- "grad_norm": 73583.5546875,
477
  "learning_rate": 1.7127496159754226e-06,
478
- "loss": 0.7028,
479
  "step": 670
480
  },
481
  {
482
  "epoch": 0.04352,
483
- "grad_norm": 75177.9609375,
484
  "learning_rate": 1.7383512544802869e-06,
485
- "loss": 0.7012,
486
  "step": 680
487
  },
488
  {
489
  "epoch": 0.04416,
490
- "grad_norm": 78340.8515625,
491
  "learning_rate": 1.7639528929851512e-06,
492
- "loss": 0.6987,
493
  "step": 690
494
  },
495
  {
496
  "epoch": 0.0448,
497
- "grad_norm": 86004.1171875,
498
  "learning_rate": 1.7895545314900157e-06,
499
- "loss": 0.7061,
500
  "step": 700
501
  },
502
  {
503
  "epoch": 0.04544,
504
- "grad_norm": 94212.0390625,
505
  "learning_rate": 1.8151561699948797e-06,
506
- "loss": 0.6993,
507
  "step": 710
508
  },
509
  {
510
  "epoch": 0.04608,
511
- "grad_norm": 83918.2421875,
512
  "learning_rate": 1.840757808499744e-06,
513
- "loss": 0.7009,
514
  "step": 720
515
  },
516
  {
517
  "epoch": 0.04672,
518
- "grad_norm": 68374.3125,
519
  "learning_rate": 1.8663594470046085e-06,
520
- "loss": 0.6964,
521
  "step": 730
522
  },
523
  {
524
  "epoch": 0.04736,
525
- "grad_norm": 90348.78125,
526
  "learning_rate": 1.8919610855094728e-06,
527
- "loss": 0.7011,
528
  "step": 740
529
  },
530
  {
531
  "epoch": 0.048,
532
- "grad_norm": 146658.0,
533
  "learning_rate": 1.9175627240143373e-06,
534
- "loss": 0.7003,
535
  "step": 750
536
  },
537
  {
538
  "epoch": 0.04864,
539
- "grad_norm": 112037.1640625,
540
  "learning_rate": 1.9431643625192015e-06,
541
- "loss": 0.7051,
542
  "step": 760
543
  },
544
  {
545
  "epoch": 0.04928,
546
- "grad_norm": 70628.625,
547
  "learning_rate": 1.9687660010240654e-06,
548
- "loss": 0.6923,
549
  "step": 770
550
  },
551
  {
552
  "epoch": 0.04992,
553
- "grad_norm": 109922.125,
554
  "learning_rate": 1.99436763952893e-06,
555
- "loss": 0.6893,
556
  "step": 780
557
  },
558
  {
559
  "epoch": 0.05056,
560
- "grad_norm": 135306.375,
561
  "learning_rate": 2.0199692780337944e-06,
562
- "loss": 0.7008,
563
  "step": 790
564
  },
565
  {
566
  "epoch": 0.0512,
567
- "grad_norm": 82354.8046875,
568
  "learning_rate": 2.0455709165386586e-06,
569
- "loss": 0.705,
570
  "step": 800
571
  }
572
  ],
@@ -596,7 +596,7 @@
596
  "attributes": {}
597
  }
598
  },
599
- "total_flos": 6733094128867200.0,
600
  "train_batch_size": 32,
601
  "trial_name": null,
602
  "trial_params": null
 
151
  },
152
  {
153
  "epoch": 0.01344,
154
+ "grad_norm": 144219.625,
155
  "learning_rate": 5.350742447516642e-07,
156
+ "loss": 0.7218,
157
  "step": 210
158
  },
159
  {
160
  "epoch": 0.01408,
161
+ "grad_norm": 105046.0234375,
162
  "learning_rate": 5.606758832565284e-07,
163
+ "loss": 0.718,
164
  "step": 220
165
  },
166
  {
167
  "epoch": 0.01472,
168
+ "grad_norm": 126142.4296875,
169
  "learning_rate": 5.862775217613928e-07,
170
+ "loss": 0.7107,
171
  "step": 230
172
  },
173
  {
174
  "epoch": 0.01536,
175
+ "grad_norm": 92423.2265625,
176
  "learning_rate": 6.118791602662571e-07,
177
+ "loss": 0.7271,
178
  "step": 240
179
  },
180
  {
181
  "epoch": 0.016,
182
+ "grad_norm": 98091.828125,
183
  "learning_rate": 6.374807987711214e-07,
184
+ "loss": 0.7123,
185
  "step": 250
186
  },
187
  {
188
  "epoch": 0.01664,
189
+ "grad_norm": 131949.578125,
190
  "learning_rate": 6.630824372759858e-07,
191
+ "loss": 0.7204,
192
  "step": 260
193
  },
194
  {
195
  "epoch": 0.01728,
196
+ "grad_norm": 112228.5625,
197
  "learning_rate": 6.8868407578085e-07,
198
+ "loss": 0.722,
199
  "step": 270
200
  },
201
  {
202
  "epoch": 0.01792,
203
+ "grad_norm": 64587.734375,
204
  "learning_rate": 7.142857142857143e-07,
205
+ "loss": 0.7263,
206
  "step": 280
207
  },
208
  {
209
  "epoch": 0.01856,
210
+ "grad_norm": 99893.203125,
211
  "learning_rate": 7.398873527905787e-07,
212
+ "loss": 0.7169,
213
  "step": 290
214
  },
215
  {
216
  "epoch": 0.0192,
217
+ "grad_norm": 135749.875,
218
  "learning_rate": 7.65488991295443e-07,
219
  "loss": 0.7122,
220
  "step": 300
221
  },
222
  {
223
  "epoch": 0.01984,
224
+ "grad_norm": 103292.5703125,
225
  "learning_rate": 7.910906298003073e-07,
226
+ "loss": 0.7183,
227
  "step": 310
228
  },
229
  {
230
  "epoch": 0.02048,
231
+ "grad_norm": 86927.28125,
232
  "learning_rate": 8.166922683051716e-07,
233
+ "loss": 0.7192,
234
  "step": 320
235
  },
236
  {
237
  "epoch": 0.02112,
238
+ "grad_norm": 153738.390625,
239
  "learning_rate": 8.422939068100359e-07,
240
+ "loss": 0.711,
241
  "step": 330
242
  },
243
  {
244
  "epoch": 0.02176,
245
+ "grad_norm": 69994.7734375,
246
  "learning_rate": 8.678955453149002e-07,
247
+ "loss": 0.7176,
248
  "step": 340
249
  },
250
  {
251
  "epoch": 0.0224,
252
+ "grad_norm": 141370.6875,
253
  "learning_rate": 8.934971838197646e-07,
254
+ "loss": 0.7105,
255
  "step": 350
256
  },
257
  {
258
  "epoch": 0.02304,
259
+ "grad_norm": 71139.453125,
260
  "learning_rate": 9.190988223246289e-07,
261
+ "loss": 0.7126,
262
  "step": 360
263
  },
264
  {
265
  "epoch": 0.02368,
266
+ "grad_norm": 82039.1953125,
267
  "learning_rate": 9.447004608294931e-07,
268
+ "loss": 0.7078,
269
  "step": 370
270
  },
271
  {
272
  "epoch": 0.02432,
273
+ "grad_norm": 71275.7890625,
274
  "learning_rate": 9.703020993343575e-07,
275
+ "loss": 0.7145,
276
  "step": 380
277
  },
278
  {
279
  "epoch": 0.02496,
280
+ "grad_norm": 145801.21875,
281
  "learning_rate": 9.959037378392218e-07,
282
+ "loss": 0.7102,
283
  "step": 390
284
  },
285
  {
286
  "epoch": 0.0256,
287
+ "grad_norm": 171507.0,
288
  "learning_rate": 1.021505376344086e-06,
289
+ "loss": 0.7123,
290
  "step": 400
291
  },
292
  {
293
  "epoch": 0.02624,
294
+ "grad_norm": 79134.203125,
295
  "learning_rate": 1.0471070148489503e-06,
296
+ "loss": 0.7083,
297
  "step": 410
298
  },
299
  {
300
  "epoch": 0.02688,
301
+ "grad_norm": 69231.640625,
302
  "learning_rate": 1.0727086533538148e-06,
303
+ "loss": 0.7105,
304
  "step": 420
305
  },
306
  {
307
  "epoch": 0.02752,
308
+ "grad_norm": 113099.3984375,
309
  "learning_rate": 1.0983102918586791e-06,
310
+ "loss": 0.7141,
311
  "step": 430
312
  },
313
  {
314
  "epoch": 0.02816,
315
+ "grad_norm": 121013.734375,
316
  "learning_rate": 1.1239119303635434e-06,
317
+ "loss": 0.7146,
318
  "step": 440
319
  },
320
  {
321
  "epoch": 0.0288,
322
+ "grad_norm": 89184.609375,
323
  "learning_rate": 1.1495135688684077e-06,
324
+ "loss": 0.7133,
325
  "step": 450
326
  },
327
  {
328
  "epoch": 0.02944,
329
+ "grad_norm": 176246.890625,
330
  "learning_rate": 1.175115207373272e-06,
331
+ "loss": 0.7086,
332
  "step": 460
333
  },
334
  {
335
  "epoch": 0.03008,
336
+ "grad_norm": 88161.2265625,
337
  "learning_rate": 1.2007168458781362e-06,
338
+ "loss": 0.709,
339
  "step": 470
340
  },
341
  {
342
  "epoch": 0.03072,
343
+ "grad_norm": 74441.015625,
344
  "learning_rate": 1.2263184843830007e-06,
345
+ "loss": 0.7023,
346
  "step": 480
347
  },
348
  {
349
  "epoch": 0.03136,
350
+ "grad_norm": 96409.40625,
351
  "learning_rate": 1.251920122887865e-06,
352
+ "loss": 0.715,
353
  "step": 490
354
  },
355
  {
356
  "epoch": 0.032,
357
+ "grad_norm": 81090.6484375,
358
  "learning_rate": 1.2775217613927293e-06,
359
+ "loss": 0.7109,
360
  "step": 500
361
  },
362
  {
363
  "epoch": 0.03264,
364
+ "grad_norm": 98153.8828125,
365
  "learning_rate": 1.3031233998975938e-06,
366
+ "loss": 0.7092,
367
  "step": 510
368
  },
369
  {
370
  "epoch": 0.03328,
371
+ "grad_norm": 78782.546875,
372
  "learning_rate": 1.3287250384024578e-06,
373
+ "loss": 0.7048,
374
  "step": 520
375
  },
376
  {
377
  "epoch": 0.03392,
378
+ "grad_norm": 110360.5,
379
  "learning_rate": 1.354326676907322e-06,
380
+ "loss": 0.7108,
381
  "step": 530
382
  },
383
  {
384
  "epoch": 0.03456,
385
+ "grad_norm": 88462.0703125,
386
  "learning_rate": 1.3799283154121864e-06,
387
+ "loss": 0.7041,
388
  "step": 540
389
  },
390
  {
391
  "epoch": 0.0352,
392
+ "grad_norm": 97624.7421875,
393
  "learning_rate": 1.4055299539170509e-06,
394
+ "loss": 0.7114,
395
  "step": 550
396
  },
397
  {
398
  "epoch": 0.03584,
399
+ "grad_norm": 99471.4375,
400
  "learning_rate": 1.4311315924219151e-06,
401
+ "loss": 0.7191,
402
  "step": 560
403
  },
404
  {
405
  "epoch": 0.03648,
406
+ "grad_norm": 79087.90625,
407
  "learning_rate": 1.4567332309267796e-06,
408
+ "loss": 0.7022,
409
  "step": 570
410
  },
411
  {
412
  "epoch": 0.03712,
413
+ "grad_norm": 65275.0,
414
  "learning_rate": 1.4823348694316437e-06,
415
+ "loss": 0.7088,
416
  "step": 580
417
  },
418
  {
419
  "epoch": 0.03776,
420
+ "grad_norm": 153826.28125,
421
  "learning_rate": 1.507936507936508e-06,
422
+ "loss": 0.7079,
423
  "step": 590
424
  },
425
  {
426
  "epoch": 0.0384,
427
+ "grad_norm": 64280.38671875,
428
  "learning_rate": 1.5335381464413722e-06,
429
+ "loss": 0.7018,
430
  "step": 600
431
  },
432
  {
433
  "epoch": 0.03904,
434
+ "grad_norm": 65060.80078125,
435
  "learning_rate": 1.5591397849462367e-06,
436
+ "loss": 0.7027,
437
  "step": 610
438
  },
439
  {
440
  "epoch": 0.03968,
441
+ "grad_norm": 77339.2890625,
442
  "learning_rate": 1.584741423451101e-06,
443
+ "loss": 0.7038,
444
  "step": 620
445
  },
446
  {
447
  "epoch": 0.04032,
448
+ "grad_norm": 123140.5546875,
449
  "learning_rate": 1.6103430619559655e-06,
450
+ "loss": 0.7019,
451
  "step": 630
452
  },
453
  {
454
  "epoch": 0.04096,
455
+ "grad_norm": 67502.71875,
456
  "learning_rate": 1.6359447004608298e-06,
457
+ "loss": 0.7094,
458
  "step": 640
459
  },
460
  {
461
  "epoch": 0.0416,
462
+ "grad_norm": 95452.1796875,
463
  "learning_rate": 1.6615463389656938e-06,
464
+ "loss": 0.6998,
465
  "step": 650
466
  },
467
  {
468
  "epoch": 0.04224,
469
+ "grad_norm": 68556.421875,
470
  "learning_rate": 1.6871479774705581e-06,
471
+ "loss": 0.694,
472
  "step": 660
473
  },
474
  {
475
  "epoch": 0.04288,
476
+ "grad_norm": 78265.8046875,
477
  "learning_rate": 1.7127496159754226e-06,
478
+ "loss": 0.7051,
479
  "step": 670
480
  },
481
  {
482
  "epoch": 0.04352,
483
+ "grad_norm": 93559.3359375,
484
  "learning_rate": 1.7383512544802869e-06,
485
+ "loss": 0.6997,
486
  "step": 680
487
  },
488
  {
489
  "epoch": 0.04416,
490
+ "grad_norm": 88091.9375,
491
  "learning_rate": 1.7639528929851512e-06,
492
+ "loss": 0.6963,
493
  "step": 690
494
  },
495
  {
496
  "epoch": 0.0448,
497
+ "grad_norm": 73024.359375,
498
  "learning_rate": 1.7895545314900157e-06,
499
+ "loss": 0.7021,
500
  "step": 700
501
  },
502
  {
503
  "epoch": 0.04544,
504
+ "grad_norm": 100058.2890625,
505
  "learning_rate": 1.8151561699948797e-06,
506
+ "loss": 0.7022,
507
  "step": 710
508
  },
509
  {
510
  "epoch": 0.04608,
511
+ "grad_norm": 99197.1953125,
512
  "learning_rate": 1.840757808499744e-06,
513
+ "loss": 0.7017,
514
  "step": 720
515
  },
516
  {
517
  "epoch": 0.04672,
518
+ "grad_norm": 102018.984375,
519
  "learning_rate": 1.8663594470046085e-06,
520
+ "loss": 0.6985,
521
  "step": 730
522
  },
523
  {
524
  "epoch": 0.04736,
525
+ "grad_norm": 101586.0234375,
526
  "learning_rate": 1.8919610855094728e-06,
527
+ "loss": 0.6991,
528
  "step": 740
529
  },
530
  {
531
  "epoch": 0.048,
532
+ "grad_norm": 151948.25,
533
  "learning_rate": 1.9175627240143373e-06,
534
+ "loss": 0.6977,
535
  "step": 750
536
  },
537
  {
538
  "epoch": 0.04864,
539
+ "grad_norm": 88698.7109375,
540
  "learning_rate": 1.9431643625192015e-06,
541
+ "loss": 0.6961,
542
  "step": 760
543
  },
544
  {
545
  "epoch": 0.04928,
546
+ "grad_norm": 82451.9296875,
547
  "learning_rate": 1.9687660010240654e-06,
548
+ "loss": 0.6898,
549
  "step": 770
550
  },
551
  {
552
  "epoch": 0.04992,
553
+ "grad_norm": 82236.453125,
554
  "learning_rate": 1.99436763952893e-06,
555
+ "loss": 0.6886,
556
  "step": 780
557
  },
558
  {
559
  "epoch": 0.05056,
560
+ "grad_norm": 155064.484375,
561
  "learning_rate": 2.0199692780337944e-06,
562
+ "loss": 0.6921,
563
  "step": 790
564
  },
565
  {
566
  "epoch": 0.0512,
567
+ "grad_norm": 72238.6328125,
568
  "learning_rate": 2.0455709165386586e-06,
569
+ "loss": 0.6932,
570
  "step": 800
571
  }
572
  ],
 
596
  "attributes": {}
597
  }
598
  },
599
+ "total_flos": 6733455906568320.0,
600
  "train_batch_size": 32,
601
  "trial_name": null,
602
  "trial_params": null
graphcodebert-robust/checkpoint-800/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b5fc4023adc281644437a690ea6a6001846d7762699cd428d4ad38e1888076db
3
  size 5841
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82e524f8f7de87947806acfb17c136195f8d3668b26513da260f1a2f14442156
3
  size 5841
graphcodebert-robust/training.log CHANGED
@@ -1,10 +1,34 @@
1
- 2026-04-16 10:18:35,455 - INFO - train_pipeline - Logging to ./output_checkpoints/graphcodebert-robust/training.log
2
- 2026-04-16 10:18:35,457 - INFO - train_pipeline - Training config: TrainConfig(model_name='microsoft/graphcodebert-base', output_dir='./output_checkpoints/graphcodebert-robust', num_epochs=5, batch_size=32, learning_rate=2e-05, max_length=512, num_labels=2, use_wandb=True, freeze_base=True, loss_type='r-drop', focal_alpha=1.0, focal_gamma=2.0, r_drop_alpha=4.0, infonce_temperature=0.07, infonce_weight=0.5, seed=42, resume_from_checkpoint='output_checkpoints/graphcodebert-robust/checkpoint-1000', label_smoothing=0.1, adversarial_epsilon=0.5, use_swa=True, swa_start_epoch=2, swa_lr=1e-05, data_augmentation=True, aug_rename_prob=0.3, aug_format_prob=0.3, device=device(type='cuda'))
3
- 2026-04-16 10:18:35,458 - INFO - train_pipeline - Loading model & tokenizer for 'microsoft/graphcodebert-base'
4
- 2026-04-16 10:18:36,698 - INFO - train_pipeline - Model placed on cuda
5
- 2026-04-16 10:18:36,701 - INFO - train_pipeline - Base model weights frozen – only classifier head will be trained.
6
- 2026-04-16 10:18:36,702 - INFO - train_pipeline - ===== Model Architecture =====
7
- 2026-04-16 10:18:36,705 - INFO - train_pipeline -
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  RobertaForSequenceClassification(
9
  (roberta): RobertaModel(
10
  (embeddings): RobertaEmbeddings(
@@ -12,7 +36,7 @@ RobertaForSequenceClassification(
12
  (position_embeddings): Embedding(514, 768, padding_idx=1)
13
  (token_type_embeddings): Embedding(1, 768)
14
  (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
15
- (dropout): Dropout(p=0.2, inplace=False)
16
  )
17
  (encoder): RobertaEncoder(
18
  (layer): ModuleList(
@@ -22,12 +46,12 @@ RobertaForSequenceClassification(
22
  (query): Linear(in_features=768, out_features=768, bias=True)
23
  (key): Linear(in_features=768, out_features=768, bias=True)
24
  (value): Linear(in_features=768, out_features=768, bias=True)
25
- (dropout): Dropout(p=0.2, inplace=False)
26
  )
27
  (output): RobertaSelfOutput(
28
  (dense): Linear(in_features=768, out_features=768, bias=True)
29
  (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
30
- (dropout): Dropout(p=0.2, inplace=False)
31
  )
32
  )
33
  (intermediate): RobertaIntermediate(
@@ -37,7 +61,7 @@ RobertaForSequenceClassification(
37
  (output): RobertaOutput(
38
  (dense): Linear(in_features=3072, out_features=768, bias=True)
39
  (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
40
- (dropout): Dropout(p=0.2, inplace=False)
41
  )
42
  )
43
  )
@@ -45,16 +69,16 @@ RobertaForSequenceClassification(
45
  )
46
  (classifier): RobertaClassificationHead(
47
  (dense): Linear(in_features=768, out_features=768, bias=True)
48
- (dropout): Dropout(p=0.2, inplace=False)
49
  (out_proj): Linear(in_features=768, out_features=2, bias=True)
50
  )
51
  )
52
- 2026-04-16 10:18:36,707 - INFO - train_pipeline - ===== Parameter Summary =====
53
- 2026-04-16 10:18:36,709 - INFO - train_pipeline - Total Parameters: 124,647,170
54
- 2026-04-16 10:18:36,711 - INFO - train_pipeline - Trainable Parameters: 592,130
55
- 2026-04-16 10:18:36,712 - INFO - train_pipeline - Non-trainable Parameters: 124,055,040
56
- 2026-04-16 10:18:36,713 - INFO - train_pipeline - ===== Tokenizer Summary =====
57
- 2026-04-16 10:18:36,732 - INFO - train_pipeline - Vocab size: 50265 | Special tokens: ['<s>', '</s>', '<unk>', '<pad>', '<mask>']
58
- 2026-04-16 10:18:36,734 - INFO - train_pipeline - ===== End of Architecture Log =====
59
- 2026-04-16 10:18:36,735 - INFO - train_pipeline - Data augmentation enabled (rename=0.3, format=0.3)
60
- 2026-04-16 10:18:38,005 - INFO - train_pipeline - === Starting training with robust regularisation ===
 
1
+ 2026-04-17 08:00:34,522 - INFO - train_pipeline - Logging to ./output_checkpoints/graphcodebert-robust/training.log
2
+ 2026-04-17 08:00:34,525 - INFO - train_pipeline - ===== Training Configuration =====
3
+ 2026-04-17 08:00:34,526 - INFO - train_pipeline - model_name : microsoft/graphcodebert-base
4
+ 2026-04-17 08:00:34,528 - INFO - train_pipeline - output_dir : ./output_checkpoints/graphcodebert-robust
5
+ 2026-04-17 08:00:34,529 - INFO - train_pipeline - num_epochs : 5
6
+ 2026-04-17 08:00:34,531 - INFO - train_pipeline - batch_size : 32
7
+ 2026-04-17 08:00:34,533 - INFO - train_pipeline - learning_rate : 2e-05
8
+ 2026-04-17 08:00:34,535 - INFO - train_pipeline - max_length : 512
9
+ 2026-04-17 08:00:34,536 - INFO - train_pipeline - num_labels : 2
10
+ 2026-04-17 08:00:34,538 - INFO - train_pipeline - use_wandb : True
11
+ 2026-04-17 08:00:34,540 - INFO - train_pipeline - freeze_base : True
12
+ 2026-04-17 08:00:34,541 - INFO - train_pipeline - loss_type : r-drop
13
+ 2026-04-17 08:00:34,542 - INFO - train_pipeline - focal_alpha : 1.0
14
+ 2026-04-17 08:00:34,544 - INFO - train_pipeline - focal_gamma : 2.0
15
+ 2026-04-17 08:00:34,545 - INFO - train_pipeline - r_drop_alpha : 4.0
16
+ 2026-04-17 08:00:34,546 - INFO - train_pipeline - infonce_temperature : 0.07
17
+ 2026-04-17 08:00:34,548 - INFO - train_pipeline - infonce_weight : 0.5
18
+ 2026-04-17 08:00:34,550 - INFO - train_pipeline - seed : 42
19
+ 2026-04-17 08:00:34,552 - INFO - train_pipeline - resume_from_checkpoint : None
20
+ 2026-04-17 08:00:34,553 - INFO - train_pipeline - label_smoothing : 0.1
21
+ 2026-04-17 08:00:34,554 - INFO - train_pipeline - adversarial_epsilon : 0.5
22
+ 2026-04-17 08:00:34,556 - INFO - train_pipeline - use_swa : True
23
+ 2026-04-17 08:00:34,557 - INFO - train_pipeline - swa_start_epoch : 2
24
+ 2026-04-17 08:00:34,558 - INFO - train_pipeline - swa_lr : 1e-05
25
+ 2026-04-17 08:00:34,559 - INFO - train_pipeline - data_augmentation : True
26
+ 2026-04-17 08:00:34,561 - INFO - train_pipeline - aug_rename_prob : 0.3
27
+ 2026-04-17 08:00:34,562 - INFO - train_pipeline - aug_format_prob : 0.3
28
+ 2026-04-17 08:00:34,564 - INFO - train_pipeline - =================================
29
+ 2026-04-17 08:00:35,711 - INFO - train_pipeline - Model placed on cuda
30
+ 2026-04-17 08:00:35,716 - INFO - train_pipeline - ===== Model Architecture =====
31
+ 2026-04-17 08:00:35,718 - INFO - train_pipeline -
32
  RobertaForSequenceClassification(
33
  (roberta): RobertaModel(
34
  (embeddings): RobertaEmbeddings(
 
36
  (position_embeddings): Embedding(514, 768, padding_idx=1)
37
  (token_type_embeddings): Embedding(1, 768)
38
  (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
39
+ (dropout): Dropout(p=0.1, inplace=False)
40
  )
41
  (encoder): RobertaEncoder(
42
  (layer): ModuleList(
 
46
  (query): Linear(in_features=768, out_features=768, bias=True)
47
  (key): Linear(in_features=768, out_features=768, bias=True)
48
  (value): Linear(in_features=768, out_features=768, bias=True)
49
+ (dropout): Dropout(p=0.1, inplace=False)
50
  )
51
  (output): RobertaSelfOutput(
52
  (dense): Linear(in_features=768, out_features=768, bias=True)
53
  (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
54
+ (dropout): Dropout(p=0.1, inplace=False)
55
  )
56
  )
57
  (intermediate): RobertaIntermediate(
 
61
  (output): RobertaOutput(
62
  (dense): Linear(in_features=3072, out_features=768, bias=True)
63
  (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
64
+ (dropout): Dropout(p=0.1, inplace=False)
65
  )
66
  )
67
  )
 
69
  )
70
  (classifier): RobertaClassificationHead(
71
  (dense): Linear(in_features=768, out_features=768, bias=True)
72
+ (dropout): Dropout(p=0.1, inplace=False)
73
  (out_proj): Linear(in_features=768, out_features=2, bias=True)
74
  )
75
  )
76
+ 2026-04-17 08:00:35,722 - INFO - train_pipeline - ===== Parameter Summary =====
77
+ 2026-04-17 08:00:35,723 - INFO - train_pipeline - Total Parameters: 124,647,170
78
+ 2026-04-17 08:00:35,724 - INFO - train_pipeline - Trainable Parameters: 592,130
79
+ 2026-04-17 08:00:35,725 - INFO - train_pipeline - Non-trainable Parameters: 124,055,040
80
+ 2026-04-17 08:00:35,727 - INFO - train_pipeline - ===== Tokenizer Summary =====
81
+ 2026-04-17 08:00:35,747 - INFO - train_pipeline - Vocab size: 50265 | Special tokens: ['<s>', '</s>', '<unk>', '<pad>', '<mask>']
82
+ 2026-04-17 08:00:35,749 - INFO - train_pipeline - ===== End of Architecture Log =====
83
+ 2026-04-17 08:00:35,751 - INFO - train_pipeline - Data augmentation enabled (rename=0.3, format=0.3)
84
+ 2026-04-17 08:00:36,645 - INFO - train_pipeline - === Starting training with robust regularisation ===