kmchiti's picture
Upload results/eval_difficulty/summary.json
57a8c18 verified
[
{
"checkpoint": "checkpoint-18779",
"id_acc": 0.9630347222222222,
"ood_acc": 0.25220703125,
"total_acc": 0.588914884868421,
"id_avg_loss": 6.54602900314331,
"ood_avg_loss": 5.825457085227966,
"total_avg_loss": 6.166780625293129,
"id_avg_resp_len": 201.21272916666666,
"ood_avg_resp_len": 284.85050390625,
"count_id": 1152000,
"count_ood": 1280000,
"count_total": 2432000,
"template_metrics": {
"crazy_zootopia": {
"count": 809728,
"correct": 477128,
"answer_accuracy": 0.5892447834334492,
"avg_response_len": 235.475637498024,
"resp_tokens_sum": 190671217,
"pass_at_k": {
"pass@1": 0.5892447834334492,
"pass@2": 0.6242494417489666,
"pass@4": 0.6517067209785661,
"pass@8": 0.6752650274111465,
"pass@16": 0.6966098480320178,
"pass@32": 0.7161446042348242,
"pass@64": 0.7340453705919688,
"pass@128": 0.7510275055327221
},
"per_op_pass_at_k": {
"10": {
"pass@1": 0.8994976032448377,
"pass@2": 0.9366747090795067,
"pass@4": 0.954640098361598,
"pass@8": 0.9671935395338649,
"pass@16": 0.9780409476989385,
"pass@32": 0.9873672705205916,
"pass@64": 0.9931124421272916,
"pass@128": 0.9941002949852508
},
"2": {
"pass@1": 0.9997322819314641,
"pass@2": 0.9999915679348491,
"pass@4": 0.9999999897793149,
"pass@8": 1.0,
"pass@16": 1.0,
"pass@32": 1.0,
"pass@64": 1.0,
"pass@128": 1.0
},
"3": {
"pass@1": 0.9875525611620795,
"pass@2": 0.9937483445303281,
"pass@4": 0.9960841566249173,
"pass@8": 0.9973945614382809,
"pass@16": 0.9984052318072489,
"pass@32": 0.9992931754002767,
"pass@64": 0.9999118688143109,
"pass@128": 1.0
},
"4": {
"pass@1": 0.996337890625,
"pass@2": 0.9982602577509845,
"pass@4": 0.9992694307742784,
"pass@8": 0.9998455780548049,
"pass@16": 0.9999940908757206,
"pass@32": 0.9999999967265157,
"pass@64": 1.0,
"pass@128": 1.0
},
"5": {
"pass@1": 0.9481150793650793,
"pass@2": 0.9641533089613797,
"pass@4": 0.9757381440415184,
"pass@8": 0.9837977641761237,
"pass@16": 0.9891567790393069,
"pass@32": 0.9924331098404072,
"pass@64": 0.9948057827917357,
"pass@128": 0.9968253968253968
},
"6": {
"pass@1": 0.9665746631736527,
"pass@2": 0.9810815131547932,
"pass@4": 0.9889092237347574,
"pass@8": 0.992817054998713,
"pass@16": 0.9942118482404367,
"pass@32": 0.9947566055669427,
"pass@64": 0.9955089818359227,
"pass@128": 0.9970059880239521
},
"7": {
"pass@1": 0.9575397559171598,
"pass@2": 0.9706686361412663,
"pass@4": 0.9781720887884576,
"pass@8": 0.9836457175352227,
"pass@16": 0.9877211930993773,
"pass@32": 0.991030454111008,
"pass@64": 0.9934560055493304,
"pass@128": 0.9940828402366864
},
"8": {
"pass@1": 0.9534755608974359,
"pass@2": 0.9758759684786995,
"pass@4": 0.9847553208493165,
"pass@8": 0.9897007291187125,
"pass@16": 0.9935694000922215,
"pass@32": 0.9965151373391712,
"pass@64": 0.998300039303945,
"pass@128": 1.0
},
"9": {
"pass@1": 0.9408450704225352,
"pass@2": 0.9764160751913055,
"pass@4": 0.9897490211610868,
"pass@8": 0.9954928597779902,
"pass@16": 0.9986146363237696,
"pass@32": 0.9998472651017836,
"pass@64": 0.9999990904114461,
"pass@128": 1.0
},
"11": {
"pass@1": 0.7521689093484419,
"pass@2": 0.8267172826838576,
"pass@4": 0.8749963240253604,
"pass@8": 0.9072539747181511,
"pass@16": 0.9305891455160197,
"pass@32": 0.9482171914771189,
"pass@64": 0.9598296757598728,
"pass@128": 0.9660056657223796
},
"12": {
"pass@1": 0.4268626412429379,
"pass@2": 0.5226548250589441,
"pass@4": 0.5947451512946471,
"pass@8": 0.6499540700619184,
"pass@16": 0.697671407935311,
"pass@32": 0.7391764349175001,
"pass@64": 0.7728700105439386,
"pass@128": 0.8022598870056498
},
"13": {
"pass@1": 0.226048197492163,
"pass@2": 0.2745025516500875,
"pass@4": 0.32170087379359735,
"pass@8": 0.36726090847720233,
"pass@16": 0.4118837384913241,
"pass@32": 0.4577964268716136,
"pass@64": 0.5060004900301981,
"pass@128": 0.554858934169279
},
"14": {
"pass@1": 0.1970404984423676,
"pass@2": 0.2447659871955257,
"pass@4": 0.2905214605183697,
"pass@8": 0.33432116638118264,
"pass@16": 0.37689919457954874,
"pass@32": 0.41849874775311363,
"pass@64": 0.46201871752633594,
"pass@128": 0.5109034267912772
},
"15": {
"pass@1": 0.190774024566474,
"pass@2": 0.24494681910245306,
"pass@4": 0.29680882700356104,
"pass@8": 0.3440605324830605,
"pass@16": 0.38846742937996726,
"pass@32": 0.43008799604881653,
"pass@64": 0.4654618905817254,
"pass@128": 0.4913294797687861
},
"16": {
"pass@1": 0.1630796370967742,
"pass@2": 0.1964297053594107,
"pass@4": 0.23033826426938572,
"pass@8": 0.2660781043146322,
"pass@16": 0.3019958633560044,
"pass@32": 0.33569879011745574,
"pass@64": 0.3686862876847773,
"pass@128": 0.4064516129032258
},
"17": {
"pass@1": 0.1676300578034682,
"pass@2": 0.20699551397296437,
"pass@4": 0.2408987041359715,
"pass@8": 0.271017564290959,
"pass@16": 0.29876652985892266,
"pass@32": 0.3264747152113181,
"pass@64": 0.3549548508714773,
"pass@128": 0.38439306358381503
},
"18": {
"pass@1": 0.1616517857142857,
"pass@2": 0.19595648200224966,
"pass@4": 0.23106420322459695,
"pass@8": 0.26888622832851616,
"pass@16": 0.3071732341417159,
"pass@32": 0.3420263056744111,
"pass@64": 0.3714714335108666,
"pass@128": 0.39714285714285713
},
"19": {
"pass@1": 0.15040822072072071,
"pass@2": 0.1915057281691139,
"pass@4": 0.2273034600967671,
"pass@8": 0.2598221642368353,
"pass@16": 0.2905735616301428,
"pass@32": 0.32042698971733125,
"pass@64": 0.35124146512270843,
"pass@128": 0.3843843843843844
},
"20": {
"pass@1": 0.15052552552552553,
"pass@2": 0.18885223806483653,
"pass@4": 0.22942468621602477,
"pass@8": 0.26979999128432536,
"pass@16": 0.30689533475643643,
"pass@32": 0.3397581402934893,
"pass@64": 0.370991289408613,
"pass@128": 0.4024024024024024
}
}
},
"teachers_in_school": {
"count": 821120,
"correct": 483803,
"answer_accuracy": 0.5891988990646921,
"avg_response_len": 242.341766124318,
"resp_tokens_sum": 198991671,
"pass_at_k": {
"pass@1": 0.5891988990646921,
"pass@2": 0.6227068003142299,
"pass@4": 0.650107249088997,
"pass@8": 0.6741994105674554,
"pass@16": 0.6956359816626693,
"pass@32": 0.7149534992963424,
"pass@64": 0.7325456550204769,
"pass@128": 0.7480904130943102
},
"per_op_pass_at_k": {
"10": {
"pass@1": 0.8878930214723927,
"pass@2": 0.9240016333751996,
"pass@4": 0.9427751138086263,
"pass@8": 0.9559837013558444,
"pass@16": 0.9669589357176731,
"pass@32": 0.9757400698048759,
"pass@64": 0.9820426961144421,
"pass@128": 0.9877300613496932
},
"2": {
"pass@1": 0.9987177051671733,
"pass@2": 0.9996395064978579,
"pass@4": 0.9999613177152248,
"pass@8": 0.9999996232076699,
"pass@16": 0.9999999999864305,
"pass@32": 1.0,
"pass@64": 1.0,
"pass@128": 1.0
},
"3": {
"pass@1": 0.9907670454545454,
"pass@2": 0.9960163892865667,
"pass@4": 0.9985778993534897,
"pass@8": 0.9996915957072215,
"pass@16": 0.9999815253902985,
"pass@32": 0.99999995938087,
"pass@64": 0.9999999999999987,
"pass@128": 1.0
},
"4": {
"pass@1": 0.9982664571005917,
"pass@2": 0.9998132687648512,
"pass@4": 0.999995557138198,
"pass@8": 0.9999999967656203,
"pass@16": 0.9999999999999998,
"pass@32": 1.0,
"pass@64": 1.0,
"pass@128": 1.0
},
"5": {
"pass@1": 0.9600317028985508,
"pass@2": 0.9691975493552438,
"pass@4": 0.9763000869456536,
"pass@8": 0.9828931121195829,
"pass@16": 0.989036439654314,
"pass@32": 0.9944838684919953,
"pass@64": 0.9981653755234783,
"pass@128": 1.0
},
"6": {
"pass@1": 0.9823379297994269,
"pass@2": 0.9910194058389551,
"pass@4": 0.9946701218221643,
"pass@8": 0.9965445299840145,
"pass@16": 0.9976712739922713,
"pass@32": 0.9983904225804993,
"pass@64": 0.9992893080034889,
"pass@128": 1.0
},
"7": {
"pass@1": 0.9681855130057804,
"pass@2": 0.9832421146056164,
"pass@4": 0.9914642009163593,
"pass@8": 0.9964758591421767,
"pass@16": 0.9990374180123244,
"pass@32": 0.9999126056145744,
"pass@64": 0.9999997855847184,
"pass@128": 1.0
},
"8": {
"pass@1": 0.9455765845070423,
"pass@2": 0.9712158284351782,
"pass@4": 0.9835400283591312,
"pass@8": 0.9910390348371692,
"pass@16": 0.9955726997113149,
"pass@32": 0.9980539040490879,
"pass@64": 0.9995741614459047,
"pass@128": 1.0
},
"9": {
"pass@1": 0.9390437874251497,
"pass@2": 0.9670264539818006,
"pass@4": 0.9783135702722786,
"pass@8": 0.9840906736702117,
"pass@16": 0.9879928605789021,
"pass@32": 0.9908513468886352,
"pass@64": 0.9930496212972972,
"pass@128": 0.9940119760479041
},
"11": {
"pass@1": 0.7309864457831325,
"pass@2": 0.811214839673655,
"pass@4": 0.8645121111555325,
"pass@8": 0.9001346785530842,
"pass@16": 0.9235781504819824,
"pass@32": 0.939017807747773,
"pass@64": 0.9485292552001391,
"pass@128": 0.9548192771084337
},
"12": {
"pass@1": 0.439042907523511,
"pass@2": 0.5317378835188704,
"pass@4": 0.6075961422729685,
"pass@8": 0.6683062360710545,
"pass@16": 0.7177272091161964,
"pass@32": 0.7598458853246168,
"pass@64": 0.7942021269976,
"pass@128": 0.8213166144200627
},
"13": {
"pass@1": 0.19227065826330533,
"pass@2": 0.23474754074858276,
"pass@4": 0.27791213361985206,
"pass@8": 0.32123267772057174,
"pass@16": 0.36547236780648024,
"pass@32": 0.412591392141311,
"pass@64": 0.46083134462740805,
"pass@128": 0.5042016806722689
},
"14": {
"pass@1": 0.20837902046783627,
"pass@2": 0.2522699670764838,
"pass@4": 0.2964683594923442,
"pass@8": 0.3416618793696011,
"pass@16": 0.3841336779597966,
"pass@32": 0.42375806001618826,
"pass@64": 0.4632207639904677,
"pass@128": 0.5029239766081871
},
"15": {
"pass@1": 0.19093276515151514,
"pass@2": 0.24199400501073715,
"pass@4": 0.2891956014020975,
"pass@8": 0.3326345282383242,
"pass@16": 0.37366558323041027,
"pass@32": 0.4128573661812689,
"pass@64": 0.4483166035224402,
"pass@128": 0.4727272727272727
},
"16": {
"pass@1": 0.15642806267806267,
"pass@2": 0.1904278691926329,
"pass@4": 0.22132350069489173,
"pass@8": 0.251268169779741,
"pass@16": 0.28180434335829546,
"pass@32": 0.3122157586188942,
"pass@64": 0.34211985927431315,
"pass@128": 0.3732193732193732
},
"17": {
"pass@1": 0.16779891304347827,
"pass@2": 0.21377104526336374,
"pass@4": 0.2616579632708955,
"pass@8": 0.3084358591590647,
"pass@16": 0.3485515761094292,
"pass@32": 0.38192734451252297,
"pass@64": 0.4116145438455866,
"pass@128": 0.43788819875776397
},
"18": {
"pass@1": 0.1518612132352941,
"pass@2": 0.1958187384205651,
"pass@4": 0.23828893419572547,
"pass@8": 0.27971765090438966,
"pass@16": 0.31909231139592203,
"pass@32": 0.3541919542192856,
"pass@64": 0.3879892602530647,
"pass@128": 0.4235294117647059
},
"19": {
"pass@1": 0.14004371279761904,
"pass@2": 0.17627703763592056,
"pass@4": 0.21033196631671033,
"pass@8": 0.24241868591666452,
"pass@16": 0.2725452956766527,
"pass@32": 0.3023936104609465,
"pass@64": 0.331830773125219,
"pass@128": 0.3601190476190476
},
"20": {
"pass@1": 0.1367421407185629,
"pass@2": 0.17771825934744678,
"pass@4": 0.21898993408009634,
"pass@8": 0.260066157867715,
"pass@16": 0.2989195527821917,
"pass@32": 0.3334463816599912,
"pass@64": 0.3631967204725436,
"pass@128": 0.38622754491017963
}
}
},
"movie_festival_awards": {
"count": 801152,
"correct": 471310,
"answer_accuracy": 0.5882903618788944,
"avg_response_len": 258.0569242790382,
"resp_tokens_sum": 206742821,
"pass_at_k": {
"pass@1": 0.5882903618788944,
"pass@2": 0.6212685622467475,
"pass@4": 0.6478714293112718,
"pass@8": 0.6709216329453926,
"pass@16": 0.6916683710373397,
"pass@32": 0.7106315691905148,
"pass@64": 0.728740000525103,
"pass@128": 0.7466048889598977
},
"per_op_pass_at_k": {
"10": {
"pass@1": 0.8983908582089553,
"pass@2": 0.9340132506757551,
"pass@4": 0.9530650669599133,
"pass@8": 0.9652801856862316,
"pass@16": 0.9745693164759084,
"pass@32": 0.9818461284408381,
"pass@64": 0.9871328155545905,
"pass@128": 0.991044776119403
},
"2": {
"pass@1": 0.9998660714285714,
"pass@2": 0.9999992969628797,
"pass@4": 1.0,
"pass@8": 1.0,
"pass@16": 1.0,
"pass@32": 1.0,
"pass@64": 1.0,
"pass@128": 1.0
},
"3": {
"pass@1": 0.9907069970845481,
"pass@2": 0.9970669721769471,
"pass@4": 0.9993563997831172,
"pass@8": 0.9999514691641971,
"pass@16": 0.9999996499221584,
"pass@32": 0.9999999999946803,
"pass@64": 1.0,
"pass@128": 1.0
},
"4": {
"pass@1": 0.9958196271929824,
"pass@2": 0.9977559342911083,
"pass@4": 0.9990211264710332,
"pass@8": 0.9997663197662907,
"pass@16": 0.9999856176620865,
"pass@32": 0.9999999740185194,
"pass@64": 0.9999999999999997,
"pass@128": 1.0
},
"5": {
"pass@1": 0.9548943014705882,
"pass@2": 0.9685079174386292,
"pass@4": 0.9784535587463331,
"pass@8": 0.9869293632462349,
"pass@16": 0.9934826994856965,
"pass@32": 0.9970310861536675,
"pass@64": 0.9985066302193607,
"pass@128": 1.0
},
"6": {
"pass@1": 0.9775483044164038,
"pass@2": 0.9891293716932861,
"pass@4": 0.9959075131381444,
"pass@8": 0.9991092920224423,
"pass@16": 0.9999352040645176,
"pass@32": 0.9999996662295679,
"pass@64": 0.9999999999997393,
"pass@128": 1.0
},
"7": {
"pass@1": 0.9744115901898734,
"pass@2": 0.9867456923402768,
"pass@4": 0.9912063912303684,
"pass@8": 0.9935151685224703,
"pass@16": 0.9954041601504686,
"pass@32": 0.9972167179953726,
"pass@64": 0.9990265956175474,
"pass@128": 1.0
},
"8": {
"pass@1": 0.9515531156156156,
"pass@2": 0.9743795222387744,
"pass@4": 0.9852795146102226,
"pass@8": 0.9899431408996546,
"pass@16": 0.9925003924523234,
"pass@32": 0.9950902478963195,
"pass@64": 0.9976578190444356,
"pass@128": 1.0
},
"9": {
"pass@1": 0.9379521704180064,
"pass@2": 0.9720591912043948,
"pass@4": 0.9858236963747689,
"pass@8": 0.9932814976290268,
"pass@16": 0.9968830356374474,
"pass@32": 0.9981725276413653,
"pass@64": 0.9992024635603814,
"pass@128": 1.0
},
"11": {
"pass@1": 0.7650545634920635,
"pass@2": 0.8431473409573809,
"pass@4": 0.8877359086066622,
"pass@8": 0.9145899228176874,
"pass@16": 0.9334761048558039,
"pass@32": 0.9489928847114996,
"pass@64": 0.9642010673849086,
"pass@128": 0.9809523809523809
},
"12": {
"pass@1": 0.4054615825688073,
"pass@2": 0.49957183534397626,
"pass@4": 0.5768995727598271,
"pass@8": 0.639355428961972,
"pass@16": 0.6924379381519293,
"pass@32": 0.7389465717283092,
"pass@64": 0.7833812207530858,
"pass@128": 0.8256880733944955
},
"13": {
"pass@1": 0.21805073302469136,
"pass@2": 0.2614506628511712,
"pass@4": 0.30509700407819385,
"pass@8": 0.34843945585161107,
"pass@16": 0.3892841325907048,
"pass@32": 0.42707158587605587,
"pass@64": 0.46350207303509766,
"pass@128": 0.5
},
"14": {
"pass@1": 0.19482566765578635,
"pass@2": 0.24658432440010283,
"pass@4": 0.2977982871762691,
"pass@8": 0.34710213397424194,
"pass@16": 0.39305571585018145,
"pass@32": 0.4354334888531387,
"pass@64": 0.4760201367236571,
"pass@128": 0.516320474777448
},
"15": {
"pass@1": 0.15263310185185186,
"pass@2": 0.20657633724603863,
"pass@4": 0.26040626837154607,
"pass@8": 0.31066200412971595,
"pass@16": 0.3575054534309268,
"pass@32": 0.40181191645690056,
"pass@64": 0.4460592807353767,
"pass@128": 0.49074074074074076
},
"16": {
"pass@1": 0.15613477138643067,
"pass@2": 0.1863313822497852,
"pass@4": 0.21273005393131156,
"pass@8": 0.23865739981671225,
"pass@16": 0.2676959517806339,
"pass@32": 0.3011310763692745,
"pass@64": 0.3362407178410369,
"pass@128": 0.37168141592920356
},
"17": {
"pass@1": 0.1378012048192771,
"pass@2": 0.17401619272365054,
"pass@4": 0.21124320340981484,
"pass@8": 0.24873698272180494,
"pass@16": 0.28468715470923933,
"pass@32": 0.31684364503562207,
"pass@64": 0.3460631347877581,
"pass@128": 0.37650602409638556
},
"18": {
"pass@1": 0.16940524193548387,
"pass@2": 0.20793830962661927,
"pass@4": 0.24442847708552565,
"pass@8": 0.27696590133200394,
"pass@16": 0.3047539165997947,
"pass@32": 0.329140512075412,
"pass@64": 0.3525837977178592,
"pass@128": 0.3774193548387097
},
"19": {
"pass@1": 0.13812311178247735,
"pass@2": 0.16759305790137258,
"pass@4": 0.19758813267676884,
"pass@8": 0.2284911534750015,
"pass@16": 0.2603002938970115,
"pass@32": 0.29249731296624343,
"pass@64": 0.3260332407783438,
"pass@128": 0.36253776435045315
},
"20": {
"pass@1": 0.13072447447447447,
"pass@2": 0.16936079780567978,
"pass@4": 0.2094049935762534,
"pass@8": 0.25098284669882537,
"pass@16": 0.291417867530838,
"pass@32": 0.32739810777438544,
"pass@64": 0.35806392636422313,
"pass@128": 0.3813813813813814
}
}
}
}
}
]