diff --git "a/trainer_state.json" "b/trainer_state.json"
--- "a/trainer_state.json"
+++ "b/trainer_state.json"
@@ -1,8 +1,8 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 1.0,
-  "global_step": 2105,
+  "epoch": 5.0,
+  "global_step": 10525,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -13,12 +13,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 1.9824163913726807,
+      "distillation_loss": 1.9993383884429932,
       "epoch": 0.0,
-      "learning_rate": 1.99144893111639e-05,
-      "loss": 1.8791,
+      "learning_rate": 1.998289786223278e-05,
+      "loss": 1.8842,
       "step": 10,
-      "task_loss": 0.6514205932617188
+      "task_loss": 0.6696395874023438
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -26,12 +26,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 1.9337328672409058,
+      "distillation_loss": 1.9561816453933716,
       "epoch": 0.01,
-      "learning_rate": 1.9819477434679337e-05,
-      "loss": 1.8084,
+      "learning_rate": 1.996389548693587e-05,
+      "loss": 1.8112,
       "step": 20,
-      "task_loss": 0.68878173828125
+      "task_loss": 0.6945114135742188
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -39,12 +39,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 1.7075116634368896,
+      "distillation_loss": 1.705786108970642,
       "epoch": 0.01,
-      "learning_rate": 1.9724465558194775e-05,
-      "loss": 1.7447,
+      "learning_rate": 1.9944893111638956e-05,
+      "loss": 1.7851,
       "step": 30,
-      "task_loss": 0.5935821533203125
+      "task_loss": 0.599273681640625
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -52,12 +52,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 1.5379259586334229,
+      "distillation_loss": 1.6281533241271973,
       "epoch": 0.02,
-      "learning_rate": 1.9629453681710216e-05,
-      "loss": 1.4292,
+      "learning_rate": 1.9925890736342042e-05,
+      "loss": 1.5178,
       "step": 40,
-      "task_loss": 0.5262355804443359
+      "task_loss": 0.555267333984375
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -65,12 +65,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.7455981969833374,
+      "distillation_loss": 0.9100077152252197,
       "epoch": 0.02,
-      "learning_rate": 1.954394299287411e-05,
-      "loss": 1.2067,
+      "learning_rate": 1.9908788598574825e-05,
+      "loss": 1.2264,
       "step": 50,
-      "task_loss": 0.20388412475585938
+      "task_loss": 0.27131175994873047
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -78,12 +78,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 1.1262179613113403,
+      "distillation_loss": 1.0724254846572876,
       "epoch": 0.03,
-      "learning_rate": 1.944893111638955e-05,
-      "loss": 0.9282,
+      "learning_rate": 1.988978622327791e-05,
+      "loss": 0.9712,
       "step": 60,
-      "task_loss": 0.45258522033691406
+      "task_loss": 0.4018378257751465
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -91,12 +91,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 1.2520625591278076,
+      "distillation_loss": 1.2103850841522217,
       "epoch": 0.03,
-      "learning_rate": 1.935391923990499e-05,
-      "loss": 0.9656,
+      "learning_rate": 1.9870783847981e-05,
+      "loss": 0.9075,
       "step": 70,
-      "task_loss": 0.6228516101837158
+      "task_loss": 0.5983531475067139
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -104,12 +104,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.8546397686004639,
+      "distillation_loss": 0.6644728183746338,
       "epoch": 0.04,
-      "learning_rate": 1.9268408551068884e-05,
-      "loss": 0.9789,
+      "learning_rate": 1.9851781472684087e-05,
+      "loss": 0.8009,
       "step": 80,
-      "task_loss": 0.31339454650878906
+      "task_loss": 0.25090086460113525
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -117,12 +117,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 1.1221280097961426,
+      "distillation_loss": 1.2112879753112793,
       "epoch": 0.04,
-      "learning_rate": 1.9173396674584325e-05,
-      "loss": 0.8387,
+      "learning_rate": 1.9832779097387176e-05,
+      "loss": 0.8291,
       "step": 90,
-      "task_loss": 0.44170093536376953
+      "task_loss": 0.4889531135559082
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -130,12 +130,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.4717516005039215,
+      "distillation_loss": 0.6701904535293579,
       "epoch": 0.05,
-      "learning_rate": 1.9078384798099766e-05,
-      "loss": 0.6816,
+      "learning_rate": 1.9813776722090262e-05,
+      "loss": 0.714,
       "step": 100,
-      "task_loss": 0.15700101852416992
+      "task_loss": 0.25854694843292236
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -143,12 +143,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.7137082815170288,
+      "distillation_loss": 0.6915435791015625,
       "epoch": 0.05,
-      "learning_rate": 1.8983372921615203e-05,
-      "loss": 0.6424,
+      "learning_rate": 1.9794774346793352e-05,
+      "loss": 0.6274,
       "step": 110,
-      "task_loss": 0.3361825942993164
+      "task_loss": 0.3031274080276489
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -156,12 +156,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.7122435569763184,
+      "distillation_loss": 0.6346196532249451,
       "epoch": 0.06,
-      "learning_rate": 1.8888361045130644e-05,
-      "loss": 0.7162,
+      "learning_rate": 1.9775771971496438e-05,
+      "loss": 0.7066,
       "step": 120,
-      "task_loss": 0.5846670866012573
+      "task_loss": 0.5627198815345764
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -169,12 +169,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.5021461248397827,
+      "distillation_loss": 0.8290033340454102,
       "epoch": 0.06,
-      "learning_rate": 1.8793349168646082e-05,
-      "loss": 0.5908,
+      "learning_rate": 1.9756769596199528e-05,
+      "loss": 0.5735,
       "step": 130,
-      "task_loss": 0.18646156787872314
+      "task_loss": 0.3992866575717926
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -182,12 +182,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.33661046624183655,
+      "distillation_loss": 0.5018503665924072,
       "epoch": 0.07,
-      "learning_rate": 1.8698337292161523e-05,
-      "loss": 0.8316,
+      "learning_rate": 1.9737767220902614e-05,
+      "loss": 0.7937,
       "step": 140,
-      "task_loss": 0.11925870180130005
+      "task_loss": 0.16568666696548462
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -195,12 +195,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.4071422219276428,
+      "distillation_loss": 0.49170026183128357,
       "epoch": 0.07,
-      "learning_rate": 1.860332541567696e-05,
-      "loss": 0.6662,
+      "learning_rate": 1.9718764845605703e-05,
+      "loss": 0.7142,
       "step": 150,
-      "task_loss": 0.38656365871429443
+      "task_loss": 0.3975885510444641
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -208,12 +208,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.3433850407600403,
+      "distillation_loss": 0.5039875507354736,
       "epoch": 0.08,
-      "learning_rate": 1.8508313539192398e-05,
-      "loss": 0.6492,
+      "learning_rate": 1.969976247030879e-05,
+      "loss": 0.591,
       "step": 160,
-      "task_loss": 0.1292821764945984
+      "task_loss": 0.24026933312416077
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -221,12 +221,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.853670597076416,
+      "distillation_loss": 0.9136797189712524,
       "epoch": 0.08,
-      "learning_rate": 1.8413301662707842e-05,
-      "loss": 0.6022,
+      "learning_rate": 1.9680760095011876e-05,
+      "loss": 0.5924,
       "step": 170,
-      "task_loss": 0.4025900363922119
+      "task_loss": 0.4351678192615509
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -234,12 +234,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.6042287945747375,
+      "distillation_loss": 0.7978807091712952,
       "epoch": 0.09,
-      "learning_rate": 1.831828978622328e-05,
-      "loss": 0.7525,
+      "learning_rate": 1.9661757719714965e-05,
+      "loss": 0.7811,
       "step": 180,
-      "task_loss": 0.23989427089691162
+      "task_loss": 0.3571215867996216
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -247,12 +247,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 1.0951387882232666,
+      "distillation_loss": 1.0396177768707275,
       "epoch": 0.09,
-      "learning_rate": 1.8223277909738718e-05,
-      "loss": 0.7468,
+      "learning_rate": 1.9642755344418055e-05,
+      "loss": 0.7498,
       "step": 190,
-      "task_loss": 0.5307990908622742
+      "task_loss": 0.5442102551460266
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -260,12 +260,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.27730026841163635,
+      "distillation_loss": 0.20894655585289001,
       "epoch": 0.1,
-      "learning_rate": 1.812826603325416e-05,
-      "loss": 0.5985,
+      "learning_rate": 1.962375296912114e-05,
+      "loss": 0.5461,
       "step": 200,
-      "task_loss": 0.05895298719406128
+      "task_loss": 0.05690506100654602
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -273,12 +273,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.9861497282981873,
+      "distillation_loss": 0.8320358991622925,
       "epoch": 0.1,
-      "learning_rate": 1.8033254156769596e-05,
-      "loss": 0.6073,
+      "learning_rate": 1.960475059382423e-05,
+      "loss": 0.6077,
       "step": 210,
-      "task_loss": 0.5837754011154175
+      "task_loss": 0.4803912937641144
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -286,12 +286,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.532859206199646,
+      "distillation_loss": 0.5520890951156616,
       "epoch": 0.1,
-      "learning_rate": 1.7938242280285037e-05,
-      "loss": 0.5815,
+      "learning_rate": 1.958574821852732e-05,
+      "loss": 0.5951,
       "step": 220,
-      "task_loss": 0.2384859323501587
+      "task_loss": 0.21851623058319092
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -299,12 +299,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.7019934058189392,
+      "distillation_loss": 0.6869984865188599,
       "epoch": 0.11,
-      "learning_rate": 1.7843230403800475e-05,
-      "loss": 0.778,
+      "learning_rate": 1.9566745843230406e-05,
+      "loss": 0.7494,
       "step": 230,
-      "task_loss": 0.3135228157043457
+      "task_loss": 0.30402871966362
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -312,12 +312,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.9965037703514099,
+      "distillation_loss": 1.0405832529067993,
       "epoch": 0.11,
-      "learning_rate": 1.7748218527315916e-05,
-      "loss": 0.6839,
+      "learning_rate": 1.9547743467933492e-05,
+      "loss": 0.6492,
       "step": 240,
-      "task_loss": 0.4901553988456726
+      "task_loss": 0.5161008834838867
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -325,20 +325,20 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.5550552606582642,
+      "distillation_loss": 0.44516924023628235,
       "epoch": 0.12,
-      "learning_rate": 1.7653206650831357e-05,
-      "loss": 0.4129,
+      "learning_rate": 1.9528741092636582e-05,
+      "loss": 0.392,
       "step": 250,
-      "task_loss": 0.2684090733528137
+      "task_loss": 0.1959661990404129
     },
     {
       "epoch": 0.12,
-      "eval_accuracy": 0.8761467889908257,
-      "eval_loss": 0.44158032536506653,
-      "eval_runtime": 21.9879,
-      "eval_samples_per_second": 39.658,
-      "eval_steps_per_second": 4.957,
+      "eval_accuracy": 0.8887614678899083,
+      "eval_loss": 0.45345592498779297,
+      "eval_runtime": 28.6659,
+      "eval_samples_per_second": 30.419,
+      "eval_steps_per_second": 3.802,
       "step": 250
     },
     {
@@ -347,12 +347,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.5287630558013916,
+      "distillation_loss": 0.5078727006912231,
       "epoch": 0.12,
-      "learning_rate": 1.7558194774346795e-05,
-      "loss": 0.5833,
+      "learning_rate": 1.9509738717339668e-05,
+      "loss": 0.6549,
       "step": 260,
-      "task_loss": 0.2730463147163391
+      "task_loss": 0.22611352801322937
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -360,12 +360,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.9515942335128784,
+      "distillation_loss": 0.772598147392273,
       "epoch": 0.13,
-      "learning_rate": 1.7463182897862236e-05,
-      "loss": 0.6329,
+      "learning_rate": 1.9490736342042758e-05,
+      "loss": 0.646,
       "step": 270,
-      "task_loss": 0.4847598969936371
+      "task_loss": 0.345528244972229
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -373,12 +373,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.5145795345306396,
+      "distillation_loss": 0.5505326986312866,
       "epoch": 0.13,
-      "learning_rate": 1.7368171021377673e-05,
-      "loss": 0.6194,
+      "learning_rate": 1.9471733966745844e-05,
+      "loss": 0.5464,
       "step": 280,
-      "task_loss": 0.4189565181732178
+      "task_loss": 0.45021820068359375
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -386,12 +386,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.7319836020469666,
+      "distillation_loss": 0.6786664128303528,
       "epoch": 0.14,
-      "learning_rate": 1.727315914489311e-05,
-      "loss": 0.4816,
+      "learning_rate": 1.9452731591448933e-05,
+      "loss": 0.4428,
       "step": 290,
-      "task_loss": 0.3963426649570465
+      "task_loss": 0.3662004768848419
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -399,12 +399,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.4611780047416687,
+      "distillation_loss": 0.5807572603225708,
       "epoch": 0.14,
-      "learning_rate": 1.7178147268408552e-05,
-      "loss": 0.5048,
+      "learning_rate": 1.943372921615202e-05,
+      "loss": 0.5107,
       "step": 300,
-      "task_loss": 0.19796603918075562
+      "task_loss": 0.26058703660964966
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -412,12 +412,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.5850609540939331,
+      "distillation_loss": 0.3432176411151886,
       "epoch": 0.15,
-      "learning_rate": 1.7083135391923993e-05,
-      "loss": 0.5864,
+      "learning_rate": 1.941472684085511e-05,
+      "loss": 0.5331,
       "step": 310,
-      "task_loss": 0.2181399166584015
+      "task_loss": 0.10017021000385284
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -425,12 +425,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.4029274880886078,
+      "distillation_loss": 0.4090806841850281,
       "epoch": 0.15,
-      "learning_rate": 1.698812351543943e-05,
-      "loss": 0.534,
+      "learning_rate": 1.9395724465558195e-05,
+      "loss": 0.5085,
       "step": 320,
-      "task_loss": 0.1707906723022461
+      "task_loss": 0.1661628633737564
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -438,12 +438,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.5725364685058594,
+      "distillation_loss": 0.6101839542388916,
       "epoch": 0.16,
-      "learning_rate": 1.689311163895487e-05,
-      "loss": 0.6037,
+      "learning_rate": 1.9376722090261285e-05,
+      "loss": 0.5719,
       "step": 330,
-      "task_loss": 0.23323196172714233
+      "task_loss": 0.2671370506286621
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -451,12 +451,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.6093335747718811,
+      "distillation_loss": 0.6847037672996521,
       "epoch": 0.16,
-      "learning_rate": 1.679809976247031e-05,
-      "loss": 0.7098,
+      "learning_rate": 1.935771971496437e-05,
+      "loss": 0.7526,
       "step": 340,
-      "task_loss": 0.4206717610359192
+      "task_loss": 0.5243218541145325
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -464,12 +464,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 1.0119699239730835,
+      "distillation_loss": 1.0114264488220215,
       "epoch": 0.17,
-      "learning_rate": 1.670308788598575e-05,
-      "loss": 0.4015,
+      "learning_rate": 1.933871733966746e-05,
+      "loss": 0.4603,
       "step": 350,
-      "task_loss": 0.6245448589324951
+      "task_loss": 0.5498969554901123
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -477,12 +477,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.6928797960281372,
+      "distillation_loss": 0.6255608797073364,
       "epoch": 0.17,
-      "learning_rate": 1.6608076009501188e-05,
-      "loss": 0.6882,
+      "learning_rate": 1.9319714964370547e-05,
+      "loss": 0.5996,
       "step": 360,
-      "task_loss": 0.32340091466903687
+      "task_loss": 0.2702621519565582
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -490,12 +490,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.2802911400794983,
+      "distillation_loss": 0.266034334897995,
       "epoch": 0.18,
-      "learning_rate": 1.651306413301663e-05,
-      "loss": 0.5088,
+      "learning_rate": 1.9300712589073636e-05,
+      "loss": 0.4917,
       "step": 370,
-      "task_loss": 0.12505391240119934
+      "task_loss": 0.13177835941314697
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -503,12 +503,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.6381040811538696,
+      "distillation_loss": 0.7009546756744385,
       "epoch": 0.18,
-      "learning_rate": 1.641805225653207e-05,
-      "loss": 0.5281,
+      "learning_rate": 1.9281710213776723e-05,
+      "loss": 0.5101,
       "step": 380,
-      "task_loss": 0.25593793392181396
+      "task_loss": 0.28630542755126953
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -516,12 +516,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 1.2553234100341797,
+      "distillation_loss": 0.9093587398529053,
       "epoch": 0.19,
-      "learning_rate": 1.6323040380047507e-05,
-      "loss": 0.4673,
+      "learning_rate": 1.9262707838479812e-05,
+      "loss": 0.5288,
       "step": 390,
-      "task_loss": 0.7861297130584717
+      "task_loss": 0.5337154865264893
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -529,12 +529,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.32122546434402466,
+      "distillation_loss": 0.2342422604560852,
       "epoch": 0.19,
-      "learning_rate": 1.622802850356295e-05,
-      "loss": 0.5358,
+      "learning_rate": 1.92437054631829e-05,
+      "loss": 0.5478,
       "step": 400,
-      "task_loss": 0.14392141997814178
+      "task_loss": 0.10641683638095856
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -542,12 +542,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.4124768078327179,
+      "distillation_loss": 0.4991934597492218,
       "epoch": 0.19,
-      "learning_rate": 1.6133016627078386e-05,
-      "loss": 0.388,
+      "learning_rate": 1.9224703087885988e-05,
+      "loss": 0.3909,
       "step": 410,
-      "task_loss": 0.18850776553153992
+      "task_loss": 0.23908966779708862
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -555,12 +555,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.5564821362495422,
+      "distillation_loss": 0.4699850380420685,
       "epoch": 0.2,
-      "learning_rate": 1.6038004750593824e-05,
-      "loss": 0.4706,
+      "learning_rate": 1.9205700712589074e-05,
+      "loss": 0.4033,
       "step": 420,
-      "task_loss": 0.35438424348831177
+      "task_loss": 0.28487616777420044
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -568,12 +568,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.8114709854125977,
+      "distillation_loss": 0.9620916247367859,
       "epoch": 0.2,
-      "learning_rate": 1.5942992874109265e-05,
-      "loss": 0.3992,
+      "learning_rate": 1.9188598574821856e-05,
+      "loss": 0.4304,
       "step": 430,
-      "task_loss": 0.5884829163551331
+      "task_loss": 0.6841728687286377
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -581,12 +581,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.9019944071769714,
+      "distillation_loss": 0.7321962714195251,
       "epoch": 0.21,
-      "learning_rate": 1.5847980997624702e-05,
-      "loss": 0.5377,
+      "learning_rate": 1.9169596199524942e-05,
+      "loss": 0.6077,
       "step": 440,
-      "task_loss": 0.4972737431526184
+      "task_loss": 0.4345829486846924
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -594,12 +594,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.4662189781665802,
+      "distillation_loss": 0.46323296427726746,
       "epoch": 0.21,
-      "learning_rate": 1.5752969121140143e-05,
-      "loss": 0.4676,
+      "learning_rate": 1.915059382422803e-05,
+      "loss": 0.4439,
       "step": 450,
-      "task_loss": 0.22651702165603638
+      "task_loss": 0.17068199813365936
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -607,12 +607,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.7801036834716797,
+      "distillation_loss": 0.9150896072387695,
       "epoch": 0.22,
-      "learning_rate": 1.5657957244655584e-05,
-      "loss": 0.5115,
+      "learning_rate": 1.9131591448931118e-05,
+      "loss": 0.5283,
       "step": 460,
-      "task_loss": 0.3273088335990906
+      "task_loss": 0.42760002613067627
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -620,12 +620,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.24446932971477509,
+      "distillation_loss": 0.3148682713508606,
       "epoch": 0.22,
-      "learning_rate": 1.5562945368171022e-05,
-      "loss": 0.4323,
+      "learning_rate": 1.9112589073634208e-05,
+      "loss": 0.459,
       "step": 470,
-      "task_loss": 0.28846049308776855
+      "task_loss": 0.3109205663204193
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -633,12 +633,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.31131500005722046,
+      "distillation_loss": 0.455905556678772,
       "epoch": 0.23,
-      "learning_rate": 1.5467933491686463e-05,
-      "loss": 0.4727,
+      "learning_rate": 1.9093586698337294e-05,
+      "loss": 0.4673,
       "step": 480,
-      "task_loss": 0.06133585423231125
+      "task_loss": 0.13838760554790497
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -646,12 +646,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.2505212426185608,
+      "distillation_loss": 0.25112056732177734,
       "epoch": 0.23,
-      "learning_rate": 1.53729216152019e-05,
-      "loss": 0.6103,
+      "learning_rate": 1.907458432304038e-05,
+      "loss": 0.5656,
       "step": 490,
-      "task_loss": 0.16016395390033722
+      "task_loss": 0.1593867689371109
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -659,20 +659,20 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.6978294849395752,
+      "distillation_loss": 0.8803852796554565,
       "epoch": 0.24,
-      "learning_rate": 1.527790973871734e-05,
-      "loss": 0.412,
+      "learning_rate": 1.905558194774347e-05,
+      "loss": 0.4413,
       "step": 500,
-      "task_loss": 0.3384208679199219
+      "task_loss": 0.46672773361206055
     },
     {
       "epoch": 0.24,
       "eval_accuracy": 0.8899082568807339,
-      "eval_loss": 0.49690014123916626,
-      "eval_runtime": 22.004,
-      "eval_samples_per_second": 39.629,
-      "eval_steps_per_second": 4.954,
+      "eval_loss": 0.4671143591403961,
+      "eval_runtime": 23.1207,
+      "eval_samples_per_second": 37.715,
+      "eval_steps_per_second": 4.714,
       "step": 500
     },
     {
@@ -681,12 +681,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.5271720886230469,
+      "distillation_loss": 0.4302404522895813,
       "epoch": 0.24,
-      "learning_rate": 1.5182897862232779e-05,
-      "loss": 0.3288,
+      "learning_rate": 1.9036579572446556e-05,
+      "loss": 0.3487,
       "step": 510,
-      "task_loss": 0.3533409833908081
+      "task_loss": 0.34454867243766785
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -694,12 +694,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.39492568373680115,
+      "distillation_loss": 0.339828759431839,
       "epoch": 0.25,
-      "learning_rate": 1.508788598574822e-05,
-      "loss": 0.3691,
+      "learning_rate": 1.9017577197149645e-05,
+      "loss": 0.417,
       "step": 520,
-      "task_loss": 0.17543694376945496
+      "task_loss": 0.14348742365837097
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -707,12 +707,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.5424761772155762,
+      "distillation_loss": 0.4093138575553894,
       "epoch": 0.25,
-      "learning_rate": 1.499287410926366e-05,
-      "loss": 0.5957,
+      "learning_rate": 1.8998574821852735e-05,
+      "loss": 0.6335,
       "step": 530,
-      "task_loss": 0.35330963134765625
+      "task_loss": 0.2882145345211029
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -720,12 +720,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.2259323000907898,
+      "distillation_loss": 0.2194601595401764,
       "epoch": 0.26,
-      "learning_rate": 1.4897862232779099e-05,
-      "loss": 0.4177,
+      "learning_rate": 1.897957244655582e-05,
+      "loss": 0.4188,
       "step": 540,
-      "task_loss": 0.05318892002105713
+      "task_loss": 0.058180660009384155
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -733,12 +733,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.912689208984375,
+      "distillation_loss": 0.5428069829940796,
       "epoch": 0.26,
-      "learning_rate": 1.4802850356294538e-05,
-      "loss": 0.5001,
+      "learning_rate": 1.8960570071258907e-05,
+      "loss": 0.4371,
       "step": 550,
-      "task_loss": 0.447698175907135
+      "task_loss": 0.259736031293869
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -746,12 +746,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.7216652631759644,
+      "distillation_loss": 0.8619398474693298,
       "epoch": 0.27,
-      "learning_rate": 1.4707838479809977e-05,
-      "loss": 0.5728,
+      "learning_rate": 1.8941567695961997e-05,
+      "loss": 0.5938,
       "step": 560,
-      "task_loss": 0.5098874568939209
+      "task_loss": 0.5211101174354553
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -759,12 +759,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.2023361772298813,
+      "distillation_loss": 0.1949978768825531,
       "epoch": 0.27,
-      "learning_rate": 1.4612826603325417e-05,
-      "loss": 0.3707,
+      "learning_rate": 1.8922565320665086e-05,
+      "loss": 0.3784,
       "step": 570,
-      "task_loss": 0.22737565636634827
+      "task_loss": 0.20978039503097534
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -772,12 +772,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.46668243408203125,
+      "distillation_loss": 0.4044320583343506,
       "epoch": 0.28,
-      "learning_rate": 1.4517814726840856e-05,
-      "loss": 0.4068,
+      "learning_rate": 1.8903562945368172e-05,
+      "loss": 0.3817,
       "step": 580,
-      "task_loss": 0.2710917294025421
+      "task_loss": 0.22608956694602966
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -785,12 +785,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.5557296276092529,
+      "distillation_loss": 0.7097648978233337,
       "epoch": 0.28,
-      "learning_rate": 1.4422802850356297e-05,
-      "loss": 0.3836,
+      "learning_rate": 1.888456057007126e-05,
+      "loss": 0.413,
       "step": 590,
-      "task_loss": 0.43243685364723206
+      "task_loss": 0.5404372811317444
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -798,12 +798,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.6273843050003052,
+      "distillation_loss": 0.6512647867202759,
       "epoch": 0.29,
-      "learning_rate": 1.4327790973871736e-05,
-      "loss": 0.5479,
+      "learning_rate": 1.8865558194774348e-05,
+      "loss": 0.5914,
       "step": 600,
-      "task_loss": 0.2661285400390625
+      "task_loss": 0.2793722450733185
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -811,12 +811,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.3024005889892578,
+      "distillation_loss": 0.3936828374862671,
       "epoch": 0.29,
-      "learning_rate": 1.4232779097387176e-05,
-      "loss": 0.3326,
+      "learning_rate": 1.8846555819477438e-05,
+      "loss": 0.3537,
       "step": 610,
-      "task_loss": 0.11362183094024658
+      "task_loss": 0.1763158142566681
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -824,12 +824,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.6180804967880249,
+      "distillation_loss": 0.5243096947669983,
       "epoch": 0.29,
-      "learning_rate": 1.4137767220902613e-05,
-      "loss": 0.2927,
+      "learning_rate": 1.8827553444180524e-05,
+      "loss": 0.3381,
       "step": 620,
-      "task_loss": 0.3157673478126526
+      "task_loss": 0.24796336889266968
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -837,12 +837,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.2821282148361206,
+      "distillation_loss": 0.32880261540412903,
       "epoch": 0.3,
-      "learning_rate": 1.4042755344418053e-05,
-      "loss": 0.3051,
+      "learning_rate": 1.880855106888361e-05,
+      "loss": 0.329,
       "step": 630,
-      "task_loss": 0.1337086707353592
+      "task_loss": 0.1616968810558319
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -850,12 +850,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.5159429907798767,
+      "distillation_loss": 0.3982900381088257,
       "epoch": 0.3,
-      "learning_rate": 1.3947743467933492e-05,
-      "loss": 0.4209,
+      "learning_rate": 1.87895486935867e-05,
+      "loss": 0.3864,
       "step": 640,
-      "task_loss": 0.35967183113098145
+      "task_loss": 0.2720021903514862
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -863,12 +863,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.8615990281105042,
+      "distillation_loss": 0.59184730052948,
       "epoch": 0.31,
-      "learning_rate": 1.3852731591448931e-05,
-      "loss": 0.4166,
+      "learning_rate": 1.877054631828979e-05,
+      "loss": 0.4413,
       "step": 650,
-      "task_loss": 0.6024147868156433
+      "task_loss": 0.41478922963142395
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -876,12 +876,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.1939113736152649,
+      "distillation_loss": 0.2740156352519989,
       "epoch": 0.31,
-      "learning_rate": 1.3757719714964372e-05,
-      "loss": 0.3193,
+      "learning_rate": 1.8751543942992875e-05,
+      "loss": 0.363,
       "step": 660,
-      "task_loss": 0.023959562182426453
+      "task_loss": 0.04690548777580261
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -889,12 +889,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.31010207533836365,
+      "distillation_loss": 0.2690792977809906,
       "epoch": 0.32,
-      "learning_rate": 1.3662707838479811e-05,
-      "loss": 0.3788,
+      "learning_rate": 1.873254156769596e-05,
+      "loss": 0.3805,
       "step": 670,
-      "task_loss": 0.17067261040210724
+      "task_loss": 0.13159048557281494
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -902,12 +902,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.6690168976783752,
+      "distillation_loss": 0.6759920120239258,
       "epoch": 0.32,
-      "learning_rate": 1.356769596199525e-05,
-      "loss": 0.3763,
+      "learning_rate": 1.871353919239905e-05,
+      "loss": 0.3392,
       "step": 680,
-      "task_loss": 0.355002760887146
+      "task_loss": 0.37233513593673706
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -915,12 +915,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.23823483288288116,
+      "distillation_loss": 0.2218218445777893,
       "epoch": 0.33,
-      "learning_rate": 1.347268408551069e-05,
-      "loss": 0.4861,
+      "learning_rate": 1.869453681710214e-05,
+      "loss": 0.4838,
       "step": 690,
-      "task_loss": 0.0851842388510704
+      "task_loss": 0.0667574405670166
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -928,12 +928,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.19940122961997986,
+      "distillation_loss": 0.35146889090538025,
       "epoch": 0.33,
-      "learning_rate": 1.337767220902613e-05,
-      "loss": 0.3373,
+      "learning_rate": 1.8675534441805227e-05,
+      "loss": 0.3663,
       "step": 700,
-      "task_loss": 0.1375439167022705
+      "task_loss": 0.19517013430595398
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -941,12 +941,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.34697651863098145,
+      "distillation_loss": 0.4373435974121094,
       "epoch": 0.34,
-      "learning_rate": 1.3282660332541569e-05,
-      "loss": 0.2368,
+      "learning_rate": 1.8656532066508316e-05,
+      "loss": 0.2644,
       "step": 710,
-      "task_loss": 0.32895755767822266
+      "task_loss": 0.3898613750934601
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -954,12 +954,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.5809268951416016,
+      "distillation_loss": 0.4680957794189453,
       "epoch": 0.34,
-      "learning_rate": 1.3187648456057008e-05,
-      "loss": 0.4335,
+      "learning_rate": 1.8637529691211403e-05,
+      "loss": 0.4633,
       "step": 720,
-      "task_loss": 0.350343257188797
+      "task_loss": 0.2719492018222809
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -967,12 +967,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.4058021605014801,
+      "distillation_loss": 0.9040226936340332,
       "epoch": 0.35,
-      "learning_rate": 1.3092636579572449e-05,
-      "loss": 0.3762,
+      "learning_rate": 1.8618527315914492e-05,
+      "loss": 0.4141,
       "step": 730,
-      "task_loss": 0.4189315438270569
+      "task_loss": 0.6466548442840576
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -980,12 +980,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.5029650330543518,
+      "distillation_loss": 0.5508012771606445,
       "epoch": 0.35,
-      "learning_rate": 1.2997624703087888e-05,
-      "loss": 0.3535,
+      "learning_rate": 1.8599524940617578e-05,
+      "loss": 0.3819,
       "step": 740,
-      "task_loss": 0.352708101272583
+      "task_loss": 0.3917207717895508
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -993,20 +993,20 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.15115290880203247,
+      "distillation_loss": 0.10818565636873245,
       "epoch": 0.36,
-      "learning_rate": 1.2902612826603326e-05,
-      "loss": 0.3191,
+      "learning_rate": 1.8580522565320668e-05,
+      "loss": 0.29,
       "step": 750,
-      "task_loss": 0.05512949079275131
+      "task_loss": 0.033290036022663116
     },
     {
       "epoch": 0.36,
-      "eval_accuracy": 0.9162844036697247,
-      "eval_loss": 0.2716875970363617,
-      "eval_runtime": 21.9554,
-      "eval_samples_per_second": 39.717,
-      "eval_steps_per_second": 4.965,
+      "eval_accuracy": 0.9128440366972477,
+      "eval_loss": 0.32853972911834717,
+      "eval_runtime": 31.1664,
+      "eval_samples_per_second": 27.979,
+      "eval_steps_per_second": 3.497,
       "step": 750
     },
     {
@@ -1015,12 +1015,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.42094355821609497,
+      "distillation_loss": 0.5789792537689209,
       "epoch": 0.36,
-      "learning_rate": 1.2807600950118765e-05,
-      "loss": 0.3572,
+      "learning_rate": 1.8561520190023754e-05,
+      "loss": 0.3252,
       "step": 760,
-      "task_loss": 0.22914057970046997
+      "task_loss": 0.3467680811882019
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1028,12 +1028,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.22227834165096283,
+      "distillation_loss": 0.1704629510641098,
       "epoch": 0.37,
-      "learning_rate": 1.2712589073634205e-05,
-      "loss": 0.5129,
+      "learning_rate": 1.854251781472684e-05,
+      "loss": 0.4369,
       "step": 770,
-      "task_loss": 0.13911421597003937
+      "task_loss": 0.11706624180078506
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1041,12 +1041,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.6405156254768372,
+      "distillation_loss": 0.4032849967479706,
       "epoch": 0.37,
-      "learning_rate": 1.2617577197149644e-05,
-      "loss": 0.4576,
+      "learning_rate": 1.852351543942993e-05,
+      "loss": 0.4449,
       "step": 780,
-      "task_loss": 0.4629451036453247
+      "task_loss": 0.26390761137008667
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1054,12 +1054,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.432763010263443,
+      "distillation_loss": 0.3772156238555908,
       "epoch": 0.38,
-      "learning_rate": 1.2522565320665083e-05,
-      "loss": 0.3085,
+      "learning_rate": 1.850451306413302e-05,
+      "loss": 0.3257,
       "step": 790,
-      "task_loss": 0.36907055974006653
+      "task_loss": 0.3016626834869385
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1067,12 +1067,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.30914443731307983,
+      "distillation_loss": 0.39227956533432007,
       "epoch": 0.38,
-      "learning_rate": 1.2427553444180524e-05,
-      "loss": 0.4019,
+      "learning_rate": 1.8485510688836105e-05,
+      "loss": 0.412,
       "step": 800,
-      "task_loss": 0.21935078501701355
+      "task_loss": 0.2939774990081787
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1080,12 +1080,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.5716268420219421,
+      "distillation_loss": 0.3966776728630066,
       "epoch": 0.38,
-      "learning_rate": 1.2332541567695964e-05,
-      "loss": 0.3996,
+      "learning_rate": 1.846650831353919e-05,
+      "loss": 0.4206,
       "step": 810,
-      "task_loss": 0.4366046190261841
+      "task_loss": 0.32761743664741516
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1093,12 +1093,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.20762380957603455,
+      "distillation_loss": 0.14727869629859924,
       "epoch": 0.39,
-      "learning_rate": 1.2237529691211403e-05,
-      "loss": 0.4031,
+      "learning_rate": 1.844750593824228e-05,
+      "loss": 0.4085,
       "step": 820,
-      "task_loss": 0.07009849697351456
+      "task_loss": 0.03596585988998413
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1106,12 +1106,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.48847681283950806,
+      "distillation_loss": 0.5814430713653564,
       "epoch": 0.39,
-      "learning_rate": 1.2142517814726842e-05,
-      "loss": 0.4559,
+      "learning_rate": 1.842850356294537e-05,
+      "loss": 0.5506,
       "step": 830,
-      "task_loss": 0.20559999346733093
+      "task_loss": 0.2807302474975586
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1119,12 +1119,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.38028720021247864,
+      "distillation_loss": 0.3324885964393616,
       "epoch": 0.4,
-      "learning_rate": 1.2047505938242281e-05,
-      "loss": 0.2783,
+      "learning_rate": 1.8409501187648457e-05,
+      "loss": 0.2877,
       "step": 840,
-      "task_loss": 0.12343298643827438
+      "task_loss": 0.11497768759727478
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1132,12 +1132,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.6831973791122437,
+      "distillation_loss": 0.4697301387786865,
       "epoch": 0.4,
-      "learning_rate": 1.195249406175772e-05,
-      "loss": 0.4664,
+      "learning_rate": 1.8390498812351546e-05,
+      "loss": 0.4512,
       "step": 850,
-      "task_loss": 0.6523771286010742
+      "task_loss": 0.42074650526046753
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1145,12 +1145,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.21925556659698486,
+      "distillation_loss": 0.1672717034816742,
       "epoch": 0.41,
-      "learning_rate": 1.1857482185273158e-05,
-      "loss": 0.366,
+      "learning_rate": 1.8371496437054633e-05,
+      "loss": 0.3373,
       "step": 860,
-      "task_loss": 0.15919412672519684
+      "task_loss": 0.10989774763584137
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1158,12 +1158,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.07446402311325073,
+      "distillation_loss": 0.07770746946334839,
       "epoch": 0.41,
-      "learning_rate": 1.1762470308788601e-05,
-      "loss": 0.3713,
+      "learning_rate": 1.8352494061757722e-05,
+      "loss": 0.3943,
       "step": 870,
-      "task_loss": 0.014185778796672821
+      "task_loss": 0.013416633009910583
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1171,12 +1171,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.31408172845840454,
+      "distillation_loss": 0.4550301730632782,
       "epoch": 0.42,
-      "learning_rate": 1.166745843230404e-05,
-      "loss": 0.3861,
+      "learning_rate": 1.833349168646081e-05,
+      "loss": 0.3908,
       "step": 880,
-      "task_loss": 0.13161490857601166
+      "task_loss": 0.20334404706954956
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1184,12 +1184,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.14646156132221222,
+      "distillation_loss": 0.14840258657932281,
       "epoch": 0.42,
-      "learning_rate": 1.1572446555819478e-05,
-      "loss": 0.2613,
+      "learning_rate": 1.8314489311163898e-05,
+      "loss": 0.2848,
       "step": 890,
-      "task_loss": 0.07529384642839432
+      "task_loss": 0.0623587965965271
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1197,12 +1197,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.05397426709532738,
+      "distillation_loss": 0.10735790431499481,
       "epoch": 0.43,
-      "learning_rate": 1.1477434679334917e-05,
-      "loss": 0.3604,
+      "learning_rate": 1.8295486935866984e-05,
+      "loss": 0.3985,
       "step": 900,
-      "task_loss": 0.043072961270809174
+      "task_loss": 0.09991317242383957
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1210,12 +1210,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.12949523329734802,
+      "distillation_loss": 0.09649023413658142,
       "epoch": 0.43,
-      "learning_rate": 1.1382422802850357e-05,
-      "loss": 0.2215,
+      "learning_rate": 1.8276484560570074e-05,
+      "loss": 0.2586,
       "step": 910,
-      "task_loss": 0.029655031859874725
+      "task_loss": 0.022964343428611755
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1223,12 +1223,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.5533275008201599,
+      "distillation_loss": 0.565139651298523,
       "epoch": 0.44,
-      "learning_rate": 1.1287410926365796e-05,
-      "loss": 0.3736,
+      "learning_rate": 1.825748218527316e-05,
+      "loss": 0.3881,
       "step": 920,
-      "task_loss": 0.3869485855102539
+      "task_loss": 0.3771653473377228
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1236,12 +1236,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.06997716426849365,
+      "distillation_loss": 0.07271742820739746,
       "epoch": 0.44,
-      "learning_rate": 1.1192399049881235e-05,
-      "loss": 0.2925,
+      "learning_rate": 1.823847980997625e-05,
+      "loss": 0.3163,
       "step": 930,
-      "task_loss": 0.17637060582637787
+      "task_loss": 0.1788436472415924
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1249,12 +1249,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.5217384099960327,
+      "distillation_loss": 0.5031943321228027,
       "epoch": 0.45,
-      "learning_rate": 1.1097387173396676e-05,
-      "loss": 0.3657,
+      "learning_rate": 1.8219477434679336e-05,
+      "loss": 0.379,
       "step": 940,
-      "task_loss": 0.35078302025794983
+      "task_loss": 0.3599282503128052
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1262,12 +1262,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.20354126393795013,
+      "distillation_loss": 0.14190588891506195,
       "epoch": 0.45,
-      "learning_rate": 1.1002375296912116e-05,
-      "loss": 0.2885,
+      "learning_rate": 1.8200475059382425e-05,
+      "loss": 0.2631,
       "step": 950,
-      "task_loss": 0.33367669582366943
+      "task_loss": 0.2649960517883301
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1275,12 +1275,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.6612205505371094,
+      "distillation_loss": 0.8079104423522949,
       "epoch": 0.46,
-      "learning_rate": 1.0907363420427555e-05,
-      "loss": 0.4112,
+      "learning_rate": 1.818147268408551e-05,
+      "loss": 0.4097,
       "step": 960,
-      "task_loss": 0.38991737365722656
+      "task_loss": 0.49955570697784424
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1288,12 +1288,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.5148907899856567,
+      "distillation_loss": 0.32833898067474365,
       "epoch": 0.46,
-      "learning_rate": 1.0812351543942994e-05,
-      "loss": 0.3226,
+      "learning_rate": 1.81624703087886e-05,
+      "loss": 0.2893,
       "step": 970,
-      "task_loss": 0.2609829604625702
+      "task_loss": 0.13918891549110413
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1301,12 +1301,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.6710062026977539,
+      "distillation_loss": 0.8008242845535278,
       "epoch": 0.47,
-      "learning_rate": 1.0717339667458434e-05,
-      "loss": 0.2813,
+      "learning_rate": 1.8143467933491687e-05,
+      "loss": 0.3336,
       "step": 980,
-      "task_loss": 0.34536463022232056
+      "task_loss": 0.39846181869506836
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1314,12 +1314,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.35892608761787415,
+      "distillation_loss": 0.4929274320602417,
       "epoch": 0.47,
-      "learning_rate": 1.0622327790973871e-05,
-      "loss": 0.3694,
+      "learning_rate": 1.8124465558194773e-05,
+      "loss": 0.3877,
       "step": 990,
-      "task_loss": 0.190689355134964
+      "task_loss": 0.2788354456424713
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1327,20 +1327,20 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.1477653980255127,
+      "distillation_loss": 0.24550215899944305,
       "epoch": 0.48,
-      "learning_rate": 1.052731591448931e-05,
-      "loss": 0.2688,
+      "learning_rate": 1.8105463182897863e-05,
+      "loss": 0.2851,
       "step": 1000,
-      "task_loss": 0.04818693548440933
+      "task_loss": 0.10739203542470932
     },
     {
       "epoch": 0.48,
-      "eval_accuracy": 0.911697247706422,
-      "eval_loss": 0.24315589666366577,
-      "eval_runtime": 22.0177,
-      "eval_samples_per_second": 39.604,
-      "eval_steps_per_second": 4.951,
+      "eval_accuracy": 0.9151376146788991,
+      "eval_loss": 0.24980628490447998,
+      "eval_runtime": 24.2778,
+      "eval_samples_per_second": 35.918,
+      "eval_steps_per_second": 4.49,
       "step": 1000
     },
     {
@@ -1349,12 +1349,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.47181323170661926,
+      "distillation_loss": 0.32931697368621826,
       "epoch": 0.48,
-      "learning_rate": 1.0432304038004753e-05,
-      "loss": 0.2546,
+      "learning_rate": 1.8086460807600952e-05,
+      "loss": 0.2591,
       "step": 1010,
-      "task_loss": 0.2418743222951889
+      "task_loss": 0.17109191417694092
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1362,12 +1362,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.21640026569366455,
+      "distillation_loss": 0.19154950976371765,
       "epoch": 0.48,
-      "learning_rate": 1.033729216152019e-05,
-      "loss": 0.3818,
+      "learning_rate": 1.806745843230404e-05,
+      "loss": 0.3552,
       "step": 1020,
-      "task_loss": 0.14040601253509521
+      "task_loss": 0.13995346426963806
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1375,12 +1375,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.27131223678588867,
+      "distillation_loss": 0.2809268534183502,
       "epoch": 0.49,
-      "learning_rate": 1.024228028503563e-05,
-      "loss": 0.253,
+      "learning_rate": 1.8048456057007128e-05,
+      "loss": 0.2895,
       "step": 1030,
-      "task_loss": 0.1497097909450531
+      "task_loss": 0.18339580297470093
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1388,12 +1388,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.7797483801841736,
+      "distillation_loss": 0.7353945970535278,
       "epoch": 0.49,
-      "learning_rate": 1.014726840855107e-05,
-      "loss": 0.4507,
+      "learning_rate": 1.8029453681710218e-05,
+      "loss": 0.4529,
       "step": 1040,
-      "task_loss": 0.46980684995651245
+      "task_loss": 0.516687273979187
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1401,12 +1401,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.11180303990840912,
+      "distillation_loss": 0.1993633657693863,
       "epoch": 0.5,
-      "learning_rate": 1.0052256532066509e-05,
-      "loss": 0.2814,
+      "learning_rate": 1.8010451306413304e-05,
+      "loss": 0.3296,
       "step": 1050,
-      "task_loss": 0.4403308629989624
+      "task_loss": 0.4725501835346222
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1414,12 +1414,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.18791627883911133,
+      "distillation_loss": 0.17681953310966492,
       "epoch": 0.5,
-      "learning_rate": 9.95724465558195e-06,
-      "loss": 0.387,
+      "learning_rate": 1.799144893111639e-05,
+      "loss": 0.3377,
       "step": 1060,
-      "task_loss": 0.06846746802330017
+      "task_loss": 0.06336037814617157
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1427,12 +1427,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.16468051075935364,
+      "distillation_loss": 0.13305272161960602,
       "epoch": 0.51,
-      "learning_rate": 9.862232779097387e-06,
-      "loss": 0.3529,
+      "learning_rate": 1.797244655581948e-05,
+      "loss": 0.2699,
       "step": 1070,
-      "task_loss": 0.18533943593502045
+      "task_loss": 0.2189784049987793
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1440,12 +1440,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.42276692390441895,
+      "distillation_loss": 0.5291624665260315,
       "epoch": 0.51,
-      "learning_rate": 9.767220902612827e-06,
-      "loss": 0.2331,
+      "learning_rate": 1.795344418052257e-05,
+      "loss": 0.2867,
       "step": 1080,
-      "task_loss": 0.23515379428863525
+      "task_loss": 0.299777090549469
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1453,12 +1453,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.31465187668800354,
+      "distillation_loss": 0.11125596612691879,
       "epoch": 0.52,
-      "learning_rate": 9.672209026128266e-06,
-      "loss": 0.2775,
+      "learning_rate": 1.7934441805225655e-05,
+      "loss": 0.2417,
       "step": 1090,
-      "task_loss": 0.2029043436050415
+      "task_loss": 0.0719640851020813
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1466,12 +1466,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.7025846838951111,
+      "distillation_loss": 0.6146207451820374,
       "epoch": 0.52,
-      "learning_rate": 9.577197149643707e-06,
-      "loss": 0.3614,
+      "learning_rate": 1.791543942992874e-05,
+      "loss": 0.3645,
       "step": 1100,
-      "task_loss": 0.43116331100463867
+      "task_loss": 0.3506692051887512
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1479,12 +1479,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.43838998675346375,
+      "distillation_loss": 0.43747201561927795,
       "epoch": 0.53,
-      "learning_rate": 9.482185273159146e-06,
-      "loss": 0.2565,
+      "learning_rate": 1.789643705463183e-05,
+      "loss": 0.2669,
       "step": 1110,
-      "task_loss": 0.24070000648498535
+      "task_loss": 0.2219913899898529
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1492,12 +1492,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.526702880859375,
+      "distillation_loss": 0.3032437562942505,
       "epoch": 0.53,
-      "learning_rate": 9.387173396674586e-06,
-      "loss": 0.3354,
+      "learning_rate": 1.7877434679334917e-05,
+      "loss": 0.2974,
       "step": 1120,
-      "task_loss": 0.28051871061325073
+      "task_loss": 0.12568572163581848
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1505,12 +1505,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.471240758895874,
+      "distillation_loss": 0.3314521610736847,
       "epoch": 0.54,
-      "learning_rate": 9.292161520190025e-06,
-      "loss": 0.3342,
+      "learning_rate": 1.7858432304038007e-05,
+      "loss": 0.2802,
       "step": 1130,
-      "task_loss": 0.2174275815486908
+      "task_loss": 0.1527169644832611
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1518,12 +1518,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.13051442801952362,
+      "distillation_loss": 0.31865638494491577,
       "epoch": 0.54,
-      "learning_rate": 9.197149643705464e-06,
-      "loss": 0.2616,
+      "learning_rate": 1.7839429928741093e-05,
+      "loss": 0.3081,
       "step": 1140,
-      "task_loss": 0.08567549288272858
+      "task_loss": 0.22448736429214478
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1531,12 +1531,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.5350298881530762,
+      "distillation_loss": 0.6496268510818481,
       "epoch": 0.55,
-      "learning_rate": 9.102137767220904e-06,
-      "loss": 0.3387,
+      "learning_rate": 1.7820427553444182e-05,
+      "loss": 0.3204,
       "step": 1150,
-      "task_loss": 0.3132125437259674
+      "task_loss": 0.40611302852630615
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1544,12 +1544,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.32484662532806396,
+      "distillation_loss": 0.6914563179016113,
       "epoch": 0.55,
-      "learning_rate": 9.007125890736343e-06,
-      "loss": 0.1753,
+      "learning_rate": 1.780142517814727e-05,
+      "loss": 0.2414,
       "step": 1160,
-      "task_loss": 0.18460990488529205
+      "task_loss": 0.3544915020465851
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1557,12 +1557,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.3009415864944458,
+      "distillation_loss": 0.152155801653862,
       "epoch": 0.56,
-      "learning_rate": 8.912114014251782e-06,
-      "loss": 0.3577,
+      "learning_rate": 1.7782422802850358e-05,
+      "loss": 0.3349,
       "step": 1170,
-      "task_loss": 0.22530747950077057
+      "task_loss": 0.07939426600933075
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1570,12 +1570,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.4420413672924042,
+      "distillation_loss": 0.48079612851142883,
       "epoch": 0.56,
-      "learning_rate": 8.817102137767222e-06,
-      "loss": 0.3324,
+      "learning_rate": 1.7763420427553448e-05,
+      "loss": 0.3187,
       "step": 1180,
-      "task_loss": 0.4033554196357727
+      "task_loss": 0.44884148240089417
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1583,12 +1583,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.06501191854476929,
+      "distillation_loss": 0.05774353817105293,
       "epoch": 0.57,
-      "learning_rate": 8.722090261282661e-06,
-      "loss": 0.3307,
+      "learning_rate": 1.7744418052256534e-05,
+      "loss": 0.354,
       "step": 1190,
-      "task_loss": 0.012013241648674011
+      "task_loss": 0.007377400994300842
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1596,12 +1596,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.2965516448020935,
+      "distillation_loss": 0.11594004929065704,
       "epoch": 0.57,
-      "learning_rate": 8.6270783847981e-06,
-      "loss": 0.3437,
+      "learning_rate": 1.772541567695962e-05,
+      "loss": 0.3231,
       "step": 1200,
-      "task_loss": 0.13238894939422607
+      "task_loss": 0.03987376019358635
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1609,12 +1609,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.36214399337768555,
+      "distillation_loss": 0.19646455347537994,
       "epoch": 0.57,
-      "learning_rate": 8.53206650831354e-06,
-      "loss": 0.3629,
+      "learning_rate": 1.770641330166271e-05,
+      "loss": 0.354,
       "step": 1210,
-      "task_loss": 0.26368093490600586
+      "task_loss": 0.19836881756782532
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1622,12 +1622,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.3077784478664398,
+      "distillation_loss": 0.24901585280895233,
       "epoch": 0.58,
-      "learning_rate": 8.437054631828979e-06,
-      "loss": 0.3036,
+      "learning_rate": 1.76874109263658e-05,
+      "loss": 0.3213,
       "step": 1220,
-      "task_loss": 0.0927947610616684
+      "task_loss": 0.055774152278900146
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1635,12 +1635,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.20123688876628876,
+      "distillation_loss": 0.223684161901474,
       "epoch": 0.58,
-      "learning_rate": 8.342042755344418e-06,
-      "loss": 0.2894,
+      "learning_rate": 1.7668408551068885e-05,
+      "loss": 0.2828,
       "step": 1230,
-      "task_loss": 0.010961085557937622
+      "task_loss": 0.01634085178375244
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1648,12 +1648,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.5563689470291138,
+      "distillation_loss": 0.2887876033782959,
       "epoch": 0.59,
-      "learning_rate": 8.247030878859859e-06,
-      "loss": 0.3299,
+      "learning_rate": 1.764940617577197e-05,
+      "loss": 0.331,
       "step": 1240,
-      "task_loss": 0.3913767337799072
+      "task_loss": 0.1671813428401947
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1661,20 +1661,20 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.471743643283844,
+      "distillation_loss": 0.4364287257194519,
       "epoch": 0.59,
-      "learning_rate": 8.152019002375298e-06,
-      "loss": 0.3306,
+      "learning_rate": 1.763040380047506e-05,
+      "loss": 0.3717,
       "step": 1250,
-      "task_loss": 0.19782987236976624
+      "task_loss": 0.1999722123146057
     },
     {
       "epoch": 0.59,
       "eval_accuracy": 0.9243119266055045,
-      "eval_loss": 0.20327819883823395,
-      "eval_runtime": 22.009,
-      "eval_samples_per_second": 39.62,
-      "eval_steps_per_second": 4.953,
+      "eval_loss": 0.2037193924188614,
+      "eval_runtime": 23.0016,
+      "eval_samples_per_second": 37.91,
+      "eval_steps_per_second": 4.739,
       "step": 1250
     },
     {
@@ -1683,12 +1683,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.01976114884018898,
+      "distillation_loss": 0.017401084303855896,
       "epoch": 0.6,
-      "learning_rate": 8.057007125890736e-06,
-      "loss": 0.2307,
+      "learning_rate": 1.761140142517815e-05,
+      "loss": 0.1969,
       "step": 1260,
-      "task_loss": 0.006360933184623718
+      "task_loss": 0.00458671897649765
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1696,12 +1696,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.2867504358291626,
+      "distillation_loss": 0.26557794213294983,
       "epoch": 0.6,
-      "learning_rate": 7.961995249406177e-06,
-      "loss": 0.4469,
+      "learning_rate": 1.7592399049881237e-05,
+      "loss": 0.493,
       "step": 1270,
-      "task_loss": 0.1376619189977646
+      "task_loss": 0.1353331208229065
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1709,12 +1709,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.3130514919757843,
+      "distillation_loss": 0.45439109206199646,
       "epoch": 0.61,
-      "learning_rate": 7.866983372921616e-06,
-      "loss": 0.2792,
+      "learning_rate": 1.7573396674584323e-05,
+      "loss": 0.3094,
       "step": 1280,
-      "task_loss": 0.30938223004341125
+      "task_loss": 0.35616135597229004
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1722,12 +1722,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.21932274103164673,
+      "distillation_loss": 0.3577348589897156,
       "epoch": 0.61,
-      "learning_rate": 7.771971496437056e-06,
-      "loss": 0.2458,
+      "learning_rate": 1.7554394299287412e-05,
+      "loss": 0.3268,
       "step": 1290,
-      "task_loss": 0.07220742851495743
+      "task_loss": 0.12562984228134155
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1735,12 +1735,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.2027844786643982,
+      "distillation_loss": 0.35081303119659424,
       "epoch": 0.62,
-      "learning_rate": 7.676959619952495e-06,
-      "loss": 0.2947,
+      "learning_rate": 1.7535391923990502e-05,
+      "loss": 0.3708,
       "step": 1300,
-      "task_loss": 0.1091059073805809
+      "task_loss": 0.22274452447891235
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1748,12 +1748,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.4004560708999634,
+      "distillation_loss": 0.2592296004295349,
       "epoch": 0.62,
-      "learning_rate": 7.581947743467934e-06,
-      "loss": 0.437,
+      "learning_rate": 1.7516389548693588e-05,
+      "loss": 0.4674,
       "step": 1310,
-      "task_loss": 0.13185200095176697
+      "task_loss": 0.09062568843364716
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1761,12 +1761,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.11762793362140656,
+      "distillation_loss": 0.18336069583892822,
       "epoch": 0.63,
-      "learning_rate": 7.486935866983374e-06,
-      "loss": 0.2396,
+      "learning_rate": 1.7497387173396674e-05,
+      "loss": 0.2377,
       "step": 1320,
-      "task_loss": 0.035176947712898254
+      "task_loss": 0.055865660309791565
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1774,12 +1774,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.38001149892807007,
+      "distillation_loss": 0.3750176429748535,
       "epoch": 0.63,
-      "learning_rate": 7.391923990498813e-06,
-      "loss": 0.225,
+      "learning_rate": 1.7478384798099764e-05,
+      "loss": 0.2343,
       "step": 1330,
-      "task_loss": 0.49264460802078247
+      "task_loss": 0.4618932902812958
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1787,12 +1787,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.16133537888526917,
+      "distillation_loss": 0.277113139629364,
       "epoch": 0.64,
-      "learning_rate": 7.296912114014253e-06,
-      "loss": 0.2908,
+      "learning_rate": 1.745938242280285e-05,
+      "loss": 0.4476,
       "step": 1340,
-      "task_loss": 0.05434826388955116
+      "task_loss": 0.11207205802202225
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1800,12 +1800,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.5223551988601685,
+      "distillation_loss": 0.37496018409729004,
       "epoch": 0.64,
-      "learning_rate": 7.201900237529692e-06,
-      "loss": 0.2895,
+      "learning_rate": 1.744038004750594e-05,
+      "loss": 0.3001,
       "step": 1350,
-      "task_loss": 0.27843406796455383
+      "task_loss": 0.2029455304145813
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1813,12 +1813,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.09743548929691315,
+      "distillation_loss": 0.08501767367124557,
       "epoch": 0.65,
-      "learning_rate": 7.106888361045131e-06,
-      "loss": 0.2579,
+      "learning_rate": 1.742137767220903e-05,
+      "loss": 0.2503,
       "step": 1360,
-      "task_loss": 0.014732744544744492
+      "task_loss": 0.02285398542881012
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1826,12 +1826,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.06801208853721619,
+      "distillation_loss": 0.14294752478599548,
       "epoch": 0.65,
-      "learning_rate": 7.01187648456057e-06,
-      "loss": 0.2038,
+      "learning_rate": 1.7402375296912115e-05,
+      "loss": 0.1823,
       "step": 1370,
-      "task_loss": 0.01958051323890686
+      "task_loss": 0.0727246105670929
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1839,12 +1839,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.49920564889907837,
+      "distillation_loss": 0.2509918510913849,
       "epoch": 0.66,
-      "learning_rate": 6.91686460807601e-06,
-      "loss": 0.3022,
+      "learning_rate": 1.73833729216152e-05,
+      "loss": 0.2555,
       "step": 1380,
-      "task_loss": 0.27152562141418457
+      "task_loss": 0.12989100813865662
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1852,12 +1852,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.07850401848554611,
+      "distillation_loss": 0.068404421210289,
       "epoch": 0.66,
-      "learning_rate": 6.82185273159145e-06,
-      "loss": 0.1903,
+      "learning_rate": 1.736437054631829e-05,
+      "loss": 0.2121,
       "step": 1390,
-      "task_loss": 0.19040407240390778
+      "task_loss": 0.10038695484399796
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1865,12 +1865,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.05175900086760521,
+      "distillation_loss": 0.04248424619436264,
       "epoch": 0.67,
-      "learning_rate": 6.726840855106889e-06,
-      "loss": 0.1951,
+      "learning_rate": 1.734536817102138e-05,
+      "loss": 0.22,
       "step": 1400,
-      "task_loss": 0.0510752871632576
+      "task_loss": 0.08192986994981766
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1878,12 +1878,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.11907292902469635,
+      "distillation_loss": 0.0670127123594284,
       "epoch": 0.67,
-      "learning_rate": 6.631828978622329e-06,
-      "loss": 0.1969,
+      "learning_rate": 1.7326365795724467e-05,
+      "loss": 0.2685,
       "step": 1410,
-      "task_loss": 0.10641947388648987
+      "task_loss": 0.07222311943769455
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1891,12 +1891,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.3327619433403015,
+      "distillation_loss": 0.37206175923347473,
       "epoch": 0.67,
-      "learning_rate": 6.536817102137768e-06,
-      "loss": 0.2099,
+      "learning_rate": 1.7307363420427553e-05,
+      "loss": 0.2674,
       "step": 1420,
-      "task_loss": 0.3468553125858307
+      "task_loss": 0.35340070724487305
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1904,12 +1904,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.2979893088340759,
+      "distillation_loss": 0.2714982032775879,
       "epoch": 0.68,
-      "learning_rate": 6.441805225653207e-06,
-      "loss": 0.3192,
+      "learning_rate": 1.7288361045130643e-05,
+      "loss": 0.3446,
       "step": 1430,
-      "task_loss": 0.26997804641723633
+      "task_loss": 0.2567555904388428
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1917,12 +1917,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.03590167686343193,
+      "distillation_loss": 0.07237125188112259,
       "epoch": 0.68,
-      "learning_rate": 6.346793349168646e-06,
-      "loss": 0.179,
+      "learning_rate": 1.7269358669833732e-05,
+      "loss": 0.197,
       "step": 1440,
-      "task_loss": 0.00364762544631958
+      "task_loss": 0.00902317464351654
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1930,12 +1930,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.16363513469696045,
+      "distillation_loss": 0.10892651230096817,
       "epoch": 0.69,
-      "learning_rate": 6.251781472684086e-06,
-      "loss": 0.3228,
+      "learning_rate": 1.7250356294536818e-05,
+      "loss": 0.2894,
       "step": 1450,
-      "task_loss": 0.20806732773780823
+      "task_loss": 0.13919095695018768
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1943,12 +1943,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.28741005063056946,
+      "distillation_loss": 0.3349069356918335,
       "epoch": 0.69,
-      "learning_rate": 6.156769596199526e-06,
-      "loss": 0.2762,
+      "learning_rate": 1.7231353919239904e-05,
+      "loss": 0.2457,
       "step": 1460,
-      "task_loss": 0.12442530691623688
+      "task_loss": 0.14725209772586823
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1956,12 +1956,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.49749070405960083,
+      "distillation_loss": 0.3073666989803314,
       "epoch": 0.7,
-      "learning_rate": 6.061757719714965e-06,
-      "loss": 0.3035,
+      "learning_rate": 1.7212351543942994e-05,
+      "loss": 0.2578,
       "step": 1470,
-      "task_loss": 0.22260920703411102
+      "task_loss": 0.07342517375946045
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1969,12 +1969,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.5554820895195007,
+      "distillation_loss": 0.38823768496513367,
       "epoch": 0.7,
-      "learning_rate": 5.9667458432304035e-06,
-      "loss": 0.3454,
+      "learning_rate": 1.7193349168646084e-05,
+      "loss": 0.331,
       "step": 1480,
-      "task_loss": 0.2797449231147766
+      "task_loss": 0.1779521405696869
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1982,12 +1982,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.13691575825214386,
+      "distillation_loss": 0.2724040448665619,
       "epoch": 0.71,
-      "learning_rate": 5.871733966745844e-06,
-      "loss": 0.2283,
+      "learning_rate": 1.717434679334917e-05,
+      "loss": 0.2154,
       "step": 1490,
-      "task_loss": 0.18624231219291687
+      "task_loss": 0.2927827835083008
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -1995,20 +1995,20 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.80184406042099,
+      "distillation_loss": 0.8642085790634155,
       "epoch": 0.71,
-      "learning_rate": 5.776722090261283e-06,
-      "loss": 0.224,
+      "learning_rate": 1.715534441805226e-05,
+      "loss": 0.2467,
       "step": 1500,
-      "task_loss": 0.4933280646800995
+      "task_loss": 0.5641911029815674
     },
     {
       "epoch": 0.71,
-      "eval_accuracy": 0.9243119266055045,
-      "eval_loss": 0.2382841557264328,
-      "eval_runtime": 22.0278,
-      "eval_samples_per_second": 39.586,
-      "eval_steps_per_second": 4.948,
+      "eval_accuracy": 0.9174311926605505,
+      "eval_loss": 0.28397560119628906,
+      "eval_runtime": 22.6893,
+      "eval_samples_per_second": 38.432,
+      "eval_steps_per_second": 4.804,
       "step": 1500
     },
     {
@@ -2017,12 +2017,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.07949607074260712,
+      "distillation_loss": 0.05709172412753105,
       "epoch": 0.72,
-      "learning_rate": 5.681710213776722e-06,
-      "loss": 0.2137,
+      "learning_rate": 1.7136342042755345e-05,
+      "loss": 0.2719,
       "step": 1510,
-      "task_loss": 0.043377894908189774
+      "task_loss": 0.025617174804210663
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2030,12 +2030,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.42658352851867676,
+      "distillation_loss": 0.45153453946113586,
       "epoch": 0.72,
-      "learning_rate": 5.5866983372921624e-06,
-      "loss": 0.2807,
+      "learning_rate": 1.7117339667458435e-05,
+      "loss": 0.2957,
       "step": 1520,
-      "task_loss": 0.24954530596733093
+      "task_loss": 0.2549566328525543
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2043,12 +2043,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.244135782122612,
+      "distillation_loss": 0.2257353961467743,
       "epoch": 0.73,
-      "learning_rate": 5.491686460807602e-06,
-      "loss": 0.3032,
+      "learning_rate": 1.709833729216152e-05,
+      "loss": 0.2892,
       "step": 1530,
-      "task_loss": 0.5667487382888794
+      "task_loss": 0.5615079998970032
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2056,12 +2056,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.12168382853269577,
+      "distillation_loss": 0.16512510180473328,
       "epoch": 0.73,
-      "learning_rate": 5.39667458432304e-06,
-      "loss": 0.2224,
+      "learning_rate": 1.707933491686461e-05,
+      "loss": 0.1975,
       "step": 1540,
-      "task_loss": 0.051823940128088
+      "task_loss": 0.09258662164211273
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2069,12 +2069,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.5252543091773987,
+      "distillation_loss": 0.4988386929035187,
       "epoch": 0.74,
-      "learning_rate": 5.3016627078384795e-06,
-      "loss": 0.3264,
+      "learning_rate": 1.7060332541567697e-05,
+      "loss": 0.3206,
       "step": 1550,
-      "task_loss": 0.5932276248931885
+      "task_loss": 0.6115778088569641
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2082,12 +2082,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.10500893741846085,
+      "distillation_loss": 0.18809723854064941,
       "epoch": 0.74,
-      "learning_rate": 5.20665083135392e-06,
-      "loss": 0.2,
+      "learning_rate": 1.7041330166270783e-05,
+      "loss": 0.1958,
       "step": 1560,
-      "task_loss": 0.06464926898479462
+      "task_loss": 0.10413852334022522
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2095,12 +2095,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.46214228868484497,
+      "distillation_loss": 0.4897454082965851,
       "epoch": 0.75,
-      "learning_rate": 5.111638954869359e-06,
-      "loss": 0.3135,
+      "learning_rate": 1.7022327790973873e-05,
+      "loss": 0.2965,
       "step": 1570,
-      "task_loss": 0.27241963148117065
+      "task_loss": 0.2838747799396515
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2108,12 +2108,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.41102948784828186,
+      "distillation_loss": 0.4037947654724121,
       "epoch": 0.75,
-      "learning_rate": 5.016627078384798e-06,
-      "loss": 0.2825,
+      "learning_rate": 1.7003325415676962e-05,
+      "loss": 0.2892,
       "step": 1580,
-      "task_loss": 0.25513461232185364
+      "task_loss": 0.25087568163871765
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2121,12 +2121,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.0665275901556015,
+      "distillation_loss": 0.05394119769334793,
       "epoch": 0.76,
-      "learning_rate": 4.921615201900238e-06,
-      "loss": 0.3124,
+      "learning_rate": 1.698432304038005e-05,
+      "loss": 0.2991,
       "step": 1590,
-      "task_loss": 0.01647743582725525
+      "task_loss": 0.015048503875732422
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2134,12 +2134,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.20230551064014435,
+      "distillation_loss": 0.30840539932250977,
       "epoch": 0.76,
-      "learning_rate": 4.826603325415678e-06,
-      "loss": 0.2472,
+      "learning_rate": 1.6965320665083134e-05,
+      "loss": 0.2293,
       "step": 1600,
-      "task_loss": 0.6591448783874512
+      "task_loss": 0.634148120880127
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2147,12 +2147,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.1525014340877533,
+      "distillation_loss": 0.1973361074924469,
       "epoch": 0.76,
-      "learning_rate": 4.731591448931116e-06,
-      "loss": 0.1319,
+      "learning_rate": 1.6946318289786224e-05,
+      "loss": 0.1691,
       "step": 1610,
-      "task_loss": 0.06440484523773193
+      "task_loss": 0.09310252964496613
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2160,12 +2160,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.38346511125564575,
+      "distillation_loss": 0.3464374244213104,
       "epoch": 0.77,
-      "learning_rate": 4.636579572446556e-06,
-      "loss": 0.2337,
+      "learning_rate": 1.6927315914489314e-05,
+      "loss": 0.2329,
       "step": 1620,
-      "task_loss": 0.27461180090904236
+      "task_loss": 0.28547829389572144
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2173,12 +2173,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.26302579045295715,
+      "distillation_loss": 0.2688274383544922,
       "epoch": 0.77,
-      "learning_rate": 4.541567695961996e-06,
-      "loss": 0.2821,
+      "learning_rate": 1.69083135391924e-05,
+      "loss": 0.2133,
       "step": 1630,
-      "task_loss": 0.1236814334988594
+      "task_loss": 0.1428394615650177
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2186,12 +2186,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.624555230140686,
+      "distillation_loss": 0.18282316625118256,
       "epoch": 0.78,
-      "learning_rate": 4.446555819477435e-06,
-      "loss": 0.3081,
+      "learning_rate": 1.6889311163895486e-05,
+      "loss": 0.2413,
       "step": 1640,
-      "task_loss": 0.3441321849822998
+      "task_loss": 0.05560823902487755
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2199,12 +2199,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.04459763318300247,
+      "distillation_loss": 0.2214377224445343,
       "epoch": 0.78,
-      "learning_rate": 4.351543942992874e-06,
-      "loss": 0.1871,
+      "learning_rate": 1.687030878859858e-05,
+      "loss": 0.2201,
       "step": 1650,
-      "task_loss": 0.009879574179649353
+      "task_loss": 0.10058430582284927
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2212,12 +2212,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.47281283140182495,
+      "distillation_loss": 0.5333009362220764,
       "epoch": 0.79,
-      "learning_rate": 4.256532066508314e-06,
-      "loss": 0.3221,
+      "learning_rate": 1.6851306413301665e-05,
+      "loss": 0.2844,
       "step": 1660,
-      "task_loss": 0.3637796640396118
+      "task_loss": 0.47247129678726196
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2225,12 +2225,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.2519533634185791,
+      "distillation_loss": 0.2876150906085968,
       "epoch": 0.79,
-      "learning_rate": 4.161520190023753e-06,
-      "loss": 0.2995,
+      "learning_rate": 1.683230403800475e-05,
+      "loss": 0.3131,
       "step": 1670,
-      "task_loss": 0.24460454285144806
+      "task_loss": 0.09321459382772446
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2238,12 +2238,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.3187342584133148,
+      "distillation_loss": 0.22139661014080048,
       "epoch": 0.8,
-      "learning_rate": 4.066508313539192e-06,
-      "loss": 0.3526,
+      "learning_rate": 1.681330166270784e-05,
+      "loss": 0.3365,
       "step": 1680,
-      "task_loss": 0.25094783306121826
+      "task_loss": 0.2087613344192505
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2251,12 +2251,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.3183995485305786,
+      "distillation_loss": 0.21150922775268555,
       "epoch": 0.8,
-      "learning_rate": 3.9714964370546325e-06,
-      "loss": 0.2348,
+      "learning_rate": 1.6794299287410927e-05,
+      "loss": 0.1991,
       "step": 1690,
-      "task_loss": 0.16011971235275269
+      "task_loss": 0.1047440767288208
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2264,12 +2264,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.5012897849082947,
+      "distillation_loss": 0.3274872303009033,
       "epoch": 0.81,
-      "learning_rate": 3.876484560570072e-06,
-      "loss": 0.2874,
+      "learning_rate": 1.6775296912114017e-05,
+      "loss": 0.2242,
       "step": 1700,
-      "task_loss": 0.30354738235473633
+      "task_loss": 0.2223893105983734
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2277,12 +2277,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.09144863486289978,
+      "distillation_loss": 0.11536785960197449,
       "epoch": 0.81,
-      "learning_rate": 3.781472684085511e-06,
-      "loss": 0.2732,
+      "learning_rate": 1.6756294536817103e-05,
+      "loss": 0.3133,
       "step": 1710,
-      "task_loss": 0.11224113404750824
+      "task_loss": 0.07253136485815048
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2290,12 +2290,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.39011454582214355,
+      "distillation_loss": 0.3560354709625244,
       "epoch": 0.82,
-      "learning_rate": 3.6864608076009504e-06,
-      "loss": 0.205,
+      "learning_rate": 1.6737292161520192e-05,
+      "loss": 0.214,
       "step": 1720,
-      "task_loss": 0.3499948978424072
+      "task_loss": 0.32433855533599854
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2303,12 +2303,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.0968744307756424,
+      "distillation_loss": 0.25568869709968567,
       "epoch": 0.82,
-      "learning_rate": 3.5914489311163897e-06,
-      "loss": 0.2088,
+      "learning_rate": 1.671828978622328e-05,
+      "loss": 0.2308,
       "step": 1730,
-      "task_loss": 0.03987715393304825
+      "task_loss": 0.1160731092095375
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2316,12 +2316,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.49030616879463196,
+      "distillation_loss": 0.5275914669036865,
       "epoch": 0.83,
-      "learning_rate": 3.4964370546318295e-06,
-      "loss": 0.3065,
+      "learning_rate": 1.6699287410926368e-05,
+      "loss": 0.3426,
       "step": 1740,
-      "task_loss": 0.33337563276290894
+      "task_loss": 0.3464980125427246
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2329,20 +2329,20 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.3354378938674927,
+      "distillation_loss": 0.33361896872520447,
       "epoch": 0.83,
-      "learning_rate": 3.4014251781472683e-06,
-      "loss": 0.2082,
+      "learning_rate": 1.6680285035629454e-05,
+      "loss": 0.2114,
       "step": 1750,
-      "task_loss": 0.2503882646560669
+      "task_loss": 0.37090158462524414
     },
     {
       "epoch": 0.83,
-      "eval_accuracy": 0.9254587155963303,
-      "eval_loss": 0.22330859303474426,
-      "eval_runtime": 21.9928,
-      "eval_samples_per_second": 39.649,
-      "eval_steps_per_second": 4.956,
+      "eval_accuracy": 0.9243119266055045,
+      "eval_loss": 0.2238595187664032,
+      "eval_runtime": 28.7109,
+      "eval_samples_per_second": 30.372,
+      "eval_steps_per_second": 3.796,
       "step": 1750
     },
     {
@@ -2351,12 +2351,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.07079672068357468,
+      "distillation_loss": 0.09009531885385513,
       "epoch": 0.84,
-      "learning_rate": 3.306413301662708e-06,
-      "loss": 0.3192,
+      "learning_rate": 1.6661282660332544e-05,
+      "loss": 0.3014,
       "step": 1760,
-      "task_loss": 0.006712011992931366
+      "task_loss": 0.005033731460571289
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2364,12 +2364,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.12471574544906616,
+      "distillation_loss": 0.25561287999153137,
       "epoch": 0.84,
-      "learning_rate": 3.211401425178148e-06,
-      "loss": 0.1666,
+      "learning_rate": 1.664228028503563e-05,
+      "loss": 0.2109,
       "step": 1770,
-      "task_loss": 0.047819193452596664
+      "task_loss": 0.12138234078884125
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2377,12 +2377,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.11018840968608856,
+      "distillation_loss": 0.0638018250465393,
       "epoch": 0.85,
-      "learning_rate": 3.1163895486935867e-06,
-      "loss": 0.2473,
+      "learning_rate": 1.662327790973872e-05,
+      "loss": 0.2247,
       "step": 1780,
-      "task_loss": 0.029587876051664352
+      "task_loss": 0.013234104961156845
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2390,12 +2390,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.27604231238365173,
+      "distillation_loss": 0.2950674891471863,
       "epoch": 0.85,
-      "learning_rate": 3.0213776722090264e-06,
-      "loss": 0.299,
+      "learning_rate": 1.6604275534441806e-05,
+      "loss": 0.2714,
       "step": 1790,
-      "task_loss": 0.12574651837348938
+      "task_loss": 0.13912129402160645
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2403,12 +2403,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.19024261832237244,
+      "distillation_loss": 0.27107563614845276,
       "epoch": 0.86,
-      "learning_rate": 2.9263657957244658e-06,
-      "loss": 0.2579,
+      "learning_rate": 1.6585273159144895e-05,
+      "loss": 0.3252,
       "step": 1800,
-      "task_loss": 0.11378154903650284
+      "task_loss": 0.16670764982700348
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2416,12 +2416,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.24252384901046753,
+      "distillation_loss": 0.07524740695953369,
       "epoch": 0.86,
-      "learning_rate": 2.8313539192399055e-06,
-      "loss": 0.2374,
+      "learning_rate": 1.656627078384798e-05,
+      "loss": 0.2284,
       "step": 1810,
-      "task_loss": 0.08886312693357468
+      "task_loss": 0.01689385622739792
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2429,12 +2429,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.40556859970092773,
+      "distillation_loss": 0.49834099411964417,
       "epoch": 0.86,
-      "learning_rate": 2.7363420427553444e-06,
-      "loss": 0.2556,
+      "learning_rate": 1.654726840855107e-05,
+      "loss": 0.2676,
       "step": 1820,
-      "task_loss": 0.16999585926532745
+      "task_loss": 0.21745765209197998
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2442,12 +2442,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.6070685386657715,
+      "distillation_loss": 0.5565487742424011,
       "epoch": 0.87,
-      "learning_rate": 2.641330166270784e-06,
-      "loss": 0.3513,
+      "learning_rate": 1.652826603325416e-05,
+      "loss": 0.3219,
       "step": 1830,
-      "task_loss": 0.21770261228084564
+      "task_loss": 0.20299580693244934
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2455,12 +2455,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.11093200743198395,
+      "distillation_loss": 0.1903807818889618,
       "epoch": 0.87,
-      "learning_rate": 2.546318289786224e-06,
-      "loss": 0.2251,
+      "learning_rate": 1.6509263657957247e-05,
+      "loss": 0.2616,
       "step": 1840,
-      "task_loss": 0.07486331462860107
+      "task_loss": 0.14985938370227814
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2468,12 +2468,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.15582458674907684,
+      "distillation_loss": 0.07635773718357086,
       "epoch": 0.88,
-      "learning_rate": 2.4513064133016627e-06,
-      "loss": 0.3013,
+      "learning_rate": 1.6490261282660333e-05,
+      "loss": 0.2512,
       "step": 1850,
-      "task_loss": 0.2854090631008148
+      "task_loss": 0.20876792073249817
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2481,12 +2481,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.5564573407173157,
+      "distillation_loss": 0.4339378774166107,
       "epoch": 0.88,
-      "learning_rate": 2.356294536817102e-06,
-      "loss": 0.3546,
+      "learning_rate": 1.6471258907363422e-05,
+      "loss": 0.3134,
       "step": 1860,
-      "task_loss": 0.2976299524307251
+      "task_loss": 0.23869368433952332
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2494,12 +2494,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.35662102699279785,
+      "distillation_loss": 0.5155743360519409,
       "epoch": 0.89,
-      "learning_rate": 2.261282660332542e-06,
-      "loss": 0.1928,
+      "learning_rate": 1.6452256532066512e-05,
+      "loss": 0.2005,
       "step": 1870,
-      "task_loss": 0.17799919843673706
+      "task_loss": 0.27464839816093445
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2507,12 +2507,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.07805321365594864,
+      "distillation_loss": 0.11325311660766602,
       "epoch": 0.89,
-      "learning_rate": 2.166270783847981e-06,
-      "loss": 0.273,
+      "learning_rate": 1.6433254156769598e-05,
+      "loss": 0.271,
       "step": 1880,
-      "task_loss": 0.032489631325006485
+      "task_loss": 0.07444935292005539
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2520,12 +2520,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.0970858484506607,
+      "distillation_loss": 0.11618862301111221,
       "epoch": 0.9,
-      "learning_rate": 2.071258907363421e-06,
-      "loss": 0.2183,
+      "learning_rate": 1.6414251781472684e-05,
+      "loss": 0.2204,
       "step": 1890,
-      "task_loss": 0.011649325489997864
+      "task_loss": 0.03171085566282272
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2533,12 +2533,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.06496274471282959,
+      "distillation_loss": 0.08980407565832138,
       "epoch": 0.9,
-      "learning_rate": 1.97624703087886e-06,
-      "loss": 0.3183,
+      "learning_rate": 1.6395249406175774e-05,
+      "loss": 0.2843,
       "step": 1900,
-      "task_loss": 0.09150334447622299
+      "task_loss": 0.09885088354349136
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2546,12 +2546,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.15299738943576813,
+      "distillation_loss": 0.2316761016845703,
       "epoch": 0.91,
-      "learning_rate": 1.8812351543942995e-06,
-      "loss": 0.2088,
+      "learning_rate": 1.637624703087886e-05,
+      "loss": 0.1765,
       "step": 1910,
-      "task_loss": 0.06896167993545532
+      "task_loss": 0.12237384915351868
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2559,12 +2559,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.3061752915382385,
+      "distillation_loss": 0.03400692343711853,
       "epoch": 0.91,
-      "learning_rate": 1.7862232779097388e-06,
-      "loss": 0.1873,
+      "learning_rate": 1.635724465558195e-05,
+      "loss": 0.2091,
       "step": 1920,
-      "task_loss": 0.14114254713058472
+      "task_loss": 0.006525538861751556
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2572,12 +2572,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.04780574142932892,
+      "distillation_loss": 0.03192061930894852,
       "epoch": 0.92,
-      "learning_rate": 1.691211401425178e-06,
-      "loss": 0.1784,
+      "learning_rate": 1.6338242280285036e-05,
+      "loss": 0.2077,
       "step": 1930,
-      "task_loss": 0.009304128587245941
+      "task_loss": 0.005565345287322998
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2585,12 +2585,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.08279172331094742,
+      "distillation_loss": 0.056986477226018906,
       "epoch": 0.92,
-      "learning_rate": 1.5961995249406176e-06,
-      "loss": 0.2511,
+      "learning_rate": 1.6319239904988125e-05,
+      "loss": 0.2957,
       "step": 1940,
-      "task_loss": 0.017669253051280975
+      "task_loss": 0.011667303740978241
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2598,12 +2598,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.033537302166223526,
+      "distillation_loss": 0.03923925757408142,
       "epoch": 0.93,
-      "learning_rate": 1.5011876484560572e-06,
-      "loss": 0.2502,
+      "learning_rate": 1.630023752969121e-05,
+      "loss": 0.2349,
       "step": 1950,
-      "task_loss": 0.04457058385014534
+      "task_loss": 0.07591888308525085
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2611,12 +2611,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.1866932362318039,
+      "distillation_loss": 0.30294090509414673,
       "epoch": 0.93,
-      "learning_rate": 1.4061757719714967e-06,
-      "loss": 0.251,
+      "learning_rate": 1.62812351543943e-05,
+      "loss": 0.2222,
       "step": 1960,
-      "task_loss": 0.1178937703371048
+      "task_loss": 0.21820741891860962
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2624,12 +2624,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.3302549719810486,
+      "distillation_loss": 0.23649045825004578,
       "epoch": 0.94,
-      "learning_rate": 1.311163895486936e-06,
-      "loss": 0.1995,
+      "learning_rate": 1.626223277909739e-05,
+      "loss": 0.2035,
       "step": 1970,
-      "task_loss": 0.2511519193649292
+      "task_loss": 0.21682609617710114
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2637,12 +2637,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.2090863585472107,
+      "distillation_loss": 0.39186713099479675,
       "epoch": 0.94,
-      "learning_rate": 1.2161520190023753e-06,
-      "loss": 0.2799,
+      "learning_rate": 1.6243230403800477e-05,
+      "loss": 0.2666,
       "step": 1980,
-      "task_loss": 0.14533783495426178
+      "task_loss": 0.11818552017211914
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2650,12 +2650,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.064464271068573,
+      "distillation_loss": 0.3313908576965332,
       "epoch": 0.95,
-      "learning_rate": 1.1211401425178148e-06,
-      "loss": 0.3001,
+      "learning_rate": 1.6224228028503563e-05,
+      "loss": 0.3135,
       "step": 1990,
-      "task_loss": 0.11410848796367645
+      "task_loss": 0.2329636514186859
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2663,20 +2663,20 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.5515795946121216,
+      "distillation_loss": 0.314828097820282,
       "epoch": 0.95,
-      "learning_rate": 1.0261282660332544e-06,
-      "loss": 0.2161,
+      "learning_rate": 1.6205225653206652e-05,
+      "loss": 0.1777,
       "step": 2000,
-      "task_loss": 0.21116505563259125
+      "task_loss": 0.31898385286331177
     },
     {
       "epoch": 0.95,
-      "eval_accuracy": 0.9254587155963303,
-      "eval_loss": 0.22065171599388123,
-      "eval_runtime": 21.9957,
-      "eval_samples_per_second": 39.644,
-      "eval_steps_per_second": 4.956,
+      "eval_accuracy": 0.926605504587156,
+      "eval_loss": 0.1968172788619995,
+      "eval_runtime": 29.4372,
+      "eval_samples_per_second": 29.622,
+      "eval_steps_per_second": 3.703,
       "step": 2000
     },
     {
@@ -2685,12 +2685,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.33025607466697693,
+      "distillation_loss": 0.5318342447280884,
       "epoch": 0.95,
-      "learning_rate": 9.311163895486937e-07,
-      "loss": 0.1835,
+      "learning_rate": 1.6186223277909742e-05,
+      "loss": 0.2675,
       "step": 2010,
-      "task_loss": 0.4712632894515991
+      "task_loss": 0.5134344100952148
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2698,12 +2698,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.30521997809410095,
+      "distillation_loss": 0.3911985158920288,
       "epoch": 0.96,
-      "learning_rate": 8.361045130641331e-07,
-      "loss": 0.2597,
+      "learning_rate": 1.6167220902612828e-05,
+      "loss": 0.2213,
       "step": 2020,
-      "task_loss": 0.2004762589931488
+      "task_loss": 0.2515479028224945
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2711,12 +2711,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.10356535017490387,
+      "distillation_loss": 0.1401204764842987,
       "epoch": 0.96,
-      "learning_rate": 7.410926365795724e-07,
-      "loss": 0.2461,
+      "learning_rate": 1.6148218527315914e-05,
+      "loss": 0.2272,
       "step": 2030,
-      "task_loss": 0.07815247774124146
+      "task_loss": 0.09529782831668854
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2724,12 +2724,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.07229259610176086,
+      "distillation_loss": 0.16212940216064453,
       "epoch": 0.97,
-      "learning_rate": 6.460807600950119e-07,
-      "loss": 0.1703,
+      "learning_rate": 1.6129216152019004e-05,
+      "loss": 0.212,
       "step": 2040,
-      "task_loss": 0.188236802816391
+      "task_loss": 0.31426337361335754
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2737,12 +2737,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.8154426217079163,
+      "distillation_loss": 0.21590156853199005,
       "epoch": 0.97,
-      "learning_rate": 5.510688836104513e-07,
-      "loss": 0.2093,
+      "learning_rate": 1.6110213776722093e-05,
+      "loss": 0.1846,
       "step": 2050,
-      "task_loss": 0.6037241220474243
+      "task_loss": 0.2860991060733795
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2750,12 +2750,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.26825618743896484,
+      "distillation_loss": 0.38810980319976807,
       "epoch": 0.98,
-      "learning_rate": 4.560570071258908e-07,
-      "loss": 0.2883,
+      "learning_rate": 1.609121140142518e-05,
+      "loss": 0.2844,
       "step": 2060,
-      "task_loss": 0.31348007917404175
+      "task_loss": 0.39935895800590515
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2763,12 +2763,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.7826902270317078,
+      "distillation_loss": 0.7248147130012512,
       "epoch": 0.98,
-      "learning_rate": 3.610451306413302e-07,
-      "loss": 0.3093,
+      "learning_rate": 1.6072209026128266e-05,
+      "loss": 0.3505,
       "step": 2070,
-      "task_loss": 0.6269980669021606
+      "task_loss": 0.5984583497047424
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2776,12 +2776,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.39573800563812256,
+      "distillation_loss": 0.2939862608909607,
       "epoch": 0.99,
-      "learning_rate": 2.660332541567696e-07,
-      "loss": 0.3111,
+      "learning_rate": 1.6053206650831355e-05,
+      "loss": 0.2442,
       "step": 2080,
-      "task_loss": 0.24385812878608704
+      "task_loss": 0.15204966068267822
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2789,12 +2789,12 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.23440049588680267,
+      "distillation_loss": 0.13827058672904968,
       "epoch": 0.99,
-      "learning_rate": 1.7102137767220902e-07,
-      "loss": 0.2771,
+      "learning_rate": 1.6034204275534445e-05,
+      "loss": 0.1982,
       "step": 2090,
-      "task_loss": 0.12204352021217346
+      "task_loss": 0.05438768118619919
     },
     {
       "compression/movement_sparsity/importance_regularization_factor": 0.0,
@@ -2802,26 +2802,11278 @@
       "compression/movement_sparsity/linear_layer_sparsity": 0.0,
       "compression/movement_sparsity/model_sparsity": 0.0,
       "compression_loss": 0.0,
-      "distillation_loss": 0.2844744920730591,
+      "distillation_loss": 0.18541285395622253,
       "epoch": 1.0,
-      "learning_rate": 7.600950118764846e-08,
-      "loss": 0.2458,
+      "learning_rate": 1.601520190023753e-05,
+      "loss": 0.2719,
       "step": 2100,
-      "task_loss": 0.3573310077190399
+      "task_loss": 0.3633626699447632
     },
     {
+      "compression/movement_sparsity/importance_regularization_factor": 0.00028449433713194084,
+      "compression/movement_sparsity/importance_threshold": -0.0026588774864763865,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.0010491357083709726,
+      "compression/movement_sparsity/model_sparsity": 0.0008146869027482775,
+      "compression_loss": 0.07738782465457916,
+      "distillation_loss": 0.10512904077768326,
       "epoch": 1.0,
-      "step": 2105,
-      "total_flos": 4441630972486656.0,
-      "train_loss": 0.40093172477146793,
-      "train_runtime": 1354.5918,
-      "train_samples_per_second": 49.719,
-      "train_steps_per_second": 1.554
+      "learning_rate": 1.5996199524940617e-05,
+      "loss": 0.1552,
+      "step": 2110,
+      "task_loss": 0.061604227870702744
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.0009910043789297608,
+      "compression/movement_sparsity/importance_threshold": -0.002621092018561318,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.0011034303485395966,
+      "compression/movement_sparsity/model_sparsity": 0.0008568483999520002,
+      "compression_loss": 0.2695717215538025,
+      "distillation_loss": 0.11719319969415665,
+      "epoch": 1.01,
+      "learning_rate": 1.5977197149643707e-05,
+      "loss": 0.3396,
+      "step": 2120,
+      "task_loss": 0.3348212242126465
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.001690789007294047,
+      "compression/movement_sparsity/importance_threshold": -0.0025836662382352064,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.0012307216388136105,
+      "compression/movement_sparsity/model_sparsity": 0.0009556940937862046,
+      "compression_loss": 0.4599255919456482,
+      "distillation_loss": 0.0449785441160202,
+      "epoch": 1.01,
+      "learning_rate": 1.5958194774346796e-05,
+      "loss": 0.5465,
+      "step": 2130,
+      "task_loss": 0.017805740237236023
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.0023838803858471425,
+      "compression/movement_sparsity/importance_threshold": -0.002546598425327852,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.0014600153380006022,
+      "compression/movement_sparsity/model_sparsity": 0.0011337478690222034,
+      "compression_loss": 0.6484553217887878,
+      "distillation_loss": 0.08259381353855133,
+      "epoch": 1.02,
+      "learning_rate": 1.5939192399049882e-05,
+      "loss": 0.7333,
+      "step": 2140,
+      "task_loss": 0.034424424171447754
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.0030703106782113913,
+      "compression/movement_sparsity/importance_threshold": -0.0025098868596690545,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.0017334528380006022,
+      "compression/movement_sparsity/model_sparsity": 0.001346080695169288,
+      "compression_loss": 0.8351697325706482,
+      "distillation_loss": 0.07959377765655518,
+      "epoch": 1.02,
+      "learning_rate": 1.5920190023752972e-05,
+      "loss": 0.9215,
+      "step": 2150,
+      "task_loss": 0.015001043677330017
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.0037501120480091413,
+      "compression/movement_sparsity/importance_threshold": -0.0024735298210886133,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.0020618201407708522,
+      "compression/movement_sparsity/model_sparsity": 0.0016010682422742115,
+      "compression_loss": 1.0200759172439575,
+      "distillation_loss": 0.38012033700942993,
+      "epoch": 1.03,
+      "learning_rate": 1.5901187648456058e-05,
+      "loss": 1.1094,
+      "step": 2160,
+      "task_loss": 0.1993572860956192
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.004423316658862747,
+      "compression/movement_sparsity/importance_threshold": -0.0024375255894163272,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.0023032407407407407,
+      "compression/movement_sparsity/model_sparsity": 0.0017885389377045428,
+      "compression_loss": 1.2031831741333008,
+      "distillation_loss": 0.09336289763450623,
+      "epoch": 1.03,
+      "learning_rate": 1.5882185273159144e-05,
+      "loss": 1.319,
+      "step": 2170,
+      "task_loss": 0.3724411129951477
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.005089956674394564,
+      "compression/movement_sparsity/importance_threshold": -0.0024018724444819957,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.00264551104712436,
+      "compression/movement_sparsity/model_sparsity": 0.00205432260476068,
+      "compression_loss": 1.384499192237854,
+      "distillation_loss": 0.1010119616985321,
+      "epoch": 1.04,
+      "learning_rate": 1.5863182897862234e-05,
+      "loss": 1.4167,
+      "step": 2180,
+      "task_loss": 0.05338115245103836
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.0057500642582269225,
+      "compression/movement_sparsity/importance_threshold": -0.002366568666115419,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.0030340894120746763,
+      "compression/movement_sparsity/model_sparsity": 0.0023560659370011876,
+      "compression_loss": 1.564030647277832,
+      "distillation_loss": 0.021691124886274338,
+      "epoch": 1.04,
+      "learning_rate": 1.5844180522565323e-05,
+      "loss": 1.6449,
+      "step": 2190,
+      "task_loss": 0.003468889743089676
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.006403671573982206,
+      "compression/movement_sparsity/importance_threshold": -0.002331612534146395,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.0036809201859379706,
+      "compression/movement_sparsity/model_sparsity": 0.0028583503941561096,
+      "compression_loss": 1.741767406463623,
+      "distillation_loss": 0.07079476118087769,
+      "epoch": 1.05,
+      "learning_rate": 1.582517814726841e-05,
+      "loss": 1.7437,
+      "step": 2200,
+      "task_loss": 0.022576410323381424
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.007050810785282746,
+      "compression/movement_sparsity/importance_threshold": -0.0022970023284047234,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.00432740985395965,
+      "compression/movement_sparsity/model_sparsity": 0.0033603699718875247,
+      "compression_loss": 1.9177247285842896,
+      "distillation_loss": 0.030747881159186363,
+      "epoch": 1.05,
+      "learning_rate": 1.5806175771971496e-05,
+      "loss": 1.9982,
+      "step": 2210,
+      "task_loss": 0.005105555057525635
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.007691514055750898,
+      "compression/movement_sparsity/importance_threshold": -0.002262736328720204,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.0052953741342968985,
+      "compression/movement_sparsity/model_sparsity": 0.004112024705614379,
+      "compression_loss": 2.0919368267059326,
+      "distillation_loss": 0.2072967141866684,
+      "epoch": 1.05,
+      "learning_rate": 1.5787173396674585e-05,
+      "loss": 2.1582,
+      "step": 2220,
+      "task_loss": 0.33855485916137695
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.00832581354900901,
+      "compression/movement_sparsity/importance_threshold": -0.002228812814922636,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.006365270249924722,
+      "compression/movement_sparsity/model_sparsity": 0.004942832718103681,
+      "compression_loss": 2.2644152641296387,
+      "distillation_loss": 0.13281983137130737,
+      "epoch": 1.06,
+      "learning_rate": 1.5768171021377675e-05,
+      "loss": 2.3957,
+      "step": 2230,
+      "task_loss": 0.10870643705129623
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.00895374142867943,
+      "compression/movement_sparsity/importance_threshold": -0.002195230066841819,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.007350924984944294,
+      "compression/movement_sparsity/model_sparsity": 0.005708224646759998,
+      "compression_loss": 2.43515682220459,
+      "distillation_loss": 0.023621466010808945,
+      "epoch": 1.06,
+      "learning_rate": 1.574916864608076e-05,
+      "loss": 2.5349,
+      "step": 2240,
+      "task_loss": 0.0038209035992622375
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.009575329858384518,
+      "compression/movement_sparsity/importance_threshold": -0.0021619863643075514,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.008769878236976815,
+      "compression/movement_sparsity/model_sparsity": 0.006810086513455902,
+      "compression_loss": 2.6041481494903564,
+      "distillation_loss": 0.014014622196555138,
+      "epoch": 1.07,
+      "learning_rate": 1.5730166270783847e-05,
+      "loss": 2.6501,
+      "step": 2250,
+      "task_loss": 0.002429734915494919
+    },
+    {
+      "epoch": 1.07,
+      "eval_accuracy": 0.9254587155963303,
+      "eval_loss": 2.8218579292297363,
+      "eval_runtime": 30.6417,
+      "eval_samples_per_second": 28.458,
+      "eval_steps_per_second": 3.557,
+      "step": 2250
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.010190611001746603,
+      "compression/movement_sparsity/importance_threshold": -0.0021290799871496345,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.010168882678410117,
+      "compression/movement_sparsity/model_sparsity": 0.007896457500763263,
+      "compression_loss": 2.7714102268218994,
+      "distillation_loss": 0.1411275863647461,
+      "epoch": 1.07,
+      "learning_rate": 1.5711163895486937e-05,
+      "loss": 2.8369,
+      "step": 2260,
+      "task_loss": 0.06266731023788452
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.010799617022388065,
+      "compression/movement_sparsity/importance_threshold": -0.0020965092151978655,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.011874576558265583,
+      "compression/movement_sparsity/model_sparsity": 0.009220982491123023,
+      "compression_loss": 2.936958074569702,
+      "distillation_loss": 0.32265156507492065,
+      "epoch": 1.08,
+      "learning_rate": 1.5692161520190026e-05,
+      "loss": 3.0225,
+      "step": 2270,
+      "task_loss": 0.4202197194099426
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.01140238008393124,
+      "compression/movement_sparsity/importance_threshold": -0.0020642723282820446,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.013452696853357423,
+      "compression/movement_sparsity/model_sparsity": 0.010446442577091193,
+      "compression_loss": 3.100813865661621,
+      "distillation_loss": 0.03656826913356781,
+      "epoch": 1.08,
+      "learning_rate": 1.5673159144893113e-05,
+      "loss": 3.1307,
+      "step": 2280,
+      "task_loss": 0.16597847640514374
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.011998932349998482,
+      "compression/movement_sparsity/importance_threshold": -0.002032367606231971,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.015547780694820837,
+      "compression/movement_sparsity/model_sparsity": 0.01207334113004394,
+      "compression_loss": 3.262972354888916,
+      "distillation_loss": 0.3931387662887573,
+      "epoch": 1.09,
+      "learning_rate": 1.5654156769596202e-05,
+      "loss": 3.4253,
+      "step": 2290,
+      "task_loss": 0.39384859800338745
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.012589305984212141,
+      "compression/movement_sparsity/importance_threshold": -0.0020007933288774442,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.018422667777024993,
+      "compression/movement_sparsity/model_sparsity": 0.01430578144645319,
+      "compression_loss": 3.4234325885772705,
+      "distillation_loss": 0.05559838190674782,
+      "epoch": 1.09,
+      "learning_rate": 1.5635154394299288e-05,
+      "loss": 3.4753,
+      "step": 2300,
+      "task_loss": 0.011821478605270386
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.013173533150194566,
+      "compression/movement_sparsity/importance_threshold": -0.0019695477760482637,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.02154230418172237,
+      "compression/movement_sparsity/model_sparsity": 0.01672827731611527,
+      "compression_loss": 3.5822105407714844,
+      "distillation_loss": 0.10458941757678986,
+      "epoch": 1.1,
+      "learning_rate": 1.5616152019002378e-05,
+      "loss": 3.7453,
+      "step": 2310,
+      "task_loss": 0.33254674077033997
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.013751646011568098,
+      "compression/movement_sparsity/importance_threshold": -0.001938629227574229,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.025347445705359833,
+      "compression/movement_sparsity/model_sparsity": 0.019683089489294023,
+      "compression_loss": 3.739313840866089,
+      "distillation_loss": 0.07036435604095459,
+      "epoch": 1.1,
+      "learning_rate": 1.5597149643705464e-05,
+      "loss": 3.7802,
+      "step": 2320,
+      "task_loss": 0.042275626212358475
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.01432367673195511,
+      "compression/movement_sparsity/importance_threshold": -0.0019080359632851387,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.02890405045543511,
+      "compression/movement_sparsity/model_sparsity": 0.022444905034241694,
+      "compression_loss": 3.894759178161621,
+      "distillation_loss": 0.06921583414077759,
+      "epoch": 1.11,
+      "learning_rate": 1.5578147268408554e-05,
+      "loss": 3.984,
+      "step": 2330,
+      "task_loss": 0.2081681489944458
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.014889657474977936,
+      "compression/movement_sparsity/importance_threshold": -0.0018777662630107927,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.03373462671258657,
+      "compression/movement_sparsity/model_sparsity": 0.026195999557121603,
+      "compression_loss": 4.048542022705078,
+      "distillation_loss": 0.021231140941381454,
+      "epoch": 1.11,
+      "learning_rate": 1.555914489311164e-05,
+      "loss": 4.1113,
+      "step": 2340,
+      "task_loss": 0.00398905947804451
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.015449620404258941,
+      "compression/movement_sparsity/importance_threshold": -0.0018478184065809896,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.04030603075127973,
+      "compression/movement_sparsity/model_sparsity": 0.03129890165098249,
+      "compression_loss": 4.2006354331970215,
+      "distillation_loss": 0.024290261790156364,
+      "epoch": 1.12,
+      "learning_rate": 1.554014251781473e-05,
+      "loss": 4.25,
+      "step": 2350,
+      "task_loss": 0.005856834352016449
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.016003597683420464,
+      "compression/movement_sparsity/importance_threshold": -0.0018181906738255296,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.04683566696778079,
+      "compression/movement_sparsity/model_sparsity": 0.036369369716123603,
+      "compression_loss": 4.351089954376221,
+      "distillation_loss": 0.03239811956882477,
+      "epoch": 1.12,
+      "learning_rate": 1.5521140142517815e-05,
+      "loss": 4.4352,
+      "step": 2360,
+      "task_loss": 0.012328799813985825
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.016551621476084855,
+      "compression/movement_sparsity/importance_threshold": -0.0017888813445742116,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.05416729006699789,
+      "compression/movement_sparsity/model_sparsity": 0.0420626058410224,
+      "compression_loss": 4.499930381774902,
+      "distillation_loss": 0.12336177378892899,
+      "epoch": 1.13,
+      "learning_rate": 1.5502137767220905e-05,
+      "loss": 4.5803,
+      "step": 2370,
+      "task_loss": 0.06430968642234802
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.017093723945874474,
+      "compression/movement_sparsity/importance_threshold": -0.0017598886986568353,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.06110632386705812,
+      "compression/movement_sparsity/model_sparsity": 0.04745098401701118,
+      "compression_loss": 4.647174835205078,
+      "distillation_loss": 0.030381930992007256,
+      "epoch": 1.13,
+      "learning_rate": 1.548313539192399e-05,
+      "loss": 4.7628,
+      "step": 2380,
+      "task_loss": 0.1719023883342743
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.01762993725641166,
+      "compression/movement_sparsity/importance_threshold": -0.0017312110159031996,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.06815575833709726,
+      "compression/movement_sparsity/model_sparsity": 0.05292509178848395,
+      "compression_loss": 4.792819023132324,
+      "distillation_loss": 0.44239741563796997,
+      "epoch": 1.14,
+      "learning_rate": 1.5464133016627077e-05,
+      "loss": 4.888,
+      "step": 2390,
+      "task_loss": 0.2511236369609833
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.018160293571318775,
+      "compression/movement_sparsity/importance_threshold": -0.0017028465761431042,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.07490678400707618,
+      "compression/movement_sparsity/model_sparsity": 0.058167475733254294,
+      "compression_loss": 4.936878681182861,
+      "distillation_loss": 0.07934065163135529,
+      "epoch": 1.14,
+      "learning_rate": 1.5445130641330167e-05,
+      "loss": 5.0129,
+      "step": 2400,
+      "task_loss": 0.19744229316711426
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.018684825054218146,
+      "compression/movement_sparsity/importance_threshold": -0.0016747936592063492,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.0832007019722975,
+      "compression/movement_sparsity/model_sparsity": 0.06460796411318578,
+      "compression_loss": 5.079326152801514,
+      "distillation_loss": 0.08092580735683441,
+      "epoch": 1.14,
+      "learning_rate": 1.5426128266033256e-05,
+      "loss": 5.1946,
+      "step": 2410,
+      "task_loss": 0.020919568836688995
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.019203563868732162,
+      "compression/movement_sparsity/importance_threshold": -0.001647050544922732,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.09128394591237579,
+      "compression/movement_sparsity/model_sparsity": 0.07088485748089549,
+      "compression_loss": 5.220214366912842,
+      "distillation_loss": 0.1532871127128601,
+      "epoch": 1.15,
+      "learning_rate": 1.5407125890736343e-05,
+      "loss": 5.4019,
+      "step": 2420,
+      "task_loss": 0.10705733299255371
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.01971654217848315,
+      "compression/movement_sparsity/importance_threshold": -0.0016196155131220531,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.09979940624059018,
+      "compression/movement_sparsity/model_sparsity": 0.07749738047950812,
+      "compression_loss": 5.3595380783081055,
+      "distillation_loss": 0.08937288820743561,
+      "epoch": 1.15,
+      "learning_rate": 1.538812351543943e-05,
+      "loss": 5.5179,
+      "step": 2430,
+      "task_loss": 0.2294204831123352
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.020223792147093467,
+      "compression/movement_sparsity/importance_threshold": -0.0015924868436341117,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.10875641043736826,
+      "compression/movement_sparsity/model_sparsity": 0.08445277619119063,
+      "compression_loss": 5.49728536605835,
+      "distillation_loss": 0.039297595620155334,
+      "epoch": 1.16,
+      "learning_rate": 1.536912114014252e-05,
+      "loss": 5.5902,
+      "step": 2440,
+      "task_loss": 0.012466028332710266
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.020725345938185466,
+      "compression/movement_sparsity/importance_threshold": -0.0015656628162887072,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.11579635275519422,
+      "compression/movement_sparsity/model_sparsity": 0.08991951300767133,
+      "compression_loss": 5.633523941040039,
+      "distillation_loss": 0.271115243434906,
+      "epoch": 1.16,
+      "learning_rate": 1.5350118764845608e-05,
+      "loss": 5.8822,
+      "step": 2450,
+      "task_loss": 0.06959662586450577
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.02122123571538147,
+      "compression/movement_sparsity/importance_threshold": -0.0015391417109156397,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.1232250616342969,
+      "compression/movement_sparsity/model_sparsity": 0.09568813929676413,
+      "compression_loss": 5.76822566986084,
+      "distillation_loss": 0.6192089319229126,
+      "epoch": 1.17,
+      "learning_rate": 1.5331116389548694e-05,
+      "loss": 5.9777,
+      "step": 2460,
+      "task_loss": 0.36697810888290405
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.021711493642303875,
+      "compression/movement_sparsity/importance_threshold": -0.001512921807344707,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.1321434620596206,
+      "compression/movement_sparsity/model_sparsity": 0.10261355796472424,
+      "compression_loss": 5.90135383605957,
+      "distillation_loss": 0.33043575286865234,
+      "epoch": 1.17,
+      "learning_rate": 1.5312114014251784e-05,
+      "loss": 6.1168,
+      "step": 2470,
+      "task_loss": 0.23188000917434692
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.022196151882575007,
+      "compression/movement_sparsity/importance_threshold": -0.0014870013854057092,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.14109207975760313,
+      "compression/movement_sparsity/model_sparsity": 0.10956244129609777,
+      "compression_loss": 6.032968044281006,
+      "distillation_loss": 0.44384852051734924,
+      "epoch": 1.18,
+      "learning_rate": 1.5293111638954873e-05,
+      "loss": 6.2156,
+      "step": 2480,
+      "task_loss": 0.26120489835739136
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.022675242599817222,
+      "compression/movement_sparsity/importance_threshold": -0.0014613787249284456,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.14897257744278833,
+      "compression/movement_sparsity/model_sparsity": 0.11568189581473888,
+      "compression_loss": 6.16309928894043,
+      "distillation_loss": 0.13425683975219727,
+      "epoch": 1.18,
+      "learning_rate": 1.527410926365796e-05,
+      "loss": 6.4394,
+      "step": 2490,
+      "task_loss": 0.04769594222307205
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.02314879795765287,
+      "compression/movement_sparsity/importance_threshold": -0.0014360521057427153,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.1572927958446251,
+      "compression/movement_sparsity/model_sparsity": 0.12214280731160009,
+      "compression_loss": 6.291727542877197,
+      "distillation_loss": 0.3531065583229065,
+      "epoch": 1.19,
+      "learning_rate": 1.5255106888361047e-05,
+      "loss": 6.4768,
+      "step": 2500,
+      "task_loss": 0.4275900721549988
+    },
+    {
+      "epoch": 1.19,
+      "eval_accuracy": 0.8979357798165137,
+      "eval_loss": 6.5765380859375,
+      "eval_runtime": 26.114,
+      "eval_samples_per_second": 33.392,
+      "eval_steps_per_second": 4.174,
+      "step": 2500
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.023616850119704293,
+      "compression/movement_sparsity/importance_threshold": -0.0014110198076783182,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.16596552949036436,
+      "compression/movement_sparsity/model_sparsity": 0.12887745799199585,
+      "compression_loss": 6.418839454650879,
+      "distillation_loss": 0.32202211022377014,
+      "epoch": 1.19,
+      "learning_rate": 1.5236104513064133e-05,
+      "loss": 6.6039,
+      "step": 2510,
+      "task_loss": 0.21520735323429108
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.024079431249593854,
+      "compression/movement_sparsity/importance_threshold": -0.0013862801105650534,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.17616387665612768,
+      "compression/movement_sparsity/model_sparsity": 0.1367967955946862,
+      "compression_loss": 6.544415473937988,
+      "distillation_loss": 0.24521012604236603,
+      "epoch": 1.2,
+      "learning_rate": 1.5217102137767221e-05,
+      "loss": 6.6944,
+      "step": 2520,
+      "task_loss": 0.1956482082605362
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.0245365735109439,
+      "compression/movement_sparsity/importance_threshold": -0.00136183129423272,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.18661719455736225,
+      "compression/movement_sparsity/model_sparsity": 0.14491412599956133,
+      "compression_loss": 6.668520450592041,
+      "distillation_loss": 0.3077329397201538,
+      "epoch": 1.2,
+      "learning_rate": 1.519809976247031e-05,
+      "loss": 6.8466,
+      "step": 2530,
+      "task_loss": 0.16484826803207397
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.02498830906737678,
+      "compression/movement_sparsity/importance_threshold": -0.0013376716385111176,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.19720587266636555,
+      "compression/movement_sparsity/model_sparsity": 0.1531365678667026,
+      "compression_loss": 6.791137218475342,
+      "distillation_loss": 0.15260806679725647,
+      "epoch": 1.21,
+      "learning_rate": 1.5179097387173399e-05,
+      "loss": 6.9378,
+      "step": 2540,
+      "task_loss": 0.04942808300256729
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.025434670082514835,
+      "compression/movement_sparsity/importance_threshold": -0.001313799423230046,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.20696387571514605,
+      "compression/movement_sparsity/model_sparsity": 0.16071396440119193,
+      "compression_loss": 6.912316799163818,
+      "distillation_loss": 0.1831030249595642,
+      "epoch": 1.21,
+      "learning_rate": 1.5160095011876485e-05,
+      "loss": 7.1445,
+      "step": 2550,
+      "task_loss": 0.0789690762758255
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.025875688719980434,
+      "compression/movement_sparsity/importance_threshold": -0.0012902129282193035,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.21675474254742547,
+      "compression/movement_sparsity/model_sparsity": 0.1683168806980702,
+      "compression_loss": 7.032046794891357,
+      "distillation_loss": 0.14177289605140686,
+      "epoch": 1.22,
+      "learning_rate": 1.5141092636579573e-05,
+      "loss": 7.2325,
+      "step": 2560,
+      "task_loss": 0.2033880352973938
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.02631139714339592,
+      "compression/movement_sparsity/importance_threshold": -0.00126691043330869,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.22645565153568203,
+      "compression/movement_sparsity/model_sparsity": 0.1758499418973284,
+      "compression_loss": 7.150337219238281,
+      "distillation_loss": 0.21663016080856323,
+      "epoch": 1.22,
+      "learning_rate": 1.5122090261282662e-05,
+      "loss": 7.3976,
+      "step": 2570,
+      "task_loss": 0.18229436874389648
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.02674182751638365,
+      "compression/movement_sparsity/importance_threshold": -0.0012438902183280046,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.2362620798517013,
+      "compression/movement_sparsity/model_sparsity": 0.18346494217618248,
+      "compression_loss": 7.267183780670166,
+      "distillation_loss": 0.062164291739463806,
+      "epoch": 1.23,
+      "learning_rate": 1.510308788598575e-05,
+      "loss": 7.4626,
+      "step": 2580,
+      "task_loss": 0.006800137460231781
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.027167012002565962,
+      "compression/movement_sparsity/importance_threshold": -0.0012211505631070472,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.24733038806082505,
+      "compression/movement_sparsity/model_sparsity": 0.19205983191409415,
+      "compression_loss": 7.382561683654785,
+      "distillation_loss": 0.05586311221122742,
+      "epoch": 1.23,
+      "learning_rate": 1.5084085510688838e-05,
+      "loss": 7.5546,
+      "step": 2590,
+      "task_loss": 0.01366850733757019
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.027586982765565204,
+      "compression/movement_sparsity/importance_threshold": -0.001198689747475617,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.2569306825880759,
+      "compression/movement_sparsity/model_sparsity": 0.19951476281719105,
+      "compression_loss": 7.496555805206299,
+      "distillation_loss": 0.6351262331008911,
+      "epoch": 1.24,
+      "learning_rate": 1.5065083135391924e-05,
+      "loss": 7.888,
+      "step": 2600,
+      "task_loss": 0.46579158306121826
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.028001771969003754,
+      "compression/movement_sparsity/importance_threshold": -0.0011765060512635124,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.2665302008054803,
+      "compression/movement_sparsity/model_sparsity": 0.20696909089125515,
+      "compression_loss": 7.60915994644165,
+      "distillation_loss": 0.42842957377433777,
+      "epoch": 1.24,
+      "learning_rate": 1.5046080760095012e-05,
+      "loss": 7.9082,
+      "step": 2610,
+      "task_loss": 0.14085018634796143
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.02841141177650394,
+      "compression/movement_sparsity/importance_threshold": -0.0011545977543005338,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.2762365557249323,
+      "compression/movement_sparsity/model_sparsity": 0.21450638102751624,
+      "compression_loss": 7.720366954803467,
+      "distillation_loss": 0.3830278515815735,
+      "epoch": 1.24,
+      "learning_rate": 1.5027078384798102e-05,
+      "loss": 8.14,
+      "step": 2620,
+      "task_loss": 0.28632909059524536
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.028815934351688118,
+      "compression/movement_sparsity/importance_threshold": -0.00113296313641648,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.285795494109455,
+      "compression/movement_sparsity/model_sparsity": 0.22192919758395624,
+      "compression_loss": 7.830181121826172,
+      "distillation_loss": 0.2521060109138489,
+      "epoch": 1.25,
+      "learning_rate": 1.500807600950119e-05,
+      "loss": 8.1153,
+      "step": 2630,
+      "task_loss": 0.1563887596130371
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.029215371858178636,
+      "compression/movement_sparsity/importance_threshold": -0.0011116004774411505,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.2953732638888889,
+      "compression/movement_sparsity/model_sparsity": 0.22936663731132845,
+      "compression_loss": 7.938580513000488,
+      "distillation_loss": 0.1190139651298523,
+      "epoch": 1.25,
+      "learning_rate": 1.4989073634204276e-05,
+      "loss": 8.1567,
+      "step": 2640,
+      "task_loss": 0.04846365749835968
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.029609756459597847,
+      "compression/movement_sparsity/importance_threshold": -0.0010905080572043448,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.30575512599744054,
+      "compression/movement_sparsity/model_sparsity": 0.2374284800438655,
+      "compression_loss": 8.045588493347168,
+      "distillation_loss": 0.06826656311750412,
+      "epoch": 1.26,
+      "learning_rate": 1.4970071258907363e-05,
+      "loss": 8.2518,
+      "step": 2650,
+      "task_loss": 0.011266250163316727
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.02999912031956811,
+      "compression/movement_sparsity/importance_threshold": -0.001069684155535862,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.31633204183604335,
+      "compression/movement_sparsity/model_sparsity": 0.2456417881377824,
+      "compression_loss": 8.151252746582031,
+      "distillation_loss": 0.1792048215866089,
+      "epoch": 1.26,
+      "learning_rate": 1.4951068883610453e-05,
+      "loss": 8.4222,
+      "step": 2660,
+      "task_loss": 0.06012868136167526
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.03038349560171176,
+      "compression/movement_sparsity/importance_threshold": -0.0010491270522655018,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.32636890479147845,
+      "compression/movement_sparsity/model_sparsity": 0.25343572816787524,
+      "compression_loss": 8.255572319030762,
+      "distillation_loss": 0.0919712483882904,
+      "epoch": 1.27,
+      "learning_rate": 1.4932066508313541e-05,
+      "loss": 8.513,
+      "step": 2670,
+      "task_loss": 0.017085224390029907
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.030762914469651154,
+      "compression/movement_sparsity/importance_threshold": -0.001028835027223063,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.3357471276535682,
+      "compression/movement_sparsity/model_sparsity": 0.2607182134324959,
+      "compression_loss": 8.358521461486816,
+      "distillation_loss": 0.5716733336448669,
+      "epoch": 1.27,
+      "learning_rate": 1.4913064133016629e-05,
+      "loss": 8.648,
+      "step": 2680,
+      "task_loss": 0.34179773926734924
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.03113740908700865,
+      "compression/movement_sparsity/importance_threshold": -0.0010088063602383453,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.3457659002371274,
+      "compression/movement_sparsity/model_sparsity": 0.26849810571936966,
+      "compression_loss": 8.460137367248535,
+      "distillation_loss": 0.3870583772659302,
+      "epoch": 1.28,
+      "learning_rate": 1.4894061757719715e-05,
+      "loss": 8.7722,
+      "step": 2690,
+      "task_loss": 0.2868567109107971
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.03150701161740658,
+      "compression/movement_sparsity/importance_threshold": -0.0009890393311411487,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.35571594587473654,
+      "compression/movement_sparsity/model_sparsity": 0.27622462936929343,
+      "compression_loss": 8.560412406921387,
+      "distillation_loss": 0.5741435289382935,
+      "epoch": 1.28,
+      "learning_rate": 1.4875059382422804e-05,
+      "loss": 8.8839,
+      "step": 2700,
+      "task_loss": 0.5338079333305359
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.03187175422446732,
+      "compression/movement_sparsity/importance_threshold": -0.000969532219761271,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.36540933877220716,
+      "compression/movement_sparsity/model_sparsity": 0.28375185408746123,
+      "compression_loss": 8.659358024597168,
+      "distillation_loss": 0.5666482448577881,
+      "epoch": 1.29,
+      "learning_rate": 1.4856057007125892e-05,
+      "loss": 9.0379,
+      "step": 2710,
+      "task_loss": 0.368877112865448
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.032231669071813206,
+      "compression/movement_sparsity/importance_threshold": -0.0009502833059285125,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.3747603790274014,
+      "compression/movement_sparsity/model_sparsity": 0.2910132312021604,
+      "compression_loss": 8.756999015808105,
+      "distillation_loss": 0.4417470097541809,
+      "epoch": 1.29,
+      "learning_rate": 1.483705463182898e-05,
+      "loss": 9.134,
+      "step": 2720,
+      "task_loss": 0.2064928412437439
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.032586788323066586,
+      "compression/movement_sparsity/importance_threshold": -0.0009312908694726729,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.38400508835817526,
+      "compression/movement_sparsity/model_sparsity": 0.29819203900691116,
+      "compression_loss": 8.85331916809082,
+      "distillation_loss": 0.38386473059654236,
+      "epoch": 1.3,
+      "learning_rate": 1.4818052256532068e-05,
+      "loss": 9.1598,
+      "step": 2730,
+      "task_loss": 0.21585097908973694
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.03293714414184983,
+      "compression/movement_sparsity/importance_threshold": -0.00091255319022355,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.39283086090409514,
+      "compression/movement_sparsity/model_sparsity": 0.3050455292107293,
+      "compression_loss": 8.948369026184082,
+      "distillation_loss": 0.3611488938331604,
+      "epoch": 1.3,
+      "learning_rate": 1.4799049881235154e-05,
+      "loss": 9.3696,
+      "step": 2740,
+      "task_loss": 0.24724030494689941
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.033282768691785265,
+      "compression/movement_sparsity/importance_threshold": -0.0008940685480109448,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4017413100346281,
+      "compression/movement_sparsity/model_sparsity": 0.31196477344798973,
+      "compression_loss": 9.04212474822998,
+      "distillation_loss": 0.436295747756958,
+      "epoch": 1.31,
+      "learning_rate": 1.4780047505938244e-05,
+      "loss": 9.3594,
+      "step": 2750,
+      "task_loss": 0.18571683764457703
+    },
+    {
+      "epoch": 1.31,
+      "eval_accuracy": 0.8818807339449541,
+      "eval_loss": 9.464767456054688,
+      "eval_runtime": 25.44,
+      "eval_samples_per_second": 34.277,
+      "eval_steps_per_second": 4.285,
+      "step": 2750
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.033623694136495255,
+      "compression/movement_sparsity/importance_threshold": -0.000875835222664656,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4111423987503764,
+      "compression/movement_sparsity/model_sparsity": 0.31926501476775854,
+      "compression_loss": 9.134596824645996,
+      "distillation_loss": 0.541549563407898,
+      "epoch": 1.31,
+      "learning_rate": 1.4761045130641332e-05,
+      "loss": 9.4946,
+      "step": 2760,
+      "task_loss": 0.27293768525123596
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.03395995263960215,
+      "compression/movement_sparsity/importance_threshold": -0.0008578514940144827,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4209716693955134,
+      "compression/movement_sparsity/model_sparsity": 0.3268977528342144,
+      "compression_loss": 9.225789070129395,
+      "distillation_loss": 0.18441905081272125,
+      "epoch": 1.32,
+      "learning_rate": 1.474204275534442e-05,
+      "loss": 9.5998,
+      "step": 2770,
+      "task_loss": 0.05545267462730408
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.03429157636472829,
+      "compression/movement_sparsity/importance_threshold": -0.0008401156418902246,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4294070404245709,
+      "compression/movement_sparsity/model_sparsity": 0.3334480839709423,
+      "compression_loss": 9.31571102142334,
+      "distillation_loss": 0.2552655339241028,
+      "epoch": 1.32,
+      "learning_rate": 1.4723040380047506e-05,
+      "loss": 9.5848,
+      "step": 2780,
+      "task_loss": 0.07465029507875443
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.03461859747549605,
+      "compression/movement_sparsity/importance_threshold": -0.0008226259461216808,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.43871413683754895,
+      "compression/movement_sparsity/model_sparsity": 0.3406753373088751,
+      "compression_loss": 9.404358863830566,
+      "distillation_loss": 0.5486918687820435,
+      "epoch": 1.33,
+      "learning_rate": 1.4704038004750595e-05,
+      "loss": 9.6934,
+      "step": 2790,
+      "task_loss": 0.25401195883750916
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.03494104813552775,
+      "compression/movement_sparsity/importance_threshold": -0.0008053806865386509,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.44789961137458595,
+      "compression/movement_sparsity/model_sparsity": 0.34780814743166794,
+      "compression_loss": 9.491792678833008,
+      "distillation_loss": 0.7868499755859375,
+      "epoch": 1.33,
+      "learning_rate": 1.4686935866983374e-05,
+      "loss": 9.9283,
+      "step": 2800,
+      "task_loss": 0.47319698333740234
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.03525896050844576,
+      "compression/movement_sparsity/importance_threshold": -0.0007883781429709343,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4564576275971093,
+      "compression/movement_sparsity/model_sparsity": 0.35445371642180634,
+      "compression_loss": 9.577984809875488,
+      "distillation_loss": 0.3035008907318115,
+      "epoch": 1.33,
+      "learning_rate": 1.4667933491686462e-05,
+      "loss": 9.8715,
+      "step": 2810,
+      "task_loss": 0.44653627276420593
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.03557236675787243,
+      "compression/movement_sparsity/importance_threshold": -0.0007716165952483297,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4642448975271003,
+      "compression/movement_sparsity/model_sparsity": 0.36050077665387104,
+      "compression_loss": 9.66294002532959,
+      "distillation_loss": 0.10570189356803894,
+      "epoch": 1.34,
+      "learning_rate": 1.4648931116389552e-05,
+      "loss": 10.013,
+      "step": 2820,
+      "task_loss": 0.022750303149223328
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.03588129904743011,
+      "compression/movement_sparsity/importance_threshold": -0.0007550943232006373,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4726805626129178,
+      "compression/movement_sparsity/model_sparsity": 0.3670513361349295,
+      "compression_loss": 9.746687889099121,
+      "distillation_loss": 0.5314592123031616,
+      "epoch": 1.34,
+      "learning_rate": 1.4629928741092638e-05,
+      "loss": 10.1699,
+      "step": 2830,
+      "task_loss": 0.5152712464332581
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.036185789540741135,
+      "compression/movement_sparsity/importance_threshold": -0.0007388096066576562,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.48106820234869013,
+      "compression/movement_sparsity/model_sparsity": 0.37356460241991285,
+      "compression_loss": 9.829238891601562,
+      "distillation_loss": 0.6711559295654297,
+      "epoch": 1.35,
+      "learning_rate": 1.4610926365795726e-05,
+      "loss": 10.3624,
+      "step": 2840,
+      "task_loss": 0.32968375086784363
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.036485870401427874,
+      "compression/movement_sparsity/importance_threshold": -0.0007227607254491853,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4889538048592291,
+      "compression/movement_sparsity/model_sparsity": 0.3796880209961333,
+      "compression_loss": 9.910573959350586,
+      "distillation_loss": 0.2346329241991043,
+      "epoch": 1.35,
+      "learning_rate": 1.4591923990498813e-05,
+      "loss": 10.5236,
+      "step": 2850,
+      "task_loss": 0.04817202687263489
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.03678157379311268,
+      "compression/movement_sparsity/importance_threshold": -0.0007069459594050238,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4977513832429991,
+      "compression/movement_sparsity/model_sparsity": 0.38651961754553266,
+      "compression_loss": 9.990703582763672,
+      "distillation_loss": 0.2503522038459778,
+      "epoch": 1.36,
+      "learning_rate": 1.45729216152019e-05,
+      "loss": 10.4097,
+      "step": 2860,
+      "task_loss": 0.17465853691101074
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.03707293187941788,
+      "compression/movement_sparsity/importance_threshold": -0.0006913635883549723,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.5060777297877146,
+      "compression/movement_sparsity/model_sparsity": 0.39298528773824376,
+      "compression_loss": 10.069620132446289,
+      "distillation_loss": 0.6797527074813843,
+      "epoch": 1.36,
+      "learning_rate": 1.455391923990499e-05,
+      "loss": 10.5071,
+      "step": 2870,
+      "task_loss": 0.306240975856781
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.03735997682396585,
+      "compression/movement_sparsity/importance_threshold": -0.0006760118921288289,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.5141535282106293,
+      "compression/movement_sparsity/model_sparsity": 0.3992563994275024,
+      "compression_loss": 10.147372245788574,
+      "distillation_loss": 0.5875498056411743,
+      "epoch": 1.37,
+      "learning_rate": 1.4534916864608077e-05,
+      "loss": 10.567,
+      "step": 2880,
+      "task_loss": 0.4646558165550232
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.03764274079037894,
+      "compression/movement_sparsity/importance_threshold": -0.0006608891505563933,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.5215901883845228,
+      "compression/movement_sparsity/model_sparsity": 0.4050312001472949,
+      "compression_loss": 10.223993301391602,
+      "distillation_loss": 0.8365118503570557,
+      "epoch": 1.37,
+      "learning_rate": 1.4515914489311165e-05,
+      "loss": 10.7409,
+      "step": 2890,
+      "task_loss": 0.357979416847229
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.03792125594227949,
+      "compression/movement_sparsity/importance_threshold": -0.0006459936434674647,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.5287817816546221,
+      "compression/movement_sparsity/model_sparsity": 0.41061569870195747,
+      "compression_loss": 10.299442291259766,
+      "distillation_loss": 0.34872639179229736,
+      "epoch": 1.38,
+      "learning_rate": 1.4496912114014253e-05,
+      "loss": 10.6705,
+      "step": 2900,
+      "task_loss": 0.3244781494140625
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.03819555444328984,
+      "compression/movement_sparsity/importance_threshold": -0.0006313236506918429,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.5359220914257754,
+      "compression/movement_sparsity/model_sparsity": 0.41616037400536177,
+      "compression_loss": 10.373762130737305,
+      "distillation_loss": 0.966638445854187,
+      "epoch": 1.38,
+      "learning_rate": 1.4477909738717342e-05,
+      "loss": 10.9055,
+      "step": 2910,
+      "task_loss": 0.6029879450798035
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.03846566845703237,
+      "compression/movement_sparsity/importance_threshold": -0.0006168774520593267,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.5422028027137911,
+      "compression/movement_sparsity/model_sparsity": 0.4210375440277554,
+      "compression_loss": 10.446945190429688,
+      "distillation_loss": 0.1921069324016571,
+      "epoch": 1.39,
+      "learning_rate": 1.4458907363420428e-05,
+      "loss": 10.8335,
+      "step": 2920,
+      "task_loss": 0.06273065507411957
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.03873163014712941,
+      "compression/movement_sparsity/importance_threshold": -0.0006026533273997157,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.5490401163994278,
+      "compression/movement_sparsity/model_sparsity": 0.4263469333328998,
+      "compression_loss": 10.518943786621094,
+      "distillation_loss": 0.16988125443458557,
+      "epoch": 1.39,
+      "learning_rate": 1.4439904988123516e-05,
+      "loss": 10.808,
+      "step": 2930,
+      "task_loss": 0.1513996422290802
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.03899347167720332,
+      "compression/movement_sparsity/importance_threshold": -0.0005886495565428094,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.5570446472259861,
+      "compression/movement_sparsity/model_sparsity": 0.43256270349019194,
+      "compression_loss": 10.589797973632812,
+      "distillation_loss": 0.29276514053344727,
+      "epoch": 1.4,
+      "learning_rate": 1.4420902612826604e-05,
+      "loss": 11.0576,
+      "step": 2940,
+      "task_loss": 0.39499735832214355
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.03925122521087644,
+      "compression/movement_sparsity/importance_threshold": -0.0005748644193184068,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.5645892144685336,
+      "compression/movement_sparsity/model_sparsity": 0.43842129744554487,
+      "compression_loss": 10.659571647644043,
+      "distillation_loss": 0.4497772753238678,
+      "epoch": 1.4,
+      "learning_rate": 1.4401900237529694e-05,
+      "loss": 11.046,
+      "step": 2950,
+      "task_loss": 0.18338404595851898
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.03950492291177113,
+      "compression/movement_sparsity/importance_threshold": -0.0005612961955563074,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.5716300271943692,
+      "compression/movement_sparsity/model_sparsity": 0.44388871016124415,
+      "compression_loss": 10.72823429107666,
+      "distillation_loss": 0.9414302110671997,
+      "epoch": 1.41,
+      "learning_rate": 1.4382897862232782e-05,
+      "loss": 11.2474,
+      "step": 2960,
+      "task_loss": 0.3824530839920044
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.03975459694350974,
+      "compression/movement_sparsity/importance_threshold": -0.0005479431650863105,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.5780708112202649,
+      "compression/movement_sparsity/model_sparsity": 0.44889018170344835,
+      "compression_loss": 10.795801162719727,
+      "distillation_loss": 0.5067998170852661,
+      "epoch": 1.41,
+      "learning_rate": 1.4363895486935868e-05,
+      "loss": 11.2138,
+      "step": 2970,
+      "task_loss": 0.17481166124343872
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04000027946971461,
+      "compression/movement_sparsity/importance_threshold": -0.000534803607738216,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.5847961833785005,
+      "compression/movement_sparsity/model_sparsity": 0.4541126448888163,
+      "compression_loss": 10.862251281738281,
+      "distillation_loss": 0.47052228450775146,
+      "epoch": 1.42,
+      "learning_rate": 1.4344893111638956e-05,
+      "loss": 11.3225,
+      "step": 2980,
+      "task_loss": 0.4483351707458496
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.040242002654008104,
+      "compression/movement_sparsity/importance_threshold": -0.0005218758033418225,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.5909785738482385,
+      "compression/movement_sparsity/model_sparsity": 0.45891346570082736,
+      "compression_loss": 10.92764663696289,
+      "distillation_loss": 0.43901222944259644,
+      "epoch": 1.42,
+      "learning_rate": 1.4325890736342044e-05,
+      "loss": 11.3835,
+      "step": 2990,
+      "task_loss": 0.24378883838653564
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04047979866001257,
+      "compression/movement_sparsity/importance_threshold": -0.0005091580317269292,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.5969155445460704,
+      "compression/movement_sparsity/model_sparsity": 0.46352371033451184,
+      "compression_loss": 10.991974830627441,
+      "distillation_loss": 0.6936439275741577,
+      "epoch": 1.43,
+      "learning_rate": 1.4306888361045133e-05,
+      "loss": 11.5481,
+      "step": 3000,
+      "task_loss": 0.26209497451782227
+    },
+    {
+      "epoch": 1.43,
+      "eval_accuracy": 0.856651376146789,
+      "eval_loss": 11.539105415344238,
+      "eval_runtime": 24.425,
+      "eval_samples_per_second": 35.701,
+      "eval_steps_per_second": 4.463,
+      "step": 3000
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.040713699651350355,
+      "compression/movement_sparsity/importance_threshold": -0.0004966485727233363,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.6025890286246612,
+      "compression/movement_sparsity/model_sparsity": 0.46792934931419705,
+      "compression_loss": 11.055262565612793,
+      "distillation_loss": 0.39871829748153687,
+      "epoch": 1.43,
+      "learning_rate": 1.428788598574822e-05,
+      "loss": 11.5298,
+      "step": 3010,
+      "task_loss": 0.16510379314422607
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.040943737791643814,
+      "compression/movement_sparsity/importance_threshold": -0.00048434570616084265,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.6088090701219512,
+      "compression/movement_sparsity/model_sparsity": 0.47275940733429933,
+      "compression_loss": 11.11749267578125,
+      "distillation_loss": 0.4512562155723572,
+      "epoch": 1.43,
+      "learning_rate": 1.4268883610451307e-05,
+      "loss": 11.6419,
+      "step": 3020,
+      "task_loss": 0.2681746184825897
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.041169945244515296,
+      "compression/movement_sparsity/importance_threshold": -0.000472247711869247,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.6145064551339958,
+      "compression/movement_sparsity/model_sparsity": 0.47718360614117644,
+      "compression_loss": 11.178674697875977,
+      "distillation_loss": 0.33271324634552,
+      "epoch": 1.44,
+      "learning_rate": 1.4249881235154395e-05,
+      "loss": 11.726,
+      "step": 3030,
+      "task_loss": 0.2389669120311737
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04139235417358715,
+      "compression/movement_sparsity/importance_threshold": -0.00046035286967835,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.6200651276912075,
+      "compression/movement_sparsity/model_sparsity": 0.4815000903604185,
+      "compression_loss": 11.238801002502441,
+      "distillation_loss": 1.2966899871826172,
+      "epoch": 1.44,
+      "learning_rate": 1.4230878859857485e-05,
+      "loss": 11.8034,
+      "step": 3040,
+      "task_loss": 0.7004615068435669
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04161099674248173,
+      "compression/movement_sparsity/importance_threshold": -0.00044865945941794955,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.6260913622591087,
+      "compression/movement_sparsity/model_sparsity": 0.4861796511991028,
+      "compression_loss": 11.297928810119629,
+      "distillation_loss": 0.4639695882797241,
+      "epoch": 1.45,
+      "learning_rate": 1.4211876484560572e-05,
+      "loss": 11.7174,
+      "step": 3050,
+      "task_loss": 0.23858779668807983
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.041825905114821385,
+      "compression/movement_sparsity/importance_threshold": -0.0004371657609178463,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.630700890168624,
+      "compression/movement_sparsity/model_sparsity": 0.489759094721777,
+      "compression_loss": 11.356048583984375,
+      "distillation_loss": 0.2719360888004303,
+      "epoch": 1.45,
+      "learning_rate": 1.4192874109263659e-05,
+      "loss": 11.7773,
+      "step": 3060,
+      "task_loss": 0.21212808787822723
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04203711145422847,
+      "compression/movement_sparsity/importance_threshold": -0.00042587005400783863,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.6357438459801265,
+      "compression/movement_sparsity/model_sparsity": 0.4936751086539962,
+      "compression_loss": 11.413163185119629,
+      "distillation_loss": 0.750543475151062,
+      "epoch": 1.46,
+      "learning_rate": 1.4173871733966746e-05,
+      "loss": 12.0443,
+      "step": 3070,
+      "task_loss": 0.33162063360214233
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04224464792432533,
+      "compression/movement_sparsity/importance_threshold": -0.00041477061851772625,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.6405869490552545,
+      "compression/movement_sparsity/model_sparsity": 0.49743593064536007,
+      "compression_loss": 11.469284057617188,
+      "distillation_loss": 0.4959287941455841,
+      "epoch": 1.46,
+      "learning_rate": 1.4154869358669834e-05,
+      "loss": 11.8504,
+      "step": 3080,
+      "task_loss": 0.22281783819198608
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04244854668873431,
+      "compression/movement_sparsity/importance_threshold": -0.0004038657342773089,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.6453207100647396,
+      "compression/movement_sparsity/model_sparsity": 0.5011118450808302,
+      "compression_loss": 11.524415969848633,
+      "distillation_loss": 0.7054104804992676,
+      "epoch": 1.47,
+      "learning_rate": 1.4135866983372924e-05,
+      "loss": 12.0627,
+      "step": 3090,
+      "task_loss": 0.45865899324417114
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04264883991107778,
+      "compression/movement_sparsity/importance_threshold": -0.0003931536811163849,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.6499866733476363,
+      "compression/movement_sparsity/model_sparsity": 0.5047351124474349,
+      "compression_loss": 11.57857608795166,
+      "distillation_loss": 0.363597571849823,
+      "epoch": 1.47,
+      "learning_rate": 1.411686460807601e-05,
+      "loss": 11.9987,
+      "step": 3100,
+      "task_loss": 0.1337648332118988
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.042845559754978065,
+      "compression/movement_sparsity/importance_threshold": -0.0003826327388647549,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.6550466021153267,
+      "compression/movement_sparsity/model_sparsity": 0.508664306414417,
+      "compression_loss": 11.631749153137207,
+      "distillation_loss": 0.42896637320518494,
+      "epoch": 1.48,
+      "learning_rate": 1.4097862232779098e-05,
+      "loss": 12.1107,
+      "step": 3110,
+      "task_loss": 0.39144840836524963
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04303873838405753,
+      "compression/movement_sparsity/importance_threshold": -0.0003723011873522173,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.6597050610697079,
+      "compression/movement_sparsity/model_sparsity": 0.5122817464337045,
+      "compression_loss": 11.68390941619873,
+      "distillation_loss": 0.6013063192367554,
+      "epoch": 1.48,
+      "learning_rate": 1.4078859857482186e-05,
+      "loss": 12.2025,
+      "step": 3120,
+      "task_loss": 0.20251774787902832
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04322840796193853,
+      "compression/movement_sparsity/importance_threshold": -0.0003621573064085718,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.6641005862315568,
+      "compression/movement_sparsity/model_sparsity": 0.5156950100863344,
+      "compression_loss": 11.735136985778809,
+      "distillation_loss": 0.4356003701686859,
+      "epoch": 1.49,
+      "learning_rate": 1.4059857482185275e-05,
+      "loss": 12.1346,
+      "step": 3130,
+      "task_loss": 0.19159378111362457
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.043414600652243424,
+      "compression/movement_sparsity/importance_threshold": -0.00035219937586361723,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.6691960135313159,
+      "compression/movement_sparsity/model_sparsity": 0.5196517697809077,
+      "compression_loss": 11.785431861877441,
+      "distillation_loss": 0.36588340997695923,
+      "epoch": 1.49,
+      "learning_rate": 1.4040855106888363e-05,
+      "loss": 12.2288,
+      "step": 3140,
+      "task_loss": 0.11971582472324371
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04359734861859453,
+      "compression/movement_sparsity/importance_threshold": -0.00034242567554715374,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.6736123226249624,
+      "compression/movement_sparsity/model_sparsity": 0.523081172810825,
+      "compression_loss": 11.834785461425781,
+      "distillation_loss": 0.27993667125701904,
+      "epoch": 1.5,
+      "learning_rate": 1.402185273159145e-05,
+      "loss": 12.3277,
+      "step": 3150,
+      "task_loss": 0.10686977207660675
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04377668402461424,
+      "compression/movement_sparsity/importance_threshold": -0.00033283448528897974,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.677556011931647,
+      "compression/movement_sparsity/model_sparsity": 0.5261435716988137,
+      "compression_loss": 11.883201599121094,
+      "distillation_loss": 0.22374649345874786,
+      "epoch": 1.5,
+      "learning_rate": 1.4002850356294537e-05,
+      "loss": 12.1793,
+      "step": 3160,
+      "task_loss": 0.11772890388965607
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.043952639033924865,
+      "compression/movement_sparsity/importance_threshold": -0.0003234240849188958,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.6809970641373081,
+      "compression/movement_sparsity/model_sparsity": 0.5288156570556051,
+      "compression_loss": 11.930685997009277,
+      "distillation_loss": 1.1999856233596802,
+      "epoch": 1.51,
+      "learning_rate": 1.3983847980997627e-05,
+      "loss": 12.4018,
+      "step": 3170,
+      "task_loss": 0.6132296323776245
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04412524581014878,
+      "compression/movement_sparsity/importance_threshold": -0.0003141927542666999,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.6850743257866606,
+      "compression/movement_sparsity/model_sparsity": 0.5319817790723298,
+      "compression_loss": 11.977304458618164,
+      "distillation_loss": 0.48509520292282104,
+      "epoch": 1.51,
+      "learning_rate": 1.3964845605700715e-05,
+      "loss": 12.4075,
+      "step": 3180,
+      "task_loss": 0.3772471845149994
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04429453651690833,
+      "compression/movement_sparsity/importance_threshold": -0.0003051387731621926,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.6893989597448058,
+      "compression/movement_sparsity/model_sparsity": 0.5353399934737364,
+      "compression_loss": 12.023012161254883,
+      "distillation_loss": 0.5859512090682983,
+      "epoch": 1.52,
+      "learning_rate": 1.39458432304038e-05,
+      "loss": 12.5605,
+      "step": 3190,
+      "task_loss": 0.14015838503837585
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04446054331782587,
+      "compression/movement_sparsity/importance_threshold": -0.00029626042143517236,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.6931833173366456,
+      "compression/movement_sparsity/model_sparsity": 0.5382786662696276,
+      "compression_loss": 12.067791938781738,
+      "distillation_loss": 0.3705664277076721,
+      "epoch": 1.52,
+      "learning_rate": 1.3926840855106889e-05,
+      "loss": 12.4846,
+      "step": 3200,
+      "task_loss": 0.1473858654499054
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04462329837652374,
+      "compression/movement_sparsity/importance_threshold": -0.00028755597891543883,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.6960933147959952,
+      "compression/movement_sparsity/model_sparsity": 0.5405383708991116,
+      "compression_loss": 12.111686706542969,
+      "distillation_loss": 0.826543927192688,
+      "epoch": 1.52,
+      "learning_rate": 1.3907838479809977e-05,
+      "loss": 12.5674,
+      "step": 3210,
+      "task_loss": 0.4986266791820526
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04478283385662431,
+      "compression/movement_sparsity/importance_threshold": -0.0002790237254327913,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.6998678391297802,
+      "compression/movement_sparsity/model_sparsity": 0.5434694078605874,
+      "compression_loss": 12.154720306396484,
+      "distillation_loss": 0.4464240074157715,
+      "epoch": 1.53,
+      "learning_rate": 1.3888836104513066e-05,
+      "loss": 12.4925,
+      "step": 3220,
+      "task_loss": 0.23257675766944885
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04493918192174991,
+      "compression/movement_sparsity/importance_threshold": -0.00027066194081702905,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7031833526234568,
+      "compression/movement_sparsity/model_sparsity": 0.546044008455755,
+      "compression_loss": 12.196907043457031,
+      "distillation_loss": 0.6404677629470825,
+      "epoch": 1.53,
+      "learning_rate": 1.3869833729216154e-05,
+      "loss": 12.7082,
+      "step": 3230,
+      "task_loss": 0.3287440538406372
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04509237473552289,
+      "compression/movement_sparsity/importance_threshold": -0.0002624689048979522,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.707251851381361,
+      "compression/movement_sparsity/model_sparsity": 0.5492033258114275,
+      "compression_loss": 12.238248825073242,
+      "distillation_loss": 0.7356147766113281,
+      "epoch": 1.54,
+      "learning_rate": 1.385083135391924e-05,
+      "loss": 12.8188,
+      "step": 3240,
+      "task_loss": 0.4595518708229065
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04524244446156562,
+      "compression/movement_sparsity/importance_threshold": -0.0002544428975053588,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7103969883882867,
+      "compression/movement_sparsity/model_sparsity": 0.55164562370144,
+      "compression_loss": 12.278731346130371,
+      "distillation_loss": 0.5563812851905823,
+      "epoch": 1.54,
+      "learning_rate": 1.3831828978622328e-05,
+      "loss": 12.7541,
+      "step": 3250,
+      "task_loss": 0.2265515923500061
+    },
+    {
+      "epoch": 1.54,
+      "eval_accuracy": 0.8577981651376146,
+      "eval_loss": 12.835886001586914,
+      "eval_runtime": 24.5855,
+      "eval_samples_per_second": 35.468,
+      "eval_steps_per_second": 4.434,
+      "step": 3250
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.045389423263500435,
+      "compression/movement_sparsity/importance_threshold": -0.0002465821984690493,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7141112898976212,
+      "compression/movement_sparsity/model_sparsity": 0.5545298957440069,
+      "compression_loss": 12.318343162536621,
+      "distillation_loss": 0.7919510006904602,
+      "epoch": 1.55,
+      "learning_rate": 1.3812826603325418e-05,
+      "loss": 12.8273,
+      "step": 3260,
+      "task_loss": 0.3617420196533203
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.0455333433049497,
+      "compression/movement_sparsity/importance_threshold": -0.0002388850876188218,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7169433387910268,
+      "compression/movement_sparsity/model_sparsity": 0.556729070858333,
+      "compression_loss": 12.357135772705078,
+      "distillation_loss": 0.6720787286758423,
+      "epoch": 1.55,
+      "learning_rate": 1.3793824228028505e-05,
+      "loss": 12.7631,
+      "step": 3270,
+      "task_loss": 0.34426239132881165
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.045674236749535746,
+      "compression/movement_sparsity/importance_threshold": -0.00023134984478447723,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.71958969672162,
+      "compression/movement_sparsity/model_sparsity": 0.5587840510947658,
+      "compression_loss": 12.395111083984375,
+      "distillation_loss": 0.5282790064811707,
+      "epoch": 1.56,
+      "learning_rate": 1.3774821852731593e-05,
+      "loss": 12.8532,
+      "step": 3280,
+      "task_loss": 0.2643144726753235
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.045812135760880945,
+      "compression/movement_sparsity/importance_threshold": -0.0002239747497958136,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7223523129328515,
+      "compression/movement_sparsity/model_sparsity": 0.5609293095457484,
+      "compression_loss": 12.432284355163574,
+      "distillation_loss": 0.40331676602363586,
+      "epoch": 1.56,
+      "learning_rate": 1.375581947743468e-05,
+      "loss": 12.9658,
+      "step": 3290,
+      "task_loss": 0.1825210452079773
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.045947072502607635,
+      "compression/movement_sparsity/importance_threshold": -0.0002167580824826306,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.725434474744053,
+      "compression/movement_sparsity/model_sparsity": 0.5633227052139176,
+      "compression_loss": 12.468629837036133,
+      "distillation_loss": 0.4709409475326538,
+      "epoch": 1.57,
+      "learning_rate": 1.3736817102137769e-05,
+      "loss": 12.9347,
+      "step": 3300,
+      "task_loss": 0.2354767620563507
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04607907913833817,
+      "compression/movement_sparsity/importance_threshold": -0.00020969812267472836,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7282369768142126,
+      "compression/movement_sparsity/model_sparsity": 0.5654989362899041,
+      "compression_loss": 12.504148483276367,
+      "distillation_loss": 0.44413477182388306,
+      "epoch": 1.57,
+      "learning_rate": 1.3717814726840857e-05,
+      "loss": 12.9811,
+      "step": 3310,
+      "task_loss": 0.25892922282218933
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04620818783169489,
+      "compression/movement_sparsity/importance_threshold": -0.0002027931502019049,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.73153312961081,
+      "compression/movement_sparsity/model_sparsity": 0.5680585026943443,
+      "compression_loss": 12.538874626159668,
+      "distillation_loss": 0.2548993229866028,
+      "epoch": 1.58,
+      "learning_rate": 1.3698812351543945e-05,
+      "loss": 12.9981,
+      "step": 3320,
+      "task_loss": 0.06697467714548111
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04633443074630017,
+      "compression/movement_sparsity/importance_threshold": -0.0001960414448939603,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7338615886592894,
+      "compression/movement_sparsity/model_sparsity": 0.569866624441842,
+      "compression_loss": 12.572813034057617,
+      "distillation_loss": 1.0099992752075195,
+      "epoch": 1.58,
+      "learning_rate": 1.3679809976247031e-05,
+      "loss": 13.0474,
+      "step": 3330,
+      "task_loss": 0.6512800455093384
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04645784004577634,
+      "compression/movement_sparsity/importance_threshold": -0.00018944128658069433,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7365036533611864,
+      "compression/movement_sparsity/model_sparsity": 0.571918270851048,
+      "compression_loss": 12.606016159057617,
+      "distillation_loss": 0.48587775230407715,
+      "epoch": 1.59,
+      "learning_rate": 1.3660807600950119e-05,
+      "loss": 13.2127,
+      "step": 3340,
+      "task_loss": 0.2523800730705261
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04657844789374576,
+      "compression/movement_sparsity/importance_threshold": -0.0001829909550919058,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.739569489141072,
+      "compression/movement_sparsity/model_sparsity": 0.5742989888419817,
+      "compression_loss": 12.638442993164062,
+      "distillation_loss": 0.8291702270507812,
+      "epoch": 1.59,
+      "learning_rate": 1.3641805225653208e-05,
+      "loss": 13.1407,
+      "step": 3350,
+      "task_loss": 0.37619519233703613
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04669628645383078,
+      "compression/movement_sparsity/importance_threshold": -0.00017668873025739355,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7418803871198434,
+      "compression/movement_sparsity/model_sparsity": 0.5760934738660553,
+      "compression_loss": 12.670111656188965,
+      "distillation_loss": 0.5576849579811096,
+      "epoch": 1.6,
+      "learning_rate": 1.3622802850356296e-05,
+      "loss": 13.1378,
+      "step": 3360,
+      "task_loss": 0.2429836392402649
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04681138788965375,
+      "compression/movement_sparsity/importance_threshold": -0.00017053289190695773,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7444494081225534,
+      "compression/movement_sparsity/model_sparsity": 0.5780883995435379,
+      "compression_loss": 12.701035499572754,
+      "distillation_loss": 0.9135178327560425,
+      "epoch": 1.6,
+      "learning_rate": 1.3603800475059384e-05,
+      "loss": 13.1637,
+      "step": 3370,
+      "task_loss": 0.5078893899917603
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04692378436483701,
+      "compression/movement_sparsity/importance_threshold": -0.0001645217198703976,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7467615764265282,
+      "compression/movement_sparsity/model_sparsity": 0.5798838710151197,
+      "compression_loss": 12.731249809265137,
+      "distillation_loss": 0.5046910047531128,
+      "epoch": 1.61,
+      "learning_rate": 1.358479809976247e-05,
+      "loss": 13.1697,
+      "step": 3380,
+      "task_loss": 0.35627833008766174
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04703350804300293,
+      "compression/movement_sparsity/importance_threshold": -0.00015865349397751203,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7494276361600422,
+      "compression/movement_sparsity/model_sparsity": 0.5819541503217034,
+      "compression_loss": 12.760735511779785,
+      "distillation_loss": 0.8998797535896301,
+      "epoch": 1.61,
+      "learning_rate": 1.356579572446556e-05,
+      "loss": 13.1898,
+      "step": 3390,
+      "task_loss": 0.5463681221008301
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.047140591087773846,
+      "compression/movement_sparsity/importance_threshold": -0.0001529264940581007,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7521114686841313,
+      "compression/movement_sparsity/model_sparsity": 0.584038230759629,
+      "compression_loss": 12.789498329162598,
+      "distillation_loss": 0.8819484114646912,
+      "epoch": 1.62,
+      "learning_rate": 1.3546793349168648e-05,
+      "loss": 13.3173,
+      "step": 3400,
+      "task_loss": 0.5092288851737976
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04724506566277212,
+      "compression/movement_sparsity/importance_threshold": -0.0001473389999419629,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7540012185712135,
+      "compression/movement_sparsity/model_sparsity": 0.5855056810334038,
+      "compression_loss": 12.8175630569458,
+      "distillation_loss": 0.6930016875267029,
+      "epoch": 1.62,
+      "learning_rate": 1.3527790973871735e-05,
+      "loss": 13.311,
+      "step": 3410,
+      "task_loss": 0.25591135025024414
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.047346963931620085,
+      "compression/movement_sparsity/importance_threshold": -0.00014188929145889833,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7560711429162903,
+      "compression/movement_sparsity/model_sparsity": 0.5871130424454297,
+      "compression_loss": 12.844893455505371,
+      "distillation_loss": 0.5201950073242188,
+      "epoch": 1.62,
+      "learning_rate": 1.3508788598574822e-05,
+      "loss": 13.2855,
+      "step": 3420,
+      "task_loss": 0.2590838670730591
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04744631805794011,
+      "compression/movement_sparsity/importance_threshold": -0.0001365756484387054,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7579267822192111,
+      "compression/movement_sparsity/model_sparsity": 0.5885540047768538,
+      "compression_loss": 12.871529579162598,
+      "distillation_loss": 0.5567278861999512,
+      "epoch": 1.63,
+      "learning_rate": 1.348978622327791e-05,
+      "loss": 13.2248,
+      "step": 3430,
+      "task_loss": 0.2669626772403717
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04754316020535455,
+      "compression/movement_sparsity/importance_threshold": -0.00013139635071118426,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7606231180367359,
+      "compression/movement_sparsity/model_sparsity": 0.590647794415717,
+      "compression_loss": 12.897520065307617,
+      "distillation_loss": 0.45216840505599976,
+      "epoch": 1.63,
+      "learning_rate": 1.3470783847980999e-05,
+      "loss": 13.4032,
+      "step": 3440,
+      "task_loss": 0.242957204580307
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04763752253748573,
+      "compression/movement_sparsity/importance_threshold": -0.00012634967810613417,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7631808825466727,
+      "compression/movement_sparsity/model_sparsity": 0.5926339790722238,
+      "compression_loss": 12.922788619995117,
+      "distillation_loss": 0.6922011375427246,
+      "epoch": 1.64,
+      "learning_rate": 1.3451781472684087e-05,
+      "loss": 13.3809,
+      "step": 3450,
+      "task_loss": 0.4704532027244568
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04772943721795603,
+      "compression/movement_sparsity/importance_threshold": -0.00012143391045335399,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7656033927092744,
+      "compression/movement_sparsity/model_sparsity": 0.5945151344704236,
+      "compression_loss": 12.947429656982422,
+      "distillation_loss": 0.35389190912246704,
+      "epoch": 1.64,
+      "learning_rate": 1.3432779097387175e-05,
+      "loss": 13.5379,
+      "step": 3460,
+      "task_loss": 0.2909751236438751
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04781893641038778,
+      "compression/movement_sparsity/importance_threshold": -0.00011664732758264297,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7674263564250225,
+      "compression/movement_sparsity/model_sparsity": 0.5959307231798303,
+      "compression_loss": 12.971384048461914,
+      "distillation_loss": 0.40384596586227417,
+      "epoch": 1.65,
+      "learning_rate": 1.3413776722090261e-05,
+      "loss": 13.4134,
+      "step": 3470,
+      "task_loss": 0.23214919865131378
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04790605227840333,
+      "compression/movement_sparsity/importance_threshold": -0.00011198820932380171,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7692812899917194,
+      "compression/movement_sparsity/model_sparsity": 0.597371137484861,
+      "compression_loss": 12.994702339172363,
+      "distillation_loss": 0.8970961570739746,
+      "epoch": 1.65,
+      "learning_rate": 1.339477434679335e-05,
+      "loss": 13.5349,
+      "step": 3480,
+      "task_loss": 0.45820656418800354
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.047990816985625045,
+      "compression/movement_sparsity/importance_threshold": -0.00010745483550662818,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7712226997704005,
+      "compression/movement_sparsity/model_sparsity": 0.5988787032906373,
+      "compression_loss": 13.017374992370605,
+      "distillation_loss": 0.43547579646110535,
+      "epoch": 1.66,
+      "learning_rate": 1.3375771971496438e-05,
+      "loss": 13.4468,
+      "step": 3490,
+      "task_loss": 0.22628280520439148
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04807326269567527,
+      "compression/movement_sparsity/importance_threshold": -0.00010304548596092207,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.773500263474857,
+      "compression/movement_sparsity/model_sparsity": 0.600647303201393,
+      "compression_loss": 13.039436340332031,
+      "distillation_loss": 0.4127658009529114,
+      "epoch": 1.66,
+      "learning_rate": 1.3356769596199526e-05,
+      "loss": 13.6184,
+      "step": 3500,
+      "task_loss": 0.13857224583625793
+    },
+    {
+      "epoch": 1.66,
+      "eval_accuracy": 0.8428899082568807,
+      "eval_loss": 13.651920318603516,
+      "eval_runtime": 32.6189,
+      "eval_samples_per_second": 26.733,
+      "eval_steps_per_second": 3.342,
+      "step": 3500
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.048153421572176364,
+      "compression/movement_sparsity/importance_threshold": -9.87584405164831e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7749152881285757,
+      "compression/movement_sparsity/model_sparsity": 0.601746114387832,
+      "compression_loss": 13.060900688171387,
+      "distillation_loss": 0.3880394995212555,
+      "epoch": 1.67,
+      "learning_rate": 1.3337767220902612e-05,
+      "loss": 13.5323,
+      "step": 3510,
+      "task_loss": 0.09369392693042755
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04823132577875065,
+      "compression/movement_sparsity/importance_threshold": -9.459197900311099e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7763604947681422,
+      "compression/movement_sparsity/model_sparsity": 0.6028683628363647,
+      "compression_loss": 13.081767082214355,
+      "distillation_loss": 0.6340326070785522,
+      "epoch": 1.67,
+      "learning_rate": 1.3318764845605704e-05,
+      "loss": 13.5637,
+      "step": 3520,
+      "task_loss": 0.42461642622947693
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04830700747902051,
+      "compression/movement_sparsity/importance_threshold": -9.05443812506037e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7780264674608551,
+      "compression/movement_sparsity/model_sparsity": 0.6041620430745457,
+      "compression_loss": 13.102035522460938,
+      "distillation_loss": 0.25822052359580994,
+      "epoch": 1.68,
+      "learning_rate": 1.329976247030879e-05,
+      "loss": 13.6241,
+      "step": 3530,
+      "task_loss": 0.06908401101827621
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04838049883660828,
+      "compression/movement_sparsity/importance_threshold": -8.66139270887618e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7795130772922313,
+      "compression/movement_sparsity/model_sparsity": 0.6053164424048282,
+      "compression_loss": 13.121685981750488,
+      "distillation_loss": 0.9944248795509338,
+      "epoch": 1.68,
+      "learning_rate": 1.3280760095011878e-05,
+      "loss": 13.6799,
+      "step": 3540,
+      "task_loss": 0.43491220474243164
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04845183201513631,
+      "compression/movement_sparsity/importance_threshold": -8.279889634738372e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7810252935862692,
+      "compression/movement_sparsity/model_sparsity": 0.6064907259594202,
+      "compression_loss": 13.140732765197754,
+      "distillation_loss": 0.4648146629333496,
+      "epoch": 1.69,
+      "learning_rate": 1.3261757719714966e-05,
+      "loss": 13.6035,
+      "step": 3550,
+      "task_loss": 0.22887027263641357
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.048521039178226956,
+      "compression/movement_sparsity/importance_threshold": -7.909756885626958e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7832119702273411,
+      "compression/movement_sparsity/model_sparsity": 0.6081887492044712,
+      "compression_loss": 13.159188270568848,
+      "distillation_loss": 0.7744244933128357,
+      "epoch": 1.69,
+      "learning_rate": 1.3242755344418052e-05,
+      "loss": 13.6462,
+      "step": 3560,
+      "task_loss": 0.3696683943271637
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04858815248950257,
+      "compression/movement_sparsity/importance_threshold": -7.550822444521823e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7850979091388136,
+      "compression/movement_sparsity/model_sparsity": 0.6096532401357213,
+      "compression_loss": 13.177058219909668,
+      "distillation_loss": 0.3203733563423157,
+      "epoch": 1.7,
+      "learning_rate": 1.3223752969121141e-05,
+      "loss": 13.6384,
+      "step": 3570,
+      "task_loss": 0.19227594137191772
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.048653204112585495,
+      "compression/movement_sparsity/importance_threshold": -7.202914294402937e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7865801081187895,
+      "compression/movement_sparsity/model_sparsity": 0.6108042143010447,
+      "compression_loss": 13.194385528564453,
+      "distillation_loss": 0.5051361918449402,
+      "epoch": 1.7,
+      "learning_rate": 1.3204750593824229e-05,
+      "loss": 13.6339,
+      "step": 3580,
+      "task_loss": 0.25856292247772217
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.048716226211098085,
+      "compression/movement_sparsity/importance_threshold": -6.865860418250229e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.78752878227567,
+      "compression/movement_sparsity/model_sparsity": 0.6115408896466835,
+      "compression_loss": 13.211155891418457,
+      "distillation_loss": 0.5065594911575317,
+      "epoch": 1.71,
+      "learning_rate": 1.3185748218527317e-05,
+      "loss": 13.7948,
+      "step": 3590,
+      "task_loss": 0.31428200006484985
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.0487772509486627,
+      "compression/movement_sparsity/importance_threshold": -6.539488799043583e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7893345260275519,
+      "compression/movement_sparsity/model_sparsity": 0.6129431065120898,
+      "compression_loss": 13.227370262145996,
+      "distillation_loss": 0.44985029101371765,
+      "epoch": 1.71,
+      "learning_rate": 1.3166745843230405e-05,
+      "loss": 13.7212,
+      "step": 3600,
+      "task_loss": 0.16183635592460632
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04883631048890167,
+      "compression/movement_sparsity/importance_threshold": -6.223627419762968e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7903761574074074,
+      "compression/movement_sparsity/model_sparsity": 0.613751966067521,
+      "compression_loss": 13.2430419921875,
+      "distillation_loss": 0.8156948089599609,
+      "epoch": 1.71,
+      "learning_rate": 1.3147743467933494e-05,
+      "loss": 13.6668,
+      "step": 3610,
+      "task_loss": 0.43693339824676514
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04889343699543736,
+      "compression/movement_sparsity/importance_threshold": -5.918104263388357e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7916715480088828,
+      "compression/movement_sparsity/model_sparsity": 0.6147578776464948,
+      "compression_loss": 13.258223533630371,
+      "distillation_loss": 0.9145314693450928,
+      "epoch": 1.72,
+      "learning_rate": 1.312874109263658e-05,
+      "loss": 13.7678,
+      "step": 3620,
+      "task_loss": 0.4340613782405853
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.048948662631892126,
+      "compression/movement_sparsity/importance_threshold": -5.622747312899589e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7927108857460102,
+      "compression/movement_sparsity/model_sparsity": 0.6155649561161471,
+      "compression_loss": 13.272916793823242,
+      "distillation_loss": 0.8170522451400757,
+      "epoch": 1.72,
+      "learning_rate": 1.3109738717339668e-05,
+      "loss": 13.7102,
+      "step": 3630,
+      "task_loss": 0.38500523567199707
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.049002019561888314,
+      "compression/movement_sparsity/importance_threshold": -5.3373845512766794e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7941787935674496,
+      "compression/movement_sparsity/model_sparsity": 0.6167048327470029,
+      "compression_loss": 13.287137031555176,
+      "distillation_loss": 0.32002827525138855,
+      "epoch": 1.73,
+      "learning_rate": 1.3090736342042756e-05,
+      "loss": 13.8523,
+      "step": 3640,
+      "task_loss": 0.15141043066978455
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.049053539949048264,
+      "compression/movement_sparsity/importance_threshold": -5.061843961499555e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.795318228225685,
+      "compression/movement_sparsity/model_sparsity": 0.6175896396267948,
+      "compression_loss": 13.300834655761719,
+      "distillation_loss": 0.7697837352752686,
+      "epoch": 1.73,
+      "learning_rate": 1.3071733966745846e-05,
+      "loss": 13.8336,
+      "step": 3650,
+      "task_loss": 0.4357471466064453
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04910325595699434,
+      "compression/movement_sparsity/importance_threshold": -4.795953526548144e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7962184535907859,
+      "compression/movement_sparsity/model_sparsity": 0.6182886929605225,
+      "compression_loss": 13.313998222351074,
+      "distillation_loss": 0.6824323534965515,
+      "epoch": 1.74,
+      "learning_rate": 1.3052731591448932e-05,
+      "loss": 13.8227,
+      "step": 3660,
+      "task_loss": 0.34603098034858704
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.049151199749348895,
+      "compression/movement_sparsity/importance_threshold": -4.53954122940233e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7974448937631737,
+      "compression/movement_sparsity/model_sparsity": 0.619241062360855,
+      "compression_loss": 13.326610565185547,
+      "distillation_loss": 0.08792783319950104,
+      "epoch": 1.74,
+      "learning_rate": 1.303372921615202e-05,
+      "loss": 13.6874,
+      "step": 3670,
+      "task_loss": 0.2505362629890442
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04919740348973427,
+      "compression/movement_sparsity/importance_threshold": -4.2924350530420836e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7981858697493225,
+      "compression/movement_sparsity/model_sparsity": 0.6198164535388978,
+      "compression_loss": 13.338798522949219,
+      "distillation_loss": 1.1169078350067139,
+      "epoch": 1.75,
+      "learning_rate": 1.3014726840855108e-05,
+      "loss": 13.9032,
+      "step": 3680,
+      "task_loss": 0.44423192739486694
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04924189934177281,
+      "compression/movement_sparsity/importance_threshold": -4.054462980447376e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.7994761437631738,
+      "compression/movement_sparsity/model_sparsity": 0.620818391926519,
+      "compression_loss": 13.350573539733887,
+      "distillation_loss": 0.7253522872924805,
+      "epoch": 1.75,
+      "learning_rate": 1.2995724465558196e-05,
+      "loss": 13.7283,
+      "step": 3690,
+      "task_loss": 0.4123845100402832
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.049284719469086885,
+      "compression/movement_sparsity/importance_threshold": -3.8254529945980914e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8006924801452876,
+      "compression/movement_sparsity/model_sparsity": 0.6217629154156518,
+      "compression_loss": 13.361876487731934,
+      "distillation_loss": 0.7416278719902039,
+      "epoch": 1.76,
+      "learning_rate": 1.2976722090261285e-05,
+      "loss": 13.7298,
+      "step": 3700,
+      "task_loss": 0.4112977981567383
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04932589603529883,
+      "compression/movement_sparsity/importance_threshold": -3.605233078474157e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.801716456357272,
+      "compression/movement_sparsity/model_sparsity": 0.6225580651774731,
+      "compression_loss": 13.372750282287598,
+      "distillation_loss": 0.42862361669540405,
+      "epoch": 1.76,
+      "learning_rate": 1.2957719714964371e-05,
+      "loss": 13.8126,
+      "step": 3710,
+      "task_loss": 0.1545902043581009
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.049365461204031,
+      "compression/movement_sparsity/importance_threshold": -3.393631215055587e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8031718373607347,
+      "compression/movement_sparsity/model_sparsity": 0.6236882143398449,
+      "compression_loss": 13.383188247680664,
+      "distillation_loss": 0.1796613186597824,
+      "epoch": 1.77,
+      "learning_rate": 1.293871733966746e-05,
+      "loss": 13.9012,
+      "step": 3720,
+      "task_loss": 0.04331651329994202
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04940344713890575,
+      "compression/movement_sparsity/importance_threshold": -3.190475387322223e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8042020240514905,
+      "compression/movement_sparsity/model_sparsity": 0.6244881867339287,
+      "compression_loss": 13.39314079284668,
+      "distillation_loss": 0.6083955764770508,
+      "epoch": 1.77,
+      "learning_rate": 1.2919714964370547e-05,
+      "loss": 13.7336,
+      "step": 3730,
+      "task_loss": 0.5452806353569031
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04943988600354543,
+      "compression/movement_sparsity/importance_threshold": -2.995593578253991e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.805487898976212,
+      "compression/movement_sparsity/model_sparsity": 0.6254867090903641,
+      "compression_loss": 13.402615547180176,
+      "distillation_loss": 0.35754120349884033,
+      "epoch": 1.78,
+      "learning_rate": 1.2900712589073637e-05,
+      "loss": 13.7667,
+      "step": 3740,
+      "task_loss": 0.12456899136304855
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.049474809961572386,
+      "compression/movement_sparsity/importance_threshold": -2.8088137708309063e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8067680691997893,
+      "compression/movement_sparsity/model_sparsity": 0.6264808015667855,
+      "compression_loss": 13.411738395690918,
+      "distillation_loss": 0.689171552658081,
+      "epoch": 1.78,
+      "learning_rate": 1.2881710213776723e-05,
+      "loss": 13.9171,
+      "step": 3750,
+      "task_loss": 0.5145508646965027
+    },
+    {
+      "epoch": 1.78,
+      "eval_accuracy": 0.8474770642201835,
+      "eval_loss": 14.073362350463867,
+      "eval_runtime": 23.1584,
+      "eval_samples_per_second": 37.654,
+      "eval_steps_per_second": 4.707,
+      "step": 3750
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04950825117660897,
+      "compression/movement_sparsity/importance_threshold": -2.629963948032896e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8076983001166818,
+      "compression/movement_sparsity/model_sparsity": 0.6272031551560086,
+      "compression_loss": 13.420463562011719,
+      "distillation_loss": 0.3995903730392456,
+      "epoch": 1.79,
+      "learning_rate": 1.286270783847981e-05,
+      "loss": 13.7818,
+      "step": 3760,
+      "task_loss": 0.16869285702705383
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.049540241812277536,
+      "compression/movement_sparsity/importance_threshold": -2.458872092839801e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8085055094474556,
+      "compression/movement_sparsity/model_sparsity": 0.6278299786110779,
+      "compression_loss": 13.42878532409668,
+      "distillation_loss": 0.34937620162963867,
+      "epoch": 1.79,
+      "learning_rate": 1.2843705463182899e-05,
+      "loss": 13.9115,
+      "step": 3770,
+      "task_loss": 0.16115997731685638
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04957081403220043,
+      "compression/movement_sparsity/importance_threshold": -2.2953661882316786e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.809237898976212,
+      "compression/movement_sparsity/model_sparsity": 0.628398702134667,
+      "compression_loss": 13.43675708770752,
+      "distillation_loss": 0.7227451801300049,
+      "epoch": 1.8,
+      "learning_rate": 1.2824703087885986e-05,
+      "loss": 13.9422,
+      "step": 3780,
+      "task_loss": 0.4984316825866699
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.049600000000000005,
+      "compression/movement_sparsity/importance_threshold": -2.1392742171883698e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8102044517841012,
+      "compression/movement_sparsity/model_sparsity": 0.6291492608156068,
+      "compression_loss": 13.444360733032227,
+      "distillation_loss": 0.44791698455810547,
+      "epoch": 1.8,
+      "learning_rate": 1.2805700712589076e-05,
+      "loss": 13.9218,
+      "step": 3790,
+      "task_loss": 0.32737138867378235
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04962783187929862,
+      "compression/movement_sparsity/importance_threshold": -1.990424162689802e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8111833902627221,
+      "compression/movement_sparsity/model_sparsity": 0.629909437359752,
+      "compression_loss": 13.45160961151123,
+      "distillation_loss": 0.26000645756721497,
+      "epoch": 1.81,
+      "learning_rate": 1.2786698337292162e-05,
+      "loss": 13.9993,
+      "step": 3800,
+      "task_loss": 0.18172568082809448
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.049654341833718606,
+      "compression/movement_sparsity/importance_threshold": -1.8486440077159893e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8120328261442337,
+      "compression/movement_sparsity/model_sparsity": 0.6305690510606968,
+      "compression_loss": 13.458487510681152,
+      "distillation_loss": 0.5715627670288086,
+      "epoch": 1.81,
+      "learning_rate": 1.276769596199525e-05,
+      "loss": 13.9063,
+      "step": 3810,
+      "task_loss": 0.3135397434234619
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04967956202688233,
+      "compression/movement_sparsity/importance_threshold": -1.713761735246816e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8128433641975309,
+      "compression/movement_sparsity/model_sparsity": 0.6311984593735886,
+      "compression_loss": 13.46501636505127,
+      "distillation_loss": 0.36794915795326233,
+      "epoch": 1.81,
+      "learning_rate": 1.2748693586698338e-05,
+      "loss": 13.9184,
+      "step": 3820,
+      "task_loss": 0.3162195384502411
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04970352462241214,
+      "compression/movement_sparsity/importance_threshold": -1.5856053282622528e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8135937852868113,
+      "compression/movement_sparsity/model_sparsity": 0.6317811849715307,
+      "compression_loss": 13.471232414245605,
+      "distillation_loss": 0.7687112092971802,
+      "epoch": 1.82,
+      "learning_rate": 1.2729691211401427e-05,
+      "loss": 13.9925,
+      "step": 3830,
+      "task_loss": 0.4176146686077118
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04972626178393039,
+      "compression/movement_sparsity/importance_threshold": -1.4640027697421405e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8145009386291779,
+      "compression/movement_sparsity/model_sparsity": 0.6324856180976874,
+      "compression_loss": 13.477070808410645,
+      "distillation_loss": 0.18658028542995453,
+      "epoch": 1.82,
+      "learning_rate": 1.2710688836104515e-05,
+      "loss": 13.8647,
+      "step": 3840,
+      "task_loss": 0.1433614045381546
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.049747805675059424,
+      "compression/movement_sparsity/importance_threshold": -1.3487820426664934e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8151889255871725,
+      "compression/movement_sparsity/model_sparsity": 0.6330198616273545,
+      "compression_loss": 13.482598304748535,
+      "distillation_loss": 0.5786043405532837,
+      "epoch": 1.83,
+      "learning_rate": 1.2691686460807601e-05,
+      "loss": 13.9974,
+      "step": 3850,
+      "task_loss": 0.24868422746658325
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.049768188459421596,
+      "compression/movement_sparsity/importance_threshold": -1.2397711300152388e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8155505330660946,
+      "compression/movement_sparsity/model_sparsity": 0.6333006612175915,
+      "compression_loss": 13.487833023071289,
+      "distillation_loss": 0.2891767621040344,
+      "epoch": 1.83,
+      "learning_rate": 1.267268408551069e-05,
+      "loss": 13.8999,
+      "step": 3860,
+      "task_loss": 0.17670656740665436
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04978744230063925,
+      "compression/movement_sparsity/importance_threshold": -1.1367980147683476e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8161176156466425,
+      "compression/movement_sparsity/model_sparsity": 0.6337410186922852,
+      "compression_loss": 13.4927339553833,
+      "distillation_loss": 0.9970003366470337,
+      "epoch": 1.84,
+      "learning_rate": 1.2653681710213779e-05,
+      "loss": 14.0161,
+      "step": 3870,
+      "task_loss": 0.44373542070388794
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04980559936233475,
+      "compression/movement_sparsity/importance_threshold": -1.0396906799056173e-05,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8173062989310449,
+      "compression/movement_sparsity/model_sparsity": 0.6346640686805675,
+      "compression_loss": 13.497337341308594,
+      "distillation_loss": 0.15605174005031586,
+      "epoch": 1.84,
+      "learning_rate": 1.2634679334916867e-05,
+      "loss": 13.7718,
+      "step": 3880,
+      "task_loss": 0.025827720761299133
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04982269180813043,
+      "compression/movement_sparsity/importance_threshold": -9.482771084071487e-06,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8178974588791027,
+      "compression/movement_sparsity/model_sparsity": 0.6351231229890514,
+      "compression_loss": 13.501633644104004,
+      "distillation_loss": 0.5499266982078552,
+      "epoch": 1.85,
+      "learning_rate": 1.2615676959619953e-05,
+      "loss": 13.9453,
+      "step": 3890,
+      "task_loss": 0.355935275554657
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04983875180164866,
+      "compression/movement_sparsity/importance_threshold": -8.623852832527392e-06,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8187233819820837,
+      "compression/movement_sparsity/model_sparsity": 0.6357644782773206,
+      "compression_loss": 13.50567626953125,
+      "distillation_loss": 0.44997888803482056,
+      "epoch": 1.85,
+      "learning_rate": 1.259667458432304e-05,
+      "loss": 13.9764,
+      "step": 3900,
+      "task_loss": 0.2622376084327698
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.049853811506511775,
+      "compression/movement_sparsity/importance_threshold": -7.81843187422403e-06,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8195478700880758,
+      "compression/movement_sparsity/model_sparsity": 0.6364047192452565,
+      "compression_loss": 13.509462356567383,
+      "distillation_loss": 0.7038769721984863,
+      "epoch": 1.86,
+      "learning_rate": 1.2577672209026129e-05,
+      "loss": 14.057,
+      "step": 3910,
+      "task_loss": 0.33292850852012634
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04986790308634213,
+      "compression/movement_sparsity/importance_threshold": -7.064788038961111e-06,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8199815449977417,
+      "compression/movement_sparsity/model_sparsity": 0.6367414814640393,
+      "compression_loss": 13.513002395629883,
+      "distillation_loss": 0.3087387681007385,
+      "epoch": 1.86,
+      "learning_rate": 1.2558669833729218e-05,
+      "loss": 13.9792,
+      "step": 3920,
+      "task_loss": 0.10697836428880692
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04988105870476209,
+      "compression/movement_sparsity/importance_threshold": -6.361201156536607e-06,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8203553617133393,
+      "compression/movement_sparsity/model_sparsity": 0.6370317619108833,
+      "compression_loss": 13.516273498535156,
+      "distillation_loss": 0.9965323805809021,
+      "epoch": 1.87,
+      "learning_rate": 1.2539667458432306e-05,
+      "loss": 13.9582,
+      "step": 3930,
+      "task_loss": 0.5111120343208313
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.049893310525393975,
+      "compression/movement_sparsity/importance_threshold": -5.705951056751094e-06,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8209573899992472,
+      "compression/movement_sparsity/model_sparsity": 0.6374992558258266,
+      "compression_loss": 13.519289016723633,
+      "distillation_loss": 0.6441047787666321,
+      "epoch": 1.87,
+      "learning_rate": 1.2520665083135392e-05,
+      "loss": 13.9644,
+      "step": 3940,
+      "task_loss": 0.35741135478019714
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04990469071186017,
+      "compression/movement_sparsity/importance_threshold": -5.09731756940298e-06,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8216001980766335,
+      "compression/movement_sparsity/model_sparsity": 0.6379984165325387,
+      "compression_loss": 13.522068977355957,
+      "distillation_loss": 0.3542700409889221,
+      "epoch": 1.88,
+      "learning_rate": 1.250166270783848e-05,
+      "loss": 13.9309,
+      "step": 3950,
+      "task_loss": 0.16567806899547577
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.049915231427783,
+      "compression/movement_sparsity/importance_threshold": -4.533580524292407e-06,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8219789667080699,
+      "compression/movement_sparsity/model_sparsity": 0.6382925422979101,
+      "compression_loss": 13.524628639221191,
+      "distillation_loss": 0.6030027866363525,
+      "epoch": 1.88,
+      "learning_rate": 1.248266033254157e-05,
+      "loss": 14.0541,
+      "step": 3960,
+      "task_loss": 0.4497295618057251
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04992496483678483,
+      "compression/movement_sparsity/importance_threshold": -4.013019751218216e-06,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8225840296597411,
+      "compression/movement_sparsity/model_sparsity": 0.6387623927263453,
+      "compression_loss": 13.52697467803955,
+      "distillation_loss": 0.28092095255851746,
+      "epoch": 1.89,
+      "learning_rate": 1.2463657957244657e-05,
+      "loss": 13.9522,
+      "step": 3970,
+      "task_loss": 0.09975674748420715
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04993392310248801,
+      "compression/movement_sparsity/importance_threshold": -3.533915079980115e-06,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.823128846262421,
+      "compression/movement_sparsity/model_sparsity": 0.6391854599683252,
+      "compression_loss": 13.529121398925781,
+      "distillation_loss": 0.1360899955034256,
+      "epoch": 1.89,
+      "learning_rate": 1.2444655581947744e-05,
+      "loss": 14.0078,
+      "step": 3980,
+      "task_loss": 0.031911686062812805
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04994213838851488,
+      "compression/movement_sparsity/importance_threshold": -3.094546340377379e-06,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8237104552469136,
+      "compression/movement_sparsity/model_sparsity": 0.639637097652951,
+      "compression_loss": 13.53105640411377,
+      "distillation_loss": 0.8466547727584839,
+      "epoch": 1.9,
+      "learning_rate": 1.2425653206650832e-05,
+      "loss": 14.0804,
+      "step": 3990,
+      "task_loss": 0.4057835340499878
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.0499496428584878,
+      "compression/movement_sparsity/importance_threshold": -2.6931933622088497e-06,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8239907736750979,
+      "compression/movement_sparsity/model_sparsity": 0.6398547737364343,
+      "compression_loss": 13.532774925231934,
+      "distillation_loss": 0.13539861142635345,
+      "epoch": 1.9,
+      "learning_rate": 1.2406650831353921e-05,
+      "loss": 13.9601,
+      "step": 4000,
+      "task_loss": 0.03517580032348633
+    },
+    {
+      "epoch": 1.9,
+      "eval_accuracy": 0.8577981651376146,
+      "eval_loss": 14.102431297302246,
+      "eval_runtime": 23.3122,
+      "eval_samples_per_second": 37.405,
+      "eval_steps_per_second": 4.676,
+      "step": 4000
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04995646867602912,
+      "compression/movement_sparsity/importance_threshold": -2.3281359752746686e-06,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8247015323885878,
+      "compression/movement_sparsity/model_sparsity": 0.6404067002510637,
+      "compression_loss": 13.5343017578125,
+      "distillation_loss": 0.37791934609413147,
+      "epoch": 1.9,
+      "learning_rate": 1.2387648456057009e-05,
+      "loss": 14.0972,
+      "step": 4010,
+      "task_loss": 0.31983357667922974
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04996264800476118,
+      "compression/movement_sparsity/importance_threshold": -1.997654009373677e-06,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8250938629177959,
+      "compression/movement_sparsity/model_sparsity": 0.6407113572569627,
+      "compression_loss": 13.535682678222656,
+      "distillation_loss": 0.20223000645637512,
+      "epoch": 1.91,
+      "learning_rate": 1.2368646080760097e-05,
+      "loss": 13.8255,
+      "step": 4020,
+      "task_loss": 0.0476217158138752
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04996821300830635,
+      "compression/movement_sparsity/importance_threshold": -1.7000272943051495e-06,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8256924801452875,
+      "compression/movement_sparsity/model_sparsity": 0.6411762023776709,
+      "compression_loss": 13.536922454833984,
+      "distillation_loss": 0.5841785669326782,
+      "epoch": 1.91,
+      "learning_rate": 1.2349643705463183e-05,
+      "loss": 14.0384,
+      "step": 4030,
+      "task_loss": 0.35522913932800293
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04997319585028697,
+      "compression/movement_sparsity/importance_threshold": -1.4335356598687947e-06,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8266022917607648,
+      "compression/movement_sparsity/model_sparsity": 0.6418826997365765,
+      "compression_loss": 13.537969589233398,
+      "distillation_loss": 0.47049567103385925,
+      "epoch": 1.92,
+      "learning_rate": 1.233064133016627e-05,
+      "loss": 13.9148,
+      "step": 4040,
+      "task_loss": 0.39884519577026367
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04997762869432539,
+      "compression/movement_sparsity/importance_threshold": -1.1964589358634536e-06,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8273503133468835,
+      "compression/movement_sparsity/model_sparsity": 0.6424635620447807,
+      "compression_loss": 13.538784980773926,
+      "distillation_loss": 0.41415804624557495,
+      "epoch": 1.92,
+      "learning_rate": 1.231163895486936e-05,
+      "loss": 13.8998,
+      "step": 4050,
+      "task_loss": 0.3223814368247986
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.049981543704043965,
+      "compression/movement_sparsity/importance_threshold": -9.870769520888348e-07,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8279402735433604,
+      "compression/movement_sparsity/model_sparsity": 0.6429216847083958,
+      "compression_loss": 13.53950023651123,
+      "distillation_loss": 0.19833412766456604,
+      "epoch": 1.93,
+      "learning_rate": 1.2292636579572448e-05,
+      "loss": 14.0509,
+      "step": 4060,
+      "task_loss": 0.10071046650409698
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04998497304306504,
+      "compression/movement_sparsity/importance_threshold": -8.036695383442129e-07,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8284314541930141,
+      "compression/movement_sparsity/model_sparsity": 0.6433031019444725,
+      "compression_loss": 13.540120124816895,
+      "distillation_loss": 0.48713254928588867,
+      "epoch": 1.93,
+      "learning_rate": 1.2273634204275534e-05,
+      "loss": 14.023,
+      "step": 4070,
+      "task_loss": 0.2855875492095947
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04998794887501098,
+      "compression/movement_sparsity/importance_threshold": -6.44516524428429e-07,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8293861388700693,
+      "compression/movement_sparsity/model_sparsity": 0.6440444446482291,
+      "compression_loss": 13.540621757507324,
+      "distillation_loss": 0.48493221402168274,
+      "epoch": 1.94,
+      "learning_rate": 1.2254631828978622e-05,
+      "loss": 14.0355,
+      "step": 4080,
+      "task_loss": 0.24878904223442078
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04999050336350411,
+      "compression/movement_sparsity/importance_threshold": -5.078977401416253e-07,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8297654603282144,
+      "compression/movement_sparsity/model_sparsity": 0.644338999700942,
+      "compression_loss": 13.540968894958496,
+      "distillation_loss": 0.4940585196018219,
+      "epoch": 1.94,
+      "learning_rate": 1.2235629453681712e-05,
+      "loss": 13.8999,
+      "step": 4090,
+      "task_loss": 0.3463954031467438
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.0499926686721668,
+      "compression/movement_sparsity/importance_threshold": -3.920930152830765e-07,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8303199808039747,
+      "compression/movement_sparsity/model_sparsity": 0.644769602305832,
+      "compression_loss": 13.541226387023926,
+      "distillation_loss": 0.3753039836883545,
+      "epoch": 1.95,
+      "learning_rate": 1.22166270783848e-05,
+      "loss": 13.9771,
+      "step": 4100,
+      "task_loss": 0.5592837333679199
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.0499944769646214,
+      "compression/movement_sparsity/importance_threshold": -2.953821796516237e-07,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8311270254629629,
+      "compression/movement_sparsity/model_sparsity": 0.6453962978880762,
+      "compression_loss": 13.541411399841309,
+      "distillation_loss": 0.4341853857040405,
+      "epoch": 1.95,
+      "learning_rate": 1.2197624703087888e-05,
+      "loss": 14.01,
+      "step": 4110,
+      "task_loss": 0.2506285309791565
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04999596040449025,
+      "compression/movement_sparsity/importance_threshold": -2.160450630469754e-07,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8317775143029208,
+      "compression/movement_sparsity/model_sparsity": 0.6459014229487039,
+      "compression_loss": 13.541484832763672,
+      "distillation_loss": 0.3095904588699341,
+      "epoch": 1.96,
+      "learning_rate": 1.2178622327790974e-05,
+      "loss": 13.9163,
+      "step": 4120,
+      "task_loss": 0.12216134369373322
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.049997151155395714,
+      "compression/movement_sparsity/importance_threshold": -1.5236149526797263e-07,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8322562739950317,
+      "compression/movement_sparsity/model_sparsity": 0.6462731949202557,
+      "compression_loss": 13.541465759277344,
+      "distillation_loss": 0.4424591064453125,
+      "epoch": 1.96,
+      "learning_rate": 1.2159619952494062e-05,
+      "loss": 13.915,
+      "step": 4130,
+      "task_loss": 0.1971740871667862
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04999808138096014,
+      "compression/movement_sparsity/importance_threshold": -1.0261130611475752e-07,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8326778572907256,
+      "compression/movement_sparsity/model_sparsity": 0.6466005676201638,
+      "compression_loss": 13.541352272033691,
+      "distillation_loss": 0.8569298982620239,
+      "epoch": 1.97,
+      "learning_rate": 1.2140617577197151e-05,
+      "loss": 13.9558,
+      "step": 4140,
+      "task_loss": 0.45095348358154297
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04999878324480587,
+      "compression/movement_sparsity/importance_threshold": -6.507432538617117e-08,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8331807531616983,
+      "compression/movement_sparsity/model_sparsity": 0.6469910820943721,
+      "compression_loss": 13.541196823120117,
+      "distillation_loss": 0.5633354187011719,
+      "epoch": 1.97,
+      "learning_rate": 1.2121615201900239e-05,
+      "loss": 13.8935,
+      "step": 4150,
+      "task_loss": 0.26475971937179565
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04999928891055526,
+      "compression/movement_sparsity/importance_threshold": -3.803038288148833e-08,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8333070799457994,
+      "compression/movement_sparsity/model_sparsity": 0.6470891788188018,
+      "compression_loss": 13.540977478027344,
+      "distillation_loss": 0.5583696365356445,
+      "epoch": 1.98,
+      "learning_rate": 1.2102612826603327e-05,
+      "loss": 13.9072,
+      "step": 4160,
+      "task_loss": 0.3018655776977539
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04999963054183066,
+      "compression/movement_sparsity/importance_threshold": -1.975930839998377e-08,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.834074744523487,
+      "compression/movement_sparsity/model_sparsity": 0.6476852945282907,
+      "compression_loss": 13.540740013122559,
+      "distillation_loss": 0.480516254901886,
+      "epoch": 1.98,
+      "learning_rate": 1.2083610451306413e-05,
+      "loss": 14.0023,
+      "step": 4170,
+      "task_loss": 0.21734555065631866
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04999984030225443,
+      "compression/movement_sparsity/importance_threshold": -8.540931741365942e-09,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8346077047576032,
+      "compression/movement_sparsity/model_sparsity": 0.6480991549268605,
+      "compression_loss": 13.540478706359863,
+      "distillation_loss": 0.5318132638931274,
+      "epoch": 1.99,
+      "learning_rate": 1.2064608076009503e-05,
+      "loss": 13.8838,
+      "step": 4180,
+      "task_loss": 0.33174723386764526
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.04999995035544891,
+      "compression/movement_sparsity/importance_threshold": -2.655082704475925e-09,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8350744198848239,
+      "compression/movement_sparsity/model_sparsity": 0.6484615739146306,
+      "compression_loss": 13.54019546508789,
+      "distillation_loss": 0.5515092611312866,
+      "epoch": 1.99,
+      "learning_rate": 1.204560570071259e-05,
+      "loss": 13.9953,
+      "step": 4190,
+      "task_loss": 0.2831692099571228
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.049999992865036445,
+      "compression/movement_sparsity/importance_threshold": -3.815910894558461e-10,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.8356860344211081,
+      "compression/movement_sparsity/model_sparsity": 0.6489365118547517,
+      "compression_loss": 13.539885520935059,
+      "distillation_loss": 0.5041601061820984,
+      "epoch": 2.0,
+      "learning_rate": 1.2026603325415678e-05,
+      "loss": 13.9405,
+      "step": 4200,
+      "task_loss": 0.24462735652923584
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.049999999994639395,
+      "compression/movement_sparsity/importance_threshold": -2.8669514678947294e-13,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.836323984680819,
+      "compression/movement_sparsity/model_sparsity": 0.6494319003131221,
+      "compression_loss": 13.539533615112305,
+      "distillation_loss": 0.37183666229248047,
+      "epoch": 2.0,
+      "learning_rate": 1.2007600950118764e-05,
+      "loss": 13.9897,
+      "step": 4210,
+      "task_loss": 0.1697111278772354
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.6137279272079468,
+      "epoch": 2.0,
+      "learning_rate": 1.1988598574821854e-05,
+      "loss": 0.7213,
+      "step": 4220,
+      "task_loss": 0.35716748237609863
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.24926382303237915,
+      "epoch": 2.01,
+      "learning_rate": 1.1969596199524942e-05,
+      "loss": 0.2347,
+      "step": 4230,
+      "task_loss": 0.09346465766429901
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.11653564870357513,
+      "epoch": 2.01,
+      "learning_rate": 1.195059382422803e-05,
+      "loss": 0.1942,
+      "step": 4240,
+      "task_loss": 0.07350118458271027
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.4009395241737366,
+      "epoch": 2.02,
+      "learning_rate": 1.1931591448931118e-05,
+      "loss": 0.2701,
+      "step": 4250,
+      "task_loss": 0.47706255316734314
+    },
+    {
+      "epoch": 2.02,
+      "eval_accuracy": 0.9048165137614679,
+      "eval_loss": 0.33537691831588745,
+      "eval_runtime": 22.0839,
+      "eval_samples_per_second": 39.486,
+      "eval_steps_per_second": 4.936,
+      "step": 4250
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.260000079870224,
+      "epoch": 2.02,
+      "learning_rate": 1.1912589073634204e-05,
+      "loss": 0.2824,
+      "step": 4260,
+      "task_loss": 0.33928802609443665
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.5958622097969055,
+      "epoch": 2.03,
+      "learning_rate": 1.1893586698337293e-05,
+      "loss": 0.2131,
+      "step": 4270,
+      "task_loss": 0.2858772873878479
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.28239572048187256,
+      "epoch": 2.03,
+      "learning_rate": 1.1874584323040381e-05,
+      "loss": 0.3669,
+      "step": 4280,
+      "task_loss": 0.3893040120601654
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.09593847393989563,
+      "epoch": 2.04,
+      "learning_rate": 1.1855581947743469e-05,
+      "loss": 0.2448,
+      "step": 4290,
+      "task_loss": 0.20446573197841644
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.017060134559869766,
+      "epoch": 2.04,
+      "learning_rate": 1.1836579572446555e-05,
+      "loss": 0.1788,
+      "step": 4300,
+      "task_loss": 0.0036827102303504944
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.1649906486272812,
+      "epoch": 2.05,
+      "learning_rate": 1.1817577197149645e-05,
+      "loss": 0.1344,
+      "step": 4310,
+      "task_loss": 0.07797713577747345
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.36799219250679016,
+      "epoch": 2.05,
+      "learning_rate": 1.1798574821852733e-05,
+      "loss": 0.1798,
+      "step": 4320,
+      "task_loss": 0.19989198446273804
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.2834874391555786,
+      "epoch": 2.06,
+      "learning_rate": 1.177957244655582e-05,
+      "loss": 0.0852,
+      "step": 4330,
+      "task_loss": 0.11706365644931793
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.4932941794395447,
+      "epoch": 2.06,
+      "learning_rate": 1.1760570071258908e-05,
+      "loss": 0.3161,
+      "step": 4340,
+      "task_loss": 0.25535914301872253
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.4178961515426636,
+      "epoch": 2.07,
+      "learning_rate": 1.1741567695961998e-05,
+      "loss": 0.2081,
+      "step": 4350,
+      "task_loss": 0.10093335807323456
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.3657403588294983,
+      "epoch": 2.07,
+      "learning_rate": 1.1722565320665084e-05,
+      "loss": 0.2158,
+      "step": 4360,
+      "task_loss": 0.25138112902641296
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02273857593536377,
+      "epoch": 2.08,
+      "learning_rate": 1.1703562945368172e-05,
+      "loss": 0.1983,
+      "step": 4370,
+      "task_loss": 0.006311226636171341
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.17630484700202942,
+      "epoch": 2.08,
+      "learning_rate": 1.168456057007126e-05,
+      "loss": 0.2566,
+      "step": 4380,
+      "task_loss": 0.505398154258728
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.031373463571071625,
+      "epoch": 2.09,
+      "learning_rate": 1.1665558194774346e-05,
+      "loss": 0.2201,
+      "step": 4390,
+      "task_loss": 0.002458591014146805
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.33634716272354126,
+      "epoch": 2.09,
+      "learning_rate": 1.1646555819477436e-05,
+      "loss": 0.1559,
+      "step": 4400,
+      "task_loss": 0.2244289070367813
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.2994489073753357,
+      "epoch": 2.1,
+      "learning_rate": 1.1627553444180523e-05,
+      "loss": 0.248,
+      "step": 4410,
+      "task_loss": 0.2632516026496887
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.060996197164058685,
+      "epoch": 2.1,
+      "learning_rate": 1.1608551068883611e-05,
+      "loss": 0.1764,
+      "step": 4420,
+      "task_loss": 0.022328753024339676
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.0799054354429245,
+      "epoch": 2.1,
+      "learning_rate": 1.15895486935867e-05,
+      "loss": 0.2472,
+      "step": 4430,
+      "task_loss": 0.028038904070854187
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.4037141799926758,
+      "epoch": 2.11,
+      "learning_rate": 1.1570546318289789e-05,
+      "loss": 0.2217,
+      "step": 4440,
+      "task_loss": 0.40060561895370483
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.29084426164627075,
+      "epoch": 2.11,
+      "learning_rate": 1.1551543942992875e-05,
+      "loss": 0.2478,
+      "step": 4450,
+      "task_loss": 0.22699187695980072
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.022717095911502838,
+      "epoch": 2.12,
+      "learning_rate": 1.1532541567695963e-05,
+      "loss": 0.1458,
+      "step": 4460,
+      "task_loss": 0.007930740714073181
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.05607360601425171,
+      "epoch": 2.12,
+      "learning_rate": 1.151353919239905e-05,
+      "loss": 0.2155,
+      "step": 4470,
+      "task_loss": 0.23636063933372498
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.0458001047372818,
+      "epoch": 2.13,
+      "learning_rate": 1.1494536817102138e-05,
+      "loss": 0.2193,
+      "step": 4480,
+      "task_loss": 0.03247682377696037
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.27424195408821106,
+      "epoch": 2.13,
+      "learning_rate": 1.1475534441805228e-05,
+      "loss": 0.2339,
+      "step": 4490,
+      "task_loss": 0.16137200593948364
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.33258911967277527,
+      "epoch": 2.14,
+      "learning_rate": 1.1456532066508314e-05,
+      "loss": 0.2689,
+      "step": 4500,
+      "task_loss": 0.34519919753074646
+    },
+    {
+      "epoch": 2.14,
+      "eval_accuracy": 0.9048165137614679,
+      "eval_loss": 0.3319544792175293,
+      "eval_runtime": 22.0326,
+      "eval_samples_per_second": 39.578,
+      "eval_steps_per_second": 4.947,
+      "step": 4500
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.06233971193432808,
+      "epoch": 2.14,
+      "learning_rate": 1.1437529691211402e-05,
+      "loss": 0.2959,
+      "step": 4510,
+      "task_loss": 0.008978258818387985
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.19504857063293457,
+      "epoch": 2.15,
+      "learning_rate": 1.141852731591449e-05,
+      "loss": 0.16,
+      "step": 4520,
+      "task_loss": 0.12643758952617645
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02659631334245205,
+      "epoch": 2.15,
+      "learning_rate": 1.139952494061758e-05,
+      "loss": 0.2226,
+      "step": 4530,
+      "task_loss": 0.003300584852695465
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.05742825195193291,
+      "epoch": 2.16,
+      "learning_rate": 1.1380522565320666e-05,
+      "loss": 0.1707,
+      "step": 4540,
+      "task_loss": 0.27017584443092346
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.197635218501091,
+      "epoch": 2.16,
+      "learning_rate": 1.1361520190023754e-05,
+      "loss": 0.2152,
+      "step": 4550,
+      "task_loss": 0.17246709764003754
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.13968348503112793,
+      "epoch": 2.17,
+      "learning_rate": 1.1342517814726841e-05,
+      "loss": 0.2187,
+      "step": 4560,
+      "task_loss": 0.3724798858165741
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.2040019929409027,
+      "epoch": 2.17,
+      "learning_rate": 1.1323515439429931e-05,
+      "loss": 0.1535,
+      "step": 4570,
+      "task_loss": 0.049664292484521866
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.570436954498291,
+      "epoch": 2.18,
+      "learning_rate": 1.1304513064133019e-05,
+      "loss": 0.1904,
+      "step": 4580,
+      "task_loss": 0.3329172134399414
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.09249302744865417,
+      "epoch": 2.18,
+      "learning_rate": 1.1285510688836105e-05,
+      "loss": 0.1101,
+      "step": 4590,
+      "task_loss": 0.21237826347351074
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.03505774587392807,
+      "epoch": 2.19,
+      "learning_rate": 1.1266508313539193e-05,
+      "loss": 0.1319,
+      "step": 4600,
+      "task_loss": 0.4041575789451599
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.3652401864528656,
+      "epoch": 2.19,
+      "learning_rate": 1.124750593824228e-05,
+      "loss": 0.1775,
+      "step": 4610,
+      "task_loss": 0.18814264237880707
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.16492074728012085,
+      "epoch": 2.19,
+      "learning_rate": 1.122850356294537e-05,
+      "loss": 0.2287,
+      "step": 4620,
+      "task_loss": 0.16214123368263245
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.17321833968162537,
+      "epoch": 2.2,
+      "learning_rate": 1.1209501187648456e-05,
+      "loss": 0.1745,
+      "step": 4630,
+      "task_loss": 0.1337524801492691
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.04696015268564224,
+      "epoch": 2.2,
+      "learning_rate": 1.1190498812351544e-05,
+      "loss": 0.1773,
+      "step": 4640,
+      "task_loss": 0.011036917567253113
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.3878232538700104,
+      "epoch": 2.21,
+      "learning_rate": 1.1171496437054632e-05,
+      "loss": 0.2254,
+      "step": 4650,
+      "task_loss": 0.28115496039390564
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.09996585547924042,
+      "epoch": 2.21,
+      "learning_rate": 1.1152494061757722e-05,
+      "loss": 0.1769,
+      "step": 4660,
+      "task_loss": 0.1222931444644928
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.4124844968318939,
+      "epoch": 2.22,
+      "learning_rate": 1.113349168646081e-05,
+      "loss": 0.2081,
+      "step": 4670,
+      "task_loss": 0.3749869763851166
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.13545894622802734,
+      "epoch": 2.22,
+      "learning_rate": 1.1114489311163896e-05,
+      "loss": 0.0857,
+      "step": 4680,
+      "task_loss": 0.05755548179149628
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.1826428771018982,
+      "epoch": 2.23,
+      "learning_rate": 1.1095486935866984e-05,
+      "loss": 0.2862,
+      "step": 4690,
+      "task_loss": 0.01588393747806549
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.1444108933210373,
+      "epoch": 2.23,
+      "learning_rate": 1.1076484560570073e-05,
+      "loss": 0.1427,
+      "step": 4700,
+      "task_loss": 0.26151856780052185
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.1410968005657196,
+      "epoch": 2.24,
+      "learning_rate": 1.1057482185273161e-05,
+      "loss": 0.1974,
+      "step": 4710,
+      "task_loss": 0.07411689311265945
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.1891230046749115,
+      "epoch": 2.24,
+      "learning_rate": 1.1038479809976247e-05,
+      "loss": 0.1835,
+      "step": 4720,
+      "task_loss": 0.11419402062892914
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.10753890872001648,
+      "epoch": 2.25,
+      "learning_rate": 1.1019477434679335e-05,
+      "loss": 0.201,
+      "step": 4730,
+      "task_loss": 0.05918397009372711
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.12443285435438156,
+      "epoch": 2.25,
+      "learning_rate": 1.1000475059382423e-05,
+      "loss": 0.1185,
+      "step": 4740,
+      "task_loss": 0.06015586480498314
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.39146068692207336,
+      "epoch": 2.26,
+      "learning_rate": 1.0981472684085512e-05,
+      "loss": 0.1775,
+      "step": 4750,
+      "task_loss": 0.406578928232193
+    },
+    {
+      "epoch": 2.26,
+      "eval_accuracy": 0.9162844036697247,
+      "eval_loss": 0.28384512662887573,
+      "eval_runtime": 22.1283,
+      "eval_samples_per_second": 39.407,
+      "eval_steps_per_second": 4.926,
+      "step": 4750
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.019335411489009857,
+      "epoch": 2.26,
+      "learning_rate": 1.09624703087886e-05,
+      "loss": 0.166,
+      "step": 4760,
+      "task_loss": 0.16980049014091492
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.05398799106478691,
+      "epoch": 2.27,
+      "learning_rate": 1.0943467933491686e-05,
+      "loss": 0.1554,
+      "step": 4770,
+      "task_loss": 0.007133938372135162
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.018566645681858063,
+      "epoch": 2.27,
+      "learning_rate": 1.0924465558194774e-05,
+      "loss": 0.2266,
+      "step": 4780,
+      "task_loss": 0.005496695637702942
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.09009189158678055,
+      "epoch": 2.28,
+      "learning_rate": 1.0905463182897864e-05,
+      "loss": 0.1098,
+      "step": 4790,
+      "task_loss": 0.04330487921833992
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.29310229420661926,
+      "epoch": 2.28,
+      "learning_rate": 1.0886460807600952e-05,
+      "loss": 0.2037,
+      "step": 4800,
+      "task_loss": 0.21032559871673584
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.32484009861946106,
+      "epoch": 2.29,
+      "learning_rate": 1.086745843230404e-05,
+      "loss": 0.1728,
+      "step": 4810,
+      "task_loss": 0.23763622343540192
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.32907170057296753,
+      "epoch": 2.29,
+      "learning_rate": 1.0848456057007126e-05,
+      "loss": 0.1479,
+      "step": 4820,
+      "task_loss": 0.27866876125335693
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.015447848476469517,
+      "epoch": 2.29,
+      "learning_rate": 1.0829453681710214e-05,
+      "loss": 0.1612,
+      "step": 4830,
+      "task_loss": 0.003272462636232376
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.07381051778793335,
+      "epoch": 2.3,
+      "learning_rate": 1.0810451306413303e-05,
+      "loss": 0.0796,
+      "step": 4840,
+      "task_loss": 0.13187864422798157
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.11486229300498962,
+      "epoch": 2.3,
+      "learning_rate": 1.0791448931116391e-05,
+      "loss": 0.1633,
+      "step": 4850,
+      "task_loss": 0.17579086124897003
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.2775013744831085,
+      "epoch": 2.31,
+      "learning_rate": 1.0772446555819477e-05,
+      "loss": 0.1521,
+      "step": 4860,
+      "task_loss": 0.34235501289367676
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.07822778820991516,
+      "epoch": 2.31,
+      "learning_rate": 1.0753444180522565e-05,
+      "loss": 0.1217,
+      "step": 4870,
+      "task_loss": 0.017286375164985657
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.08122527599334717,
+      "epoch": 2.32,
+      "learning_rate": 1.0734441805225655e-05,
+      "loss": 0.1756,
+      "step": 4880,
+      "task_loss": 0.24036413431167603
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.8243784308433533,
+      "epoch": 2.32,
+      "learning_rate": 1.0715439429928743e-05,
+      "loss": 0.2231,
+      "step": 4890,
+      "task_loss": 0.5445951819419861
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.061645179986953735,
+      "epoch": 2.33,
+      "learning_rate": 1.069643705463183e-05,
+      "loss": 0.1871,
+      "step": 4900,
+      "task_loss": 0.023713212460279465
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.11187595874071121,
+      "epoch": 2.33,
+      "learning_rate": 1.0677434679334917e-05,
+      "loss": 0.1132,
+      "step": 4910,
+      "task_loss": 0.023900482803583145
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.026393216103315353,
+      "epoch": 2.34,
+      "learning_rate": 1.0658432304038006e-05,
+      "loss": 0.1716,
+      "step": 4920,
+      "task_loss": 0.1366603672504425
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.01934751495718956,
+      "epoch": 2.34,
+      "learning_rate": 1.0639429928741094e-05,
+      "loss": 0.1488,
+      "step": 4930,
+      "task_loss": 0.004630662500858307
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.019772972911596298,
+      "epoch": 2.35,
+      "learning_rate": 1.0620427553444182e-05,
+      "loss": 0.123,
+      "step": 4940,
+      "task_loss": 0.004192207008600235
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.07974082976579666,
+      "epoch": 2.35,
+      "learning_rate": 1.0601425178147268e-05,
+      "loss": 0.2069,
+      "step": 4950,
+      "task_loss": 0.02976549044251442
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.4843176603317261,
+      "epoch": 2.36,
+      "learning_rate": 1.0582422802850356e-05,
+      "loss": 0.2589,
+      "step": 4960,
+      "task_loss": 0.5428990125656128
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.11596342921257019,
+      "epoch": 2.36,
+      "learning_rate": 1.0563420427553445e-05,
+      "loss": 0.1505,
+      "step": 4970,
+      "task_loss": 0.40985506772994995
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.31906628608703613,
+      "epoch": 2.37,
+      "learning_rate": 1.0544418052256533e-05,
+      "loss": 0.2339,
+      "step": 4980,
+      "task_loss": 0.16161520779132843
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.08732592314481735,
+      "epoch": 2.37,
+      "learning_rate": 1.0525415676959621e-05,
+      "loss": 0.1856,
+      "step": 4990,
+      "task_loss": 0.03401995077729225
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.16932491958141327,
+      "epoch": 2.38,
+      "learning_rate": 1.0506413301662707e-05,
+      "loss": 0.1648,
+      "step": 5000,
+      "task_loss": 0.1186380535364151
+    },
+    {
+      "epoch": 2.38,
+      "eval_accuracy": 0.9128440366972477,
+      "eval_loss": 0.2842116057872772,
+      "eval_runtime": 22.1968,
+      "eval_samples_per_second": 39.285,
+      "eval_steps_per_second": 4.911,
+      "step": 5000
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.4252215325832367,
+      "epoch": 2.38,
+      "learning_rate": 1.0487410926365797e-05,
+      "loss": 0.1908,
+      "step": 5010,
+      "task_loss": 0.26106417179107666
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.4758025109767914,
+      "epoch": 2.38,
+      "learning_rate": 1.0468408551068885e-05,
+      "loss": 0.2279,
+      "step": 5020,
+      "task_loss": 0.26795437932014465
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.11485397070646286,
+      "epoch": 2.39,
+      "learning_rate": 1.0449406175771973e-05,
+      "loss": 0.1344,
+      "step": 5030,
+      "task_loss": 0.10242946445941925
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.07752517610788345,
+      "epoch": 2.39,
+      "learning_rate": 1.0430403800475059e-05,
+      "loss": 0.1472,
+      "step": 5040,
+      "task_loss": 0.23853403329849243
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.23056337237358093,
+      "epoch": 2.4,
+      "learning_rate": 1.041140142517815e-05,
+      "loss": 0.111,
+      "step": 5050,
+      "task_loss": 0.1433607041835785
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.46395134925842285,
+      "epoch": 2.4,
+      "learning_rate": 1.0392399049881236e-05,
+      "loss": 0.1743,
+      "step": 5060,
+      "task_loss": 0.2823677659034729
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.16598868370056152,
+      "epoch": 2.41,
+      "learning_rate": 1.0373396674584324e-05,
+      "loss": 0.1343,
+      "step": 5070,
+      "task_loss": 0.03318723663687706
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.016996556892991066,
+      "epoch": 2.41,
+      "learning_rate": 1.0354394299287412e-05,
+      "loss": 0.1427,
+      "step": 5080,
+      "task_loss": 0.006868541240692139
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.6260548830032349,
+      "epoch": 2.42,
+      "learning_rate": 1.0335391923990498e-05,
+      "loss": 0.2121,
+      "step": 5090,
+      "task_loss": 0.34184902906417847
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.0207875557243824,
+      "epoch": 2.42,
+      "learning_rate": 1.0316389548693588e-05,
+      "loss": 0.1736,
+      "step": 5100,
+      "task_loss": 0.10779790580272675
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.11162707954645157,
+      "epoch": 2.43,
+      "learning_rate": 1.0297387173396676e-05,
+      "loss": 0.1683,
+      "step": 5110,
+      "task_loss": 0.17393611371517181
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.10660862922668457,
+      "epoch": 2.43,
+      "learning_rate": 1.0278384798099763e-05,
+      "loss": 0.2287,
+      "step": 5120,
+      "task_loss": 0.013046719133853912
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.11832073330879211,
+      "epoch": 2.44,
+      "learning_rate": 1.0259382422802851e-05,
+      "loss": 0.1859,
+      "step": 5130,
+      "task_loss": 0.03974215313792229
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.19762274622917175,
+      "epoch": 2.44,
+      "learning_rate": 1.024038004750594e-05,
+      "loss": 0.158,
+      "step": 5140,
+      "task_loss": 0.09981634467840195
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.22084400057792664,
+      "epoch": 2.45,
+      "learning_rate": 1.0221377672209027e-05,
+      "loss": 0.1586,
+      "step": 5150,
+      "task_loss": 0.10561450570821762
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.42767268419265747,
+      "epoch": 2.45,
+      "learning_rate": 1.0202375296912115e-05,
+      "loss": 0.1494,
+      "step": 5160,
+      "task_loss": 0.2538478970527649
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.03990669548511505,
+      "epoch": 2.46,
+      "learning_rate": 1.0183372921615203e-05,
+      "loss": 0.1552,
+      "step": 5170,
+      "task_loss": 0.012294076383113861
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.0684843510389328,
+      "epoch": 2.46,
+      "learning_rate": 1.0164370546318289e-05,
+      "loss": 0.1114,
+      "step": 5180,
+      "task_loss": 0.04257909581065178
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.10727253556251526,
+      "epoch": 2.47,
+      "learning_rate": 1.0145368171021378e-05,
+      "loss": 0.1408,
+      "step": 5190,
+      "task_loss": 0.0835094004869461
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.1803852915763855,
+      "epoch": 2.47,
+      "learning_rate": 1.0126365795724466e-05,
+      "loss": 0.1463,
+      "step": 5200,
+      "task_loss": 0.10350771993398666
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.21467408537864685,
+      "epoch": 2.48,
+      "learning_rate": 1.0107363420427554e-05,
+      "loss": 0.1776,
+      "step": 5210,
+      "task_loss": 0.12939737737178802
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.49049556255340576,
+      "epoch": 2.48,
+      "learning_rate": 1.0088361045130642e-05,
+      "loss": 0.187,
+      "step": 5220,
+      "task_loss": 0.4802893400192261
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.18884505331516266,
+      "epoch": 2.48,
+      "learning_rate": 1.0069358669833732e-05,
+      "loss": 0.1488,
+      "step": 5230,
+      "task_loss": 0.09134702384471893
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.17404451966285706,
+      "epoch": 2.49,
+      "learning_rate": 1.0050356294536818e-05,
+      "loss": 0.1596,
+      "step": 5240,
+      "task_loss": 0.08957971632480621
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.012248549610376358,
+      "epoch": 2.49,
+      "learning_rate": 1.0031353919239906e-05,
+      "loss": 0.1316,
+      "step": 5250,
+      "task_loss": 0.004954520612955093
+    },
+    {
+      "epoch": 2.49,
+      "eval_accuracy": 0.9162844036697247,
+      "eval_loss": 0.2750292420387268,
+      "eval_runtime": 22.048,
+      "eval_samples_per_second": 39.55,
+      "eval_steps_per_second": 4.944,
+      "step": 5250
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.08270278573036194,
+      "epoch": 2.5,
+      "learning_rate": 1.0012351543942993e-05,
+      "loss": 0.1146,
+      "step": 5260,
+      "task_loss": 0.04688364639878273
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.10440254956483841,
+      "epoch": 2.5,
+      "learning_rate": 9.993349168646081e-06,
+      "loss": 0.1711,
+      "step": 5270,
+      "task_loss": 0.06673218309879303
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.34925076365470886,
+      "epoch": 2.51,
+      "learning_rate": 9.97434679334917e-06,
+      "loss": 0.2062,
+      "step": 5280,
+      "task_loss": 0.16814163327217102
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.44582313299179077,
+      "epoch": 2.51,
+      "learning_rate": 9.955344418052257e-06,
+      "loss": 0.2331,
+      "step": 5290,
+      "task_loss": 0.35081130266189575
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02439703233540058,
+      "epoch": 2.52,
+      "learning_rate": 9.936342042755345e-06,
+      "loss": 0.1357,
+      "step": 5300,
+      "task_loss": 0.003609389066696167
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.0913199633359909,
+      "epoch": 2.52,
+      "learning_rate": 9.917339667458433e-06,
+      "loss": 0.1696,
+      "step": 5310,
+      "task_loss": 0.3426245152950287
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02632327377796173,
+      "epoch": 2.53,
+      "learning_rate": 9.89833729216152e-06,
+      "loss": 0.1167,
+      "step": 5320,
+      "task_loss": 0.005381196737289429
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.1984187364578247,
+      "epoch": 2.53,
+      "learning_rate": 9.879334916864608e-06,
+      "loss": 0.1597,
+      "step": 5330,
+      "task_loss": 0.12277568876743317
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.16098986566066742,
+      "epoch": 2.54,
+      "learning_rate": 9.860332541567696e-06,
+      "loss": 0.1252,
+      "step": 5340,
+      "task_loss": 0.081682950258255
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.5197485685348511,
+      "epoch": 2.54,
+      "learning_rate": 9.841330166270784e-06,
+      "loss": 0.1671,
+      "step": 5350,
+      "task_loss": 0.3930453062057495
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.0463409461081028,
+      "epoch": 2.55,
+      "learning_rate": 9.822327790973872e-06,
+      "loss": 0.1617,
+      "step": 5360,
+      "task_loss": 0.01061389222741127
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.042029425501823425,
+      "epoch": 2.55,
+      "learning_rate": 9.803325415676962e-06,
+      "loss": 0.1519,
+      "step": 5370,
+      "task_loss": 0.027968235313892365
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.3101033866405487,
+      "epoch": 2.56,
+      "learning_rate": 9.784323040380048e-06,
+      "loss": 0.193,
+      "step": 5380,
+      "task_loss": 0.16425858438014984
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.7263270020484924,
+      "epoch": 2.56,
+      "learning_rate": 9.765320665083137e-06,
+      "loss": 0.2224,
+      "step": 5390,
+      "task_loss": 0.7286182641983032
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.040765728801488876,
+      "epoch": 2.57,
+      "learning_rate": 9.746318289786224e-06,
+      "loss": 0.1389,
+      "step": 5400,
+      "task_loss": 0.011140488088130951
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.1607925444841385,
+      "epoch": 2.57,
+      "learning_rate": 9.727315914489311e-06,
+      "loss": 0.1126,
+      "step": 5410,
+      "task_loss": 0.11116501688957214
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.0224788635969162,
+      "epoch": 2.57,
+      "learning_rate": 9.7083135391924e-06,
+      "loss": 0.1352,
+      "step": 5420,
+      "task_loss": 0.005074281245470047
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.0271650031208992,
+      "epoch": 2.58,
+      "learning_rate": 9.689311163895487e-06,
+      "loss": 0.105,
+      "step": 5430,
+      "task_loss": 0.012768540531396866
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02819114923477173,
+      "epoch": 2.58,
+      "learning_rate": 9.670308788598575e-06,
+      "loss": 0.2538,
+      "step": 5440,
+      "task_loss": 0.00484645739197731
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.18920451402664185,
+      "epoch": 2.59,
+      "learning_rate": 9.651306413301663e-06,
+      "loss": 0.1027,
+      "step": 5450,
+      "task_loss": 0.24752211570739746
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.11157466471195221,
+      "epoch": 2.59,
+      "learning_rate": 9.632304038004752e-06,
+      "loss": 0.162,
+      "step": 5460,
+      "task_loss": 0.1878063678741455
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.26885926723480225,
+      "epoch": 2.6,
+      "learning_rate": 9.613301662707839e-06,
+      "loss": 0.2035,
+      "step": 5470,
+      "task_loss": 0.058163829147815704
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.1357247531414032,
+      "epoch": 2.6,
+      "learning_rate": 9.594299287410928e-06,
+      "loss": 0.2073,
+      "step": 5480,
+      "task_loss": 0.1269286572933197
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.05651300773024559,
+      "epoch": 2.61,
+      "learning_rate": 9.575296912114014e-06,
+      "loss": 0.1478,
+      "step": 5490,
+      "task_loss": 0.014481060206890106
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.11179080605506897,
+      "epoch": 2.61,
+      "learning_rate": 9.556294536817104e-06,
+      "loss": 0.2349,
+      "step": 5500,
+      "task_loss": 0.13668608665466309
+    },
+    {
+      "epoch": 2.61,
+      "eval_accuracy": 0.9231651376146789,
+      "eval_loss": 0.24054142832756042,
+      "eval_runtime": 22.021,
+      "eval_samples_per_second": 39.599,
+      "eval_steps_per_second": 4.95,
+      "step": 5500
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02737560123205185,
+      "epoch": 2.62,
+      "learning_rate": 9.53729216152019e-06,
+      "loss": 0.1493,
+      "step": 5510,
+      "task_loss": 0.10180087387561798
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.12310200929641724,
+      "epoch": 2.62,
+      "learning_rate": 9.518289786223278e-06,
+      "loss": 0.1098,
+      "step": 5520,
+      "task_loss": 0.058783046901226044
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.13475853204727173,
+      "epoch": 2.63,
+      "learning_rate": 9.499287410926367e-06,
+      "loss": 0.1139,
+      "step": 5530,
+      "task_loss": 0.05447866767644882
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.17890390753746033,
+      "epoch": 2.63,
+      "learning_rate": 9.480285035629454e-06,
+      "loss": 0.1686,
+      "step": 5540,
+      "task_loss": 0.22207100689411163
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.05488044023513794,
+      "epoch": 2.64,
+      "learning_rate": 9.461282660332543e-06,
+      "loss": 0.1037,
+      "step": 5550,
+      "task_loss": 0.2198052704334259
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.07350677251815796,
+      "epoch": 2.64,
+      "learning_rate": 9.44228028503563e-06,
+      "loss": 0.1472,
+      "step": 5560,
+      "task_loss": 0.15250588953495026
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.23793400824069977,
+      "epoch": 2.65,
+      "learning_rate": 9.423277909738719e-06,
+      "loss": 0.187,
+      "step": 5570,
+      "task_loss": 0.17216888070106506
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.3636675775051117,
+      "epoch": 2.65,
+      "learning_rate": 9.404275534441805e-06,
+      "loss": 0.1792,
+      "step": 5580,
+      "task_loss": 0.21685606241226196
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.05343058705329895,
+      "epoch": 2.66,
+      "learning_rate": 9.385273159144895e-06,
+      "loss": 0.1529,
+      "step": 5590,
+      "task_loss": 0.012824393808841705
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.21820136904716492,
+      "epoch": 2.66,
+      "learning_rate": 9.36627078384798e-06,
+      "loss": 0.2066,
+      "step": 5600,
+      "task_loss": 0.10303452610969543
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02510383166372776,
+      "epoch": 2.67,
+      "learning_rate": 9.34726840855107e-06,
+      "loss": 0.0728,
+      "step": 5610,
+      "task_loss": 0.004608385264873505
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.2674543261528015,
+      "epoch": 2.67,
+      "learning_rate": 9.328266033254158e-06,
+      "loss": 0.146,
+      "step": 5620,
+      "task_loss": 0.2036900818347931
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.2999550402164459,
+      "epoch": 2.67,
+      "learning_rate": 9.309263657957246e-06,
+      "loss": 0.1655,
+      "step": 5630,
+      "task_loss": 0.1722106784582138
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.08812573552131653,
+      "epoch": 2.68,
+      "learning_rate": 9.290261282660334e-06,
+      "loss": 0.1105,
+      "step": 5640,
+      "task_loss": 0.041734665632247925
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.014322986826300621,
+      "epoch": 2.68,
+      "learning_rate": 9.27125890736342e-06,
+      "loss": 0.0986,
+      "step": 5650,
+      "task_loss": 0.11941255629062653
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.12577423453330994,
+      "epoch": 2.69,
+      "learning_rate": 9.25225653206651e-06,
+      "loss": 0.1373,
+      "step": 5660,
+      "task_loss": 0.3789353370666504
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.0945727676153183,
+      "epoch": 2.69,
+      "learning_rate": 9.233254156769596e-06,
+      "loss": 0.1489,
+      "step": 5670,
+      "task_loss": 0.14805974066257477
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.27903738617897034,
+      "epoch": 2.7,
+      "learning_rate": 9.214251781472685e-06,
+      "loss": 0.1515,
+      "step": 5680,
+      "task_loss": 0.14870142936706543
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.1626128852367401,
+      "epoch": 2.7,
+      "learning_rate": 9.195249406175773e-06,
+      "loss": 0.149,
+      "step": 5690,
+      "task_loss": 0.08122530579566956
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.055356357246637344,
+      "epoch": 2.71,
+      "learning_rate": 9.176247030878861e-06,
+      "loss": 0.1217,
+      "step": 5700,
+      "task_loss": 0.16854631900787354
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.1662861853837967,
+      "epoch": 2.71,
+      "learning_rate": 9.157244655581949e-06,
+      "loss": 0.1675,
+      "step": 5710,
+      "task_loss": 0.013219501823186874
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.20596551895141602,
+      "epoch": 2.72,
+      "learning_rate": 9.138242280285037e-06,
+      "loss": 0.2061,
+      "step": 5720,
+      "task_loss": 0.1961277425289154
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.2168876677751541,
+      "epoch": 2.72,
+      "learning_rate": 9.119239904988125e-06,
+      "loss": 0.1496,
+      "step": 5730,
+      "task_loss": 0.2229800522327423
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.044798918068408966,
+      "epoch": 2.73,
+      "learning_rate": 9.100237529691213e-06,
+      "loss": 0.114,
+      "step": 5740,
+      "task_loss": 0.005338538438081741
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.13803905248641968,
+      "epoch": 2.73,
+      "learning_rate": 9.0812351543943e-06,
+      "loss": 0.066,
+      "step": 5750,
+      "task_loss": 0.06024959683418274
+    },
+    {
+      "epoch": 2.73,
+      "eval_accuracy": 0.9174311926605505,
+      "eval_loss": 0.26952359080314636,
+      "eval_runtime": 22.0206,
+      "eval_samples_per_second": 39.599,
+      "eval_steps_per_second": 4.95,
+      "step": 5750
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.009917501360177994,
+      "epoch": 2.74,
+      "learning_rate": 9.062232779097387e-06,
+      "loss": 0.1051,
+      "step": 5760,
+      "task_loss": 0.0061325803399086
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.08573028445243835,
+      "epoch": 2.74,
+      "learning_rate": 9.043230403800476e-06,
+      "loss": 0.1055,
+      "step": 5770,
+      "task_loss": 0.05740174278616905
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.07769911736249924,
+      "epoch": 2.75,
+      "learning_rate": 9.024228028503564e-06,
+      "loss": 0.1586,
+      "step": 5780,
+      "task_loss": 0.019947297871112823
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.04432743787765503,
+      "epoch": 2.75,
+      "learning_rate": 9.005225653206652e-06,
+      "loss": 0.1275,
+      "step": 5790,
+      "task_loss": 0.17999230325222015
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.1737172156572342,
+      "epoch": 2.76,
+      "learning_rate": 8.98622327790974e-06,
+      "loss": 0.2693,
+      "step": 5800,
+      "task_loss": 0.18654870986938477
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.021758923307061195,
+      "epoch": 2.76,
+      "learning_rate": 8.967220902612828e-06,
+      "loss": 0.119,
+      "step": 5810,
+      "task_loss": 0.0038488097488880157
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.011074014008045197,
+      "epoch": 2.76,
+      "learning_rate": 8.948218527315915e-06,
+      "loss": 0.1163,
+      "step": 5820,
+      "task_loss": 0.0033141709864139557
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.23885256052017212,
+      "epoch": 2.77,
+      "learning_rate": 8.929216152019003e-06,
+      "loss": 0.1698,
+      "step": 5830,
+      "task_loss": 0.5643174052238464
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.046549778431653976,
+      "epoch": 2.77,
+      "learning_rate": 8.910213776722091e-06,
+      "loss": 0.0801,
+      "step": 5840,
+      "task_loss": 0.00456850603222847
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.09816709160804749,
+      "epoch": 2.78,
+      "learning_rate": 8.891211401425179e-06,
+      "loss": 0.1631,
+      "step": 5850,
+      "task_loss": 0.12905465066432953
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.08407769352197647,
+      "epoch": 2.78,
+      "learning_rate": 8.872209026128267e-06,
+      "loss": 0.1879,
+      "step": 5860,
+      "task_loss": 0.03167784959077835
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.2762501537799835,
+      "epoch": 2.79,
+      "learning_rate": 8.853206650831355e-06,
+      "loss": 0.1614,
+      "step": 5870,
+      "task_loss": 0.16979815065860748
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.12890750169754028,
+      "epoch": 2.79,
+      "learning_rate": 8.834204275534443e-06,
+      "loss": 0.1169,
+      "step": 5880,
+      "task_loss": 0.04023109748959541
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02156493254005909,
+      "epoch": 2.8,
+      "learning_rate": 8.81520190023753e-06,
+      "loss": 0.0952,
+      "step": 5890,
+      "task_loss": 0.0034607164561748505
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.007742735557258129,
+      "epoch": 2.8,
+      "learning_rate": 8.796199524940618e-06,
+      "loss": 0.1524,
+      "step": 5900,
+      "task_loss": 0.0028364397585392
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.1480652093887329,
+      "epoch": 2.81,
+      "learning_rate": 8.777197149643706e-06,
+      "loss": 0.1092,
+      "step": 5910,
+      "task_loss": 0.29450657963752747
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.17767339944839478,
+      "epoch": 2.81,
+      "learning_rate": 8.758194774346794e-06,
+      "loss": 0.2108,
+      "step": 5920,
+      "task_loss": 0.10835998505353928
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.26279133558273315,
+      "epoch": 2.82,
+      "learning_rate": 8.739192399049882e-06,
+      "loss": 0.169,
+      "step": 5930,
+      "task_loss": 0.23328514397144318
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.015537131577730179,
+      "epoch": 2.82,
+      "learning_rate": 8.72019002375297e-06,
+      "loss": 0.1353,
+      "step": 5940,
+      "task_loss": 0.02001919597387314
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.07597079128026962,
+      "epoch": 2.83,
+      "learning_rate": 8.701187648456058e-06,
+      "loss": 0.0961,
+      "step": 5950,
+      "task_loss": 0.11477569490671158
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.11929579079151154,
+      "epoch": 2.83,
+      "learning_rate": 8.682185273159146e-06,
+      "loss": 0.1904,
+      "step": 5960,
+      "task_loss": 0.04400225356221199
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.24032776057720184,
+      "epoch": 2.84,
+      "learning_rate": 8.663182897862233e-06,
+      "loss": 0.1713,
+      "step": 5970,
+      "task_loss": 0.16706398129463196
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.039569027721881866,
+      "epoch": 2.84,
+      "learning_rate": 8.644180522565321e-06,
+      "loss": 0.079,
+      "step": 5980,
+      "task_loss": 0.009293723851442337
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.14112095534801483,
+      "epoch": 2.85,
+      "learning_rate": 8.625178147268409e-06,
+      "loss": 0.1912,
+      "step": 5990,
+      "task_loss": 0.12100718915462494
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.2663877308368683,
+      "epoch": 2.85,
+      "learning_rate": 8.606175771971497e-06,
+      "loss": 0.1285,
+      "step": 6000,
+      "task_loss": 0.1642446219921112
+    },
+    {
+      "epoch": 2.85,
+      "eval_accuracy": 0.9094036697247706,
+      "eval_loss": 0.3016970455646515,
+      "eval_runtime": 21.9287,
+      "eval_samples_per_second": 39.765,
+      "eval_steps_per_second": 4.971,
+      "step": 6000
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02504299208521843,
+      "epoch": 2.86,
+      "learning_rate": 8.587173396674585e-06,
+      "loss": 0.0738,
+      "step": 6010,
+      "task_loss": 0.004753179848194122
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.05768204480409622,
+      "epoch": 2.86,
+      "learning_rate": 8.570071258907364e-06,
+      "loss": 0.1474,
+      "step": 6020,
+      "task_loss": 0.03982783481478691
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.04381980746984482,
+      "epoch": 2.86,
+      "learning_rate": 8.551068883610452e-06,
+      "loss": 0.0811,
+      "step": 6030,
+      "task_loss": 0.014950472861528397
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.10156150907278061,
+      "epoch": 2.87,
+      "learning_rate": 8.53206650831354e-06,
+      "loss": 0.0831,
+      "step": 6040,
+      "task_loss": 0.10873029381036758
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.07243537902832031,
+      "epoch": 2.87,
+      "learning_rate": 8.513064133016627e-06,
+      "loss": 0.146,
+      "step": 6050,
+      "task_loss": 0.029440071433782578
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.27121058106422424,
+      "epoch": 2.88,
+      "learning_rate": 8.494061757719715e-06,
+      "loss": 0.1812,
+      "step": 6060,
+      "task_loss": 0.30982720851898193
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.23871323466300964,
+      "epoch": 2.88,
+      "learning_rate": 8.475059382422803e-06,
+      "loss": 0.1833,
+      "step": 6070,
+      "task_loss": 0.17489267885684967
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.12207458168268204,
+      "epoch": 2.89,
+      "learning_rate": 8.456057007125893e-06,
+      "loss": 0.1351,
+      "step": 6080,
+      "task_loss": 0.008778557181358337
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.09064328670501709,
+      "epoch": 2.89,
+      "learning_rate": 8.437054631828979e-06,
+      "loss": 0.0597,
+      "step": 6090,
+      "task_loss": 0.026089288294315338
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.28841766715049744,
+      "epoch": 2.9,
+      "learning_rate": 8.418052256532068e-06,
+      "loss": 0.162,
+      "step": 6100,
+      "task_loss": 0.1540592610836029
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.049679264426231384,
+      "epoch": 2.9,
+      "learning_rate": 8.399049881235155e-06,
+      "loss": 0.0938,
+      "step": 6110,
+      "task_loss": 0.012325655668973923
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.031312961131334305,
+      "epoch": 2.91,
+      "learning_rate": 8.380047505938242e-06,
+      "loss": 0.1657,
+      "step": 6120,
+      "task_loss": 0.003661230206489563
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.0725235864520073,
+      "epoch": 2.91,
+      "learning_rate": 8.36104513064133e-06,
+      "loss": 0.142,
+      "step": 6130,
+      "task_loss": 0.029958881437778473
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.19128333032131195,
+      "epoch": 2.92,
+      "learning_rate": 8.342042755344418e-06,
+      "loss": 0.1463,
+      "step": 6140,
+      "task_loss": 0.11310219764709473
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.024800153449177742,
+      "epoch": 2.92,
+      "learning_rate": 8.323040380047506e-06,
+      "loss": 0.1464,
+      "step": 6150,
+      "task_loss": 0.007242865860462189
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.0960264801979065,
+      "epoch": 2.93,
+      "learning_rate": 8.304038004750594e-06,
+      "loss": 0.1254,
+      "step": 6160,
+      "task_loss": 0.10413940250873566
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.029338371008634567,
+      "epoch": 2.93,
+      "learning_rate": 8.285035629453683e-06,
+      "loss": 0.1295,
+      "step": 6170,
+      "task_loss": 0.010850280523300171
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.40849167108535767,
+      "epoch": 2.94,
+      "learning_rate": 8.26603325415677e-06,
+      "loss": 0.1832,
+      "step": 6180,
+      "task_loss": 0.32399341464042664
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.017780784517526627,
+      "epoch": 2.94,
+      "learning_rate": 8.247030878859859e-06,
+      "loss": 0.1762,
+      "step": 6190,
+      "task_loss": 0.004788093268871307
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.20082998275756836,
+      "epoch": 2.95,
+      "learning_rate": 8.228028503562945e-06,
+      "loss": 0.1738,
+      "step": 6200,
+      "task_loss": 0.2769722044467926
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.09029912948608398,
+      "epoch": 2.95,
+      "learning_rate": 8.209026128266035e-06,
+      "loss": 0.1472,
+      "step": 6210,
+      "task_loss": 0.22967661917209625
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.03564874455332756,
+      "epoch": 2.95,
+      "learning_rate": 8.190023752969121e-06,
+      "loss": 0.1332,
+      "step": 6220,
+      "task_loss": 0.011339064687490463
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.06550465524196625,
+      "epoch": 2.96,
+      "learning_rate": 8.171021377672209e-06,
+      "loss": 0.0727,
+      "step": 6230,
+      "task_loss": 0.03282541409134865
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.021643269807100296,
+      "epoch": 2.96,
+      "learning_rate": 8.152019002375298e-06,
+      "loss": 0.1414,
+      "step": 6240,
+      "task_loss": 0.00811653584241867
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.2023080289363861,
+      "epoch": 2.97,
+      "learning_rate": 8.133016627078385e-06,
+      "loss": 0.1813,
+      "step": 6250,
+      "task_loss": 0.1566222906112671
+    },
+    {
+      "epoch": 2.97,
+      "eval_accuracy": 0.9105504587155964,
+      "eval_loss": 0.347153902053833,
+      "eval_runtime": 22.058,
+      "eval_samples_per_second": 39.532,
+      "eval_steps_per_second": 4.942,
+      "step": 6250
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.40393316745758057,
+      "epoch": 2.97,
+      "learning_rate": 8.114014251781474e-06,
+      "loss": 0.2437,
+      "step": 6260,
+      "task_loss": 0.2824239134788513
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.04513658583164215,
+      "epoch": 2.98,
+      "learning_rate": 8.09501187648456e-06,
+      "loss": 0.1042,
+      "step": 6270,
+      "task_loss": 0.007071588188409805
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.01414411049336195,
+      "epoch": 2.98,
+      "learning_rate": 8.07600950118765e-06,
+      "loss": 0.0815,
+      "step": 6280,
+      "task_loss": 0.010898426175117493
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.27671492099761963,
+      "epoch": 2.99,
+      "learning_rate": 8.057007125890736e-06,
+      "loss": 0.1583,
+      "step": 6290,
+      "task_loss": 0.4247187376022339
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.04388013109564781,
+      "epoch": 2.99,
+      "learning_rate": 8.038004750593826e-06,
+      "loss": 0.1534,
+      "step": 6300,
+      "task_loss": 0.009620524942874908
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.04090433940291405,
+      "epoch": 3.0,
+      "learning_rate": 8.019002375296912e-06,
+      "loss": 0.1315,
+      "step": 6310,
+      "task_loss": 0.004594910889863968
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.026270296424627304,
+      "epoch": 3.0,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 0.1069,
+      "step": 6320,
+      "task_loss": 0.0037337057292461395
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.16657724976539612,
+      "epoch": 3.01,
+      "learning_rate": 7.98099762470309e-06,
+      "loss": 0.0828,
+      "step": 6330,
+      "task_loss": 0.17973756790161133
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.022658517584204674,
+      "epoch": 3.01,
+      "learning_rate": 7.961995249406177e-06,
+      "loss": 0.095,
+      "step": 6340,
+      "task_loss": 0.008285708725452423
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.03448961302638054,
+      "epoch": 3.02,
+      "learning_rate": 7.942992874109265e-06,
+      "loss": 0.0677,
+      "step": 6350,
+      "task_loss": 0.02081955224275589
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.020961280912160873,
+      "epoch": 3.02,
+      "learning_rate": 7.923990498812351e-06,
+      "loss": 0.0523,
+      "step": 6360,
+      "task_loss": 0.24772392213344574
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.09194082766771317,
+      "epoch": 3.03,
+      "learning_rate": 7.90498812351544e-06,
+      "loss": 0.0895,
+      "step": 6370,
+      "task_loss": 0.3988223373889923
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.15075421333312988,
+      "epoch": 3.03,
+      "learning_rate": 7.885985748218527e-06,
+      "loss": 0.1081,
+      "step": 6380,
+      "task_loss": 0.06303508579730988
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.012541974894702435,
+      "epoch": 3.04,
+      "learning_rate": 7.866983372921616e-06,
+      "loss": 0.0863,
+      "step": 6390,
+      "task_loss": 0.004561152309179306
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.021147001534700394,
+      "epoch": 3.04,
+      "learning_rate": 7.847980997624704e-06,
+      "loss": 0.0674,
+      "step": 6400,
+      "task_loss": 0.0043773651123046875
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.016194012016057968,
+      "epoch": 3.05,
+      "learning_rate": 7.828978622327792e-06,
+      "loss": 0.1634,
+      "step": 6410,
+      "task_loss": 0.0029691122472286224
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.04040956124663353,
+      "epoch": 3.05,
+      "learning_rate": 7.80997624703088e-06,
+      "loss": 0.0932,
+      "step": 6420,
+      "task_loss": 0.006615336984395981
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.1302649974822998,
+      "epoch": 3.05,
+      "learning_rate": 7.790973871733968e-06,
+      "loss": 0.0852,
+      "step": 6430,
+      "task_loss": 0.08950361609458923
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.026604020968079567,
+      "epoch": 3.06,
+      "learning_rate": 7.771971496437056e-06,
+      "loss": 0.0547,
+      "step": 6440,
+      "task_loss": 0.19832749664783478
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.1625809371471405,
+      "epoch": 3.06,
+      "learning_rate": 7.752969121140144e-06,
+      "loss": 0.0843,
+      "step": 6450,
+      "task_loss": 0.07263679802417755
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.3386070132255554,
+      "epoch": 3.07,
+      "learning_rate": 7.733966745843231e-06,
+      "loss": 0.1032,
+      "step": 6460,
+      "task_loss": 0.3932771682739258
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02958713099360466,
+      "epoch": 3.07,
+      "learning_rate": 7.714964370546318e-06,
+      "loss": 0.0915,
+      "step": 6470,
+      "task_loss": 0.002492375671863556
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.03094402514398098,
+      "epoch": 3.08,
+      "learning_rate": 7.695961995249407e-06,
+      "loss": 0.1083,
+      "step": 6480,
+      "task_loss": 0.006007764488458633
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.020859267562627792,
+      "epoch": 3.08,
+      "learning_rate": 7.676959619952495e-06,
+      "loss": 0.0465,
+      "step": 6490,
+      "task_loss": 0.00389765202999115
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.17978915572166443,
+      "epoch": 3.09,
+      "learning_rate": 7.657957244655583e-06,
+      "loss": 0.078,
+      "step": 6500,
+      "task_loss": 0.09985024482011795
+    },
+    {
+      "epoch": 3.09,
+      "eval_accuracy": 0.9139908256880734,
+      "eval_loss": 0.2914510667324066,
+      "eval_runtime": 22.4643,
+      "eval_samples_per_second": 38.817,
+      "eval_steps_per_second": 4.852,
+      "step": 6500
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.07285353541374207,
+      "epoch": 3.09,
+      "learning_rate": 7.63895486935867e-06,
+      "loss": 0.0599,
+      "step": 6510,
+      "task_loss": 0.21172893047332764
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.016873065382242203,
+      "epoch": 3.1,
+      "learning_rate": 7.619952494061759e-06,
+      "loss": 0.0994,
+      "step": 6520,
+      "task_loss": 0.1662091165781021
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.15915945172309875,
+      "epoch": 3.1,
+      "learning_rate": 7.600950118764846e-06,
+      "loss": 0.1219,
+      "step": 6530,
+      "task_loss": 0.2617415487766266
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.16674910485744476,
+      "epoch": 3.11,
+      "learning_rate": 7.581947743467934e-06,
+      "loss": 0.0909,
+      "step": 6540,
+      "task_loss": 0.12426068633794785
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.0346527174115181,
+      "epoch": 3.11,
+      "learning_rate": 7.562945368171022e-06,
+      "loss": 0.0854,
+      "step": 6550,
+      "task_loss": 0.012002792209386826
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.08293355256319046,
+      "epoch": 3.12,
+      "learning_rate": 7.54394299287411e-06,
+      "loss": 0.0816,
+      "step": 6560,
+      "task_loss": 0.08816082775592804
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.054979730397462845,
+      "epoch": 3.12,
+      "learning_rate": 7.524940617577198e-06,
+      "loss": 0.0739,
+      "step": 6570,
+      "task_loss": 0.5423688292503357
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.021432993933558464,
+      "epoch": 3.13,
+      "learning_rate": 7.505938242280285e-06,
+      "loss": 0.0644,
+      "step": 6580,
+      "task_loss": 0.005201835185289383
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.11038654297590256,
+      "epoch": 3.13,
+      "learning_rate": 7.486935866983374e-06,
+      "loss": 0.0679,
+      "step": 6590,
+      "task_loss": 0.10889068990945816
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.011515101417899132,
+      "epoch": 3.14,
+      "learning_rate": 7.467933491686461e-06,
+      "loss": 0.1239,
+      "step": 6600,
+      "task_loss": 0.0029702894389629364
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.027960218489170074,
+      "epoch": 3.14,
+      "learning_rate": 7.448931116389549e-06,
+      "loss": 0.0788,
+      "step": 6610,
+      "task_loss": 0.005036883056163788
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.018560441210865974,
+      "epoch": 3.14,
+      "learning_rate": 7.429928741092637e-06,
+      "loss": 0.0824,
+      "step": 6620,
+      "task_loss": 0.009363945573568344
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.08277207612991333,
+      "epoch": 3.15,
+      "learning_rate": 7.410926365795725e-06,
+      "loss": 0.145,
+      "step": 6630,
+      "task_loss": 0.23960661888122559
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.024465510621666908,
+      "epoch": 3.15,
+      "learning_rate": 7.391923990498813e-06,
+      "loss": 0.0744,
+      "step": 6640,
+      "task_loss": 0.09361692517995834
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.28370052576065063,
+      "epoch": 3.16,
+      "learning_rate": 7.372921615201901e-06,
+      "loss": 0.0718,
+      "step": 6650,
+      "task_loss": 0.18601828813552856
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.1300351619720459,
+      "epoch": 3.16,
+      "learning_rate": 7.353919239904989e-06,
+      "loss": 0.1288,
+      "step": 6660,
+      "task_loss": 0.06523597240447998
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.04163939133286476,
+      "epoch": 3.17,
+      "learning_rate": 7.334916864608077e-06,
+      "loss": 0.0974,
+      "step": 6670,
+      "task_loss": 0.17136383056640625
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.018006887286901474,
+      "epoch": 3.17,
+      "learning_rate": 7.315914489311164e-06,
+      "loss": 0.0903,
+      "step": 6680,
+      "task_loss": 0.005319155752658844
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.24133111536502838,
+      "epoch": 3.18,
+      "learning_rate": 7.296912114014253e-06,
+      "loss": 0.1113,
+      "step": 6690,
+      "task_loss": 0.2049761414527893
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.18800024688243866,
+      "epoch": 3.18,
+      "learning_rate": 7.27790973871734e-06,
+      "loss": 0.0828,
+      "step": 6700,
+      "task_loss": 0.10774320363998413
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.05912996828556061,
+      "epoch": 3.19,
+      "learning_rate": 7.258907363420428e-06,
+      "loss": 0.1459,
+      "step": 6710,
+      "task_loss": 0.024521011859178543
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.042738813906908035,
+      "epoch": 3.19,
+      "learning_rate": 7.239904988123516e-06,
+      "loss": 0.0925,
+      "step": 6720,
+      "task_loss": 0.00665607675909996
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.08050870150327682,
+      "epoch": 3.2,
+      "learning_rate": 7.220902612826604e-06,
+      "loss": 0.0643,
+      "step": 6730,
+      "task_loss": 0.22327980399131775
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.23609286546707153,
+      "epoch": 3.2,
+      "learning_rate": 7.201900237529692e-06,
+      "loss": 0.1105,
+      "step": 6740,
+      "task_loss": 0.15705671906471252
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.00854767207056284,
+      "epoch": 3.21,
+      "learning_rate": 7.1828978622327794e-06,
+      "loss": 0.0886,
+      "step": 6750,
+      "task_loss": 0.003407653421163559
+    },
+    {
+      "epoch": 3.21,
+      "eval_accuracy": 0.9151376146788991,
+      "eval_loss": 0.28525349497795105,
+      "eval_runtime": 22.3514,
+      "eval_samples_per_second": 39.013,
+      "eval_steps_per_second": 4.877,
+      "step": 6750
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.014828482642769814,
+      "epoch": 3.21,
+      "learning_rate": 7.163895486935868e-06,
+      "loss": 0.0819,
+      "step": 6760,
+      "task_loss": 0.22838379442691803
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.05267515778541565,
+      "epoch": 3.22,
+      "learning_rate": 7.144893111638955e-06,
+      "loss": 0.0694,
+      "step": 6770,
+      "task_loss": 0.17177042365074158
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.11514155566692352,
+      "epoch": 3.22,
+      "learning_rate": 7.125890736342044e-06,
+      "loss": 0.1022,
+      "step": 6780,
+      "task_loss": 0.04417066648602486
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.10625005513429642,
+      "epoch": 3.23,
+      "learning_rate": 7.106888361045131e-06,
+      "loss": 0.0949,
+      "step": 6790,
+      "task_loss": 0.06190282851457596
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.23833660781383514,
+      "epoch": 3.23,
+      "learning_rate": 7.08788598574822e-06,
+      "loss": 0.1509,
+      "step": 6800,
+      "task_loss": 0.3238193392753601
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.060429543256759644,
+      "epoch": 3.24,
+      "learning_rate": 7.068883610451307e-06,
+      "loss": 0.0693,
+      "step": 6810,
+      "task_loss": 0.3467991352081299
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.05774849280714989,
+      "epoch": 3.24,
+      "learning_rate": 7.0498812351543945e-06,
+      "loss": 0.0708,
+      "step": 6820,
+      "task_loss": 0.02291129156947136
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.018570270389318466,
+      "epoch": 3.24,
+      "learning_rate": 7.030878859857483e-06,
+      "loss": 0.066,
+      "step": 6830,
+      "task_loss": 0.005066726356744766
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.06323438882827759,
+      "epoch": 3.25,
+      "learning_rate": 7.01187648456057e-06,
+      "loss": 0.0893,
+      "step": 6840,
+      "task_loss": 0.04271535202860832
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02259252592921257,
+      "epoch": 3.25,
+      "learning_rate": 6.992874109263659e-06,
+      "loss": 0.0495,
+      "step": 6850,
+      "task_loss": 0.08561189472675323
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.03591727092862129,
+      "epoch": 3.26,
+      "learning_rate": 6.973871733966746e-06,
+      "loss": 0.0788,
+      "step": 6860,
+      "task_loss": 0.005202621221542358
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.03306454047560692,
+      "epoch": 3.26,
+      "learning_rate": 6.954869358669835e-06,
+      "loss": 0.1167,
+      "step": 6870,
+      "task_loss": 0.0036557093262672424
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.10268350690603256,
+      "epoch": 3.27,
+      "learning_rate": 6.935866983372922e-06,
+      "loss": 0.0842,
+      "step": 6880,
+      "task_loss": 0.11851201951503754
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.03656313568353653,
+      "epoch": 3.27,
+      "learning_rate": 6.91686460807601e-06,
+      "loss": 0.0654,
+      "step": 6890,
+      "task_loss": 0.009406276047229767
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.07688532024621964,
+      "epoch": 3.28,
+      "learning_rate": 6.897862232779098e-06,
+      "loss": 0.0885,
+      "step": 6900,
+      "task_loss": 0.09606030583381653
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.021331768482923508,
+      "epoch": 3.28,
+      "learning_rate": 6.878859857482186e-06,
+      "loss": 0.1026,
+      "step": 6910,
+      "task_loss": 0.003994014114141464
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.03601570054888725,
+      "epoch": 3.29,
+      "learning_rate": 6.859857482185274e-06,
+      "loss": 0.1132,
+      "step": 6920,
+      "task_loss": 0.1425406038761139
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.054355375468730927,
+      "epoch": 3.29,
+      "learning_rate": 6.840855106888361e-06,
+      "loss": 0.1151,
+      "step": 6930,
+      "task_loss": 0.1992948204278946
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.026305008679628372,
+      "epoch": 3.3,
+      "learning_rate": 6.82185273159145e-06,
+      "loss": 0.0999,
+      "step": 6940,
+      "task_loss": 0.008063357323408127
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.14803074300289154,
+      "epoch": 3.3,
+      "learning_rate": 6.802850356294537e-06,
+      "loss": 0.0867,
+      "step": 6950,
+      "task_loss": 0.21300971508026123
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.10545548796653748,
+      "epoch": 3.31,
+      "learning_rate": 6.783847980997625e-06,
+      "loss": 0.0741,
+      "step": 6960,
+      "task_loss": 0.06187749281525612
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.2063077986240387,
+      "epoch": 3.31,
+      "learning_rate": 6.764845605700712e-06,
+      "loss": 0.1095,
+      "step": 6970,
+      "task_loss": 0.27536579966545105
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.012730289250612259,
+      "epoch": 3.32,
+      "learning_rate": 6.745843230403801e-06,
+      "loss": 0.1128,
+      "step": 6980,
+      "task_loss": 0.005988124758005142
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.024963170289993286,
+      "epoch": 3.32,
+      "learning_rate": 6.726840855106889e-06,
+      "loss": 0.1089,
+      "step": 6990,
+      "task_loss": 0.006193935871124268
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.1125224307179451,
+      "epoch": 3.33,
+      "learning_rate": 6.707838479809977e-06,
+      "loss": 0.117,
+      "step": 7000,
+      "task_loss": 0.25286245346069336
+    },
+    {
+      "epoch": 3.33,
+      "eval_accuracy": 0.9185779816513762,
+      "eval_loss": 0.2689138948917389,
+      "eval_runtime": 22.4717,
+      "eval_samples_per_second": 38.804,
+      "eval_steps_per_second": 4.851,
+      "step": 7000
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.07371003925800323,
+      "epoch": 3.33,
+      "learning_rate": 6.688836104513065e-06,
+      "loss": 0.0961,
+      "step": 7010,
+      "task_loss": 0.0624442957341671
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.08843027800321579,
+      "epoch": 3.33,
+      "learning_rate": 6.669833729216153e-06,
+      "loss": 0.053,
+      "step": 7020,
+      "task_loss": 0.22307521104812622
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.01580330729484558,
+      "epoch": 3.34,
+      "learning_rate": 6.6508313539192404e-06,
+      "loss": 0.0582,
+      "step": 7030,
+      "task_loss": 0.021607249975204468
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.3695874810218811,
+      "epoch": 3.34,
+      "learning_rate": 6.631828978622329e-06,
+      "loss": 0.1369,
+      "step": 7040,
+      "task_loss": 0.4205509424209595
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.025944426655769348,
+      "epoch": 3.35,
+      "learning_rate": 6.612826603325416e-06,
+      "loss": 0.054,
+      "step": 7050,
+      "task_loss": 0.18624553084373474
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.26268765330314636,
+      "epoch": 3.35,
+      "learning_rate": 6.593824228028504e-06,
+      "loss": 0.1181,
+      "step": 7060,
+      "task_loss": 0.2383035570383072
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.018553506582975388,
+      "epoch": 3.36,
+      "learning_rate": 6.574821852731592e-06,
+      "loss": 0.1264,
+      "step": 7070,
+      "task_loss": 0.002540022134780884
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.30001404881477356,
+      "epoch": 3.36,
+      "learning_rate": 6.55581947743468e-06,
+      "loss": 0.1015,
+      "step": 7080,
+      "task_loss": 0.4227195382118225
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.017469489946961403,
+      "epoch": 3.37,
+      "learning_rate": 6.536817102137768e-06,
+      "loss": 0.0563,
+      "step": 7090,
+      "task_loss": 0.005384139716625214
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.2690996825695038,
+      "epoch": 3.37,
+      "learning_rate": 6.5178147268408555e-06,
+      "loss": 0.0829,
+      "step": 7100,
+      "task_loss": 0.19411098957061768
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.012529075145721436,
+      "epoch": 3.38,
+      "learning_rate": 6.498812351543944e-06,
+      "loss": 0.0957,
+      "step": 7110,
+      "task_loss": 0.00231257826089859
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.015447025187313557,
+      "epoch": 3.38,
+      "learning_rate": 6.479809976247031e-06,
+      "loss": 0.0599,
+      "step": 7120,
+      "task_loss": 0.0031574219465255737
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.016959063708782196,
+      "epoch": 3.39,
+      "learning_rate": 6.46080760095012e-06,
+      "loss": 0.0663,
+      "step": 7130,
+      "task_loss": 0.15422838926315308
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.015068082138895988,
+      "epoch": 3.39,
+      "learning_rate": 6.441805225653207e-06,
+      "loss": 0.0898,
+      "step": 7140,
+      "task_loss": 0.004132535308599472
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.2646453082561493,
+      "epoch": 3.4,
+      "learning_rate": 6.422802850356296e-06,
+      "loss": 0.0993,
+      "step": 7150,
+      "task_loss": 0.17641893029212952
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.09765015542507172,
+      "epoch": 3.4,
+      "learning_rate": 6.403800475059383e-06,
+      "loss": 0.0545,
+      "step": 7160,
+      "task_loss": 0.0811493992805481
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.042697787284851074,
+      "epoch": 3.41,
+      "learning_rate": 6.3847980997624705e-06,
+      "loss": 0.0704,
+      "step": 7170,
+      "task_loss": 0.21445339918136597
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.23853538930416107,
+      "epoch": 3.41,
+      "learning_rate": 6.365795724465559e-06,
+      "loss": 0.0793,
+      "step": 7180,
+      "task_loss": 0.15609657764434814
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.09077732264995575,
+      "epoch": 3.42,
+      "learning_rate": 6.346793349168646e-06,
+      "loss": 0.1064,
+      "step": 7190,
+      "task_loss": 0.3665331304073334
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.021392345428466797,
+      "epoch": 3.42,
+      "learning_rate": 6.327790973871735e-06,
+      "loss": 0.1384,
+      "step": 7200,
+      "task_loss": 0.18095025420188904
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.2543436586856842,
+      "epoch": 3.43,
+      "learning_rate": 6.308788598574822e-06,
+      "loss": 0.1032,
+      "step": 7210,
+      "task_loss": 0.13427653908729553
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02127661183476448,
+      "epoch": 3.43,
+      "learning_rate": 6.289786223277911e-06,
+      "loss": 0.1165,
+      "step": 7220,
+      "task_loss": 0.004494883120059967
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.029238495975732803,
+      "epoch": 3.43,
+      "learning_rate": 6.270783847980998e-06,
+      "loss": 0.0929,
+      "step": 7230,
+      "task_loss": 0.0037034451961517334
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.0373212993144989,
+      "epoch": 3.44,
+      "learning_rate": 6.251781472684086e-06,
+      "loss": 0.1337,
+      "step": 7240,
+      "task_loss": 0.005556315183639526
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.23062750697135925,
+      "epoch": 3.44,
+      "learning_rate": 6.232779097387173e-06,
+      "loss": 0.0894,
+      "step": 7250,
+      "task_loss": 0.34356802701950073
+    },
+    {
+      "epoch": 3.44,
+      "eval_accuracy": 0.9174311926605505,
+      "eval_loss": 0.27475783228874207,
+      "eval_runtime": 22.2988,
+      "eval_samples_per_second": 39.105,
+      "eval_steps_per_second": 4.888,
+      "step": 7250
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.12996672093868256,
+      "epoch": 3.45,
+      "learning_rate": 6.213776722090262e-06,
+      "loss": 0.0485,
+      "step": 7260,
+      "task_loss": 0.1863107681274414
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.19036176800727844,
+      "epoch": 3.45,
+      "learning_rate": 6.19477434679335e-06,
+      "loss": 0.0722,
+      "step": 7270,
+      "task_loss": 0.15313661098480225
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.24250520765781403,
+      "epoch": 3.46,
+      "learning_rate": 6.175771971496437e-06,
+      "loss": 0.087,
+      "step": 7280,
+      "task_loss": 0.34011194109916687
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.13349878787994385,
+      "epoch": 3.46,
+      "learning_rate": 6.156769596199526e-06,
+      "loss": 0.0665,
+      "step": 7290,
+      "task_loss": 0.2456045001745224
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.03384882211685181,
+      "epoch": 3.47,
+      "learning_rate": 6.137767220902613e-06,
+      "loss": 0.1172,
+      "step": 7300,
+      "task_loss": 0.0046829357743263245
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02070469781756401,
+      "epoch": 3.47,
+      "learning_rate": 6.1187648456057014e-06,
+      "loss": 0.1301,
+      "step": 7310,
+      "task_loss": 0.005317840725183487
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.06323584169149399,
+      "epoch": 3.48,
+      "learning_rate": 6.0997624703087884e-06,
+      "loss": 0.0884,
+      "step": 7320,
+      "task_loss": 0.01452043280005455
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.059165108948946,
+      "epoch": 3.48,
+      "learning_rate": 6.080760095011877e-06,
+      "loss": 0.0623,
+      "step": 7330,
+      "task_loss": 0.02391085773706436
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.11576695740222931,
+      "epoch": 3.49,
+      "learning_rate": 6.061757719714965e-06,
+      "loss": 0.0891,
+      "step": 7340,
+      "task_loss": 0.19836050271987915
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.012924795970320702,
+      "epoch": 3.49,
+      "learning_rate": 6.042755344418053e-06,
+      "loss": 0.0451,
+      "step": 7350,
+      "task_loss": 0.13838058710098267
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.029066815972328186,
+      "epoch": 3.5,
+      "learning_rate": 6.023752969121141e-06,
+      "loss": 0.1052,
+      "step": 7360,
+      "task_loss": 0.01864166557788849
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.13578568398952484,
+      "epoch": 3.5,
+      "learning_rate": 6.004750593824229e-06,
+      "loss": 0.1078,
+      "step": 7370,
+      "task_loss": 0.08328451216220856
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.23648998141288757,
+      "epoch": 3.51,
+      "learning_rate": 5.9857482185273165e-06,
+      "loss": 0.0746,
+      "step": 7380,
+      "task_loss": 0.22922304272651672
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.09257731586694717,
+      "epoch": 3.51,
+      "learning_rate": 5.9667458432304035e-06,
+      "loss": 0.0938,
+      "step": 7390,
+      "task_loss": 0.09933258593082428
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.17856569588184357,
+      "epoch": 3.52,
+      "learning_rate": 5.947743467933492e-06,
+      "loss": 0.1055,
+      "step": 7400,
+      "task_loss": 0.15133805572986603
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.009972669184207916,
+      "epoch": 3.52,
+      "learning_rate": 5.928741092636579e-06,
+      "loss": 0.0526,
+      "step": 7410,
+      "task_loss": 0.003201443701982498
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.08634195476770401,
+      "epoch": 3.52,
+      "learning_rate": 5.909738717339668e-06,
+      "loss": 0.1139,
+      "step": 7420,
+      "task_loss": 0.1812535524368286
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.013718021102249622,
+      "epoch": 3.53,
+      "learning_rate": 5.890736342042756e-06,
+      "loss": 0.0834,
+      "step": 7430,
+      "task_loss": 0.0068436190485954285
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.015075819566845894,
+      "epoch": 3.53,
+      "learning_rate": 5.871733966745844e-06,
+      "loss": 0.0571,
+      "step": 7440,
+      "task_loss": 0.13153530657291412
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.1445593535900116,
+      "epoch": 3.54,
+      "learning_rate": 5.8527315914489315e-06,
+      "loss": 0.0546,
+      "step": 7450,
+      "task_loss": 0.07850364595651627
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.1473163664340973,
+      "epoch": 3.54,
+      "learning_rate": 5.83372921615202e-06,
+      "loss": 0.1104,
+      "step": 7460,
+      "task_loss": 0.15125451982021332
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.06551199406385422,
+      "epoch": 3.55,
+      "learning_rate": 5.814726840855107e-06,
+      "loss": 0.0888,
+      "step": 7470,
+      "task_loss": 0.034045103937387466
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.21904179453849792,
+      "epoch": 3.55,
+      "learning_rate": 5.795724465558196e-06,
+      "loss": 0.098,
+      "step": 7480,
+      "task_loss": 0.2412600815296173
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.031937625259160995,
+      "epoch": 3.56,
+      "learning_rate": 5.776722090261283e-06,
+      "loss": 0.0702,
+      "step": 7490,
+      "task_loss": 0.015268594026565552
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.06982429325580597,
+      "epoch": 3.56,
+      "learning_rate": 5.757719714964372e-06,
+      "loss": 0.1023,
+      "step": 7500,
+      "task_loss": 0.3576207160949707
+    },
+    {
+      "epoch": 3.56,
+      "eval_accuracy": 0.9094036697247706,
+      "eval_loss": 0.32788407802581787,
+      "eval_runtime": 21.9451,
+      "eval_samples_per_second": 39.736,
+      "eval_steps_per_second": 4.967,
+      "step": 7500
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.029036687687039375,
+      "epoch": 3.57,
+      "learning_rate": 5.738717339667459e-06,
+      "loss": 0.1211,
+      "step": 7510,
+      "task_loss": 0.014738757163286209
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.021804381161928177,
+      "epoch": 3.57,
+      "learning_rate": 5.7197149643705466e-06,
+      "loss": 0.0494,
+      "step": 7520,
+      "task_loss": 0.005628753453493118
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.017943602055311203,
+      "epoch": 3.58,
+      "learning_rate": 5.700712589073634e-06,
+      "loss": 0.0556,
+      "step": 7530,
+      "task_loss": 0.17601385712623596
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.04630360007286072,
+      "epoch": 3.58,
+      "learning_rate": 5.681710213776722e-06,
+      "loss": 0.092,
+      "step": 7540,
+      "task_loss": 0.07351444661617279
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.19516371190547943,
+      "epoch": 3.59,
+      "learning_rate": 5.662707838479811e-06,
+      "loss": 0.102,
+      "step": 7550,
+      "task_loss": 0.11753221601247787
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.075057253241539,
+      "epoch": 3.59,
+      "learning_rate": 5.643705463182898e-06,
+      "loss": 0.0556,
+      "step": 7560,
+      "task_loss": 0.004980906844139099
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.04576558992266655,
+      "epoch": 3.6,
+      "learning_rate": 5.624703087885987e-06,
+      "loss": 0.1244,
+      "step": 7570,
+      "task_loss": 0.022598903626203537
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.008466158993542194,
+      "epoch": 3.6,
+      "learning_rate": 5.605700712589074e-06,
+      "loss": 0.0988,
+      "step": 7580,
+      "task_loss": 0.0024762973189353943
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02449963614344597,
+      "epoch": 3.61,
+      "learning_rate": 5.5866983372921624e-06,
+      "loss": 0.1019,
+      "step": 7590,
+      "task_loss": 0.003134731203317642
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.026412509381771088,
+      "epoch": 3.61,
+      "learning_rate": 5.5676959619952495e-06,
+      "loss": 0.0813,
+      "step": 7600,
+      "task_loss": 0.10514649748802185
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.09414191544055939,
+      "epoch": 3.62,
+      "learning_rate": 5.548693586698338e-06,
+      "loss": 0.1004,
+      "step": 7610,
+      "task_loss": 0.15864884853363037
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.16137491166591644,
+      "epoch": 3.62,
+      "learning_rate": 5.529691211401426e-06,
+      "loss": 0.1594,
+      "step": 7620,
+      "task_loss": 0.06678696721792221
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.081720732152462,
+      "epoch": 3.62,
+      "learning_rate": 5.510688836104513e-06,
+      "loss": 0.0898,
+      "step": 7630,
+      "task_loss": 0.13728320598602295
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.09084869921207428,
+      "epoch": 3.63,
+      "learning_rate": 5.491686460807602e-06,
+      "loss": 0.0892,
+      "step": 7640,
+      "task_loss": 0.03533271327614784
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.008733304217457771,
+      "epoch": 3.63,
+      "learning_rate": 5.472684085510689e-06,
+      "loss": 0.0742,
+      "step": 7650,
+      "task_loss": 0.006262246519327164
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.024746078997850418,
+      "epoch": 3.64,
+      "learning_rate": 5.4536817102137775e-06,
+      "loss": 0.0666,
+      "step": 7660,
+      "task_loss": 0.1450251340866089
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.17060193419456482,
+      "epoch": 3.64,
+      "learning_rate": 5.4346793349168645e-06,
+      "loss": 0.1122,
+      "step": 7670,
+      "task_loss": 0.4199802875518799
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.07124300301074982,
+      "epoch": 3.65,
+      "learning_rate": 5.415676959619953e-06,
+      "loss": 0.0768,
+      "step": 7680,
+      "task_loss": 0.3081812262535095
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.04165620356798172,
+      "epoch": 3.65,
+      "learning_rate": 5.39667458432304e-06,
+      "loss": 0.0542,
+      "step": 7690,
+      "task_loss": 0.008864354342222214
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.12530671060085297,
+      "epoch": 3.66,
+      "learning_rate": 5.377672209026129e-06,
+      "loss": 0.065,
+      "step": 7700,
+      "task_loss": 0.007050979882478714
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02896895259618759,
+      "epoch": 3.66,
+      "learning_rate": 5.358669833729217e-06,
+      "loss": 0.0978,
+      "step": 7710,
+      "task_loss": 0.008473467081785202
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02197858691215515,
+      "epoch": 3.67,
+      "learning_rate": 5.339667458432305e-06,
+      "loss": 0.0708,
+      "step": 7720,
+      "task_loss": 0.13779109716415405
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.14627870917320251,
+      "epoch": 3.67,
+      "learning_rate": 5.3206650831353925e-06,
+      "loss": 0.0796,
+      "step": 7730,
+      "task_loss": 0.11915778368711472
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.009426005184650421,
+      "epoch": 3.68,
+      "learning_rate": 5.3016627078384795e-06,
+      "loss": 0.0713,
+      "step": 7740,
+      "task_loss": 0.0036111027002334595
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.026764120906591415,
+      "epoch": 3.68,
+      "learning_rate": 5.282660332541568e-06,
+      "loss": 0.0495,
+      "step": 7750,
+      "task_loss": 0.17961205542087555
+    },
+    {
+      "epoch": 3.68,
+      "eval_accuracy": 0.9151376146788991,
+      "eval_loss": 0.2988388240337372,
+      "eval_runtime": 21.9907,
+      "eval_samples_per_second": 39.653,
+      "eval_steps_per_second": 4.957,
+      "step": 7750
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02125394716858864,
+      "epoch": 3.69,
+      "learning_rate": 5.263657957244655e-06,
+      "loss": 0.0592,
+      "step": 7760,
+      "task_loss": 0.13330192863941193
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.023334577679634094,
+      "epoch": 3.69,
+      "learning_rate": 5.244655581947744e-06,
+      "loss": 0.1014,
+      "step": 7770,
+      "task_loss": 0.2334352731704712
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.07482799887657166,
+      "epoch": 3.7,
+      "learning_rate": 5.225653206650832e-06,
+      "loss": 0.0642,
+      "step": 7780,
+      "task_loss": 0.03224996477365494
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.06566259264945984,
+      "epoch": 3.7,
+      "learning_rate": 5.20665083135392e-06,
+      "loss": 0.1021,
+      "step": 7790,
+      "task_loss": 0.03293333947658539
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.20237162709236145,
+      "epoch": 3.71,
+      "learning_rate": 5.1876484560570076e-06,
+      "loss": 0.0681,
+      "step": 7800,
+      "task_loss": 0.11539559811353683
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.020512059330940247,
+      "epoch": 3.71,
+      "learning_rate": 5.168646080760095e-06,
+      "loss": 0.1384,
+      "step": 7810,
+      "task_loss": 0.14106139540672302
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.033618003129959106,
+      "epoch": 3.71,
+      "learning_rate": 5.149643705463183e-06,
+      "loss": 0.0839,
+      "step": 7820,
+      "task_loss": 0.2362319827079773
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.01834789663553238,
+      "epoch": 3.72,
+      "learning_rate": 5.130641330166272e-06,
+      "loss": 0.0992,
+      "step": 7830,
+      "task_loss": 0.06447672098875046
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.08836167305707932,
+      "epoch": 3.72,
+      "learning_rate": 5.111638954869359e-06,
+      "loss": 0.0935,
+      "step": 7840,
+      "task_loss": 0.08710360527038574
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.04010258615016937,
+      "epoch": 3.73,
+      "learning_rate": 5.092636579572448e-06,
+      "loss": 0.1045,
+      "step": 7850,
+      "task_loss": 0.14512062072753906
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.3869872987270355,
+      "epoch": 3.73,
+      "learning_rate": 5.073634204275535e-06,
+      "loss": 0.0924,
+      "step": 7860,
+      "task_loss": 0.2303229570388794
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.19745223224163055,
+      "epoch": 3.74,
+      "learning_rate": 5.054631828978623e-06,
+      "loss": 0.0985,
+      "step": 7870,
+      "task_loss": 0.14754478633403778
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.12212712317705154,
+      "epoch": 3.74,
+      "learning_rate": 5.0356294536817105e-06,
+      "loss": 0.1248,
+      "step": 7880,
+      "task_loss": 0.09426712989807129
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.04913119226694107,
+      "epoch": 3.75,
+      "learning_rate": 5.016627078384798e-06,
+      "loss": 0.1346,
+      "step": 7890,
+      "task_loss": 0.09092673659324646
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.12528668344020844,
+      "epoch": 3.75,
+      "learning_rate": 4.997624703087887e-06,
+      "loss": 0.1229,
+      "step": 7900,
+      "task_loss": 0.07792092859745026
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.0339021235704422,
+      "epoch": 3.76,
+      "learning_rate": 4.978622327790975e-06,
+      "loss": 0.0752,
+      "step": 7910,
+      "task_loss": 0.015460405498743057
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.024016963317990303,
+      "epoch": 3.76,
+      "learning_rate": 4.959619952494062e-06,
+      "loss": 0.0512,
+      "step": 7920,
+      "task_loss": 0.006217729300260544
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.04985890910029411,
+      "epoch": 3.77,
+      "learning_rate": 4.94061757719715e-06,
+      "loss": 0.0897,
+      "step": 7930,
+      "task_loss": 0.20359504222869873
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.05362790822982788,
+      "epoch": 3.77,
+      "learning_rate": 4.921615201900238e-06,
+      "loss": 0.0983,
+      "step": 7940,
+      "task_loss": 0.017056990414857864
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.03559091314673424,
+      "epoch": 3.78,
+      "learning_rate": 4.9026128266033255e-06,
+      "loss": 0.1052,
+      "step": 7950,
+      "task_loss": 0.020256590098142624
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.012951750308275223,
+      "epoch": 3.78,
+      "learning_rate": 4.883610451306413e-06,
+      "loss": 0.0661,
+      "step": 7960,
+      "task_loss": 0.002787817269563675
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.018247317522764206,
+      "epoch": 3.79,
+      "learning_rate": 4.864608076009501e-06,
+      "loss": 0.0567,
+      "step": 7970,
+      "task_loss": 0.005940131843090057
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.01746300235390663,
+      "epoch": 3.79,
+      "learning_rate": 4.84560570071259e-06,
+      "loss": 0.0532,
+      "step": 7980,
+      "task_loss": 0.14214222133159637
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.06765209138393402,
+      "epoch": 3.8,
+      "learning_rate": 4.826603325415678e-06,
+      "loss": 0.1438,
+      "step": 7990,
+      "task_loss": 0.02867637202143669
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.08068070560693741,
+      "epoch": 3.8,
+      "learning_rate": 4.807600950118766e-06,
+      "loss": 0.0899,
+      "step": 8000,
+      "task_loss": 0.22640740871429443
+    },
+    {
+      "epoch": 3.8,
+      "eval_accuracy": 0.9174311926605505,
+      "eval_loss": 0.2796386182308197,
+      "eval_runtime": 22.3683,
+      "eval_samples_per_second": 38.984,
+      "eval_steps_per_second": 4.873,
+      "step": 8000
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.025517662987113,
+      "epoch": 3.81,
+      "learning_rate": 4.7885985748218535e-06,
+      "loss": 0.0794,
+      "step": 8010,
+      "task_loss": 0.009171344339847565
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.030752386897802353,
+      "epoch": 3.81,
+      "learning_rate": 4.769596199524941e-06,
+      "loss": 0.0868,
+      "step": 8020,
+      "task_loss": 0.004918545484542847
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.024365507066249847,
+      "epoch": 3.81,
+      "learning_rate": 4.750593824228028e-06,
+      "loss": 0.0529,
+      "step": 8030,
+      "task_loss": 0.05997316166758537
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.1693132221698761,
+      "epoch": 3.82,
+      "learning_rate": 4.731591448931116e-06,
+      "loss": 0.1007,
+      "step": 8040,
+      "task_loss": 0.2038179188966751
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.030011361464858055,
+      "epoch": 3.82,
+      "learning_rate": 4.712589073634204e-06,
+      "loss": 0.1164,
+      "step": 8050,
+      "task_loss": 0.016198869794607162
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.03136039152741432,
+      "epoch": 3.83,
+      "learning_rate": 4.693586698337293e-06,
+      "loss": 0.0714,
+      "step": 8060,
+      "task_loss": 0.012641217559576035
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.026943452656269073,
+      "epoch": 3.83,
+      "learning_rate": 4.674584323040381e-06,
+      "loss": 0.0604,
+      "step": 8070,
+      "task_loss": 0.003778461366891861
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.0963515192270279,
+      "epoch": 3.84,
+      "learning_rate": 4.6555819477434686e-06,
+      "loss": 0.0641,
+      "step": 8080,
+      "task_loss": 0.1955173909664154
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.06787940859794617,
+      "epoch": 3.84,
+      "learning_rate": 4.636579572446556e-06,
+      "loss": 0.07,
+      "step": 8090,
+      "task_loss": 0.029893912374973297
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.18741311132907867,
+      "epoch": 3.85,
+      "learning_rate": 4.617577197149644e-06,
+      "loss": 0.0773,
+      "step": 8100,
+      "task_loss": 0.12163101881742477
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.07996465265750885,
+      "epoch": 3.85,
+      "learning_rate": 4.598574821852732e-06,
+      "loss": 0.1448,
+      "step": 8110,
+      "task_loss": 0.04173227399587631
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.04726814478635788,
+      "epoch": 3.86,
+      "learning_rate": 4.57957244655582e-06,
+      "loss": 0.0407,
+      "step": 8120,
+      "task_loss": 0.0074256956577301025
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.1402328610420227,
+      "epoch": 3.86,
+      "learning_rate": 4.560570071258908e-06,
+      "loss": 0.0503,
+      "step": 8130,
+      "task_loss": 0.07385687530040741
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.2221362441778183,
+      "epoch": 3.87,
+      "learning_rate": 4.541567695961996e-06,
+      "loss": 0.1239,
+      "step": 8140,
+      "task_loss": 0.16662901639938354
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.027472279965877533,
+      "epoch": 3.87,
+      "learning_rate": 4.522565320665084e-06,
+      "loss": 0.0452,
+      "step": 8150,
+      "task_loss": 0.007453102618455887
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.04676855728030205,
+      "epoch": 3.88,
+      "learning_rate": 4.5035629453681715e-06,
+      "loss": 0.1059,
+      "step": 8160,
+      "task_loss": 0.15876612067222595
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.025045206770300865,
+      "epoch": 3.88,
+      "learning_rate": 4.484560570071259e-06,
+      "loss": 0.1289,
+      "step": 8170,
+      "task_loss": 0.004179120063781738
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.011216237209737301,
+      "epoch": 3.89,
+      "learning_rate": 4.465558194774347e-06,
+      "loss": 0.1117,
+      "step": 8180,
+      "task_loss": 0.0025112181901931763
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.013475890271365643,
+      "epoch": 3.89,
+      "learning_rate": 4.446555819477435e-06,
+      "loss": 0.0861,
+      "step": 8190,
+      "task_loss": 0.0036102384328842163
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.029119327664375305,
+      "epoch": 3.9,
+      "learning_rate": 4.427553444180523e-06,
+      "loss": 0.0486,
+      "step": 8200,
+      "task_loss": 0.0045392923057079315
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.013042573817074299,
+      "epoch": 3.9,
+      "learning_rate": 4.408551068883611e-06,
+      "loss": 0.0941,
+      "step": 8210,
+      "task_loss": 0.11527097970247269
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02264486812055111,
+      "epoch": 3.9,
+      "learning_rate": 4.389548693586699e-06,
+      "loss": 0.0687,
+      "step": 8220,
+      "task_loss": 0.04810720682144165
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.007877323776483536,
+      "epoch": 3.91,
+      "learning_rate": 4.3705463182897865e-06,
+      "loss": 0.0728,
+      "step": 8230,
+      "task_loss": 0.00500917062163353
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.030022092163562775,
+      "epoch": 3.91,
+      "learning_rate": 4.351543942992874e-06,
+      "loss": 0.0973,
+      "step": 8240,
+      "task_loss": 0.02026822790503502
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.20027883350849152,
+      "epoch": 3.92,
+      "learning_rate": 4.332541567695962e-06,
+      "loss": 0.1102,
+      "step": 8250,
+      "task_loss": 0.12834565341472626
+    },
+    {
+      "epoch": 3.92,
+      "eval_accuracy": 0.9162844036697247,
+      "eval_loss": 0.26672619581222534,
+      "eval_runtime": 22.291,
+      "eval_samples_per_second": 39.119,
+      "eval_steps_per_second": 4.89,
+      "step": 8250
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.12070262432098389,
+      "epoch": 3.92,
+      "learning_rate": 4.31353919239905e-06,
+      "loss": 0.1236,
+      "step": 8260,
+      "task_loss": 0.2388094961643219
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.15468746423721313,
+      "epoch": 3.93,
+      "learning_rate": 4.294536817102138e-06,
+      "loss": 0.1083,
+      "step": 8270,
+      "task_loss": 0.23200759291648865
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.18423910439014435,
+      "epoch": 3.93,
+      "learning_rate": 4.275534441805226e-06,
+      "loss": 0.0713,
+      "step": 8280,
+      "task_loss": 0.09041578322649002
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.021810609847307205,
+      "epoch": 3.94,
+      "learning_rate": 4.256532066508314e-06,
+      "loss": 0.0409,
+      "step": 8290,
+      "task_loss": 0.0035596080124378204
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.11239130049943924,
+      "epoch": 3.94,
+      "learning_rate": 4.2375296912114015e-06,
+      "loss": 0.0715,
+      "step": 8300,
+      "task_loss": 0.18248476088047028
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.08229388296604156,
+      "epoch": 3.95,
+      "learning_rate": 4.218527315914489e-06,
+      "loss": 0.1268,
+      "step": 8310,
+      "task_loss": 0.062466811388731
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.020841067656874657,
+      "epoch": 3.95,
+      "learning_rate": 4.199524940617577e-06,
+      "loss": 0.0958,
+      "step": 8320,
+      "task_loss": 0.01168619841337204
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.07428955286741257,
+      "epoch": 3.96,
+      "learning_rate": 4.180522565320665e-06,
+      "loss": 0.1132,
+      "step": 8330,
+      "task_loss": 0.020049307495355606
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.14601218700408936,
+      "epoch": 3.96,
+      "learning_rate": 4.161520190023753e-06,
+      "loss": 0.1099,
+      "step": 8340,
+      "task_loss": 0.06591594219207764
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.014185711741447449,
+      "epoch": 3.97,
+      "learning_rate": 4.142517814726842e-06,
+      "loss": 0.0869,
+      "step": 8350,
+      "task_loss": 0.11826062202453613
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.003288624342530966,
+      "epoch": 3.97,
+      "learning_rate": 4.1235154394299296e-06,
+      "loss": 0.0704,
+      "step": 8360,
+      "task_loss": 0.008070297539234161
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.0797647014260292,
+      "epoch": 3.98,
+      "learning_rate": 4.104513064133017e-06,
+      "loss": 0.0894,
+      "step": 8370,
+      "task_loss": 0.10876837372779846
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.04628456011414528,
+      "epoch": 3.98,
+      "learning_rate": 4.0855106888361044e-06,
+      "loss": 0.1141,
+      "step": 8380,
+      "task_loss": 0.2866939604282379
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.012273245491087437,
+      "epoch": 3.99,
+      "learning_rate": 4.066508313539192e-06,
+      "loss": 0.1052,
+      "step": 8390,
+      "task_loss": 0.14446358382701874
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.06594362109899521,
+      "epoch": 3.99,
+      "learning_rate": 4.04750593824228e-06,
+      "loss": 0.052,
+      "step": 8400,
+      "task_loss": 0.02517566829919815
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.017545923590660095,
+      "epoch": 4.0,
+      "learning_rate": 4.028503562945368e-06,
+      "loss": 0.0642,
+      "step": 8410,
+      "task_loss": 0.006249226629734039
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.04165567085146904,
+      "epoch": 4.0,
+      "learning_rate": 4.009501187648456e-06,
+      "loss": 0.062,
+      "step": 8420,
+      "task_loss": 0.008884565904736519
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.06637102365493774,
+      "epoch": 4.0,
+      "learning_rate": 3.990498812351545e-06,
+      "loss": 0.0575,
+      "step": 8430,
+      "task_loss": 0.11581560969352722
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.03454139828681946,
+      "epoch": 4.01,
+      "learning_rate": 3.9714964370546325e-06,
+      "loss": 0.0857,
+      "step": 8440,
+      "task_loss": 0.025920506566762924
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.12666280567646027,
+      "epoch": 4.01,
+      "learning_rate": 3.95249406175772e-06,
+      "loss": 0.052,
+      "step": 8450,
+      "task_loss": 0.22069977223873138
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.12481103837490082,
+      "epoch": 4.02,
+      "learning_rate": 3.933491686460808e-06,
+      "loss": 0.0905,
+      "step": 8460,
+      "task_loss": 0.34771737456321716
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.027316780760884285,
+      "epoch": 4.02,
+      "learning_rate": 3.914489311163896e-06,
+      "loss": 0.045,
+      "step": 8470,
+      "task_loss": 0.008249737322330475
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.019640466198325157,
+      "epoch": 4.03,
+      "learning_rate": 3.895486935866984e-06,
+      "loss": 0.0346,
+      "step": 8480,
+      "task_loss": 0.0024937279522418976
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02247186377644539,
+      "epoch": 4.03,
+      "learning_rate": 3.876484560570072e-06,
+      "loss": 0.0312,
+      "step": 8490,
+      "task_loss": 0.014447018504142761
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.010977246798574924,
+      "epoch": 4.04,
+      "learning_rate": 3.857482185273159e-06,
+      "loss": 0.061,
+      "step": 8500,
+      "task_loss": 0.03449002653360367
+    },
+    {
+      "epoch": 4.04,
+      "eval_accuracy": 0.9174311926605505,
+      "eval_loss": 0.283713161945343,
+      "eval_runtime": 22.1383,
+      "eval_samples_per_second": 39.389,
+      "eval_steps_per_second": 4.924,
+      "step": 8500
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.03942285478115082,
+      "epoch": 4.04,
+      "learning_rate": 3.8384798099762475e-06,
+      "loss": 0.0416,
+      "step": 8510,
+      "task_loss": 0.005950760096311569
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.014301864430308342,
+      "epoch": 4.05,
+      "learning_rate": 3.819477434679335e-06,
+      "loss": 0.069,
+      "step": 8520,
+      "task_loss": 0.0029358714818954468
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.039006754755973816,
+      "epoch": 4.05,
+      "learning_rate": 3.800475059382423e-06,
+      "loss": 0.0629,
+      "step": 8530,
+      "task_loss": 0.19067448377609253
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.07345309853553772,
+      "epoch": 4.06,
+      "learning_rate": 3.781472684085511e-06,
+      "loss": 0.0515,
+      "step": 8540,
+      "task_loss": 0.18440312147140503
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.0816812664270401,
+      "epoch": 4.06,
+      "learning_rate": 3.762470308788599e-06,
+      "loss": 0.0785,
+      "step": 8550,
+      "task_loss": 0.1739380657672882
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.07522659748792648,
+      "epoch": 4.07,
+      "learning_rate": 3.743467933491687e-06,
+      "loss": 0.089,
+      "step": 8560,
+      "task_loss": 0.027755912393331528
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.2582024037837982,
+      "epoch": 4.07,
+      "learning_rate": 3.7244655581947747e-06,
+      "loss": 0.1013,
+      "step": 8570,
+      "task_loss": 0.27483993768692017
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.011575591750442982,
+      "epoch": 4.08,
+      "learning_rate": 3.7054631828978625e-06,
+      "loss": 0.051,
+      "step": 8580,
+      "task_loss": 0.004382755607366562
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.043528296053409576,
+      "epoch": 4.08,
+      "learning_rate": 3.6864608076009504e-06,
+      "loss": 0.0395,
+      "step": 8590,
+      "task_loss": 0.01687121018767357
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.06470053642988205,
+      "epoch": 4.09,
+      "learning_rate": 3.6674584323040387e-06,
+      "loss": 0.0767,
+      "step": 8600,
+      "task_loss": 0.12226305902004242
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.16258491575717926,
+      "epoch": 4.09,
+      "learning_rate": 3.6484560570071265e-06,
+      "loss": 0.0875,
+      "step": 8610,
+      "task_loss": 0.12856589257717133
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.012949327006936073,
+      "epoch": 4.1,
+      "learning_rate": 3.629453681710214e-06,
+      "loss": 0.0215,
+      "step": 8620,
+      "task_loss": 0.006088566035032272
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.22730335593223572,
+      "epoch": 4.1,
+      "learning_rate": 3.610451306413302e-06,
+      "loss": 0.0768,
+      "step": 8630,
+      "task_loss": 0.3216843008995056
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02113342471420765,
+      "epoch": 4.1,
+      "learning_rate": 3.5914489311163897e-06,
+      "loss": 0.0908,
+      "step": 8640,
+      "task_loss": 0.3219309449195862
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.3414075970649719,
+      "epoch": 4.11,
+      "learning_rate": 3.5724465558194776e-06,
+      "loss": 0.1158,
+      "step": 8650,
+      "task_loss": 0.39104947447776794
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.017360147088766098,
+      "epoch": 4.11,
+      "learning_rate": 3.5534441805225654e-06,
+      "loss": 0.1378,
+      "step": 8660,
+      "task_loss": 0.005871061235666275
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.03779000788927078,
+      "epoch": 4.12,
+      "learning_rate": 3.5344418052256533e-06,
+      "loss": 0.0691,
+      "step": 8670,
+      "task_loss": 0.0107310451567173
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.2806292772293091,
+      "epoch": 4.12,
+      "learning_rate": 3.5154394299287416e-06,
+      "loss": 0.1005,
+      "step": 8680,
+      "task_loss": 0.3259883522987366
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.06188252195715904,
+      "epoch": 4.13,
+      "learning_rate": 3.4964370546318295e-06,
+      "loss": 0.0653,
+      "step": 8690,
+      "task_loss": 0.12273290753364563
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.09528631716966629,
+      "epoch": 4.13,
+      "learning_rate": 3.4774346793349173e-06,
+      "loss": 0.065,
+      "step": 8700,
+      "task_loss": 0.18623818457126617
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.041278135031461716,
+      "epoch": 4.14,
+      "learning_rate": 3.458432304038005e-06,
+      "loss": 0.0581,
+      "step": 8710,
+      "task_loss": 0.02173343300819397
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.012133522890508175,
+      "epoch": 4.14,
+      "learning_rate": 3.439429928741093e-06,
+      "loss": 0.0752,
+      "step": 8720,
+      "task_loss": 0.3635708689689636
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02899017184972763,
+      "epoch": 4.15,
+      "learning_rate": 3.4204275534441805e-06,
+      "loss": 0.0563,
+      "step": 8730,
+      "task_loss": 0.11488357186317444
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.18777720630168915,
+      "epoch": 4.15,
+      "learning_rate": 3.4014251781472683e-06,
+      "loss": 0.0686,
+      "step": 8740,
+      "task_loss": 0.11035171151161194
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.1590900421142578,
+      "epoch": 4.16,
+      "learning_rate": 3.382422802850356e-06,
+      "loss": 0.0594,
+      "step": 8750,
+      "task_loss": 0.05561506748199463
+    },
+    {
+      "epoch": 4.16,
+      "eval_accuracy": 0.9151376146788991,
+      "eval_loss": 0.2766323983669281,
+      "eval_runtime": 22.3282,
+      "eval_samples_per_second": 39.054,
+      "eval_steps_per_second": 4.882,
+      "step": 8750
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.06593011319637299,
+      "epoch": 4.16,
+      "learning_rate": 3.3634204275534445e-06,
+      "loss": 0.0356,
+      "step": 8760,
+      "task_loss": 0.0075419507920742035
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.019471261650323868,
+      "epoch": 4.17,
+      "learning_rate": 3.3444180522565324e-06,
+      "loss": 0.0346,
+      "step": 8770,
+      "task_loss": 0.004301343113183975
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.13416732847690582,
+      "epoch": 4.17,
+      "learning_rate": 3.3254156769596202e-06,
+      "loss": 0.0982,
+      "step": 8780,
+      "task_loss": 0.23774561285972595
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.006739257834851742,
+      "epoch": 4.18,
+      "learning_rate": 3.306413301662708e-06,
+      "loss": 0.0593,
+      "step": 8790,
+      "task_loss": 0.0027306489646434784
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.04961520433425903,
+      "epoch": 4.18,
+      "learning_rate": 3.287410926365796e-06,
+      "loss": 0.0887,
+      "step": 8800,
+      "task_loss": 0.040518369525671005
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.017031384631991386,
+      "epoch": 4.19,
+      "learning_rate": 3.268408551068884e-06,
+      "loss": 0.0779,
+      "step": 8810,
+      "task_loss": 0.006137076765298843
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.010000055655837059,
+      "epoch": 4.19,
+      "learning_rate": 3.249406175771972e-06,
+      "loss": 0.0801,
+      "step": 8820,
+      "task_loss": 0.006373908370733261
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.08995574712753296,
+      "epoch": 4.19,
+      "learning_rate": 3.23040380047506e-06,
+      "loss": 0.0486,
+      "step": 8830,
+      "task_loss": 0.13840673863887787
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.018883461132645607,
+      "epoch": 4.2,
+      "learning_rate": 3.211401425178148e-06,
+      "loss": 0.0651,
+      "step": 8840,
+      "task_loss": 0.002827569842338562
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.038998525589704514,
+      "epoch": 4.2,
+      "learning_rate": 3.1923990498812353e-06,
+      "loss": 0.0606,
+      "step": 8850,
+      "task_loss": 0.014568600803613663
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.026058971881866455,
+      "epoch": 4.21,
+      "learning_rate": 3.173396674584323e-06,
+      "loss": 0.0357,
+      "step": 8860,
+      "task_loss": 0.029363825917243958
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.024625074118375778,
+      "epoch": 4.21,
+      "learning_rate": 3.154394299287411e-06,
+      "loss": 0.0547,
+      "step": 8870,
+      "task_loss": 0.004386581480503082
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.020818475633859634,
+      "epoch": 4.22,
+      "learning_rate": 3.135391923990499e-06,
+      "loss": 0.0566,
+      "step": 8880,
+      "task_loss": 0.0970916673541069
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.015941135585308075,
+      "epoch": 4.22,
+      "learning_rate": 3.1163895486935867e-06,
+      "loss": 0.0339,
+      "step": 8890,
+      "task_loss": 0.003059886395931244
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.020430579781532288,
+      "epoch": 4.23,
+      "learning_rate": 3.097387173396675e-06,
+      "loss": 0.035,
+      "step": 8900,
+      "task_loss": 0.0052056461572647095
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.15293776988983154,
+      "epoch": 4.23,
+      "learning_rate": 3.078384798099763e-06,
+      "loss": 0.0453,
+      "step": 8910,
+      "task_loss": 0.09499054402112961
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.15551890432834625,
+      "epoch": 4.24,
+      "learning_rate": 3.0593824228028507e-06,
+      "loss": 0.0701,
+      "step": 8920,
+      "task_loss": 0.10638581216335297
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.11212066560983658,
+      "epoch": 4.24,
+      "learning_rate": 3.0403800475059386e-06,
+      "loss": 0.0544,
+      "step": 8930,
+      "task_loss": 0.19491392374038696
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.016818024218082428,
+      "epoch": 4.25,
+      "learning_rate": 3.0213776722090264e-06,
+      "loss": 0.0488,
+      "step": 8940,
+      "task_loss": 0.2066863775253296
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.0049272989854216576,
+      "epoch": 4.25,
+      "learning_rate": 3.0023752969121143e-06,
+      "loss": 0.0446,
+      "step": 8950,
+      "task_loss": 0.007535018026828766
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.03029637783765793,
+      "epoch": 4.26,
+      "learning_rate": 2.9833729216152017e-06,
+      "loss": 0.0593,
+      "step": 8960,
+      "task_loss": 0.3232007324695587
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.07497206330299377,
+      "epoch": 4.26,
+      "learning_rate": 2.9643705463182896e-06,
+      "loss": 0.087,
+      "step": 8970,
+      "task_loss": 0.18494026362895966
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.04392876848578453,
+      "epoch": 4.27,
+      "learning_rate": 2.945368171021378e-06,
+      "loss": 0.0731,
+      "step": 8980,
+      "task_loss": 0.1489235907793045
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.01495037879794836,
+      "epoch": 4.27,
+      "learning_rate": 2.9263657957244658e-06,
+      "loss": 0.0548,
+      "step": 8990,
+      "task_loss": 0.0053473226726055145
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.08316938579082489,
+      "epoch": 4.28,
+      "learning_rate": 2.9073634204275536e-06,
+      "loss": 0.1062,
+      "step": 9000,
+      "task_loss": 0.029360707849264145
+    },
+    {
+      "epoch": 4.28,
+      "eval_accuracy": 0.9139908256880734,
+      "eval_loss": 0.2777394950389862,
+      "eval_runtime": 22.1404,
+      "eval_samples_per_second": 39.385,
+      "eval_steps_per_second": 4.923,
+      "step": 9000
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.023107042536139488,
+      "epoch": 4.28,
+      "learning_rate": 2.8883610451306415e-06,
+      "loss": 0.0757,
+      "step": 9010,
+      "task_loss": 0.00649937242269516
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.030949320644140244,
+      "epoch": 4.29,
+      "learning_rate": 2.8693586698337293e-06,
+      "loss": 0.0622,
+      "step": 9020,
+      "task_loss": 0.017877578735351562
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02196587808430195,
+      "epoch": 4.29,
+      "learning_rate": 2.850356294536817e-06,
+      "loss": 0.0572,
+      "step": 9030,
+      "task_loss": 0.014159291982650757
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.05502418801188469,
+      "epoch": 4.29,
+      "learning_rate": 2.8313539192399055e-06,
+      "loss": 0.0318,
+      "step": 9040,
+      "task_loss": 0.20975813269615173
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.013882439583539963,
+      "epoch": 4.3,
+      "learning_rate": 2.8123515439429934e-06,
+      "loss": 0.0576,
+      "step": 9050,
+      "task_loss": 0.09990142285823822
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.010699973441660404,
+      "epoch": 4.3,
+      "learning_rate": 2.7933491686460812e-06,
+      "loss": 0.0913,
+      "step": 9060,
+      "task_loss": 0.0033752471208572388
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.01082450058311224,
+      "epoch": 4.31,
+      "learning_rate": 2.774346793349169e-06,
+      "loss": 0.0409,
+      "step": 9070,
+      "task_loss": 0.004309527575969696
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.016541698947548866,
+      "epoch": 4.31,
+      "learning_rate": 2.7553444180522565e-06,
+      "loss": 0.0631,
+      "step": 9080,
+      "task_loss": 0.002968382090330124
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.006014951970428228,
+      "epoch": 4.32,
+      "learning_rate": 2.7363420427553444e-06,
+      "loss": 0.0573,
+      "step": 9090,
+      "task_loss": 0.1005413755774498
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.22611913084983826,
+      "epoch": 4.32,
+      "learning_rate": 2.7173396674584322e-06,
+      "loss": 0.1065,
+      "step": 9100,
+      "task_loss": 0.13456928730010986
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.03514198213815689,
+      "epoch": 4.33,
+      "learning_rate": 2.69833729216152e-06,
+      "loss": 0.0453,
+      "step": 9110,
+      "task_loss": 0.0037595927715301514
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.016409728676080704,
+      "epoch": 4.33,
+      "learning_rate": 2.6793349168646084e-06,
+      "loss": 0.0601,
+      "step": 9120,
+      "task_loss": 0.005627848207950592
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.016438759863376617,
+      "epoch": 4.34,
+      "learning_rate": 2.6603325415676963e-06,
+      "loss": 0.088,
+      "step": 9130,
+      "task_loss": 0.004075001925230026
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.048099249601364136,
+      "epoch": 4.34,
+      "learning_rate": 2.641330166270784e-06,
+      "loss": 0.0575,
+      "step": 9140,
+      "task_loss": 0.021824389696121216
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.03359050303697586,
+      "epoch": 4.35,
+      "learning_rate": 2.622327790973872e-06,
+      "loss": 0.0541,
+      "step": 9150,
+      "task_loss": 0.010654143989086151
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.35756510496139526,
+      "epoch": 4.35,
+      "learning_rate": 2.60332541567696e-06,
+      "loss": 0.0756,
+      "step": 9160,
+      "task_loss": 0.1754165142774582
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.015034861862659454,
+      "epoch": 4.36,
+      "learning_rate": 2.5843230403800477e-06,
+      "loss": 0.1075,
+      "step": 9170,
+      "task_loss": 0.008607205003499985
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.26236093044281006,
+      "epoch": 4.36,
+      "learning_rate": 2.565320665083136e-06,
+      "loss": 0.121,
+      "step": 9180,
+      "task_loss": 0.14051243662834167
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.024604659527540207,
+      "epoch": 4.37,
+      "learning_rate": 2.546318289786224e-06,
+      "loss": 0.0556,
+      "step": 9190,
+      "task_loss": 0.12805365025997162
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.041992854326963425,
+      "epoch": 4.37,
+      "learning_rate": 2.5273159144893113e-06,
+      "loss": 0.1061,
+      "step": 9200,
+      "task_loss": 0.007956545799970627
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.15274813771247864,
+      "epoch": 4.38,
+      "learning_rate": 2.508313539192399e-06,
+      "loss": 0.0563,
+      "step": 9210,
+      "task_loss": 0.06382400542497635
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.1668039709329605,
+      "epoch": 4.38,
+      "learning_rate": 2.4893111638954874e-06,
+      "loss": 0.0911,
+      "step": 9220,
+      "task_loss": 0.32040756940841675
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.08066344261169434,
+      "epoch": 4.38,
+      "learning_rate": 2.470308788598575e-06,
+      "loss": 0.0624,
+      "step": 9230,
+      "task_loss": 0.06633038818836212
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.021942077204585075,
+      "epoch": 4.39,
+      "learning_rate": 2.4513064133016627e-06,
+      "loss": 0.0486,
+      "step": 9240,
+      "task_loss": 0.00539054349064827
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.021917153149843216,
+      "epoch": 4.39,
+      "learning_rate": 2.4323040380047506e-06,
+      "loss": 0.0751,
+      "step": 9250,
+      "task_loss": 0.03953142464160919
+    },
+    {
+      "epoch": 4.39,
+      "eval_accuracy": 0.9220183486238532,
+      "eval_loss": 0.2689874768257141,
+      "eval_runtime": 22.1397,
+      "eval_samples_per_second": 39.386,
+      "eval_steps_per_second": 4.923,
+      "step": 9250
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.013196494430303574,
+      "epoch": 4.4,
+      "learning_rate": 2.413301662707839e-06,
+      "loss": 0.0473,
+      "step": 9260,
+      "task_loss": 0.004326473921537399
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.04993726685643196,
+      "epoch": 4.4,
+      "learning_rate": 2.3942992874109268e-06,
+      "loss": 0.0502,
+      "step": 9270,
+      "task_loss": 0.03252362832427025
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.06098417192697525,
+      "epoch": 4.41,
+      "learning_rate": 2.375296912114014e-06,
+      "loss": 0.0519,
+      "step": 9280,
+      "task_loss": 0.04169990494847298
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.030414171516895294,
+      "epoch": 4.41,
+      "learning_rate": 2.356294536817102e-06,
+      "loss": 0.0838,
+      "step": 9290,
+      "task_loss": 0.008283041417598724
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.04094374552369118,
+      "epoch": 4.42,
+      "learning_rate": 2.3372921615201903e-06,
+      "loss": 0.0802,
+      "step": 9300,
+      "task_loss": 0.01785259321331978
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.022956877946853638,
+      "epoch": 4.42,
+      "learning_rate": 2.318289786223278e-06,
+      "loss": 0.0409,
+      "step": 9310,
+      "task_loss": 0.1291673481464386
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.42349928617477417,
+      "epoch": 4.43,
+      "learning_rate": 2.299287410926366e-06,
+      "loss": 0.1074,
+      "step": 9320,
+      "task_loss": 0.4811912178993225
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.021385155618190765,
+      "epoch": 4.43,
+      "learning_rate": 2.280285035629454e-06,
+      "loss": 0.0496,
+      "step": 9330,
+      "task_loss": 0.006091751158237457
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.013296818360686302,
+      "epoch": 4.44,
+      "learning_rate": 2.261282660332542e-06,
+      "loss": 0.0448,
+      "step": 9340,
+      "task_loss": 0.007342435419559479
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.0073758745566010475,
+      "epoch": 4.44,
+      "learning_rate": 2.2422802850356297e-06,
+      "loss": 0.074,
+      "step": 9350,
+      "task_loss": 0.0043178461492061615
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.017247337847948074,
+      "epoch": 4.45,
+      "learning_rate": 2.2232779097387175e-06,
+      "loss": 0.0686,
+      "step": 9360,
+      "task_loss": 0.002873547375202179
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.13869813084602356,
+      "epoch": 4.45,
+      "learning_rate": 2.2042755344418054e-06,
+      "loss": 0.0623,
+      "step": 9370,
+      "task_loss": 0.12998461723327637
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.037029024213552475,
+      "epoch": 4.46,
+      "learning_rate": 2.1852731591448932e-06,
+      "loss": 0.0671,
+      "step": 9380,
+      "task_loss": 0.21687030792236328
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.01134589221328497,
+      "epoch": 4.46,
+      "learning_rate": 2.166270783847981e-06,
+      "loss": 0.057,
+      "step": 9390,
+      "task_loss": 0.19388972222805023
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.06970459967851639,
+      "epoch": 4.47,
+      "learning_rate": 2.147268408551069e-06,
+      "loss": 0.0584,
+      "step": 9400,
+      "task_loss": 0.02260136976838112
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.009460528381168842,
+      "epoch": 4.47,
+      "learning_rate": 2.128266033254157e-06,
+      "loss": 0.045,
+      "step": 9410,
+      "task_loss": 0.12310618162155151
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.11489342153072357,
+      "epoch": 4.48,
+      "learning_rate": 2.1092636579572447e-06,
+      "loss": 0.0371,
+      "step": 9420,
+      "task_loss": 0.08696582913398743
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.07559200376272202,
+      "epoch": 4.48,
+      "learning_rate": 2.0902612826603326e-06,
+      "loss": 0.0618,
+      "step": 9430,
+      "task_loss": 0.3284105062484741
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.07968532294034958,
+      "epoch": 4.48,
+      "learning_rate": 2.071258907363421e-06,
+      "loss": 0.1117,
+      "step": 9440,
+      "task_loss": 0.07459443807601929
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.28972363471984863,
+      "epoch": 4.49,
+      "learning_rate": 2.0522565320665087e-06,
+      "loss": 0.0473,
+      "step": 9450,
+      "task_loss": 0.16974028944969177
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02968553639948368,
+      "epoch": 4.49,
+      "learning_rate": 2.033254156769596e-06,
+      "loss": 0.0684,
+      "step": 9460,
+      "task_loss": 0.2634113132953644
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.030829662457108498,
+      "epoch": 4.5,
+      "learning_rate": 2.014251781472684e-06,
+      "loss": 0.0682,
+      "step": 9470,
+      "task_loss": 0.03290770202875137
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.03441692143678665,
+      "epoch": 4.5,
+      "learning_rate": 1.9952494061757723e-06,
+      "loss": 0.108,
+      "step": 9480,
+      "task_loss": 0.011277075856924057
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.08928455412387848,
+      "epoch": 4.51,
+      "learning_rate": 1.97624703087886e-06,
+      "loss": 0.0582,
+      "step": 9490,
+      "task_loss": 0.07190164923667908
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.01250946894288063,
+      "epoch": 4.51,
+      "learning_rate": 1.957244655581948e-06,
+      "loss": 0.0386,
+      "step": 9500,
+      "task_loss": 0.009386200457811356
+    },
+    {
+      "epoch": 4.51,
+      "eval_accuracy": 0.9162844036697247,
+      "eval_loss": 0.2667511999607086,
+      "eval_runtime": 22.1403,
+      "eval_samples_per_second": 39.385,
+      "eval_steps_per_second": 4.923,
+      "step": 9500
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.020871058106422424,
+      "epoch": 4.52,
+      "learning_rate": 1.938242280285036e-06,
+      "loss": 0.0491,
+      "step": 9510,
+      "task_loss": 0.005145017057657242
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.03983244299888611,
+      "epoch": 4.52,
+      "learning_rate": 1.9192399049881237e-06,
+      "loss": 0.05,
+      "step": 9520,
+      "task_loss": 0.24316395819187164
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.10421483218669891,
+      "epoch": 4.53,
+      "learning_rate": 1.9002375296912114e-06,
+      "loss": 0.0836,
+      "step": 9530,
+      "task_loss": 0.06570681929588318
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.043953992426395416,
+      "epoch": 4.53,
+      "learning_rate": 1.8812351543942995e-06,
+      "loss": 0.0585,
+      "step": 9540,
+      "task_loss": 0.008459363132715225
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.058616213500499725,
+      "epoch": 4.54,
+      "learning_rate": 1.8622327790973873e-06,
+      "loss": 0.0522,
+      "step": 9550,
+      "task_loss": 0.26781898736953735
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.2032776176929474,
+      "epoch": 4.54,
+      "learning_rate": 1.8432304038004752e-06,
+      "loss": 0.0885,
+      "step": 9560,
+      "task_loss": 0.3326171636581421
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.041910551488399506,
+      "epoch": 4.55,
+      "learning_rate": 1.8242280285035633e-06,
+      "loss": 0.0669,
+      "step": 9570,
+      "task_loss": 0.004698578268289566
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02482573315501213,
+      "epoch": 4.55,
+      "learning_rate": 1.805225653206651e-06,
+      "loss": 0.0582,
+      "step": 9580,
+      "task_loss": 0.005337722599506378
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.2271367311477661,
+      "epoch": 4.56,
+      "learning_rate": 1.7862232779097388e-06,
+      "loss": 0.0514,
+      "step": 9590,
+      "task_loss": 0.1770418882369995
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.10083962976932526,
+      "epoch": 4.56,
+      "learning_rate": 1.7672209026128267e-06,
+      "loss": 0.0755,
+      "step": 9600,
+      "task_loss": 0.20249593257904053
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.03125529736280441,
+      "epoch": 4.57,
+      "learning_rate": 1.7482185273159147e-06,
+      "loss": 0.0481,
+      "step": 9610,
+      "task_loss": 0.012315116822719574
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.03914954140782356,
+      "epoch": 4.57,
+      "learning_rate": 1.7292161520190026e-06,
+      "loss": 0.0612,
+      "step": 9620,
+      "task_loss": 0.1674419790506363
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.01954154670238495,
+      "epoch": 4.57,
+      "learning_rate": 1.7102137767220902e-06,
+      "loss": 0.0653,
+      "step": 9630,
+      "task_loss": 0.06889810413122177
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.05788855999708176,
+      "epoch": 4.58,
+      "learning_rate": 1.691211401425178e-06,
+      "loss": 0.0253,
+      "step": 9640,
+      "task_loss": 0.14915083348751068
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.01645825058221817,
+      "epoch": 4.58,
+      "learning_rate": 1.6722090261282662e-06,
+      "loss": 0.0519,
+      "step": 9650,
+      "task_loss": 0.12134365737438202
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.04989754408597946,
+      "epoch": 4.59,
+      "learning_rate": 1.653206650831354e-06,
+      "loss": 0.1004,
+      "step": 9660,
+      "task_loss": 0.1625884622335434
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.019783230498433113,
+      "epoch": 4.59,
+      "learning_rate": 1.634204275534442e-06,
+      "loss": 0.0567,
+      "step": 9670,
+      "task_loss": 0.004505373537540436
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.19092217087745667,
+      "epoch": 4.6,
+      "learning_rate": 1.61520190023753e-06,
+      "loss": 0.0458,
+      "step": 9680,
+      "task_loss": 0.2721264362335205
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.10719658434391022,
+      "epoch": 4.6,
+      "learning_rate": 1.5961995249406176e-06,
+      "loss": 0.0513,
+      "step": 9690,
+      "task_loss": 0.03760954737663269
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.05233538895845413,
+      "epoch": 4.61,
+      "learning_rate": 1.5771971496437055e-06,
+      "loss": 0.0688,
+      "step": 9700,
+      "task_loss": 0.12094153463840485
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.014219870790839195,
+      "epoch": 4.61,
+      "learning_rate": 1.5581947743467934e-06,
+      "loss": 0.0581,
+      "step": 9710,
+      "task_loss": 0.0061318278312683105
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.012800279073417187,
+      "epoch": 4.62,
+      "learning_rate": 1.5391923990498814e-06,
+      "loss": 0.0268,
+      "step": 9720,
+      "task_loss": 0.0036646276712417603
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.12216310203075409,
+      "epoch": 4.62,
+      "learning_rate": 1.5201900237529693e-06,
+      "loss": 0.1461,
+      "step": 9730,
+      "task_loss": 0.13348951935768127
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.07808056473731995,
+      "epoch": 4.63,
+      "learning_rate": 1.5011876484560572e-06,
+      "loss": 0.0822,
+      "step": 9740,
+      "task_loss": 0.22806300222873688
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.010630585253238678,
+      "epoch": 4.63,
+      "learning_rate": 1.4821852731591448e-06,
+      "loss": 0.0284,
+      "step": 9750,
+      "task_loss": 0.002479270100593567
+    },
+    {
+      "epoch": 4.63,
+      "eval_accuracy": 0.9185779816513762,
+      "eval_loss": 0.2812165319919586,
+      "eval_runtime": 22.1715,
+      "eval_samples_per_second": 39.33,
+      "eval_steps_per_second": 4.916,
+      "step": 9750
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.022376172244548798,
+      "epoch": 4.64,
+      "learning_rate": 1.4631828978622329e-06,
+      "loss": 0.0637,
+      "step": 9760,
+      "task_loss": 0.0058107636868953705
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.009340550750494003,
+      "epoch": 4.64,
+      "learning_rate": 1.4441805225653207e-06,
+      "loss": 0.0344,
+      "step": 9770,
+      "task_loss": 0.0033394992351531982
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.010714657604694366,
+      "epoch": 4.65,
+      "learning_rate": 1.4251781472684086e-06,
+      "loss": 0.0541,
+      "step": 9780,
+      "task_loss": 0.004770912230014801
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.024335071444511414,
+      "epoch": 4.65,
+      "learning_rate": 1.4061757719714967e-06,
+      "loss": 0.0614,
+      "step": 9790,
+      "task_loss": 0.006472300738096237
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.07776258885860443,
+      "epoch": 4.66,
+      "learning_rate": 1.3871733966745845e-06,
+      "loss": 0.075,
+      "step": 9800,
+      "task_loss": 0.039640650153160095
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.04833105206489563,
+      "epoch": 4.66,
+      "learning_rate": 1.3681710213776722e-06,
+      "loss": 0.0351,
+      "step": 9810,
+      "task_loss": 0.0052606649696826935
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.06957106292247772,
+      "epoch": 4.67,
+      "learning_rate": 1.34916864608076e-06,
+      "loss": 0.0566,
+      "step": 9820,
+      "task_loss": 0.012664098292589188
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.018420187756419182,
+      "epoch": 4.67,
+      "learning_rate": 1.3301662707838481e-06,
+      "loss": 0.0743,
+      "step": 9830,
+      "task_loss": 0.012570954859256744
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.022069044411182404,
+      "epoch": 4.67,
+      "learning_rate": 1.311163895486936e-06,
+      "loss": 0.1181,
+      "step": 9840,
+      "task_loss": 0.2193067967891693
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.10804280638694763,
+      "epoch": 4.68,
+      "learning_rate": 1.2921615201900239e-06,
+      "loss": 0.0684,
+      "step": 9850,
+      "task_loss": 0.1666574627161026
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.021793734282255173,
+      "epoch": 4.68,
+      "learning_rate": 1.273159144893112e-06,
+      "loss": 0.0713,
+      "step": 9860,
+      "task_loss": 0.1198580265045166
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.013394506648182869,
+      "epoch": 4.69,
+      "learning_rate": 1.2541567695961996e-06,
+      "loss": 0.077,
+      "step": 9870,
+      "task_loss": 0.003841262310743332
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.024098927155137062,
+      "epoch": 4.69,
+      "learning_rate": 1.2351543942992874e-06,
+      "loss": 0.0668,
+      "step": 9880,
+      "task_loss": 0.11605469882488251
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.012392224743962288,
+      "epoch": 4.7,
+      "learning_rate": 1.2161520190023753e-06,
+      "loss": 0.0472,
+      "step": 9890,
+      "task_loss": 0.003138858824968338
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.05335499346256256,
+      "epoch": 4.7,
+      "learning_rate": 1.1971496437054634e-06,
+      "loss": 0.0593,
+      "step": 9900,
+      "task_loss": 0.18697652220726013
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.028973210602998734,
+      "epoch": 4.71,
+      "learning_rate": 1.178147268408551e-06,
+      "loss": 0.0279,
+      "step": 9910,
+      "task_loss": 0.11546307057142258
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.09816677868366241,
+      "epoch": 4.71,
+      "learning_rate": 1.159144893111639e-06,
+      "loss": 0.0397,
+      "step": 9920,
+      "task_loss": 0.0736415684223175
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.011303352192044258,
+      "epoch": 4.72,
+      "learning_rate": 1.140142517814727e-06,
+      "loss": 0.0569,
+      "step": 9930,
+      "task_loss": 0.004318550229072571
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.01750335283577442,
+      "epoch": 4.72,
+      "learning_rate": 1.1211401425178148e-06,
+      "loss": 0.0446,
+      "step": 9940,
+      "task_loss": 0.19570045173168182
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.046896446496248245,
+      "epoch": 4.73,
+      "learning_rate": 1.1021377672209027e-06,
+      "loss": 0.0291,
+      "step": 9950,
+      "task_loss": 0.07542459666728973
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.2460804581642151,
+      "epoch": 4.73,
+      "learning_rate": 1.0831353919239906e-06,
+      "loss": 0.061,
+      "step": 9960,
+      "task_loss": 0.16217932105064392
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.012438797391951084,
+      "epoch": 4.74,
+      "learning_rate": 1.0641330166270784e-06,
+      "loss": 0.0274,
+      "step": 9970,
+      "task_loss": 0.0034538879990577698
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.012422207742929459,
+      "epoch": 4.74,
+      "learning_rate": 1.0451306413301663e-06,
+      "loss": 0.0365,
+      "step": 9980,
+      "task_loss": 0.003980562090873718
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.09234738349914551,
+      "epoch": 4.75,
+      "learning_rate": 1.0261282660332544e-06,
+      "loss": 0.0777,
+      "step": 9990,
+      "task_loss": 0.07327043265104294
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.12707777321338654,
+      "epoch": 4.75,
+      "learning_rate": 1.007125890736342e-06,
+      "loss": 0.1016,
+      "step": 10000,
+      "task_loss": 0.08033004403114319
+    },
+    {
+      "epoch": 4.75,
+      "eval_accuracy": 0.9162844036697247,
+      "eval_loss": 0.2825167179107666,
+      "eval_runtime": 22.1651,
+      "eval_samples_per_second": 39.341,
+      "eval_steps_per_second": 4.918,
+      "step": 10000
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.07169970124959946,
+      "epoch": 4.76,
+      "learning_rate": 9.8812351543943e-07,
+      "loss": 0.0671,
+      "step": 10010,
+      "task_loss": 0.06848999857902527
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.013409988954663277,
+      "epoch": 4.76,
+      "learning_rate": 9.69121140142518e-07,
+      "loss": 0.0377,
+      "step": 10020,
+      "task_loss": 0.09568732976913452
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.07166978716850281,
+      "epoch": 4.76,
+      "learning_rate": 9.501187648456057e-07,
+      "loss": 0.0294,
+      "step": 10030,
+      "task_loss": 0.0640857145190239
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.04987531155347824,
+      "epoch": 4.77,
+      "learning_rate": 9.311163895486937e-07,
+      "loss": 0.0656,
+      "step": 10040,
+      "task_loss": 0.021407999098300934
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.007782880216836929,
+      "epoch": 4.77,
+      "learning_rate": 9.121140142517816e-07,
+      "loss": 0.0498,
+      "step": 10050,
+      "task_loss": 0.002929363399744034
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.021184280514717102,
+      "epoch": 4.78,
+      "learning_rate": 8.931116389548694e-07,
+      "loss": 0.0508,
+      "step": 10060,
+      "task_loss": 0.004369787871837616
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.012340977787971497,
+      "epoch": 4.78,
+      "learning_rate": 8.741092636579574e-07,
+      "loss": 0.0516,
+      "step": 10070,
+      "task_loss": 0.006207786500453949
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.1317601352930069,
+      "epoch": 4.79,
+      "learning_rate": 8.551068883610451e-07,
+      "loss": 0.0475,
+      "step": 10080,
+      "task_loss": 0.004766244441270828
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.023309551179409027,
+      "epoch": 4.79,
+      "learning_rate": 8.361045130641331e-07,
+      "loss": 0.0457,
+      "step": 10090,
+      "task_loss": 0.008698023855686188
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.046007562428712845,
+      "epoch": 4.8,
+      "learning_rate": 8.17102137767221e-07,
+      "loss": 0.1062,
+      "step": 10100,
+      "task_loss": 0.05794458091259003
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.01069045439362526,
+      "epoch": 4.8,
+      "learning_rate": 7.980997624703088e-07,
+      "loss": 0.0411,
+      "step": 10110,
+      "task_loss": 0.002656955271959305
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.021371502429246902,
+      "epoch": 4.81,
+      "learning_rate": 7.790973871733967e-07,
+      "loss": 0.0504,
+      "step": 10120,
+      "task_loss": 0.005056999623775482
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.060870636254549026,
+      "epoch": 4.81,
+      "learning_rate": 7.600950118764846e-07,
+      "loss": 0.0759,
+      "step": 10130,
+      "task_loss": 0.097642682492733
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.019857440143823624,
+      "epoch": 4.82,
+      "learning_rate": 7.410926365795724e-07,
+      "loss": 0.0455,
+      "step": 10140,
+      "task_loss": 0.005811773240566254
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.05780591070652008,
+      "epoch": 4.82,
+      "learning_rate": 7.220902612826604e-07,
+      "loss": 0.0547,
+      "step": 10150,
+      "task_loss": 0.032336119562387466
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.043222397565841675,
+      "epoch": 4.83,
+      "learning_rate": 7.030878859857483e-07,
+      "loss": 0.0618,
+      "step": 10160,
+      "task_loss": 0.002787157893180847
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.01754024438560009,
+      "epoch": 4.83,
+      "learning_rate": 6.840855106888361e-07,
+      "loss": 0.0529,
+      "step": 10170,
+      "task_loss": 0.0054728202521800995
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.007267419248819351,
+      "epoch": 4.84,
+      "learning_rate": 6.650831353919241e-07,
+      "loss": 0.0394,
+      "step": 10180,
+      "task_loss": 0.004029855132102966
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.060019608587026596,
+      "epoch": 4.84,
+      "learning_rate": 6.460807600950119e-07,
+      "loss": 0.0454,
+      "step": 10190,
+      "task_loss": 0.005614615976810455
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.06279192864894867,
+      "epoch": 4.85,
+      "learning_rate": 6.270783847980998e-07,
+      "loss": 0.0804,
+      "step": 10200,
+      "task_loss": 0.22416365146636963
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.01789834164083004,
+      "epoch": 4.85,
+      "learning_rate": 6.080760095011877e-07,
+      "loss": 0.0704,
+      "step": 10210,
+      "task_loss": 0.0029133372008800507
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.023510048165917397,
+      "epoch": 4.86,
+      "learning_rate": 5.890736342042755e-07,
+      "loss": 0.0306,
+      "step": 10220,
+      "task_loss": 0.008717145770788193
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.06903427839279175,
+      "epoch": 4.86,
+      "learning_rate": 5.700712589073635e-07,
+      "loss": 0.0636,
+      "step": 10230,
+      "task_loss": 0.02681458741426468
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.11212094873189926,
+      "epoch": 4.86,
+      "learning_rate": 5.510688836104513e-07,
+      "loss": 0.0766,
+      "step": 10240,
+      "task_loss": 0.05283607542514801
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.020517665892839432,
+      "epoch": 4.87,
+      "learning_rate": 5.320665083135392e-07,
+      "loss": 0.0507,
+      "step": 10250,
+      "task_loss": 0.006865642964839935
+    },
+    {
+      "epoch": 4.87,
+      "eval_accuracy": 0.9139908256880734,
+      "eval_loss": 0.280513197183609,
+      "eval_runtime": 22.1726,
+      "eval_samples_per_second": 39.328,
+      "eval_steps_per_second": 4.916,
+      "step": 10250
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.06565746665000916,
+      "epoch": 4.87,
+      "learning_rate": 5.130641330166272e-07,
+      "loss": 0.084,
+      "step": 10260,
+      "task_loss": 0.08022348582744598
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.041272662580013275,
+      "epoch": 4.88,
+      "learning_rate": 4.94061757719715e-07,
+      "loss": 0.0807,
+      "step": 10270,
+      "task_loss": 0.044755056500434875
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02130916342139244,
+      "epoch": 4.88,
+      "learning_rate": 4.7505938242280285e-07,
+      "loss": 0.0456,
+      "step": 10280,
+      "task_loss": 0.005585514008998871
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.008105727843940258,
+      "epoch": 4.89,
+      "learning_rate": 4.560570071258908e-07,
+      "loss": 0.0662,
+      "step": 10290,
+      "task_loss": 0.007067546248435974
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02983175590634346,
+      "epoch": 4.89,
+      "learning_rate": 4.370546318289787e-07,
+      "loss": 0.0701,
+      "step": 10300,
+      "task_loss": 0.22576290369033813
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.009511064738035202,
+      "epoch": 4.9,
+      "learning_rate": 4.1805225653206654e-07,
+      "loss": 0.085,
+      "step": 10310,
+      "task_loss": 0.007220160216093063
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.037107907235622406,
+      "epoch": 4.9,
+      "learning_rate": 3.990498812351544e-07,
+      "loss": 0.0351,
+      "step": 10320,
+      "task_loss": 0.1071058064699173
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.007235179655253887,
+      "epoch": 4.91,
+      "learning_rate": 3.800475059382423e-07,
+      "loss": 0.0385,
+      "step": 10330,
+      "task_loss": 0.002338908612728119
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.014286589808762074,
+      "epoch": 4.91,
+      "learning_rate": 3.610451306413302e-07,
+      "loss": 0.0366,
+      "step": 10340,
+      "task_loss": 0.0025917217135429382
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.02622096985578537,
+      "epoch": 4.92,
+      "learning_rate": 3.4204275534441805e-07,
+      "loss": 0.0292,
+      "step": 10350,
+      "task_loss": 0.0064817629754543304
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.011151728220283985,
+      "epoch": 4.92,
+      "learning_rate": 3.2304038004750596e-07,
+      "loss": 0.0683,
+      "step": 10360,
+      "task_loss": 0.003448858857154846
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.015970010310411453,
+      "epoch": 4.93,
+      "learning_rate": 3.040380047505938e-07,
+      "loss": 0.0584,
+      "step": 10370,
+      "task_loss": 0.005513232201337814
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.1370335817337036,
+      "epoch": 4.93,
+      "learning_rate": 2.8503562945368174e-07,
+      "loss": 0.0771,
+      "step": 10380,
+      "task_loss": 0.008187536150217056
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.022085029631853104,
+      "epoch": 4.94,
+      "learning_rate": 2.660332541567696e-07,
+      "loss": 0.0691,
+      "step": 10390,
+      "task_loss": 0.009103760123252869
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.029131721705198288,
+      "epoch": 4.94,
+      "learning_rate": 2.470308788598575e-07,
+      "loss": 0.0614,
+      "step": 10400,
+      "task_loss": 0.008749555796384811
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.01142764464020729,
+      "epoch": 4.95,
+      "learning_rate": 2.280285035629454e-07,
+      "loss": 0.0506,
+      "step": 10410,
+      "task_loss": 0.003860827535390854
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.22063679993152618,
+      "epoch": 4.95,
+      "learning_rate": 2.0902612826603327e-07,
+      "loss": 0.1186,
+      "step": 10420,
+      "task_loss": 0.411041259765625
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.00713141867890954,
+      "epoch": 4.95,
+      "learning_rate": 1.9002375296912116e-07,
+      "loss": 0.0519,
+      "step": 10430,
+      "task_loss": 0.0033237673342227936
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.08448931574821472,
+      "epoch": 4.96,
+      "learning_rate": 1.7102137767220902e-07,
+      "loss": 0.053,
+      "step": 10440,
+      "task_loss": 0.19629473984241486
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.014177798293530941,
+      "epoch": 4.96,
+      "learning_rate": 1.520190023752969e-07,
+      "loss": 0.0672,
+      "step": 10450,
+      "task_loss": 0.009178481996059418
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.13920198380947113,
+      "epoch": 4.97,
+      "learning_rate": 1.330166270783848e-07,
+      "loss": 0.0836,
+      "step": 10460,
+      "task_loss": 0.05482466146349907
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.01523562241345644,
+      "epoch": 4.97,
+      "learning_rate": 1.140142517814727e-07,
+      "loss": 0.0335,
+      "step": 10470,
+      "task_loss": 0.0879075825214386
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.015354710631072521,
+      "epoch": 4.98,
+      "learning_rate": 9.501187648456058e-08,
+      "loss": 0.0834,
+      "step": 10480,
+      "task_loss": 0.12236123532056808
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.12151747941970825,
+      "epoch": 4.98,
+      "learning_rate": 7.600950118764846e-08,
+      "loss": 0.0548,
+      "step": 10490,
+      "task_loss": 0.1053650826215744
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.07341088354587555,
+      "epoch": 4.99,
+      "learning_rate": 5.700712589073635e-08,
+      "loss": 0.0709,
+      "step": 10500,
+      "task_loss": 0.12648561596870422
+    },
+    {
+      "epoch": 4.99,
+      "eval_accuracy": 0.9139908256880734,
+      "eval_loss": 0.2854968011379242,
+      "eval_runtime": 21.9579,
+      "eval_samples_per_second": 39.712,
+      "eval_steps_per_second": 4.964,
+      "step": 10500
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.13635969161987305,
+      "epoch": 4.99,
+      "learning_rate": 3.800475059382423e-08,
+      "loss": 0.0549,
+      "step": 10510,
+      "task_loss": 0.1342453956604004
+    },
+    {
+      "compression/movement_sparsity/importance_regularization_factor": 0.05,
+      "compression/movement_sparsity/importance_threshold": 0.0,
+      "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254,
+      "compression/movement_sparsity/model_sparsity": 0.3142793903220987,
+      "compression_loss": 0.0,
+      "distillation_loss": 0.03788226097822189,
+      "epoch": 5.0,
+      "learning_rate": 1.9002375296912114e-08,
+      "loss": 0.0381,
+      "step": 10520,
+      "task_loss": 0.05685931071639061
+    },
+    {
+      "epoch": 5.0,
+      "step": 10525,
+      "total_flos": 2.220815486243328e+16,
+      "train_loss": 2.258239375927669,
+      "train_runtime": 6578.4214,
+      "train_samples_per_second": 51.189,
+      "train_steps_per_second": 1.6
     }
   ],
-  "max_steps": 2105,
-  "num_train_epochs": 1,
-  "total_flos": 4441630972486656.0,
+  "max_steps": 10525,
+  "num_train_epochs": 5,
+  "total_flos": 2.220815486243328e+16,
   "trial_name": null,
   "trial_params": null
 }