jiaqiz committed on
Commit
91827bd
·
verified ·
1 Parent(s): 31e4445

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. __init__.py +0 -0
  3. block_config.py +118 -0
  4. config.json +2961 -0
  5. configuration_decilm.py +65 -0
  6. model-00001-of-00049.safetensors +3 -0
  7. model-00002-of-00049.safetensors +3 -0
  8. model-00003-of-00049.safetensors +3 -0
  9. model-00004-of-00049.safetensors +3 -0
  10. model-00005-of-00049.safetensors +3 -0
  11. model-00006-of-00049.safetensors +3 -0
  12. model-00007-of-00049.safetensors +3 -0
  13. model-00008-of-00049.safetensors +3 -0
  14. model-00009-of-00049.safetensors +3 -0
  15. model-00010-of-00049.safetensors +3 -0
  16. model-00011-of-00049.safetensors +3 -0
  17. model-00012-of-00049.safetensors +3 -0
  18. model-00013-of-00049.safetensors +3 -0
  19. model-00014-of-00049.safetensors +3 -0
  20. model-00015-of-00049.safetensors +3 -0
  21. model-00016-of-00049.safetensors +3 -0
  22. model-00017-of-00049.safetensors +3 -0
  23. model-00018-of-00049.safetensors +3 -0
  24. model-00019-of-00049.safetensors +3 -0
  25. model-00020-of-00049.safetensors +3 -0
  26. model-00021-of-00049.safetensors +3 -0
  27. model-00022-of-00049.safetensors +3 -0
  28. model-00023-of-00049.safetensors +3 -0
  29. model-00024-of-00049.safetensors +3 -0
  30. model-00025-of-00049.safetensors +3 -0
  31. model-00026-of-00049.safetensors +3 -0
  32. model-00027-of-00049.safetensors +3 -0
  33. model-00028-of-00049.safetensors +3 -0
  34. model-00029-of-00049.safetensors +3 -0
  35. model-00030-of-00049.safetensors +3 -0
  36. model-00031-of-00049.safetensors +3 -0
  37. model-00032-of-00049.safetensors +3 -0
  38. model-00033-of-00049.safetensors +3 -0
  39. model-00034-of-00049.safetensors +3 -0
  40. model-00035-of-00049.safetensors +3 -0
  41. model-00036-of-00049.safetensors +3 -0
  42. model-00037-of-00049.safetensors +3 -0
  43. model-00038-of-00049.safetensors +3 -0
  44. model-00039-of-00049.safetensors +3 -0
  45. model-00040-of-00049.safetensors +3 -0
  46. model-00041-of-00049.safetensors +3 -0
  47. model-00042-of-00049.safetensors +3 -0
  48. model-00043-of-00049.safetensors +3 -0
  49. model-00044-of-00049.safetensors +3 -0
  50. model-00045-of-00049.safetensors +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
__init__.py ADDED
File without changes
block_config.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ import json
3
+ import warnings
4
+ from dataclasses import dataclass, MISSING
5
+ from functools import partial
6
+ from typing import Optional, Any
7
+
8
+
9
+ @partial(dataclass, frozen=True, kw_only=True)
10
+ class JsonComparable:
11
+ def to_json(self) -> str:
12
+ return json.dumps(dataclasses.asdict(self))
13
+
14
+ def __eq__(self, other: "JsonComparable") -> bool:
15
+ return self.to_json() == other.to_json()
16
+
17
+ def __hash__(self) -> int:
18
+ return hash(self.to_json())
19
+
20
+ def __lt__(self, other: "JsonComparable") -> bool:
21
+ return self.to_json() < other.to_json()
22
+
23
+
24
@partial(dataclass, frozen=True, kw_only=True)
class SubblockConfig(JsonComparable):
    """Options shared by every sub-block configuration.

    ``no_op`` and ``replace_with_linear`` are mutually exclusive; this is
    checked in ``__post_init__``.

    NOTE(review): this decorator leaves ``eq`` at its default, so ``dataclass``
    generates a field-based ``__eq__``/``__hash__`` for this class, which take
    precedence over the JSON-based ones inherited from ``JsonComparable``
    (``__lt__`` is still inherited) -- confirm this is intended.
    """

    no_op: bool = False
    replace_with_linear: bool = False
    sparsify: Optional[list[str]] = None

    def __post_init__(self):
        """Reject configurations that set both ``no_op`` and ``replace_with_linear``."""
        assert not self.no_op or not self.replace_with_linear

    def _force_setattr(self, name: str, value: Any) -> None:
        """
        Assign ``value`` to attribute ``name``, bypassing the frozen-dataclass guard.
        Intended to be called only from __post_init__!
        """
        # Frozen dataclasses block normal assignment, so go through object directly.
        object.__setattr__(self, name, value)
39
+
40
+
41
@partial(dataclass, frozen=True, kw_only=True)
class AttentionConfig(SubblockConfig):
    """Configuration of an attention sub-block.

    When the sub-block is ``no_op`` or ``replace_with_linear``, the fields
    ``n_heads_in_group``, ``window_length`` and ``num_sink_tokens`` are reset
    to ``None``; otherwise ``n_heads_in_group`` is required.
    """

    n_heads_in_group: Optional[int] = None
    window_length: Optional[int] = None
    num_sink_tokens: Optional[int] = None
    use_prefill_window_in_sink_attention: bool = False
    unshifted_sink: bool = False

    def __post_init__(self):
        """Validate the field combination and normalize irrelevant fields to ``None``."""
        super().__post_init__()
        assert not (self.no_op and self.replace_with_linear)

        attention_disabled = self.no_op or self.replace_with_linear
        if attention_disabled:
            # These settings carry no meaning without a real attention layer.
            for field_name in ("n_heads_in_group", "window_length", "num_sink_tokens"):
                self._force_setattr(field_name, None)
        else:
            assert self.n_heads_in_group is not None

        if not self.is_sink:
            return

        # The remaining checks only apply to sink attention.
        assert not (self.unshifted_sink and self.use_prefill_window_in_sink_attention), \
            ("Unshifted sink uses its own kind of explicit masking, not standard window. "
             "Set use_prefill_window_in_sink_attention to False.")
        assert self.num_sink_tokens != 0 or self.unshifted_sink, \
            "Fake sink attention with 0 sink tokens is only supported with unshifted_sink=True"

    @property
    def prefill_sliding_window(self) -> Optional[int]:
        """Return ``window_length`` when it applies as a prefill window, else ``None``.

        The window applies when ``window_length`` is set and the block is either
        not sink attention or has ``use_prefill_window_in_sink_attention`` enabled.
        """
        if self.window_length is None:
            return None
        if self.is_sink and not self.use_prefill_window_in_sink_attention:
            return None
        return self.window_length

    @property
    def is_sliding(self) -> bool:
        """True when ``prefill_sliding_window`` yields a window length."""
        window = self.prefill_sliding_window
        return window is not None

    @property
    def is_sink(self) -> bool:
        """True when both ``window_length`` and ``num_sink_tokens`` are set."""
        has_window = self.window_length is not None
        has_sink_tokens = self.num_sink_tokens is not None
        return has_window and has_sink_tokens
84
+
85
+
86
@partial(dataclass, frozen=True, kw_only=True)
class FFNConfig(SubblockConfig):
    """Configuration of a feed-forward sub-block.

    ``ffn_mult`` is required unless the sub-block is ``no_op`` or
    ``replace_with_linear``, in which case it is reset to ``None``.
    """

    ffn_mult: Optional[float] = None

    def __post_init__(self):
        """Validate ``ffn_mult`` and store it rounded to 6 decimal places."""
        super().__post_init__()
        if self.no_op or self.replace_with_linear:
            normalized_mult = None
        else:
            assert self.ffn_mult is not None
            normalized_mult = round(self.ffn_mult, 6)
        self._force_setattr("ffn_mult", normalized_mult)
97
+
98
+
99
@partial(dataclass, frozen=True, kw_only=True)
class BlockConfig(JsonComparable):
    """Configuration of one block: an attention sub-block plus an FFN sub-block.

    Each field may be passed either as an already-built config object or as a
    plain dict, which ``__post_init__`` converts into the annotated config class.

    NOTE(review): ``eq`` is left at its default here, so ``dataclass`` generates
    a field-based ``__eq__``/``__hash__`` for this class instead of using the
    JSON-based ones from ``JsonComparable`` -- confirm this is intended.
    """

    attention: AttentionConfig = MISSING
    ffn: FFNConfig = MISSING

    def __post_init__(self):
        """
        Build sub-block dataclasses from any fields that were given as dicts.

        Keys that the target dataclass does not declare are dropped, with a warning.
        """
        for field_info in dataclasses.fields(self):
            raw_value = getattr(self, field_info.name)
            if not isinstance(raw_value, dict):
                continue

            subblock_cls = field_info.type
            known_names = [f.name for f in dataclasses.fields(subblock_cls)]
            unknown_names = [key for key in raw_value.keys() if key not in known_names]
            if unknown_names:
                warnings.warn(f"Removed unsupported fields {unknown_names} from {subblock_cls.__name__}")
            accepted = {k: v for k, v in raw_value.items() if k not in unknown_names}

            # object.__setattr__ is needed because the dataclass is frozen.
            object.__setattr__(self, field_info.name, subblock_cls(**accepted))
config.json ADDED
@@ -0,0 +1,2961 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/lustre/fs1/portfolios/llmservice/users/soumyes/llama-nemotron/results/cw_smy_253b_sft_sp_ultra_blend_v1_maxlen_24k_shuffled_lr8e-6_minlr8e-7_warmup800_GBS256_rerun-step-5700/checkpoints/HF/step1800",
3
+ "architectures": [
4
+ "DeciLMForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "auto_map": {
9
+ "AutoConfig": "configuration_decilm.DeciLMConfig",
10
+ "AutoModelForCausalLM": "modeling_decilm.DeciLMForCausalLM"
11
+ },
12
+ "block_configs": [
13
+ {
14
+ "attention": {
15
+ "n_heads_in_group": 16,
16
+ "no_op": false,
17
+ "num_sink_tokens": null,
18
+ "replace_with_linear": false,
19
+ "sparsify": null,
20
+ "unshifted_sink": false,
21
+ "use_prefill_window_in_sink_attention": false,
22
+ "window_length": null
23
+ },
24
+ "ffn": {
25
+ "ffn_mult": 0.4875,
26
+ "no_op": false,
27
+ "replace_with_linear": false,
28
+ "sparsify": null
29
+ }
30
+ },
31
+ {
32
+ "attention": {
33
+ "n_heads_in_group": 16,
34
+ "no_op": false,
35
+ "num_sink_tokens": null,
36
+ "replace_with_linear": false,
37
+ "sparsify": null,
38
+ "unshifted_sink": false,
39
+ "use_prefill_window_in_sink_attention": false,
40
+ "window_length": null
41
+ },
42
+ "ffn": {
43
+ "ffn_mult": 0.975,
44
+ "no_op": false,
45
+ "replace_with_linear": false,
46
+ "sparsify": null
47
+ }
48
+ },
49
+ {
50
+ "attention": {
51
+ "n_heads_in_group": 16,
52
+ "no_op": false,
53
+ "num_sink_tokens": null,
54
+ "replace_with_linear": false,
55
+ "sparsify": null,
56
+ "unshifted_sink": false,
57
+ "use_prefill_window_in_sink_attention": false,
58
+ "window_length": null
59
+ },
60
+ "ffn": {
61
+ "ffn_mult": 1.4625,
62
+ "no_op": false,
63
+ "replace_with_linear": false,
64
+ "sparsify": null
65
+ }
66
+ },
67
+ {
68
+ "attention": {
69
+ "n_heads_in_group": 16,
70
+ "no_op": false,
71
+ "num_sink_tokens": null,
72
+ "replace_with_linear": false,
73
+ "sparsify": null,
74
+ "unshifted_sink": false,
75
+ "use_prefill_window_in_sink_attention": false,
76
+ "window_length": null
77
+ },
78
+ "ffn": {
79
+ "ffn_mult": 1.4625,
80
+ "no_op": false,
81
+ "replace_with_linear": false,
82
+ "sparsify": null
83
+ }
84
+ },
85
+ {
86
+ "attention": {
87
+ "n_heads_in_group": 16,
88
+ "no_op": false,
89
+ "num_sink_tokens": null,
90
+ "replace_with_linear": false,
91
+ "sparsify": null,
92
+ "unshifted_sink": false,
93
+ "use_prefill_window_in_sink_attention": false,
94
+ "window_length": null
95
+ },
96
+ "ffn": {
97
+ "ffn_mult": 1.4625,
98
+ "no_op": false,
99
+ "replace_with_linear": false,
100
+ "sparsify": null
101
+ }
102
+ },
103
+ {
104
+ "attention": {
105
+ "n_heads_in_group": 16,
106
+ "no_op": false,
107
+ "num_sink_tokens": null,
108
+ "replace_with_linear": false,
109
+ "sparsify": null,
110
+ "unshifted_sink": false,
111
+ "use_prefill_window_in_sink_attention": false,
112
+ "window_length": null
113
+ },
114
+ "ffn": {
115
+ "ffn_mult": 1.4625,
116
+ "no_op": false,
117
+ "replace_with_linear": false,
118
+ "sparsify": null
119
+ }
120
+ },
121
+ {
122
+ "attention": {
123
+ "n_heads_in_group": 16,
124
+ "no_op": false,
125
+ "num_sink_tokens": null,
126
+ "replace_with_linear": false,
127
+ "sparsify": null,
128
+ "unshifted_sink": false,
129
+ "use_prefill_window_in_sink_attention": false,
130
+ "window_length": null
131
+ },
132
+ "ffn": {
133
+ "ffn_mult": 1.4625,
134
+ "no_op": false,
135
+ "replace_with_linear": false,
136
+ "sparsify": null
137
+ }
138
+ },
139
+ {
140
+ "attention": {
141
+ "n_heads_in_group": 16,
142
+ "no_op": false,
143
+ "num_sink_tokens": null,
144
+ "replace_with_linear": false,
145
+ "sparsify": null,
146
+ "unshifted_sink": false,
147
+ "use_prefill_window_in_sink_attention": false,
148
+ "window_length": null
149
+ },
150
+ "ffn": {
151
+ "ffn_mult": 1.4625,
152
+ "no_op": false,
153
+ "replace_with_linear": false,
154
+ "sparsify": null
155
+ }
156
+ },
157
+ {
158
+ "attention": {
159
+ "n_heads_in_group": 16,
160
+ "no_op": false,
161
+ "num_sink_tokens": null,
162
+ "replace_with_linear": false,
163
+ "sparsify": null,
164
+ "unshifted_sink": false,
165
+ "use_prefill_window_in_sink_attention": false,
166
+ "window_length": null
167
+ },
168
+ "ffn": {
169
+ "ffn_mult": 1.95,
170
+ "no_op": false,
171
+ "replace_with_linear": false,
172
+ "sparsify": null
173
+ }
174
+ },
175
+ {
176
+ "attention": {
177
+ "n_heads_in_group": null,
178
+ "no_op": true,
179
+ "num_sink_tokens": null,
180
+ "replace_with_linear": false,
181
+ "sparsify": null,
182
+ "unshifted_sink": false,
183
+ "use_prefill_window_in_sink_attention": false,
184
+ "window_length": null
185
+ },
186
+ "ffn": {
187
+ "ffn_mult": null,
188
+ "no_op": true,
189
+ "replace_with_linear": false,
190
+ "sparsify": null
191
+ }
192
+ },
193
+ {
194
+ "attention": {
195
+ "n_heads_in_group": null,
196
+ "no_op": true,
197
+ "num_sink_tokens": null,
198
+ "replace_with_linear": false,
199
+ "sparsify": null,
200
+ "unshifted_sink": false,
201
+ "use_prefill_window_in_sink_attention": false,
202
+ "window_length": null
203
+ },
204
+ "ffn": {
205
+ "ffn_mult": null,
206
+ "no_op": true,
207
+ "replace_with_linear": false,
208
+ "sparsify": null
209
+ }
210
+ },
211
+ {
212
+ "attention": {
213
+ "n_heads_in_group": null,
214
+ "no_op": true,
215
+ "num_sink_tokens": null,
216
+ "replace_with_linear": false,
217
+ "sparsify": null,
218
+ "unshifted_sink": false,
219
+ "use_prefill_window_in_sink_attention": false,
220
+ "window_length": null
221
+ },
222
+ "ffn": {
223
+ "ffn_mult": null,
224
+ "no_op": true,
225
+ "replace_with_linear": false,
226
+ "sparsify": null
227
+ }
228
+ },
229
+ {
230
+ "attention": {
231
+ "n_heads_in_group": null,
232
+ "no_op": true,
233
+ "num_sink_tokens": null,
234
+ "replace_with_linear": false,
235
+ "sparsify": null,
236
+ "unshifted_sink": false,
237
+ "use_prefill_window_in_sink_attention": false,
238
+ "window_length": null
239
+ },
240
+ "ffn": {
241
+ "ffn_mult": null,
242
+ "no_op": true,
243
+ "replace_with_linear": false,
244
+ "sparsify": null
245
+ }
246
+ },
247
+ {
248
+ "attention": {
249
+ "n_heads_in_group": 16,
250
+ "no_op": false,
251
+ "num_sink_tokens": null,
252
+ "replace_with_linear": false,
253
+ "sparsify": null,
254
+ "unshifted_sink": false,
255
+ "use_prefill_window_in_sink_attention": false,
256
+ "window_length": null
257
+ },
258
+ "ffn": {
259
+ "ffn_mult": 1.95,
260
+ "no_op": false,
261
+ "replace_with_linear": false,
262
+ "sparsify": null
263
+ }
264
+ },
265
+ {
266
+ "attention": {
267
+ "n_heads_in_group": 16,
268
+ "no_op": false,
269
+ "num_sink_tokens": null,
270
+ "replace_with_linear": false,
271
+ "sparsify": null,
272
+ "unshifted_sink": false,
273
+ "use_prefill_window_in_sink_attention": false,
274
+ "window_length": null
275
+ },
276
+ "ffn": {
277
+ "ffn_mult": 1.95,
278
+ "no_op": false,
279
+ "replace_with_linear": false,
280
+ "sparsify": null
281
+ }
282
+ },
283
+ {
284
+ "attention": {
285
+ "n_heads_in_group": 16,
286
+ "no_op": false,
287
+ "num_sink_tokens": null,
288
+ "replace_with_linear": false,
289
+ "sparsify": null,
290
+ "unshifted_sink": false,
291
+ "use_prefill_window_in_sink_attention": false,
292
+ "window_length": null
293
+ },
294
+ "ffn": {
295
+ "ffn_mult": 1.95,
296
+ "no_op": false,
297
+ "replace_with_linear": false,
298
+ "sparsify": null
299
+ }
300
+ },
301
+ {
302
+ "attention": {
303
+ "n_heads_in_group": 16,
304
+ "no_op": false,
305
+ "num_sink_tokens": null,
306
+ "replace_with_linear": false,
307
+ "sparsify": null,
308
+ "unshifted_sink": false,
309
+ "use_prefill_window_in_sink_attention": false,
310
+ "window_length": null
311
+ },
312
+ "ffn": {
313
+ "ffn_mult": 4.875,
314
+ "no_op": false,
315
+ "replace_with_linear": false,
316
+ "sparsify": null
317
+ }
318
+ },
319
+ {
320
+ "attention": {
321
+ "n_heads_in_group": 16,
322
+ "no_op": false,
323
+ "num_sink_tokens": null,
324
+ "replace_with_linear": false,
325
+ "sparsify": null,
326
+ "unshifted_sink": false,
327
+ "use_prefill_window_in_sink_attention": false,
328
+ "window_length": null
329
+ },
330
+ "ffn": {
331
+ "ffn_mult": 4.875,
332
+ "no_op": false,
333
+ "replace_with_linear": false,
334
+ "sparsify": null
335
+ }
336
+ },
337
+ {
338
+ "attention": {
339
+ "n_heads_in_group": null,
340
+ "no_op": true,
341
+ "num_sink_tokens": null,
342
+ "replace_with_linear": false,
343
+ "sparsify": null,
344
+ "unshifted_sink": false,
345
+ "use_prefill_window_in_sink_attention": false,
346
+ "window_length": null
347
+ },
348
+ "ffn": {
349
+ "ffn_mult": null,
350
+ "no_op": true,
351
+ "replace_with_linear": false,
352
+ "sparsify": null
353
+ }
354
+ },
355
+ {
356
+ "attention": {
357
+ "n_heads_in_group": null,
358
+ "no_op": true,
359
+ "num_sink_tokens": null,
360
+ "replace_with_linear": false,
361
+ "sparsify": null,
362
+ "unshifted_sink": false,
363
+ "use_prefill_window_in_sink_attention": false,
364
+ "window_length": null
365
+ },
366
+ "ffn": {
367
+ "ffn_mult": null,
368
+ "no_op": true,
369
+ "replace_with_linear": false,
370
+ "sparsify": null
371
+ }
372
+ },
373
+ {
374
+ "attention": {
375
+ "n_heads_in_group": null,
376
+ "no_op": true,
377
+ "num_sink_tokens": null,
378
+ "replace_with_linear": false,
379
+ "sparsify": null,
380
+ "unshifted_sink": false,
381
+ "use_prefill_window_in_sink_attention": false,
382
+ "window_length": null
383
+ },
384
+ "ffn": {
385
+ "ffn_mult": null,
386
+ "no_op": true,
387
+ "replace_with_linear": false,
388
+ "sparsify": null
389
+ }
390
+ },
391
+ {
392
+ "attention": {
393
+ "n_heads_in_group": null,
394
+ "no_op": true,
395
+ "num_sink_tokens": null,
396
+ "replace_with_linear": false,
397
+ "sparsify": null,
398
+ "unshifted_sink": false,
399
+ "use_prefill_window_in_sink_attention": false,
400
+ "window_length": null
401
+ },
402
+ "ffn": {
403
+ "ffn_mult": null,
404
+ "no_op": true,
405
+ "replace_with_linear": false,
406
+ "sparsify": null
407
+ }
408
+ },
409
+ {
410
+ "attention": {
411
+ "n_heads_in_group": null,
412
+ "no_op": true,
413
+ "num_sink_tokens": null,
414
+ "replace_with_linear": false,
415
+ "sparsify": null,
416
+ "unshifted_sink": false,
417
+ "use_prefill_window_in_sink_attention": false,
418
+ "window_length": null
419
+ },
420
+ "ffn": {
421
+ "ffn_mult": null,
422
+ "no_op": true,
423
+ "replace_with_linear": false,
424
+ "sparsify": null
425
+ }
426
+ },
427
+ {
428
+ "attention": {
429
+ "n_heads_in_group": null,
430
+ "no_op": true,
431
+ "num_sink_tokens": null,
432
+ "replace_with_linear": false,
433
+ "sparsify": null,
434
+ "unshifted_sink": false,
435
+ "use_prefill_window_in_sink_attention": false,
436
+ "window_length": null
437
+ },
438
+ "ffn": {
439
+ "ffn_mult": null,
440
+ "no_op": true,
441
+ "replace_with_linear": false,
442
+ "sparsify": null
443
+ }
444
+ },
445
+ {
446
+ "attention": {
447
+ "n_heads_in_group": 16,
448
+ "no_op": false,
449
+ "num_sink_tokens": null,
450
+ "replace_with_linear": false,
451
+ "sparsify": null,
452
+ "unshifted_sink": false,
453
+ "use_prefill_window_in_sink_attention": false,
454
+ "window_length": null
455
+ },
456
+ "ffn": {
457
+ "ffn_mult": 4.875,
458
+ "no_op": false,
459
+ "replace_with_linear": false,
460
+ "sparsify": null
461
+ }
462
+ },
463
+ {
464
+ "attention": {
465
+ "n_heads_in_group": 16,
466
+ "no_op": false,
467
+ "num_sink_tokens": null,
468
+ "replace_with_linear": false,
469
+ "sparsify": null,
470
+ "unshifted_sink": false,
471
+ "use_prefill_window_in_sink_attention": false,
472
+ "window_length": null
473
+ },
474
+ "ffn": {
475
+ "ffn_mult": 4.875,
476
+ "no_op": false,
477
+ "replace_with_linear": false,
478
+ "sparsify": null
479
+ }
480
+ },
481
+ {
482
+ "attention": {
483
+ "n_heads_in_group": 16,
484
+ "no_op": false,
485
+ "num_sink_tokens": null,
486
+ "replace_with_linear": false,
487
+ "sparsify": null,
488
+ "unshifted_sink": false,
489
+ "use_prefill_window_in_sink_attention": false,
490
+ "window_length": null
491
+ },
492
+ "ffn": {
493
+ "ffn_mult": 4.875,
494
+ "no_op": false,
495
+ "replace_with_linear": false,
496
+ "sparsify": null
497
+ }
498
+ },
499
+ {
500
+ "attention": {
501
+ "n_heads_in_group": null,
502
+ "no_op": true,
503
+ "num_sink_tokens": null,
504
+ "replace_with_linear": false,
505
+ "sparsify": null,
506
+ "unshifted_sink": false,
507
+ "use_prefill_window_in_sink_attention": false,
508
+ "window_length": null
509
+ },
510
+ "ffn": {
511
+ "ffn_mult": null,
512
+ "no_op": true,
513
+ "replace_with_linear": false,
514
+ "sparsify": null
515
+ }
516
+ },
517
+ {
518
+ "attention": {
519
+ "n_heads_in_group": null,
520
+ "no_op": true,
521
+ "num_sink_tokens": null,
522
+ "replace_with_linear": false,
523
+ "sparsify": null,
524
+ "unshifted_sink": false,
525
+ "use_prefill_window_in_sink_attention": false,
526
+ "window_length": null
527
+ },
528
+ "ffn": {
529
+ "ffn_mult": null,
530
+ "no_op": true,
531
+ "replace_with_linear": false,
532
+ "sparsify": null
533
+ }
534
+ },
535
+ {
536
+ "attention": {
537
+ "n_heads_in_group": null,
538
+ "no_op": true,
539
+ "num_sink_tokens": null,
540
+ "replace_with_linear": false,
541
+ "sparsify": null,
542
+ "unshifted_sink": false,
543
+ "use_prefill_window_in_sink_attention": false,
544
+ "window_length": null
545
+ },
546
+ "ffn": {
547
+ "ffn_mult": null,
548
+ "no_op": true,
549
+ "replace_with_linear": false,
550
+ "sparsify": null
551
+ }
552
+ },
553
+ {
554
+ "attention": {
555
+ "n_heads_in_group": null,
556
+ "no_op": true,
557
+ "num_sink_tokens": null,
558
+ "replace_with_linear": false,
559
+ "sparsify": null,
560
+ "unshifted_sink": false,
561
+ "use_prefill_window_in_sink_attention": false,
562
+ "window_length": null
563
+ },
564
+ "ffn": {
565
+ "ffn_mult": null,
566
+ "no_op": true,
567
+ "replace_with_linear": false,
568
+ "sparsify": null
569
+ }
570
+ },
571
+ {
572
+ "attention": {
573
+ "n_heads_in_group": null,
574
+ "no_op": true,
575
+ "num_sink_tokens": null,
576
+ "replace_with_linear": false,
577
+ "sparsify": null,
578
+ "unshifted_sink": false,
579
+ "use_prefill_window_in_sink_attention": false,
580
+ "window_length": null
581
+ },
582
+ "ffn": {
583
+ "ffn_mult": null,
584
+ "no_op": true,
585
+ "replace_with_linear": false,
586
+ "sparsify": null
587
+ }
588
+ },
589
+ {
590
+ "attention": {
591
+ "n_heads_in_group": 16,
592
+ "no_op": false,
593
+ "num_sink_tokens": null,
594
+ "replace_with_linear": false,
595
+ "sparsify": null,
596
+ "unshifted_sink": false,
597
+ "use_prefill_window_in_sink_attention": false,
598
+ "window_length": null
599
+ },
600
+ "ffn": {
601
+ "ffn_mult": 4.875,
602
+ "no_op": false,
603
+ "replace_with_linear": false,
604
+ "sparsify": null
605
+ }
606
+ },
607
+ {
608
+ "attention": {
609
+ "n_heads_in_group": 16,
610
+ "no_op": false,
611
+ "num_sink_tokens": null,
612
+ "replace_with_linear": false,
613
+ "sparsify": null,
614
+ "unshifted_sink": false,
615
+ "use_prefill_window_in_sink_attention": false,
616
+ "window_length": null
617
+ },
618
+ "ffn": {
619
+ "ffn_mult": 4.875,
620
+ "no_op": false,
621
+ "replace_with_linear": false,
622
+ "sparsify": null
623
+ }
624
+ },
625
+ {
626
+ "attention": {
627
+ "n_heads_in_group": 16,
628
+ "no_op": false,
629
+ "num_sink_tokens": null,
630
+ "replace_with_linear": false,
631
+ "sparsify": null,
632
+ "unshifted_sink": false,
633
+ "use_prefill_window_in_sink_attention": false,
634
+ "window_length": null
635
+ },
636
+ "ffn": {
637
+ "ffn_mult": 4.875,
638
+ "no_op": false,
639
+ "replace_with_linear": false,
640
+ "sparsify": null
641
+ }
642
+ },
643
+ {
644
+ "attention": {
645
+ "n_heads_in_group": 16,
646
+ "no_op": false,
647
+ "num_sink_tokens": null,
648
+ "replace_with_linear": false,
649
+ "sparsify": null,
650
+ "unshifted_sink": false,
651
+ "use_prefill_window_in_sink_attention": false,
652
+ "window_length": null
653
+ },
654
+ "ffn": {
655
+ "ffn_mult": 2.4375,
656
+ "no_op": false,
657
+ "replace_with_linear": false,
658
+ "sparsify": null
659
+ }
660
+ },
661
+ {
662
+ "attention": {
663
+ "n_heads_in_group": null,
664
+ "no_op": true,
665
+ "num_sink_tokens": null,
666
+ "replace_with_linear": false,
667
+ "sparsify": null,
668
+ "unshifted_sink": false,
669
+ "use_prefill_window_in_sink_attention": false,
670
+ "window_length": null
671
+ },
672
+ "ffn": {
673
+ "ffn_mult": null,
674
+ "no_op": true,
675
+ "replace_with_linear": false,
676
+ "sparsify": null
677
+ }
678
+ },
679
+ {
680
+ "attention": {
681
+ "n_heads_in_group": null,
682
+ "no_op": true,
683
+ "num_sink_tokens": null,
684
+ "replace_with_linear": false,
685
+ "sparsify": null,
686
+ "unshifted_sink": false,
687
+ "use_prefill_window_in_sink_attention": false,
688
+ "window_length": null
689
+ },
690
+ "ffn": {
691
+ "ffn_mult": null,
692
+ "no_op": true,
693
+ "replace_with_linear": false,
694
+ "sparsify": null
695
+ }
696
+ },
697
+ {
698
+ "attention": {
699
+ "n_heads_in_group": null,
700
+ "no_op": true,
701
+ "num_sink_tokens": null,
702
+ "replace_with_linear": false,
703
+ "sparsify": null,
704
+ "unshifted_sink": false,
705
+ "use_prefill_window_in_sink_attention": false,
706
+ "window_length": null
707
+ },
708
+ "ffn": {
709
+ "ffn_mult": null,
710
+ "no_op": true,
711
+ "replace_with_linear": false,
712
+ "sparsify": null
713
+ }
714
+ },
715
+ {
716
+ "attention": {
717
+ "n_heads_in_group": 16,
718
+ "no_op": false,
719
+ "num_sink_tokens": null,
720
+ "replace_with_linear": false,
721
+ "sparsify": null,
722
+ "unshifted_sink": false,
723
+ "use_prefill_window_in_sink_attention": false,
724
+ "window_length": null
725
+ },
726
+ "ffn": {
727
+ "ffn_mult": 1.95,
728
+ "no_op": false,
729
+ "replace_with_linear": false,
730
+ "sparsify": null
731
+ }
732
+ },
733
+ {
734
+ "attention": {
735
+ "n_heads_in_group": 16,
736
+ "no_op": false,
737
+ "num_sink_tokens": null,
738
+ "replace_with_linear": false,
739
+ "sparsify": null,
740
+ "unshifted_sink": false,
741
+ "use_prefill_window_in_sink_attention": false,
742
+ "window_length": null
743
+ },
744
+ "ffn": {
745
+ "ffn_mult": 1.95,
746
+ "no_op": false,
747
+ "replace_with_linear": false,
748
+ "sparsify": null
749
+ }
750
+ },
751
+ {
752
+ "attention": {
753
+ "n_heads_in_group": 16,
754
+ "no_op": false,
755
+ "num_sink_tokens": null,
756
+ "replace_with_linear": false,
757
+ "sparsify": null,
758
+ "unshifted_sink": false,
759
+ "use_prefill_window_in_sink_attention": false,
760
+ "window_length": null
761
+ },
762
+ "ffn": {
763
+ "ffn_mult": 1.95,
764
+ "no_op": false,
765
+ "replace_with_linear": false,
766
+ "sparsify": null
767
+ }
768
+ },
769
+ {
770
+ "attention": {
771
+ "n_heads_in_group": null,
772
+ "no_op": true,
773
+ "num_sink_tokens": null,
774
+ "replace_with_linear": false,
775
+ "sparsify": null,
776
+ "unshifted_sink": false,
777
+ "use_prefill_window_in_sink_attention": false,
778
+ "window_length": null
779
+ },
780
+ "ffn": {
781
+ "ffn_mult": 1.95,
782
+ "no_op": false,
783
+ "replace_with_linear": false,
784
+ "sparsify": null
785
+ }
786
+ },
787
+ {
788
+ "attention": {
789
+ "n_heads_in_group": 16,
790
+ "no_op": false,
791
+ "num_sink_tokens": null,
792
+ "replace_with_linear": false,
793
+ "sparsify": null,
794
+ "unshifted_sink": false,
795
+ "use_prefill_window_in_sink_attention": false,
796
+ "window_length": null
797
+ },
798
+ "ffn": {
799
+ "ffn_mult": 4.875,
800
+ "no_op": false,
801
+ "replace_with_linear": false,
802
+ "sparsify": null
803
+ }
804
+ },
805
+ {
806
+ "attention": {
807
+ "n_heads_in_group": null,
808
+ "no_op": true,
809
+ "num_sink_tokens": null,
810
+ "replace_with_linear": false,
811
+ "sparsify": null,
812
+ "unshifted_sink": false,
813
+ "use_prefill_window_in_sink_attention": false,
814
+ "window_length": null
815
+ },
816
+ "ffn": {
817
+ "ffn_mult": 4.875,
818
+ "no_op": false,
819
+ "replace_with_linear": false,
820
+ "sparsify": null
821
+ }
822
+ },
823
+ {
824
+ "attention": {
825
+ "n_heads_in_group": null,
826
+ "no_op": true,
827
+ "num_sink_tokens": null,
828
+ "replace_with_linear": false,
829
+ "sparsify": null,
830
+ "unshifted_sink": false,
831
+ "use_prefill_window_in_sink_attention": false,
832
+ "window_length": null
833
+ },
834
+ "ffn": {
835
+ "ffn_mult": null,
836
+ "no_op": true,
837
+ "replace_with_linear": false,
838
+ "sparsify": null
839
+ }
840
+ },
841
+ {
842
+ "attention": {
843
+ "n_heads_in_group": null,
844
+ "no_op": true,
845
+ "num_sink_tokens": null,
846
+ "replace_with_linear": false,
847
+ "sparsify": null,
848
+ "unshifted_sink": false,
849
+ "use_prefill_window_in_sink_attention": false,
850
+ "window_length": null
851
+ },
852
+ "ffn": {
853
+ "ffn_mult": null,
854
+ "no_op": true,
855
+ "replace_with_linear": false,
856
+ "sparsify": null
857
+ }
858
+ },
859
+ {
860
+ "attention": {
861
+ "n_heads_in_group": null,
862
+ "no_op": true,
863
+ "num_sink_tokens": null,
864
+ "replace_with_linear": false,
865
+ "sparsify": null,
866
+ "unshifted_sink": false,
867
+ "use_prefill_window_in_sink_attention": false,
868
+ "window_length": null
869
+ },
870
+ "ffn": {
871
+ "ffn_mult": null,
872
+ "no_op": true,
873
+ "replace_with_linear": false,
874
+ "sparsify": null
875
+ }
876
+ },
877
+ {
878
+ "attention": {
879
+ "n_heads_in_group": null,
880
+ "no_op": true,
881
+ "num_sink_tokens": null,
882
+ "replace_with_linear": false,
883
+ "sparsify": null,
884
+ "unshifted_sink": false,
885
+ "use_prefill_window_in_sink_attention": false,
886
+ "window_length": null
887
+ },
888
+ "ffn": {
889
+ "ffn_mult": null,
890
+ "no_op": true,
891
+ "replace_with_linear": false,
892
+ "sparsify": null
893
+ }
894
+ },
895
+ {
896
+ "attention": {
897
+ "n_heads_in_group": null,
898
+ "no_op": true,
899
+ "num_sink_tokens": null,
900
+ "replace_with_linear": false,
901
+ "sparsify": null,
902
+ "unshifted_sink": false,
903
+ "use_prefill_window_in_sink_attention": false,
904
+ "window_length": null
905
+ },
906
+ "ffn": {
907
+ "ffn_mult": null,
908
+ "no_op": true,
909
+ "replace_with_linear": false,
910
+ "sparsify": null
911
+ }
912
+ },
913
+ {
914
+ "attention": {
915
+ "n_heads_in_group": 16,
916
+ "no_op": false,
917
+ "num_sink_tokens": null,
918
+ "replace_with_linear": false,
919
+ "sparsify": null,
920
+ "unshifted_sink": false,
921
+ "use_prefill_window_in_sink_attention": false,
922
+ "window_length": null
923
+ },
924
+ "ffn": {
925
+ "ffn_mult": 4.875,
926
+ "no_op": false,
927
+ "replace_with_linear": false,
928
+ "sparsify": null
929
+ }
930
+ },
931
+ {
932
+ "attention": {
933
+ "n_heads_in_group": 16,
934
+ "no_op": false,
935
+ "num_sink_tokens": null,
936
+ "replace_with_linear": false,
937
+ "sparsify": null,
938
+ "unshifted_sink": false,
939
+ "use_prefill_window_in_sink_attention": false,
940
+ "window_length": null
941
+ },
942
+ "ffn": {
943
+ "ffn_mult": 4.875,
944
+ "no_op": false,
945
+ "replace_with_linear": false,
946
+ "sparsify": null
947
+ }
948
+ },
949
+ {
950
+ "attention": {
951
+ "n_heads_in_group": 16,
952
+ "no_op": false,
953
+ "num_sink_tokens": null,
954
+ "replace_with_linear": false,
955
+ "sparsify": null,
956
+ "unshifted_sink": false,
957
+ "use_prefill_window_in_sink_attention": false,
958
+ "window_length": null
959
+ },
960
+ "ffn": {
961
+ "ffn_mult": 4.875,
962
+ "no_op": false,
963
+ "replace_with_linear": false,
964
+ "sparsify": null
965
+ }
966
+ },
967
+ {
968
+ "attention": {
969
+ "n_heads_in_group": 16,
970
+ "no_op": false,
971
+ "num_sink_tokens": null,
972
+ "replace_with_linear": false,
973
+ "sparsify": null,
974
+ "unshifted_sink": false,
975
+ "use_prefill_window_in_sink_attention": false,
976
+ "window_length": null
977
+ },
978
+ "ffn": {
979
+ "ffn_mult": 4.875,
980
+ "no_op": false,
981
+ "replace_with_linear": false,
982
+ "sparsify": null
983
+ }
984
+ },
985
+ {
986
+ "attention": {
987
+ "n_heads_in_group": null,
988
+ "no_op": true,
989
+ "num_sink_tokens": null,
990
+ "replace_with_linear": false,
991
+ "sparsify": null,
992
+ "unshifted_sink": false,
993
+ "use_prefill_window_in_sink_attention": false,
994
+ "window_length": null
995
+ },
996
+ "ffn": {
997
+ "ffn_mult": null,
998
+ "no_op": true,
999
+ "replace_with_linear": false,
1000
+ "sparsify": null
1001
+ }
1002
+ },
1003
+ {
1004
+ "attention": {
1005
+ "n_heads_in_group": null,
1006
+ "no_op": true,
1007
+ "num_sink_tokens": null,
1008
+ "replace_with_linear": false,
1009
+ "sparsify": null,
1010
+ "unshifted_sink": false,
1011
+ "use_prefill_window_in_sink_attention": false,
1012
+ "window_length": null
1013
+ },
1014
+ "ffn": {
1015
+ "ffn_mult": null,
1016
+ "no_op": true,
1017
+ "replace_with_linear": false,
1018
+ "sparsify": null
1019
+ }
1020
+ },
1021
+ {
1022
+ "attention": {
1023
+ "n_heads_in_group": null,
1024
+ "no_op": true,
1025
+ "num_sink_tokens": null,
1026
+ "replace_with_linear": false,
1027
+ "sparsify": null,
1028
+ "unshifted_sink": false,
1029
+ "use_prefill_window_in_sink_attention": false,
1030
+ "window_length": null
1031
+ },
1032
+ "ffn": {
1033
+ "ffn_mult": null,
1034
+ "no_op": true,
1035
+ "replace_with_linear": false,
1036
+ "sparsify": null
1037
+ }
1038
+ },
1039
+ {
1040
+ "attention": {
1041
+ "n_heads_in_group": null,
1042
+ "no_op": true,
1043
+ "num_sink_tokens": null,
1044
+ "replace_with_linear": false,
1045
+ "sparsify": null,
1046
+ "unshifted_sink": false,
1047
+ "use_prefill_window_in_sink_attention": false,
1048
+ "window_length": null
1049
+ },
1050
+ "ffn": {
1051
+ "ffn_mult": null,
1052
+ "no_op": true,
1053
+ "replace_with_linear": false,
1054
+ "sparsify": null
1055
+ }
1056
+ },
1057
+ {
1058
+ "attention": {
1059
+ "n_heads_in_group": null,
1060
+ "no_op": true,
1061
+ "num_sink_tokens": null,
1062
+ "replace_with_linear": false,
1063
+ "sparsify": null,
1064
+ "unshifted_sink": false,
1065
+ "use_prefill_window_in_sink_attention": false,
1066
+ "window_length": null
1067
+ },
1068
+ "ffn": {
1069
+ "ffn_mult": null,
1070
+ "no_op": true,
1071
+ "replace_with_linear": false,
1072
+ "sparsify": null
1073
+ }
1074
+ },
1075
+ {
1076
+ "attention": {
1077
+ "n_heads_in_group": 16,
1078
+ "no_op": false,
1079
+ "num_sink_tokens": null,
1080
+ "replace_with_linear": false,
1081
+ "sparsify": null,
1082
+ "unshifted_sink": false,
1083
+ "use_prefill_window_in_sink_attention": false,
1084
+ "window_length": null
1085
+ },
1086
+ "ffn": {
1087
+ "ffn_mult": 4.875,
1088
+ "no_op": false,
1089
+ "replace_with_linear": false,
1090
+ "sparsify": null
1091
+ }
1092
+ },
1093
+ {
1094
+ "attention": {
1095
+ "n_heads_in_group": 16,
1096
+ "no_op": false,
1097
+ "num_sink_tokens": null,
1098
+ "replace_with_linear": false,
1099
+ "sparsify": null,
1100
+ "unshifted_sink": false,
1101
+ "use_prefill_window_in_sink_attention": false,
1102
+ "window_length": null
1103
+ },
1104
+ "ffn": {
1105
+ "ffn_mult": 4.875,
1106
+ "no_op": false,
1107
+ "replace_with_linear": false,
1108
+ "sparsify": null
1109
+ }
1110
+ },
1111
+ {
1112
+ "attention": {
1113
+ "n_heads_in_group": 16,
1114
+ "no_op": false,
1115
+ "num_sink_tokens": null,
1116
+ "replace_with_linear": false,
1117
+ "sparsify": null,
1118
+ "unshifted_sink": false,
1119
+ "use_prefill_window_in_sink_attention": false,
1120
+ "window_length": null
1121
+ },
1122
+ "ffn": {
1123
+ "ffn_mult": 4.875,
1124
+ "no_op": false,
1125
+ "replace_with_linear": false,
1126
+ "sparsify": null
1127
+ }
1128
+ },
1129
+ {
1130
+ "attention": {
1131
+ "n_heads_in_group": 16,
1132
+ "no_op": false,
1133
+ "num_sink_tokens": null,
1134
+ "replace_with_linear": false,
1135
+ "sparsify": null,
1136
+ "unshifted_sink": false,
1137
+ "use_prefill_window_in_sink_attention": false,
1138
+ "window_length": null
1139
+ },
1140
+ "ffn": {
1141
+ "ffn_mult": 4.875,
1142
+ "no_op": false,
1143
+ "replace_with_linear": false,
1144
+ "sparsify": null
1145
+ }
1146
+ },
1147
+ {
1148
+ "attention": {
1149
+ "n_heads_in_group": null,
1150
+ "no_op": true,
1151
+ "num_sink_tokens": null,
1152
+ "replace_with_linear": false,
1153
+ "sparsify": null,
1154
+ "unshifted_sink": false,
1155
+ "use_prefill_window_in_sink_attention": false,
1156
+ "window_length": null
1157
+ },
1158
+ "ffn": {
1159
+ "ffn_mult": null,
1160
+ "no_op": true,
1161
+ "replace_with_linear": false,
1162
+ "sparsify": null
1163
+ }
1164
+ },
1165
+ {
1166
+ "attention": {
1167
+ "n_heads_in_group": null,
1168
+ "no_op": true,
1169
+ "num_sink_tokens": null,
1170
+ "replace_with_linear": false,
1171
+ "sparsify": null,
1172
+ "unshifted_sink": false,
1173
+ "use_prefill_window_in_sink_attention": false,
1174
+ "window_length": null
1175
+ },
1176
+ "ffn": {
1177
+ "ffn_mult": null,
1178
+ "no_op": true,
1179
+ "replace_with_linear": false,
1180
+ "sparsify": null
1181
+ }
1182
+ },
1183
+ {
1184
+ "attention": {
1185
+ "n_heads_in_group": null,
1186
+ "no_op": true,
1187
+ "num_sink_tokens": null,
1188
+ "replace_with_linear": false,
1189
+ "sparsify": null,
1190
+ "unshifted_sink": false,
1191
+ "use_prefill_window_in_sink_attention": false,
1192
+ "window_length": null
1193
+ },
1194
+ "ffn": {
1195
+ "ffn_mult": null,
1196
+ "no_op": true,
1197
+ "replace_with_linear": false,
1198
+ "sparsify": null
1199
+ }
1200
+ },
1201
+ {
1202
+ "attention": {
1203
+ "n_heads_in_group": null,
1204
+ "no_op": true,
1205
+ "num_sink_tokens": null,
1206
+ "replace_with_linear": false,
1207
+ "sparsify": null,
1208
+ "unshifted_sink": false,
1209
+ "use_prefill_window_in_sink_attention": false,
1210
+ "window_length": null
1211
+ },
1212
+ "ffn": {
1213
+ "ffn_mult": null,
1214
+ "no_op": true,
1215
+ "replace_with_linear": false,
1216
+ "sparsify": null
1217
+ }
1218
+ },
1219
+ {
1220
+ "attention": {
1221
+ "n_heads_in_group": null,
1222
+ "no_op": true,
1223
+ "num_sink_tokens": null,
1224
+ "replace_with_linear": false,
1225
+ "sparsify": null,
1226
+ "unshifted_sink": false,
1227
+ "use_prefill_window_in_sink_attention": false,
1228
+ "window_length": null
1229
+ },
1230
+ "ffn": {
1231
+ "ffn_mult": null,
1232
+ "no_op": true,
1233
+ "replace_with_linear": false,
1234
+ "sparsify": null
1235
+ }
1236
+ },
1237
+ {
1238
+ "attention": {
1239
+ "n_heads_in_group": 16,
1240
+ "no_op": false,
1241
+ "num_sink_tokens": null,
1242
+ "replace_with_linear": false,
1243
+ "sparsify": null,
1244
+ "unshifted_sink": false,
1245
+ "use_prefill_window_in_sink_attention": false,
1246
+ "window_length": null
1247
+ },
1248
+ "ffn": {
1249
+ "ffn_mult": 4.875,
1250
+ "no_op": false,
1251
+ "replace_with_linear": false,
1252
+ "sparsify": null
1253
+ }
1254
+ },
1255
+ {
1256
+ "attention": {
1257
+ "n_heads_in_group": 16,
1258
+ "no_op": false,
1259
+ "num_sink_tokens": null,
1260
+ "replace_with_linear": false,
1261
+ "sparsify": null,
1262
+ "unshifted_sink": false,
1263
+ "use_prefill_window_in_sink_attention": false,
1264
+ "window_length": null
1265
+ },
1266
+ "ffn": {
1267
+ "ffn_mult": 4.875,
1268
+ "no_op": false,
1269
+ "replace_with_linear": false,
1270
+ "sparsify": null
1271
+ }
1272
+ },
1273
+ {
1274
+ "attention": {
1275
+ "n_heads_in_group": 16,
1276
+ "no_op": false,
1277
+ "num_sink_tokens": null,
1278
+ "replace_with_linear": false,
1279
+ "sparsify": null,
1280
+ "unshifted_sink": false,
1281
+ "use_prefill_window_in_sink_attention": false,
1282
+ "window_length": null
1283
+ },
1284
+ "ffn": {
1285
+ "ffn_mult": 4.875,
1286
+ "no_op": false,
1287
+ "replace_with_linear": false,
1288
+ "sparsify": null
1289
+ }
1290
+ },
1291
+ {
1292
+ "attention": {
1293
+ "n_heads_in_group": 16,
1294
+ "no_op": false,
1295
+ "num_sink_tokens": null,
1296
+ "replace_with_linear": false,
1297
+ "sparsify": null,
1298
+ "unshifted_sink": false,
1299
+ "use_prefill_window_in_sink_attention": false,
1300
+ "window_length": null
1301
+ },
1302
+ "ffn": {
1303
+ "ffn_mult": 4.875,
1304
+ "no_op": false,
1305
+ "replace_with_linear": false,
1306
+ "sparsify": null
1307
+ }
1308
+ },
1309
+ {
1310
+ "attention": {
1311
+ "n_heads_in_group": null,
1312
+ "no_op": true,
1313
+ "num_sink_tokens": null,
1314
+ "replace_with_linear": false,
1315
+ "sparsify": null,
1316
+ "unshifted_sink": false,
1317
+ "use_prefill_window_in_sink_attention": false,
1318
+ "window_length": null
1319
+ },
1320
+ "ffn": {
1321
+ "ffn_mult": null,
1322
+ "no_op": true,
1323
+ "replace_with_linear": false,
1324
+ "sparsify": null
1325
+ }
1326
+ },
1327
+ {
1328
+ "attention": {
1329
+ "n_heads_in_group": null,
1330
+ "no_op": true,
1331
+ "num_sink_tokens": null,
1332
+ "replace_with_linear": false,
1333
+ "sparsify": null,
1334
+ "unshifted_sink": false,
1335
+ "use_prefill_window_in_sink_attention": false,
1336
+ "window_length": null
1337
+ },
1338
+ "ffn": {
1339
+ "ffn_mult": null,
1340
+ "no_op": true,
1341
+ "replace_with_linear": false,
1342
+ "sparsify": null
1343
+ }
1344
+ },
1345
+ {
1346
+ "attention": {
1347
+ "n_heads_in_group": null,
1348
+ "no_op": true,
1349
+ "num_sink_tokens": null,
1350
+ "replace_with_linear": false,
1351
+ "sparsify": null,
1352
+ "unshifted_sink": false,
1353
+ "use_prefill_window_in_sink_attention": false,
1354
+ "window_length": null
1355
+ },
1356
+ "ffn": {
1357
+ "ffn_mult": null,
1358
+ "no_op": true,
1359
+ "replace_with_linear": false,
1360
+ "sparsify": null
1361
+ }
1362
+ },
1363
+ {
1364
+ "attention": {
1365
+ "n_heads_in_group": null,
1366
+ "no_op": true,
1367
+ "num_sink_tokens": null,
1368
+ "replace_with_linear": false,
1369
+ "sparsify": null,
1370
+ "unshifted_sink": false,
1371
+ "use_prefill_window_in_sink_attention": false,
1372
+ "window_length": null
1373
+ },
1374
+ "ffn": {
1375
+ "ffn_mult": null,
1376
+ "no_op": true,
1377
+ "replace_with_linear": false,
1378
+ "sparsify": null
1379
+ }
1380
+ },
1381
+ {
1382
+ "attention": {
1383
+ "n_heads_in_group": null,
1384
+ "no_op": true,
1385
+ "num_sink_tokens": null,
1386
+ "replace_with_linear": false,
1387
+ "sparsify": null,
1388
+ "unshifted_sink": false,
1389
+ "use_prefill_window_in_sink_attention": false,
1390
+ "window_length": null
1391
+ },
1392
+ "ffn": {
1393
+ "ffn_mult": null,
1394
+ "no_op": true,
1395
+ "replace_with_linear": false,
1396
+ "sparsify": null
1397
+ }
1398
+ },
1399
+ {
1400
+ "attention": {
1401
+ "n_heads_in_group": 16,
1402
+ "no_op": false,
1403
+ "num_sink_tokens": null,
1404
+ "replace_with_linear": false,
1405
+ "sparsify": null,
1406
+ "unshifted_sink": false,
1407
+ "use_prefill_window_in_sink_attention": false,
1408
+ "window_length": null
1409
+ },
1410
+ "ffn": {
1411
+ "ffn_mult": 4.875,
1412
+ "no_op": false,
1413
+ "replace_with_linear": false,
1414
+ "sparsify": null
1415
+ }
1416
+ },
1417
+ {
1418
+ "attention": {
1419
+ "n_heads_in_group": 16,
1420
+ "no_op": false,
1421
+ "num_sink_tokens": null,
1422
+ "replace_with_linear": false,
1423
+ "sparsify": null,
1424
+ "unshifted_sink": false,
1425
+ "use_prefill_window_in_sink_attention": false,
1426
+ "window_length": null
1427
+ },
1428
+ "ffn": {
1429
+ "ffn_mult": 4.875,
1430
+ "no_op": false,
1431
+ "replace_with_linear": false,
1432
+ "sparsify": null
1433
+ }
1434
+ },
1435
+ {
1436
+ "attention": {
1437
+ "n_heads_in_group": 16,
1438
+ "no_op": false,
1439
+ "num_sink_tokens": null,
1440
+ "replace_with_linear": false,
1441
+ "sparsify": null,
1442
+ "unshifted_sink": false,
1443
+ "use_prefill_window_in_sink_attention": false,
1444
+ "window_length": null
1445
+ },
1446
+ "ffn": {
1447
+ "ffn_mult": 4.875,
1448
+ "no_op": false,
1449
+ "replace_with_linear": false,
1450
+ "sparsify": null
1451
+ }
1452
+ },
1453
+ {
1454
+ "attention": {
1455
+ "n_heads_in_group": 16,
1456
+ "no_op": false,
1457
+ "num_sink_tokens": null,
1458
+ "replace_with_linear": false,
1459
+ "sparsify": null,
1460
+ "unshifted_sink": false,
1461
+ "use_prefill_window_in_sink_attention": false,
1462
+ "window_length": null
1463
+ },
1464
+ "ffn": {
1465
+ "ffn_mult": 4.875,
1466
+ "no_op": false,
1467
+ "replace_with_linear": false,
1468
+ "sparsify": null
1469
+ }
1470
+ },
1471
+ {
1472
+ "attention": {
1473
+ "n_heads_in_group": null,
1474
+ "no_op": true,
1475
+ "num_sink_tokens": null,
1476
+ "replace_with_linear": false,
1477
+ "sparsify": null,
1478
+ "unshifted_sink": false,
1479
+ "use_prefill_window_in_sink_attention": false,
1480
+ "window_length": null
1481
+ },
1482
+ "ffn": {
1483
+ "ffn_mult": null,
1484
+ "no_op": true,
1485
+ "replace_with_linear": false,
1486
+ "sparsify": null
1487
+ }
1488
+ },
1489
+ {
1490
+ "attention": {
1491
+ "n_heads_in_group": null,
1492
+ "no_op": true,
1493
+ "num_sink_tokens": null,
1494
+ "replace_with_linear": false,
1495
+ "sparsify": null,
1496
+ "unshifted_sink": false,
1497
+ "use_prefill_window_in_sink_attention": false,
1498
+ "window_length": null
1499
+ },
1500
+ "ffn": {
1501
+ "ffn_mult": null,
1502
+ "no_op": true,
1503
+ "replace_with_linear": false,
1504
+ "sparsify": null
1505
+ }
1506
+ },
1507
+ {
1508
+ "attention": {
1509
+ "n_heads_in_group": null,
1510
+ "no_op": true,
1511
+ "num_sink_tokens": null,
1512
+ "replace_with_linear": false,
1513
+ "sparsify": null,
1514
+ "unshifted_sink": false,
1515
+ "use_prefill_window_in_sink_attention": false,
1516
+ "window_length": null
1517
+ },
1518
+ "ffn": {
1519
+ "ffn_mult": null,
1520
+ "no_op": true,
1521
+ "replace_with_linear": false,
1522
+ "sparsify": null
1523
+ }
1524
+ },
1525
+ {
1526
+ "attention": {
1527
+ "n_heads_in_group": null,
1528
+ "no_op": true,
1529
+ "num_sink_tokens": null,
1530
+ "replace_with_linear": false,
1531
+ "sparsify": null,
1532
+ "unshifted_sink": false,
1533
+ "use_prefill_window_in_sink_attention": false,
1534
+ "window_length": null
1535
+ },
1536
+ "ffn": {
1537
+ "ffn_mult": null,
1538
+ "no_op": true,
1539
+ "replace_with_linear": false,
1540
+ "sparsify": null
1541
+ }
1542
+ },
1543
+ {
1544
+ "attention": {
1545
+ "n_heads_in_group": null,
1546
+ "no_op": true,
1547
+ "num_sink_tokens": null,
1548
+ "replace_with_linear": false,
1549
+ "sparsify": null,
1550
+ "unshifted_sink": false,
1551
+ "use_prefill_window_in_sink_attention": false,
1552
+ "window_length": null
1553
+ },
1554
+ "ffn": {
1555
+ "ffn_mult": null,
1556
+ "no_op": true,
1557
+ "replace_with_linear": false,
1558
+ "sparsify": null
1559
+ }
1560
+ },
1561
+ {
1562
+ "attention": {
1563
+ "n_heads_in_group": 16,
1564
+ "no_op": false,
1565
+ "num_sink_tokens": null,
1566
+ "replace_with_linear": false,
1567
+ "sparsify": null,
1568
+ "unshifted_sink": false,
1569
+ "use_prefill_window_in_sink_attention": false,
1570
+ "window_length": null
1571
+ },
1572
+ "ffn": {
1573
+ "ffn_mult": 4.875,
1574
+ "no_op": false,
1575
+ "replace_with_linear": false,
1576
+ "sparsify": null
1577
+ }
1578
+ },
1579
+ {
1580
+ "attention": {
1581
+ "n_heads_in_group": 16,
1582
+ "no_op": false,
1583
+ "num_sink_tokens": null,
1584
+ "replace_with_linear": false,
1585
+ "sparsify": null,
1586
+ "unshifted_sink": false,
1587
+ "use_prefill_window_in_sink_attention": false,
1588
+ "window_length": null
1589
+ },
1590
+ "ffn": {
1591
+ "ffn_mult": 3.4125,
1592
+ "no_op": false,
1593
+ "replace_with_linear": false,
1594
+ "sparsify": null
1595
+ }
1596
+ },
1597
+ {
1598
+ "attention": {
1599
+ "n_heads_in_group": 16,
1600
+ "no_op": false,
1601
+ "num_sink_tokens": null,
1602
+ "replace_with_linear": false,
1603
+ "sparsify": null,
1604
+ "unshifted_sink": false,
1605
+ "use_prefill_window_in_sink_attention": false,
1606
+ "window_length": null
1607
+ },
1608
+ "ffn": {
1609
+ "ffn_mult": 3.4125,
1610
+ "no_op": false,
1611
+ "replace_with_linear": false,
1612
+ "sparsify": null
1613
+ }
1614
+ },
1615
+ {
1616
+ "attention": {
1617
+ "n_heads_in_group": 16,
1618
+ "no_op": false,
1619
+ "num_sink_tokens": null,
1620
+ "replace_with_linear": false,
1621
+ "sparsify": null,
1622
+ "unshifted_sink": false,
1623
+ "use_prefill_window_in_sink_attention": false,
1624
+ "window_length": null
1625
+ },
1626
+ "ffn": {
1627
+ "ffn_mult": 3.4125,
1628
+ "no_op": false,
1629
+ "replace_with_linear": false,
1630
+ "sparsify": null
1631
+ }
1632
+ },
1633
+ {
1634
+ "attention": {
1635
+ "n_heads_in_group": null,
1636
+ "no_op": true,
1637
+ "num_sink_tokens": null,
1638
+ "replace_with_linear": false,
1639
+ "sparsify": null,
1640
+ "unshifted_sink": false,
1641
+ "use_prefill_window_in_sink_attention": false,
1642
+ "window_length": null
1643
+ },
1644
+ "ffn": {
1645
+ "ffn_mult": null,
1646
+ "no_op": true,
1647
+ "replace_with_linear": false,
1648
+ "sparsify": null
1649
+ }
1650
+ },
1651
+ {
1652
+ "attention": {
1653
+ "n_heads_in_group": null,
1654
+ "no_op": true,
1655
+ "num_sink_tokens": null,
1656
+ "replace_with_linear": false,
1657
+ "sparsify": null,
1658
+ "unshifted_sink": false,
1659
+ "use_prefill_window_in_sink_attention": false,
1660
+ "window_length": null
1661
+ },
1662
+ "ffn": {
1663
+ "ffn_mult": null,
1664
+ "no_op": true,
1665
+ "replace_with_linear": false,
1666
+ "sparsify": null
1667
+ }
1668
+ },
1669
+ {
1670
+ "attention": {
1671
+ "n_heads_in_group": 16,
1672
+ "no_op": false,
1673
+ "num_sink_tokens": null,
1674
+ "replace_with_linear": false,
1675
+ "sparsify": null,
1676
+ "unshifted_sink": false,
1677
+ "use_prefill_window_in_sink_attention": false,
1678
+ "window_length": null
1679
+ },
1680
+ "ffn": {
1681
+ "ffn_mult": 2.925,
1682
+ "no_op": false,
1683
+ "replace_with_linear": false,
1684
+ "sparsify": null
1685
+ }
1686
+ },
1687
+ {
1688
+ "attention": {
1689
+ "n_heads_in_group": 16,
1690
+ "no_op": false,
1691
+ "num_sink_tokens": null,
1692
+ "replace_with_linear": false,
1693
+ "sparsify": null,
1694
+ "unshifted_sink": false,
1695
+ "use_prefill_window_in_sink_attention": false,
1696
+ "window_length": null
1697
+ },
1698
+ "ffn": {
1699
+ "ffn_mult": 2.4375,
1700
+ "no_op": false,
1701
+ "replace_with_linear": false,
1702
+ "sparsify": null
1703
+ }
1704
+ },
1705
+ {
1706
+ "attention": {
1707
+ "n_heads_in_group": 16,
1708
+ "no_op": false,
1709
+ "num_sink_tokens": null,
1710
+ "replace_with_linear": false,
1711
+ "sparsify": null,
1712
+ "unshifted_sink": false,
1713
+ "use_prefill_window_in_sink_attention": false,
1714
+ "window_length": null
1715
+ },
1716
+ "ffn": {
1717
+ "ffn_mult": 2.4375,
1718
+ "no_op": false,
1719
+ "replace_with_linear": false,
1720
+ "sparsify": null
1721
+ }
1722
+ },
1723
+ {
1724
+ "attention": {
1725
+ "n_heads_in_group": 16,
1726
+ "no_op": false,
1727
+ "num_sink_tokens": null,
1728
+ "replace_with_linear": false,
1729
+ "sparsify": null,
1730
+ "unshifted_sink": false,
1731
+ "use_prefill_window_in_sink_attention": false,
1732
+ "window_length": null
1733
+ },
1734
+ "ffn": {
1735
+ "ffn_mult": 2.4375,
1736
+ "no_op": false,
1737
+ "replace_with_linear": false,
1738
+ "sparsify": null
1739
+ }
1740
+ },
1741
+ {
1742
+ "attention": {
1743
+ "n_heads_in_group": null,
1744
+ "no_op": true,
1745
+ "num_sink_tokens": null,
1746
+ "replace_with_linear": false,
1747
+ "sparsify": null,
1748
+ "unshifted_sink": false,
1749
+ "use_prefill_window_in_sink_attention": false,
1750
+ "window_length": null
1751
+ },
1752
+ "ffn": {
1753
+ "ffn_mult": 2.4375,
1754
+ "no_op": false,
1755
+ "replace_with_linear": false,
1756
+ "sparsify": null
1757
+ }
1758
+ },
1759
+ {
1760
+ "attention": {
1761
+ "n_heads_in_group": null,
1762
+ "no_op": true,
1763
+ "num_sink_tokens": null,
1764
+ "replace_with_linear": false,
1765
+ "sparsify": null,
1766
+ "unshifted_sink": false,
1767
+ "use_prefill_window_in_sink_attention": false,
1768
+ "window_length": null
1769
+ },
1770
+ "ffn": {
1771
+ "ffn_mult": 2.4375,
1772
+ "no_op": false,
1773
+ "replace_with_linear": false,
1774
+ "sparsify": null
1775
+ }
1776
+ },
1777
+ {
1778
+ "attention": {
1779
+ "n_heads_in_group": 16,
1780
+ "no_op": false,
1781
+ "num_sink_tokens": null,
1782
+ "replace_with_linear": false,
1783
+ "sparsify": null,
1784
+ "unshifted_sink": false,
1785
+ "use_prefill_window_in_sink_attention": false,
1786
+ "window_length": null
1787
+ },
1788
+ "ffn": {
1789
+ "ffn_mult": 2.4375,
1790
+ "no_op": false,
1791
+ "replace_with_linear": false,
1792
+ "sparsify": null
1793
+ }
1794
+ },
1795
+ {
1796
+ "attention": {
1797
+ "n_heads_in_group": null,
1798
+ "no_op": true,
1799
+ "num_sink_tokens": null,
1800
+ "replace_with_linear": false,
1801
+ "sparsify": null,
1802
+ "unshifted_sink": false,
1803
+ "use_prefill_window_in_sink_attention": false,
1804
+ "window_length": null
1805
+ },
1806
+ "ffn": {
1807
+ "ffn_mult": null,
1808
+ "no_op": true,
1809
+ "replace_with_linear": false,
1810
+ "sparsify": null
1811
+ }
1812
+ },
1813
+ {
1814
+ "attention": {
1815
+ "n_heads_in_group": null,
1816
+ "no_op": true,
1817
+ "num_sink_tokens": null,
1818
+ "replace_with_linear": false,
1819
+ "sparsify": null,
1820
+ "unshifted_sink": false,
1821
+ "use_prefill_window_in_sink_attention": false,
1822
+ "window_length": null
1823
+ },
1824
+ "ffn": {
1825
+ "ffn_mult": 2.4375,
1826
+ "no_op": false,
1827
+ "replace_with_linear": false,
1828
+ "sparsify": null
1829
+ }
1830
+ },
1831
+ {
1832
+ "attention": {
1833
+ "n_heads_in_group": null,
1834
+ "no_op": true,
1835
+ "num_sink_tokens": null,
1836
+ "replace_with_linear": false,
1837
+ "sparsify": null,
1838
+ "unshifted_sink": false,
1839
+ "use_prefill_window_in_sink_attention": false,
1840
+ "window_length": null
1841
+ },
1842
+ "ffn": {
1843
+ "ffn_mult": 2.4375,
1844
+ "no_op": false,
1845
+ "replace_with_linear": false,
1846
+ "sparsify": null
1847
+ }
1848
+ },
1849
+ {
1850
+ "attention": {
1851
+ "n_heads_in_group": null,
1852
+ "no_op": true,
1853
+ "num_sink_tokens": null,
1854
+ "replace_with_linear": false,
1855
+ "sparsify": null,
1856
+ "unshifted_sink": false,
1857
+ "use_prefill_window_in_sink_attention": false,
1858
+ "window_length": null
1859
+ },
1860
+ "ffn": {
1861
+ "ffn_mult": 2.4375,
1862
+ "no_op": false,
1863
+ "replace_with_linear": false,
1864
+ "sparsify": null
1865
+ }
1866
+ },
1867
+ {
1868
+ "attention": {
1869
+ "n_heads_in_group": null,
1870
+ "no_op": true,
1871
+ "num_sink_tokens": null,
1872
+ "replace_with_linear": false,
1873
+ "sparsify": null,
1874
+ "unshifted_sink": false,
1875
+ "use_prefill_window_in_sink_attention": false,
1876
+ "window_length": null
1877
+ },
1878
+ "ffn": {
1879
+ "ffn_mult": 2.4375,
1880
+ "no_op": false,
1881
+ "replace_with_linear": false,
1882
+ "sparsify": null
1883
+ }
1884
+ },
1885
+ {
1886
+ "attention": {
1887
+ "n_heads_in_group": null,
1888
+ "no_op": true,
1889
+ "num_sink_tokens": null,
1890
+ "replace_with_linear": false,
1891
+ "sparsify": null,
1892
+ "unshifted_sink": false,
1893
+ "use_prefill_window_in_sink_attention": false,
1894
+ "window_length": null
1895
+ },
1896
+ "ffn": {
1897
+ "ffn_mult": 2.4375,
1898
+ "no_op": false,
1899
+ "replace_with_linear": false,
1900
+ "sparsify": null
1901
+ }
1902
+ },
1903
+ {
1904
+ "attention": {
1905
+ "n_heads_in_group": null,
1906
+ "no_op": true,
1907
+ "num_sink_tokens": null,
1908
+ "replace_with_linear": false,
1909
+ "sparsify": null,
1910
+ "unshifted_sink": false,
1911
+ "use_prefill_window_in_sink_attention": false,
1912
+ "window_length": null
1913
+ },
1914
+ "ffn": {
1915
+ "ffn_mult": 2.4375,
1916
+ "no_op": false,
1917
+ "replace_with_linear": false,
1918
+ "sparsify": null
1919
+ }
1920
+ },
1921
+ {
1922
+ "attention": {
1923
+ "n_heads_in_group": null,
1924
+ "no_op": true,
1925
+ "num_sink_tokens": null,
1926
+ "replace_with_linear": false,
1927
+ "sparsify": null,
1928
+ "unshifted_sink": false,
1929
+ "use_prefill_window_in_sink_attention": false,
1930
+ "window_length": null
1931
+ },
1932
+ "ffn": {
1933
+ "ffn_mult": 2.4375,
1934
+ "no_op": false,
1935
+ "replace_with_linear": false,
1936
+ "sparsify": null
1937
+ }
1938
+ },
1939
+ {
1940
+ "attention": {
1941
+ "n_heads_in_group": 16,
1942
+ "no_op": false,
1943
+ "num_sink_tokens": null,
1944
+ "replace_with_linear": false,
1945
+ "sparsify": null,
1946
+ "unshifted_sink": false,
1947
+ "use_prefill_window_in_sink_attention": false,
1948
+ "window_length": null
1949
+ },
1950
+ "ffn": {
1951
+ "ffn_mult": 2.4375,
1952
+ "no_op": false,
1953
+ "replace_with_linear": false,
1954
+ "sparsify": null
1955
+ }
1956
+ },
1957
+ {
1958
+ "attention": {
1959
+ "n_heads_in_group": null,
1960
+ "no_op": true,
1961
+ "num_sink_tokens": null,
1962
+ "replace_with_linear": false,
1963
+ "sparsify": null,
1964
+ "unshifted_sink": false,
1965
+ "use_prefill_window_in_sink_attention": false,
1966
+ "window_length": null
1967
+ },
1968
+ "ffn": {
1969
+ "ffn_mult": null,
1970
+ "no_op": true,
1971
+ "replace_with_linear": false,
1972
+ "sparsify": null
1973
+ }
1974
+ },
1975
+ {
1976
+ "attention": {
1977
+ "n_heads_in_group": null,
1978
+ "no_op": true,
1979
+ "num_sink_tokens": null,
1980
+ "replace_with_linear": false,
1981
+ "sparsify": null,
1982
+ "unshifted_sink": false,
1983
+ "use_prefill_window_in_sink_attention": false,
1984
+ "window_length": null
1985
+ },
1986
+ "ffn": {
1987
+ "ffn_mult": null,
1988
+ "no_op": true,
1989
+ "replace_with_linear": false,
1990
+ "sparsify": null
1991
+ }
1992
+ },
1993
+ {
1994
+ "attention": {
1995
+ "n_heads_in_group": null,
1996
+ "no_op": true,
1997
+ "num_sink_tokens": null,
1998
+ "replace_with_linear": false,
1999
+ "sparsify": null,
2000
+ "unshifted_sink": false,
2001
+ "use_prefill_window_in_sink_attention": false,
2002
+ "window_length": null
2003
+ },
2004
+ "ffn": {
2005
+ "ffn_mult": null,
2006
+ "no_op": true,
2007
+ "replace_with_linear": false,
2008
+ "sparsify": null
2009
+ }
2010
+ },
2011
+ {
2012
+ "attention": {
2013
+ "n_heads_in_group": null,
2014
+ "no_op": true,
2015
+ "num_sink_tokens": null,
2016
+ "replace_with_linear": false,
2017
+ "sparsify": null,
2018
+ "unshifted_sink": false,
2019
+ "use_prefill_window_in_sink_attention": false,
2020
+ "window_length": null
2021
+ },
2022
+ "ffn": {
2023
+ "ffn_mult": null,
2024
+ "no_op": true,
2025
+ "replace_with_linear": false,
2026
+ "sparsify": null
2027
+ }
2028
+ },
2029
+ {
2030
+ "attention": {
2031
+ "n_heads_in_group": null,
2032
+ "no_op": true,
2033
+ "num_sink_tokens": null,
2034
+ "replace_with_linear": false,
2035
+ "sparsify": null,
2036
+ "unshifted_sink": false,
2037
+ "use_prefill_window_in_sink_attention": false,
2038
+ "window_length": null
2039
+ },
2040
+ "ffn": {
2041
+ "ffn_mult": null,
2042
+ "no_op": true,
2043
+ "replace_with_linear": false,
2044
+ "sparsify": null
2045
+ }
2046
+ },
2047
+ {
2048
+ "attention": {
2049
+ "n_heads_in_group": 16,
2050
+ "no_op": false,
2051
+ "num_sink_tokens": null,
2052
+ "replace_with_linear": false,
2053
+ "sparsify": null,
2054
+ "unshifted_sink": false,
2055
+ "use_prefill_window_in_sink_attention": false,
2056
+ "window_length": null
2057
+ },
2058
+ "ffn": {
2059
+ "ffn_mult": 2.925,
2060
+ "no_op": false,
2061
+ "replace_with_linear": false,
2062
+ "sparsify": null
2063
+ }
2064
+ },
2065
+ {
2066
+ "attention": {
2067
+ "n_heads_in_group": 16,
2068
+ "no_op": false,
2069
+ "num_sink_tokens": null,
2070
+ "replace_with_linear": false,
2071
+ "sparsify": null,
2072
+ "unshifted_sink": false,
2073
+ "use_prefill_window_in_sink_attention": false,
2074
+ "window_length": null
2075
+ },
2076
+ "ffn": {
2077
+ "ffn_mult": 4.875,
2078
+ "no_op": false,
2079
+ "replace_with_linear": false,
2080
+ "sparsify": null
2081
+ }
2082
+ },
2083
+ {
2084
+ "attention": {
2085
+ "n_heads_in_group": null,
2086
+ "no_op": true,
2087
+ "num_sink_tokens": null,
2088
+ "replace_with_linear": false,
2089
+ "sparsify": null,
2090
+ "unshifted_sink": false,
2091
+ "use_prefill_window_in_sink_attention": false,
2092
+ "window_length": null
2093
+ },
2094
+ "ffn": {
2095
+ "ffn_mult": 4.875,
2096
+ "no_op": false,
2097
+ "replace_with_linear": false,
2098
+ "sparsify": null
2099
+ }
2100
+ },
2101
+ {
2102
+ "attention": {
2103
+ "n_heads_in_group": 16,
2104
+ "no_op": false,
2105
+ "num_sink_tokens": null,
2106
+ "replace_with_linear": false,
2107
+ "sparsify": null,
2108
+ "unshifted_sink": false,
2109
+ "use_prefill_window_in_sink_attention": false,
2110
+ "window_length": null
2111
+ },
2112
+ "ffn": {
2113
+ "ffn_mult": 4.875,
2114
+ "no_op": false,
2115
+ "replace_with_linear": false,
2116
+ "sparsify": null
2117
+ }
2118
+ },
2119
+ {
2120
+ "attention": {
2121
+ "n_heads_in_group": null,
2122
+ "no_op": true,
2123
+ "num_sink_tokens": null,
2124
+ "replace_with_linear": false,
2125
+ "sparsify": null,
2126
+ "unshifted_sink": false,
2127
+ "use_prefill_window_in_sink_attention": false,
2128
+ "window_length": null
2129
+ },
2130
+ "ffn": {
2131
+ "ffn_mult": null,
2132
+ "no_op": true,
2133
+ "replace_with_linear": false,
2134
+ "sparsify": null
2135
+ }
2136
+ },
2137
+ {
2138
+ "attention": {
2139
+ "n_heads_in_group": null,
2140
+ "no_op": true,
2141
+ "num_sink_tokens": null,
2142
+ "replace_with_linear": false,
2143
+ "sparsify": null,
2144
+ "unshifted_sink": false,
2145
+ "use_prefill_window_in_sink_attention": false,
2146
+ "window_length": null
2147
+ },
2148
+ "ffn": {
2149
+ "ffn_mult": null,
2150
+ "no_op": true,
2151
+ "replace_with_linear": false,
2152
+ "sparsify": null
2153
+ }
2154
+ },
2155
+ {
2156
+ "attention": {
2157
+ "n_heads_in_group": null,
2158
+ "no_op": true,
2159
+ "num_sink_tokens": null,
2160
+ "replace_with_linear": false,
2161
+ "sparsify": null,
2162
+ "unshifted_sink": false,
2163
+ "use_prefill_window_in_sink_attention": false,
2164
+ "window_length": null
2165
+ },
2166
+ "ffn": {
2167
+ "ffn_mult": null,
2168
+ "no_op": true,
2169
+ "replace_with_linear": false,
2170
+ "sparsify": null
2171
+ }
2172
+ },
2173
+ {
2174
+ "attention": {
2175
+ "n_heads_in_group": null,
2176
+ "no_op": true,
2177
+ "num_sink_tokens": null,
2178
+ "replace_with_linear": false,
2179
+ "sparsify": null,
2180
+ "unshifted_sink": false,
2181
+ "use_prefill_window_in_sink_attention": false,
2182
+ "window_length": null
2183
+ },
2184
+ "ffn": {
2185
+ "ffn_mult": null,
2186
+ "no_op": true,
2187
+ "replace_with_linear": false,
2188
+ "sparsify": null
2189
+ }
2190
+ },
2191
+ {
2192
+ "attention": {
2193
+ "n_heads_in_group": null,
2194
+ "no_op": true,
2195
+ "num_sink_tokens": null,
2196
+ "replace_with_linear": false,
2197
+ "sparsify": null,
2198
+ "unshifted_sink": false,
2199
+ "use_prefill_window_in_sink_attention": false,
2200
+ "window_length": null
2201
+ },
2202
+ "ffn": {
2203
+ "ffn_mult": null,
2204
+ "no_op": true,
2205
+ "replace_with_linear": false,
2206
+ "sparsify": null
2207
+ }
2208
+ },
2209
+ {
2210
+ "attention": {
2211
+ "n_heads_in_group": null,
2212
+ "no_op": true,
2213
+ "num_sink_tokens": null,
2214
+ "replace_with_linear": false,
2215
+ "sparsify": null,
2216
+ "unshifted_sink": false,
2217
+ "use_prefill_window_in_sink_attention": false,
2218
+ "window_length": null
2219
+ },
2220
+ "ffn": {
2221
+ "ffn_mult": null,
2222
+ "no_op": true,
2223
+ "replace_with_linear": false,
2224
+ "sparsify": null
2225
+ }
2226
+ },
2227
+ {
2228
+ "attention": {
2229
+ "n_heads_in_group": null,
2230
+ "no_op": true,
2231
+ "num_sink_tokens": null,
2232
+ "replace_with_linear": false,
2233
+ "sparsify": null,
2234
+ "unshifted_sink": false,
2235
+ "use_prefill_window_in_sink_attention": false,
2236
+ "window_length": null
2237
+ },
2238
+ "ffn": {
2239
+ "ffn_mult": null,
2240
+ "no_op": true,
2241
+ "replace_with_linear": false,
2242
+ "sparsify": null
2243
+ }
2244
+ },
2245
+ {
2246
+ "attention": {
2247
+ "n_heads_in_group": null,
2248
+ "no_op": true,
2249
+ "num_sink_tokens": null,
2250
+ "replace_with_linear": false,
2251
+ "sparsify": null,
2252
+ "unshifted_sink": false,
2253
+ "use_prefill_window_in_sink_attention": false,
2254
+ "window_length": null
2255
+ },
2256
+ "ffn": {
2257
+ "ffn_mult": null,
2258
+ "no_op": true,
2259
+ "replace_with_linear": false,
2260
+ "sparsify": null
2261
+ }
2262
+ },
2263
+ {
2264
+ "attention": {
2265
+ "n_heads_in_group": null,
2266
+ "no_op": true,
2267
+ "num_sink_tokens": null,
2268
+ "replace_with_linear": false,
2269
+ "sparsify": null,
2270
+ "unshifted_sink": false,
2271
+ "use_prefill_window_in_sink_attention": false,
2272
+ "window_length": null
2273
+ },
2274
+ "ffn": {
2275
+ "ffn_mult": 36.5625,
2276
+ "no_op": false,
2277
+ "replace_with_linear": false,
2278
+ "sparsify": null
2279
+ }
2280
+ },
2281
+ {
2282
+ "attention": {
2283
+ "n_heads_in_group": null,
2284
+ "no_op": true,
2285
+ "num_sink_tokens": null,
2286
+ "replace_with_linear": false,
2287
+ "sparsify": null,
2288
+ "unshifted_sink": false,
2289
+ "use_prefill_window_in_sink_attention": false,
2290
+ "window_length": null
2291
+ },
2292
+ "ffn": {
2293
+ "ffn_mult": null,
2294
+ "no_op": true,
2295
+ "replace_with_linear": false,
2296
+ "sparsify": null
2297
+ }
2298
+ },
2299
+ {
2300
+ "attention": {
2301
+ "n_heads_in_group": null,
2302
+ "no_op": true,
2303
+ "num_sink_tokens": null,
2304
+ "replace_with_linear": false,
2305
+ "sparsify": null,
2306
+ "unshifted_sink": false,
2307
+ "use_prefill_window_in_sink_attention": false,
2308
+ "window_length": null
2309
+ },
2310
+ "ffn": {
2311
+ "ffn_mult": null,
2312
+ "no_op": true,
2313
+ "replace_with_linear": false,
2314
+ "sparsify": null
2315
+ }
2316
+ },
2317
+ {
2318
+ "attention": {
2319
+ "n_heads_in_group": null,
2320
+ "no_op": true,
2321
+ "num_sink_tokens": null,
2322
+ "replace_with_linear": false,
2323
+ "sparsify": null,
2324
+ "unshifted_sink": false,
2325
+ "use_prefill_window_in_sink_attention": false,
2326
+ "window_length": null
2327
+ },
2328
+ "ffn": {
2329
+ "ffn_mult": null,
2330
+ "no_op": true,
2331
+ "replace_with_linear": false,
2332
+ "sparsify": null
2333
+ }
2334
+ },
2335
+ {
2336
+ "attention": {
2337
+ "n_heads_in_group": null,
2338
+ "no_op": true,
2339
+ "num_sink_tokens": null,
2340
+ "replace_with_linear": false,
2341
+ "sparsify": null,
2342
+ "unshifted_sink": false,
2343
+ "use_prefill_window_in_sink_attention": false,
2344
+ "window_length": null
2345
+ },
2346
+ "ffn": {
2347
+ "ffn_mult": null,
2348
+ "no_op": true,
2349
+ "replace_with_linear": false,
2350
+ "sparsify": null
2351
+ }
2352
+ },
2353
+ {
2354
+ "attention": {
2355
+ "n_heads_in_group": null,
2356
+ "no_op": true,
2357
+ "num_sink_tokens": null,
2358
+ "replace_with_linear": false,
2359
+ "sparsify": null,
2360
+ "unshifted_sink": false,
2361
+ "use_prefill_window_in_sink_attention": false,
2362
+ "window_length": null
2363
+ },
2364
+ "ffn": {
2365
+ "ffn_mult": null,
2366
+ "no_op": true,
2367
+ "replace_with_linear": false,
2368
+ "sparsify": null
2369
+ }
2370
+ },
2371
+ {
2372
+ "attention": {
2373
+ "n_heads_in_group": null,
2374
+ "no_op": true,
2375
+ "num_sink_tokens": null,
2376
+ "replace_with_linear": false,
2377
+ "sparsify": null,
2378
+ "unshifted_sink": false,
2379
+ "use_prefill_window_in_sink_attention": false,
2380
+ "window_length": null
2381
+ },
2382
+ "ffn": {
2383
+ "ffn_mult": null,
2384
+ "no_op": true,
2385
+ "replace_with_linear": false,
2386
+ "sparsify": null
2387
+ }
2388
+ },
2389
+ {
2390
+ "attention": {
2391
+ "n_heads_in_group": null,
2392
+ "no_op": true,
2393
+ "num_sink_tokens": null,
2394
+ "replace_with_linear": false,
2395
+ "sparsify": null,
2396
+ "unshifted_sink": false,
2397
+ "use_prefill_window_in_sink_attention": false,
2398
+ "window_length": null
2399
+ },
2400
+ "ffn": {
2401
+ "ffn_mult": null,
2402
+ "no_op": true,
2403
+ "replace_with_linear": false,
2404
+ "sparsify": null
2405
+ }
2406
+ },
2407
+ {
2408
+ "attention": {
2409
+ "n_heads_in_group": null,
2410
+ "no_op": true,
2411
+ "num_sink_tokens": null,
2412
+ "replace_with_linear": false,
2413
+ "sparsify": null,
2414
+ "unshifted_sink": false,
2415
+ "use_prefill_window_in_sink_attention": false,
2416
+ "window_length": null
2417
+ },
2418
+ "ffn": {
2419
+ "ffn_mult": null,
2420
+ "no_op": true,
2421
+ "replace_with_linear": false,
2422
+ "sparsify": null
2423
+ }
2424
+ },
2425
+ {
2426
+ "attention": {
2427
+ "n_heads_in_group": null,
2428
+ "no_op": true,
2429
+ "num_sink_tokens": null,
2430
+ "replace_with_linear": false,
2431
+ "sparsify": null,
2432
+ "unshifted_sink": false,
2433
+ "use_prefill_window_in_sink_attention": false,
2434
+ "window_length": null
2435
+ },
2436
+ "ffn": {
2437
+ "ffn_mult": 39.0,
2438
+ "no_op": false,
2439
+ "replace_with_linear": false,
2440
+ "sparsify": null
2441
+ }
2442
+ },
2443
+ {
2444
+ "attention": {
2445
+ "n_heads_in_group": null,
2446
+ "no_op": true,
2447
+ "num_sink_tokens": null,
2448
+ "replace_with_linear": false,
2449
+ "sparsify": null,
2450
+ "unshifted_sink": false,
2451
+ "use_prefill_window_in_sink_attention": false,
2452
+ "window_length": null
2453
+ },
2454
+ "ffn": {
2455
+ "ffn_mult": null,
2456
+ "no_op": true,
2457
+ "replace_with_linear": false,
2458
+ "sparsify": null
2459
+ }
2460
+ },
2461
+ {
2462
+ "attention": {
2463
+ "n_heads_in_group": null,
2464
+ "no_op": true,
2465
+ "num_sink_tokens": null,
2466
+ "replace_with_linear": false,
2467
+ "sparsify": null,
2468
+ "unshifted_sink": false,
2469
+ "use_prefill_window_in_sink_attention": false,
2470
+ "window_length": null
2471
+ },
2472
+ "ffn": {
2473
+ "ffn_mult": null,
2474
+ "no_op": true,
2475
+ "replace_with_linear": false,
2476
+ "sparsify": null
2477
+ }
2478
+ },
2479
+ {
2480
+ "attention": {
2481
+ "n_heads_in_group": null,
2482
+ "no_op": true,
2483
+ "num_sink_tokens": null,
2484
+ "replace_with_linear": false,
2485
+ "sparsify": null,
2486
+ "unshifted_sink": false,
2487
+ "use_prefill_window_in_sink_attention": false,
2488
+ "window_length": null
2489
+ },
2490
+ "ffn": {
2491
+ "ffn_mult": null,
2492
+ "no_op": true,
2493
+ "replace_with_linear": false,
2494
+ "sparsify": null
2495
+ }
2496
+ },
2497
+ {
2498
+ "attention": {
2499
+ "n_heads_in_group": null,
2500
+ "no_op": true,
2501
+ "num_sink_tokens": null,
2502
+ "replace_with_linear": false,
2503
+ "sparsify": null,
2504
+ "unshifted_sink": false,
2505
+ "use_prefill_window_in_sink_attention": false,
2506
+ "window_length": null
2507
+ },
2508
+ "ffn": {
2509
+ "ffn_mult": null,
2510
+ "no_op": true,
2511
+ "replace_with_linear": false,
2512
+ "sparsify": null
2513
+ }
2514
+ },
2515
+ {
2516
+ "attention": {
2517
+ "n_heads_in_group": null,
2518
+ "no_op": true,
2519
+ "num_sink_tokens": null,
2520
+ "replace_with_linear": false,
2521
+ "sparsify": null,
2522
+ "unshifted_sink": false,
2523
+ "use_prefill_window_in_sink_attention": false,
2524
+ "window_length": null
2525
+ },
2526
+ "ffn": {
2527
+ "ffn_mult": null,
2528
+ "no_op": true,
2529
+ "replace_with_linear": false,
2530
+ "sparsify": null
2531
+ }
2532
+ },
2533
+ {
2534
+ "attention": {
2535
+ "n_heads_in_group": null,
2536
+ "no_op": true,
2537
+ "num_sink_tokens": null,
2538
+ "replace_with_linear": false,
2539
+ "sparsify": null,
2540
+ "unshifted_sink": false,
2541
+ "use_prefill_window_in_sink_attention": false,
2542
+ "window_length": null
2543
+ },
2544
+ "ffn": {
2545
+ "ffn_mult": null,
2546
+ "no_op": true,
2547
+ "replace_with_linear": false,
2548
+ "sparsify": null
2549
+ }
2550
+ },
2551
+ {
2552
+ "attention": {
2553
+ "n_heads_in_group": null,
2554
+ "no_op": true,
2555
+ "num_sink_tokens": null,
2556
+ "replace_with_linear": false,
2557
+ "sparsify": null,
2558
+ "unshifted_sink": false,
2559
+ "use_prefill_window_in_sink_attention": false,
2560
+ "window_length": null
2561
+ },
2562
+ "ffn": {
2563
+ "ffn_mult": null,
2564
+ "no_op": true,
2565
+ "replace_with_linear": false,
2566
+ "sparsify": null
2567
+ }
2568
+ },
2569
+ {
2570
+ "attention": {
2571
+ "n_heads_in_group": null,
2572
+ "no_op": true,
2573
+ "num_sink_tokens": null,
2574
+ "replace_with_linear": false,
2575
+ "sparsify": null,
2576
+ "unshifted_sink": false,
2577
+ "use_prefill_window_in_sink_attention": false,
2578
+ "window_length": null
2579
+ },
2580
+ "ffn": {
2581
+ "ffn_mult": null,
2582
+ "no_op": true,
2583
+ "replace_with_linear": false,
2584
+ "sparsify": null
2585
+ }
2586
+ },
2587
+ {
2588
+ "attention": {
2589
+ "n_heads_in_group": null,
2590
+ "no_op": true,
2591
+ "num_sink_tokens": null,
2592
+ "replace_with_linear": false,
2593
+ "sparsify": null,
2594
+ "unshifted_sink": false,
2595
+ "use_prefill_window_in_sink_attention": false,
2596
+ "window_length": null
2597
+ },
2598
+ "ffn": {
2599
+ "ffn_mult": 31.40625,
2600
+ "no_op": false,
2601
+ "replace_with_linear": false,
2602
+ "sparsify": null
2603
+ }
2604
+ },
2605
+ {
2606
+ "attention": {
2607
+ "n_heads_in_group": null,
2608
+ "no_op": true,
2609
+ "num_sink_tokens": null,
2610
+ "replace_with_linear": false,
2611
+ "sparsify": null,
2612
+ "unshifted_sink": false,
2613
+ "use_prefill_window_in_sink_attention": false,
2614
+ "window_length": null
2615
+ },
2616
+ "ffn": {
2617
+ "ffn_mult": null,
2618
+ "no_op": true,
2619
+ "replace_with_linear": false,
2620
+ "sparsify": null
2621
+ }
2622
+ },
2623
+ {
2624
+ "attention": {
2625
+ "n_heads_in_group": null,
2626
+ "no_op": true,
2627
+ "num_sink_tokens": null,
2628
+ "replace_with_linear": false,
2629
+ "sparsify": null,
2630
+ "unshifted_sink": false,
2631
+ "use_prefill_window_in_sink_attention": false,
2632
+ "window_length": null
2633
+ },
2634
+ "ffn": {
2635
+ "ffn_mult": null,
2636
+ "no_op": true,
2637
+ "replace_with_linear": false,
2638
+ "sparsify": null
2639
+ }
2640
+ },
2641
+ {
2642
+ "attention": {
2643
+ "n_heads_in_group": null,
2644
+ "no_op": true,
2645
+ "num_sink_tokens": null,
2646
+ "replace_with_linear": false,
2647
+ "sparsify": null,
2648
+ "unshifted_sink": false,
2649
+ "use_prefill_window_in_sink_attention": false,
2650
+ "window_length": null
2651
+ },
2652
+ "ffn": {
2653
+ "ffn_mult": null,
2654
+ "no_op": true,
2655
+ "replace_with_linear": false,
2656
+ "sparsify": null
2657
+ }
2658
+ },
2659
+ {
2660
+ "attention": {
2661
+ "n_heads_in_group": null,
2662
+ "no_op": true,
2663
+ "num_sink_tokens": null,
2664
+ "replace_with_linear": false,
2665
+ "sparsify": null,
2666
+ "unshifted_sink": false,
2667
+ "use_prefill_window_in_sink_attention": false,
2668
+ "window_length": null
2669
+ },
2670
+ "ffn": {
2671
+ "ffn_mult": null,
2672
+ "no_op": true,
2673
+ "replace_with_linear": false,
2674
+ "sparsify": null
2675
+ }
2676
+ },
2677
+ {
2678
+ "attention": {
2679
+ "n_heads_in_group": null,
2680
+ "no_op": true,
2681
+ "num_sink_tokens": null,
2682
+ "replace_with_linear": false,
2683
+ "sparsify": null,
2684
+ "unshifted_sink": false,
2685
+ "use_prefill_window_in_sink_attention": false,
2686
+ "window_length": null
2687
+ },
2688
+ "ffn": {
2689
+ "ffn_mult": null,
2690
+ "no_op": true,
2691
+ "replace_with_linear": false,
2692
+ "sparsify": null
2693
+ }
2694
+ },
2695
+ {
2696
+ "attention": {
2697
+ "n_heads_in_group": null,
2698
+ "no_op": true,
2699
+ "num_sink_tokens": null,
2700
+ "replace_with_linear": false,
2701
+ "sparsify": null,
2702
+ "unshifted_sink": false,
2703
+ "use_prefill_window_in_sink_attention": false,
2704
+ "window_length": null
2705
+ },
2706
+ "ffn": {
2707
+ "ffn_mult": 27.5625,
2708
+ "no_op": false,
2709
+ "replace_with_linear": false,
2710
+ "sparsify": null
2711
+ }
2712
+ },
2713
+ {
2714
+ "attention": {
2715
+ "n_heads_in_group": null,
2716
+ "no_op": true,
2717
+ "num_sink_tokens": null,
2718
+ "replace_with_linear": false,
2719
+ "sparsify": null,
2720
+ "unshifted_sink": false,
2721
+ "use_prefill_window_in_sink_attention": false,
2722
+ "window_length": null
2723
+ },
2724
+ "ffn": {
2725
+ "ffn_mult": 1.95,
2726
+ "no_op": false,
2727
+ "replace_with_linear": false,
2728
+ "sparsify": null
2729
+ }
2730
+ },
2731
+ {
2732
+ "attention": {
2733
+ "n_heads_in_group": 16,
2734
+ "no_op": false,
2735
+ "num_sink_tokens": null,
2736
+ "replace_with_linear": false,
2737
+ "sparsify": null,
2738
+ "unshifted_sink": false,
2739
+ "use_prefill_window_in_sink_attention": false,
2740
+ "window_length": null
2741
+ },
2742
+ "ffn": {
2743
+ "ffn_mult": 1.95,
2744
+ "no_op": false,
2745
+ "replace_with_linear": false,
2746
+ "sparsify": null
2747
+ }
2748
+ },
2749
+ {
2750
+ "attention": {
2751
+ "n_heads_in_group": 16,
2752
+ "no_op": false,
2753
+ "num_sink_tokens": null,
2754
+ "replace_with_linear": false,
2755
+ "sparsify": null,
2756
+ "unshifted_sink": false,
2757
+ "use_prefill_window_in_sink_attention": false,
2758
+ "window_length": null
2759
+ },
2760
+ "ffn": {
2761
+ "ffn_mult": 2.4375,
2762
+ "no_op": false,
2763
+ "replace_with_linear": false,
2764
+ "sparsify": null
2765
+ }
2766
+ },
2767
+ {
2768
+ "attention": {
2769
+ "n_heads_in_group": null,
2770
+ "no_op": true,
2771
+ "num_sink_tokens": null,
2772
+ "replace_with_linear": false,
2773
+ "sparsify": null,
2774
+ "unshifted_sink": false,
2775
+ "use_prefill_window_in_sink_attention": false,
2776
+ "window_length": null
2777
+ },
2778
+ "ffn": {
2779
+ "ffn_mult": null,
2780
+ "no_op": true,
2781
+ "replace_with_linear": false,
2782
+ "sparsify": null
2783
+ }
2784
+ },
2785
+ {
2786
+ "attention": {
2787
+ "n_heads_in_group": 16,
2788
+ "no_op": false,
2789
+ "num_sink_tokens": null,
2790
+ "replace_with_linear": false,
2791
+ "sparsify": null,
2792
+ "unshifted_sink": false,
2793
+ "use_prefill_window_in_sink_attention": false,
2794
+ "window_length": null
2795
+ },
2796
+ "ffn": {
2797
+ "ffn_mult": 2.4375,
2798
+ "no_op": false,
2799
+ "replace_with_linear": false,
2800
+ "sparsify": null
2801
+ }
2802
+ },
2803
+ {
2804
+ "attention": {
2805
+ "n_heads_in_group": 16,
2806
+ "no_op": false,
2807
+ "num_sink_tokens": null,
2808
+ "replace_with_linear": false,
2809
+ "sparsify": null,
2810
+ "unshifted_sink": false,
2811
+ "use_prefill_window_in_sink_attention": false,
2812
+ "window_length": null
2813
+ },
2814
+ "ffn": {
2815
+ "ffn_mult": 2.4375,
2816
+ "no_op": false,
2817
+ "replace_with_linear": false,
2818
+ "sparsify": null
2819
+ }
2820
+ },
2821
+ {
2822
+ "attention": {
2823
+ "n_heads_in_group": 16,
2824
+ "no_op": false,
2825
+ "num_sink_tokens": null,
2826
+ "replace_with_linear": false,
2827
+ "sparsify": null,
2828
+ "unshifted_sink": false,
2829
+ "use_prefill_window_in_sink_attention": false,
2830
+ "window_length": null
2831
+ },
2832
+ "ffn": {
2833
+ "ffn_mult": 3.4125,
2834
+ "no_op": false,
2835
+ "replace_with_linear": false,
2836
+ "sparsify": null
2837
+ }
2838
+ },
2839
+ {
2840
+ "attention": {
2841
+ "n_heads_in_group": 16,
2842
+ "no_op": false,
2843
+ "num_sink_tokens": null,
2844
+ "replace_with_linear": false,
2845
+ "sparsify": null,
2846
+ "unshifted_sink": false,
2847
+ "use_prefill_window_in_sink_attention": false,
2848
+ "window_length": null
2849
+ },
2850
+ "ffn": {
2851
+ "ffn_mult": 4.875,
2852
+ "no_op": false,
2853
+ "replace_with_linear": false,
2854
+ "sparsify": null
2855
+ }
2856
+ },
2857
+ {
2858
+ "attention": {
2859
+ "n_heads_in_group": 16,
2860
+ "no_op": false,
2861
+ "num_sink_tokens": null,
2862
+ "replace_with_linear": false,
2863
+ "sparsify": null,
2864
+ "unshifted_sink": false,
2865
+ "use_prefill_window_in_sink_attention": false,
2866
+ "window_length": null
2867
+ },
2868
+ "ffn": {
2869
+ "ffn_mult": 4.875,
2870
+ "no_op": false,
2871
+ "replace_with_linear": false,
2872
+ "sparsify": null
2873
+ }
2874
+ },
2875
+ {
2876
+ "attention": {
2877
+ "n_heads_in_group": 16,
2878
+ "no_op": false,
2879
+ "num_sink_tokens": null,
2880
+ "replace_with_linear": false,
2881
+ "sparsify": null,
2882
+ "unshifted_sink": false,
2883
+ "use_prefill_window_in_sink_attention": false,
2884
+ "window_length": null
2885
+ },
2886
+ "ffn": {
2887
+ "ffn_mult": 4.875,
2888
+ "no_op": false,
2889
+ "replace_with_linear": false,
2890
+ "sparsify": null
2891
+ }
2892
+ },
2893
+ {
2894
+ "attention": {
2895
+ "n_heads_in_group": 16,
2896
+ "no_op": false,
2897
+ "num_sink_tokens": null,
2898
+ "replace_with_linear": false,
2899
+ "sparsify": null,
2900
+ "unshifted_sink": false,
2901
+ "use_prefill_window_in_sink_attention": false,
2902
+ "window_length": null
2903
+ },
2904
+ "ffn": {
2905
+ "ffn_mult": 4.875,
2906
+ "no_op": false,
2907
+ "replace_with_linear": false,
2908
+ "sparsify": null
2909
+ }
2910
+ },
2911
+ {
2912
+ "attention": {
2913
+ "n_heads_in_group": 16,
2914
+ "no_op": false,
2915
+ "num_sink_tokens": null,
2916
+ "replace_with_linear": false,
2917
+ "sparsify": null,
2918
+ "unshifted_sink": false,
2919
+ "use_prefill_window_in_sink_attention": false,
2920
+ "window_length": null
2921
+ },
2922
+ "ffn": {
2923
+ "ffn_mult": 2.4375,
2924
+ "no_op": false,
2925
+ "replace_with_linear": false,
2926
+ "sparsify": null
2927
+ }
2928
+ }
2929
+ ],
2930
+ "bos_token_id": 128000,
2931
+ "eos_token_id": [
2932
+ 128001,
2933
+ 128008,
2934
+ 128009
2935
+ ],
2936
+ "hidden_act": "silu",
2937
+ "hidden_size": 16384,
2938
+ "initializer_range": 0.02,
2939
+ "intermediate_size": null,
2940
+ "max_position_embeddings": 131072,
2941
+ "mlp_bias": false,
2942
+ "model_type": "nemotron-nas",
2943
+ "num_attention_heads": 128,
2944
+ "num_hidden_layers": 162,
2945
+ "num_key_value_heads": null,
2946
+ "pretraining_tp": 1,
2947
+ "rms_norm_eps": 1e-05,
2948
+ "rope_scaling": {
2949
+ "factor": 16.0,
2950
+ "high_freq_factor": 4.0,
2951
+ "low_freq_factor": 1.0,
2952
+ "original_max_position_embeddings": 8192,
2953
+ "rope_type": "llama3"
2954
+ },
2955
+ "rope_theta": 500000.0,
2956
+ "tie_word_embeddings": false,
2957
+ "torch_dtype": "bfloat16",
2958
+ "transformers_version": "4.45.1",
2959
+ "use_cache": true,
2960
+ "vocab_size": 128256
2961
+ }
configuration_decilm.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Nvidia Corporation. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import dataclasses
17
+ import warnings
18
+ from typing import Dict, Any
19
+
20
+ from transformers.utils import is_flash_attn_2_available
21
+
22
+ from .block_config import BlockConfig
23
+ from .transformers_4_44_2__configuration_llama import LlamaConfig
24
+ from .transformers_4_44_2__modeling_rope_utils import \
25
+ rope_config_validation # fake import to make AutoConfig infer the dependency
26
+
27
+ rope_config_validation # this line is here to make sure that auto-formatting doesn't remove the import
28
+
29
+
30
class DeciLMConfig(LlamaConfig):
    """Llama-derived configuration that carries a per-layer list of block configs.

    ``block_configs`` holds one entry per hidden layer (its length is checked
    against ``num_hidden_layers``). After the parent initializer runs, the
    model-wide ``intermediate_size`` and ``num_key_value_heads`` attributes
    are reset to ``None``.
    """

    model_type = "nemotron-nas"

    def __init__(
        self,
        block_configs: list[dict] | list[BlockConfig] | None = None,
        **kwargs,
    ):
        """Initialize the config and normalize ``block_configs``.

        Args:
            block_configs: Per-layer block descriptions, given either as
                ``BlockConfig`` instances or as dicts of ``BlockConfig``
                keyword arguments. ``None`` stores ``None`` on the config.
            **kwargs: Forwarded to ``LlamaConfig.__init__``. An
                ``attn_implementation`` entry, if present, is popped and
                resolved here before being forwarded.

        Raises:
            AssertionError: If ``block_configs`` is given and its length
                differs from ``num_hidden_layers``.
        """
        attn_implementation = kwargs.pop("attn_implementation", None)
        # Default to flash attention 2 only when the caller did not choose an
        # implementation and the availability probe reports it usable.
        if attn_implementation is None and is_flash_attn_2_available():
            attn_implementation = "flash_attention_2"

        if block_configs is not None:
            # Convert element by element: an empty list no longer raises
            # IndexError, and lists mixing dicts with BlockConfig objects are
            # normalized instead of being judged by their first element only.
            block_configs = [
                BlockConfig(**conf) if isinstance(conf, dict) else conf
                for conf in block_configs
            ]

            using_unshifted_sink = any(
                block_config.attention.unshifted_sink for block_config in block_configs
            )
            # Any layer with unshifted sink forces the eager implementation,
            # overriding both the caller's choice and the default above.
            if using_unshifted_sink and attn_implementation != "eager":
                warnings.warn("Forcing attn_implementation='eager' since some attention layers use unshifted sink")
                attn_implementation = "eager"

        super().__init__(attn_implementation=attn_implementation, **kwargs)

        # Reset the model-wide values unconditionally, whatever the parent set.
        self.intermediate_size = None
        self.num_key_value_heads = None

        if block_configs is not None:
            # Kept as an assert so callers still see AssertionError.
            assert len(block_configs) == self.num_hidden_layers, (
                f"block_configs has {len(block_configs)} entries but "
                f"num_hidden_layers is {self.num_hidden_layers}"
            )

        self.block_configs: list[BlockConfig] | None = block_configs

    def to_dict(self) -> Dict[str, Any]:
        """Return the parent's dict form with ``block_configs`` as plain dicts.

        When ``block_configs`` is set, each entry is expanded with
        ``dataclasses.asdict`` and stored under the ``"block_configs"`` key.
        """
        self_dict = super().to_dict()
        if self.block_configs is not None:
            self_dict["block_configs"] = [dataclasses.asdict(conf) for conf in self.block_configs]
        return self_dict
model-00001-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5dbe6134deefa5c58219493ad5ed304690cb53c9817f50b8e441a42d0a56d768
3
+ size 9496497704
model-00002-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e453414a781fc32003cc4590f672772127e9a455574ec27c8db26435d04baf7f
3
+ size 9923726576
model-00003-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15949222a368cc658583f4c7d623237a78933d5f4f38b8953067983b2ff13d02
3
+ size 9403632784
model-00004-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:caa989cb73609dda3f8a705d1b8d7095c0cb26fee264e5e23aa4902ceb688e90
3
+ size 9999254704
model-00005-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df55b5e6606dc53077bfaf33ecfb1e216ff5b894b57a7caaa91784677855031e
3
+ size 9529756680
model-00006-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed59b0fd56d0b1c62e3ca058303cad864f7e8a4716723309aa2a6eeca2180dd0
3
+ size 8824816704
model-00007-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34e262b6f839a165b26875bf8a7199e7c5cd31fd14386c1d55ade118f4b957a6
3
+ size 7516391416
model-00008-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17bcdc7a25930719e6581c1e69495bbdd6fd9db56b16ce6a4cd18839d6bf63aa
3
+ size 8120173520
model-00009-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71329fd9755fcbffaa0c505e6b565d79f5e4a788d59ced0423fe48e4d9aeffcf
3
+ size 8724153648
model-00010-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:424671a36a4b76cdcef53c3780127395b343fcd4148b4f6a5b79b967096bfedd
3
+ size 9797928728
model-00011-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b7dac37dc498f469168f42dde8151ed3eaa82fcd4e4c6083559b6dbc7a9658e
3
+ size 8321730600
model-00012-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c53bb43c1a9fa69d16c4beeacd387495c86bc1b6431045379cedeb8de844267f
3
+ size 9630156560
model-00013-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d00fc7ed4c9313fb37a13e5985dbfe56133e00efa15e896b5c59d2fb4d0f3461
3
+ size 7550077744
model-00014-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a9cbfa9f26773a4f2b2f058c213b77a9814f304d8bff642485486d311341582
3
+ size 9865234288
model-00015-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5488a00c542a5a261294e7d8b899ce4dbe1edc4869847f7cb26d08c73317523
3
+ size 8120173520
model-00016-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6f9b5cd0c6b12a80d37df2d982cd1541f7c3c456d6c510fdfd12c6805d36629
3
+ size 7516227000
model-00017-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b58bff67f2f8e0e47b86e83cd171df7254c8e21fb83e3a4cc465d7ea837f403
3
+ size 9865234288
model-00018-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0aba6a488d1b2c8a2244dfac7db48e725d0128d9820ef8fa30af5f41c538baf0
3
+ size 8120173520
model-00019-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0b5c605987e1a43c5bbf1c4d2e6177020852ffcea7bb8b40dc5df3a6bb8ce9d
3
+ size 7516227000
model-00020-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:514b2870d3ee3a6c10fc6dfae1da8bd66b1e4de5a07f0d3402cd8b010a0665db
3
+ size 9865234288
model-00021-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a741e720b18b95992f32c82702aa6ffee0cb0e5dee0f91854d552ad12f96604
3
+ size 8120173520
model-00022-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be64c2a2e5b7c9a01f5480a4b4a957fbd53274abf16d4d1d59b3d88101671b63
3
+ size 7516227000
model-00023-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57140ddbdb1492884d83a5c4765633a345412e0c95b094cfbb9e1dfa634f8941
3
+ size 9865234288
model-00024-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:236476c971742130bc74f2f11ce0271c861e970be44f300dba157d731ee810ae
3
+ size 8120173520
model-00025-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d102f6f7604ac4af54999d44c7fac7b0cffee2803a8c9fa591334d4355e494e7
3
+ size 9865004104
model-00026-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cec920c4d730196ac4eda55e9632b905e7fdcf5ab77eabcc38adcbf054eed029
3
+ size 8405386784
model-00027-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2fa6f03938e1cc17dd014246e137dc40f12fe6582419d2fdb88c4de8ebd9ae4
3
+ size 9529723696
model-00028-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81f5c79d696b1b31a5604696df6426d99297cbf61856262aae98451fb7cbf435
3
+ size 9076475656
model-00029-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d9bc6be03f9f8d42fcd5994f87f2ee07fd90d3de7dc52c895ea71e5b1153135
3
+ size 9101641608
model-00030-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62fbd93b4cef1eadf647d28b9ed2da98e88cb276059f902942d26345c6242ea4
3
+ size 9261419184
model-00031-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25eef50bd433ebb2c3e15b004c20fabf490c09ac43456b384f31952950858711
3
+ size 9865004696
model-00032-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b13a8bc3b993a7d35edcda6be1c19cd5568584aed2bf2d2dceb068b21ca7dab2
3
+ size 8724482424
model-00033-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52796fd5c6b4629013fa76d4de4a7266c0ddec65b457ab792c520fc7e6b72a54
3
+ size 9865201416
model-00034-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abf4a1294e78cb9d522ec393a6ef9c1a1bd96e733a69393e927e2e26c121b47d
3
+ size 6727664600
model-00035-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:758a9f00f001b5de7426fd5f60f7025b17f6c851310945a0f36e189acfe2ce41
3
+ size 26172457232
model-00036-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:106061a51c4005c956ce5b7b1188850aca3e946c6a9350b52820c34bd5ede094
3
+ size 13086228632
model-00037-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08be35796d7e6b955d761062b00d6d41f9ffaddf14b068ce1e1fda609611a90d
3
+ size 27917287696
model-00038-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a254b9c52c747e760bc00e7d51e9f8ecb8360445b656cb4ca6cdc59f294bbfd
3
+ size 13958643864
model-00039-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:069c7a61fffbd35237e443a91d2038f2a2c1a6c1b3e41713dd300a4e27c8393a
3
+ size 22481469712
model-00040-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cddb00fbb8d7d0e867716621f39df8aa765fca99a70b13d398527102d707f54b
3
+ size 11240734872
model-00041-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6da605fbb86f89d3ba909881451d2d185385c5611c65b4fda88f5583d0f9a0ba
3
+ size 19730006288
model-00042-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef80aadc1deef5792b941e0229b8bb43b2cd475538d4f44ba3c83e28695b1d8c
3
+ size 5679187160
model-00043-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afc97447fb1297d51e2ec8a8dfd39b7abc96906a3b17df43119648903883c450
3
+ size 9865036056
model-00044-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de741e1d14911e68654ec86b2b1135dec00866a06b334a4246b79179c4f7fdc8
3
+ size 9663843024
model-00045-of-00049.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc757075156fc5372efe22fccbc7b4f8c4e3bf114b820e285a4558dfb8cc7d7b
3
+ size 8556447384