---
license: mit
datasets:
- tiiuae/falcon-refinedweb
language:
- en
metrics:
- perplexity
- accuracy
---
## Description

Models trained on 300B tokens, including variants with standard dense feedforward (FFN) layers and variants with low-rank FFN layers.
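
For orientation, below is a minimal, hypothetical sketch of the low-rank FFN idea: each FFN projection is factored into two thin matrices with a small inner rank. The module names, parameter names, and sizes here (`LowRankLinear`, `LowRankFFN`, `rank`, the dimensions) are illustrative assumptions, not the layer names or configurations used in these checkpoints; see the papers below for the actual structured FFN variants.

```python
import torch
import torch.nn as nn


class LowRankLinear(nn.Module):
    """A linear map factored as two thin matrices with a small inner rank (hypothetical name)."""

    def __init__(self, in_features: int, out_features: int, rank: int):
        super().__init__()
        self.proj_down = nn.Linear(in_features, rank, bias=False)  # project to the small rank dimension
        self.proj_up = nn.Linear(rank, out_features, bias=True)    # project back to the target dimension

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.proj_up(self.proj_down(x))


class LowRankFFN(nn.Module):
    """Transformer FFN block whose up- and down-projections are both low-rank factorized."""

    def __init__(self, d_model: int, d_ff: int, rank: int):
        super().__init__()
        self.w_in = LowRankLinear(d_model, d_ff, rank)
        self.w_out = LowRankLinear(d_ff, d_model, rank)
        self.act = nn.GELU()  # activation choice is illustrative

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.w_out(self.act(self.w_in(x)))


# Illustrative sizes only; they do not correspond to the released checkpoints.
ffn = LowRankFFN(d_model=1024, d_ff=4096, rank=256)
out = ffn(torch.randn(2, 16, 1024))  # (batch, sequence, d_model)
```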
## Citation

If you find this work useful, please consider citing the following papers:
```
@article{wei2024building,
  title={Building on efficient foundations: Effective training of LLMs with structured feedforward layers},
  author={Wei, Xiuying and Moalla, Skander and Pascanu, Razvan and Gulcehre, Caglar},
  journal={Advances in Neural Information Processing Systems},
  volume={37},
  pages={4689--4717},
  year={2024}
}

@article{wei2024investigating,
  title={Investigating low-rank training in transformer language models: Efficiency and scaling analysis},
  author={Wei, Xiuying and Moalla, Skander and Pascanu, Razvan and Gulcehre, Caglar},
  journal={arXiv preprint arXiv:2407.09835},
  year={2024}
}
```