| --- |
| datasets: |
| - bigcode/the-stack-v2 |
| - yulan-team/YuLan-Mini-Datasets |
| - HuggingFaceFW/fineweb-edu |
| - mlfoundations/dclm-baseline-1.0 |
| - math-ai/AutoMathText |
| - gair-prox/open-web-math-pro |
| - RUC-AIBOX/long_form_thought_data_5k |
| - internlm/Lean-Workbook |
| - internlm/Lean-Github |
| - deepseek-ai/DeepSeek-Prover-V1 |
| - ScalableMath/Lean-STaR-base |
| - ScalableMath/Lean-STaR-plus |
| - ScalableMath/Lean-CoT-base |
| - ScalableMath/Lean-CoT-plus |
| - opencsg/chinese-fineweb-edu |
| - liwu/MNBVC |
| - vikp/textbook_quality_programming |
| - HuggingFaceTB/smollm-corpus |
| - OpenCoder-LLM/opc-annealing-corpus |
| - OpenCoder-LLM/opc-sft-stage1 |
| - OpenCoder-LLM/opc-sft-stage2 |
| - XinyaoHu/AMPS_mathematica |
| - deepmind/math_dataset |
| - mrfakename/basic-math-10m |
| - microsoft/orca-math-word-problems-200k |
| - AI-MO/NuminaMath-CoT |
| - HuggingFaceTB/cosmopedia |
| - MU-NLPC/Calc-ape210k |
| - manu/project_gutenberg |
| - storytracer/LoC-PD-Books |
| - allenai/dolma |
| --- |