| | |
| | |
| | |
| | |
| | |
| | |
| | import torch |
| | from torch.utils import cpp_extension |
| | from torch.utils.cpp_extension import CUDAExtension, BuildExtension |
| |
|
| | import os |
| | import subprocess |
| | import sys |
| |
|
| | from setuptools import find_packages, setup |
| |
|
# Opt-in flag for building the fused CUDA extensions. The custom flag is
# consumed here (stripped from sys.argv) so setuptools never sees an option
# it does not recognize. The original loop used enumerate() but never used
# the index; a comprehension expresses the same filter directly.
DISABLE_CUDA_EXTENSION = "--enable-cuda-ext" not in sys.argv
sys.argv = [arg for arg in sys.argv if arg != "--enable-cuda-ext"]
| |
|
| |
|
# Abort early on interpreters older than 3.7 -- unicore relies on 3.7+ features.
if sys.version_info[:2] < (3, 7):
    sys.exit("Sorry, Python >= 3.7 is required for unicore.")
| |
|
| |
|
def write_version_py():
    """Read ``unicore/version.txt`` and materialize ``unicore/version.py``.

    Returns the version string that was read and written.
    """
    txt_path = os.path.join("unicore", "version.txt")
    py_path = os.path.join("unicore", "version.py")

    with open(txt_path) as src:
        version = src.read().strip()

    with open(py_path, "w") as dst:
        dst.write('__version__ = "{}"\n'.format(version))
    return version
| |
|
| |
|
# Generate unicore/version.py as an import-time side effect and capture the
# version string for the setup() call below.
version = write_version_py()

# Absolute directory containing this setup.py; used to locate the csrc/ tree
# for the CUDA extension include paths.
this_dir = os.path.dirname(os.path.abspath(__file__))
| |
|
| |
|
def get_cuda_bare_metal_version(cuda_dir):
    """Return (raw ``nvcc -V`` output, major, minor) for the toolkit in *cuda_dir*.

    Parses the "release X.Y," token printed by ``nvcc -V``.

    Raises subprocess.CalledProcessError if nvcc cannot be executed, and
    ValueError if the output contains no "release" token.
    """
    raw_output = subprocess.check_output(
        [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True
    )
    output = raw_output.split()
    release_idx = output.index("release") + 1
    # The token looks like "11.8," -- the trailing comma is part of nvcc's output.
    release = output[release_idx].split(".")
    bare_metal_major = release[0]
    # BUG FIX: the original used release[1][0], keeping only the first character
    # of the minor version. That stripped the trailing comma by accident but
    # truncated two-digit minors (e.g. CUDA 11.10 -> "1"). Strip the comma
    # explicitly instead.
    bare_metal_minor = release[1].rstrip(",")

    return raw_output, bare_metal_major, bare_metal_minor
| |
|
| |
|
# When no GPU is visible but the CUDA extensions are requested, default to
# cross-compiling for a reasonable set of architectures unless the user has
# already exported TORCH_CUDA_ARCH_LIST.
if not torch.cuda.is_available() and not DISABLE_CUDA_EXTENSION:
    print(
        "\nWarning: Torch did not find available GPUs on this system.\n",
        "If your intention is to cross-compile, this is not an error.\n"
        "By default, it will cross-compile for Volta (compute capability 7.0), Turing (compute capability 7.5),\n"
        "and, if the CUDA version is >= 11.0, Ampere (compute capability 8.0).\n"
        "If you wish to cross-compile for a single specific architecture,\n"
        'export TORCH_CUDA_ARCH_LIST="compute capability" before running setup.py.\n',
    )
    if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None:
        _, bare_metal_major, _ = get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
        # BUG FIX: the original tested `== 11`, which silently dropped the
        # 8.0/9.0 architectures for CUDA 12+ toolkits and fell back to
        # Volta/Turing only. Any toolkit at major version 11 or newer
        # supports the wider arch list.
        # NOTE(review): sm_90 strictly requires CUDA >= 11.8 -- confirm the
        # minimum toolkit version this project supports.
        if int(bare_metal_major) >= 11:
            os.environ["TORCH_CUDA_ARCH_LIST"] = "7.0;7.5;8.0;9.0"
        else:
            os.environ["TORCH_CUDA_ARCH_LIST"] = "7.0;7.5"
| |
|
# Log the torch build being used and enforce the minimum supported release.
print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__))

_torch_version_parts = torch.__version__.split(".")
TORCH_MAJOR = int(_torch_version_parts[0])
TORCH_MINOR = int(_torch_version_parts[1])

# Tuple comparison covers the same cases as the original compound boolean:
# anything below (1, 4) is rejected, anything at or above it is accepted.
if (TORCH_MAJOR, TORCH_MINOR) < (1, 4):
    raise RuntimeError(
        "Requires Pytorch 1.4 or newer.\n"
        "The latest stable release can be obtained from https://pytorch.org/"
    )
| |
|
# Populated below only when the CUDA extensions are enabled
# (i.e. --enable-cuda-ext was passed).
cmdclass = {}
ext_modules = []

# No optional dependency groups are currently defined.
extras = {}
| |
|
if not DISABLE_CUDA_EXTENSION:
    # NOTE(review): the original code re-defined get_cuda_bare_metal_version
    # here with a body byte-identical to the module-level definition above;
    # the duplicate has been removed and the module-level function is used.

    def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
        """Raise RuntimeError if the nvcc found in *cuda_dir* does not match
        the CUDA version PyTorch was compiled against.

        NOTE(review): this check is defined but never invoked anywhere in
        this file -- confirm whether it should run before building.
        """
        raw_output, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(
            cuda_dir
        )
        torch_binary_major = torch.version.cuda.split(".")[0]
        torch_binary_minor = torch.version.cuda.split(".")[1]

        print("\nCompiling cuda extensions with")
        print(raw_output + "from " + cuda_dir + "/bin\n")

        if (bare_metal_major != torch_binary_major) or (
            bare_metal_minor != torch_binary_minor
        ):
            raise RuntimeError(
                "Cuda extensions are being compiled with a version of Cuda that does "
                + "not match the version used to compile Pytorch binaries. "
                + "Pytorch binaries were compiled with Cuda {}.\n".format(
                    torch.version.cuda
                )
            )

    cmdclass["build_ext"] = BuildExtension

    if torch.utils.cpp_extension.CUDA_HOME is None:
        raise RuntimeError(
            "Nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc."
        )

    # Older PyTorch releases ship ATen/CUDAGenerator.h; its presence selects
    # the legacy generator API in the CUDA kernels via -DOLD_GENERATOR.
    generator_flag = []
    torch_dir = torch.__path__[0]
    if os.path.exists(os.path.join(torch_dir, "include", "ATen", "CUDAGenerator.h")):
        generator_flag = ["-DOLD_GENERATOR"]

    # Compile-flag groups shared by the extensions below. These were
    # previously repeated verbatim in every CUDAExtension call.
    gencode_flags = [
        "-gencode",
        "arch=compute_70,code=sm_70",
        "-gencode",
        "arch=compute_80,code=sm_80",
        "-gencode",
        "arch=compute_90,code=sm_90",
    ]
    half_bf16_flags = [
        "-U__CUDA_NO_HALF_OPERATORS__",
        "-U__CUDA_NO_BFLOAT16_OPERATORS__",
        "-U__CUDA_NO_HALF_CONVERSIONS__",
        "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
        "--expt-relaxed-constexpr",
        "--expt-extended-lambda",
    ]

    def _cuda_extension(
        name, sources, half_flags=True, use_generator_flag=True, maxrregcount=None
    ):
        """Build one CUDAExtension with the project's shared compile flags.

        name, sources      -- forwarded to CUDAExtension.
        half_flags         -- include the half/bfloat16 undef + --expt flags.
        use_generator_flag -- append generator_flag to both cxx and nvcc args.
        maxrregcount       -- optional nvcc register cap (backward kernels).
        """
        gen = generator_flag if use_generator_flag else []
        nvcc_flags = ["-O3", "--use_fast_math"]
        if maxrregcount is not None:
            nvcc_flags.append("-maxrregcount={}".format(maxrregcount))
        nvcc_flags += gencode_flags
        if half_flags:
            nvcc_flags += half_bf16_flags
        nvcc_flags += gen
        return CUDAExtension(
            name=name,
            sources=sources,
            include_dirs=[os.path.join(this_dir, "csrc")],
            extra_compile_args={
                "cxx": ["-O3"] + gen,
                "nvcc": nvcc_flags,
            },
        )

    ext_modules.append(
        _cuda_extension(
            "unicore_fused_rounding",
            ["csrc/rounding/interface.cpp", "csrc/rounding/fp32_to_bf16.cu"],
        )
    )
    ext_modules.append(
        _cuda_extension(
            "unicore_fused_multi_tensor",
            [
                "csrc/multi_tensor/interface.cpp",
                "csrc/multi_tensor/multi_tensor_l2norm_kernel.cu",
            ],
            use_generator_flag=False,
        )
    )
    ext_modules.append(
        _cuda_extension(
            "unicore_fused_adam",
            ["csrc/adam/interface.cpp", "csrc/adam/adam_kernel.cu"],
            half_flags=False,
            use_generator_flag=False,
        )
    )
    ext_modules.append(
        _cuda_extension(
            "unicore_fused_softmax_dropout",
            [
                "csrc/softmax_dropout/interface.cpp",
                "csrc/softmax_dropout/softmax_dropout_kernel.cu",
            ],
        )
    )
    ext_modules.append(
        _cuda_extension(
            "unicore_fused_layernorm",
            ["csrc/layernorm/interface.cpp", "csrc/layernorm/layernorm.cu"],
        )
    )
    ext_modules.append(
        _cuda_extension(
            "unicore_fused_layernorm_backward_gamma_beta",
            [
                "csrc/layernorm/interface_gamma_beta.cpp",
                "csrc/layernorm/layernorm_backward.cu",
            ],
            maxrregcount=50,
        )
    )
    ext_modules.append(
        _cuda_extension(
            "unicore_fused_rmsnorm",
            ["csrc/rmsnorm/interface.cpp", "csrc/rmsnorm/rmsnorm.cu"],
        )
    )
    ext_modules.append(
        _cuda_extension(
            "unicore_fused_rmsnorm_backward_gamma",
            [
                "csrc/rmsnorm/interface_gamma.cpp",
                "csrc/rmsnorm/rmsnorm_backward.cu",
            ],
            maxrregcount=50,
        )
    )
# Standard setuptools entry point. ext_modules/cmdclass stay empty unless
# --enable-cuda-ext was passed on the command line.
setup(
    name="unicore",
    version=version,
    description="DP Technology's Core AI Framework",
    url="https://github.com/dptech-corp/unicore",
    classifiers=[
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    setup_requires=[
        "setuptools>=18.0",
    ],
    # NOTE(review): install_requires pins torch>=2.0.0 while the runtime
    # check earlier in this file only requires Pytorch 1.4 -- confirm which
    # minimum is intended.
    install_requires=[
        'numpy; python_version>="3.7"',
        "lmdb",
        "tqdm",
        "torch>=2.0.0",
        "ml_collections",
        "scipy",
        "tensorboardX",
        "tokenizers",
        "wandb",
    ],
    packages=find_packages(
        exclude=[
            "build",
            "csrc",
            "examples",
            "examples.*",
            "scripts",
            "scripts.*",
            "tests",
            "tests.*",
        ]
    ),
    ext_modules=ext_modules,
    cmdclass=cmdclass,
    extras_require=extras,
    entry_points={
        "console_scripts": [
            "unicore-train = unicore_cli.train:cli_main",
        ],
    },
    zip_safe=False,
)
| |
|