#!/usr/bin/env python3 -u
# Copyright (c) DP Technology.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch
from torch.utils import cpp_extension
from torch.utils.cpp_extension import CUDAExtension, BuildExtension

import os
import subprocess
import sys

from setuptools import find_packages, setup

DISABLE_CUDA_EXTENSION = True
filtered_args = []
for i, arg in enumerate(sys.argv):
    if arg == "--enable-cuda-ext":
        DISABLE_CUDA_EXTENSION = False
        continue
    filtered_args.append(arg)
sys.argv = filtered_args


if sys.version_info < (3, 7):
    sys.exit("Sorry, Python >= 3.7 is required for unicore.")


def write_version_py():
    with open(os.path.join("unicore", "version.txt")) as f:
        version = f.read().strip()

    # write version info to unicore/version.py
    with open(os.path.join("unicore", "version.py"), "w") as f:
        f.write('__version__ = "{}"\n'.format(version))
    return version


version = write_version_py()


# # ninja build does not work unless include_dirs are abs path
this_dir = os.path.dirname(os.path.abspath(__file__))


def get_cuda_bare_metal_version(cuda_dir):
    raw_output = subprocess.check_output(
        [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True
    )
    output = raw_output.split()
    release_idx = output.index("release") + 1
    release = output[release_idx].split(".")
    bare_metal_major = release[0]
    bare_metal_minor = release[1][0]

    return raw_output, bare_metal_major, bare_metal_minor


if not torch.cuda.is_available() and not DISABLE_CUDA_EXTENSION:
    print(
        "\nWarning: Torch did not find available GPUs on this system.\n",
        "If your intention is to cross-compile, this is not an error.\n"
        "By default, it will cross-compile for Volta (compute capability 7.0), Turing (compute capability 7.5),\n"
        "and, if the CUDA version is >= 11.0, Ampere (compute capability 8.0).\n"
        "If you wish to cross-compile for a single specific architecture,\n"
        'export TORCH_CUDA_ARCH_LIST="compute capability" before running setup.py.\n',
    )
    if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None:
        _, bare_metal_major, _ = get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
        if int(bare_metal_major) == 11:
            os.environ["TORCH_CUDA_ARCH_LIST"] = "7.0;7.5;8.0;9.0"
        else:
            os.environ["TORCH_CUDA_ARCH_LIST"] = "7.0;7.5"

print("\n\ntorch.__version__  = {}\n\n".format(torch.__version__))
TORCH_MAJOR = int(torch.__version__.split(".")[0])
TORCH_MINOR = int(torch.__version__.split(".")[1])

if not ((TORCH_MAJOR >= 1 and TORCH_MINOR >= 4) or (TORCH_MAJOR > 1)):
    raise RuntimeError(
        "Requires Pytorch 1.4 or newer.\n"
        + "The latest stable release can be obtained from https://pytorch.org/"
    )

cmdclass = {}
ext_modules = []

extras = {}

if not DISABLE_CUDA_EXTENSION:

    def get_cuda_bare_metal_version(cuda_dir):
        raw_output = subprocess.check_output(
            [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True
        )
        output = raw_output.split()
        release_idx = output.index("release") + 1
        release = output[release_idx].split(".")
        bare_metal_major = release[0]
        bare_metal_minor = release[1][0]

        return raw_output, bare_metal_major, bare_metal_minor

    def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
        raw_output, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(
            cuda_dir
        )
        torch_binary_major = torch.version.cuda.split(".")[0]
        torch_binary_minor = torch.version.cuda.split(".")[1]

        print("\nCompiling cuda extensions with")
        print(raw_output + "from " + cuda_dir + "/bin\n")

        if (bare_metal_major != torch_binary_major) or (
            bare_metal_minor != torch_binary_minor
        ):
            raise RuntimeError(
                "Cuda extensions are being compiled with a version of Cuda that does "
                + "not match the version used to compile Pytorch binaries.  "
                + "Pytorch binaries were compiled with Cuda {}.\n".format(
                    torch.version.cuda
                )
            )

    cmdclass["build_ext"] = BuildExtension

    if torch.utils.cpp_extension.CUDA_HOME is None:
        raise RuntimeError(
            "Nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc."
        )

    # check_cuda_torch_binary_vs_bare_metal(torch.utils.cpp_extension.CUDA_HOME)

    generator_flag = []
    torch_dir = torch.__path__[0]
    if os.path.exists(os.path.join(torch_dir, "include", "ATen", "CUDAGenerator.h")):
        generator_flag = ["-DOLD_GENERATOR"]

    ext_modules.append(
        CUDAExtension(
            name="unicore_fused_rounding",
            sources=["csrc/rounding/interface.cpp", "csrc/rounding/fp32_to_bf16.cu"],
            include_dirs=[os.path.join(this_dir, "csrc")],
            extra_compile_args={
                "cxx": [
                    "-O3",
                ]
                + generator_flag,
                "nvcc": [
                    "-O3",
                    "--use_fast_math",
                    "-gencode",
                    "arch=compute_70,code=sm_70",
                    "-gencode",
                    "arch=compute_80,code=sm_80",
                    "-gencode",
                    "arch=compute_90,code=sm_90",
                    "-U__CUDA_NO_HALF_OPERATORS__",
                    "-U__CUDA_NO_BFLOAT16_OPERATORS__",
                    "-U__CUDA_NO_HALF_CONVERSIONS__",
                    "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
                    "--expt-relaxed-constexpr",
                    "--expt-extended-lambda",
                ]
                + generator_flag,
            },
        )
    )

    ext_modules.append(
        CUDAExtension(
            name="unicore_fused_multi_tensor",
            sources=[
                "csrc/multi_tensor/interface.cpp",
                "csrc/multi_tensor/multi_tensor_l2norm_kernel.cu",
            ],
            include_dirs=[os.path.join(this_dir, "csrc")],
            extra_compile_args={
                "cxx": ["-O3"],
                "nvcc": [
                    "-O3",
                    "--use_fast_math",
                    "-gencode",
                    "arch=compute_70,code=sm_70",
                    "-gencode",
                    "arch=compute_80,code=sm_80",
                    "-gencode",
                    "arch=compute_90,code=sm_90",
                    "-U__CUDA_NO_HALF_OPERATORS__",
                    "-U__CUDA_NO_BFLOAT16_OPERATORS__",
                    "-U__CUDA_NO_HALF_CONVERSIONS__",
                    "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
                    "--expt-relaxed-constexpr",
                    "--expt-extended-lambda",
                ],
            },
        )
    )

    ext_modules.append(
        CUDAExtension(
            name="unicore_fused_adam",
            sources=["csrc/adam/interface.cpp", "csrc/adam/adam_kernel.cu"],
            include_dirs=[os.path.join(this_dir, "csrc")],
            extra_compile_args={
                "cxx": ["-O3"],
                "nvcc": [
                    "-O3",
                    "--use_fast_math",
                    "-gencode",
                    "arch=compute_70,code=sm_70",
                    "-gencode",
                    "arch=compute_80,code=sm_80",
                    "-gencode",
                    "arch=compute_90,code=sm_90",
                ],
            },
        )
    )

    ext_modules.append(
        CUDAExtension(
            name="unicore_fused_softmax_dropout",
            sources=[
                "csrc/softmax_dropout/interface.cpp",
                "csrc/softmax_dropout/softmax_dropout_kernel.cu",
            ],
            include_dirs=[os.path.join(this_dir, "csrc")],
            extra_compile_args={
                "cxx": [
                    "-O3",
                ]
                + generator_flag,
                "nvcc": [
                    "-O3",
                    "--use_fast_math",
                    "-gencode",
                    "arch=compute_70,code=sm_70",
                    "-gencode",
                    "arch=compute_80,code=sm_80",
                    "-gencode",
                    "arch=compute_90,code=sm_90",
                    "-U__CUDA_NO_HALF_OPERATORS__",
                    "-U__CUDA_NO_BFLOAT16_OPERATORS__",
                    "-U__CUDA_NO_HALF_CONVERSIONS__",
                    "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
                    "--expt-relaxed-constexpr",
                    "--expt-extended-lambda",
                ]
                + generator_flag,
            },
        )
    )

    ext_modules.append(
        CUDAExtension(
            name="unicore_fused_layernorm",
            sources=["csrc/layernorm/interface.cpp", "csrc/layernorm/layernorm.cu"],
            include_dirs=[os.path.join(this_dir, "csrc")],
            extra_compile_args={
                "cxx": [
                    "-O3",
                ]
                + generator_flag,
                "nvcc": [
                    "-O3",
                    "--use_fast_math",
                    "-gencode",
                    "arch=compute_70,code=sm_70",
                    "-gencode",
                    "arch=compute_80,code=sm_80",
                    "-gencode",
                    "arch=compute_90,code=sm_90",
                    "-U__CUDA_NO_HALF_OPERATORS__",
                    "-U__CUDA_NO_BFLOAT16_OPERATORS__",
                    "-U__CUDA_NO_HALF_CONVERSIONS__",
                    "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
                    "--expt-relaxed-constexpr",
                    "--expt-extended-lambda",
                ]
                + generator_flag,
            },
        )
    )

    ext_modules.append(
        CUDAExtension(
            name="unicore_fused_layernorm_backward_gamma_beta",
            sources=[
                "csrc/layernorm/interface_gamma_beta.cpp",
                "csrc/layernorm/layernorm_backward.cu",
            ],
            include_dirs=[os.path.join(this_dir, "csrc")],
            extra_compile_args={
                "cxx": [
                    "-O3",
                ]
                + generator_flag,
                "nvcc": [
                    "-O3",
                    "--use_fast_math",
                    "-maxrregcount=50",
                    "-gencode",
                    "arch=compute_70,code=sm_70",
                    "-gencode",
                    "arch=compute_80,code=sm_80",
                    "-gencode",
                    "arch=compute_90,code=sm_90",
                    "-U__CUDA_NO_HALF_OPERATORS__",
                    "-U__CUDA_NO_BFLOAT16_OPERATORS__",
                    "-U__CUDA_NO_HALF_CONVERSIONS__",
                    "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
                    "--expt-relaxed-constexpr",
                    "--expt-extended-lambda",
                ]
                + generator_flag,
            },
        )
    )

    ext_modules.append(
        CUDAExtension(
            name="unicore_fused_rmsnorm",
            sources=["csrc/rmsnorm/interface.cpp", "csrc/rmsnorm/rmsnorm.cu"],
            include_dirs=[os.path.join(this_dir, "csrc")],
            extra_compile_args={
                "cxx": [
                    "-O3",
                ]
                + generator_flag,
                "nvcc": [
                    "-O3",
                    "--use_fast_math",
                    "-gencode",
                    "arch=compute_70,code=sm_70",
                    "-gencode",
                    "arch=compute_80,code=sm_80",
                    "-gencode",
                    "arch=compute_90,code=sm_90",
                    "-U__CUDA_NO_HALF_OPERATORS__",
                    "-U__CUDA_NO_BFLOAT16_OPERATORS__",
                    "-U__CUDA_NO_HALF_CONVERSIONS__",
                    "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
                    "--expt-relaxed-constexpr",
                    "--expt-extended-lambda",
                ]
                + generator_flag,
            },
        )
    )

    ext_modules.append(
        CUDAExtension(
            name="unicore_fused_rmsnorm_backward_gamma",
            sources=[
                "csrc/rmsnorm/interface_gamma.cpp",
                "csrc/rmsnorm/rmsnorm_backward.cu",
            ],
            include_dirs=[os.path.join(this_dir, "csrc")],
            extra_compile_args={
                "cxx": [
                    "-O3",
                ]
                + generator_flag,
                "nvcc": [
                    "-O3",
                    "--use_fast_math",
                    "-maxrregcount=50",
                    "-gencode",
                    "arch=compute_70,code=sm_70",
                    "-gencode",
                    "arch=compute_80,code=sm_80",
                    "-gencode",
                    "arch=compute_90,code=sm_90",
                    "-U__CUDA_NO_HALF_OPERATORS__",
                    "-U__CUDA_NO_BFLOAT16_OPERATORS__",
                    "-U__CUDA_NO_HALF_CONVERSIONS__",
                    "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
                    "--expt-relaxed-constexpr",
                    "--expt-extended-lambda",
                ]
                + generator_flag,
            },
        )
    )
setup(
    name="unicore",
    version=version,
    description="DP Technology's Core AI Framework",
    url="https://github.com/dptech-corp/unicore",
    classifiers=[
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    setup_requires=[
        "setuptools>=18.0",
    ],
    install_requires=[
        'numpy; python_version>="3.7"',
        "lmdb",
        "tqdm",
        "torch>=2.0.0",
        "ml_collections",
        "scipy",
        "tensorboardX",
        "tokenizers",
        "wandb",
    ],
    packages=find_packages(
        exclude=[
            "build",
            "csrc",
            "examples",
            "examples.*",
            "scripts",
            "scripts.*",
            "tests",
            "tests.*",
        ]
    ),
    ext_modules=ext_modules,
    cmdclass=cmdclass,
    extras_require=extras,
    entry_points={
        "console_scripts": [
            "unicore-train = unicore_cli.train:cli_main",
        ],
    },
    zip_safe=False,
)