diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..cb4ae273bb47fe3873611d6f094e9b637e4e103e --- /dev/null +++ b/.gitattributes @@ -0,0 +1,100 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +build/torch27-cxx11-xpu20250-x86_64-linux/rmsnorm/_rmsnorm_0d12ee5.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch28-cxx11-xpu20251-x86_64-linux/rmsnorm/_rmsnorm_0d12ee5.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-xpu20252-x86_64-linux/rmsnorm/_rmsnorm_0d12ee5.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch28-cxx11-cpu-x86_64-linux/_rmsnorm_a7a4369.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch28-cxx11-xpu20251-x86_64-linux/_rmsnorm_a7a4369.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-cpu-x86_64-linux/_rmsnorm_a7a4369.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-xpu20252-x86_64-linux/_rmsnorm_a7a4369.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cpu-x86_64-linux/_rmsnorm_fb26d8c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-xpu20253-x86_64-linux/_rmsnorm_fb26d8c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch28-cxx11-cpu-x86_64-linux/_rmsnorm_fb26d8c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch28-cxx11-xpu20251-x86_64-linux/_rmsnorm_fb26d8c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-cpu-x86_64-linux/_rmsnorm_fb26d8c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-xpu20252-x86_64-linux/_rmsnorm_fb26d8c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cpu-x86_64-linux/_rmsnorm_7606158.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-xpu20253-x86_64-linux/_rmsnorm_7606158.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch28-cxx11-cpu-x86_64-linux/_rmsnorm_7606158.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch28-cxx11-xpu20251-x86_64-linux/_rmsnorm_7606158.abi3.so filter=lfs diff=lfs merge=lfs -text 
+build/torch29-cxx11-cpu-x86_64-linux/_rmsnorm_7606158.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-xpu20252-x86_64-linux/_rmsnorm_7606158.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cpu-x86_64-linux/_rmsnorm_4367ce1.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-xpu20253-x86_64-linux/_rmsnorm_4367ce1.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch28-cxx11-cpu-x86_64-linux/_rmsnorm_4367ce1.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch28-cxx11-xpu20251-x86_64-linux/_rmsnorm_4367ce1.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-cpu-x86_64-linux/_rmsnorm_4367ce1.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-xpu20252-x86_64-linux/_rmsnorm_4367ce1.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cpu-x86_64-linux/_rmsnorm_a8702c9.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-xpu20253-x86_64-linux/_rmsnorm_a8702c9.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch28-cxx11-cpu-x86_64-linux/_rmsnorm_a8702c9.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch28-cxx11-xpu20251-x86_64-linux/_rmsnorm_a8702c9.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-cpu-x86_64-linux/_rmsnorm_a8702c9.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-xpu20252-x86_64-linux/_rmsnorm_a8702c9.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cpu-x86_64-linux/_rmsnorm_235cde1.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-xpu20253-x86_64-linux/_rmsnorm_235cde1.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch28-cxx11-cpu-x86_64-linux/_rmsnorm_235cde1.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch28-cxx11-xpu20251-x86_64-linux/_rmsnorm_235cde1.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-cpu-x86_64-linux/_rmsnorm_235cde1.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-xpu20252-x86_64-linux/_rmsnorm_235cde1.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cpu-x86_64-linux/_rmsnorm_fd30c0c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-xpu20253-x86_64-linux/_rmsnorm_fd30c0c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-cpu-x86_64-linux/_rmsnorm_fd30c0c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-xpu20252-x86_64-linux/_rmsnorm_fd30c0c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-xpu20252-x86_64-windows/rmsnorm/_rmsnorm_96c9886.pyd filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cpu-x86_64-linux/_rmsnorm_ce2b5cc.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-xpu20253-x86_64-linux/_rmsnorm_ce2b5cc.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-cpu-x86_64-linux/_rmsnorm_ce2b5cc.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-xpu20252-x86_64-linux/_rmsnorm_ce2b5cc.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-xpu20253-x86_64-windows/rmsnorm/_rmsnorm_4cd2f5b.pyd filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cpu-x86_64-linux/_rmsnorm_cpu_7bbf693.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-xpu20253-x86_64-linux/_rmsnorm_xpu_7bbf693.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-cpu-x86_64-linux/_rmsnorm_cpu_7bbf693.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-xpu20252-x86_64-linux/_rmsnorm_xpu_7bbf693.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-xpu20253-x86_64-windows/_rmsnorm_xpu_0f8f3b4.pyd 
filter=lfs diff=lfs merge=lfs -text +build/torch210-xpu20253-x86_64-windows/_rmsnorm_xpu_2aa36b6.pyd filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cpu-x86_64-linux/_rmsnorm_cpu_b3d66c6.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-xpu20253-x86_64-linux/_rmsnorm_xpu_b3d66c6.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-cpu-x86_64-linux/_rmsnorm_cpu_b3d66c6.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-xpu20252-x86_64-linux/_rmsnorm_xpu_b3d66c6.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cpu-x86_64-linux/_rmsnorm_cpu_cec90b8.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-xpu20253-x86_64-linux/_rmsnorm_xpu_cec90b8.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cpu-x86_64-linux/_rmsnorm_cpu_cec90b8.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-xpu20252-x86_64-linux/_rmsnorm_xpu_cec90b8.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cpu-x86_64-linux/_rmsnorm_cpu_1a02f6f.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-xpu20253-x86_64-linux/_rmsnorm_xpu_1a02f6f.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cpu-x86_64-linux/_rmsnorm_cpu_1a02f6f.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-xpu20253-x86_64-linux/_rmsnorm_xpu_1a02f6f.abi3.so filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c8870a17590bc1bd03ed09dc4bdb14bccb86caf1 --- /dev/null +++ b/README.md @@ -0,0 +1,5 @@ +--- +tags: +- kernels +- xpu +--- \ No newline at end of file diff --git a/build/torch210-cxx11-cpu-x86_64-linux/__init__.py b/build/torch210-cxx11-cpu-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..37702991fadf27d757eba7cb11a50704006e4f9c --- /dev/null +++ b/build/torch210-cxx11-cpu-x86_64-linux/__init__.py @@ -0,0 +1,27 @@ +from . import layers + +from ._ops import ops + + +def apply_rms_norm(input, weight, eps): + # ops.apply_rms_norm returns [output, rstd] + return ops.apply_rms_norm( + input, + weight, + eps, + )[0] + +def apply_rms_norm_backward(grad_output, input, weight, output, rstd, eps, input_requires_grad=True, weight_requires_grad=True): + return ops.apply_rms_norm_backward( + grad_output, + input, + weight, + output, + rstd, + eps, + input_requires_grad, + weight_requires_grad + ) + +__all__ = ["layers", "apply_rms_norm", "apply_rms_norm_backward"] + diff --git a/build/torch210-cxx11-cpu-x86_64-linux/_ops.py b/build/torch210-cxx11-cpu-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..f6b39cfa78e70e8f8a8ee88dcaa37942f199479e --- /dev/null +++ b/build/torch210-cxx11-cpu-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _rmsnorm_cpu_1a02f6f +ops = torch.ops._rmsnorm_cpu_1a02f6f + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace.
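+    Illustrative example (grounded in the f-string below; this build's
+    namespace is ``_rmsnorm_cpu_1a02f6f``):
+    ``add_op_namespace_prefix("apply_rms_norm")`` returns
+    ``"_rmsnorm_cpu_1a02f6f::apply_rms_norm"``.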
+ """ + return f"_rmsnorm_cpu_1a02f6f::{op_name}" diff --git a/build/torch210-cxx11-cpu-x86_64-linux/_rmsnorm_cpu_1a02f6f.abi3.so b/build/torch210-cxx11-cpu-x86_64-linux/_rmsnorm_cpu_1a02f6f.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..2b8d25b32392b3987d6823b3d8e55fb28134b077 --- /dev/null +++ b/build/torch210-cxx11-cpu-x86_64-linux/_rmsnorm_cpu_1a02f6f.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c102259696d99bbe9d4c686b4293195548faa4856123a358d44aab3d90148620 +size 2006072 diff --git a/build/torch210-cxx11-cpu-x86_64-linux/layers.py b/build/torch210-cxx11-cpu-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..ed828add3a21d3c45864ccf3d43123f1d3911a1c --- /dev/null +++ b/build/torch210-cxx11-cpu-x86_64-linux/layers.py @@ -0,0 +1,59 @@ +import torch +from ._ops import ops + +class RMSNormFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, hidden_states, weight, variance_epsilon): + ctx.variance_epsilon = variance_epsilon + output, rstd = ops.apply_rms_norm(hidden_states, weight, variance_epsilon) + ctx.save_for_backward(hidden_states, weight, output, rstd) + return output + + @staticmethod + def backward(ctx, grad_output): + hidden_states, weight, output, rstd = ctx.saved_tensors + grads = ops.apply_rms_norm_backward( + grad_output, + hidden_states, + weight, + output, + rstd, + ctx.variance_epsilon, + ctx.needs_input_grad[0], + ctx.needs_input_grad[1] + ) + return grads[0], grads[1], None + +class RMSNorm(torch.nn.Module): + """ + RMSNorm module that uses the optimized LigerRMSNormFunction. + + Args: + hidden_size (int): The size of the hidden dimension. + eps (float, optional): The epsilon value for numerical stability. Defaults to 1e-6. + offset (float, optional): Offset value to shift the weight tensor. Defaults to 0.0. + casting_mode (str, optional): The casting mode to use. Defaults to "llama". + in_place (bool, optional): Whether to modify dY in-place to store dX during backward. Defaults to True. + """ + + + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states): + """ + Apply RMS normalization to the input tensor. 
+ + Args: + hidden_states (torch.Tensor): Input tensor of shape (B, T, H) or (BxT, H) + + Returns: + torch.Tensor: Normalized tensor of the same shape as input + """ + return RMSNormFunction.apply( + hidden_states, + self.weight, + self.variance_epsilon, + ) + +__all__ = ["RMSNorm"] diff --git a/build/torch210-cxx11-cpu-x86_64-linux/metadata.json b/build/torch210-cxx11-cpu-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..eb22148b3f551be150f7824a5684c19bbc40ae0e --- /dev/null +++ b/build/torch210-cxx11-cpu-x86_64-linux/metadata.json @@ -0,0 +1,8 @@ +{ + "version": 1, + "license": "Apache-2.0", + "python-depends": [], + "backend": { + "type": "cpu" + } +} \ No newline at end of file diff --git a/build/torch210-cxx11-cpu-x86_64-linux/rmsnorm/__init__.py b/build/torch210-cxx11-cpu-x86_64-linux/rmsnorm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch210-cxx11-cpu-x86_64-linux/rmsnorm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. + path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/__init__.py b/build/torch210-cxx11-xpu20253-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..37702991fadf27d757eba7cb11a50704006e4f9c --- /dev/null +++ b/build/torch210-cxx11-xpu20253-x86_64-linux/__init__.py @@ -0,0 +1,27 @@ +from . import layers + +from ._ops import ops + + +def apply_rms_norm(input, weight, eps): + # ops.apply_rms_norm returns [output, rstd] + return ops.apply_rms_norm( + input, + weight, + eps, + )[0] + +def apply_rms_norm_backward(grad_output, input, weight, output, rstd, eps, input_requires_grad=True, weight_requires_grad=True): + return ops.apply_rms_norm_backward( + grad_output, + input, + weight, + output, + rstd, + eps, + input_requires_grad, + weight_requires_grad + ) + +__all__ = ["layers", "apply_rms_norm", "apply_rms_norm_backward"] + diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/_ops.py b/build/torch210-cxx11-xpu20253-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..b4dd5f183ce502642a8715282c0dddcb5e305f01 --- /dev/null +++ b/build/torch210-cxx11-xpu20253-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _rmsnorm_xpu_1a02f6f +ops = torch.ops._rmsnorm_xpu_1a02f6f + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace.
+ """ + return f"_rmsnorm_xpu_1a02f6f::{op_name}" diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/_rmsnorm_xpu_1a02f6f.abi3.so b/build/torch210-cxx11-xpu20253-x86_64-linux/_rmsnorm_xpu_1a02f6f.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..c4362d7482afafc37105461800325dc8dcde2649 --- /dev/null +++ b/build/torch210-cxx11-xpu20253-x86_64-linux/_rmsnorm_xpu_1a02f6f.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a87f0910ab215646183ecd9f4b2cbc5be6c72c3eee20d167f42f71c14629e65 +size 104793360 diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/layers.py b/build/torch210-cxx11-xpu20253-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..ed828add3a21d3c45864ccf3d43123f1d3911a1c --- /dev/null +++ b/build/torch210-cxx11-xpu20253-x86_64-linux/layers.py @@ -0,0 +1,59 @@ +import torch +from ._ops import ops + +class RMSNormFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, hidden_states, weight, variance_epsilon): + ctx.variance_epsilon = variance_epsilon + output, rstd = ops.apply_rms_norm(hidden_states, weight, variance_epsilon) + ctx.save_for_backward(hidden_states, weight, output, rstd) + return output + + @staticmethod + def backward(ctx, grad_output): + hidden_states, weight, output, rstd = ctx.saved_tensors + grads = ops.apply_rms_norm_backward( + grad_output, + hidden_states, + weight, + output, + rstd, + ctx.variance_epsilon, + ctx.needs_input_grad[0], + ctx.needs_input_grad[1] + ) + return grads[0], grads[1], None + +class RMSNorm(torch.nn.Module): + """ + RMSNorm module that uses the optimized LigerRMSNormFunction. + + Args: + hidden_size (int): The size of the hidden dimension. + eps (float, optional): The epsilon value for numerical stability. Defaults to 1e-6. + offset (float, optional): Offset value to shift the weight tensor. Defaults to 0.0. + casting_mode (str, optional): The casting mode to use. Defaults to "llama". + in_place (bool, optional): Whether to modify dY in-place to store dX during backward. Defaults to True. + """ + + + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states): + """ + Apply RMS normalization to the input tensor. 
+ + Args: + hidden_states (torch.Tensor): Input tensor of shape (B, T, H) or (BxT, H) + + Returns: + torch.Tensor: Normalized tensor of the same shape as input + """ + return RMSNormFunction.apply( + hidden_states, + self.weight, + self.variance_epsilon, + ) + +__all__ = ["RMSNorm"] diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/metadata.json b/build/torch210-cxx11-xpu20253-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..b911d0a2549a35a1c65ab7e77d32e5aac23cd6ac --- /dev/null +++ b/build/torch210-cxx11-xpu20253-x86_64-linux/metadata.json @@ -0,0 +1,8 @@ +{ + "version": 1, + "license": "Apache-2.0", + "python-depends": [], + "backend": { + "type": "xpu" + } +} \ No newline at end of file diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/rmsnorm/__init__.py b/build/torch210-cxx11-xpu20253-x86_64-linux/rmsnorm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch210-cxx11-xpu20253-x86_64-linux/rmsnorm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. + path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch210-xpu20253-x86_64-windows/__init__.py b/build/torch210-xpu20253-x86_64-windows/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0c1c7731922beab2e10baf849ad97324beb02c15 --- /dev/null +++ b/build/torch210-xpu20253-x86_64-windows/__init__.py @@ -0,0 +1,27 @@ +from . import layers + +from ._ops import ops + + +def apply_rms_norm(input, weight, eps): + # ops.apply_rms_norm returns [output, rstd] + return ops.apply_rms_norm( + input, + weight, + eps, + )[0] + +def apply_rms_norm_backward(grad_output, input, weight, output, rstd, eps, input_requires_grad=True, weight_requires_grad=True): + return ops.apply_rms_norm_backward( + grad_output, + input, + weight, + output, + rstd, + eps, + input_requires_grad, + weight_requires_grad + ) + +__all__ = ["layers", "apply_rms_norm", "apply_rms_norm_backward"] + diff --git a/build/torch210-xpu20253-x86_64-windows/_ops.py b/build/torch210-xpu20253-x86_64-windows/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..dece9201fc5e1cac54dce23a2d64cbd7ca1859d2 --- /dev/null +++ b/build/torch210-xpu20253-x86_64-windows/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _rmsnorm_xpu_2aa36b6 +ops = torch.ops._rmsnorm_xpu_2aa36b6 + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace.
+ """ + return f"_rmsnorm_xpu_2aa36b6::{op_name}" diff --git a/build/torch210-xpu20253-x86_64-windows/_rmsnorm_xpu_2aa36b6.pyd b/build/torch210-xpu20253-x86_64-windows/_rmsnorm_xpu_2aa36b6.pyd new file mode 100644 index 0000000000000000000000000000000000000000..296a4c5a336e00b5f9c912564cd8f7e4a0003a5f --- /dev/null +++ b/build/torch210-xpu20253-x86_64-windows/_rmsnorm_xpu_2aa36b6.pyd @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:690752b7e809e03b7be6d8f5521080ea84115db1078cf6a0010597612e5844d7 +size 2363904 diff --git a/build/torch210-xpu20253-x86_64-windows/layers.py b/build/torch210-xpu20253-x86_64-windows/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..4401139637abaedc8e292bcc938ed17f3a6e1c89 --- /dev/null +++ b/build/torch210-xpu20253-x86_64-windows/layers.py @@ -0,0 +1,59 @@ +import torch +from ._ops import ops + +class RMSNormFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, hidden_states, weight, variance_epsilon): + ctx.variance_epsilon = variance_epsilon + output, rstd = ops.apply_rms_norm(hidden_states, weight, variance_epsilon) + ctx.save_for_backward(hidden_states, weight, output, rstd) + return output + + @staticmethod + def backward(ctx, grad_output): + hidden_states, weight, output, rstd = ctx.saved_tensors + grads = ops.apply_rms_norm_backward( + grad_output, + hidden_states, + weight, + output, + rstd, + ctx.variance_epsilon, + ctx.needs_input_grad[0], + ctx.needs_input_grad[1] + ) + return grads[0], grads[1], None + +class RMSNorm(torch.nn.Module): + """ + RMSNorm module that uses the optimized LigerRMSNormFunction. + + Args: + hidden_size (int): The size of the hidden dimension. + eps (float, optional): The epsilon value for numerical stability. Defaults to 1e-6. + offset (float, optional): Offset value to shift the weight tensor. Defaults to 0.0. + casting_mode (str, optional): The casting mode to use. Defaults to "llama". + in_place (bool, optional): Whether to modify dY in-place to store dX during backward. Defaults to True. + """ + + + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states): + """ + Apply RMS normalization to the input tensor. + + Args: + hidden_states (torch.Tensor): Input tensor of shape (B, T, H) or (BxT, H) + + Returns: + torch.Tensor: Normalized tensor of the same shape as input + """ + return RMSNormFunction.apply( + hidden_states, + self.weight, + self.variance_epsilon, + ) + +__all__ = ["RMSNorm"] diff --git a/build/torch210-xpu20253-x86_64-windows/metadata.json b/build/torch210-xpu20253-x86_64-windows/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..a5381dd80836f863378b9f33a559815688de9287 --- /dev/null +++ b/build/torch210-xpu20253-x86_64-windows/metadata.json @@ -0,0 +1,5 @@ +{ + "version": 1, + "license": "Apache-2.0", + "python-depends": [] +} \ No newline at end of file diff --git a/build/torch210-xpu20253-x86_64-windows/rmsnorm/__init__.py b/build/torch210-xpu20253-x86_64-windows/rmsnorm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bc434ef44e63409acb52a8f3fff54a4adc46ed6a --- /dev/null +++ b/build/torch210-xpu20253-x86_64-windows/rmsnorm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import sys + +import importlib +from pathlib import Path +from types import ModuleType + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. 
So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. + path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch211-cxx11-cpu-x86_64-linux/__init__.py b/build/torch211-cxx11-cpu-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..37702991fadf27d757eba7cb11a50704006e4f9c --- /dev/null +++ b/build/torch211-cxx11-cpu-x86_64-linux/__init__.py @@ -0,0 +1,27 @@ +from . import layers + +from ._ops import ops + + +def apply_rms_norm(input, weight, eps): + # ops.apply_rms_norm returns [output, rstd] + return ops.apply_rms_norm( + input, + weight, + eps, + )[0] + +def apply_rms_norm_backward(grad_output, input, weight, output, rstd, eps, input_requires_grad=True, weight_requires_grad=True): + return ops.apply_rms_norm_backward( + grad_output, + input, + weight, + output, + rstd, + eps, + input_requires_grad, + weight_requires_grad + ) + +__all__ = ["layers", "apply_rms_norm", "apply_rms_norm_backward"] + diff --git a/build/torch211-cxx11-cpu-x86_64-linux/_ops.py b/build/torch211-cxx11-cpu-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..f6b39cfa78e70e8f8a8ee88dcaa37942f199479e --- /dev/null +++ b/build/torch211-cxx11-cpu-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _rmsnorm_cpu_1a02f6f +ops = torch.ops._rmsnorm_cpu_1a02f6f + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace.
+ """ + return f"_rmsnorm_cpu_1a02f6f::{op_name}" diff --git a/build/torch211-cxx11-cpu-x86_64-linux/_rmsnorm_cpu_1a02f6f.abi3.so b/build/torch211-cxx11-cpu-x86_64-linux/_rmsnorm_cpu_1a02f6f.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..8d4c155c592327aef47b369b9f4b1b5003531435 --- /dev/null +++ b/build/torch211-cxx11-cpu-x86_64-linux/_rmsnorm_cpu_1a02f6f.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:439ac1a1bc4a6095844795cbccd7f2137c101bce3e3415bcebb3fd2b0dfcb97b +size 2001976 diff --git a/build/torch211-cxx11-cpu-x86_64-linux/layers.py b/build/torch211-cxx11-cpu-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..ed828add3a21d3c45864ccf3d43123f1d3911a1c --- /dev/null +++ b/build/torch211-cxx11-cpu-x86_64-linux/layers.py @@ -0,0 +1,59 @@ +import torch +from ._ops import ops + +class RMSNormFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, hidden_states, weight, variance_epsilon): + ctx.variance_epsilon = variance_epsilon + output, rstd = ops.apply_rms_norm(hidden_states, weight, variance_epsilon) + ctx.save_for_backward(hidden_states, weight, output, rstd) + return output + + @staticmethod + def backward(ctx, grad_output): + hidden_states, weight, output, rstd = ctx.saved_tensors + grads = ops.apply_rms_norm_backward( + grad_output, + hidden_states, + weight, + output, + rstd, + ctx.variance_epsilon, + ctx.needs_input_grad[0], + ctx.needs_input_grad[1] + ) + return grads[0], grads[1], None + +class RMSNorm(torch.nn.Module): + """ + RMSNorm module that uses the optimized LigerRMSNormFunction. + + Args: + hidden_size (int): The size of the hidden dimension. + eps (float, optional): The epsilon value for numerical stability. Defaults to 1e-6. + offset (float, optional): Offset value to shift the weight tensor. Defaults to 0.0. + casting_mode (str, optional): The casting mode to use. Defaults to "llama". + in_place (bool, optional): Whether to modify dY in-place to store dX during backward. Defaults to True. + """ + + + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states): + """ + Apply RMS normalization to the input tensor. 
+ + Args: + hidden_states (torch.Tensor): Input tensor of shape (B, T, H) or (BxT, H) + + Returns: + torch.Tensor: Normalized tensor of the same shape as input + """ + return RMSNormFunction.apply( + hidden_states, + self.weight, + self.variance_epsilon, + ) + +__all__ = ["RMSNorm"] diff --git a/build/torch211-cxx11-cpu-x86_64-linux/metadata.json b/build/torch211-cxx11-cpu-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..eb22148b3f551be150f7824a5684c19bbc40ae0e --- /dev/null +++ b/build/torch211-cxx11-cpu-x86_64-linux/metadata.json @@ -0,0 +1,8 @@ +{ + "version": 1, + "license": "Apache-2.0", + "python-depends": [], + "backend": { + "type": "cpu" + } +} \ No newline at end of file diff --git a/build/torch211-cxx11-cpu-x86_64-linux/rmsnorm/__init__.py b/build/torch211-cxx11-cpu-x86_64-linux/rmsnorm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch211-cxx11-cpu-x86_64-linux/rmsnorm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. + path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/__init__.py b/build/torch211-cxx11-xpu20253-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..37702991fadf27d757eba7cb11a50704006e4f9c --- /dev/null +++ b/build/torch211-cxx11-xpu20253-x86_64-linux/__init__.py @@ -0,0 +1,27 @@ +from . import layers + +from ._ops import ops + + +def apply_rms_norm(input, weight, eps): + # ops.apply_rms_norm returns [output, rstd] + return ops.apply_rms_norm( + input, + weight, + eps, + )[0] + +def apply_rms_norm_backward(grad_output, input, weight, output, rstd, eps, input_requires_grad=True, weight_requires_grad=True): + return ops.apply_rms_norm_backward( + grad_output, + input, + weight, + output, + rstd, + eps, + input_requires_grad, + weight_requires_grad + ) + +__all__ = ["layers", "apply_rms_norm", "apply_rms_norm_backward"] + diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/_ops.py b/build/torch211-cxx11-xpu20253-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..b4dd5f183ce502642a8715282c0dddcb5e305f01 --- /dev/null +++ b/build/torch211-cxx11-xpu20253-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _rmsnorm_xpu_1a02f6f +ops = torch.ops._rmsnorm_xpu_1a02f6f + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace.
+ """ + return f"_rmsnorm_xpu_1a02f6f::{op_name}" diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/_rmsnorm_xpu_1a02f6f.abi3.so b/build/torch211-cxx11-xpu20253-x86_64-linux/_rmsnorm_xpu_1a02f6f.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..31be8841d454fe9c33f5de2b2d4738593c2fd54f --- /dev/null +++ b/build/torch211-cxx11-xpu20253-x86_64-linux/_rmsnorm_xpu_1a02f6f.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:153aa232ee4f342e92075140aa796e86ccd2f55f07d27bcad90890ed2fac57bf +size 104793120 diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/layers.py b/build/torch211-cxx11-xpu20253-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..ed828add3a21d3c45864ccf3d43123f1d3911a1c --- /dev/null +++ b/build/torch211-cxx11-xpu20253-x86_64-linux/layers.py @@ -0,0 +1,59 @@ +import torch +from ._ops import ops + +class RMSNormFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, hidden_states, weight, variance_epsilon): + ctx.variance_epsilon = variance_epsilon + output, rstd = ops.apply_rms_norm(hidden_states, weight, variance_epsilon) + ctx.save_for_backward(hidden_states, weight, output, rstd) + return output + + @staticmethod + def backward(ctx, grad_output): + hidden_states, weight, output, rstd = ctx.saved_tensors + grads = ops.apply_rms_norm_backward( + grad_output, + hidden_states, + weight, + output, + rstd, + ctx.variance_epsilon, + ctx.needs_input_grad[0], + ctx.needs_input_grad[1] + ) + return grads[0], grads[1], None + +class RMSNorm(torch.nn.Module): + """ + RMSNorm module that uses the optimized LigerRMSNormFunction. + + Args: + hidden_size (int): The size of the hidden dimension. + eps (float, optional): The epsilon value for numerical stability. Defaults to 1e-6. + offset (float, optional): Offset value to shift the weight tensor. Defaults to 0.0. + casting_mode (str, optional): The casting mode to use. Defaults to "llama". + in_place (bool, optional): Whether to modify dY in-place to store dX during backward. Defaults to True. + """ + + + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states): + """ + Apply RMS normalization to the input tensor. 
+ + Args: + hidden_states (torch.Tensor): Input tensor of shape (B, T, H) or (BxT, H) + + Returns: + torch.Tensor: Normalized tensor of the same shape as input + """ + return RMSNormFunction.apply( + hidden_states, + self.weight, + self.variance_epsilon, + ) + +__all__ = ["RMSNorm"] diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/metadata.json b/build/torch211-cxx11-xpu20253-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..b911d0a2549a35a1c65ab7e77d32e5aac23cd6ac --- /dev/null +++ b/build/torch211-cxx11-xpu20253-x86_64-linux/metadata.json @@ -0,0 +1,8 @@ +{ + "version": 1, + "license": "Apache-2.0", + "python-depends": [], + "backend": { + "type": "xpu" + } +} \ No newline at end of file diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/rmsnorm/__init__.py b/build/torch211-cxx11-xpu20253-x86_64-linux/rmsnorm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch211-cxx11-xpu20253-x86_64-linux/rmsnorm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. + path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch27-cxx11-xpu20250-x86_64-linux/rmsnorm/__init__.py b/build/torch27-cxx11-xpu20250-x86_64-linux/rmsnorm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..de2235c56b91c957e1ba2c1b5dc189aa3682bc1b --- /dev/null +++ b/build/torch27-cxx11-xpu20250-x86_64-linux/rmsnorm/__init__.py @@ -0,0 +1,14 @@ +from . 
import layers + +from ._ops import ops + + +def apply_rms_norm(input, weight, eps): + return ops.apply_rms_norm( + input, + weight, + eps, + ) + +__all__ = ["layers", "apply_rms_norm"] + diff --git a/build/torch27-cxx11-xpu20250-x86_64-linux/rmsnorm/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-xpu20250-x86_64-linux/rmsnorm/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8491c70660a5490fa811a066e5cade533975d4f6 Binary files /dev/null and b/build/torch27-cxx11-xpu20250-x86_64-linux/rmsnorm/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch27-cxx11-xpu20250-x86_64-linux/rmsnorm/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-xpu20250-x86_64-linux/rmsnorm/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..082997e4c8c09f173a9b3b1af712b717ffabea92 Binary files /dev/null and b/build/torch27-cxx11-xpu20250-x86_64-linux/rmsnorm/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch27-cxx11-xpu20250-x86_64-linux/rmsnorm/__pycache__/layers.cpython-313.pyc b/build/torch27-cxx11-xpu20250-x86_64-linux/rmsnorm/__pycache__/layers.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..58001ddaeac4d0e2f8695d10b8a0d613b2a9919a Binary files /dev/null and b/build/torch27-cxx11-xpu20250-x86_64-linux/rmsnorm/__pycache__/layers.cpython-313.pyc differ diff --git a/build/torch27-cxx11-xpu20250-x86_64-linux/rmsnorm/_ops.py b/build/torch27-cxx11-xpu20250-x86_64-linux/rmsnorm/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..4ce583db41cbfd7783d2ea4bb1e9287e23a3bf3e --- /dev/null +++ b/build/torch27-cxx11-xpu20250-x86_64-linux/rmsnorm/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _rmsnorm_0d12ee5 +ops = torch.ops._rmsnorm_0d12ee5 + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_rmsnorm_0d12ee5::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-xpu20250-x86_64-linux/rmsnorm/_rmsnorm_0d12ee5.abi3.so b/build/torch27-cxx11-xpu20250-x86_64-linux/rmsnorm/_rmsnorm_0d12ee5.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..7ee0a01a1d2b5e7be2408d3028781311a0118238 --- /dev/null +++ b/build/torch27-cxx11-xpu20250-x86_64-linux/rmsnorm/_rmsnorm_0d12ee5.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79eb24cb07a24a3f829ce1d210bd0cbd79badd0cc236710a84e83c15575ddf04 +size 100963504 diff --git a/build/torch27-cxx11-xpu20250-x86_64-linux/rmsnorm/layers.py b/build/torch27-cxx11-xpu20250-x86_64-linux/rmsnorm/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..a78c858f9c1d2c18845e0a6ebae1c4327284b9dc --- /dev/null +++ b/build/torch27-cxx11-xpu20250-x86_64-linux/rmsnorm/layers.py @@ -0,0 +1,36 @@ +import torch +from ._ops import ops + +class RMSNorm(torch.nn.Module): + """ + RMSNorm module that uses the optimized LigerRMSNormFunction. + + Args: + hidden_size (int): The size of the hidden dimension. + eps (float, optional): The epsilon value for numerical stability. Defaults to 1e-6. + offset (float, optional): Offset value to shift the weight tensor. Defaults to 0.0. + casting_mode (str, optional): The casting mode to use. Defaults to "llama". + in_place (bool, optional): Whether to modify dY in-place to store dX during backward. Defaults to True. 
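+
+    Illustrative usage sketch (hypothetical setup; this forward-only build
+    calls the compiled op directly, so ``weight`` and ``variance_epsilon``
+    must be populated by the caller before use):
+
+        >>> norm = RMSNorm()
+        >>> norm.weight = torch.nn.Parameter(torch.ones(8))
+        >>> norm.variance_epsilon = 1e-6
+        >>> y = norm(torch.randn(2, 4, 8))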
+ """ + + + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states): + """ + Apply RMS normalization to the input tensor. + + Args: + hidden_states (torch.Tensor): Input tensor of shape (B, T, H) or (BxT, H) + + Returns: + torch.Tensor: Normalized tensor of the same shape as input + """ + return ops.apply_rms_norm( + hidden_states, + self.weight, + self.variance_epsilon, + ) + +__all__ = ["RMSNorm"] diff --git a/build/torch28-cxx11-cpu-x86_64-linux/__init__.py b/build/torch28-cxx11-cpu-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..37702991fadf27d757eba7cb11a50704006e4f9c --- /dev/null +++ b/build/torch28-cxx11-cpu-x86_64-linux/__init__.py @@ -0,0 +1,27 @@ +from . import layers + +from ._ops import ops + + +def apply_rms_norm(input, weight, eps): + # ops.apply_rms_norm returns [output, rstd] + return ops.apply_rms_norm( + input, + weight, + eps, + )[0] + +def apply_rms_norm_backward(grad_output, input, weight, output, rstd, eps, input_requires_grad=True, weight_requires_grad=True): + return ops.apply_rms_norm_backward( + grad_output, + input, + weight, + output, + rstd, + eps, + input_requires_grad, + weight_requires_grad + ) + +__all__ = ["layers", "apply_rms_norm_forward", "apply_rms_norm_backward"] + diff --git a/build/torch28-cxx11-cpu-x86_64-linux/_ops.py b/build/torch28-cxx11-cpu-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..72eb9e39d1d8ec3d9ad8f07fd8d6dbcd034187d7 --- /dev/null +++ b/build/torch28-cxx11-cpu-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _rmsnorm_235cde1 +ops = torch.ops._rmsnorm_235cde1 + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_rmsnorm_235cde1::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cpu-x86_64-linux/_rmsnorm_235cde1.abi3.so b/build/torch28-cxx11-cpu-x86_64-linux/_rmsnorm_235cde1.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..cb7d092ce04f021549684ad2a4f95e6de84c82f8 --- /dev/null +++ b/build/torch28-cxx11-cpu-x86_64-linux/_rmsnorm_235cde1.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16c92de9cefabeeadc60ffff87189a1e66ecb9ea19b343570ac55e9d9c7d98fe +size 156648 diff --git a/build/torch28-cxx11-cpu-x86_64-linux/layers.py b/build/torch28-cxx11-cpu-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..ed828add3a21d3c45864ccf3d43123f1d3911a1c --- /dev/null +++ b/build/torch28-cxx11-cpu-x86_64-linux/layers.py @@ -0,0 +1,59 @@ +import torch +from ._ops import ops + +class RMSNormFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, hidden_states, weight, variance_epsilon): + ctx.variance_epsilon = variance_epsilon + output, rstd = ops.apply_rms_norm(hidden_states, weight, variance_epsilon) + ctx.save_for_backward(hidden_states, weight, output, rstd) + return output + + @staticmethod + def backward(ctx, grad_output): + hidden_states, weight, output, rstd = ctx.saved_tensors + grads = ops.apply_rms_norm_backward( + grad_output, + hidden_states, + weight, + output, + rstd, + ctx.variance_epsilon, + ctx.needs_input_grad[0], + ctx.needs_input_grad[1] + ) + return grads[0], grads[1], None + +class RMSNorm(torch.nn.Module): + """ + RMSNorm module that uses the optimized LigerRMSNormFunction. + + Args: + hidden_size (int): The size of the hidden dimension. + eps (float, optional): The epsilon value for numerical stability. Defaults to 1e-6. 
+ offset (float, optional): Offset value to shift the weight tensor. Defaults to 0.0. + casting_mode (str, optional): The casting mode to use. Defaults to "llama". + in_place (bool, optional): Whether to modify dY in-place to store dX during backward. Defaults to True. + """ + + + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states): + """ + Apply RMS normalization to the input tensor. + + Args: + hidden_states (torch.Tensor): Input tensor of shape (B, T, H) or (BxT, H) + + Returns: + torch.Tensor: Normalized tensor of the same shape as input + """ + return RMSNormFunction.apply( + hidden_states, + self.weight, + self.variance_epsilon, + ) + +__all__ = ["RMSNorm"] diff --git a/build/torch28-cxx11-cpu-x86_64-linux/metadata.json b/build/torch28-cxx11-cpu-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9cf5deed9898dce769f4cc73913d3530b92a0bd8 --- /dev/null +++ b/build/torch28-cxx11-cpu-x86_64-linux/metadata.json @@ -0,0 +1,4 @@ +{ + "version": 1, + "python-depends": [] +} \ No newline at end of file diff --git a/build/torch28-cxx11-cpu-x86_64-linux/rmsnorm/__init__.py b/build/torch28-cxx11-cpu-x86_64-linux/rmsnorm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309 --- /dev/null +++ b/build/torch28-cxx11-cpu-x86_64-linux/rmsnorm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import sys + +import importlib +from pathlib import Path +from types import ModuleType + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. + path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch28-cxx11-xpu20251-x86_64-linux/__init__.py b/build/torch28-cxx11-xpu20251-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..37702991fadf27d757eba7cb11a50704006e4f9c --- /dev/null +++ b/build/torch28-cxx11-xpu20251-x86_64-linux/__init__.py @@ -0,0 +1,27 @@ +from . 
import layers + +from ._ops import ops + + +def apply_rms_norm(input, weight, eps): + # ops.apply_rms_norm returns [output, rstd] + return ops.apply_rms_norm( + input, + weight, + eps, + )[0] + +def apply_rms_norm_backward(grad_output, input, weight, output, rstd, eps, input_requires_grad=True, weight_requires_grad=True): + return ops.apply_rms_norm_backward( + grad_output, + input, + weight, + output, + rstd, + eps, + input_requires_grad, + weight_requires_grad + ) + +__all__ = ["layers", "apply_rms_norm", "apply_rms_norm_backward"] + diff --git a/build/torch28-cxx11-xpu20251-x86_64-linux/_ops.py b/build/torch28-cxx11-xpu20251-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..72eb9e39d1d8ec3d9ad8f07fd8d6dbcd034187d7 --- /dev/null +++ b/build/torch28-cxx11-xpu20251-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _rmsnorm_235cde1 +ops = torch.ops._rmsnorm_235cde1 + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_rmsnorm_235cde1::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-xpu20251-x86_64-linux/_rmsnorm_235cde1.abi3.so b/build/torch28-cxx11-xpu20251-x86_64-linux/_rmsnorm_235cde1.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..28951a4be0835cb76bf825819a2c98916258e3b2 --- /dev/null +++ b/build/torch28-cxx11-xpu20251-x86_64-linux/_rmsnorm_235cde1.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77c4b43d63dc74b210633da81630023a6d6e359a7a1115bff55da9f4436053d9 +size 103700632 diff --git a/build/torch28-cxx11-xpu20251-x86_64-linux/layers.py b/build/torch28-cxx11-xpu20251-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..ed828add3a21d3c45864ccf3d43123f1d3911a1c --- /dev/null +++ b/build/torch28-cxx11-xpu20251-x86_64-linux/layers.py @@ -0,0 +1,59 @@ +import torch +from ._ops import ops + +class RMSNormFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, hidden_states, weight, variance_epsilon): + ctx.variance_epsilon = variance_epsilon + output, rstd = ops.apply_rms_norm(hidden_states, weight, variance_epsilon) + ctx.save_for_backward(hidden_states, weight, output, rstd) + return output + + @staticmethod + def backward(ctx, grad_output): + hidden_states, weight, output, rstd = ctx.saved_tensors + grads = ops.apply_rms_norm_backward( + grad_output, + hidden_states, + weight, + output, + rstd, + ctx.variance_epsilon, + ctx.needs_input_grad[0], + ctx.needs_input_grad[1] + ) + return grads[0], grads[1], None + +class RMSNorm(torch.nn.Module): + """ + RMSNorm module that uses the optimized LigerRMSNormFunction. + + Args: + hidden_size (int): The size of the hidden dimension. + eps (float, optional): The epsilon value for numerical stability. Defaults to 1e-6. + offset (float, optional): Offset value to shift the weight tensor. Defaults to 0.0. + casting_mode (str, optional): The casting mode to use. Defaults to "llama". + in_place (bool, optional): Whether to modify dY in-place to store dX during backward. Defaults to True. + """ + + + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states): + """ + Apply RMS normalization to the input tensor.
+ + Args: + hidden_states (torch.Tensor): Input tensor of shape (B, T, H) or (BxT, H) + + Returns: + torch.Tensor: Normalized tensor of the same shape as input + """ + return RMSNormFunction.apply( + hidden_states, + self.weight, + self.variance_epsilon, + ) + +__all__ = ["RMSNorm"] diff --git a/build/torch28-cxx11-xpu20251-x86_64-linux/metadata.json b/build/torch28-cxx11-xpu20251-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9cf5deed9898dce769f4cc73913d3530b92a0bd8 --- /dev/null +++ b/build/torch28-cxx11-xpu20251-x86_64-linux/metadata.json @@ -0,0 +1,4 @@ +{ + "version": 1, + "python-depends": [] +} \ No newline at end of file diff --git a/build/torch28-cxx11-xpu20251-x86_64-linux/rmsnorm/__init__.py b/build/torch28-cxx11-xpu20251-x86_64-linux/rmsnorm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309 --- /dev/null +++ b/build/torch28-cxx11-xpu20251-x86_64-linux/rmsnorm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import sys + +import importlib +from pathlib import Path +from types import ModuleType + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. + path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch29-cxx11-cpu-x86_64-linux/__init__.py b/build/torch29-cxx11-cpu-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..37702991fadf27d757eba7cb11a50704006e4f9c --- /dev/null +++ b/build/torch29-cxx11-cpu-x86_64-linux/__init__.py @@ -0,0 +1,27 @@ +from . import layers + +from ._ops import ops + + +def apply_rms_norm(input, weight, eps): + # ops.apply_rms_norm returns [output, rstd] + return ops.apply_rms_norm( + input, + weight, + eps, + )[0] + +def apply_rms_norm_backward(grad_output, input, weight, output, rstd, eps, input_requires_grad=True, weight_requires_grad=True): + return ops.apply_rms_norm_backward( + grad_output, + input, + weight, + output, + rstd, + eps, + input_requires_grad, + weight_requires_grad + ) + +__all__ = ["layers", "apply_rms_norm", "apply_rms_norm_backward"] + diff --git a/build/torch29-cxx11-cpu-x86_64-linux/_ops.py b/build/torch29-cxx11-cpu-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..fcfb7861dfb320e9e4ae32388904a8bea9ce7079 --- /dev/null +++ b/build/torch29-cxx11-cpu-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _rmsnorm_cpu_b3d66c6 +ops = torch.ops._rmsnorm_cpu_b3d66c6 + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace.
+ """ + return f"_rmsnorm_cpu_b3d66c6::{op_name}" diff --git a/build/torch29-cxx11-cpu-x86_64-linux/_rmsnorm_cpu_b3d66c6.abi3.so b/build/torch29-cxx11-cpu-x86_64-linux/_rmsnorm_cpu_b3d66c6.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..6d57c2727fb3d802d56b32e4ad541c093fe0e547 --- /dev/null +++ b/build/torch29-cxx11-cpu-x86_64-linux/_rmsnorm_cpu_b3d66c6.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf3b3a68445d97357b4c08dd07ed0d197d18c9e7449ad62172dd55dfc49e7d08 +size 1999776 diff --git a/build/torch29-cxx11-cpu-x86_64-linux/layers.py b/build/torch29-cxx11-cpu-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..ed828add3a21d3c45864ccf3d43123f1d3911a1c --- /dev/null +++ b/build/torch29-cxx11-cpu-x86_64-linux/layers.py @@ -0,0 +1,59 @@ +import torch +from ._ops import ops + +class RMSNormFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, hidden_states, weight, variance_epsilon): + ctx.variance_epsilon = variance_epsilon + output, rstd = ops.apply_rms_norm(hidden_states, weight, variance_epsilon) + ctx.save_for_backward(hidden_states, weight, output, rstd) + return output + + @staticmethod + def backward(ctx, grad_output): + hidden_states, weight, output, rstd = ctx.saved_tensors + grads = ops.apply_rms_norm_backward( + grad_output, + hidden_states, + weight, + output, + rstd, + ctx.variance_epsilon, + ctx.needs_input_grad[0], + ctx.needs_input_grad[1] + ) + return grads[0], grads[1], None + +class RMSNorm(torch.nn.Module): + """ + RMSNorm module that uses the optimized LigerRMSNormFunction. + + Args: + hidden_size (int): The size of the hidden dimension. + eps (float, optional): The epsilon value for numerical stability. Defaults to 1e-6. + offset (float, optional): Offset value to shift the weight tensor. Defaults to 0.0. + casting_mode (str, optional): The casting mode to use. Defaults to "llama". + in_place (bool, optional): Whether to modify dY in-place to store dX during backward. Defaults to True. + """ + + + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states): + """ + Apply RMS normalization to the input tensor. + + Args: + hidden_states (torch.Tensor): Input tensor of shape (B, T, H) or (BxT, H) + + Returns: + torch.Tensor: Normalized tensor of the same shape as input + """ + return RMSNormFunction.apply( + hidden_states, + self.weight, + self.variance_epsilon, + ) + +__all__ = ["RMSNorm"] diff --git a/build/torch29-cxx11-cpu-x86_64-linux/metadata.json b/build/torch29-cxx11-cpu-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..a5381dd80836f863378b9f33a559815688de9287 --- /dev/null +++ b/build/torch29-cxx11-cpu-x86_64-linux/metadata.json @@ -0,0 +1,5 @@ +{ + "version": 1, + "license": "Apache-2.0", + "python-depends": [] +} \ No newline at end of file diff --git a/build/torch29-cxx11-cpu-x86_64-linux/rmsnorm/__init__.py b/build/torch29-cxx11-cpu-x86_64-linux/rmsnorm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309 --- /dev/null +++ b/build/torch29-cxx11-cpu-x86_64-linux/rmsnorm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import sys + +import importlib +from pathlib import Path +from types import ModuleType + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. 
+    # name from the hex-encoded hash of the absolute path instead of reusing
+    # the package name.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/__init__.py b/build/torch29-cxx11-xpu20252-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..37702991fadf27d757eba7cb11a50704006e4f9c
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/__init__.py
@@ -0,0 +1,27 @@
+from . import layers
+
+from ._ops import ops
+
+
+def apply_rms_norm(input, weight, eps):
+    # ops.apply_rms_norm returns [output, rstd]; keep only the output.
+    return ops.apply_rms_norm(
+        input,
+        weight,
+        eps,
+    )[0]
+
+def apply_rms_norm_backward(grad_output, input, weight, output, rstd, eps, input_requires_grad=True, weight_requires_grad=True):
+    return ops.apply_rms_norm_backward(
+        grad_output,
+        input,
+        weight,
+        output,
+        rstd,
+        eps,
+        input_requires_grad,
+        weight_requires_grad
+    )
+
+__all__ = ["layers", "apply_rms_norm", "apply_rms_norm_backward"]
+
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/_ops.py b/build/torch29-cxx11-xpu20252-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..399c07d6a8ba93f4907028763b4b8967d50eb815
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rmsnorm_xpu_cec90b8
+ops = torch.ops._rmsnorm_xpu_cec90b8
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix the op name with this build's private namespace.
+ """ + return f"_rmsnorm_xpu_cec90b8::{op_name}" diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/_rmsnorm_xpu_cec90b8.abi3.so b/build/torch29-cxx11-xpu20252-x86_64-linux/_rmsnorm_xpu_cec90b8.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..fa48eb982c08dacaf4d75c447d3fc11b177d4f85 --- /dev/null +++ b/build/torch29-cxx11-xpu20252-x86_64-linux/_rmsnorm_xpu_cec90b8.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5fb0e95d3b6be17bd03833abcf461bb10d9c62fbf1336d9226dce0950dce1fa +size 102179544 diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/layers.py b/build/torch29-cxx11-xpu20252-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..ed828add3a21d3c45864ccf3d43123f1d3911a1c --- /dev/null +++ b/build/torch29-cxx11-xpu20252-x86_64-linux/layers.py @@ -0,0 +1,59 @@ +import torch +from ._ops import ops + +class RMSNormFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, hidden_states, weight, variance_epsilon): + ctx.variance_epsilon = variance_epsilon + output, rstd = ops.apply_rms_norm(hidden_states, weight, variance_epsilon) + ctx.save_for_backward(hidden_states, weight, output, rstd) + return output + + @staticmethod + def backward(ctx, grad_output): + hidden_states, weight, output, rstd = ctx.saved_tensors + grads = ops.apply_rms_norm_backward( + grad_output, + hidden_states, + weight, + output, + rstd, + ctx.variance_epsilon, + ctx.needs_input_grad[0], + ctx.needs_input_grad[1] + ) + return grads[0], grads[1], None + +class RMSNorm(torch.nn.Module): + """ + RMSNorm module that uses the optimized LigerRMSNormFunction. + + Args: + hidden_size (int): The size of the hidden dimension. + eps (float, optional): The epsilon value for numerical stability. Defaults to 1e-6. + offset (float, optional): Offset value to shift the weight tensor. Defaults to 0.0. + casting_mode (str, optional): The casting mode to use. Defaults to "llama". + in_place (bool, optional): Whether to modify dY in-place to store dX during backward. Defaults to True. + """ + + + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states): + """ + Apply RMS normalization to the input tensor. 
+
+        Args:
+            hidden_states (torch.Tensor): Input tensor of shape (B, T, H) or (BxT, H)
+
+        Returns:
+            torch.Tensor: Normalized tensor of the same shape as input
+        """
+        return RMSNormFunction.apply(
+            hidden_states,
+            self.weight,
+            self.variance_epsilon,
+        )
+
+__all__ = ["RMSNorm"]
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/metadata.json b/build/torch29-cxx11-xpu20252-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..b911d0a2549a35a1c65ab7e77d32e5aac23cd6ac
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/metadata.json
@@ -0,0 +1,8 @@
+{
+  "version": 1,
+  "license": "Apache-2.0",
+  "python-depends": [],
+  "backend": {
+    "type": "xpu"
+  }
+}
\ No newline at end of file
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/rmsnorm/__init__.py b/build/torch29-cxx11-xpu20252-x86_64-linux/rmsnorm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/rmsnorm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is: after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we derive a unique module
+    # name from the hex-encoded hash of the absolute path instead of reusing
+    # the package name.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-xpu20252-x86_64-windows/metadata.json b/build/torch29-xpu20252-x86_64-windows/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..9cf5deed9898dce769f4cc73913d3530b92a0bd8
--- /dev/null
+++ b/build/torch29-xpu20252-x86_64-windows/metadata.json
@@ -0,0 +1,4 @@
+{
+  "version": 1,
+  "python-depends": []
+}
\ No newline at end of file
diff --git a/build/torch29-xpu20252-x86_64-windows/rmsnorm/__init__.py b/build/torch29-xpu20252-x86_64-windows/rmsnorm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c1c7731922beab2e10baf849ad97324beb02c15
--- /dev/null
+++ b/build/torch29-xpu20252-x86_64-windows/rmsnorm/__init__.py
@@ -0,0 +1,27 @@
+from . import layers
+
+from ._ops import ops
+
+
+def apply_rms_norm(input, weight, eps):
+    # ops.apply_rms_norm returns [output, rstd]; keep only the output.
+    return ops.apply_rms_norm(
+        input,
+        weight,
+        eps,
+    )[0]
+
+def apply_rms_norm_backward(grad_output, input, weight, output, rstd, eps, input_requires_grad=True, weight_requires_grad=True):
+    return ops.apply_rms_norm_backward(
+        grad_output,
+        input,
+        weight,
+        output,
+        rstd,
+        eps,
+        input_requires_grad,
+        weight_requires_grad
+    )
+
+__all__ = ["layers", "apply_rms_norm", "apply_rms_norm_backward"]
+
diff --git a/build/torch29-xpu20252-x86_64-windows/rmsnorm/_ops.py b/build/torch29-xpu20252-x86_64-windows/rmsnorm/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..572899747964300a538b85d5952f7aee04ea8c27
--- /dev/null
+++ b/build/torch29-xpu20252-x86_64-windows/rmsnorm/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rmsnorm_96c9886
+ops = torch.ops._rmsnorm_96c9886
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix the op name with this build's private namespace.
+    """
+    return f"_rmsnorm_96c9886::{op_name}"
\ No newline at end of file
diff --git a/build/torch29-xpu20252-x86_64-windows/rmsnorm/_rmsnorm_96c9886.pyd b/build/torch29-xpu20252-x86_64-windows/rmsnorm/_rmsnorm_96c9886.pyd
new file mode 100644
index 0000000000000000000000000000000000000000..2813ba3019836fb28e60f27081e73a38a8a892cc
--- /dev/null
+++ b/build/torch29-xpu20252-x86_64-windows/rmsnorm/_rmsnorm_96c9886.pyd
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0cfb67260dcf293c71463a698f1531e9d86fb497f9dcf86c296d612ffa4c142
+size 2379264
diff --git a/build/torch29-xpu20252-x86_64-windows/rmsnorm/layers.py b/build/torch29-xpu20252-x86_64-windows/rmsnorm/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..4401139637abaedc8e292bcc938ed17f3a6e1c89
--- /dev/null
+++ b/build/torch29-xpu20252-x86_64-windows/rmsnorm/layers.py
@@ -0,0 +1,59 @@
+import torch
+from ._ops import ops
+
+class RMSNormFunction(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, hidden_states, weight, variance_epsilon):
+        ctx.variance_epsilon = variance_epsilon
+        output, rstd = ops.apply_rms_norm(hidden_states, weight, variance_epsilon)
+        ctx.save_for_backward(hidden_states, weight, output, rstd)
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        hidden_states, weight, output, rstd = ctx.saved_tensors
+        grads = ops.apply_rms_norm_backward(
+            grad_output,
+            hidden_states,
+            weight,
+            output,
+            rstd,
+            ctx.variance_epsilon,
+            ctx.needs_input_grad[0],
+            ctx.needs_input_grad[1]
+        )
+        return grads[0], grads[1], None
+
+class RMSNorm(torch.nn.Module):
+    """
+    Root-mean-square layer normalization backed by the compiled RMSNormFunction op.
+
+    The module defines no __init__; `weight` and `variance_epsilon` are expected
+    to be set on the instance (e.g., copied from the layer being replaced).
+
+    Attributes:
+        weight (torch.Tensor): Learnable per-channel scale of shape (H,).
+        variance_epsilon (float): Epsilon added to the mean square for numerical stability.
+    """
+
+
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states):
+        """
+        Apply RMS normalization to the input tensor.
+
+        Args:
+            hidden_states (torch.Tensor): Input tensor of shape (B, T, H) or (BxT, H)
+
+        Returns:
+            torch.Tensor: Normalized tensor of the same shape as input
+        """
+        return RMSNormFunction.apply(
+            hidden_states,
+            self.weight,
+            self.variance_epsilon,
+        )
+
+__all__ = ["RMSNorm"]
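
All of the build variants added above expose the same Python surface, so a single usage sketch covers them. The snippet below is illustrative and not part of the diff: it assumes one variant directory (e.g. build/torch29-cxx11-cpu-x86_64-linux) has been placed on sys.path so that its rmsnorm package is importable; the shapes, epsilon, and eager reference are chosen for the example only.

```python
# Minimal usage sketch (assumption: a built variant directory is on sys.path,
# so `import rmsnorm` resolves to one of the packages added in this diff).
import torch
import rmsnorm

H = 64
eps = 1e-6
x = torch.randn(2, 16, H)
weight = torch.ones(H)

# Functional entry point: the compiled op returns [output, rstd];
# the wrapper keeps only the normalized output.
y = rmsnorm.apply_rms_norm(x, weight, eps)

# Module entry point: RMSNorm has no __init__, so the weight and epsilon
# are attached to the instance before the first call.
norm = rmsnorm.layers.RMSNorm()
norm.weight = torch.nn.Parameter(weight.clone())
norm.variance_epsilon = eps

x_train = x.clone().requires_grad_(True)
out = norm(x_train)

# Cross-check against an eager RMSNorm: x * rsqrt(mean(x^2, dim=-1) + eps) * weight.
ref = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) * weight
torch.testing.assert_close(y, ref, rtol=1e-4, atol=1e-4)

# The backward pass routes through RMSNormFunction.backward and the
# apply_rms_norm_backward op.
out.sum().backward()
assert x_train.grad.shape == x.shape
```

Note that the forward saves both `output` and `rstd` for the backward op, so the row-wise reduction does not have to be recomputed when gradients are taken, which is the usual design for fused norm kernels.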