Rename MODEL_CARD.md to README.md
Browse files- MODEL_CARD.md → README.md +41 -24
MODEL_CARD.md → README.md
RENAMED
|
@@ -36,9 +36,7 @@ model-index:
|
|
| 36 |
|
| 37 |
**A Robust Preference Learning Model for Agentic Reasoning Systems**
|
| 38 |
|
| 39 |
-
[](https://huggingface.co/datasets/akleshmishra/orm-pairwise-preference-pairs)
|
| 41 |
-
[](https://github.com/Coder-12)
|
| 42 |
|
| 43 |
</div>
|
| 44 |
|
|
@@ -200,21 +198,38 @@ L = -log(sigmoid(f(x_chosen) - f(x_rejected)))
|
|
| 200 |
### Installation
|
| 201 |
|
| 202 |
```bash
|
| 203 |
-
pip install transformers torch
|
| 204 |
```
|
| 205 |
|
| 206 |
### Basic Usage
|
| 207 |
|
| 208 |
```python
|
| 209 |
-
from transformers import AutoModel, AutoTokenizer
|
| 210 |
import torch
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
|
| 212 |
-
#
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
model.eval()
|
| 217 |
-
model.to("cuda" if torch.cuda.is_available() else "cpu")
|
| 218 |
|
| 219 |
# Score a single reasoning trace
|
| 220 |
def score_trace(trace_text: str) -> float:
|
|
@@ -229,12 +244,15 @@ def score_trace(trace_text: str) -> float:
|
|
| 229 |
max_length=512,
|
| 230 |
padding=True
|
| 231 |
)
|
| 232 |
-
inputs = {k: v.to(
|
| 233 |
|
| 234 |
with torch.no_grad():
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
|
|
|
|
|
|
|
|
|
| 238 |
|
| 239 |
return score
|
| 240 |
|
|
@@ -332,26 +350,25 @@ This work builds upon and complements:
|
|
| 332 |
If you use this model in your research, please cite:
|
| 333 |
|
| 334 |
```bibtex
|
| 335 |
-
@article{
|
| 336 |
-
title={
|
| 337 |
author={Mishra, Aklesh},
|
| 338 |
-
journal={arXiv preprint
|
| 339 |
-
year={
|
|
|
|
| 340 |
}
|
| 341 |
```
|
| 342 |
|
| 343 |
## Resources
|
| 344 |
|
| 345 |
-
- **Paper**:
|
| 346 |
- 💾 **Dataset**: [HuggingFace](https://huggingface.co/datasets/LossFunctionLover/orm-pairwise-preference-pairs)
|
| 347 |
-
- 💻 **Code**: [GitHub](https://github.com/Coder-12)
|
| 348 |
-
- **Training Logs**: [Weights & Biases](wandb-link) (if available)
|
| 349 |
|
| 350 |
## 📧 Contact
|
| 351 |
|
| 352 |
**Aklesh Mishra**
|
| 353 |
- Email: akleshmishra7@gmail.com
|
| 354 |
-
-
|
| 355 |
|
| 356 |
## License
|
| 357 |
|
|
@@ -382,4 +399,4 @@ This research builds upon months of dedicated work in preference learning and ag
|
|
| 382 |
|
| 383 |
---
|
| 384 |
|
| 385 |
-
**Last Updated**:
|
|
|
|
| 36 |
|
| 37 |
**A Robust Preference Learning Model for Agentic Reasoning Systems**
|
| 38 |
|
| 39 |
+
[](https://huggingface.co/datasets/LossFunctionLover/orm-pairwise-preference-pairs)
|
|
|
|
|
|
|
| 40 |
|
| 41 |
</div>
|
| 42 |
|
|
|
|
| 198 |
### Installation
|
| 199 |
|
| 200 |
```bash
|
| 201 |
+
pip install transformers torch huggingface_hub
|
| 202 |
```
|
| 203 |
|
| 204 |
### Basic Usage
|
| 205 |
|
| 206 |
```python
|
|
|
|
| 207 |
import torch
|
| 208 |
+
from transformers import AutoModel, AutoTokenizer
|
| 209 |
+
from huggingface_hub import hf_hub_download
|
| 210 |
+
|
| 211 |
+
# Download the trained model weights
|
| 212 |
+
model_path = hf_hub_download(
|
| 213 |
+
repo_id="LossFunctionLover/pairwise-orm-model",
|
| 214 |
+
filename="pairwise_orm.pt"
|
| 215 |
+
)
|
| 216 |
+
|
| 217 |
+
# Load the base encoder (frozen during training)
|
| 218 |
+
base_model = AutoModel.from_pretrained("facebook/opt-1.3b")
|
| 219 |
+
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")
|
| 220 |
+
|
| 221 |
+
# Load the trained scoring head weights
|
| 222 |
+
scoring_head_weights = torch.load(model_path, map_location="cpu")
|
| 223 |
+
|
| 224 |
+
# Initialize scoring head (single linear layer)
|
| 225 |
+
hidden_size = base_model.config.hidden_size
|
| 226 |
+
scoring_head = torch.nn.Linear(hidden_size, 1)
|
| 227 |
+
scoring_head.load_state_dict(scoring_head_weights)
|
| 228 |
|
| 229 |
+
# Move to device
|
| 230 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 231 |
+
base_model.eval().to(device)
|
| 232 |
+
scoring_head.eval().to(device)
|
|
|
|
|
|
|
| 233 |
|
| 234 |
# Score a single reasoning trace
|
| 235 |
def score_trace(trace_text: str) -> float:
|
|
|
|
| 244 |
max_length=512,
|
| 245 |
padding=True
|
| 246 |
)
|
| 247 |
+
inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 248 |
|
| 249 |
with torch.no_grad():
|
| 250 |
+
# Get base model embeddings
|
| 251 |
+
encoder_outputs = base_model(**inputs)
|
| 252 |
+
# Pool final token (EOS)
|
| 253 |
+
pooled = encoder_outputs.last_hidden_state[:, -1, :]
|
| 254 |
+
# Get reward score
|
| 255 |
+
score = scoring_head(pooled).squeeze(-1).cpu().item()
|
| 256 |
|
| 257 |
return score
|
| 258 |
|
|
|
|
| 350 |
If you use this model in your research, please cite:
|
| 351 |
|
| 352 |
```bibtex
|
| 353 |
+
@article{mishra2026orm,
|
| 354 |
+
title={Stable Outcome Reward Modeling via Pairwise Preference Learning},
|
| 355 |
author={Mishra, Aklesh},
|
| 356 |
+
journal={arXiv preprint},
|
| 357 |
+
year={2026},
|
| 358 |
+
note={Under review}
|
| 359 |
}
|
| 360 |
```
|
| 361 |
|
| 362 |
## Resources
|
| 363 |
|
| 364 |
+
- **Paper**: Submitted to arXiv (under review)
|
| 365 |
- 💾 **Dataset**: [HuggingFace](https://huggingface.co/datasets/LossFunctionLover/orm-pairwise-preference-pairs)
|
|
|
|
|
|
|
| 366 |
|
| 367 |
## 📧 Contact
|
| 368 |
|
| 369 |
**Aklesh Mishra**
|
| 370 |
- Email: akleshmishra7@gmail.com
|
| 371 |
+
- Independent Researcher
|
| 372 |
|
| 373 |
## License
|
| 374 |
|
|
|
|
| 399 |
|
| 400 |
---
|
| 401 |
|
| 402 |
+
**Last Updated**: January 22, 2026
|