Instructions to use Proooof/Finance_NLP_Toolkit with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Proooof/Finance_NLP_Toolkit with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="Proooof/Finance_NLP_Toolkit")
```

```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("Proooof/Finance_NLP_Toolkit", dtype="auto")
```

- Notebooks
- Google Colab
- Kaggle
| import json, argparse | |
| from datasets import Dataset | |
| from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer | |
| from training.utils import compute_metrics_ner | |
# Command-line options of the NER fine-tuning script, listed in the order
# they are registered (and therefore shown by --help). Each entry is the flag
# followed by the keyword arguments forwarded to ``add_argument``.
_CLI_OPTIONS = (
    ("--model_name", {"default": "bert-base-cased"}),
    (
        "--train_json",
        {
            "required": True,
            "help": "JSONL with {'tokens': [...], 'ner_tags': [...]} per line",
        },
    ),
    ("--eval_json", {"required": True}),
    ("--text_col", {"default": "tokens"}),
    ("--label_col", {"default": "ner_tags"}),
    ("--labels_file", {"default": "training/labels_ner.json"}),
    ("--output_dir", {"default": "./outputs/ner"}),
    ("--epochs", {"type": int, "default": 5}),
    ("--batch_size", {"type": int, "default": 8}),
    ("--lr", {"type": float, "default": 3e-5}),
)

parser = argparse.ArgumentParser()
for _flag, _options in _CLI_OPTIONS:
    parser.add_argument(_flag, **_options)

# Parsed once at import time; the rest of the script reads ``args``.
args = parser.parse_args()
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Every non-blank line is parsed with ``json.loads``. Blank or
    whitespace-only lines (for example an extra empty line at the end of the
    file) are skipped instead of aborting the load with a decode error.

    Args:
        path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        A list holding one parsed object per non-blank line, in file order.
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        # ``line.strip()`` is empty for blank lines, which json.loads rejects.
        return [json.loads(line) for line in f if line.strip()]
# Load the training and evaluation examples from the JSONL files given on
# the command line.
train_rows = load_jsonl(args.train_json)
eval_rows = load_jsonl(args.eval_json)

# Read the label inventory. The encoding is pinned to UTF-8 (as in
# load_jsonl) so the result does not depend on the platform's default locale.
with open(args.labels_file, "r", encoding="utf-8") as f:
    label_list = json.load(f)  # e.g., ["O","B-ORG","I-ORG","B-MONEY","I-MONEY","B-DATE","I-DATE","B-TICKER","I-TICKER"]

# Tokenizer matching the checkpoint that will be fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(args.model_name)
def align_labels_with_tokens(tokens, labels):
    """Translate one example's string labels into integer label ids.

    ``labels`` already holds one entry per word, so nothing is re-aligned
    here; ``tokens`` is accepted but not used. The id of a label is its
    position in the module-level ``label_list``.

    Args:
        tokens: The example's words (unused).
        labels: The example's label names, one per word.

    Returns:
        A list of integer ids, one per entry of ``labels``.

    Raises:
        KeyError: If a label does not occur in ``label_list``.
    """
    index_of = dict(zip(label_list, range(len(label_list))))
    return [index_of[name] for name in labels]
def encode_batch(batch):
    """Tokenize a batch of pre-split examples and attach sub-token labels.

    Args:
        batch: Mapping from column name to a list of per-example values, as
            handed over by ``Dataset.map(..., batched=True)``. The columns
            named by ``args.text_col`` (lists of words) and
            ``args.label_col`` (lists of label names) are read.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry. For every
        example it holds one label id per produced token position: the id of
        the source word's label, or -100 for positions that have no source
        word (word id ``None``).

    Raises:
        KeyError: If a label name is not part of ``label_list``.
        IndexError: If an example has fewer labels than words.
    """
    words_per_example = batch[args.text_col]
    tokenized = tokenizer(words_per_example, is_split_into_words=True, truncation=True, padding=True)

    encoded_labels = []
    for i, labels in enumerate(batch[args.label_col]):
        # Convert the word-level labels once per example rather than once per
        # sub-token; the result is the same for every position.
        word_label_ids = align_labels_with_tokens(words_per_example[i], labels)
        # Every sub-token inherits the label of the word it was split from.
        encoded_labels.append([
            -100 if w_id is None else word_label_ids[w_id]
            for w_id in tokenized.word_ids(batch_index=i)
        ])

    tokenized["labels"] = encoded_labels
    return tokenized
# Build the tokenized datasets. The raw word and label columns are dropped
# after encoding; encode_batch supplies the tokenizer output and "labels".
_raw_columns = [args.text_col, args.label_col]

train_ds = Dataset.from_list(train_rows)
train_ds = train_ds.map(encode_batch, batched=True, remove_columns=_raw_columns)

eval_ds = Dataset.from_list(eval_rows)
eval_ds = eval_ds.map(encode_batch, batched=True, remove_columns=_raw_columns)
# Token-classification head sized to the label inventory; both label maps
# are stored in the model config so predictions can be decoded to names.
id2label = {i: l for i, l in enumerate(label_list)}
label2id = {l: i for i, l in enumerate(label_list)}
model = AutoModelForTokenClassification.from_pretrained(
    args.model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

# Pads inputs and labels per training batch.
data_collator = DataCollatorForTokenClassification(tokenizer)

training_args = TrainingArguments(
    output_dir=args.output_dir,
    evaluation_strategy="epoch",
    # load_best_model_at_end requires the save strategy to match the
    # evaluation strategy; with the default step-based saving,
    # TrainingArguments raises a ValueError.
    save_strategy="epoch",
    learning_rate=args.lr,
    per_device_train_batch_size=args.batch_size,
    per_device_eval_batch_size=args.batch_size,
    num_train_epochs=args.epochs,
    weight_decay=0.01,
    load_best_model_at_end=True,
    # NOTE(review): assumes compute_metrics_ner returns an "f1" entry - confirm.
    metric_for_best_model="f1",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=lambda p: compute_metrics_ner(p, label_list),
)

trainer.train()

# Persist the model and the tokenizer side by side in the output directory.
trainer.save_model(args.output_dir)
tokenizer.save_pretrained(args.output_dir)