Text Classification
Transformers
Safetensors
roberta
devign
defect detection
code
Eval Results (legacy)
text-embeddings-inference
Instructions to use claudios/VulBERTa-MLP-Devign with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use claudios/VulBERTa-MLP-Devign with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="claudios/VulBERTa-MLP-Devign")

# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("claudios/VulBERTa-MLP-Devign")
model = AutoModelForSequenceClassification.from_pretrained("claudios/VulBERTa-MLP-Devign")
- Notebooks
- Google Colab
- Kaggle
| from typing import List | |
| from tokenizers import NormalizedString, PreTokenizedString | |
| from tokenizers.pre_tokenizers import PreTokenizer | |
| from transformers import PreTrainedTokenizerFast | |
| try: | |
| from clang import cindex | |
| except ModuleNotFoundError as e: | |
| raise ModuleNotFoundError( | |
| "VulBERTa Clang tokenizer requires `libclang`. Please install it via `pip install libclang`.", | |
| ) from e | |
class ClangPreTokenizer:
    """Pre-tokenizer that splits input text with libclang's lexer.

    Intended to be wrapped with ``PreTokenizer.custom(...)``, which calls
    :meth:`pre_tokenize` on each ``PreTokenizedString``.
    """

    # Single libclang index, created when the class body executes and shared
    # by every instance through the class attribute.
    cidx = cindex.Index.create()

    def clang_split(
        self,
        i: int,
        normalized_string: NormalizedString,
    ) -> List[NormalizedString]:
        """Lex one piece of text with clang and return its tokens.

        Args:
            i: Index supplied by ``PreTokenizedString.split``; not used here.
            normalized_string: The piece to split. Its ``original`` text (not
                the normalized form) is what gets handed to clang.

        Returns:
            One freshly built ``NormalizedString`` per clang token, in lexing
            order. Tokens whose spelling is empty after stripping whitespace
            are dropped.
        """
        source_text = str(normalized_string.original)
        # The text is supplied through ``unsaved_files`` under the name
        # "tmp.c", so the parse call is given the content directly.
        translation_unit = self.cidx.parse(
            "tmp.c",
            args=[""],
            unsaved_files=[("tmp.c", source_text)],
            options=0,
        )
        whole_file = translation_unit.cursor.extent
        spellings = (
            token.spelling.strip()
            for token in translation_unit.get_tokens(extent=whole_file)
        )
        return [NormalizedString(text) for text in spellings if text]

    def pre_tokenize(self, pretok: PreTokenizedString):
        """Split ``pretok`` in place, using :meth:`clang_split` as the splitter."""
        pretok.split(self.clang_split)
class VulBERTaTokenizer(PreTrainedTokenizerFast):
    """``PreTrainedTokenizerFast`` subclass that pre-tokenizes with clang.

    All constructor arguments are forwarded unchanged to the parent class;
    afterwards the backend tokenizer's pre-tokenizer is replaced by a custom
    ``ClangPreTokenizer``.
    """

    def __init__(self, *args, **kwargs):
        """Build the fast tokenizer, then install the clang pre-tokenizer."""
        super().__init__(*args, **kwargs)
        # NOTE(review): custom Python pre-tokenizers are presumably not
        # serializable by `tokenizers` -- confirm before relying on saving
        # this tokenizer to disk.
        clang_pre_tokenizer = PreTokenizer.custom(ClangPreTokenizer())
        self._tokenizer.pre_tokenizer = clang_pre_tokenizer