claudios
/

VulBERTa-MLP-Devign

Text Classification

defect detection

Eval Results (legacy)

text-embeddings-inference

Model card Files Files and versions

VulBERTa-MLP-Devign / tokenization_vulberta.py

claudios's picture

Add custom tokenizer

2274e74 over 2 years ago

history blame contribute delete

1.39 kB

	from typing import List

	from tokenizers import NormalizedString, PreTokenizedString
	from tokenizers.pre_tokenizers import PreTokenizer
	from transformers import PreTrainedTokenizerFast

	try:
	from clang import cindex
	except ModuleNotFoundError as e:
	raise ModuleNotFoundError(
	"VulBERTa Clang tokenizer requires `libclang`. Please install it via `pip install libclang`.",
	) from e


	class ClangPreTokenizer:
	    """Custom pre-tokenizer that splits text with libclang's token stream.

	    The text is handed to the shared clang index as an in-memory file named
	    ``tmp.c``; every token spelling that is non-empty after stripping
	    whitespace becomes one piece.
	    """

	    # One clang index, created in the class body and reached through
	    # ``self.cidx`` by every instance.
	    cidx = cindex.Index.create()

	    def clang_split(
	        self,
	        i: int,
	        normalized_string: NormalizedString,
	    ) -> List[NormalizedString]:
	        """Return the clang tokens of ``normalized_string`` as new pieces.

	        Args:
	            i: Unused by this method; it is part of the callback signature
	                passed to ``PreTokenizedString.split``.
	            normalized_string: Piece to split. Its ``original`` attribute is
	                the text that is parsed.

	        Returns:
	            A list with one freshly built ``NormalizedString`` per token
	            spelling that is non-empty after stripping.
	        """
	        source_text = str(normalized_string.original)
	        # Nothing is read from disk: "tmp.c" is supplied via unsaved_files.
	        translation_unit = self.cidx.parse(
	            "tmp.c",
	            args=[""],
	            unsaved_files=[("tmp.c", source_text)],
	            options=0,
	        )
	        whole_file = translation_unit.cursor.extent
	        spellings = (
	            token.spelling.strip()
	            for token in translation_unit.get_tokens(extent=whole_file)
	        )
	        # Spellings that strip down to "" are dropped.
	        return [NormalizedString(text) for text in spellings if text != ""]

	    def pre_tokenize(self, pretok: PreTokenizedString):
	        """Split ``pretok`` in place, using ``clang_split`` as the callback."""
	        splitter = self.clang_split
	        pretok.split(splitter)


	class VulBERTaTokenizer(PreTrainedTokenizerFast):
	    """``PreTrainedTokenizerFast`` subclass that pre-tokenizes with clang.

	    After the base class has been initialised, the pre-tokenizer of the
	    backing ``_tokenizer`` is replaced by a custom one wrapping a
	    ``ClangPreTokenizer`` instance.
	    """

	    def __init__(self, *args, **kwargs):
	        """Initialise the base tokenizer, then install the clang pre-tokenizer.

	        Args:
	            *args: Forwarded unchanged to ``PreTrainedTokenizerFast``.
	            **kwargs: Forwarded unchanged to ``PreTrainedTokenizerFast``.
	        """
	        super().__init__(*args, **kwargs)
	        # The assignment needs ``self._tokenizer``, so it comes after the
	        # base-class initialiser has run.
	        clang_pre_tokenizer = PreTokenizer.custom(ClangPreTokenizer())
	        self._tokenizer.pre_tokenizer = clang_pre_tokenizer