| | --- |
| | license: apache-2.0 |
| | language: |
| | - he |
| | datasets: |
| | - HeTree/MevakerConcSen |
| | --- |
| | ## Hebrew Conclusion Extraction Model (based on sequence plus context classification) |
| |
|
| | #### How to use |
| |
|
| | ```python |
| | from transformers import RobertaTokenizerFast, AutoModelForSequenceClassification |
| | from datasets import load_dataset, Dataset |
| | from functools import partial |
| | from tqdm.auto import tqdm |
| | tqdm._instances.clear() |
| | |
| | def tokenize_function(example): |
| | inputs = tokenizer( |
| | example["sentence"], |
| | example["context"], |
| | max_length=512, |
| | truncation=True, |
| | padding="max_length", |
| | ) |
| | return inputs |
| | |
| | def create_windowed_context_ds(context_l, example, idx): |
| | example["context"] = context_l[idx] |
| | return example |
| | |
| | def create_windowed_context(raw_dataset, window_size): |
| | df_pandas = raw_dataset['train'].to_pandas() |
| | len1 = len(raw_dataset['train']) |
| | context_l = [] |
| | for i in tqdm(range(len1)): |
| | if i - window_size <0: |
| | context_l.append(' '.join(df_pandas['sentence'][0:window_size])) |
| | else: |
| | if i + window_size > len1 : |
| | context_l.append(' '.join(df_pandas['sentence'][i - window_size:-1])) |
| | else: |
| | context_l.append(' '.join(df_pandas['sentence'][i - window_size:i + window_size])) |
| | return context_l |
| | |
| | model = AutoModelForSequenceClassification.from_pretrained('HeTree/HeConEspc', num_labels=2) |
| | tokenizer = RobertaTokenizerFast.from_pretrained('HeTree/HeConEspc') |
| | raw_dataset = load_dataset('HeTree/MevakerConcSen') |
| | window_size = 5 |
| | context_l = create_windowed_context(raw_dataset, window_size) |
| | raw_dataset_window = raw_dataset.map(partial(create_windowed_context_ds, context_l), batched=False, with_indices=True) |
| | tokenized_data = raw_dataset_window.map(tokenize_function, batched=True) |
| | ``` |
| |
|
| |
|
| | ### Citing |
| |
|
| | If you use HeConEspc in your research, please cite [Mevaker: Conclusion Extraction and Allocation Resources for the Hebrew Language](https://arxiv.org/abs/2403.09719). |
| | ```bibtex |
| | @article{shalumov2024mevaker, |
| | title={Mevaker: Conclusion Extraction and Allocation Resources for the Hebrew Language}, |
| | author={Vitaly Shalumov and Harel Haskey and Yuval Solaz}, |
| | year={2024}, |
| | eprint={2403.09719}, |
| | archivePrefix={arXiv}, |
| | primaryClass={cs.CL} |
| | } |
| | ``` |