| | import pandas as pd |
| | import torch |
def preparing_data(text: str, domain: int) -> pd.DataFrame:
    """Wrap one user text and its domain id in a two-row DataFrame.

    The first row is always the fixed placeholder ``'hello world'`` with
    domain ``0``; the second row carries the values passed in.
    NOTE(review): the reason for the placeholder row is not visible in
    this file -- confirm with the caller before removing it.

    Args:
        text: input text from the user.
        domain: output domain from the domain identification pipeline.

    Returns:
        DataFrame with columns ``text`` and ``domain``: the placeholder
        row followed by the supplied row.
    """
    # Build both rows in one literal: placeholder first, user input second.
    return pd.DataFrame(
        {
            'text': ['hello world', text],
            'domain': [0, domain],
        }
    )
| |
|
| |
|
def loading_data(tokenizer, df: pd.DataFrame):
    """Tokenize every text in ``df`` and collect the results as tensors.

    Args:
        tokenizer: callable invoked as ``tokenizer(text)`` for each text.
            Its result is indexed with the keys ``"token_id"`` and
            ``"mask"``; both values must be tensors that ``torch.cat``
            can join along dim 0 (presumably shape ``(1, seq_len)`` with
            a common ``seq_len`` -- TODO confirm against the tokenizer).
        df: DataFrame with a ``text`` column and a ``domain`` column.
            Any index is accepted; rows are processed in stored order.

    Returns:
        Tuple ``(input_ids, input_masks, input_domains)``: the per-text
        token-id tensors concatenated along dim 0, the per-text mask
        tensors concatenated along dim 0, and a tensor built from the
        ``domain`` column.
    """
    ids = []
    masks = []

    # Iterate over the column values rather than ``texts[i]``: integer
    # ``Series[i]`` is a *label* lookup, which breaks (KeyError or wrong
    # row) when the frame does not carry a default 0..n-1 index.
    for text in df["text"]:
        encoded = tokenizer(text)
        ids.append(encoded["token_id"])
        masks.append(encoded["mask"])

    input_ids = torch.cat(ids, dim=0)
    input_masks = torch.cat(masks, dim=0)
    # ``tolist`` yields the domain values in row order, independent of index.
    input_domains = torch.tensor(df["domain"].tolist())

    return input_ids, input_masks, input_domains
| |
|
| |
|