| | ''' |
| | Split a given dataset into three different datasets: training, validation and |
| | testing. |
| | |
| | This is achieved by splitting the given list of sentences into three separate |
| | lists according to either a given ratio (e.g. [0.7, 0.1, 0.2]) or by an |
| | explicit enumeration. The sentences are also tokenised using the given |
| | vocabulary. |
| | |
| | Also splits a given list of dictionaries containing information about |
| | each sentence. |
| | |
| | An additional parameter can be set 'extend_with', which will extend the given |
| | vocabulary with up to 'extend_with' tokens, taken from the training dataset. |
| | ''' |
| | from __future__ import print_function, unicode_literals |
| | import example_helper |
| | import json |
| |
|
| | from torchmoji.sentence_tokenizer import SentenceTokenizer |
| |
|
# Ten toy sentences used as the dataset to split. The first nine follow a
# common pattern; the last one carries an extra token ('newword') that is
# absent from the pretrained vocabulary, which lets the second example below
# exercise the 'extend_with' behaviour.
DATASET = ['I am sentence {}'.format(i) for i in range(9)]
DATASET.append('I am sentence 9 newword')
| |
|
# One info dict per sentence in DATASET (parallel by index); the splitter is
# expected to partition these alongside the sentences themselves.
INFO_DICTS = [{'label': 'sentence {}'.format(i)} for i in range(10)]
| |
|
# Load the pretrained vocabulary and build a tokenizer with a fixed maximum
# sentence length of 30 tokens.
with open('../model/vocabulary.json', 'r') as vocab_file:
    vocabulary = json.load(vocab_file)
tokenizer = SentenceTokenizer(vocabulary, 30)

# Split using the tokenizer's default ratio-based partitioning.
print(tokenizer.split_train_val_test(DATASET, INFO_DICTS))

# Split using an explicit enumeration of indices for each partition
# (train / validation / test), extending the vocabulary with up to one
# new token taken from the training sentences.
explicit_split = [[0, 1, 2, 4, 9], [5, 6], [7, 8, 3]]
print(tokenizer.split_train_val_test(
    DATASET, INFO_DICTS, explicit_split, extend_with=1))
| |
|