Instructions to use Language/DemoRepo with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- TF-Keras
How to use Language/DemoRepo with TF-Keras:
```python
# Note: 'keras<3.x' or 'tf_keras' must be installed (legacy)
# See https://github.com/keras-team/tf-keras for more details.
from huggingface_hub import from_pretrained_keras

model = from_pretrained_keras("Language/DemoRepo")
```
- Notebooks
- Google Colab
- Kaggle
| #Load all the required modules. | |
| from tensorflow.keras.models import Model | |
| from tensorflow.keras import models | |
| from tensorflow.keras.utils import plot_model | |
| from tensorflow.keras.layers import Input,LSTM,Dense | |
| from sklearn.feature_extraction.text import CountVectorizer | |
| import numpy as np | |
| import pickle | |
# Containers for the lower-cased sentence pairs and their character vocabularies.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

# Read the whole dataset file and split it into rows.
with open('eng-french.txt', 'r', encoding='utf-8') as f:
    rows = f.read().split('\n')

# Use only the first 10,000 rows of the dataset.
for row in rows[:10000]:
    # Skip empty rows (e.g. the empty string left after a trailing newline);
    # they cannot be unpacked into an (input, target) pair below.
    if not row:
        continue
    # Each row is "<input>\t<target>".
    input_text, target_text = row.split('\t')
    input_text = input_text.lower()
    # Wrap the target in '\t' (start marker) and '\n' (end marker).
    target_text = ('\t' + target_text + '\n').lower()
    input_texts.append(input_text)
    target_texts.append(target_text)
    # Collect every distinct character seen on each side.
    input_characters.update(input_text)
    target_characters.update(target_text)

# Freeze the vocabularies as sorted lists so character indices are stable.
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)

# Vocabulary sizes for the encoder and the decoder.
num_en_chars = len(input_characters)
num_dec_chars = len(target_characters)

# Longest input and target text, used as the padded sequence lengths.
max_input_length = max(len(text) for text in input_texts)
max_target_length = max(len(text) for text in target_texts)
def bagofcharacters(input_texts, target_texts):
    """One-hot encode character sequences for the encoder and the decoder.

    Reads the module-level ``input_characters``, ``target_characters``,
    ``max_input_length`` and ``max_target_length``.

    Args:
        input_texts: Source strings for the encoder.
        target_texts: Target strings for the decoder, paired with
            ``input_texts`` by position.

    Returns:
        A tuple ``(en_in_data, dec_in_data, dec_tr_data)`` of float32 NumPy
        arrays. ``dec_tr_data`` is the decoder input with its first
        character dropped, i.e. one timestep ahead.
    """
    # Padding rows: a one-hot vector at index 0 of the encoder vocabulary and
    # at index 2 of the decoder vocabulary.
    # NOTE(review): presumably both indices are the space character in the
    # sorted character lists -- confirm against the dataset.
    pad_en = [1] + [0] * (len(input_characters) - 1)
    pad_dec = [0] * len(target_characters)
    pad_dec[2] = 1

    # Fit one character-level vectorizer per vocabulary, once, instead of
    # refitting a shared vectorizer twice for every sample. The `tokenizer`
    # argument is omitted because it is not used when analyzer='char'.
    en_cv = CountVectorizer(binary=True, stop_words=None, analyzer='char')
    en_cv.fit(input_characters)
    dec_cv = CountVectorizer(binary=True, stop_words=None, analyzer='char')
    dec_cv.fit(target_characters)

    en_in_data = []
    dec_in_data = []
    dec_tr_data = []
    for input_t, target_t in zip(input_texts, target_texts):
        # Each character is transformed as its own document, giving one row
        # per character: 1 where the character is present, 0 otherwise.
        en_rows = en_cv.transform(list(input_t)).toarray().tolist()
        dec_rows = dec_cv.transform(list(target_t)).toarray().tolist()
        # The decoder target is one timestep ahead: it skips the first
        # character, i.e. the '\t' start marker.
        tr_rows = dec_rows[1:]

        # Pad every sequence up to its maximum length; a non-positive count
        # yields an empty list, so full-length sequences are left untouched.
        en_rows.extend([pad_en] * (max_input_length - len(input_t)))
        dec_rows.extend([pad_dec] * (max_target_length - len(target_t)))
        tr_rows.extend([pad_dec] * (max_target_length - len(target_t) + 1))

        en_in_data.append(en_rows)
        dec_in_data.append(dec_rows)
        dec_tr_data.append(tr_rows)

    # Convert the nested lists to float32 NumPy arrays.
    en_in_data = np.array(en_in_data, dtype="float32")
    dec_in_data = np.array(dec_in_data, dtype="float32")
    dec_tr_data = np.array(dec_tr_data, dtype="float32")
    return en_in_data, dec_in_data, dec_tr_data
# ---- Encoder ----
# Variable-length sequences of one-hot vectors over the encoder vocabulary.
en_inputs = Input(shape=(None, num_en_chars))
# 256-unit LSTM; return_state=True exposes the final hidden and cell states.
encoder = LSTM(256, return_state=True)
# The encoder output itself is not used further; only the states are kept.
en_outputs, state_h, state_c = encoder(en_inputs)
en_states = [state_h, state_c]

# ---- Decoder ----
# Variable-length sequences of one-hot vectors over the decoder vocabulary.
dec_inputs = Input(shape=(None, num_dec_chars))
# 256-unit LSTM that returns the full output sequence as well as its states.
dec_lstm = LSTM(256, return_sequences=True, return_state=True)
# Start the decoder from the encoder's final states; its own states are
# discarded here.
dec_outputs, _, _ = dec_lstm(dec_inputs, initial_state=en_states)
# Softmax over the decoder vocabulary at every timestep.
dec_dense = Dense(num_dec_chars, activation="softmax")
dec_outputs = dec_dense(dec_outputs)

# Model mapping [encoder input, decoder input] to the decoder output sequence.
model = Model([en_inputs, dec_inputs], dec_outputs)
# Persist the vocabularies and sequence dimensions to "training_data.pkl".
# The `with` block guarantees the file is flushed and closed after the dump.
with open("training_data.pkl", "wb") as training_data_file:
    pickle.dump(
        {
            'input_characters': input_characters,
            'target_characters': target_characters,
            'max_input_length': max_input_length,
            'max_target_length': max_target_length,
            'num_en_chars': num_en_chars,
            'num_dec_chars': num_dec_chars,
        },
        training_data_file,
    )
# Load the data and train the model.
# NOTE: the training steps below are commented out, so as written the model
# is saved without being compiled or fitted in this script.
# en_in_data,dec_in_data,dec_tr_data = bagofcharacters(input_texts,target_texts)
# model.compile(
#     optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]
# )
# model.fit(
#     [en_in_data, dec_in_data],
#     dec_tr_data,
#     batch_size=64,
#     epochs=200,
#     validation_split=0.2,
# )

# Save the model under "s2s".
model.save("s2s")

# Print the layer summary and write the architecture diagram to a PNG file.
model.summary()
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)