merve HF Staff

Upload train.py

bfb4bc1 over 3 years ago

4.23 kB

	import skops
	import sklearn
	import matplotlib.pyplot as plt
	from sklearn.preprocessing import OneHotEncoder
	from sklearn.impute import SimpleImputer
	from sklearn.compose import ColumnTransformer
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.pipeline import Pipeline

	# preprocess the dataset

	# pandas is imported here because the import header of this script never
	# brings `pd` into scope; without it `pd.read_csv` raises NameError.
	import pandas as pd

	# Load the training CSV of the Tabular Playground Series (Aug 2022) from the
	# Kaggle-style input directory, relative to the working directory.
	df = pd.read_csv("../input/tabular-playground-series-aug-2022/train.csv")


	# Column-wise preprocessing: mean-impute `loading` and every float64 column,
	# and one-hot encode `attribute_0`, `attribute_1` and `product_code`.
	# NOTE(review): if `loading` is itself stored as float64 it is selected by
	# both imputers and would show up twice in the transformed output - confirm.
	float_columns = list(df.columns[df.dtypes == 'float64'])

	imputer_steps = [
	    ("loading_missing_value_imputer", SimpleImputer(strategy="mean"), ["loading"]),
	    ("numerical_missing_value_imputer", SimpleImputer(strategy="mean"), float_columns),
	]
	encoder_steps = [
	    ("attribute_0_encoder", OneHotEncoder(categories="auto"), ["attribute_0"]),
	    ("attribute_1_encoder", OneHotEncoder(categories="auto"), ["attribute_1"]),
	    ("product_code_encoder", OneHotEncoder(categories="auto"), ["product_code"]),
	]

	column_transformer_pipeline = ColumnTransformer(imputer_steps + encoder_steps)

	# Remove the `id` column from the frame before building features.
	df = df.drop(columns=["id"])

	# Separate the `failure` target from the remaining feature columns.
	y = df["failure"]
	X = df.drop(columns=["failure"])

	# Chain the column preprocessing with a decision tree capped at depth 4.
	tree_classifier = DecisionTreeClassifier(max_depth=4)
	pipeline = Pipeline(steps=[
	    ("transformation", column_transformer_pipeline),
	    ("model", tree_classifier),
	])

	# split the data and train the model
	# NOTE: no random_state is passed, so the split differs between runs.
	from sklearn.model_selection import train_test_split

	X_train, X_test, y_train, y_test = train_test_split(X, y)
	pipeline.fit(X_train, y_train)

	# we will now use skops to initialize a repository,
	# create a model card, and push the model to the
	# Hugging Face Hub
	import pickle
	from skops import card, hub_utils

	local_repo = "decision-tree-playground-kaggle"
	model_path = "model.pkl"

	# serialize the fitted pipeline to disk
	with open(model_path, "wb") as model_file:
	    pickle.dump(pipeline, model_file)

	# initialize the local repository from the pickled model; the held-out
	# features are passed as the `data` argument
	sklearn_requirement = f"scikit-learn={sklearn.__version__}"
	hub_utils.init(
	    model=model_path,
	    requirements=[sklearn_requirement],
	    dst=local_repo,
	    task="tabular-classification",
	    data=X_test,
	)

	# initialize the model card, taking its metadata from the config found in
	# the local repository directory
	from pathlib import Path
	model_card = card.Card(pipeline, metadata=card.metadata_from_config(Path(local_repo)))

	## let's fill some information about the model
	limitations = "This model is not ready to be used in production."
	model_description = "This is a DecisionTreeClassifier model built for Kaggle Tabular Playground Series August 2022, trained on supersoaker production failures dataset."
	model_card_authors = "huggingface"
	# The file path is wrapped in quotes so the snippet shown in the card is
	# valid Python; unquoted, `open(...)` would receive a bare expression
	# (decision-tree-playground-kaggle/model.pkl) instead of a string.
	get_started_code = f"import pickle \nwith open('{local_repo}/{model_path}', 'rb') as file: \n clf = pickle.load(file)"

	# pass this information to the card
	model_card.add(
	    get_started_code=get_started_code,
	    model_card_authors=model_card_authors,
	    limitations=limitations,
	    model_description=model_description,
	)

	# we will now evaluate the model and write eval results to the card
	from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, confusion_matrix

	# let's make a prediction first: the metrics and the confusion matrix below
	# all need `y_pred`, so it has to exist before any of them is computed
	y_pred = pipeline.predict(X_test)

	model_card.add(eval_method="The model is evaluated using test split, on accuracy and F1 score with micro average.")
	model_card.add_metrics(accuracy=accuracy_score(y_test, y_pred))
	# the metric name contains a space, so it is passed via dict unpacking
	model_card.add_metrics(**{"f1 score": f1_score(y_test, y_pred, average="micro")})

	# the fitted classifier is the estimator of the last pipeline step
	model = pipeline.steps[-1][1]

	# we will plot the tree and add the plot to our card
	from sklearn.tree import plot_tree
	plt.figure()
	plot_tree(model,filled=True)
	plt.savefig(f'{local_repo}/tree.png',format='png',bbox_inches = "tight")

	# build and draw the confusion matrix from the test-split predictions
	cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
	disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
	disp.plot()

	# save the plot
	plt.savefig(Path(local_repo) / "confusion_matrix.png")

	# add figures to model card with their new sections as keys to the dictionary
	# The image paths are given relative to README.md: the card is saved inside
	# `local_repo`, the same directory the PNG files were written to, so
	# prefixing them with `local_repo` would point at a non-existent
	# sub-directory once the README is rendered from the repository root.
	model_card.add_plot(**{"Tree Plot": "tree.png", "Confusion Matrix": "confusion_matrix.png"})

	# save the card
	model_card.save(f"{local_repo}/README.md")

	# we can now push the model!
	# if the repository doesn't exist remotely on the Hugging Face Hub, it will be created when we set create_remote to True
	import os

	# `token` was never defined in this script, which made the push fail with
	# NameError. Read it from the environment so the secret stays out of the
	# source; when HF_TOKEN is unset this is None.
	# NOTE(review): a None token is expected to fall back to the credentials
	# stored by `huggingface-cli login` - confirm for the installed skops version.
	token = os.environ.get("HF_TOKEN")

	repo_id = "scikit-learn/tabular-playground"
	hub_utils.push(
	    repo_id=repo_id,
	    source=local_repo,
	    token=token,
	    commit_message="pushing files to the repo from the example!",
	    create_remote=True,
	)