Upload 2 files

ce40857 verified 12 months ago

4.48 kB

	# Script to train and save the password health model.
	# Loads labeled password dataset, extracts features, trains a RandomForest classifier, and saves the model.

	import pandas as pd
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import accuracy_score
	import joblib
	import random
	import string
	import re
	import math
	import zlib
	from collections import Counter

	# Read the weak password list into memory, one entry per line of the file,
	# with surrounding whitespace trimmed. Undecodable bytes are dropped.
	with open("weak_passwords.txt", "r", encoding="utf-8", errors="ignore") as weak_file:
	    rockyou_passwords = list(map(str.strip, weak_file))

	# Build the labeled training corpus: weak, medium, and strong password samples.
	# Weak: real passwords drawn from weak_passwords.txt.
	# Medium/Strong: synthetic random passwords with varying length and alphabet.

	SAMPLES_PER_CLASS = 100_000  # target number of samples for each strength class
	SYMBOLS = "!@#$%"  # symbol alphabet used by the synthetic generators
	RANDOM_SEED = 42  # fixed seed so the generated dataset is reproducible

	# A dedicated generator makes the dataset repeatable without reseeding the
	# global `random` module state.
	rng = random.Random(RANDOM_SEED)

	# random.sample raises ValueError when asked for more items than the list
	# holds, so cap the class size at what the weak list can supply and use the
	# same size for every class to keep the dataset balanced.
	samples_per_class = min(SAMPLES_PER_CLASS, len(rockyou_passwords))
	if samples_per_class == 0:
	    raise ValueError("weak_passwords.txt contains no passwords to sample from")
	if samples_per_class < SAMPLES_PER_CLASS:
	    print(
	        f"Only {samples_per_class} weak passwords available; "
	        f"using {samples_per_class} samples per class"
	    )

	# Weak passwords: a random sample (without replacement) of the loaded list.
	weak_passwords = rng.sample(rockyou_passwords, samples_per_class)

	plain_chars = string.ascii_letters + string.digits
	symbol_chars = plain_chars + SYMBOLS

	# Medium passwords: random length between 8-12, letters and digits, and
	# symbols allowed in roughly 20% of the samples.
	medium_passwords = []
	for _ in range(samples_per_class):
	    length = rng.randint(8, 12)
	    allowed_chars = symbol_chars if rng.random() < 0.2 else plain_chars
	    medium_passwords.append(''.join(rng.choices(allowed_chars, k=length)))

	# Strong passwords: random length between 12-16, letters, digits, and symbols.
	strong_passwords = [
	    ''.join(rng.choices(symbol_chars, k=rng.randint(12, 16)))
	    for _ in range(samples_per_class)
	]

	# Combine all passwords and assign labels: 0 (weak), 1 (medium), 2 (strong).
	# Labels are sized from the lists themselves so they can never drift out of
	# step with the data.
	data = weak_passwords + medium_passwords + strong_passwords
	labels = (
	    [0] * len(weak_passwords)
	    + [1] * len(medium_passwords)
	    + [2] * len(strong_passwords)
	)

	# Feature extractor used to turn a raw password into ML model inputs.
	def password_features(password: str) -> dict:
	    """
	    Compute the numeric features of a password used for strength classification.

	    The returned dictionary holds, in this order: "length", "entropy",
	    "has_upper", "has_symbol", "has_leet", "repetition", "digit_ratio",
	    "unique_ratio", "bigram_entropy" and "compression_ratio".
	    For an empty password every feature is 0 except "compression_ratio",
	    which is 1.0.
	    """
	    size = len(password)
	    distinct = len(set(password))

	    features = {
	        "length": size,
	        # log2 of (distinct characters ** length)
	        "entropy": math.log2(distinct ** size) if password else 0,
	        "has_upper": int(re.search(r"[A-Z]", password) is not None),
	        # Anything that is not an ASCII letter or digit counts as a symbol.
	        "has_symbol": int(re.search(r"[^A-Za-z0-9]", password) is not None),
	        "has_leet": int(any(ch in "@3!0" for ch in password)),
	        # The same character three or more times in a row.
	        "repetition": int(re.search(r"(.)\1{2,}", password) is not None),
	    }

	    # Share of digits and share of distinct characters.
	    digit_count = sum(ch.isdigit() for ch in password)
	    if password:
	        features["digit_ratio"] = digit_count / size
	        features["unique_ratio"] = distinct / size
	    else:
	        features["digit_ratio"] = 0
	        features["unique_ratio"] = 0

	    # Shannon entropy of the distribution of adjacent character pairs.
	    bigram_entropy = 0
	    if size >= 2:
	        pair_counts = Counter(password[i:i + 2] for i in range(size - 1))
	        pair_total = size - 1  # number of adjacent pairs in the password
	        bigram_entropy = -sum(
	            (seen / pair_total) * math.log2(seen / pair_total)
	            for seen in pair_counts.values()
	        )
	    features["bigram_entropy"] = bigram_entropy

	    # zlib-compressed byte length divided by the character count.
	    if password:
	        features["compression_ratio"] = len(zlib.compress(password.encode())) / size
	    else:
	        features["compression_ratio"] = 1.0

	    return features

	# Extract features for all passwords and build the DataFrame (one row per password).
	df = pd.DataFrame([password_features(pw) for pw in data])

	# Add breached status and labels to the DataFrame.
	# NOTE(review): "hibp_breached" is computed directly from the label (it is 1
	# exactly when the label is 0), so the model can separate the weak class from
	# this column alone and the accuracy printed below overstates real-world
	# performance. The column is kept so the saved model's input columns do not
	# change; presumably the consumer supplies a real breach lookup at inference
	# time -- confirm, and consider sourcing this feature independently of the label.
	df["hibp_breached"] = [1 if label == 0 else 0 for label in labels]
	df["label"] = labels

	# Prepare features (X) and target (y) for training.
	X = df.drop("label", axis=1)
	y = df["label"]

	# Split into training and test sets for evaluation (80% train / 20% test).
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

	# Initialize and train the Random Forest Classifier with tuned parameters.
	model = RandomForestClassifier(
	    n_estimators=200,
	    max_depth=20,  # Limit depth to prevent overfitting
	    min_samples_split=5,  # Require at least 5 samples to split
	    random_state=42,
	    n_jobs=-1,  # Build trees on all CPU cores; the fitted forest is unchanged
	)
	model.fit(X_train, y_train)

	# Evaluate model accuracy on the held-out test set.
	y_pred = model.predict(X_test)
	accuracy = accuracy_score(y_test, y_pred)
	print(f"Model accuracy: {accuracy:.2%}")

	# Serialize trained model for use in the application.
	joblib.dump(model, "password_health_model.pkl")
	print("Model saved as 'password_health_model.pkl'")