Spaces:

InnoTrack
/

Graduation_Project-v1.2

Sleeping

App Files Files Community

Graduation_Project-v1.2 / src /recommendation_engine /validator.py

bat-6

update

6459fd1 23 days ago

Raw

History Blame Contribute Delete

3.81 kB

	import re
	from typing import List

	# Phrases that mark an item as boilerplate. is_generic() rejects any item
	# whose normalized (lowercase, punctuation-free) text contains one of these
	# as a substring.
	GENERIC_PATTERNS = [
	    "dashboard",
	    "login",
	    "signup",
	    "authentication",
	    "admin panel",
	    "analytics system",
	    "analytics platform",
	    "management system",
	    "tracking system",
	    "monitoring system",
	    "ai module",
	    "smart system",
	    "web platform",
	    "mobile app",
	    "website",
	    "reports page",
	    "user management",
	]

	# Lead-in phrases of introductory sentences rather than real items.
	# is_low_quality() rejects any item whose normalized text starts with one
	# of these.
	BAD_STARTS = [
	    "here are",
	    "below are",
	    "these are",
	    "the following",
	    "project ideas",
	    "features include",
	]

	# Filler nouns that carry little information. is_low_quality() rejects an
	# item when at least half of its words come from this list.
	LOW_VALUE_WORDS = [
	    "system",
	    "platform",
	    "application",
	    "website",
	    "solution",
	]

	def clean_text(text: str) -> str:
	    """Reduce one raw list entry to a bare item title.

	    Steps, in order: strip leading numbering and bullet glyphs, drop
	    markdown bold markers and quote characters, remove a leading
	    "assistant"/"bot" speaker label, remove parenthetical asides, keep only
	    the part before the first colon when the entry is longer than six
	    words, collapse whitespace and trim trailing punctuation.

	    Args:
	        text: Raw entry. Falsy values yield an empty string; other values
	            are converted with ``str()``.

	    Returns:
	        The cleaned text, possibly empty.
	    """
	    if not text:
	        return ""

	    text = str(text).strip()

	    # Leading enumeration such as "1.", "2)" or "3 -".
	    text = re.sub(r"^\d+[\)\.\-\s]+", "", text)

	    # Leading bullet glyphs (and any whitespace mixed in with them).
	    text = re.sub(r"^[\-\*\•\→\▪\s]+", "", text)

	    # Markdown bold markers and quote characters.
	    text = text.replace("**", "")
	    text = text.replace('"', "").replace("'", "")

	    # Speaker label. This must run before the colon truncation below,
	    # otherwise "Assistant: <long item>" would be cut down to "Assistant".
	    # A dash only counts as a separator when whitespace follows it, so
	    # titles such as "Bot-driven ..." are left alone.
	    text = re.sub(r"^(?:assistant|bot)\s*(?::|-\s)\s*", "", text, flags=re.I)

	    # Parenthetical asides, e.g. "Title (optional detail)".
	    text = re.sub(r"\(.*?\)", "", text)

	    # Long "Title: description" entries keep only the title.
	    if ":" in text and len(text.split()) > 6:
	        text = text.split(":")[0]

	    # Normalise whitespace first so trailing punctuation really is last.
	    text = re.sub(r"\s+", " ", text).strip()
	    text = re.sub(r"[.,\-:;]+$", "", text)

	    return text.strip()

	def normalize_key(text: str) -> str:
	    """Build a comparison key: lowercase, only a-z, 0-9 and single spaces.

	    Characters outside a-z, 0-9 and whitespace are deleted (not replaced by
	    a space), so "real-time" becomes "realtime". Runs of whitespace are
	    collapsed to one space and the ends are trimmed.
	    """
	    alnum_only = re.sub(r"[^a-z0-9\s]", "", text.lower())
	    return re.sub(r"\s+", " ", alnum_only).strip()

	def is_generic(text: str) -> bool:
	    """Return True when the normalized text contains any generic phrase.

	    Matching is a plain substring test of each GENERIC_PATTERNS entry
	    against the output of normalize_key().
	    """
	    normalized = normalize_key(text)
	    return any(phrase in normalized for phrase in GENERIC_PATTERNS)

	def is_low_quality(text: str) -> bool:
	    """Return True when the item looks too short, too long, or too vague.

	    An item is low quality when, after normalize_key(), it has fewer than
	    3 or more than 12 words, starts with one of BAD_STARTS, or has at least
	    half of its words in LOW_VALUE_WORDS.
	    """
	    normalized = normalize_key(text)
	    tokens = normalized.split()
	    total = len(tokens)

	    # Length window: 3 to 12 words inclusive.
	    if total < 3 or total > 12:
	        return True

	    # Introductory sentences are not items.
	    for opener in BAD_STARTS:
	        if normalized.startswith(opener):
	            return True

	    # Too much filler: half or more of the words are low-value nouns.
	    filler = len([token for token in tokens if token in LOW_VALUE_WORDS])
	    return filler >= total / 2

	def is_valid_item(text: str) -> bool:
	    """Return True for non-empty text that is neither generic nor low quality.

	    The checks short-circuit in this order: emptiness, is_generic(),
	    is_low_quality().
	    """
	    return bool(text) and not (is_generic(text) or is_low_quality(text))

	def filter_items(items: List[str]) -> List[str]:
	    """Clean, validate and de-duplicate items, preserving input order.

	    Each item goes through clean_text() and is_valid_item(). A survivor is
	    dropped when its normalize_key() was already accepted, or when it
	    shares enough words with an accepted key. The word-overlap threshold is
	    max(2, shorter word count - 1).

	    Returns:
	        The cleaned text of every accepted item, in first-seen order.
	    """
	    kept = []
	    known_keys = set()

	    for raw in items:
	        candidate = clean_text(raw)
	        if not candidate or not is_valid_item(candidate):
	            continue

	        key = normalize_key(candidate)
	        if key in known_keys:
	            continue

	        key_tokens = key.split()
	        key_vocab = set(key_tokens)

	        # Near-duplicate test against every key accepted so far.
	        is_near_duplicate = any(
	            len(key_vocab & set(prev.split()))
	            >= max(2, min(len(key_tokens), len(prev.split())) - 1)
	            for prev in known_keys
	        )
	        if is_near_duplicate:
	            continue

	        known_keys.add(key)
	        kept.append(candidate)

	    return kept

	def smart_split(text: str) -> List[str]:
	    """Split generated text into candidate list items.

	    The text is split on line breaks, then each line is split on inline
	    enumeration markers ("1.", "2)") so that "1. A 2. B" yields two items.
	    A leading bullet glyph is removed from every piece and empty pieces are
	    discarded.

	    Args:
	        text: Raw generated text. Falsy input yields an empty list.

	    Returns:
	        The non-empty pieces in their original order.
	    """
	    if not text:
	        return []

	    text = text.replace("\r", "\n")

	    lines = []

	    for line in text.split("\n"):
	        line = line.strip()
	        if not line:
	            continue

	        # An enumeration marker must sit at the start of the line or after
	        # whitespace, and must not be followed by a digit. That keeps
	        # decimals and version numbers ("2.0", "3.11") inside their item.
	        parts = re.split(r"(?:^|(?<=\s))\d+[.)](?!\d)\s*", line)

	        for part in parts:
	            # Drop one leading bullet when it is followed by whitespace or
	            # is all that remains (e.g. the "-" left from "- 1. Item").
	            part = re.sub(r"^[-*•→▪](?:\s+|$)", "", part.strip()).strip()
	            if part:
	                lines.append(part)

	    return lines

	def validate_generated_list(text: str, top_k: int = 10) -> List[str]:
	    """Turn raw generated text into at most ``top_k`` validated items.

	    The text is split with smart_split(), filtered with filter_items(), and
	    the result is sliced to its first ``top_k`` entries. Falsy text yields
	    an empty list.
	    """
	    if text:
	        return filter_items(smart_split(text))[:top_k]
	    return []