Spaces:
Sleeping
Sleeping
| import re | |
| from typing import List | |
# Boilerplate phrases. is_generic() rejects any item whose normalized text
# contains one of these as a substring.
GENERIC_PATTERNS = [
    # Screens and account plumbing
    "dashboard", "login", "signup", "authentication", "admin panel",
    # "<something> system/platform" filler
    "analytics system", "analytics platform", "management system",
    "tracking system", "monitoring system", "ai module", "smart system",
    # Delivery channel rather than an actual idea
    "web platform", "mobile app", "website",
    # Stock CRUD pages
    "reports page", "user management",
]

# Lead-in phrases. is_low_quality() rejects any item whose normalized text
# starts with one of these.
BAD_STARTS = [
    "here are", "below are", "these are",
    "the following", "project ideas", "features include",
]

# Filler nouns. is_low_quality() rejects an item when at least half of its
# words come from this list.
LOW_VALUE_WORDS = ["system", "platform", "application", "website", "solution"]
def clean_text(text: str) -> str:
    """Strip list markup and chat boilerplate from a single raw item.

    Steps, in order: leading enumerator ("1.", "2)"), leading bullets,
    Markdown bold markers, quote characters, parenthesised asides, a leading
    "assistant"/"bot" speaker label, everything after the first colon when
    the text has more than six words, runs of whitespace, and finally
    trailing punctuation.

    Args:
        text: Raw item text. Any falsy value yields an empty string; other
            values are converted with ``str()``.

    Returns:
        The cleaned text, possibly empty.
    """
    if not text:
        return ""

    text = str(text).strip()
    text = re.sub(r"^\d+[\)\.\-\s]+", "", text)
    text = re.sub(r"^[\-\*\•\→\▪\s]+", "", text)
    text = text.replace("**", "")
    text = text.replace('"', "").replace("'", "")
    text = re.sub(r"\(.*?\)", "", text)

    # The speaker label must go before the colon truncation below; otherwise
    # "Assistant: <long sentence>" is cut down to just "Assistant".
    text = re.sub(r"^(assistant|bot)\s*[:\-]\s*", "", text, flags=re.I)

    # Long "Title: description" items keep only the title part.
    if ":" in text and len(text.split()) > 6:
        text = text.split(":")[0]

    # Collapse whitespace first so that trailing punctuation left exposed by
    # a removed "(...)" aside actually sits at the end of the string.
    text = re.sub(r"\s+", " ", text).strip()
    text = re.sub(r"[.,\-:;]+$", "", text)
    return text.strip()
def normalize_key(text: str) -> str:
    """Build a comparison key: lowercase ASCII letters, digits, single spaces.

    Every character other than ``a-z``, ``0-9`` and whitespace is deleted
    (not replaced by a space), then whitespace runs are collapsed and the
    ends are trimmed.
    """
    alnum_only = re.sub(r"[^a-z0-9\s]", "", text.lower())
    # split() with no argument drops leading/trailing whitespace and treats
    # any whitespace run as one separator.
    return " ".join(alnum_only.split())
def is_generic(text: str) -> bool:
    """Return True when the normalized text contains a GENERIC_PATTERNS phrase.

    Matching is plain substring containment on the output of
    ``normalize_key``, so a phrase also matches inside a longer word.
    """
    normalized = normalize_key(text)
    return any(phrase in normalized for phrase in GENERIC_PATTERNS)
def is_low_quality(text: str) -> bool:
    """Return True when the item is too short, too long, a lead-in, or filler.

    The checks run on the normalized text:
      * fewer than 3 or more than 12 words;
      * starts with any phrase in BAD_STARTS;
      * at least half of the words are in LOW_VALUE_WORDS.
    """
    normalized = normalize_key(text)
    tokens = normalized.split()
    total = len(tokens)

    if not 3 <= total <= 12:
        return True

    # str.startswith accepts a tuple of alternatives.
    if normalized.startswith(tuple(BAD_STARTS)):
        return True

    filler = len([token for token in tokens if token in LOW_VALUE_WORDS])
    return filler >= total / 2
def is_valid_item(text: str) -> bool:
    """Return True when text is non-empty, not generic and not low quality."""
    if text and not is_generic(text):
        return not is_low_quality(text)
    return False
def filter_items(items: List[str]) -> List[str]:
    """Clean, validate and de-duplicate items, preserving first-seen order.

    An item is dropped when it cleans to an empty string, fails
    ``is_valid_item``, has the same normalized key as an already accepted
    item, or shares too many words with one. "Too many" means the number of
    distinct shared words is at least ``max(2, shorter_word_count - 1)``.

    Args:
        items: Raw item strings.

    Returns:
        The cleaned text of every accepted item.
    """
    kept = []
    accepted_keys = set()

    for raw in items:
        candidate = clean_text(raw)
        if not candidate or not is_valid_item(candidate):
            continue

        fingerprint = normalize_key(candidate)
        if fingerprint in accepted_keys:
            continue

        words = fingerprint.split()
        vocabulary = set(words)
        # Word counts include repeats; the overlap counts distinct words.
        near_duplicate = any(
            len(vocabulary & set(prior.split()))
            >= max(2, min(len(words), len(prior.split())) - 1)
            for prior in accepted_keys
        )
        if near_duplicate:
            continue

        accepted_keys.add(fingerprint)
        kept.append(candidate)

    return kept
def smart_split(text: str) -> List[str]:
    """Split free-form generated text into individual candidate items.

    The text is broken on line breaks (``\\r`` is treated as ``\\n``) and then
    on inline enumerators such as ``1.`` or ``2)``. One leading bullet
    character (``-``, ``•``, ``▪`` or ``*``) is removed from each piece.

    Args:
        text: Raw text; a falsy value yields an empty list.

    Returns:
        The non-empty, stripped pieces in their original order.
    """
    if not text:
        return []

    pieces = []
    for line in text.replace("\r", "\n").split("\n"):
        line = line.strip()
        if not line:
            continue

        # An enumerator must start the line or follow whitespace, and must not
        # be followed by a digit. Without these guards, decimals and version
        # numbers ("3.12", "v2.0") are cut in half.
        for part in re.split(r"(?:^|(?<=\s))\d+[.)](?!\d)\s*", line):
            part = part.strip()
            if not part:
                continue
            # Remove a leading bullet instead of splitting on hyphens, so
            # hyphenated words inside the item survive.
            part = re.sub(r"^[-•▪*]\s*", "", part).strip()
            if part:
                pieces.append(part)

    return pieces
def validate_generated_list(text: str, top_k: int = 10) -> List[str]:
    """Turn raw generated text into at most ``top_k`` cleaned, unique items.

    Args:
        text: Raw generated text; a falsy value yields an empty list.
        top_k: Slice bound applied to the filtered list (``result[:top_k]``).

    Returns:
        The output of ``filter_items(smart_split(text))`` cut to ``top_k``.
    """
    if text:
        return filter_items(smart_split(text))[:top_k]
    return []