Spaces:
Sleeping
Sleeping
| import logging | |
| import re | |
| from collections import Counter | |
| from typing import Dict, Any, List | |
| from functools import lru_cache | |
| from difflib import get_close_matches | |
| import pandas as pd | |
| from src.similarity_model import ( | |
| find_similar_projects, | |
| extract_features | |
| ) | |
| from src.recommendation_engine.config import ( | |
| SIMILARITY_TOP_K, | |
| MAX_FEATURES | |
| ) | |
# Module-level logger, named after the importing module.
logger = logging.getLogger(__name__)

# Domain label -> lowercase trigger keywords.
#
# Insertion order matters: detect_domains() walks this mapping in order and
# keeps only the first few hits, and "Others" is deliberately listed last.
# Keywords containing a space are matched as substrings of the normalized
# text; single-word keywords must match a whole token.
DOMAIN_KEYWORDS = {
    "AI & Machine Learning": [
        "ai", "artificial intelligence", "machine learning", "ml",
        "deep learning", "neural network", "nlp", "computer vision",
    ],
    "Business & Finance": [
        "fintech", "finance", "bank", "payment",
        "crypto", "blockchain", "business", "trading",
    ],
    "Cloud & DevOps": [
        "cloud", "devops", "aws", "azure",
        "docker", "kubernetes", "infrastructure",
    ],
    "Cybersecurity": [
        "security", "cyber", "cybersecurity", "threat",
        "attack", "malware", "hacking",
    ],
    "Education": [
        "education", "school", "learning", "edtech",
        "student", "university", "academic",
    ],
    "Healthcare": [
        "hospital", "health", "medical", "healthcare",
        "clinic", "patient", "care",
    ],
    "IoT & Embedded Systems": [
        "iot", "embedded", "hardware", "sensor",
        "arduino", "raspberry", "smart home",
    ],
    "Web & Mobile Development": [
        "web", "mobile", "app", "ios", "android",
        "frontend", "backend", "fullstack", "website",
    ],
    "Data Science & Analytics": [
        "data", "analytics", "science",
        "big data", "dashboard", "statistics",
    ],
    "E-Commerce & Marketplaces": [
        "ecommerce", "shopping", "retail",
        "store", "marketplace", "shop",
    ],
    "Smart Systems": [
        "smart system", "automation", "smart city", "smart",
    ],
    "Networking & Communication": [
        "networking", "communication", "telecom", "5g", "network",
    ],
    "Game Development": [
        "game", "gaming", "unity", "unreal", "ar", "vr",
    ],
    # Generic phrases for requests that name no particular field.
    "Others": [
        "general", "random", "anything", "any", "whatever", "surprise me",
        "mixed", "all", "open", "everything", "other",
    ],
}
def normalize(text: str) -> str:
    """
    Return *text* lowercased, stripped of punctuation, with whitespace collapsed.

    Every character other than ``a-z``, ``0-9`` and whitespace is replaced by
    a space (this includes non-ASCII letters), then runs of whitespace are
    squeezed to a single space and the ends are trimmed.

    Non-string values are converted with ``str()`` first. ``None`` yields an
    empty string rather than the literal text ``"none"``, so missing values
    do not turn into bogus tokens downstream.
    """
    if text is None:
        return ""
    lowered = str(text).lower()
    spaced = re.sub(r"[^a-z0-9\s]", " ", lowered)
    # Collapsing and trimming last also removes any leading/trailing
    # whitespace, so no separate strip is needed up front.
    return re.sub(r"\s+", " ", spaced).strip()
def clean_list(
    items: List[str],
    limit: int = 20
) -> List[str]:
    """
    Normalize, de-duplicate and truncate a list of strings.

    Each item is passed through ``normalize``; items that normalize to an
    empty string are dropped. The first occurrence of each normalized value
    is kept, in input order, and the result is sliced to ``limit`` entries.
    """
    normalized = (normalize(entry) for entry in items)
    # dict.fromkeys keeps first-seen order while discarding repeats.
    unique = dict.fromkeys(value for value in normalized if value)
    return list(unique)[:limit]
def detect_domains(text: str) -> List[str]:
    """
    Return up to three domain labels whose keywords occur in *text*.

    The text is normalized first. A keyword containing a space counts as a
    hit when it is a substring of the normalized text; a single-word keyword
    must equal one of the text's whitespace-separated tokens. Domains are
    reported in ``DOMAIN_KEYWORDS`` order. The labels pass through
    ``clean_list`` and so come back in normalized (lowercase,
    punctuation-free) form, not as the original dictionary keys.
    """
    cleaned = normalize(text)
    tokens = set(cleaned.split())

    def _hits(keyword: str) -> bool:
        # Phrases are matched as substrings, single words as whole tokens.
        if " " in keyword:
            return keyword in cleaned
        return keyword in tokens

    matched = [
        label
        for label, keywords in DOMAIN_KEYWORDS.items()
        if any(_hits(keyword) for keyword in keywords)
    ]
    return clean_list(matched, limit=3)
def extract_domain(text: str) -> str:
    """
    Resolve free-form user input to a known domain label.

    Resolution is attempted in this order, returning on the first success:

    1. ``"ai"`` / ``"ml"`` -> the literal ``"artificial intelligence"``.
    2. Exact match on a normalized ``DOMAIN_KEYWORDS`` key -> that key.
    3. Close match (cutoff 0.85) on a normalized key -> that key.
    4. Keyword detection via ``detect_domains`` -> the first detected label
       that is not the catch-all ``"others"`` (labels on this path are in
       the normalized form ``detect_domains`` returns).
    5. Close match (cutoff 0.75) on an individual keyword -> owning key.
    6. Input is a substring of some keyword -> owning key.
    7. Input contains one of the "Others" keywords -> ``"Others"``.

    Returns an empty string when the input is empty, normalizes to nothing
    (e.g. only punctuation), or matches none of the above.
    """
    if not text:
        return ""

    text = normalize(text)
    # Punctuation-only or whitespace-only input normalizes to "". It must
    # stop here: "" is a substring of every keyword, so the substring
    # fallback below would otherwise report the first domain in the table.
    if not text:
        return ""

    if text in ("ai", "ml"):
        return "artificial intelligence"

    # Map normalized domain names back to their original keys.
    normalized_domains = {normalize(name): name for name in DOMAIN_KEYWORDS}
    if text in normalized_domains:
        return normalized_domains[text]

    close_names = get_close_matches(
        text,
        list(normalized_domains),
        n=1,
        cutoff=0.85
    )
    if close_names:
        return normalized_domains[close_names[0]]

    detected = detect_domains(text)
    if detected:
        # Prefer a specific domain over the catch-all bucket; detected
        # labels are normalized, so the catch-all appears as "others".
        specific = [label for label in detected if label != "others"]
        return specific[0] if specific else detected[0]

    # Fuzzy match against individual keywords (tolerates small typos).
    keyword_owner = {}
    for domain, words in DOMAIN_KEYWORDS.items():
        for word in words:
            keyword_owner[word] = domain
    close_words = get_close_matches(
        text,
        list(keyword_owner),
        n=1,
        cutoff=0.75
    )
    if close_words:
        return keyword_owner[close_words[0]]

    # Partial input such as a keyword prefix ("kuber" -> "kubernetes").
    for domain, words in DOMAIN_KEYWORDS.items():
        if any(text in word for word in words):
            return domain

    if any(word in text for word in DOMAIN_KEYWORDS.get("Others", [])):
        return "Others"
    return ""
def cached_similarity(
    title: str,
    description: str
):
    """
    Look up projects similar to the given title and description.

    Thin wrapper around ``find_similar_projects`` that pins ``top_k`` to
    ``SIMILARITY_TOP_K`` and returns whatever that call returns.

    NOTE(review): despite the name, this function applies no caching; every
    call is forwarded. Confirm whether memoization was intended.
    """
    similar = find_similar_projects(
        title=title,
        description=description,
        top_k=SIMILARITY_TOP_K,
    )
    return similar
def extract_common_features(
    results: pd.DataFrame,
    top_n: int = 12
) -> List[str]:
    """
    Return the most frequently matched features across similarity results.

    Reads the ``matched_features`` column of *results*. Each cell is
    iterated, and for every dict entry the ``feature_b`` value is normalized
    and counted. Entries that are not dicts, and features that are missing
    or normalize to an empty string, are ignored.

    Cells that cannot be iterated (for example NaN or None placeholders for
    rows without matches) are skipped instead of raising ``TypeError``.

    Args:
        results: Similarity results. Anything that is not a DataFrame, or a
            DataFrame without a ``matched_features`` column, yields ``[]``.
        top_n: Maximum number of features to return (default 12).

    Returns:
        Normalized feature strings ordered from most to least frequent.
    """
    if not isinstance(results, pd.DataFrame):
        return []

    column = results.get("matched_features")
    if column is None:
        return []

    counter = Counter()
    for matches in column:
        try:
            entries = iter(matches)
        except TypeError:
            # NaN / None cell: this row carries no matches.
            continue
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            raw = entry.get("feature_b", "")
            if raw is None:
                continue
            feat = normalize(raw)
            if feat:
                counter[feat] += 1

    return [feat for feat, _ in counter.most_common(top_n)]
def extract_titles(
    results: pd.DataFrame
) -> List[str]:
    """
    Return up to ten distinct project titles from similarity results.

    Reads the ``project_title`` column of *results*, skips missing values
    (None, NaN, ``pd.NA``) and empty titles, and passes the remainder through
    ``clean_list``. The returned titles are therefore in normalized form and
    de-duplicated in first-seen order.

    Missing values are filtered explicitly because NaN is truthy and would
    otherwise be emitted as the title ``"nan"``.

    Returns ``[]`` when *results* is not a DataFrame or has no
    ``project_title`` column.
    """
    if not isinstance(results, pd.DataFrame):
        return []

    column = results.get("project_title")
    if column is None:
        return []

    titles = []
    for value in column:
        # is_scalar guards pd.isna, which returns an array for list-likes.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            continue
        if not value:
            continue
        titles.append(str(value).strip())

    if not titles:
        return []
    return clean_list(titles, limit=10)
def build_architecture_hints(
    domains: List[str]
) -> List[str]:
    """
    Return architecture hint phrases for the given domain labels.

    Each label is normalized before matching, so canonical keys such as
    ``"Healthcare"`` and detector output such as ``"ai machine learning"``
    or ``"cybersecurity"`` are recognized alongside the short names
    ``"artificial intelligence"``, ``"healthcare"``, ``"security"`` and
    ``"education"``. Domains without an entry contribute no hints.

    Hints are emitted in a fixed group order (AI, healthcare, security,
    education) regardless of the order of *domains*, then passed through
    ``clean_list`` (normalized, de-duplicated, at most ten).
    """
    # (accepted normalized labels, hints) in output order.
    hint_table = (
        (
            {
                "artificial intelligence",
                "ai machine learning",
                "machine learning",
                "ai",
                "ml",
            },
            [
                "AI inference pipeline",
                "Model prediction workflow",
                "Data preprocessing module",
            ],
        ),
        (
            {"healthcare"},
            [
                "Emergency handling workflow",
                "Patient monitoring logic",
                "Medical alert system",
            ],
        ),
        (
            {"security", "cybersecurity"},
            [
                "Threat detection pipeline",
                "Behavior anomaly analysis",
                "Risk monitoring engine",
            ],
        ),
        (
            {"education"},
            [
                "Adaptive learning workflow",
                "Student performance analytics",
                "Recommendation engine",
            ],
        ),
    )

    present = {normalize(label) for label in (domains or [])}

    hints = []
    for aliases, group_hints in hint_table:
        if not present.isdisjoint(aliases):
            hints.extend(group_hints)
    return clean_list(hints, limit=10)
def build_project_context(
    title: str,
    description: str,
    abstract: str = "",
    features: List[str] = None
) -> Dict[str, Any]:
    """
    Assemble the context dictionary describing a single project idea.

    The title, abstract and description are joined into one text used for
    domain detection and automatic feature extraction. The similarity search
    itself is run on the title and description only.

    Args:
        title: Project title.
        description: Project description.
        abstract: Optional abstract, included in the analysed text.
        features: Optional user-supplied features; merged ahead of the
            automatically extracted ones. ``None`` means no user features.

    Returns:
        A dict with the keys ``project_title``, ``domain``, ``domains``,
        ``features``, ``similar_titles``, ``common_features``,
        ``unique_features``, ``architecture_hints``, ``originality_score``
        and ``context_strength``. When the similarity search fails, returns
        nothing usable, or returns a frame with a ``message`` column, the
        similarity-derived fields keep their defaults (empty lists,
        originality 99.0, strength 0.0).
    """
    logger.info("Building project context")

    full_text = f"{title}. {abstract}. {description}"

    domains = detect_domains(full_text)
    main_domain = domains[0] if domains else "general"

    auto_features = extract_features(full_text)
    # Unpacking accepts any iterable (list, tuple, ...) for either source.
    user_features = clean_list(
        [*(features or []), *auto_features],
        MAX_FEATURES
    )

    try:
        results = cached_similarity(title, description)
    except Exception as e:
        # Best effort: a failed lookup degrades to the "no matches" result.
        logger.warning(
            f"Similarity failed: {e}"
        )
        results = None

    # Start from the "nothing similar found" answer and refine it below.
    context = {
        "project_title": title,
        "domain": main_domain,
        "domains": domains,
        "features": user_features,
        "similar_titles": [],
        "common_features": [],
        "unique_features": user_features,
        "architecture_hints": build_architecture_hints(domains),
        "originality_score": 99.0,
        "context_strength": 0.0
    }

    if (
        not isinstance(results, pd.DataFrame)
        or len(results) == 0
        or "message" in results.columns
    ):
        return context

    common_features = extract_common_features(results)
    known = set(common_features)

    hybrid_scores = results.get("hybrid_score", pd.Series([0]))
    context_strength = float(hybrid_scores.mean())
    if pd.isna(context_strength):
        # All scores missing: treat as no measurable similarity instead of
        # letting NaN leak into the score and the returned payload.
        context_strength = 0.0

    context["similar_titles"] = extract_titles(results)
    context["common_features"] = common_features
    context["unique_features"] = [
        feat for feat in user_features if feat not in known
    ]
    context["originality_score"] = calibrate_originality(context_strength)
    context["context_strength"] = round(context_strength, 4)
    return context
def calibrate_originality(similarity: float) -> float:
    """
    Piecewise linear calibration curve mapping database similarity to originality percentage.
    - S <= 0.45: maps linearly to O in [85.0%, 99.0%]
    - S > 0.45: maps linearly to O in [5.0%, 85.0%]

    The similarity is clamped to [0.0, 1.0] before mapping. NaN is treated as
    0.0 (no measurable similarity, hence 99.0): a plain min/max clamp would
    otherwise turn NaN into 1.0 and report the lowest originality.

    Returns the originality percentage rounded to two decimals.
    """
    import math

    s = float(similarity)
    if math.isnan(s):
        s = 0.0
    s = max(0.0, min(1.0, s))

    if s <= 0.45:
        originality = 99.0 - (s / 0.45) * 14.0
    else:
        originality = 85.0 - ((s - 0.45) / 0.55) * 80.0
    return round(originality, 2)
def build_domain_context(
    domain: str
) -> Dict[str, Any]:
    """
    Assemble the context dictionary for a domain-level request.

    The input is resolved with ``extract_domain``. If that yields nothing,
    or yields the catch-all "Others", the normalized raw input is used as a
    custom domain instead. The resulting label is used as both title and
    description for the similarity search.

    Returns:
        A dict with the keys ``domain``, ``existing_titles``,
        ``common_features``, ``architecture_hints`` and
        ``context_strength``. When the similarity search fails, returns
        nothing usable, or returns a frame with a ``message`` column, the
        lists are empty and the strength is 0.0.
    """
    extracted = extract_domain(domain)
    if extracted and extracted.lower() != "others":
        domain_clean = extracted
    else:
        logger.info(
            f"[DOMAIN INFO] Using custom dynamic domain: {domain}"
        )
        domain_clean = normalize(domain)

    logger.info(
        f"Building domain context: {domain_clean}"
    )

    try:
        results = cached_similarity(domain_clean, domain_clean)
    except Exception as e:
        # Best effort: a failed lookup degrades to the "no matches" result.
        logger.warning(
            f"Domain similarity failed: {e}"
        )
        results = None

    # Start from the "nothing found" answer and refine it below.
    context = {
        "domain": domain_clean,
        "existing_titles": [],
        "common_features": [],
        "architecture_hints": build_architecture_hints([domain_clean]),
        "context_strength": 0.0
    }

    if (
        not isinstance(results, pd.DataFrame)
        or len(results) == 0
        or "message" in results.columns
    ):
        return context

    hybrid_scores = results.get("hybrid_score", pd.Series([0]))
    strength = float(hybrid_scores.mean())

    context["existing_titles"] = extract_titles(results)
    context["common_features"] = extract_common_features(results)
    # An all-missing score column averages to NaN; report that as 0.0
    # rather than leaking NaN into the returned payload.
    context["context_strength"] = (
        0.0 if pd.isna(strength) else round(strength, 4)
    )
    return context