"""AI Text Humanizer: rewrites formal, AI-sounding text into natural prose."""
import re
import random
import nltk
from typing import List, Dict, Optional
import numpy as np

# Ensure the NLTK resources used below are present, downloading each one
# only on first run (nltk.data.find raises LookupError when missing).
for _resource, _path in [
    ('punkt', 'tokenizers/punkt'),
    ('wordnet', 'corpora/wordnet'),
    ('omw-1.4', 'corpora/omw-1.4'),
]:
    try:
        nltk.data.find(_path)
    except LookupError:
        nltk.download(_resource)

from nltk.tokenize import sent_tokenize, word_tokenize
# Production-grade imports with proper error handling and retries
def safe_import_with_retry(module_name, component=None, max_retries=3):
    """Import a module (or one attribute of it), retrying on ImportError.

    Args:
        module_name: Dotted module path to import.
        component: Optional attribute name to pull out of the module.
        max_retries: Total number of import attempts before giving up.

    Returns:
        Tuple of (imported object or None, success flag).
    """
    import time  # hoisted out of the loop: only needed for the retry back-off

    for attempt in range(max_retries):
        try:
            if component:
                module = __import__(module_name, fromlist=[component])
                return getattr(module, component), True
            return __import__(module_name), True
        except ImportError as e:
            if attempt < max_retries - 1:
                print(f"⚠️ Import attempt {attempt + 1} failed for {module_name}: {e}")
                print("🔄 Retrying in 2 seconds...")
                time.sleep(2)
            else:
                print(f"❌ Final import failed for {module_name}: {e}")
                return None, False
        except Exception as e:
            # Non-ImportError failures (e.g. a missing attribute raising
            # AttributeError) are not retryable - fail immediately.
            print(f"❌ Unexpected error importing {module_name}: {e}")
            return None, False
    return None, False
# Advanced model imports with retries
print("🚀 Loading AI Text Humanizer - Production Version...")
print("=" * 50)

print("📥 Loading sentence transformers...")
SentenceTransformer, SENTENCE_TRANSFORMERS_AVAILABLE = safe_import_with_retry(
    'sentence_transformers', 'SentenceTransformer'
)

print("📥 Loading transformers pipeline...")
pipeline, TRANSFORMERS_AVAILABLE = safe_import_with_retry('transformers', 'pipeline')

print("📥 Loading scikit-learn...")
try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity as sklearn_cosine_similarity
except ImportError as e:
    print(f"⚠️ Scikit-learn not available: {e}")
    SKLEARN_AVAILABLE = False
else:
    SKLEARN_AVAILABLE = True
    print("✅ Scikit-learn loaded successfully")

# Additional production imports
try:
    import torch
except ImportError:
    TORCH_AVAILABLE = False
    print("⚠️ PyTorch not available")
else:
    TORCH_AVAILABLE = True
    print(f"✅ PyTorch loaded - CUDA available: {torch.cuda.is_available()}")
class ProductionAITextHumanizer:
    """Rewrites AI-sounding text into more natural, human-style prose.

    Combines rule-based transforms (formal-word mappings, contractions,
    AI-transition replacement) with optional model-based paraphrasing and
    semantic similarity scoring, degrading gracefully when the heavyweight
    dependencies (sentence-transformers, transformers, sklearn, torch) are
    unavailable at import time.
    """

    def __init__(self, enable_gpu=True, model_cache_dir=None):
        """Initialize production-grade text humanizer with all advanced features.

        Args:
            enable_gpu: Use CUDA when PyTorch is installed and reports a GPU.
            model_cache_dir: Optional directory for downloaded model weights.
        """
        print("🚀 Initializing Production AI Text Humanizer...")
        self.enable_gpu = enable_gpu and TORCH_AVAILABLE
        self.model_cache_dir = model_cache_dir
        # Initialize advanced models with detailed error handling
        self._load_similarity_model()
        self._load_paraphrasing_model()
        self._initialize_fallback_methods()
        self._setup_word_mappings()
        print("✅ Production AI Text Humanizer initialized!")
        self._print_feature_status()

    def _load_similarity_model(self):
        """Load the sentence-transformer similarity model, or leave it None."""
        self.similarity_model = None
        if SENTENCE_TRANSFORMERS_AVAILABLE and SentenceTransformer:
            try:
                print("🔄 Loading sentence transformer model...")
                # Device selection only when torch is importable; otherwise let
                # the library pick its own default.
                model_kwargs = {
                    'device': 'cuda' if self.enable_gpu and torch.cuda.is_available() else 'cpu'
                } if TORCH_AVAILABLE else {}
                if self.model_cache_dir:
                    model_kwargs['cache_folder'] = self.model_cache_dir
                self.similarity_model = SentenceTransformer(
                    'all-MiniLM-L6-v2',
                    **model_kwargs
                )
                # Smoke-test the model so failures surface here, not mid-request
                test_embedding = self.similarity_model.encode(["test sentence"])
                print("✅ Sentence transformer model loaded and tested successfully!")
            except Exception as e:
                print(f"❌ Failed to load sentence transformer: {e}")
                print("💡 Troubleshooting tips:")
                print("   - Check internet connection for model download")
                print("   - Verify sentence-transformers version: pip install sentence-transformers==2.2.2")
                print("   - Check CUDA compatibility if using GPU")
                self.similarity_model = None
        else:
            print("❌ Sentence transformers not available")

    def _load_paraphrasing_model(self):
        """Load the text2text paraphrasing pipeline, or leave it None."""
        self.paraphraser = None
        if TRANSFORMERS_AVAILABLE and pipeline:
            try:
                print("🔄 Loading paraphrasing model...")
                # transformers convention: device 0 = first CUDA device, -1 = CPU
                device = 0 if self.enable_gpu and TORCH_AVAILABLE and torch.cuda.is_available() else -1
                self.paraphraser = pipeline(
                    "text2text-generation",
                    model="google/flan-t5-small",
                    device=device,
                    max_length=512,
                    model_kwargs={"cache_dir": self.model_cache_dir} if self.model_cache_dir else {}
                )
                # Smoke-test the pipeline once at startup
                test_result = self.paraphraser("Test sentence for paraphrasing.", max_length=50)
                print("✅ Paraphrasing model loaded and tested successfully!")
            except Exception as e:
                print(f"❌ Failed to load paraphrasing model: {e}")
                print("💡 Troubleshooting tips:")
                print("   - Check internet connection for model download")
                print("   - Verify transformers version: pip install transformers==4.35.0")
                print("   - Check available memory (models need ~2GB RAM)")
                self.paraphraser = None
        else:
            print("❌ Transformers not available")

    def _initialize_fallback_methods(self):
        """Initialize the TF-IDF fallback used when no embedding model loads."""
        self.tfidf_vectorizer = None
        if SKLEARN_AVAILABLE:
            try:
                self.tfidf_vectorizer = TfidfVectorizer(
                    stop_words='english',
                    ngram_range=(1, 2),
                    max_features=5000
                )
                print("✅ TF-IDF fallback similarity initialized")
            except Exception as e:
                print(f"⚠️ TF-IDF initialization failed: {e}")

    def _setup_word_mappings(self):
        """Setup comprehensive word mappings for production."""
        # Extended formal to casual mappings for production
        self.formal_to_casual = {
            # Basic formal words
            "utilize": "use", "demonstrate": "show", "facilitate": "help",
            "implement": "do", "consequently": "so", "therefore": "so",
            "nevertheless": "but", "furthermore": "also", "moreover": "also",
            "subsequently": "then", "accordingly": "so", "regarding": "about",
            "concerning": "about", "pertaining": "about", "approximately": "about",
            "endeavor": "try", "commence": "start", "terminate": "end",
            "obtain": "get", "purchase": "buy", "examine": "look at",
            "analyze": "study", "construct": "build", "establish": "set up",
            # Advanced formal words
            "magnitude": "size", "comprehensive": "complete", "significant": "big",
            "substantial": "large", "optimal": "best", "sufficient": "enough",
            "adequate": "good enough", "exceptional": "amazing", "remarkable": "great",
            "outstanding": "excellent", "predominant": "main", "fundamental": "basic",
            "essential": "needed", "crucial": "important", "vital": "key",
            "paramount": "most important", "imperative": "must", "mandatory": "required",
            # Formal phrases
            "prior to": "before", "in order to": "to", "due to the fact that": "because",
            "at this point in time": "now", "in the event that": "if",
            "it is important to note": "note that", "it should be emphasized": "remember",
            "it is worth mentioning": "by the way", "it is crucial to understand": "importantly",
            "for the purpose of": "to", "with regard to": "about",
            "in accordance with": "following", "as a result of": "because of",
            "in spite of the fact that": "although", "on the other hand": "however",
            # Academic/business terms
            "methodology": "method", "systematically": "step by step",
            "optimization": "improvement", "enhancement": "upgrade",
            "implementation": "setup", "utilization": "use", "evaluation": "review",
            "assessment": "check", "validation": "proof", "verification": "confirmation",
            "consolidation": "combining", "integration": "bringing together",
            "transformation": "change", "modification": "change", "alteration": "change"
        }
        # Extended contractions
        self.contractions = {
            "do not": "don't", "does not": "doesn't", "did not": "didn't",
            "will not": "won't", "would not": "wouldn't", "should not": "shouldn't",
            "could not": "couldn't", "cannot": "can't", "is not": "isn't",
            "are not": "aren't", "was not": "wasn't", "were not": "weren't",
            "have not": "haven't", "has not": "hasn't", "had not": "hadn't",
            "I am": "I'm", "you are": "you're", "he is": "he's", "she is": "she's",
            "it is": "it's", "we are": "we're", "they are": "they're",
            "I have": "I've", "you have": "you've", "we have": "we've",
            "they have": "they've", "I will": "I'll", "you will": "you'll",
            "he will": "he'll", "she will": "she'll", "it will": "it'll",
            "we will": "we'll", "they will": "they'll", "would have": "would've",
            "should have": "should've", "could have": "could've", "might have": "might've"
        }
        # AI-like transitions (expanded)
        self.ai_transition_words = [
            "Furthermore,", "Moreover,", "Additionally,", "Subsequently,",
            "Consequently,", "Therefore,", "Nevertheless,", "However,",
            "In conclusion,", "To summarize,", "In summary,", "Overall,",
            "It is important to note that", "It should be emphasized that",
            "It is worth mentioning that", "It is crucial to understand that",
            "It is essential to recognize that", "It must be acknowledged that",
            "It should be noted that", "It is imperative to understand",
            "From a practical standpoint,", "From an analytical perspective,",
            "In terms of implementation,", "With respect to the aforementioned,",
            "As previously mentioned,", "As stated earlier,", "In light of this,"
        ]
        # Natural alternatives (expanded)
        self.natural_transitions = [
            "Also,", "Plus,", "And,", "Then,", "So,", "But,", "Still,",
            "Anyway,", "By the way,", "Actually,", "Basically,", "Look,",
            "Listen,", "Here's the thing:", "The point is,", "What's more,",
            "On top of that,", "Another thing,", "Now,", "Well,", "You know,",
            "I mean,", "Honestly,", "Frankly,", "Simply put,", "In other words,",
            "To put it differently,", "Let me explain,", "Here's what I mean:",
            "Think about it,", "Consider this,", "Get this,", "Check this out,"
        ]

    def _print_feature_status(self):
        """Print detailed feature status for production monitoring."""
        print("\n📊 PRODUCTION FEATURE STATUS:")
        print("-" * 40)
        print(f"🤖 Advanced Similarity: {'✅ ENABLED' if self.similarity_model else '❌ DISABLED'}")
        print(f"🧠 AI Paraphrasing: {'✅ ENABLED' if self.paraphraser else '❌ DISABLED'}")
        print(f"📈 TF-IDF Fallback: {'✅ ENABLED' if self.tfidf_vectorizer else '❌ DISABLED'}")
        print(f"🚀 GPU Acceleration: {'✅ ENABLED' if self.enable_gpu and TORCH_AVAILABLE else '❌ DISABLED'}")
        print(f"⚡ Word Mappings: ✅ ENABLED ({len(self.formal_to_casual)} mappings)")
        print(f"📝 Contractions: ✅ ENABLED ({len(self.contractions)} contractions)")
        if TORCH_AVAILABLE:
            # torch was already imported at module level when TORCH_AVAILABLE is True
            print(f"🖥️ Device: {'CUDA' if torch.cuda.is_available() and self.enable_gpu else 'CPU'}")
        # Calculate feature completeness
        total_features = 6
        enabled_features = sum([
            bool(self.similarity_model),
            bool(self.paraphraser),
            bool(self.tfidf_vectorizer),
            True,  # Word mappings always available
            True,  # Contractions always available
            TORCH_AVAILABLE
        ])
        completeness = (enabled_features / total_features) * 100
        print(f"🎯 Feature Completeness: {completeness:.1f}%")
        if completeness < 70:
            print("⚠️ WARNING: Less than 70% features enabled - not production ready")
        elif completeness < 90:
            print("⚠️ CAUTION: Some advanced features missing")
        else:
            print("🎉 PRODUCTION READY: All critical features enabled!")

    def add_contractions(self, text: str) -> str:
        """Contract formal two-word forms ("do not" -> "don't") in *text*."""
        # Sort by length (longest first) so e.g. "could not" wins over "not"
        sorted_contractions = sorted(self.contractions.items(), key=lambda x: len(x[0]), reverse=True)
        for formal, casual in sorted_contractions:
            # Word boundaries avoid partial matches inside larger words
            pattern = r'\b' + re.escape(formal) + r'\b'
            text = re.sub(pattern, casual, text, flags=re.IGNORECASE)
        return text

    def replace_formal_words(self, text: str, replacement_rate: float = 0.8) -> str:
        """Probabilistically swap formal words/phrases for casual alternatives.

        Args:
            text: Input text.
            replacement_rate: Per-candidate probability of replacement.

        Returns:
            The transformed text (tokenized and re-joined, so spacing around
            punctuation may be normalized).
        """
        # Handle phrases first (longer matches), then single words
        phrase_replacements = {k: v for k, v in self.formal_to_casual.items() if len(k.split()) > 1}
        word_replacements = {k: v for k, v in self.formal_to_casual.items() if len(k.split()) == 1}
        # Replace phrases first
        for formal_phrase, casual_phrase in phrase_replacements.items():
            if random.random() < replacement_rate:
                pattern = r'\b' + re.escape(formal_phrase) + r'\b'
                text = re.sub(pattern, casual_phrase, text, flags=re.IGNORECASE)
        # Then replace individual words
        words = word_tokenize(text)
        for i, word in enumerate(words):
            word_clean = word.lower().strip('.,!?;:"')
            if word_clean in word_replacements and random.random() < replacement_rate:
                replacement = word_replacements[word_clean]
                # Preserve case. BUGFIX: the previous str.replace(word_clean, ...)
                # never matched upper/title-cased tokens because word_clean is
                # lower-cased; match case-insensitively instead so attached
                # punctuation is kept and the core token is swapped.
                if word.isupper():
                    cased = replacement.upper()
                elif word.istitle():
                    cased = replacement.capitalize()
                else:
                    cased = replacement
                words[i] = re.sub(re.escape(word_clean), cased, word,
                                  count=1, flags=re.IGNORECASE)
        # Reconstruct with proper spacing (no space before closing punctuation)
        result = ""
        for i, word in enumerate(words):
            if i > 0 and word not in ".,!?;:\"')":
                result += " "
            result += word
        return result

    def replace_ai_transitions(self, text: str) -> str:
        """Replace stiff AI-style transition phrases with natural alternatives.

        A leading transition is always replaced; mid-text occurrences are
        replaced with 70% probability to avoid over-editing.
        """
        # Sort by length to handle longer phrases first
        sorted_transitions = sorted(self.ai_transition_words, key=len, reverse=True)
        for ai_transition in sorted_transitions:
            if ai_transition in text:
                natural_replacement = random.choice(self.natural_transitions)
                if text.startswith(ai_transition):
                    # Beginning of text: always replace the first occurrence
                    text = text.replace(ai_transition, natural_replacement, 1)
                else:
                    # Middle of text - be more selective
                    if random.random() < 0.7:  # 70% chance to replace
                        text = text.replace(ai_transition, natural_replacement, 1)
        return text

    def advanced_paraphrasing(self, text: str, paraphrase_rate: float = 0.4) -> str:
        """Paraphrase long sentences via the text2text model, with quality gates.

        Sentences with <= 10 words are left untouched; a candidate paraphrase
        is rejected if empty, too short, more than twice the original length,
        or a model refusal. Returns *text* unchanged when no model is loaded.
        """
        if not self.paraphraser:
            return text
        sentences = sent_tokenize(text)
        paraphrased_sentences = []
        for sentence in sentences:
            # Only paraphrase longer, more complex sentences
            if len(sentence.split()) > 10 and random.random() < paraphrase_rate:
                try:
                    # Multiple paraphrasing strategies, picked at random
                    prompts = [
                        f"Rewrite this more naturally: {sentence}",
                        f"Make this sound more conversational: {sentence}",
                        f"Rephrase this in simpler terms: {sentence}",
                        f"Say this in a more casual way: {sentence}"
                    ]
                    prompt = random.choice(prompts)
                    result = self.paraphraser(
                        prompt,
                        max_length=len(sentence) + 50,
                        min_length=max(10, len(sentence) // 2),
                        num_return_sequences=1,
                        temperature=0.7,
                        do_sample=True
                    )
                    paraphrased = result[0]['generated_text']
                    paraphrased = paraphrased.replace(prompt, '').strip().strip('"\'')
                    # Quality checks
                    if (paraphrased and
                        len(paraphrased) > 5 and
                        len(paraphrased) < len(sentence) * 2 and
                        not paraphrased.lower().startswith(('i cannot', 'i can\'t', 'sorry'))):
                        paraphrased_sentences.append(paraphrased)
                    else:
                        paraphrased_sentences.append(sentence)
                except Exception as e:
                    print(f"⚠️ Paraphrasing failed: {e}")
                    paraphrased_sentences.append(sentence)
            else:
                paraphrased_sentences.append(sentence)
        return ' '.join(paraphrased_sentences)

    def calculate_similarity_advanced(self, text1: str, text2: str) -> float:
        """Semantic similarity in [0, 1]-ish: embeddings, then TF-IDF, then Jaccard."""
        if self.similarity_model:
            try:
                embeddings1 = self.similarity_model.encode([text1])
                embeddings2 = self.similarity_model.encode([text2])
                # BUGFIX: guard against zero-norm embeddings before dividing
                norm_product = np.linalg.norm(embeddings1[0]) * np.linalg.norm(embeddings2[0])
                if norm_product == 0:
                    return 0.0
                similarity = np.dot(embeddings1[0], embeddings2[0]) / norm_product
                return float(similarity)
            except Exception as e:
                print(f"⚠️ Advanced similarity calculation failed: {e}")
        # Fallback to TF-IDF
        if self.tfidf_vectorizer and SKLEARN_AVAILABLE:
            try:
                tfidf_matrix = self.tfidf_vectorizer.fit_transform([text1, text2])
                similarity = sklearn_cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
                return float(similarity)
            except Exception as e:
                print(f"⚠️ TF-IDF similarity calculation failed: {e}")
        # Basic fallback: Jaccard overlap of token sets
        words1 = set(word_tokenize(text1.lower()))
        words2 = set(word_tokenize(text2.lower()))
        if not words1 or not words2:
            return 1.0 if text1 == text2 else 0.0
        intersection = words1.intersection(words2)
        union = words1.union(words2)
        return len(intersection) / len(union) if union else 1.0

    def humanize_text_production(self,
                                 text: str,
                                 style: str = "natural",
                                 intensity: float = 0.8,
                                 preserve_length: bool = True,
                                 quality_threshold: float = 0.75) -> Dict:
        """
        Production-grade text humanization with comprehensive quality control.

        Args:
            text: Input text to humanize
            style: Style ('natural', 'casual', 'conversational')
            intensity: Transformation intensity (0.0 to 1.0); higher values
                enable more aggressive steps (transitions > 0.2, word
                replacement > 0.3, contractions > 0.4, paraphrasing > 0.6)
            preserve_length: Penalize length drift in the quality score
            quality_threshold: Minimum similarity score to accept; below it
                the original text is returned unchanged

        Returns:
            Dict with original/humanized text, similarity and quality scores,
            change log, timing, and feature-usage flags.
        """
        import time

        if not text.strip():
            # Nothing to do for empty/whitespace input
            return {
                "original_text": text,
                "humanized_text": text,
                "similarity_score": 1.0,
                "changes_made": [],
                "style": style,
                "intensity": intensity,
                "quality_score": 1.0,
                "processing_time_ms": 0.0,
                "feature_usage": {}
            }
        start_time = time.time()
        changes_made = []
        humanized_text = text
        original_text = text
        feature_usage = {}
        # Step 1: AI transition replacement (early to catch obvious AI patterns)
        if intensity > 0.2:
            before = humanized_text
            humanized_text = self.replace_ai_transitions(humanized_text)
            if humanized_text != before:
                changes_made.append("Replaced AI-like transition phrases")
                feature_usage['ai_transitions'] = True
        # Step 2: Formal word replacement
        if intensity > 0.3:
            before = humanized_text
            humanized_text = self.replace_formal_words(humanized_text, intensity * 0.9)
            if humanized_text != before:
                changes_made.append("Replaced formal words with casual alternatives")
                feature_usage['word_replacement'] = True
        # Step 3: Add contractions
        if intensity > 0.4:
            before = humanized_text
            humanized_text = self.add_contractions(humanized_text)
            if humanized_text != before:
                changes_made.append("Added natural contractions")
                feature_usage['contractions'] = True
        # Step 4: Advanced paraphrasing (if available)
        if intensity > 0.6 and self.paraphraser:
            before = humanized_text
            humanized_text = self.advanced_paraphrasing(humanized_text, intensity * 0.5)
            if humanized_text != before:
                changes_made.append("Applied AI paraphrasing for natural flow")
                feature_usage['paraphrasing'] = True
        # Step 5: Calculate quality metrics
        processing_time = (time.time() - start_time) * 1000
        similarity_score = self.calculate_similarity_advanced(original_text, humanized_text)
        # Quality control - revert if similarity too low
        if similarity_score < quality_threshold:
            print(f"⚠️ Quality check failed (similarity: {similarity_score:.3f})")
            humanized_text = original_text
            similarity_score = 1.0
            changes_made = ["Quality threshold not met - reverted to original"]
            feature_usage['quality_control'] = True
        # Composite quality score: similarity 50%, length stability 30%,
        # amount of meaningful change 20%
        length_ratio = len(humanized_text) / len(original_text) if original_text else 1.0
        length_penalty = max(0, 1.0 - abs(length_ratio - 1.0)) if preserve_length else 1.0
        change_score = min(1.0, len(changes_made) / 5.0)  # Reward meaningful changes
        quality_score = (similarity_score * 0.5) + (length_penalty * 0.3) + (change_score * 0.2)
        return {
            "original_text": original_text,
            "humanized_text": humanized_text,
            "similarity_score": similarity_score,
            "quality_score": quality_score,
            "changes_made": changes_made,
            "style": style,
            "intensity": intensity,
            "processing_time_ms": processing_time,
            "feature_usage": feature_usage,
            "length_change": len(humanized_text) - len(original_text),
            "word_count_change": len(humanized_text.split()) - len(original_text.split())
        }
# Convenience function for backward compatibility
def AITextHumanizer():
    """Backward-compatible factory: returns a ProductionAITextHumanizer.

    Kept so legacy call sites that instantiated ``AITextHumanizer()`` keep
    working unchanged.
    """
    instance = ProductionAITextHumanizer()
    return instance
# Test the production version
if __name__ == "__main__":
    demo_humanizer = ProductionAITextHumanizer()
    sample_inputs = [
        "Furthermore, it is important to note that artificial intelligence systems demonstrate significant capabilities.",
        "The implementation of comprehensive methodologies will facilitate optimization and enhance operational efficiency.",
        "Subsequently, organizations must utilize systematic approaches to evaluate and implement technological solutions."
    ]
    print("\n🧪 TESTING PRODUCTION HUMANIZER")
    print("=" * 40)
    for idx, sample in enumerate(sample_inputs, 1):
        print(f"\n📝 Test {idx}:")
        print(f"Original: {sample}")
        outcome = demo_humanizer.humanize_text_production(
            text=sample,
            style="conversational",
            intensity=0.8
        )
        print(f"Humanized: {outcome['humanized_text']}")
        print(f"Quality Score: {outcome['quality_score']:.3f}")
        print(f"Similarity: {outcome['similarity_score']:.3f}")
        print(f"Processing: {outcome['processing_time_ms']:.1f}ms")
        applied = ', '.join(outcome['changes_made']) if outcome['changes_made'] else 'None'
        print(f"Changes: {applied}")
    print("\n🎉 Production testing completed!")