Spaces:
Running
Running
| import re | |
| import random | |
| import nltk | |
| from typing import List, Dict, Optional | |
| from sentence_transformers import SentenceTransformer | |
| import numpy as np | |
| from transformers import pipeline | |
# Download required NLTK data
# Each pair is (path probed in the local NLTK data directories, package id
# to download when that probe fails). The order matches the order in which
# the resources are needed by the imports and tokenizers used below.
for _probe_path, _package_id in (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/wordnet', 'wordnet'),
    ('corpora/omw-1.4', 'omw-1.4'),
):
    try:
        nltk.data.find(_probe_path)
    except LookupError:
        # Resource is not installed locally yet: fetch it once.
        nltk.download(_package_id)
# Do not leave the loop variables behind in the module namespace.
del _probe_path, _package_id
| from nltk.tokenize import sent_tokenize, word_tokenize | |
| from nltk.corpus import wordnet | |
class AITextHumanizer:
    """Rule-based rewriter that makes formal, AI-sounding text read more casually.

    The pipeline combines dictionary substitutions (formal words, contractions,
    stock transition phrases) with light sentence restructuring and, when the
    optional transformer models could be loaded, model-based paraphrasing and a
    semantic similarity score between the input and the rewritten text.
    """

    def __init__(self):
        """Initialize the text humanizer with necessary models and data.

        Model loading is best-effort: when a model cannot be loaded a warning
        is printed, the corresponding attribute is set to ``None`` and the
        features that depend on it are skipped (paraphrasing) or fall back to
        a default value (similarity).
        """
        print("Loading models...")

        # Load sentence transformer for semantic similarity
        try:
            self.similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
        except Exception as e:
            print(f"Warning: Could not load similarity model: {e}")
            self.similarity_model = None

        # Initialize paraphrasing pipeline
        try:
            self.paraphraser = pipeline("text2text-generation",
                                        model="google/flan-t5-small",
                                        max_length=512)
        except Exception as e:
            print(f"Warning: Could not load paraphrasing model: {e}")
            self.paraphraser = None

        # Formal to casual word mappings (keys may be multi-word phrases)
        self.formal_to_casual = {
            "utilize": "use",
            "demonstrate": "show",
            "facilitate": "help",
            "implement": "do",
            "consequently": "so",
            "therefore": "so",
            "nevertheless": "but",
            "furthermore": "also",
            "moreover": "also",
            "subsequently": "then",
            "accordingly": "so",
            "regarding": "about",
            "concerning": "about",
            "pertaining": "about",
            "approximately": "about",
            "endeavor": "try",
            "commence": "start",
            "terminate": "end",
            "obtain": "get",
            "purchase": "buy",
            "examine": "look at",
            "analyze": "study",
            "construct": "build",
            "establish": "set up",
            "magnitude": "size",
            "comprehensive": "complete",
            "significant": "big",
            "substantial": "large",
            "optimal": "best",
            "sufficient": "enough",
            "prior to": "before",
            "in order to": "to",
            "due to the fact that": "because",
            "at this point in time": "now",
            "in the event that": "if",
        }

        # Contractions mapping
        self.contractions = {
            "do not": "don't",
            "does not": "doesn't",
            "did not": "didn't",
            "will not": "won't",
            "would not": "wouldn't",
            "should not": "shouldn't",
            "could not": "couldn't",
            "cannot": "can't",
            "is not": "isn't",
            "are not": "aren't",
            "was not": "wasn't",
            "were not": "weren't",
            "have not": "haven't",
            "has not": "hasn't",
            "had not": "hadn't",
            "I am": "I'm",
            "you are": "you're",
            "he is": "he's",
            "she is": "she's",
            "it is": "it's",
            "we are": "we're",
            "they are": "they're",
            "I have": "I've",
            "you have": "you've",
            "we have": "we've",
            "they have": "they've",
            "I will": "I'll",
            "you will": "you'll",
            "he will": "he'll",
            "she will": "she'll",
            "it will": "it'll",
            "we will": "we'll",
            "they will": "they'll",
        }

        # Transition words that make text sound more AI-like
        self.ai_transition_words = [
            "Furthermore,", "Moreover,", "Additionally,", "Subsequently,",
            "Consequently,", "Therefore,", "Nevertheless,", "However,",
            "In conclusion,", "To summarize,", "In summary,", "Overall,",
            "It is important to note that", "It should be emphasized that",
            "It is worth mentioning that", "It is crucial to understand that"
        ]

        # Natural alternatives
        self.natural_transitions = [
            "Also,", "Plus,", "And,", "Then,", "So,", "But,", "Still,",
            "Anyway,", "By the way,", "Actually,", "Basically,",
            "Look,", "Listen,", "Here's the thing:", "The point is,",
            "What's more,", "On top of that,", "Another thing,",
        ]

        print("Humanizer initialized successfully!")

    @staticmethod
    def _phrase_pattern(phrases):
        """Compile a case-insensitive, whole-word matcher for *phrases*.

        Words inside a phrase may be separated by any run of whitespace, so a
        phrase that was hard-wrapped across two lines is still found.
        """
        # Longest first, so a long phrase wins over a shorter phrase it contains.
        ordered = sorted((p for p in phrases if p.split()), key=len, reverse=True)
        alternatives = "|".join(
            r"\s+".join(re.escape(word) for word in phrase.split())
            for phrase in ordered
        )
        return re.compile(rf"\b(?:{alternatives})\b", re.IGNORECASE)

    @staticmethod
    def _match_case(matched: str, replacement: str) -> str:
        """Return *replacement* re-cased to follow the casing of *matched*."""
        if len(matched) > 1 and matched.isupper():
            return replacement.upper()
        if matched[:1].isupper():
            # Capitalize only the first letter ("Examine" -> "Look at").
            return replacement[:1].upper() + replacement[1:]
        return replacement

    @staticmethod
    def _lower_sentence_start(sentence: str) -> str:
        """Lower-case the first letter of *sentence* unless it must stay upper.

        The pronoun "I" (including "I'm", "I've", ...) and words that begin
        with two capitals (acronyms such as "AI") are left untouched.
        """
        pieces = sentence.split(None, 1)
        if not pieces:
            return sentence
        bare = pieces[0].strip(".,!?;:\"()")
        if bare == "I" or (len(bare) > 1 and bare[:2].isupper()):
            return sentence
        return sentence[:1].lower() + sentence[1:]

    @staticmethod
    def _split_near_middle(words):
        """Split a word list into two sentences at a break near the middle.

        A break is either a word that ends with a comma (cut after it) or one
        of the conjunctions and/but/or/so (cut before it, so the second
        sentence starts with the conjunction). Returns ``(first, second)`` or
        ``None`` when no break exists within three words of the middle.
        """
        conjunctions = ('and', 'but', 'or', 'so')
        middle = len(words) // 2
        # Clamp so that both halves are guaranteed to be non-empty.
        for i in range(max(middle - 3, 1), min(middle + 3, len(words) - 1)):
            word = words[i]
            if word in conjunctions:
                cut = i
            elif word.endswith(','):
                cut = i + 1
            else:
                continue
            first = ' '.join(words[:cut]).rstrip(',;:')
            if first[-1:] not in '.!?':
                first += '.'
            second = ' '.join(words[cut:])
            second = second[:1].upper() + second[1:]
            return first, second
        return None

    def add_contractions(self, text: str) -> str:
        """Add contractions to make text sound more natural.

        Only whole words are contracted (so "the issue" is not mistaken for
        "he is") and the casing of the matched text is carried over to the
        contraction ("It is" -> "It's").

        Args:
            text: Text to rewrite.

        Returns:
            The text with every phrase from ``self.contractions`` contracted.
        """
        lookup = {formal.lower(): short for formal, short in self.contractions.items()}
        if not lookup:
            return text
        pattern = self._phrase_pattern(lookup)

        def contract(match):
            found = match.group(0)
            key = ' '.join(found.lower().split())
            return self._match_case(found, lookup[key])

        return pattern.sub(contract, text)

    def replace_formal_words(self, text: str, replacement_rate: float = 0.7) -> str:
        """Replace formal words with casual alternatives.

        The substitution works directly on the text instead of on a token
        list, so multi-word entries such as "in order to" are matched and the
        original spacing, punctuation and existing contractions are kept.

        Args:
            text: Text to rewrite.
            replacement_rate: Probability (0.0 to 1.0) that any single
                occurrence of a formal word is replaced.

        Returns:
            The rewritten text.
        """
        lookup = {formal.lower(): casual for formal, casual in self.formal_to_casual.items()}
        if not lookup:
            return text
        pattern = self._phrase_pattern(lookup)

        def casualise(match):
            found = match.group(0)
            # One independent draw per occurrence.
            if random.random() >= replacement_rate:
                return found
            key = ' '.join(found.lower().split())
            return self._match_case(found, lookup[key])

        return pattern.sub(casualise, text)

    def vary_sentence_structure(self, text: str) -> str:
        """Vary sentence structure to sound more natural.

        About 30% of the sentences get a conversational opener and about 40%
        of the sentences longer than 20 words are split in two at a comma or
        conjunction near their middle.

        Args:
            text: Text to rewrite.

        Returns:
            The sentences joined with single spaces.
        """
        connectors = ["Well,", "So,", "Now,", "Look,", "Actually,", "Basically,"]
        varied_sentences = []

        for sentence in sent_tokenize(text):
            # Sometimes start sentences with connecting words
            if random.random() < 0.3 and not sentence.startswith(tuple(connectors)):
                # Only the first letter is lowered: lowering the whole
                # sentence would also destroy acronyms and proper nouns.
                sentence = random.choice(connectors) + " " + self._lower_sentence_start(sentence)

            # Occasionally break long sentences
            words = sentence.split()
            if len(words) > 20 and random.random() < 0.4:
                halves = self._split_near_middle(words)
                if halves is not None:
                    varied_sentences.append(halves[0])
                    sentence = halves[1]

            varied_sentences.append(sentence)

        return ' '.join(varied_sentences)

    def replace_ai_transitions(self, text: str) -> str:
        """Replace AI-like transition words with natural alternatives.

        Matching is case-sensitive, and each occurrence gets its own randomly
        chosen replacement from ``self.natural_transitions``.

        Args:
            text: Text to rewrite.

        Returns:
            The rewritten text (unchanged when no alternatives are configured).
        """
        if not self.natural_transitions:
            return text

        def pick_alternative(_match):
            return random.choice(self.natural_transitions)

        for ai_phrase in self.ai_transition_words:
            if ai_phrase in text:
                text = re.sub(re.escape(ai_phrase), pick_alternative, text)
        return text

    def add_natural_imperfections(self, text: str, imperfection_rate: float = 0.1) -> str:
        """Add subtle imperfections to make text more human-like.

        Args:
            text: Text to rewrite.
            imperfection_rate: Probability applied independently to each kind
                of imperfection for every sentence.

        Returns:
            The sentences joined with single spaces.
        """
        modified_sentences = []

        for sentence in sent_tokenize(text):
            # Occasionally lower-case a sentence-initial conjunction (casual style)
            if random.random() < imperfection_rate:
                words = sentence.split()
                if len(words) > 1 and words[0].lower() in ('and', 'but', 'or', 'so'):
                    sentence = words[0].lower() + ' ' + ' '.join(words[1:])

            # Sometimes use informal punctuation
            if random.random() < imperfection_rate:
                if sentence.endswith('.'):
                    sentence = sentence[:-1]  # Remove period occasionally
                elif not sentence.endswith(('.', '!', '?')):
                    if random.random() < 0.5:
                        sentence += '.'

            modified_sentences.append(sentence)

        return ' '.join(modified_sentences)

    def paraphrase_segments(self, text: str, paraphrase_rate: float = 0.3) -> str:
        """Paraphrase some segments using the transformer model.

        Args:
            text: Text to rewrite.
            paraphrase_rate: Probability that a sentence longer than five
                words is sent to the paraphrasing model.

        Returns:
            The sentences joined with single spaces; the input is returned
            unchanged when no paraphrasing model is loaded.
        """
        if not self.paraphraser:
            return text

        paraphrased_sentences = []
        for sentence in sent_tokenize(text):
            if not (random.random() < paraphrase_rate and len(sentence.split()) > 5):
                paraphrased_sentences.append(sentence)
                continue

            try:
                # Create paraphrase prompt
                prompt = f"Rewrite this sentence in a more natural, conversational way: {sentence}"
                result = self.paraphraser(prompt, max_length=100, num_return_sequences=1)
                paraphrased = result[0]['generated_text']

                # Drop the prompt in case the model echoed it back
                paraphrased = paraphrased.replace(prompt, '').strip()
            except Exception as e:
                # Best effort: keep the original sentence when generation fails
                print(f"Paraphrasing failed: {e}")
                paraphrased = ''

            # Very short outputs are treated as failed generations
            if paraphrased and len(paraphrased) > 10:
                paraphrased_sentences.append(paraphrased)
            else:
                paraphrased_sentences.append(sentence)

        return ' '.join(paraphrased_sentences)

    def calculate_similarity(self, text1: str, text2: str) -> float:
        """Calculate semantic similarity between original and humanized text.

        Args:
            text1: First text.
            text2: Second text.

        Returns:
            Cosine similarity of the two sentence embeddings. NOTE: when the
            similarity model is unavailable or the computation fails, the
            fixed placeholder ``0.85`` is returned instead of a measured value.
        """
        if not self.similarity_model:
            return 0.85  # Return reasonable default if model not available

        try:
            embedding1 = self.similarity_model.encode([text1])[0]
            embedding2 = self.similarity_model.encode([text2])[0]
            scale = np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
            if scale == 0:
                # A zero vector has no direction; avoid a NaN result.
                return 0.0
            return float(np.dot(embedding1, embedding2) / scale)
        except Exception as e:
            print(f"Similarity calculation failed: {e}")
            return 0.85

    def humanize_text(self,
                      text: str,
                      style: str = "natural",
                      intensity: float = 0.7) -> Dict:
        """
        Main humanization function

        Args:
            text: Input text to humanize
            style: Style of humanization ('natural', 'casual', 'conversational')
            intensity: Intensity of humanization (0.0 to 1.0)

        Returns:
            Dictionary with the keys "original_text", "humanized_text",
            "similarity_score", "changes_made", "style" and "intensity".
        """
        if not text or not text.strip():
            # Nothing to rewrite; return the same shape as the normal result.
            return {
                "original_text": text,
                "humanized_text": text,
                "similarity_score": 1.0,
                "changes_made": [],
                "style": style,
                "intensity": intensity
            }

        changes_made = []
        humanized_text = text

        # Apply transformations based on intensity
        if intensity > 0.2:
            # Replace formal words
            before_formal = humanized_text
            humanized_text = self.replace_formal_words(humanized_text, intensity * 0.7)
            if humanized_text != before_formal:
                changes_made.append("Replaced formal words with casual alternatives")

        if intensity > 0.4:
            # Replace AI-like transitions. This must run before contractions:
            # several transition phrases start with "It is", which the
            # contraction step would turn into "it's" and make unmatchable.
            before_transitions = humanized_text
            humanized_text = self.replace_ai_transitions(humanized_text)
            if humanized_text != before_transitions:
                changes_made.append("Replaced AI-like transition words")

        if intensity > 0.3:
            # Add contractions
            before_contractions = humanized_text
            humanized_text = self.add_contractions(humanized_text)
            if humanized_text != before_contractions:
                changes_made.append("Added contractions")

        if intensity > 0.5:
            # Vary sentence structure
            before_structure = humanized_text
            humanized_text = self.vary_sentence_structure(humanized_text)
            if humanized_text != before_structure:
                changes_made.append("Varied sentence structure")

        if intensity > 0.6 and style in ["casual", "conversational"]:
            # Add natural imperfections
            before_imperfections = humanized_text
            humanized_text = self.add_natural_imperfections(humanized_text, intensity * 0.2)
            if humanized_text != before_imperfections:
                changes_made.append("Added natural imperfections")

        if intensity > 0.7:
            # Paraphrase some segments
            before_paraphrase = humanized_text
            humanized_text = self.paraphrase_segments(humanized_text, intensity * 0.4)
            if humanized_text != before_paraphrase:
                changes_made.append("Paraphrased some segments")

        # Calculate similarity
        similarity_score = self.calculate_similarity(text, humanized_text)

        return {
            "original_text": text,
            "humanized_text": humanized_text,
            "similarity_score": similarity_score,
            "changes_made": changes_made,
            "style": style,
            "intensity": intensity
        }
# Test the humanizer
# Demo entry point: runs one hard-coded, deliberately formal paragraph through
# the full pipeline and prints the input, the rewritten text and the metadata.
if __name__ == "__main__":
    demo_humanizer = AITextHumanizer()

    # Test text
    sample_text = """
Furthermore, it is important to note that artificial intelligence systems demonstrate
significant capabilities in natural language processing tasks. Subsequently, these
systems can analyze and generate text with remarkable accuracy. Nevertheless, it is
crucial to understand that human oversight remains essential for optimal performance.
Therefore, organizations should implement comprehensive strategies to utilize these
technologies effectively while maintaining quality standards.
"""

    print("Original Text:")
    print(sample_text)
    divider = "\n" + "="*50 + "\n"
    print(divider)

    # intensity=0.8 is above every threshold in humanize_text, so all
    # transformation steps (including paraphrasing) are eligible to run.
    result = demonstration = demo_humanizer.humanize_text(
        sample_text,
        style="conversational",
        intensity=0.8,
    )

    print("Humanized Text:")
    print(demonstration["humanized_text"])
    print(f"\nSimilarity Score: {result['similarity_score']:.3f}")
    print(f"Changes Made: {', '.join(result['changes_made'])}")