| | """ |
| | Neural Machine Translation Module for Multilingual Audio Intelligence System |
| | |
| | This module implements state-of-the-art neural machine translation using Helsinki-NLP/Opus-MT |
| | models. Designed for efficient CPU-based translation with dynamic model loading and |
| | intelligent batching strategies. |
| | |
| | Key Features: |
| | - Dynamic model loading for 100+ language pairs |
| | - Helsinki-NLP/Opus-MT models (300MB each) for specific language pairs |
| | - Intelligent batching for maximum CPU throughput |
| | - Fallback to multilingual models (mBART, M2M-100) for rare languages |
| | - Memory-efficient model management with automatic cleanup |
| | - Robust error handling and translation confidence scoring |
| | - Cache management for frequently used language pairs |
| | |
| | Models: Helsinki-NLP/opus-mt-* series, Facebook mBART50, M2M-100 |
| | Dependencies: transformers, torch, sentencepiece |
| | """ |
| |
|
| | import os |
| | import logging |
| | import warnings |
| | import torch |
| | from typing import List, Dict, Optional, Tuple, Union, Any |
| | import gc |
| | from dataclasses import dataclass |
| | from collections import defaultdict |
| | import time |
| |
|
| | try: |
| | from transformers import ( |
| | MarianMTModel, MarianTokenizer, |
| | MBartForConditionalGeneration, MBart50TokenizerFast, |
| | M2M100ForConditionalGeneration, M2M100Tokenizer, |
| | pipeline |
| | ) |
| | TRANSFORMERS_AVAILABLE = True |
| | except ImportError: |
| | TRANSFORMERS_AVAILABLE = False |
| | logging.warning("transformers not available. Install with: pip install transformers") |
| |
|
| | |
| | logging.basicConfig(level=logging.INFO) |
| | logger = logging.getLogger(__name__) |
| |
|
| | |
| | warnings.filterwarnings("ignore", category=UserWarning) |
| | warnings.filterwarnings("ignore", category=FutureWarning) |
| |
|
| |
|
| | @dataclass |
| | class TranslationResult: |
| | """ |
| | Data class representing a translation result with metadata. |
| | |
| | Attributes: |
| | original_text (str): Original text in source language |
| | translated_text (str): Translated text in target language |
| | source_language (str): Source language code |
| | target_language (str): Target language code |
| | confidence (float): Translation confidence score |
| | model_used (str): Name of the model used for translation |
| | processing_time (float): Time taken for translation in seconds |
| | """ |
| | original_text: str |
| | translated_text: str |
| | source_language: str |
| | target_language: str |
| | confidence: float = 1.0 |
| | model_used: str = "unknown" |
| | processing_time: float = 0.0 |
| | |
| | def to_dict(self) -> dict: |
| | """Convert to dictionary for JSON serialization.""" |
| | return { |
| | 'original_text': self.original_text, |
| | 'translated_text': self.translated_text, |
| | 'source_language': self.source_language, |
| | 'target_language': self.target_language, |
| | 'confidence': self.confidence, |
| | 'model_used': self.model_used, |
| | 'processing_time': self.processing_time |
| | } |
| |
|
| |
|
| | class NeuralTranslator: |
| | """ |
| | ENHANCED 3-Tier Hybrid Translation System for Competition Excellence |
| | |
| | Combines original Opus-MT capabilities with NEW hybrid approach: |
| | - Tier 1: Helsinki-NLP/Opus-MT models (highest quality, specific languages) |
| | - Tier 2: Google Translate API (broad coverage, reliable fallback) |
| | - Tier 3: mBART50 multilingual (offline fallback, code-switching support) |
| | |
| | NEW FEATURES for Indian Languages & Competition: |
| | - Enhanced support for Tamil, Telugu, Gujarati, Kannada, Nepali |
| | - Smart fallback strategies to handle missing models |
| | - Free Google Translate alternatives (googletrans, deep-translator) |
| | - Code-switching detection for mixed language audio |
| | - Memory-efficient processing for large files |
| | """ |
| | |
| | def __init__(self, |
| | target_language: str = "en", |
| | device: Optional[str] = None, |
| | cache_size: int = 3, |
| | use_multilingual_fallback: bool = True, |
| | model_cache_dir: Optional[str] = None, |
| | enable_google_api: bool = True, |
| | google_api_key: Optional[str] = None): |
| | """ |
| | Initialize the Neural Translator. |
| | |
| | Args: |
| | target_language (str): Target language code (default: 'en' for English) |
| | device (str, optional): Device to run on ('cpu', 'cuda', 'auto') |
| | cache_size (int): Maximum number of models to keep in memory |
| | use_multilingual_fallback (bool): Use mBART/M2M-100 for unsupported pairs |
| | model_cache_dir (str, optional): Directory to cache downloaded models |
| | enable_google_api (bool): NEW - Enable Google Translate API fallback |
| | google_api_key (str, optional): NEW - Google API key for paid service |
| | """ |
| | |
| | self.target_language = target_language |
| | self.cache_size = cache_size |
| | self.use_multilingual_fallback = use_multilingual_fallback |
| | self.model_cache_dir = model_cache_dir |
| | |
| | |
| | self.enable_google_api = enable_google_api |
| | self.google_api_key = google_api_key |
| | |
| | |
| | if device == 'auto' or device is None: |
| | self.device = torch.device('cpu') |
| | else: |
| | self.device = torch.device('cpu') |
| | |
| | logger.info(f"✅ Enhanced NeuralTranslator Initializing:") |
| | logger.info(f" Target: {target_language}, Device: {self.device}") |
| | logger.info(f" Hybrid Mode: Opus-MT → Google API → mBART50") |
| | logger.info(f" Google API: {'Enabled' if enable_google_api else 'Disabled'}") |
| | |
| | |
| | self.model_cache = {} |
| | self.fallback_model = None |
| | self.fallback_tokenizer = None |
| | self.fallback_model_name = None |
| | |
| | |
| | self.opus_mt_models = {} |
| | self.indic_models = {} |
| | self.google_translator = None |
| | self.google_translator_class = None |
| | |
| | |
| | self._initialize_opus_mt_models() |
| | self._initialize_indic_models() |
| | |
| | if enable_google_api: |
| | self._initialize_google_translator() |
| | logger.info(f"🔍 Final Google Translator status: {self.google_translator}") |
| | else: |
| | logger.warning("❌ Google API disabled - translations will use fallback") |
| | |
| | |
| | self.translation_stats = { |
| | 'opus_mt_calls': 0, |
| | 'google_api_calls': 0, |
| | 'mbart_calls': 0, |
| | 'fallback_used': 0, |
| | 'total_translations': 0, |
| | 'supported_languages': set() |
| | } |
| | |
| | |
| | self.language_mapping = self._get_language_mapping() |
| | |
| | |
| | self._supported_pairs_cache = None |
| | |
| | |
| | if use_multilingual_fallback: |
| | self._load_fallback_model() |
| | |
| | def _get_language_mapping(self) -> Dict[str, str]: |
| | """Get mapping of language codes to Helsinki-NLP model codes.""" |
| | |
| | return { |
| | 'en': 'en', 'es': 'es', 'fr': 'fr', 'de': 'de', 'it': 'it', 'pt': 'pt', |
| | 'ru': 'ru', 'zh': 'zh', 'ja': 'ja', 'ko': 'ko', 'ar': 'ar', 'hi': 'hi', |
| | 'tr': 'tr', 'pl': 'pl', 'nl': 'nl', 'sv': 'sv', 'da': 'da', 'no': 'no', |
| | 'fi': 'fi', 'hu': 'hu', 'cs': 'cs', 'sk': 'sk', 'sl': 'sl', 'hr': 'hr', |
| | 'bg': 'bg', 'ro': 'ro', 'el': 'el', 'he': 'he', 'th': 'th', 'vi': 'vi', |
| | 'id': 'id', 'ms': 'ms', 'tl': 'tl', 'sw': 'sw', 'eu': 'eu', 'ca': 'ca', |
| | 'gl': 'gl', 'cy': 'cy', 'ga': 'ga', 'mt': 'mt', 'is': 'is', 'lv': 'lv', |
| | 'lt': 'lt', 'et': 'et', 'mk': 'mk', 'sq': 'sq', 'be': 'be', 'uk': 'uk', |
| | 'ka': 'ka', 'hy': 'hy', 'az': 'az', 'kk': 'kk', 'ky': 'ky', 'uz': 'uz', |
| | 'fa': 'fa', 'ur': 'ur', 'bn': 'bn', 'ta': 'ta', 'te': 'te', 'ml': 'ml', |
| | 'kn': 'kn', 'gu': 'gu', 'pa': 'pa', 'mr': 'mr', 'ne': 'ne', 'si': 'si', |
| | 'my': 'my', 'km': 'km', 'lo': 'lo', 'mn': 'mn', 'bo': 'bo' |
| | } |
| | |
| | def _load_fallback_model(self): |
| | """Load multilingual fallback model (mBART50 or M2M-100).""" |
| | try: |
| | |
| | logger.info("Loading mBART50 multilingual fallback model...") |
| | |
| | self.fallback_model = MBartForConditionalGeneration.from_pretrained( |
| | "facebook/mbart-large-50-many-to-many-mmt", |
| | cache_dir=self.model_cache_dir |
| | ).to(self.device) |
| | |
| | self.fallback_tokenizer = MBart50TokenizerFast.from_pretrained( |
| | "facebook/mbart-large-50-many-to-many-mmt", |
| | cache_dir=self.model_cache_dir |
| | ) |
| | |
| | self.fallback_model_name = "mbart50" |
| | logger.info("mBART50 fallback model loaded successfully") |
| | |
| | except Exception as e: |
| | logger.warning(f"Failed to load mBART50: {e}") |
| | |
| | try: |
| | |
| | logger.info("Loading M2M-100 multilingual fallback model...") |
| | |
| | self.fallback_model = M2M100ForConditionalGeneration.from_pretrained( |
| | "facebook/m2m100_418M", |
| | cache_dir=self.model_cache_dir |
| | ).to(self.device) |
| | |
| | self.fallback_tokenizer = M2M100Tokenizer.from_pretrained( |
| | "facebook/m2m100_418M", |
| | cache_dir=self.model_cache_dir |
| | ) |
| | |
| | self.fallback_model_name = "m2m100" |
| | logger.info("M2M-100 fallback model loaded successfully") |
| | |
| | except Exception as e2: |
| | logger.warning(f"Failed to load M2M-100: {e2}") |
| | self.fallback_model = None |
| | self.fallback_tokenizer = None |
| | self.fallback_model_name = None |
| | |
| | def _initialize_google_translator(self): |
| | """Initialize Google Translate API integration.""" |
| | logger.info("🔄 Attempting to initialize Google Translate...") |
| | try: |
| | if self.google_api_key: |
| | try: |
| | from google.cloud import translate_v2 as translate |
| | self.google_translator = translate.Client(api_key=self.google_api_key) |
| | logger.info("✅ Google Cloud Translation API initialized") |
| | return |
| | except ImportError: |
| | logger.warning("Google Cloud client not available, falling back to free options") |
| | |
| | |
| | try: |
| | from googletrans import Translator |
| | |
| | self.google_translator = Translator() |
| | |
| | |
| | test_result = self.google_translator.translate('Hello', src='en', dest='fr') |
| | if test_result and hasattr(test_result, 'text') and test_result.text: |
| | logger.info("✅ Google Translate (googletrans) initialized and tested") |
| | return |
| | else: |
| | logger.warning("⚠️ Googletrans test failed") |
| | self.google_translator = None |
| | except Exception as e: |
| | logger.warning(f"⚠️ Googletrans initialization failed: {e}") |
| | pass |
| | |
| | try: |
| | from deep_translator import GoogleTranslator |
| | |
| | test_translator = GoogleTranslator(source='en', target='fr') |
| | test_result = test_translator.translate('test') |
| | if test_result: |
| | self.google_translator = 'deep_translator' |
| | self.google_translator_class = GoogleTranslator |
| | logger.info("✅ Deep Translator (Google) initialized and tested") |
| | return |
| | else: |
| | logger.warning("⚠️ Deep Translator test failed") |
| | except Exception as e: |
| | logger.warning(f"⚠️ Deep Translator failed: {e}") |
| | pass |
| | |
| | logger.warning("⚠️ No Google Translate library available") |
| | self.google_translator = None |
| | |
| | except Exception as e: |
| | logger.error(f"❌ Failed to initialize Google Translator: {e}") |
| | self.google_translator = None |
| | |
| | def _translate_with_google_api(self, text: str, source_lang: str, target_lang: str) -> str: |
| | """ |
| | Unified method to translate using any available Google Translate API. |
| | """ |
| | if not self.google_translator: |
| | return None |
| | |
| | |
| | source_lang = self._normalize_language_code(source_lang) |
| | target_lang = self._normalize_language_code(target_lang) |
| | |
| | logger.info(f"Translating '{text[:50]}...' from {source_lang} to {target_lang}") |
| | |
| | try: |
| | if self.google_translator == 'deep_translator': |
| | |
| | translator = self.google_translator_class(source=source_lang, target=target_lang) |
| | result = translator.translate(text) |
| | logger.info(f"Deep Translator result: {result[:50] if result else 'None'}...") |
| | return result |
| | else: |
| | |
| | result = self.google_translator.translate(text, src=source_lang, dest=target_lang) |
| | translated_text = result.text if result else None |
| | logger.info(f"Googletrans result: {translated_text[:50] if translated_text else 'None'}...") |
| | return translated_text |
| | except Exception as e: |
| | logger.warning(f"Google API translation error ({source_lang}->{target_lang}): {e}") |
| | return None |
| | |
| | def _normalize_language_code(self, lang_code: str) -> str: |
| | """ |
| | Normalize language codes for Google Translate compatibility. |
| | """ |
| | |
| | lang_mapping = { |
| | 'ja': 'ja', |
| | 'hi': 'hi', |
| | 'ur': 'ur', |
| | 'ar': 'ar', |
| | 'zh': 'zh-cn', |
| | 'fr': 'fr', |
| | 'es': 'es', |
| | 'de': 'de', |
| | 'en': 'en', |
| | 'unknown': 'auto' |
| | } |
| | |
| | return lang_mapping.get(lang_code.lower(), lang_code.lower()) |
| | |
| | def _initialize_opus_mt_models(self): |
| | """Initialize Helsinki-NLP Opus-MT models for high-quality translation.""" |
| | logger.info("🔄 Initializing Helsinki-NLP Opus-MT models...") |
| | |
| | |
| | self.opus_mt_pairs = { |
| | |
| | 'fr-en': 'Helsinki-NLP/opus-mt-fr-en', |
| | 'de-en': 'Helsinki-NLP/opus-mt-de-en', |
| | 'es-en': 'Helsinki-NLP/opus-mt-es-en', |
| | 'it-en': 'Helsinki-NLP/opus-mt-it-en', |
| | 'ru-en': 'Helsinki-NLP/opus-mt-ru-en', |
| | 'pt-en': 'Helsinki-NLP/opus-mt-pt-en', |
| | |
| | |
| | 'ja-en': 'Helsinki-NLP/opus-mt-ja-en', |
| | 'ko-en': 'Helsinki-NLP/opus-mt-ko-en', |
| | 'zh-en': 'Helsinki-NLP/opus-mt-zh-en', |
| | 'ar-en': 'Helsinki-NLP/opus-mt-ar-en', |
| | |
| | |
| | 'en-fr': 'Helsinki-NLP/opus-mt-en-fr', |
| | 'en-de': 'Helsinki-NLP/opus-mt-en-de', |
| | 'en-es': 'Helsinki-NLP/opus-mt-en-es', |
| | 'en-it': 'Helsinki-NLP/opus-mt-en-it', |
| | 'en-ru': 'Helsinki-NLP/opus-mt-en-ru', |
| | 'en-ja': 'Helsinki-NLP/opus-mt-en-ja', |
| | 'en-zh': 'Helsinki-NLP/opus-mt-en-zh', |
| | |
| | |
| | 'hi-en': 'Helsinki-NLP/opus-mt-hi-en', |
| | 'en-hi': 'Helsinki-NLP/opus-mt-en-hi', |
| | 'ur-en': 'Helsinki-NLP/opus-mt-ur-en', |
| | 'en-ur': 'Helsinki-NLP/opus-mt-en-ur', |
| | } |
| | |
| | logger.info(f"✅ Opus-MT models configured for {len(self.opus_mt_pairs)} language pairs") |
| |
|
| | def _initialize_indic_models(self): |
| | """Initialize specialized models for Indian languages.""" |
| | logger.info("🔄 Initializing Indian language translation models...") |
| | |
| | |
| | |
| | self.indic_model_info = { |
| | 'indictrans2': { |
| | 'en-indic': 'ai4bharat/indictrans2-en-indic-1B', |
| | 'indic-en': 'ai4bharat/indictrans2-indic-en-1B', |
| | 'languages': ['hi', 'bn', 'ta', 'te', 'ml', 'gu', 'kn', 'or', 'pa', 'ur', 'as', 'mr', 'ne'] |
| | }, |
| | 'sarvam': { |
| | 'model': 'sarvamai/sarvam-translate', |
| | 'languages': ['hi', 'bn', 'ta', 'te', 'ml', 'gu', 'kn', 'or', 'pa', 'ur', 'as', 'mr', 'ne'] |
| | } |
| | } |
| | |
| | logger.info("✅ Indian language models configured (will load on-demand)") |
| |
|
| | def _load_opus_mt_model(self, src_lang: str, tgt_lang: str): |
| | """Load a specific Opus-MT model for the language pair.""" |
| | lang_pair = f"{src_lang}-{tgt_lang}" |
| | |
| | if lang_pair in self.opus_mt_models: |
| | return self.opus_mt_models[lang_pair] |
| | |
| | if lang_pair not in self.opus_mt_pairs: |
| | return None |
| | |
| | try: |
| | from transformers import MarianMTModel, MarianTokenizer |
| | |
| | model_name = self.opus_mt_pairs[lang_pair] |
| | logger.info(f"🔄 Loading Opus-MT model: {model_name}") |
| | |
| | tokenizer = MarianTokenizer.from_pretrained(model_name) |
| | model = MarianMTModel.from_pretrained(model_name) |
| | |
| | if self.device != 'cpu': |
| | model = model.to(self.device) |
| | |
| | self.opus_mt_models[lang_pair] = {'model': model, 'tokenizer': tokenizer} |
| | logger.info(f"✅ Loaded Opus-MT model: {model_name}") |
| | |
| | return self.opus_mt_models[lang_pair] |
| | |
| | except Exception as e: |
| | logger.warning(f"⚠️ Failed to load Opus-MT model {lang_pair}: {e}") |
| | return None |
| |
|
| | def _translate_with_opus_mt(self, text: str, src_lang: str, tgt_lang: str) -> str: |
| | """Translate using Helsinki-NLP Opus-MT models.""" |
| | opus_model = self._load_opus_mt_model(src_lang, tgt_lang) |
| | if not opus_model: |
| | return None |
| | |
| | try: |
| | model = opus_model['model'] |
| | tokenizer = opus_model['tokenizer'] |
| | |
| | |
| | inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512) |
| | |
| | if self.device != 'cpu': |
| | inputs = {k: v.to(self.device) for k, v in inputs.items()} |
| | |
| | |
| | with torch.no_grad(): |
| | outputs = model.generate(**inputs, max_length=512, num_beams=4, early_stopping=True) |
| | |
| | |
| | translated = tokenizer.decode(outputs[0], skip_special_tokens=True) |
| | |
| | logger.info(f"Opus-MT translation ({src_lang}->{tgt_lang}): {text[:50]}... -> {translated[:50]}...") |
| | return translated |
| | |
| | except Exception as e: |
| | logger.warning(f"Opus-MT translation error ({src_lang}->{tgt_lang}): {e}") |
| | return None |
| | |
| | def _translate_using_hierarchy(self, text: str, src_lang: str, tgt_lang: str) -> str: |
| | """ |
| | Translate using the proper hierarchy: |
| | 1. Helsinki-NLP Opus-MT (best quality for supported pairs) |
| | 2. Specialized models (IndicTrans2, Sarvam for Indian languages) |
| | 3. Google Translate API |
| | 4. Deep Translator (fallback) |
| | """ |
| | if src_lang == tgt_lang: |
| | return text |
| | |
| | |
| | try: |
| | opus_result = self._translate_with_opus_mt(text, src_lang, tgt_lang) |
| | if opus_result and opus_result != text: |
| | logger.info(f"✅ Opus-MT translation successful ({src_lang}->{tgt_lang})") |
| | self.translation_stats['opus_mt_calls'] = self.translation_stats.get('opus_mt_calls', 0) + 1 |
| | return opus_result |
| | except Exception as e: |
| | logger.debug(f"Opus-MT failed ({src_lang}->{tgt_lang}): {e}") |
| | |
| | |
| | indian_languages = ['hi', 'bn', 'ta', 'te', 'ml', 'gu', 'kn', 'or', 'pa', 'ur', 'as', 'mr', 'ne'] |
| | if src_lang in indian_languages or tgt_lang in indian_languages: |
| | try: |
| | |
| | |
| | logger.debug(f"Indian language pair detected ({src_lang}->{tgt_lang}), specialized models not loaded") |
| | except Exception as e: |
| | logger.debug(f"Specialized model failed ({src_lang}->{tgt_lang}): {e}") |
| | |
| | |
| | try: |
| | google_result = self._translate_with_google_api(text, src_lang, tgt_lang) |
| | if google_result and google_result != text: |
| | logger.info(f"✅ Google Translate successful ({src_lang}->{tgt_lang})") |
| | self.translation_stats['google_api_calls'] = self.translation_stats.get('google_api_calls', 0) + 1 |
| | return google_result |
| | except Exception as e: |
| | logger.debug(f"Google Translate failed ({src_lang}->{tgt_lang}): {e}") |
| | |
| | |
| | logger.warning(f"⚠️ All translation methods failed for {src_lang}->{tgt_lang}") |
| | return text |
| | |
| | def test_translation(self) -> bool: |
| | """Test if Google Translate is working with a simple translation.""" |
| | if not self.google_translator: |
| | logger.warning("❌ No Google Translator available for testing") |
| | return False |
| | |
| | try: |
| | test_text = "Hello world" |
| | result = self._translate_with_google_api(test_text, 'en', 'ja') |
| | if result and result != test_text: |
| | logger.info(f"✅ Translation test successful: '{test_text}' -> '{result}'") |
| | return True |
| | else: |
| | logger.warning(f"❌ Translation test failed: got '{result}'") |
| | return False |
| | except Exception as e: |
| | logger.error(f"❌ Translation test error: {e}") |
| | return False |
| | |
| | def validate_language_detection(self, text: str, detected_lang: str) -> str: |
| | """ |
| | Validate and correct language detection for Indian languages. |
| | """ |
| | |
| | clean_text = text.strip() |
| | |
| | |
| | if len(clean_text) < 10 or len(set(clean_text.split())) < 3: |
| | logger.warning(f"Text too short or repetitive for reliable language detection: {clean_text[:50]}...") |
| | |
| | return detected_lang |
| | |
| | |
| | devanagari_chars = sum(1 for char in clean_text if '\u0900' <= char <= '\u097F') |
| | arabic_chars = sum(1 for char in clean_text if '\u0600' <= char <= '\u06FF') |
| | japanese_chars = sum(1 for char in clean_text if '\u3040' <= char <= '\u309F' or |
| | '\u30A0' <= char <= '\u30FF' or |
| | '\u4E00' <= char <= '\u9FAF') |
| | |
| | total_chars = len([c for c in clean_text if c.isalpha() or '\u3040' <= c <= '\u9FAF']) |
| | |
| | if total_chars > 0: |
| | devanagari_ratio = devanagari_chars / total_chars |
| | arabic_ratio = arabic_chars / total_chars |
| | japanese_ratio = japanese_chars / total_chars |
| | |
| | if japanese_ratio > 0.5: |
| | logger.info(f"Detected Japanese script ({japanese_ratio:.2f} ratio)") |
| | return 'ja' |
| | elif devanagari_ratio > 0.7: |
| | return 'hi' |
| | elif arabic_ratio > 0.7: |
| | return 'ur' |
| | |
| | |
| | if detected_lang in ['zh', 'ar', 'en'] and any(char in clean_text for char in 'तो है का में से'): |
| | logger.info(f"Correcting language detection from {detected_lang} to Hindi") |
| | return 'hi' |
| | |
| | return detected_lang |
| | |
| | def translate_text_hybrid(self, text: str, source_lang: str, target_lang: str) -> TranslationResult: |
| | """Enhanced 3-tier hybrid translation with intelligent fallback.""" |
| | start_time = time.time() |
| | |
| | |
| | corrected_lang = self.validate_language_detection(text, source_lang) |
| | if corrected_lang != source_lang: |
| | logger.info(f"Language corrected: {source_lang} → {corrected_lang}") |
| | source_lang = corrected_lang |
| | |
| | |
| | clean_text = text.strip() |
| | words = clean_text.split() |
| | |
| | |
| | if len(words) > 5: |
| | unique_words = set(words) |
| | if len(unique_words) / len(words) < 0.3: |
| | logger.warning(f"Detected repetitive text: {clean_text[:50]}...") |
| | |
| | |
| | meaningful_part = "" |
| | word_counts = {} |
| | for word in words: |
| | word_counts[word] = word_counts.get(word, 0) + 1 |
| | |
| | |
| | meaningful_words = [] |
| | for word in words[:10]: |
| | if word_counts[word] <= 3: |
| | meaningful_words.append(word) |
| | else: |
| | break |
| | |
| | if len(meaningful_words) >= 3: |
| | meaningful_part = " ".join(meaningful_words) |
| | logger.info(f"Extracted meaningful part: {meaningful_part}") |
| | |
| | |
| | if source_lang != target_lang: |
| | translated_text = self._translate_using_hierarchy(meaningful_part, source_lang, target_lang) |
| | if translated_text and translated_text != meaningful_part: |
| | return TranslationResult( |
| | original_text="[Repetitive or low-quality audio segment]", |
| | translated_text=translated_text, |
| | source_language=source_lang, |
| | target_language=target_lang, |
| | confidence=0.6, |
| | model_used="hierarchy_filtered", |
| | processing_time=time.time() - start_time |
| | ) |
| | |
| | |
| | return TranslationResult( |
| | original_text="[Repetitive or low-quality audio segment]", |
| | translated_text="[Repetitive or low-quality audio segment]", |
| | source_language=source_lang, |
| | target_language=target_lang, |
| | confidence=0.1, |
| | model_used="quality_filter", |
| | processing_time=time.time() - start_time |
| | ) |
| | |
| | |
| | self.translation_stats['total_translations'] += 1 |
| | self.translation_stats['supported_languages'].add(source_lang) |
| | |
| | |
| | try: |
| | |
| | if source_lang != target_lang: |
| | translated_text = self._translate_using_hierarchy(text, source_lang, target_lang) |
| | if translated_text and translated_text != text: |
| | |
| | model_used = "hierarchy_translation" |
| | confidence = 0.8 |
| | |
| | |
| | if hasattr(self, 'opus_mt_models') and any(text in str(model) for model in self.opus_mt_models.values()): |
| | model_used = "opus_mt" |
| | confidence = 0.9 |
| | elif self.google_translator: |
| | model_used = "google_translate" |
| | confidence = 0.8 |
| | |
| | return TranslationResult( |
| | original_text=text, |
| | translated_text=translated_text, |
| | source_language=source_lang, |
| | target_language=target_lang, |
| | confidence=confidence, |
| | model_used=model_used, |
| | processing_time=time.time() - start_time |
| | ) |
| | |
| | |
| | if source_lang == target_lang: |
| | return TranslationResult( |
| | original_text=text, |
| | translated_text=text, |
| | source_language=source_lang, |
| | target_language=target_lang, |
| | confidence=1.0, |
| | model_used="identity", |
| | processing_time=time.time() - start_time |
| | ) |
| | |
| | except Exception as e: |
| | logger.error(f"Translation failed: {e}") |
| | |
| | |
| | logger.warning(f"⚠️ Translation falling back to original text for {source_lang}->{target_lang}: {text[:50]}...") |
| | logger.warning(f"⚠️ Google translator status: {self.google_translator}") |
| | return TranslationResult( |
| | original_text=text, |
| | translated_text=text, |
| | source_language=source_lang, |
| | target_language=target_lang, |
| | confidence=0.5, |
| | model_used="fallback", |
| | processing_time=time.time() - start_time |
| | ) |
| | |
| |
|
| |
|
| | |
| | def translate_text(text: str, |
| | source_language: str, |
| | target_language: str = "en", |
| | device: Optional[str] = None) -> TranslationResult: |
| | """ |
| | Convenience function to translate text with default settings. |
| | """ |
| | translator = NeuralTranslator( |
| | target_language=target_language, |
| | device=device |
| | ) |
| | return translator.translate_text(text, source_language, target_language) |
| |
|
| |
|
| | if __name__ == "__main__": |
| | import argparse |
| | |
| | parser = argparse.ArgumentParser(description='Neural Machine Translation') |
| | parser.add_argument('text', help='Text to translate') |
| | parser.add_argument('--source', '-s', required=True, help='Source language') |
| | parser.add_argument('--target', '-t', default='en', help='Target language') |
| | |
| | args = parser.parse_args() |
| | |
| | result = translate_text(args.text, args.source, args.target) |
| | print(f'Original: {result.original_text}') |
| | print(f'Translated: {result.translated_text}') |
| | print(f'Confidence: {result.confidence:.2f}') |
| |
|