GLEN-model / scripts / test_connectivity.py

Commit 15-06-v1

6534252 8 months ago

5.63 kB

	#!/usr/bin/env python3
	"""
	Test script to check Hugging Face connectivity and provide solutions
	"""

	import requests
	import os
	from pathlib import Path

	def test_huggingface_connectivity():
	    """Probe the Hugging Face website and report whether it is reachable.

	    Issues a single GET to https://huggingface.co with a 10-second timeout
	    and prints a one-line verdict.

	    Returns:
	        bool: True only when the server answers with HTTP 200. Any other
	        status code, a timeout, a connection failure or any other
	        exception yields False after the problem has been printed.
	    """
	    print("🌐 Testing Hugging Face connectivity...")

	    try:
	        reply = requests.get("https://huggingface.co", timeout=10)
	        reachable = reply.status_code == 200
	        if reachable:
	            print("✅ Hugging Face is accessible")
	        else:
	            print(f"⚠️ Hugging Face returned status code: {reply.status_code}")
	        return reachable
	    # Timeout is matched before ConnectionError on purpose: the handler
	    # order decides which message a connect-timeout produces.
	    except requests.exceptions.Timeout:
	        problem = "❌ Connection to Hugging Face timed out"
	    except requests.exceptions.ConnectionError:
	        problem = "❌ Cannot connect to Hugging Face"
	    except Exception as exc:
	        problem = f"❌ Error connecting to Hugging Face: {exc}"

	    # Every failure path ends the same way: report it and signal "offline".
	    print(problem)
	    return False

	def check_cached_models():
	    """Look for locally cached T5 checkpoints in the Hugging Face caches.

	    The directories inspected are ``~/.cache/huggingface/transformers``,
	    ``~/.cache/huggingface/hub`` and, when the ``HF_HOME`` environment
	    variable is set and non-empty, ``$HF_HOME/hub``. Every immediate
	    sub-directory whose name contains "t5" (case-insensitive) counts as a
	    cached model and is printed as it is found.

	    Returns:
	        list[str]: Paths of the matching directories in discovery order,
	        each reported once; empty when nothing was found.
	    """
	    print("\n📁 Checking for cached models...")

	    hf_root = Path.home() / ".cache" / "huggingface"
	    cache_locations = [hf_root / "transformers", hf_root / "hub"]
	    hf_home = os.environ.get("HF_HOME")
	    if hf_home:
	        cache_locations.append(Path(hf_home) / "hub")

	    found_models = []
	    visited = set()
	    for cache_dir in cache_locations:
	        # is_dir() (rather than exists()) keeps a stray regular file at a
	        # cache path from crashing iterdir() with NotADirectoryError.
	        if not cache_dir.is_dir():
	            continue

	        # HF_HOME may point at ~/.cache/huggingface, in which case the same
	        # hub directory is listed twice; scan each real directory once so
	        # models are not reported in duplicate.
	        real_dir = cache_dir.resolve()
	        if real_dir in visited:
	            continue
	        visited.add(real_dir)

	        # Look for t5-related folders directly inside the cache directory.
	        for item in cache_dir.iterdir():
	            if item.is_dir() and "t5" in item.name.lower():
	                found_models.append(str(item))
	                print(f"✅ Found cached model: {item}")

	    if not found_models:
	        print("❌ No T5 models found in cache")

	    return found_models

	def suggest_solutions():
	    """Print a numbered list of workarounds for Hugging Face download issues.

	    The tips cover pre-downloading t5-base, raising timeouts through
	    environment variables, offline mode, an alternative mirror endpoint,
	    and running a test that needs no model download. Output goes to stdout
	    only; nothing is returned.
	    """
	    # One entry per printed line; a leading "\n" yields the blank line that
	    # separates consecutive tips.
	    tips = (
	        "\n💡 Solutions for connectivity issues:",
	        "=" * 50,
	        "\n1. 🌐 Pre-download the model with better connectivity:",
	        " Run this when you have stable internet:",
	        " ```python",
	        " from transformers import AutoTokenizer, AutoModelForSeq2SeqLM",
	        " tokenizer = AutoTokenizer.from_pretrained('t5-base')",
	        " model = AutoModelForSeq2SeqLM.from_pretrained('t5-base')",
	        " ```",
	        "\n2. 🔄 Retry with longer timeout:",
	        " Set environment variables:",
	        " ```bash",
	        " export HF_HUB_TIMEOUT=300",
	        " export REQUESTS_TIMEOUT=300",
	        " ```",
	        "\n3. 🏠 Use offline mode (if model is cached):",
	        " ```bash",
	        " export TRANSFORMERS_OFFLINE=1",
	        " ```",
	        "\n4. 🌐 Alternative: Use different mirror:",
	        " ```bash",
	        " export HF_ENDPOINT=https://hf-mirror.com",
	        " ```",
	        "\n5. 📦 Local testing without model download:",
	        " Use a smaller test that doesn't require model downloads",
	    )
	    for tip_line in tips:
	        print(tip_line)

	def create_simple_test():
	    """Write a download-free smoke-test script to ``scripts/test_basic.py``.

	    The generated script appends ``src`` to ``sys.path``, reads the first
	    five rows of ``data/the_vault/DOC_VAULT_train.tsv`` with pandas, builds
	    a ``GPUMemoryMonitor`` from ``tevatron.utils.gpu_monitor`` and prints
	    its memory stats. It exits with status 0 on success and 1 on any
	    exception.

	    The path is relative to the current working directory; the ``scripts``
	    directory is created if it does not exist, and an existing
	    ``test_basic.py`` is overwritten.

	    Returns:
	        None. Progress messages are printed to stdout.
	    """
	    # Local import: this function is the only user of textwrap.
	    import textwrap

	    print("\n🧪 Creating simplified test...")

	    # dedent() lets the template be indented along with this function while
	    # the file written to disk still starts at column 0.
	    test_script = textwrap.dedent('''\
	        #!/usr/bin/env python3
	        """
	        Simple test that only tests data loading and GPU monitoring without model downloads
	        """

	        import sys
	        import os
	        sys.path.append('src')

	        def test_data_only():
	            """Test only data loading functionality"""
	            try:
	                import pandas as pd
	                from tevatron.utils.gpu_monitor import GPUMemoryMonitor

	                print("✅ Testing data loading...")
	                df = pd.read_csv("data/the_vault/DOC_VAULT_train.tsv", sep='\\t', nrows=5)
	                print(f"✅ Loaded {len(df)} samples")

	                print("✅ Testing GPU monitor...")
	                monitor = GPUMemoryMonitor(memory_threshold=0.8, check_interval=10)
	                stats = monitor.get_memory_stats()
	                print(f"✅ GPU monitor initialized: {stats}")

	                print("🎉 Basic functionality test PASSED!")
	                return True

	            except Exception as e:
	                print(f"❌ Test failed: {e}")
	                return False

	        if __name__ == "__main__":
	            success = test_data_only()
	            sys.exit(0 if success else 1)
	    ''')

	    script_path = Path("scripts") / "test_basic.py"
	    # Create scripts/ on demand so the diagnostic also works when it is run
	    # from a directory that does not have it yet.
	    script_path.parent.mkdir(parents=True, exist_ok=True)
	    # The template contains emoji; pin UTF-8 so the write does not depend on
	    # the platform's locale encoding (e.g. cp1252 on Windows).
	    with open(script_path, "w", encoding="utf-8") as f:
	        f.write(test_script)

	    print("✅ Created scripts/test_basic.py")
	    print(" Run with: python scripts/test_basic.py")

	def main():
	    """Run the full connectivity diagnostic and print a summary.

	    Steps, in order: probe huggingface.co, scan the local caches for T5
	    models, write scripts/test_basic.py, print the troubleshooting tips,
	    then print a summary followed by a recommended next step.
	    """
	    print("🔍 GLEN Connectivity Diagnostic")
	    print("=" * 40)

	    # The four diagnostic stages run unconditionally, in this order.
	    online = test_huggingface_connectivity()
	    cached = check_cached_models()
	    create_simple_test()
	    suggest_solutions()

	    connectivity_label = "✅ OK" if online else "❌ FAILED"
	    cache_label = "✅ YES" if cached else "❌ NO"

	    print("\n" + "=" * 50)
	    print("📋 Summary:")
	    print(f" - Hugging Face connectivity: {connectivity_label}")
	    print(f" - Cached models found: {cache_label}")
	    print(" - Simple test created: ✅ YES")

	    # A populated cache takes precedence over the connectivity result.
	    if cached:
	        print("\n✅ Good news: You have cached models. Try offline mode!")
	        print(" Set: export TRANSFORMERS_OFFLINE=1")
	    elif online:
	        print("\n✅ All good: You should be able to run full training!")
	    else:
	        # Neither a cache nor a working connection is available.
	        print("\n⚠️ Action needed: Either fix connectivity or pre-download models")
	        print(" Try running: python scripts/test_basic.py (for basic functionality)")


	# Run the diagnostic only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()