import json
import os
import traceback
from typing import List

import pandas as pd
class WikipediaProcessor:
    """Extract plain-text "documents" from uploaded Wikipedia data files.

    Supports ``.txt``, ``.csv`` and ``.json`` inputs. Each handler returns a
    list of text fragments suitable for downstream indexing/embedding.
    Fragments of 10 characters or fewer are discarded as noise.
    """

    def __init__(self):
        # Extensions that process_uploaded_file knows how to dispatch on.
        self.supported_formats = ['.txt', '.csv', '.json']

    def process_uploaded_file(self, file_path: str) -> List[str]:
        """Dispatch *file_path* to the handler matching its extension.

        Returns:
            A list of extracted document strings.

        Raises:
            Exception: wrapping any underlying error (including unsupported
                extensions); the original exception is chained as __cause__.
        """
        file_ext = os.path.splitext(file_path)[1].lower()

        try:
            print(f"🔄 Đang xử lý file: {file_path}, định dạng: {file_ext}")

            if file_ext == '.txt':
                return self._process_txt_file(file_path)
            elif file_ext == '.csv':
                return self._process_csv_file(file_path)
            elif file_ext == '.json':
                return self._process_json_file(file_path)
            else:
                raise ValueError(f"Định dạng file không được hỗ trợ: {file_ext}")
        except Exception as e:
            # BUG FIX: `traceback` was referenced here without being imported,
            # so every processing error surfaced as a NameError. Now imported
            # at module level; `from e` preserves the original cause.
            print(f"❌ Lỗi xử lý file: {traceback.format_exc()}")
            raise Exception(f"Lỗi xử lý file: {str(e)}") from e

    @staticmethod
    def _split_paragraphs(content: str) -> List[str]:
        """Split raw text into stripped paragraphs longer than 10 characters.

        Prefers blank-line paragraph breaks; falls back to single newlines
        when the file contains no blank lines.
        """
        separator = '\n\n' if '\n\n' in content else '\n'
        return [p.strip() for p in content.split(separator) if len(p.strip()) > 10]

    def _process_txt_file(self, file_path: str) -> List[str]:
        """Read a text file (UTF-8, latin-1 fallback) and return its paragraphs."""
        print(f"📖 Đọc file text: {file_path}")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except UnicodeDecodeError:
            # Retry with latin-1, which accepts any byte sequence. Uses the
            # same splitting rules as the UTF-8 path (the original fallback
            # skipped the single-newline fallback split — now unified).
            with open(file_path, 'r', encoding='latin-1') as f:
                content = f.read()
            paragraphs = self._split_paragraphs(content)
            print(f"✅ Đã trích xuất {len(paragraphs)} đoạn văn từ file text (latin-1)")
            return paragraphs

        paragraphs = self._split_paragraphs(content)
        print(f"✅ Đã trích xuất {len(paragraphs)} đoạn văn từ file text")
        return paragraphs

    def _process_csv_file(self, file_path: str) -> List[str]:
        """Convert each CSV row into one ``col: value | col: value`` document."""
        print(f"📊 Đọc file CSV: {file_path}")
        try:
            df = pd.read_csv(file_path)
            documents = []

            print(f"📋 CSV có {len(df)} hàng và {len(df.columns)} cột")

            for idx, row in df.iterrows():
                # Keep only non-null, non-blank cells, labelled by column name.
                doc_parts = [
                    f"{col}: {row[col]}"
                    for col in df.columns
                    if pd.notna(row[col]) and str(row[col]).strip()
                ]

                if doc_parts:
                    full_doc = " | ".join(doc_parts)
                    if len(full_doc) > 10:
                        documents.append(full_doc)

                # Log only the first few rows to keep output readable.
                if idx < 3:
                    print(f"📝 Hàng {idx}: {doc_parts}")

            print(f"✅ Đã trích xuất {len(documents)} documents từ CSV")
            return documents
        except Exception as e:
            # `traceback` is now imported at module level (was a NameError).
            print(f"❌ Lỗi đọc CSV: {traceback.format_exc()}")
            raise Exception(f"Lỗi đọc CSV: {str(e)}") from e

    def _process_json_file(self, file_path: str) -> List[str]:
        """Flatten a JSON file into ``dotted.path: value`` documents."""
        print(f"📄 Đọc file JSON: {file_path}")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            documents = []

            def extract_text(obj, current_path=""):
                # Depth-first walk; scalar leaves become "path: value" strings.
                if isinstance(obj, dict):
                    for key, value in obj.items():
                        extract_text(value, f"{current_path}.{key}" if current_path else key)
                elif isinstance(obj, list):
                    for i, item in enumerate(obj):
                        extract_text(item, f"{current_path}[{i}]")
                elif isinstance(obj, str) and len(obj.strip()) > 10:
                    # Strings of <= 10 chars are dropped as unlikely content.
                    documents.append(f"{current_path}: {obj.strip()}")
                elif isinstance(obj, (int, float, bool)):
                    # NOTE: bool is a subclass of int, so booleans land here
                    # and are rendered as True/False — same as the original.
                    documents.append(f"{current_path}: {obj}")

            extract_text(data)
            print(f"✅ Đã trích xuất {len(documents)} documents từ JSON")
            return documents
        except Exception as e:
            # `traceback` is now imported at module level (was a NameError).
            print(f"❌ Lỗi đọc JSON: {traceback.format_exc()}")
            raise Exception(f"Lỗi đọc JSON: {str(e)}") from e