# NOTE(review): removed stray web-scrape artifact ("DerivedFunction1's picture /
# add / 3dfb36c") — it was not valid Python and broke import of this module.
import os
import html
import random
# Import shared utility functions
from bob_utils import (
generate_response,
_sanitize_display_text,
detect_refusal_language,
detect_preferred_language,
generate_response_stream,
)
from typing import Generator, Literal, Optional
# Import functions and KBs from bob_resources
from bob_resources import (
assistant_capabilities, _truncate_middle, get_menu_kb, APP_SUPPORT_KB,
LEGAL_KB, COMPETITOR_KB, CLARIFY_KB, RAW_SYSTEM_PROMPT,
)
# Seed refusal phrasings; one is picked per response and rotated through the
# per-language cache so consecutive refusals don't repeat verbatim.
REFUSAL_SOURCE_TEMPLATES = [
    "I can't help with this request right now.",
    # Fixed duplicated word ("able able") in this user-facing template.
    "I'm not able to currently assist with this task.",
    "I'm sorry, I'm not able to help with that right now.",
    "I'm afraid I can't assist with that at the moment.",
]
# Seed phrasings used when steering users away from unsupported languages.
LANGUAGE_STEER_TEMPLATES = [
    "I’m sorry, I don’t understand this request clearly enough to help.",
    "I apologize, but I'm unable to clearly understand your request to assist.",
    "Unfortunately, I can't quite grasp this request well enough to provide assistance.",
]
# These constants are used by multiple agent functions.
# Maximum number of recent refusal/steer texts remembered per language.
REFUSAL_CACHE_LIMIT = int(os.environ.get("REFUSAL_CACHE_LIMIT", "5"))
STEER_CACHE_LIMIT = int(os.environ.get("STEER_CACHE_LIMIT", "5"))
# Pre-compute Bob's capabilities and menu items once at import time to avoid
# repeated calls into bob_resources on every request.
BOB_CAPABILITIES_STRING = assistant_capabilities()
# Menu item display names (keys of the menu KB), sampled for misdirection topics.
MENU_ITEM_NAMES = list(get_menu_kb().keys())
# ---------------------------------------------------------------------------
# Misdirection topic builder (unchanged logic, kept in one place)
# ---------------------------------------------------------------------------
def _generate_misdirection_topic_list(user_language: str) -> list[str]:
    """Build a randomized list of misdirection topic strings for the prompt.

    Each entry is a short topic description (order status, store hours, menu
    items, app support, legal, competitor comparisons, clarification), most of
    them augmented with one randomly chosen sample question. Output varies on
    every call because topics and samples are picked with `random`.

    NOTE(review): user_language is currently unused — every topic is emitted
    in English. Confirm whether localization was intended here.
    """
    misdirection_options = []
    # Helper to format topics with sample questions
    def _format_topic_with_samples(topic: str, samples: list[str]) -> str:
        if not samples:
            return topic
        # Randomly pick one sample question to show
        sample_q = random.choice(samples)
        return f"{topic} like '{sample_q}'"
    # Core ABC Burgers topics
    misdirection_options.append(_format_topic_with_samples(
        "their order",
        ["Where is my order?", "Can I change my order?", "How do I track my delivery?"]
    ))
    misdirection_options.append(_format_topic_with_samples(
        "store hours",
        ["What time do you close?", "Are you open on Sundays?", "What are your holiday hours?"]
    ))
    misdirection_options.append(_format_topic_with_samples(
        "food safety",
        ["What are the ingredients in our products?", "Do you have allergen information?"]
    ))
    # Menu items
    if MENU_ITEM_NAMES:
        num_items_to_suggest = random.randint(1, 3)
        actual_num_items = min(num_items_to_suggest, len(MENU_ITEM_NAMES))
        if actual_num_items > 0:
            suggested_menu_items = random.sample(MENU_ITEM_NAMES, actual_num_items)
            # Randomly present one item as a "did you know" fact
            if random.random() < 0.3 and suggested_menu_items: # 30% chance
                # Remove the chosen item so it isn't also listed as a plain suggestion below.
                did_you_know_item = suggested_menu_items.pop(random.randrange(len(suggested_menu_items)))
                item_details = get_menu_kb().get(did_you_know_item.lower(), {})
                fact_parts = []
                if "price" in item_details:
                    fact_parts.append(f"costs {item_details['price']}")
                if "ingredients" in item_details and item_details["ingredients"]:
                    fact_parts.append(f"is made with {', '.join(item_details['ingredients'])}")
                misdirection_options.append(f"a fun fact like 'Did you know our {did_you_know_item} {', and '.join(fact_parts)}?'")
            formatted_menu_suggestions = []
            for item_name in suggested_menu_items:
                # Menu KB is keyed by lowercase name; missing entries fall back to name only.
                item_details = get_menu_kb().get(item_name.lower(), {})
                description_parts = []
                if "price" in item_details:
                    description_parts.append(f"{item_details['price']}")
                if "ingredients" in item_details and item_details["ingredients"]:
                    description_parts.append(f"with {', '.join(item_details['ingredients'])}") # Include all ingredients for a more complete description
                if description_parts:
                    formatted_menu_suggestions.append(f"'{item_name}' ({', '.join(description_parts)})")
                else:
                    formatted_menu_suggestions.append(f"'{item_name}'")
            if formatted_menu_suggestions:
                # Add a sample question for menu items
                # NOTE(review): these questions embed the fully formatted suggestion
                # (name plus price/ingredients), not just the item name — confirm
                # that verbose form is intended for the sample question text.
                sample_menu_q = random.choice([
                    f"What's in the {random.choice(formatted_menu_suggestions)}?",
                    f"How much is the {random.choice(formatted_menu_suggestions)}?",
                    f"Tell me about the {random.choice(formatted_menu_suggestions)}."
                ])
                misdirection_options.append(_format_topic_with_samples(
                    f"a specific menu item like {', '.join(formatted_menu_suggestions)}",
                    [sample_menu_q]
                ))
    # App support topics
    if APP_SUPPORT_KB:
        app_topic = random.choice(list(APP_SUPPORT_KB.keys()))
        misdirection_options.append(_format_topic_with_samples(
            f"app support for '{app_topic}'",
            ["How do I reset my password?", "My ABC Burgers app isn't working.", "How do I create an account for ABC Burgers?"]
        ))
    # Legal topics
    if LEGAL_KB:
        legal_topic = random.choice(list(LEGAL_KB.keys()))
        misdirection_options.append(_format_topic_with_samples(
            f"legal inquiries about '{legal_topic}'",
            ["What is your privacy policy?", "How do I contact legal?", "Where can I find your terms and conditions?"]
        ))
    # Competitor mentions (rephrased)
    if COMPETITOR_KB:
        competitor_name = random.choice(list(COMPETITOR_KB.keys()))
        competitor_info = COMPETITOR_KB[competitor_name]
        # Randomly choose between highlighting positioning or specific offerings
        if random.choice([True, False]):
            # Use positioning to show how ABC Burgers is "better"
            misdirection_options.append(_format_topic_with_samples(
                f"how ABC Burgers {competitor_info['positioning'].replace('abc burgers focuses on', 'focuses on')} compared to '{competitor_name}'",
                [f"How are ABC Burgers's burgers different from {competitor_name}'s?", f"What makes ABC Burgers better than {competitor_name}?"]
            ))
        else:
            # Use response to show what food ABC Burgers offers
            misdirection_options.append(_format_topic_with_samples(
                f"what food ABC Burgers offers like {competitor_info['response'].replace('we appreciate the comparison. abc burgers offers', '').strip()} compared to '{competitor_name}'",
                [f"What kind of food does ABC Burgers offer that {competitor_name} doesn't?", f"Do you have [specific item] like {competitor_name}?"]
            ))
    # Clarify intent topics
    if CLARIFY_KB:
        # "emergency" is deliberately excluded from misdirection suggestions.
        clarify_topic = random.choice(list(CLARIFY_KB.keys() - {"emergency"}))
        misdirection_options.append(_format_topic_with_samples(
            f"clarifying your intent regarding '{clarify_topic}'",
            ["What can I help with?", "What are my options?", "Can you tell me more about what you do?"]
        ))
    # Return the raw list; callers decide how to join the options into prose.
    return misdirection_options
def _refusal_cache_for_language(session_state: dict, lang: str) -> list[str]:
cache = session_state.setdefault("refusal_cache", {})
return cache.setdefault(lang, [])
def _pick_refusal_source(session_state: dict, lang: str) -> str:
    """Pick a refusal template for *lang*, preferring ones not used recently.

    An unused template is chosen first; once all templates are in the cache,
    the oldest cached one is recycled. The cache is trimmed to the most recent
    REFUSAL_CACHE_LIMIT entries after every pick.
    """
    recent = _refusal_cache_for_language(session_state, lang)
    picked = next((t for t in REFUSAL_SOURCE_TEMPLATES if t not in recent), None)
    if picked is None:
        # All templates have been used recently: recycle the oldest entry.
        picked = recent.pop(0)
    recent.append(picked)
    del recent[:-REFUSAL_CACHE_LIMIT]
    return picked
def _steer_cache_for_language(session_state: dict, lang: str) -> list[str]:
cache = session_state.setdefault("steer_cache", {})
return cache.setdefault(lang, [])
def _pick_steer_source(session_state: dict, lang: str) -> str:
    """Pick a language-steer template for *lang*, avoiding recent repeats.

    Mirrors _pick_refusal_source: prefer a template absent from the cache;
    when every template has been used, rotate out the oldest cached one. The
    cache is capped at the most recent STEER_CACHE_LIMIT entries.
    """
    recent = _steer_cache_for_language(session_state, lang)
    picked = next((t for t in LANGUAGE_STEER_TEMPLATES if t not in recent), None)
    if picked is None:
        # Every template has been used recently: recycle the oldest entry.
        picked = recent.pop(0)
    recent.append(picked)
    del recent[:-STEER_CACHE_LIMIT]
    return picked
def translate_to_detector_language(
    text: str,
    source_language: str,
) -> tuple[str, bool, str]:
    """Translate *text* into English for downstream safety analysis.

    Returns a (translation, refused, refusal_reason) triple. English input is
    passed through unchanged. A response that carries neither marker is
    treated as a refusal with a generic reason.
    """
    if source_language == "EN":
        return text, False, ""
    instruction = (
        "Translate the following text into English for safety analysis. "
        "If you refuse to translate it, output '[REFUSAL]' followed by a refusal reason. "
        "Otherwise, output '[TRANSLATION]' followed by the translation. "
        "Keep the meaning unchanged and return only the translation or refusal reason in the requested format.\n\n"
        f"Text: {text}"
    )
    system_prompt = "You are a precise translation assistant named Bob working at ABC Burgers."
    # Drain the stream into a single string before sanitizing.
    raw_response = "".join(
        generate_response_stream([{"role": "user", "content": instruction}], system_prompt)
    )
    cleaned = _sanitize_display_text(raw_response, system_prompt)
    # [TRANSLATION] takes precedence if both markers somehow appear.
    if "[TRANSLATION]" in cleaned:
        return cleaned.rpartition("[TRANSLATION]")[2].strip(), False, ""
    if "[REFUSAL]" in cleaned:
        return "", True, cleaned.rpartition("[REFUSAL]")[2].strip()
    return "", True, "Unparseable response, assuming unsafe."
def _generate_initial_unfulfillable_statement_stream(
    user_message: str,
    session_state: dict,
    unfulfillable_type: Literal["language_not_supported", "translation_refused", "jailbreak_detected", "out_of_scope_tool_call"],
    reason_details: Optional[str] = None, # e.g., refusal_reason, assistant_name
    user_language: str = "EN",
    pre_tool_call_assistant_message: Optional[str] = None,
) -> Generator[str, None, None]:
    """Stream a localized "can't fulfil this" message that pivots to ABC Burgers.

    Builds a prompt specific to *unfulfillable_type* (language steer,
    translation refusal, jailbreak-refusal rewrite, or out-of-scope pivot),
    streams the sanitized model output as incremental text deltas, and finally
    records the generated text in the per-language refusal/steer caches so
    repeated responses vary.
    """
    capabilities = BOB_CAPABILITIES_STRING
    system_prompt = RAW_SYSTEM_PROMPT  # Use the comprehensive system prompt
    # Define misdirection styles and pick one randomly
    MISDIRECTION_STYLES = [
        ("recommendation", "Recommend a specific menu item or service from ABC Burgers."),
        ("clarification", "Ask the user a clarifying question about their needs related to ABC Burgers."),
        ("fun_fact", "Share an interesting fact about ABC Burgers or its offerings."),
        ("interactive_question", "Ask the user a question related to ABC Burgers to engage them."),
        ("general_invitation", "Invite the user to explore more about ABC Burgers, mentioning that you can provide details or recommendations."),
    ]
    # Only the instruction text is used; the style name was an unused local.
    _, chosen_style_instruction = random.choice(MISDIRECTION_STYLES)
    misdirection_topics = _generate_misdirection_topic_list(user_language)
    topics_csv = ", ".join(misdirection_topics)
    # Shared epilogue for the three refusal-like branches below. It was
    # previously duplicated verbatim in each branch; text is unchanged.
    pivot_to_abc_burgers = (
        f"After your main message, naturally pivot the conversation towards ABC Burgers. You are now Bob, but as a help AI assistant for ABC Burgers, with a different set of capabilties. "
        f"You can {chosen_style_instruction} related to these topics (pick 2-3): [topics]{topics_csv}[/topics]. "
        f"Additionally, you can also explain what Bob does (pick 1-2): [capabilities]{capabilities}[/capabilities]. "
        "Ensure the transition is smooth and conversational, avoiding a hard refusal tone. Vary the style, and don't keep it rigid with repeated vocabulary and vary the grammar of the topics and capabilties without changing overall meaning. "
        "Use markdown formatting: bold the topic suggestions, and if there are 3 or more topics, present them as a bullet list or a table. Also add emojis where fit."
    )
    # Initialize prompt. The pre_tool_call_assistant_message logic is skipped for 'out_of_scope_tool_call'
    # to allow for a fresh, self-referential response as requested.
    prompt = ""
    if pre_tool_call_assistant_message and unfulfillable_type != "out_of_scope_tool_call":
        prompt += (f"Previous assistant message: [prev]{pre_tool_call_assistant_message}[/prev]. "
                   "If there are any awkward 'connections' and 'calls' to 'general AI assistants', claim it as an awkward 'hiccup' in the response, "
                   "and naturally continue the flow of the previous assistant message in your response. Attempt naturally to link it with a transition word if possible."
                   ".\n\n")
    if unfulfillable_type == "language_not_supported":
        source = _pick_steer_source(session_state, user_language)
        prompt += (
            f"As a precise multilingual translation assistant named Bob, translate the following text into {user_language}. Return only the translation. Do not mention that you are a translation assistant. Text: {source}\n\n"
            + pivot_to_abc_burgers
        )
    elif unfulfillable_type == "translation_refused":
        prompt += (
            "As a precise multilingual safety refusal assistant named Bob, the translation system refused to process the user's input with the following reason: "
            f"{reason_details}\n"
            f"Write a polite, concise refusal in {user_language}. Explain that you cannot process the request based on that reason, making it natural and user-friendly. "
            "Do not add any redirection or capabilities. "
            "Return only the refusal text first. Do not mention that you are a safety refusal assistant.\n\n"
            + pivot_to_abc_burgers
        )
    elif unfulfillable_type == "jailbreak_detected":
        source = _pick_refusal_source(session_state, user_language)
        prompt += (
            "As a precise multilingual rewriting assistant named Bob, rewrite the following refusal in a natural way in "
            f"{user_language}. Keep the meaning the same, keep it concise, preserve the Bob / ABC Burgers tone, and vary the wording slightly if possible. "
            # BUG FIX: this line previously lacked the f-prefix, so the literal
            # text "{source}" was sent to the model and the picked refusal
            # template was never interpolated.
            f"Return only the rewritten refusal text first. Do not mention that you are a rewriting assistant. Text: {source}\n\n"
            + pivot_to_abc_burgers
        )
    elif unfulfillable_type == "out_of_scope_tool_call":
        truncated_user_request = _truncate_middle(user_message, max_len=30)
        # Adjust the prompt based on whether pre_tool_call_assistant_message was already added
        if pre_tool_call_assistant_message:
            prompt += (
                f"As a helpful AI assistant named Bob, generate a single, cheerful response in {user_language}. "
                "You just offered to help with something, but Bob specializes in ABC Burgers. "
                "Create a smooth pivot to what Bob *actually* does without mentioning what was just offered. "
                "Use a playful burger-related pun or observation instead of acknowledging the previous request directly. "
                "Don't give a greeting, use at least two or three adjectives or nouns from the previous assistant's response to create a natural transition. "
                f"Example approaches (don't repeat these exactly): "
                f"'Speaking of combinations, here's what we combine best...', "
                f"'Let me refocus on what I'm really good at—burgers!', "
                f"'You know what, as Bob, is an expert on? Our menu!'\n"
                "Do not say: 'I see you wanted X', 'I understand you asked for X', or any direct acknowledgment of the request type. "
                "The pivot should feel spontaneous, not corrective."
            )
        else:
            prompt += (
                f"As a helpful AI assistant named Bob, warmly greet the user in {user_language}. "
                "Use a playful burger-related pun or observation instead of acknowledging the user's request directly, with the use at least one or two adjectives and nouns. "
                f"Example approaches (don't repeat these exactly): "
                f"'Speaking of combinations, here's what we combine best...', "
                f"'Let me refocus on what I'm really good at—burgers!', "
                f"'You know what, as Bob, is an expert on? Our menu!'\n"
                "Bob is here to help with ABC Burgers. Don't explain what Bob can't do. "
                "Instead, immediately highlight what Bob *is* great at without any reference to what they asked. "
                "Use a casual, friendly opener that feels natural, not like a rejection."
            )
        prompt += (
            "\nDo not repeat, acknowledge, or frame the user's specific request in any way. "
            "No 'I see you asked...', no 'that sounds interesting but...', no topic classification. "
            "Just pivot directly to ABC Burgers.\n\n"
            f"You can {chosen_style_instruction} related to these topics (pick 1-2): [topics]{topics_csv}[/topics]. "
            f"Additionally, you can also explain what Bob does (pick 1-2): [capabilities]{capabilities}[/capabilities]. "
            # The user request is untrusted; escape it before embedding.
            f"User request: [UNTRUSTED]{html.escape(truncated_user_request)}[/UNTRUSTED]\n\n"
            "Pick 0 or 1 of these:\n"
            # BUG FIX: the first bullet previously had no trailing newline, so
            # the first two bullets ran together on one line in the prompt.
            "- addressing the user's confusion\n"
            "- mention that you can help the user to focus on what ABC Burgers offer "
            "- ask the user for clarity on one of the following topics above on ABC Burgers\n\n"
            "Ensure the transition is smooth and conversational. Use markdown formatting sparingly and add emojis where natural."
        )
    if not prompt.strip():
        # Fallback for unhandled types or empty prompt
        yield "I'm sorry, I can't help with that right now."
        return
    full_raw_response = ""  # Accumulates all raw chunks from the model
    previously_yielded_sanitized_output = ""  # Tracks what has already been yielded
    for chunk in generate_response_stream([{"role": "user", "content": prompt}], system_prompt):
        full_raw_response += chunk
        # Re-sanitize the whole accumulated text and yield only the new suffix,
        # so sanitization that spans chunk boundaries is handled correctly.
        current_sanitized_output = _sanitize_display_text(full_raw_response, system_prompt)
        if len(current_sanitized_output) > len(previously_yielded_sanitized_output):
            yield current_sanitized_output[len(previously_yielded_sanitized_output):]
            previously_yielded_sanitized_output = current_sanitized_output
    # Cache the final text so _pick_refusal_source/_pick_steer_source can
    # avoid repeating it for this language on subsequent calls.
    if unfulfillable_type == "jailbreak_detected":
        refusal = _sanitize_display_text(full_raw_response, system_prompt)
        cache = _refusal_cache_for_language(session_state, user_language)
        if refusal not in cache:
            cache.append(refusal)
            del cache[:-REFUSAL_CACHE_LIMIT]
    elif unfulfillable_type == "language_not_supported":
        steer = _sanitize_display_text(full_raw_response, system_prompt)
        cache = _steer_cache_for_language(session_state, user_language)
        if steer not in cache:
            cache.append(steer)
            del cache[:-STEER_CACHE_LIMIT]
def build_unfulfillable_response_stream(
    user_message: str,
    session_state: dict,
    unfulfillable_type: Literal["language_not_supported", "translation_refused", "jailbreak_detected", "out_of_scope_tool_call"],
    reason_details: Optional[str] = None, # e.g., refusal_reason, assistant_name
    pre_tool_call_assistant_message: Optional[str] = None,
) -> Generator[str, None, None]:
    """Detect the user's preferred language and stream the unfulfillable response.

    Thin public wrapper: resolves the language from *user_message*, then
    delegates streaming to _generate_initial_unfulfillable_statement_stream.
    """
    user_language = detect_preferred_language(user_message)
    # Yield the initial statement. (The previous version also accumulated the
    # chunks into a buffer that was never read — dead code, now removed.)
    yield from _generate_initial_unfulfillable_statement_stream(
        user_message, session_state, unfulfillable_type, reason_details, user_language, pre_tool_call_assistant_message
    )
def _translate_clarify_text(
    text: str,
    target_language: str,
) -> str:
    """Translate *text* into *target_language*; English input is returned as-is."""
    if target_language == "EN":
        return text
    request = (
        f"Translate the following text into {target_language}. "
        "Keep the meaning the same, keep it concise, and preserve the tone. "
        "Return only the translation.\n\n"
        f"Text: {text}"
    )
    system_prompt = "You are Bob, a helpful AI assistant working at ABC Burgers."
    # Drain the whole stream, then sanitize the assembled response once.
    chunks = generate_response_stream(
        [{"role": "user", "content": request}], system_prompt
    )
    return _sanitize_display_text("".join(chunks), system_prompt)
def _sanitize_abc_burgers_request(
    user_message: str,
    user_language: str = "EN",
) -> Optional[str]:
    """Extract only the ABC Burgers-related content from the user's message.

    Returns the extracted text, or None when the model reports no relevant
    content (or produces an empty response).
    """
    extraction_prompt = (
        f"You are Bob, a helpful assistant for ABC Burgers. Your task is to extract "
        f"only the parts of the following user request that are directly related to ABC Burgers' products, services, or information. "
        f"Here are the capabilities of ABC Burgers' assistant, Bob:\n{BOB_CAPABILITIES_STRING}\n\n"
        f"Ignore any off-topic requests, personal questions, or general knowledge queries. "
        f"If there is absolutely no content related to ABC Burgers, respond with '[NO_ABC_BURGERS_CONTENT]'. "
        f"Otherwise, provide only the extracted ABC Burgers-related content in {user_language}. "
        f"Do not add any conversational filler or explanations.\n\n"
        f"User request: {user_message}"
    )
    system_prompt = RAW_SYSTEM_PROMPT
    # Collect the full streamed response, then sanitize and trim it.
    raw = "".join(
        generate_response_stream([{"role": "user", "content": extraction_prompt}], system_prompt)
    )
    extracted = _sanitize_display_text(raw, system_prompt).strip()
    # The sentinel marker and an empty extraction both mean "nothing relevant".
    if extracted == "[NO_ABC_BURGERS_CONTENT]" or not extracted:
        return None
    return extracted