# Source: core/compiler.py — uploaded by NurseCitizenDeveloper with huggingface_hub (commit 5e159ec, verified)
"""Wiki compiler — uses Claude to integrate raw sources into structured wiki articles."""
import json
import datetime
import anthropic
# System prompt for the integration model.  It instructs Claude to merge new
# source material into the wiki and to return a single JSON object (no code
# fences) whose schema — summary / articles_updated / articles_created /
# index_updates / log_entry — is parsed by compile_source() below.  Keep this
# schema in sync with the merging logic in compile_source().
COMPILE_SYSTEM_PROMPT = """You are a clinical knowledge wiki curator for the Nursing Citizen Development Organisation.
Your job is to integrate new source material into an existing nursing knowledge base (wiki).
The wiki is a collection of markdown articles organised by category. Each article has:
- A title, category, tags, and backlinks to other articles
- Substantive clinical content aligned with NMC Standards of Proficiency (2018), UK law, and NHS frameworks
When given new source material, you must:
1. Identify key nursing concepts, frameworks, guidelines, or clinical information in the source
2. Decide which existing articles should be UPDATED with new information
3. Identify any new articles that should be CREATED for concepts not yet covered
4. Integrate the information accurately and clinically appropriately
5. Add/update backlinks between related articles
6. Always cite the source in any updated/created articles
Return a JSON object with this structure:
{
"summary": "Brief summary of what was integrated and why",
"articles_updated": [
{
"slug": "article_slug",
"title": "Article Title",
"category": "category_name",
"tags": ["tag1", "tag2"],
"content": "Full markdown content of the updated article"
}
],
"articles_created": [
{
"slug": "new_slug",
"title": "New Article Title",
"category": "category_name",
"tags": ["tag1", "tag2"],
"content": "Full markdown content of the new article"
}
],
"index_updates": "Updated one-line entries for the index (markdown format)",
"log_entry": "Log entry text for this compilation"
}
Categories to use: standards, clinical, pharmacology, evidence, frameworks, safety, law, mental_health, research, ethics
Clinical content must:
- Be accurate and evidence-based
- Include NMC proficiency mappings where relevant
- Include UK-specific references (NICE, NMC, NHS, BNF)
- Include the disclaimer: "This tool supports but does not replace clinical judgment."
- Use UK spellings (organisation, anaesthesia, etc.)
"""
CHUNK_SIZE = 7000 # chars per chunk for large documents
def _chunk_text(text: str, chunk_size: int = CHUNK_SIZE) -> list[str]:
"""Split text into chunks at paragraph boundaries."""
if len(text) <= chunk_size:
return [text]
chunks = []
paragraphs = text.split("\n\n")
current = []
current_len = 0
for para in paragraphs:
if current_len + len(para) > chunk_size and current:
chunks.append("\n\n".join(current))
current = [para]
current_len = len(para)
else:
current.append(para)
current_len += len(para)
if current:
chunks.append("\n\n".join(current))
return chunks
def _strip_code_fences(raw: str) -> str:
    """Remove a leading/trailing markdown code fence from a model response.

    Tolerates an opening fence with a language tag (e.g. ```json) and a
    degenerate response that is a bare fence with no newline.
    """
    raw = raw.strip()
    if raw.startswith("```"):
        # Drop the entire opening fence line; guard against a response that
        # contains no newline at all (original code raised IndexError there).
        raw = raw.split("\n", 1)[1] if "\n" in raw else ""
    if raw.endswith("```"):
        raw = raw.rsplit("```", 1)[0]
    return raw

def _build_user_prompt(existing_index: str, existing_articles: dict, chunk_label: str,
                       chunk: str, chunk_num: int, total_chunks: int) -> str:
    """Assemble the per-chunk user prompt: index, article previews, and source text."""
    articles_context = ""
    if existing_articles:
        # Preview at most 8 articles to keep the prompt size bounded.
        for art in list(existing_articles.values())[:8]:
            preview = art["content"][:400].replace("\n", " ")
            articles_context += f"\n- **{art['title']}** ({art['category']}): {preview}...\n"
    chunk_note = ("**(Large document — this is chunk " + str(chunk_num) + " of "
                  + str(total_chunks) + ")**" if total_chunks > 1 else "")
    return f"""## Existing Wiki Index
{existing_index}
## Sample of Existing Articles (previews)
{articles_context}
## New Source to Integrate
**Title**: {chunk_label}
{chunk_note}
**Content**:
{chunk}
Please integrate this source into the wiki. Return valid JSON only, no markdown code fences."""

def compile_source(client: anthropic.Anthropic, source_title: str, source_content: str,
                   existing_index: str, existing_articles: dict, model: str = "claude-sonnet-4-6") -> dict:
    """Integrate a new source document into the wiki.

    Large documents are automatically split into chunks (see _chunk_text) and
    compiled sequentially; ``existing_articles`` is MUTATED in place between
    chunks so each pass builds on the articles produced by the previous one.

    Args:
        client: Authenticated Anthropic API client.
        source_title: Human-readable title of the source document.
        source_content: Full text of the source document.
        existing_index: Current markdown index of the wiki.
        existing_articles: Mapping of slug -> article dict; updated in place.
        model: Claude model identifier.

    Returns:
        Dict with keys "summary", "articles_updated", "articles_created",
        "index_updates" and "log_entry".  After slug-deduplication every
        article (updated or newly created) is reported under
        "articles_updated" and "articles_created" is left empty — callers
        should persist the "articles_updated" list.

    Raises:
        json.JSONDecodeError: If a model response is not valid JSON.
    """
    chunks = _chunk_text(source_content)
    total_chunks = len(chunks)
    merged: dict = {"articles_updated": [], "articles_created": [],
                    "summary": "", "index_updates": "", "log_entry": ""}
    for chunk_num, chunk in enumerate(chunks, 1):
        chunk_label = (f"{source_title} (part {chunk_num}/{total_chunks})"
                       if total_chunks > 1 else source_title)
        user_prompt = _build_user_prompt(existing_index, existing_articles,
                                         chunk_label, chunk, chunk_num, total_chunks)
        response = client.messages.create(
            model=model,
            max_tokens=4096,
            system=COMPILE_SYSTEM_PROMPT,
            messages=[{"role": "user", "content": user_prompt}],
        )
        result = json.loads(_strip_code_fences(response.content[0].text))
        # Stamp metadata on every article and fold it into the live state so
        # the next chunk's prompt reflects this chunk's output.
        today = datetime.date.today().isoformat()
        for art in result.get("articles_updated", []) + result.get("articles_created", []):
            art["last_updated"] = today
            art["sources"] = art.get("sources", []) + [source_title]
            existing_articles[art["slug"]] = art
        merged["articles_updated"].extend(result.get("articles_updated", []))
        merged["articles_created"].extend(result.get("articles_created", []))
        if result.get("summary"):
            merged["summary"] += f"[Part {chunk_num}] {result['summary']} "
        if result.get("log_entry"):
            merged["log_entry"] = result["log_entry"]
        # Fix: propagate index_updates (previously initialised but never
        # populated); like log_entry, the last chunk's value wins.
        if result.get("index_updates"):
            merged["index_updates"] = result["index_updates"]
    # Deduplicate by slug, keeping the last version produced across chunks.
    latest: dict = {}
    for art in merged["articles_updated"] + merged["articles_created"]:
        latest[art["slug"]] = art
    merged["articles_updated"] = list(latest.values())
    merged["articles_created"] = []
    return merged
def rebuild_index(client: anthropic.Anthropic, articles: dict, model: str = "claude-sonnet-4-6") -> str:
    """Regenerate the wiki index markdown from all articles.

    Args:
        client: Authenticated Anthropic API client.
        articles: Mapping of slug -> article dict; each article must provide
            "title" and "category", and may provide "tags".
        model: Claude model identifier.

    Returns:
        The markdown index produced by the model, grouped by category with
        "##" headers, stripped of surrounding whitespace.
    """
    # Build the one-line-per-article listing outside the f-string (the
    # original used chr(10) to smuggle a newline into the f-string).
    article_listing = "\n".join(
        f"- **{art['title']}** ({art['category']}): {', '.join(art.get('tags', []))}"
        for art in articles.values()
    )
    prompt = f"""Regenerate a well-organised wiki index for these nursing knowledge articles.
Group them by category. Each entry should be a one-line summary.
Format as markdown with category headers (##).
Articles:
{article_listing}
Return only the markdown index content."""
    response = client.messages.create(
        model=model,
        max_tokens=2048,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.content[0].text.strip()