# Source: core/compiler.py — uploaded by NurseCitizenDeveloper with huggingface_hub (commit 5e159ec, verified)
"""Wiki compiler — uses Claude to integrate raw sources into structured wiki articles."""
import json
import datetime
import anthropic
# System prompt for the integration model.  It instructs Claude to merge new
# source material into the wiki and to return a single JSON object (no code
# fences) whose schema — summary / articles_updated / articles_created /
# index_updates / log_entry — is parsed by compile_source() below.  Keep this
# schema in sync with the merging logic in compile_source().
COMPILE_SYSTEM_PROMPT = """You are a clinical knowledge wiki curator for the Nursing Citizen Development Organisation.
Your job is to integrate new source material into an existing nursing knowledge base (wiki).
The wiki is a collection of markdown articles organised by category. Each article has:
- A title, category, tags, and backlinks to other articles
- Substantive clinical content aligned with NMC Standards of Proficiency (2018), UK law, and NHS frameworks
When given new source material, you must:
1. Identify key nursing concepts, frameworks, guidelines, or clinical information in the source
2. Decide which existing articles should be UPDATED with new information
3. Identify any new articles that should be CREATED for concepts not yet covered
4. Integrate the information accurately and clinically appropriately
5. Add/update backlinks between related articles
6. Always cite the source in any updated/created articles
Return a JSON object with this structure:
{
"summary": "Brief summary of what was integrated and why",
"articles_updated": [
{
"slug": "article_slug",
"title": "Article Title",
"category": "category_name",
"tags": ["tag1", "tag2"],
"content": "Full markdown content of the updated article"
}
],
"articles_created": [
{
"slug": "new_slug",
"title": "New Article Title",
"category": "category_name",
"tags": ["tag1", "tag2"],
"content": "Full markdown content of the new article"
}
],
"index_updates": "Updated one-line entries for the index (markdown format)",
"log_entry": "Log entry text for this compilation"
}
Categories to use: standards, clinical, pharmacology, evidence, frameworks, safety, law, mental_health, research, ethics
Clinical content must:
- Be accurate and evidence-based
- Include NMC proficiency mappings where relevant
- Include UK-specific references (NICE, NMC, NHS, BNF)
- Include the disclaimer: "This tool supports but does not replace clinical judgment."
- Use UK spellings (organisation, anaesthesia, etc.)
"""
CHUNK_SIZE = 7000 # chars per chunk for large documents
def _chunk_text(text: str, chunk_size: int = CHUNK_SIZE) -> list[str]:
"""Split text into chunks at paragraph boundaries."""
if len(text) <= chunk_size:
return [text]
chunks = []
paragraphs = text.split("\n\n")
current = []
current_len = 0
for para in paragraphs:
if current_len + len(para) > chunk_size and current:
chunks.append("\n\n".join(current))
current = [para]
current_len = len(para)
else:
current.append(para)
current_len += len(para)
if current:
chunks.append("\n\n".join(current))
return chunks
def _strip_code_fences(raw: str) -> str:
    """Remove a leading/trailing markdown code fence from a model response.

    Tolerates an opening fence with a language tag (e.g. ```json) and a
    degenerate response that is a bare fence with no newline.
    """
    raw = raw.strip()
    if raw.startswith("```"):
        # Drop the entire opening fence line; guard against a response that
        # contains no newline at all (original code raised IndexError there).
        raw = raw.split("\n", 1)[1] if "\n" in raw else ""
    if raw.endswith("```"):
        raw = raw.rsplit("```", 1)[0]
    return raw

def _build_user_prompt(existing_index: str, existing_articles: dict, chunk_label: str,
                       chunk: str, chunk_num: int, total_chunks: int) -> str:
    """Assemble the per-chunk user prompt: index, article previews, and source text."""
    articles_context = ""
    if existing_articles:
        # Preview at most 8 articles to keep the prompt size bounded.
        for art in list(existing_articles.values())[:8]:
            preview = art["content"][:400].replace("\n", " ")
            articles_context += f"\n- **{art['title']}** ({art['category']}): {preview}...\n"
    chunk_note = ("**(Large document — this is chunk " + str(chunk_num) + " of "
                  + str(total_chunks) + ")**" if total_chunks > 1 else "")
    return f"""## Existing Wiki Index
{existing_index}
## Sample of Existing Articles (previews)
{articles_context}
## New Source to Integrate
**Title**: {chunk_label}
{chunk_note}
**Content**:
{chunk}
Please integrate this source into the wiki. Return valid JSON only, no markdown code fences."""

def compile_source(client: anthropic.Anthropic, source_title: str, source_content: str,
                   existing_index: str, existing_articles: dict, model: str = "claude-sonnet-4-6") -> dict:
    """Integrate a new source document into the wiki.

    Large documents are automatically split into chunks (see _chunk_text) and
    compiled sequentially; ``existing_articles`` is MUTATED in place between
    chunks so each pass builds on the articles produced by the previous one.

    Args:
        client: Authenticated Anthropic API client.
        source_title: Human-readable title of the source document.
        source_content: Full text of the source document.
        existing_index: Current markdown index of the wiki.
        existing_articles: Mapping of slug -> article dict; updated in place.
        model: Claude model identifier.

    Returns:
        Dict with keys "summary", "articles_updated", "articles_created",
        "index_updates" and "log_entry".  After slug-deduplication every
        article (updated or newly created) is reported under
        "articles_updated" and "articles_created" is left empty — callers
        should persist the "articles_updated" list.

    Raises:
        json.JSONDecodeError: If a model response is not valid JSON.
    """
    chunks = _chunk_text(source_content)
    total_chunks = len(chunks)
    merged: dict = {"articles_updated": [], "articles_created": [],
                    "summary": "", "index_updates": "", "log_entry": ""}
    for chunk_num, chunk in enumerate(chunks, 1):
        chunk_label = (f"{source_title} (part {chunk_num}/{total_chunks})"
                       if total_chunks > 1 else source_title)
        user_prompt = _build_user_prompt(existing_index, existing_articles,
                                         chunk_label, chunk, chunk_num, total_chunks)
        response = client.messages.create(
            model=model,
            max_tokens=4096,
            system=COMPILE_SYSTEM_PROMPT,
            messages=[{"role": "user", "content": user_prompt}],
        )
        result = json.loads(_strip_code_fences(response.content[0].text))
        # Stamp metadata on every article and fold it into the live state so
        # the next chunk's prompt reflects this chunk's output.
        today = datetime.date.today().isoformat()
        for art in result.get("articles_updated", []) + result.get("articles_created", []):
            art["last_updated"] = today
            art["sources"] = art.get("sources", []) + [source_title]
            existing_articles[art["slug"]] = art
        merged["articles_updated"].extend(result.get("articles_updated", []))
        merged["articles_created"].extend(result.get("articles_created", []))
        if result.get("summary"):
            merged["summary"] += f"[Part {chunk_num}] {result['summary']} "
        if result.get("log_entry"):
            merged["log_entry"] = result["log_entry"]
        # Fix: propagate index_updates (previously initialised but never
        # populated); like log_entry, the last chunk's value wins.
        if result.get("index_updates"):
            merged["index_updates"] = result["index_updates"]
    # Deduplicate by slug, keeping the last version produced across chunks.
    latest: dict = {}
    for art in merged["articles_updated"] + merged["articles_created"]:
        latest[art["slug"]] = art
    merged["articles_updated"] = list(latest.values())
    merged["articles_created"] = []
    return merged
def rebuild_index(client: anthropic.Anthropic, articles: dict, model: str = "claude-sonnet-4-6") -> str:
    """Regenerate the wiki index markdown from all articles.

    Args:
        client: Authenticated Anthropic API client.
        articles: Mapping of slug -> article dict; each article must provide
            "title" and "category", and may provide "tags".
        model: Claude model identifier.

    Returns:
        The markdown index produced by the model, grouped by category with
        "##" headers, stripped of surrounding whitespace.
    """
    # Build the one-line-per-article listing outside the f-string (the
    # original used chr(10) to smuggle a newline into the f-string).
    article_listing = "\n".join(
        f"- **{art['title']}** ({art['category']}): {', '.join(art.get('tags', []))}"
        for art in articles.values()
    )
    prompt = f"""Regenerate a well-organised wiki index for these nursing knowledge articles.
Group them by category. Each entry should be a one-line summary.
Format as markdown with category headers (##).
Articles:
{article_listing}
Return only the markdown index content."""
    response = client.messages.create(
        model=model,
        max_tokens=2048,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.content[0].text.strip()