Spaces:

offtargeteffect
/

mrna-design-studio

Sleeping

App Files Files Community

mrna-design-studio / core /models /sequence.py

offtargeteffect

Deploy mRNA Design Studio (Docker SDK)

99f834c verified 7 days ago

Raw

History Blame Contribute Delete

6.64 kB

	"""
	Core mRNA sequence domain model.

	Designed to be flexible: different databases store sequence data differently.
	Some customers have a single 'mrna_sequence' field; others split into UTR/CDS/PolyA.
	The SchemaMapper normalizes those into this model.
	"""
	from __future__ import annotations

	import uuid
	from dataclasses import dataclass, field
	from typing import Any, Dict, List, Literal, Optional


	@dataclass
	class SequenceAnnotation:
	    """
	    Labelled interval on a sequence.

	    Coordinates are 0-based and half-open: the region covers positions
	    ``start`` up to, but not including, ``end``.
	    """
	    # Name of the region
	    label: str
	    # First position covered (inclusive, 0-based)
	    start: int
	    # Position just past the region (exclusive)
	    end: int
	    # Strand marker; "+" unless stated otherwise
	    strand: Literal["+", "-", "."] = "+"
	    # Optional colour string attached to the region
	    color: Optional[str] = None
	    # Free-form extra data; every instance gets its own dict
	    metadata: Dict[str, Any] = field(default_factory=dict)

	    @property
	    def length(self) -> int:
	        """Number of positions spanned by the region (``end - start``)."""
	        span = self.end - self.start
	        return span


	@dataclass
	class mRNASequence:
	    """
	    Core mRNA sequence model.

	    Components are all optional because different databases represent
	    sequence data at different granularities. assembled_sequence will
	    concatenate whichever components are present, or return full_mrna
	    if no component is set and the database provides the complete
	    sequence as a single field.
	    """
	    name: str
	    source: Literal["local", "database"]

	    # Auto-generated unique identifier
	    id: str = field(default_factory=lambda: str(uuid.uuid4()))

	    # Which database connection this came from (None for local sequences)
	    db_source: Optional[str] = None

	    # ── Sequence components (all optional) ──────────────────────────────────
	    # Stored as DNA (T not U) for computational convenience; displayed as RNA
	    five_prime_utr: Optional[str] = None
	    kozak: Optional[str] = None
	    cds: Optional[str] = None
	    three_prime_utr: Optional[str] = None
	    poly_a: Optional[str] = None

	    # Full pre-assembled sequence from DB (when component breakdown is unavailable)
	    full_mrna: Optional[str] = None

	    # Annotations populated by analysis or DB import
	    annotations: List[SequenceAnnotation] = field(default_factory=list)

	    # Raw database record — all original fields preserved for model use
	    raw_metadata: Dict[str, Any] = field(default_factory=dict)

	    # Analysis cache — populated lazily by SequenceAnalyzer
	    _analysis_cache: Dict[str, Any] = field(default_factory=dict, repr=False)

	    # ── Internal helpers ────────────────────────────────────────────────────

	    def _components_by_label(self) -> Dict[str, Optional[str]]:
	        """Map each component label to its sequence, in 5'→3' assembly order."""
	        # Single source of truth for component order; dicts keep insertion
	        # order, so every consumer walks the components identically.
	        return {
	            "5'UTR": self.five_prime_utr,
	            "Kozak": self.kozak,
	            "CDS": self.cds,
	            "3'UTR": self.three_prime_utr,
	            "PolyA": self.poly_a,
	        }

	    # ── Derived properties ──────────────────────────────────────────────────

	    @property
	    def assembled_sequence(self) -> str:
	        """
	        Return the full sequence, upper-cased.

	        Present components are concatenated in order (5'UTR, Kozak, CDS,
	        3'UTR, PolyA). If that yields an empty string, full_mrna is used
	        instead. Components take precedence over full_mrna when both exist.

	        Raises:
	            ValueError: if no component and no full_mrna is set.
	        """
	        assembled = "".join(
	            seq or "" for seq in self._components_by_label().values()
	        )
	        if assembled:
	            return assembled.upper()
	        if self.full_mrna:
	            return self.full_mrna.upper()
	        raise ValueError(
	            f"Sequence '{self.name}' has no components and no full_mrna set."
	        )

	    @property
	    def has_components(self) -> bool:
	        """True if at least one sub-component is set to a non-empty value."""
	        return any(self._components_by_label().values())

	    @property
	    def component_annotations(self) -> List[SequenceAnnotation]:
	        """
	        Auto-derive position annotations from the component breakdown.

	        Returns one annotation per non-empty component, with 0-based
	        half-open coordinates into the concatenated component sequence.
	        Returns an empty list when no component is set.
	        """
	        component_colors = {
	            "5'UTR": "#4A90D9",
	            "Kozak": "#F5A623",
	            "CDS": "#7ED321",
	            "3'UTR": "#9B59B6",
	            "PolyA": "#E74C3C",
	        }
	        annotations = []
	        pos = 0
	        for label, seq in self._components_by_label().items():
	            if not seq:
	                # Missing/empty components occupy no positions.
	                continue
	            end = pos + len(seq)
	            annotations.append(SequenceAnnotation(
	                label=label,
	                start=pos,
	                end=end,
	                color=component_colors.get(label),
	            ))
	            pos = end
	        return annotations

	    @property
	    def length(self) -> int:
	        """Length of assembled_sequence, or 0 when no sequence is available."""
	        try:
	            return len(self.assembled_sequence)
	        except ValueError:
	            return 0

	    @property
	    def cds_length(self) -> Optional[int]:
	        """Length of the CDS, or None when the CDS is unset or empty."""
	        return len(self.cds) if self.cds else None

	    # ── Mutation helpers ────────────────────────────────────────────────────

	    def with_cds(self, cds: str) -> "mRNASequence":
	        """
	        Return a new mRNASequence with the CDS replaced.

	        The copy gets a fresh id, is marked as a local sequence
	        (source="local", db_source=None) and starts with an empty analysis
	        cache. The new CDS is stored upper-cased.

	        The copy owns its own annotations list and raw_metadata dict, so
	        adding/removing entries on one object does not affect the other
	        (the contained items themselves are shared, not deep-copied).

	        Note: existing annotations are carried over as-is; their coordinates
	        are not recomputed for the new CDS. full_mrna is also left untouched.
	        """
	        from dataclasses import replace
	        return replace(
	            self,
	            id=str(uuid.uuid4()),
	            cds=cds.upper(),
	            source="local",
	            db_source=None,
	            # replace() only shallow-copies fields; without these explicit
	            # copies both objects would share the same mutable containers.
	            annotations=list(self.annotations),
	            raw_metadata=dict(self.raw_metadata),
	            _analysis_cache={},
	        )

	    def to_dict(self) -> Dict[str, Any]:
	        """
	        Serialize the identifying fields and sequence data to a plain dict.

	        annotations and the analysis cache are not included. raw_metadata is
	        returned as a (shallow) copy so callers cannot mutate this object
	        through the result.
	        """
	        return {
	            "id": self.id,
	            "name": self.name,
	            "source": self.source,
	            "db_source": self.db_source,
	            "five_prime_utr": self.five_prime_utr,
	            "kozak": self.kozak,
	            "cds": self.cds,
	            "three_prime_utr": self.three_prime_utr,
	            "poly_a": self.poly_a,
	            "full_mrna": self.full_mrna,
	            "raw_metadata": dict(self.raw_metadata or {}),
	        }

	    @classmethod
	    def from_dict(cls, data: Dict[str, Any]) -> "mRNASequence":
	        """
	        Build an mRNASequence from a dict shaped like the output of to_dict.

	        Only "name" is required. A missing or None "id" gets a fresh UUID;
	        a missing "source" defaults to "local"; a missing or None
	        "raw_metadata" becomes an empty dict. raw_metadata is copied so the
	        new object does not alias the caller's dict.

	        Raises:
	            KeyError: if "name" is absent from data.
	        """
	        seq_id = data.get("id")
	        if seq_id is None:
	            # Generate lazily: only when the payload carries no usable id.
	            seq_id = str(uuid.uuid4())
	        return cls(
	            id=seq_id,
	            name=data["name"],
	            source=data.get("source", "local"),
	            db_source=data.get("db_source"),
	            five_prime_utr=data.get("five_prime_utr"),
	            kozak=data.get("kozak"),
	            cds=data.get("cds"),
	            three_prime_utr=data.get("three_prime_utr"),
	            poly_a=data.get("poly_a"),
	            full_mrna=data.get("full_mrna"),
	            raw_metadata=dict(data.get("raw_metadata") or {}),
	        )

	    def __repr__(self) -> str:
	        """Compact representation: name, source and assembled length only."""
	        length = self.length
	        return f"mRNASequence(name={self.name!r}, source={self.source!r}, length={length})"