Spaces:
Sleeping
Sleeping
"""
Core mRNA sequence domain model.

Designed to be flexible: different databases store sequence data differently.
Some customers have a single 'mrna_sequence' field; others split it into
UTR/CDS/PolyA components. The SchemaMapper normalizes those into this model.
"""
| from __future__ import annotations | |
| import uuid | |
| from dataclasses import dataclass, field | |
| from typing import Any, Dict, List, Literal, Optional | |
@dataclass
class SequenceAnnotation:
    """A named region within a sequence (0-based, half-open ``[start, end)``).

    Declared as a dataclass: ``metadata`` relies on ``field(default_factory=...)``
    so every instance gets its own dict, and instances are built with keyword
    arguments (``label=..., start=..., end=..., color=...``).
    """

    # Display name of the region.
    label: str
    # Inclusive start offset (0-based).
    start: int
    # Exclusive end offset.
    end: int
    # Strand marker; defaults to "+".
    strand: Literal["+", "-", "."] = "+"
    # Optional colour string for rendering; None when unset.
    color: Optional[str] = None
    # Free-form extra data attached to the region.
    metadata: Dict[str, Any] = field(default_factory=dict)

    # NOTE(review): the decorator lines appear to have been lost from this
    # file. ``length`` is exposed as a property to mirror how
    # ``mRNASequence.length`` is read (attribute access, no call) -- confirm
    # against callers of ``SequenceAnnotation.length``.
    @property
    def length(self) -> int:
        """Return the number of positions covered, i.e. ``end - start``."""
        return self.end - self.start
@dataclass
class mRNASequence:
    """
    Core mRNA sequence model.

    Components are all optional because different databases represent
    sequence data at different granularities. ``assembled_sequence``
    concatenates whichever components are present, or returns ``full_mrna``
    if the database provides the complete sequence as a single field.
    """

    name: str
    source: Literal["local", "database"]

    # Auto-generated unique identifier
    id: str = field(default_factory=lambda: str(uuid.uuid4()))

    # Which database connection this came from (None for local sequences)
    db_source: Optional[str] = None

    # -- Sequence components (all optional) ---------------------------------
    # Stored as DNA (T not U) for computational convenience; displayed as RNA
    five_prime_utr: Optional[str] = None
    kozak: Optional[str] = None
    cds: Optional[str] = None
    three_prime_utr: Optional[str] = None
    poly_a: Optional[str] = None

    # Full pre-assembled sequence from DB (when component breakdown is unavailable)
    full_mrna: Optional[str] = None

    # Annotations populated by analysis or DB import
    annotations: List["SequenceAnnotation"] = field(default_factory=list)

    # Raw database record: all original fields preserved for model use
    raw_metadata: Dict[str, Any] = field(default_factory=dict)

    # Analysis cache, populated lazily by SequenceAnalyzer (hidden from repr)
    _analysis_cache: Dict[str, Any] = field(default_factory=dict, repr=False)

    # -- Derived properties --------------------------------------------------

    @property
    def assembled_sequence(self) -> str:
        """
        Return the full sequence, upper-cased.

        Present components are concatenated in the order 5'UTR, Kozak, CDS,
        3'UTR, PolyA. If that concatenation is empty, ``full_mrna`` is used
        instead.

        Raises:
            ValueError: if no component and no ``full_mrna`` is set.
        """
        assembled = "".join(
            part or ""
            for part in (
                self.five_prime_utr,
                self.kozak,
                self.cds,
                self.three_prime_utr,
                self.poly_a,
            )
        )
        if assembled:
            return assembled.upper()
        if self.full_mrna:
            return self.full_mrna.upper()
        raise ValueError(
            f"Sequence '{self.name}' has no components and no full_mrna set."
        )

    @property
    def has_components(self) -> bool:
        """True if at least one sub-component is a non-empty string."""
        return any((
            self.five_prime_utr,
            self.kozak,
            self.cds,
            self.three_prime_utr,
            self.poly_a,
        ))

    @property
    def component_annotations(self) -> List["SequenceAnnotation"]:
        """
        Auto-derive position annotations from the component breakdown.

        Each non-empty component yields one annotation whose ``[start, end)``
        span is its position within the concatenated components. Returns an
        empty list when no component is set.
        """
        components = (
            ("5'UTR", self.five_prime_utr, "#4A90D9"),
            ("Kozak", self.kozak, "#F5A623"),
            ("CDS", self.cds, "#7ED321"),
            ("3'UTR", self.three_prime_utr, "#9B59B6"),
            ("PolyA", self.poly_a, "#E74C3C"),
        )
        derived = []
        offset = 0
        for label, seq, color in components:
            if not seq:
                # Absent/empty components occupy no positions.
                continue
            end = offset + len(seq)
            derived.append(SequenceAnnotation(
                label=label,
                start=offset,
                end=end,
                color=color,
            ))
            offset = end
        return derived

    @property
    def length(self) -> int:
        """Length of ``assembled_sequence``, or 0 when no sequence is available."""
        try:
            return len(self.assembled_sequence)
        except ValueError:
            return 0

    @property
    def cds_length(self) -> Optional[int]:
        """Length of the CDS, or None when the CDS is unset or empty."""
        return len(self.cds) if self.cds else None

    # -- Mutation helpers ----------------------------------------------------

    def with_cds(self, cds: str) -> "mRNASequence":
        """
        Return a new mRNASequence with the CDS replaced.

        The copy gets a fresh ``id``, is marked ``source="local"`` with no
        ``db_source``, and starts with an empty analysis cache. ``annotations``
        and ``raw_metadata`` are shallow-copied so that mutating the containers
        on one instance does not affect the other. The annotation entries
        themselves are carried over unchanged.
        """
        from dataclasses import replace

        return replace(
            self,
            id=str(uuid.uuid4()),
            cds=cds.upper(),
            source="local",
            db_source=None,
            annotations=list(self.annotations),
            raw_metadata=dict(self.raw_metadata),
            _analysis_cache={},
        )

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize identity, sequence fields and ``raw_metadata`` to a dict.

        ``annotations`` and the analysis cache are not included.
        """
        return {
            "id": self.id,
            "name": self.name,
            "source": self.source,
            "db_source": self.db_source,
            "five_prime_utr": self.five_prime_utr,
            "kozak": self.kozak,
            "cds": self.cds,
            "three_prime_utr": self.three_prime_utr,
            "poly_a": self.poly_a,
            "full_mrna": self.full_mrna,
            "raw_metadata": self.raw_metadata,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "mRNASequence":
        """
        Build an instance from a dict shaped like the output of ``to_dict``.

        ``name`` is required (``KeyError`` if missing). A missing or empty
        ``id`` is replaced by a fresh UUID, ``source`` defaults to ``"local"``,
        and ``raw_metadata`` is copied so the instance does not alias the
        caller's dict (a missing or ``None`` value becomes ``{}``).
        """
        return cls(
            id=data.get("id") or str(uuid.uuid4()),
            name=data["name"],
            source=data.get("source", "local"),
            db_source=data.get("db_source"),
            five_prime_utr=data.get("five_prime_utr"),
            kozak=data.get("kozak"),
            cds=data.get("cds"),
            three_prime_utr=data.get("three_prime_utr"),
            poly_a=data.get("poly_a"),
            full_mrna=data.get("full_mrna"),
            raw_metadata=dict(data.get("raw_metadata") or {}),
        )

    def __repr__(self) -> str:
        """Compact repr showing name, source and assembled length only."""
        length = self.length
        return f"mRNASequence(name={self.name!r}, source={self.source!r}, length={length})"