# Helpers that pull a patient's name out of plain text or HTML and normalise
# it into an upper-case, label-free string.  Every public function returns ""
# when no plausible name is found.
from __future__ import annotations

import re

# Exact (normalised, lower-case) strings that are form labels or field
# values, never a patient name.
_INVALID_NAME_PHRASES = {
    "telefono",
    "teléfono",
    "fecha egreso",
    "tipo y n",
    "masculino",
    "femenino",
    "paciente",
    "usuario",
}

# Fragments that disqualify a candidate wherever they appear inside it.
_INVALID_NAME_SUBSTRINGS = ("fecha egreso", "tipo y n", "telefono", "teléfono")

# Single words that disqualify a candidate when present as a whole token.
_INVALID_NAME_TOKENS = {
    "telefono",
    "teléfono",
    "fecha",
    "egreso",
    "tipo",
    "paciente",
    "usuario",
    "masculino",
    "femenino",
    "sexo",
    "identificacion",
    "identificación",
}

# Labels of the *next* form field that tend to get glued to the end of a
# captured name; they are trimmed rather than causing a rejection.
_TRAILING_LABEL_TOKENS = {
    "no",
    "caso",
    "edad",
    "sexo",
    "identificacion",
    "identificación",
}

# Connector words that may appear inside a name but do not count towards the
# "at least two real name words" requirement.
_NAME_JOINERS = {"de", "del", "la", "las", "los", "y"}

# One capture group holding a run of 6-81 name characters on a single line.
# "Ü" is included so names such as "Agüero" are not cut at the diaeresis;
# the patterns are compiled with IGNORECASE, which covers lower case.
_NAME_RUN = r"([A-ZÁÉÍÓÚÜÑ][A-ZÁÉÍÓÚÜÑ ]{5,80})"

# Tried in order by extract_patient_name_from_text; group 1 is the candidate.
_TEXT_PATTERNS = tuple(
    re.compile(source, re.IGNORECASE)
    for source in (
        # "Paciente: CC 12345678 - NAME"
        r"paciente\s*:\s*(?:cc|ti|ce|dni|nit)?\s*[-:]?\s*\d{5,12}\s*-\s*"
        + _NAME_RUN,
        # "Nombre [del] paciente: NAME"
        r"nombre(?:\s+del)?\s+paciente\s*[:#-]?\s*" + _NAME_RUN,
        # "NAME Nombre [del] Paciente" (value printed before its label)
        _NAME_RUN + r"\s*Nombre(?:\s+del)?\s+Paciente",
        # "Paciente: NAME"
        r"paciente\s*:\s*" + _NAME_RUN,
    )
)

# A line that consists of nothing but the label.
_LABEL_ONLY_LINE = re.compile(
    r"(?:nombre(?:\s+del)?\s+paciente|paciente)", re.IGNORECASE
)
# The label anywhere inside a line.
_LABEL_IN_LINE = re.compile(r"nombre(?:\s+del)?\s+paciente", re.IGNORECASE)

# NOTE(review): in the reviewed copy of this file the HTML tag literals inside
# the HTML regexes had been stripped, together with some code between the
# third pattern and the line-probing fallback.  The patterns below are rebuilt
# from the fragments that survived and match *any* tag where the original tag
# name is unknown.  The "(?:<|$)" tails are an assumption.  Confirm against
# version control before relying on exact HTML behaviour.
_HTML_NAME_CHARS = r"A-Za-zÀ-ÿ\u00f1\u00d1\s"
_HTML_PATTERNS = (
    # <x>Nombre del paciente[:]</x> <y>NAME</y>
    re.compile(
        r">\s*Nombre del paciente[:\s]*</[^>]+>\s*<[^>]+>([^<]+)<",
        re.IGNORECASE | re.DOTALL,
    ),
    # Nombre del paciente: NAME<   (label and value in the same element)
    re.compile(
        r"Nombre del paciente\s*:?\s*([" + _HTML_NAME_CHARS + r"]+)(?:<|$)",
        re.IGNORECASE,
    ),
    # Looser same-line variant tolerating markup between label and value.
    re.compile(
        r"Nombre.*?paciente.*?:?\s*([" + _HTML_NAME_CHARS + r"]{3,50}?)(?:<|$)",
        re.IGNORECASE,
    ),
)
# Text content of an element on a line following the label.
_HTML_CELL_TEXT = re.compile(r">([" + _HTML_NAME_CHARS + r"]{5,50})<")
_HTML_LABEL = re.compile(r"nombre\s+del\s+paciente", re.IGNORECASE)


def _normalize_spaces(value: str) -> str:
    """Collapse every whitespace run in *value* to one space and trim it.

    Falsy input (``None``, ``""``) yields ``""``.
    """
    return re.sub(r"\s+", " ", str(value or "")).strip()


def sanitize_patient_name_candidate(value: str) -> str:
    """Validate and normalise a raw string that might be a patient name.

    Returns the name upper-cased, limited to its first six words and with
    trailing field labels removed, or ``""`` when the candidate is empty, is
    a known label/phrase, or has fewer than two real (non-connector) words.
    """
    raw = _normalize_spaces(value).strip(" :#-\n\r\t")
    if not raw:
        return ""

    # Lower-case and blank out everything that is not a letter, digit or
    # space.  "ü" is kept so that e.g. "Agüero" stays one word.
    normalized = _normalize_spaces(
        re.sub(r"[^a-z0-9áéíóúüñ ]+", " ", raw.lower())
    )
    if not normalized or normalized in _INVALID_NAME_PHRASES:
        return ""
    if any(fragment in normalized for fragment in _INVALID_NAME_SUBSTRINGS):
        return ""

    # Digits survive normalisation only to keep words separated; they are
    # never part of a token.
    tokens = re.findall(r"[a-záéíóúüñ]+", normalized)

    # Drop labels of the following field ("... EDAD", "... SEXO").
    while len(tokens) >= 2 and tokens[-1] in _TRAILING_LABEL_TOKENS:
        tokens.pop()
    if len(tokens) < 2:
        return ""

    name_words = [token for token in tokens if token not in _NAME_JOINERS]
    if len(name_words) < 2:
        return ""
    if any(token in _INVALID_NAME_TOKENS for token in name_words):
        return ""

    return " ".join(token.upper() for token in tokens[:6])


def extract_patient_name_from_html(analisis_html: str) -> str:
    """Return the patient name found in an HTML fragment, or ``""``.

    The structured patterns are tried first.  If none yields a valid name,
    the lines following a "Nombre del paciente" label are probed for element
    text that passes ``sanitize_patient_name_candidate``.
    """
    if not analisis_html:
        return ""

    for pattern in _HTML_PATTERNS:
        match = pattern.search(analisis_html)
        if not match:
            continue
        candidate = sanitize_patient_name_candidate(match.group(1))
        if candidate:
            return candidate

    # NOTE(review): the condition that starts a probe and the size of the
    # window were lost in the reviewed copy.  The three-line window mirrors
    # extract_patient_name_from_text and is an assumption; please confirm.
    lines = analisis_html.splitlines()
    for index, line in enumerate(lines):
        if not _HTML_LABEL.search(line):
            continue
        for probe in range(index + 1, min(index + 4, len(lines))):
            candidate_match = _HTML_CELL_TEXT.search(lines[probe])
            if not candidate_match:
                continue
            candidate = sanitize_patient_name_candidate(
                candidate_match.group(1)
            )
            if candidate:
                return candidate
    return ""


def extract_patient_name_from_text(text: str) -> str:
    """Return the patient name found in plain text, or ``""``.

    Whole-text patterns are tried in order.  If none yields a valid name the
    text is scanned line by line: a line holding only the label is followed
    by up to three probe lines, and a line containing the label plus a
    ``:``, ``#`` or ``-`` separator is split at the first separator.
    """
    if not text:
        return ""

    for pattern in _TEXT_PATTERNS:
        match = pattern.search(text)
        if not match:
            continue
        candidate = sanitize_patient_name_candidate(match.group(1))
        if candidate:
            return candidate

    lines = [_normalize_spaces(line) for line in text.splitlines()]
    for index, line in enumerate(lines):
        if not line:
            continue

        # Label alone on its line: the value is expected just below it.
        if _LABEL_ONLY_LINE.fullmatch(line):
            for following in lines[index + 1:index + 4]:
                candidate = sanitize_patient_name_candidate(following)
                if candidate:
                    return candidate

        # Label and value on the same line, split at the first separator.
        if _LABEL_IN_LINE.search(line):
            fragments = re.split(r"[:#-]", line, maxsplit=1)
            if len(fragments) == 2:
                candidate = sanitize_patient_name_candidate(fragments[1])
                if candidate:
                    return candidate
    return ""